From ceab6e8bcce41c1a8af0cf872098ae745d9ce5ff Mon Sep 17 00:00:00 2001 From: Rob Armstrong Date: Thu, 27 Mar 2025 10:30:07 -0700 Subject: [PATCH] Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks. --- .clang-format | 49 + .pre-commit-config.yaml | 100 + .../UnifiedMemoryStreams.cu | 444 +- Samples/0_Introduction/asyncAPI/asyncAPI.cu | 186 +- Samples/0_Introduction/clock/clock.cu | 123 +- Samples/0_Introduction/clock_nvrtc/clock.cpp | 95 +- .../clock_nvrtc/clock_kernel.cu | 59 +- .../0_Introduction/cudaOpenMP/cudaOpenMP.cu | 183 +- .../fp16ScalarProduct/fp16ScalarProduct.cu | 287 +- Samples/0_Introduction/matrixMul/matrixMul.cu | 469 +- .../0_Introduction/matrixMulDrv/matrixMul.h | 14 +- .../matrixMulDrv/matrixMulDrv.cpp | 429 +- .../matrixMulDrv/matrixMul_kernel.cu | 121 +- .../matrixMulDynlinkJIT/cuda_drvapi_dynlink.c | 503 +- .../cuda_drvapi_dynlink_cuda.h | 2901 +++++----- .../matrixMulDynlinkJIT/helper_cuda_drvapi.h | 493 +- .../matrixMulDynlinkJIT/matrixMul.h | 6 +- .../matrixMulDynlinkJIT.cpp | 140 +- .../matrixMulDynlinkJIT/matrixMul_gold.cpp | 12 +- .../matrixMul_kernel_32_ptxdump.c | 2974 +++++----- .../matrixMul_kernel_32_ptxdump.h | 3 +- .../matrixMul_kernel_64_ptxdump.c | 3055 +++++------ .../matrixMul_kernel_64_ptxdump.h | 3 +- .../matrixMul_nvrtc/matrixMul.cpp | 315 +- .../matrixMul_nvrtc/matrixMul_kernel.cu | 115 +- Samples/0_Introduction/mergeSort/bitonic.cu | 402 +- Samples/0_Introduction/mergeSort/main.cpp | 136 +- Samples/0_Introduction/mergeSort/mergeSort.cu | 826 +-- .../mergeSort/mergeSort_common.h | 20 +- .../mergeSort/mergeSort_host.cpp | 520 +- .../mergeSort/mergeSort_validate.cpp | 144 +- .../simpleAWBarrier/simpleAWBarrier.cu | 357 +- .../simpleAssert/simpleAssert.cu | 92 +- .../simpleAssert_nvrtc/simpleAssert.cpp | 84 +- .../simpleAssert_nvrtc/simpleAssert_kernel.cu | 7 +- .../simpleAtomicIntrinsics.cu | 100 +- .../simpleAtomicIntrinsics_cpu.cpp | 245 +- .../simpleAtomicIntrinsics_kernel.cuh | 59 +- .../simpleAtomicIntrinsics.cpp | 122 +- .../simpleAtomicIntrinsics_cpu.cpp | 245 +- .../simpleAtomicIntrinsics_kernel.cuh | 59 +- .../simpleAttributes/simpleAttributes.cu | 268 +- .../0_Introduction/simpleCUDA2GL/README.md | 1 - Samples/0_Introduction/simpleCUDA2GL/main.cpp | 807 +-- .../simpleCUDA2GL/simpleCUDA2GL.cu | 38 +- .../simpleCallback/multithreading.cpp | 129 +- .../simpleCallback/multithreading.h | 61 +- .../simpleCallback/simpleCallback.cu | 289 +- .../simpleCooperativeGroups.cu | 174 +- .../simpleCubemapTexture.cu | 358 +- .../simpleDrvRuntime/simpleDrvRuntime.cpp | 276 +- .../simpleDrvRuntime/vectorAdd_kernel.cu | 9 +- .../simpleHyperQ/simpleHyperQ.cu | 326 +- Samples/0_Introduction/simpleIPC/README.md | 1 - Samples/0_Introduction/simpleIPC/simpleIPC.cu | 486 +- .../simpleLayeredTexture.cu | 297 +- .../0_Introduction/simpleMPI/simpleMPI.cpp | 131 +- Samples/0_Introduction/simpleMPI/simpleMPI.cu | 92 +- Samples/0_Introduction/simpleMPI/simpleMPI.h | 27 +- .../simpleMultiCopy/simpleMultiCopy.cu | 439 +- .../simpleMultiGPU/simpleMultiGPU.cu | 298 +- .../simpleMultiGPU/simpleMultiGPU.h | 28 +- .../simpleOccupancy/simpleOccupancy.cu | 238 +- Samples/0_Introduction/simpleP2P/README.md | 1 - Samples/0_Introduction/simpleP2P/simpleP2P.cu | 407 +- .../simplePitchLinearTexture.cu | 396 +- .../simplePrintf/simplePrintf.cu | 51 +- .../simpleStreams/simpleStreams.cu | 587 +- .../simpleSurfaceWrite/simpleSurfaceWrite.cu | 323 +- .../simpleTemplates/sharedmem.cuh | 162 +- .../simpleTemplates/simpleTemplates.cu | 297 +- 
.../simpleTexture/simpleTexture.cu | 265 +- .../0_Introduction/simpleTexture3D/README.md | 1 - .../simpleTexture3D/simpleTexture3D.cpp | 421 +- .../simpleTexture3D/simpleTexture3D_kernel.cu | 160 +- .../simpleTextureDrv/simpleTextureDrv.cpp | 451 +- .../simpleTextureDrv/simpleTexture_kernel.cu | 29 +- .../simpleVoteIntrinsics.cu | 402 +- .../simpleVote_kernel.cuh | 47 +- .../simpleZeroCopy/simpleZeroCopy.cu | 283 +- .../systemWideAtomics/README.md | 1 - .../systemWideAtomics/systemWideAtomics.cu | 475 +- Samples/0_Introduction/template/template.cu | 162 +- .../0_Introduction/template/template_cpu.cpp | 14 +- Samples/0_Introduction/vectorAdd/vectorAdd.cu | 270 +- .../vectorAddDrv/vectorAddDrv.cpp | 236 +- .../vectorAddDrv/vectorAdd_kernel.cu | 9 +- .../0_Introduction/vectorAddMMAP/README.md | 1 - .../vectorAddMMAP/multidevicealloc_memmap.cpp | 298 +- .../vectorAddMMAP/multidevicealloc_memmap.hpp | 10 +- .../vectorAddMMAP/vectorAddMMAP.cpp | 175 +- .../vectorAddMMAP/vectorAdd_kernel.cu | 9 +- .../vectorAdd_nvrtc/vectorAdd.cpp | 188 +- .../vectorAdd_nvrtc/vectorAdd_kernel.cu | 12 +- .../bandwidthTest/bandwidthTest.cu | 1526 +++--- .../1_Utilities/deviceQuery/deviceQuery.cpp | 487 +- .../deviceQueryDrv/deviceQueryDrv.cpp | 593 +- Samples/1_Utilities/topologyQuery/README.md | 1 - .../topologyQuery/topologyQuery.cu | 68 +- .../EGLStream_CUDA_CrossGPU/README.md | 1 - .../EGLStream_CUDA_CrossGPU/cuda_consumer.cpp | 378 +- .../EGLStream_CUDA_CrossGPU/cuda_consumer.h | 48 +- .../EGLStream_CUDA_CrossGPU/cuda_producer.cpp | 459 +- .../EGLStream_CUDA_CrossGPU/cuda_producer.h | 57 +- .../eglstrm_common.cpp | 615 +-- .../EGLStream_CUDA_CrossGPU/eglstrm_common.h | 70 +- .../EGLStream_CUDA_CrossGPU/helper.h | 323 +- .../EGLStream_CUDA_CrossGPU/kernel.cu | 178 +- .../EGLStream_CUDA_CrossGPU/main.cpp | 664 ++- .../EGLStream_CUDA_Interop/README.md | 1 - .../EGLStream_CUDA_Interop/cuda_consumer.cpp | 532 +- .../EGLStream_CUDA_Interop/cuda_consumer.h | 31 +- .../EGLStream_CUDA_Interop/cuda_producer.cpp | 676 +-- .../EGLStream_CUDA_Interop/cuda_producer.h | 49 +- .../EGLStream_CUDA_Interop/eglstrm_common.cpp | 159 +- .../EGLStream_CUDA_Interop/eglstrm_common.h | 89 +- .../EGLStream_CUDA_Interop/main.cpp | 329 +- .../FunctionPointers/FunctionPointers.cpp | 710 ++- .../FunctionPointers_kernels.cu | 534 +- .../FunctionPointers_kernels.h | 18 +- .../FunctionPointers/README.md | 1 - .../MC_EstimatePiInlineP/inc/cudasharedmem.h | 36 +- .../MC_EstimatePiInlineP/inc/piestimator.h | 21 +- .../MC_EstimatePiInlineP/inc/test.h | 33 +- .../MC_EstimatePiInlineP/src/main.cpp | 363 +- .../MC_EstimatePiInlineP/src/piestimator.cu | 368 +- .../MC_EstimatePiInlineP/src/test.cpp | 125 +- .../MC_EstimatePiInlineQ/inc/cudasharedmem.h | 36 +- .../MC_EstimatePiInlineQ/inc/piestimator.h | 19 +- .../MC_EstimatePiInlineQ/inc/test.h | 29 +- .../MC_EstimatePiInlineQ/src/main.cpp | 331 +- .../MC_EstimatePiInlineQ/src/piestimator.cu | 567 +- .../MC_EstimatePiInlineQ/src/test.cpp | 124 +- .../MC_EstimatePiP/inc/cudasharedmem.h | 36 +- .../MC_EstimatePiP/inc/piestimator.h | 21 +- .../MC_EstimatePiP/inc/test.h | 33 +- .../MC_EstimatePiP/src/main.cpp | 359 +- .../MC_EstimatePiP/src/piestimator.cu | 392 +- .../MC_EstimatePiP/src/test.cpp | 121 +- .../MC_EstimatePiQ/inc/cudasharedmem.h | 36 +- .../MC_EstimatePiQ/inc/piestimator.h | 19 +- .../MC_EstimatePiQ/inc/test.h | 29 +- .../MC_EstimatePiQ/src/main.cpp | 323 +- .../MC_EstimatePiQ/src/piestimator.cu | 419 +- .../MC_EstimatePiQ/src/test.cpp | 123 +- .../MC_SingleAsianOptionP/inc/asianoption.h | 30 
+- .../MC_SingleAsianOptionP/inc/cudasharedmem.h | 36 +- .../MC_SingleAsianOptionP/inc/pricingengine.h | 21 +- .../MC_SingleAsianOptionP/inc/test.h | 35 +- .../MC_SingleAsianOptionP/src/main.cpp | 361 +- .../src/pricingengine.cu | 491 +- .../MC_SingleAsianOptionP/src/test.cpp | 171 +- Samples/2_Concepts_and_Techniques/README.md | 3 +- .../boxFilter/README.md | 1 - .../boxFilter/boxFilter.cpp | 789 +-- .../boxFilter/boxFilter_cpu.cpp | 145 +- .../boxFilter/boxFilter_kernel.cu | 605 +- .../convolutionSeparable.cu | 222 +- .../convolutionSeparable_common.h | 13 +- .../convolutionSeparable_gold.cpp | 49 +- .../convolutionSeparable/main.cpp | 203 +- .../convolutionTexture/convolutionTexture.cu | 123 +- .../convolutionTexture_common.h | 15 +- .../convolutionTexture_gold.cpp | 57 +- .../convolutionTexture/main.cpp | 245 +- .../dct8x8/BmpUtil.cpp | 368 +- .../dct8x8/BmpUtil.h | 122 +- .../2_Concepts_and_Techniques/dct8x8/Common.h | 49 +- .../dct8x8/DCT8x8_Gold.cpp | 359 +- .../dct8x8/DCT8x8_Gold.h | 15 +- .../dct8x8/dct8x8.cu | 853 +-- .../dct8x8/dct8x8_kernel1.cuh | 230 +- .../dct8x8/dct8x8_kernel2.cuh | 256 +- .../dct8x8/dct8x8_kernel_quantization.cuh | 104 +- .../dct8x8/dct8x8_kernel_short.cuh | 634 ++- .../eigenvalues/bisect_kernel_large.cuh | 1332 +++-- .../eigenvalues/bisect_kernel_large_multi.cuh | 325 +- .../eigenvalues/bisect_kernel_large_onei.cuh | 177 +- .../eigenvalues/bisect_kernel_small.cuh | 347 +- .../eigenvalues/bisect_large.cu | 502 +- .../eigenvalues/bisect_large.cuh | 91 +- .../eigenvalues/bisect_small.cu | 176 +- .../eigenvalues/bisect_small.cuh | 86 +- .../eigenvalues/bisect_util.cu | 585 +- .../eigenvalues/config.h | 4 +- .../eigenvalues/gerschgorin.cpp | 62 +- .../eigenvalues/gerschgorin.h | 5 +- .../eigenvalues/main.cu | 350 +- .../eigenvalues/matlab.cpp | 35 +- .../eigenvalues/matlab.h | 65 +- .../eigenvalues/structs.h | 143 +- .../eigenvalues/util.h | 57 +- .../histogram/histogram256.cu | 162 +- .../histogram/histogram64.cu | 193 +- .../histogram/histogram_common.h | 13 +- .../histogram/histogram_gold.cpp | 47 +- .../histogram/main.cpp | 308 +- .../imageDenoising/bmploader.cpp | 165 +- .../imageDenoising/imageDenoising.cu | 58 +- .../imageDenoising/imageDenoising.h | 45 +- .../imageDenoising/imageDenoisingGL.cpp | 751 ++- .../imageDenoising_copy_kernel.cuh | 32 +- .../imageDenoising_knn_kernel.cuh | 176 +- .../imageDenoising_nlm2_kernel.cuh | 250 +- .../imageDenoising_nlm_kernel.cuh | 187 +- .../inlinePTX/inlinePTX.cu | 31 +- .../inlinePTX_nvrtc/inlinePTX.cpp | 124 +- .../inlinePTX_nvrtc/inlinePTX_kernel.cu | 17 +- .../interval/boost/config.hpp | 35 +- .../boost/config/abi/borland_prefix.hpp | 11 +- .../boost/config/abi/borland_suffix.hpp | 13 +- .../interval/boost/config/abi/msvc_prefix.hpp | 14 +- .../interval/boost/config/abi/msvc_suffix.hpp | 8 +- .../interval/boost/config/abi_prefix.hpp | 11 +- .../interval/boost/config/abi_suffix.hpp | 12 +- .../interval/boost/config/auto_link.hpp | 311 +- .../boost/config/compiler/borland.hpp | 211 +- .../boost/config/compiler/codegear.hpp | 93 +- .../interval/boost/config/compiler/comeau.hpp | 54 +- .../boost/config/compiler/common_edg.hpp | 57 +- .../boost/config/compiler/compaq_cxx.hpp | 9 +- .../boost/config/compiler/digitalmars.hpp | 14 +- .../interval/boost/config/compiler/gcc.hpp | 194 +- .../boost/config/compiler/gcc_xml.hpp | 18 +- .../boost/config/compiler/greenhills.hpp | 16 +- .../interval/boost/config/compiler/hp_acc.hpp | 80 +- .../interval/boost/config/compiler/intel.hpp | 98 +- 
.../interval/boost/config/compiler/kai.hpp | 33 +- .../boost/config/compiler/metrowerks.hpp | 137 +- .../interval/boost/config/compiler/mpw.hpp | 52 +- .../interval/boost/config/compiler/pgi.hpp | 11 +- .../boost/config/compiler/sgi_mipspro.hpp | 8 +- .../boost/config/compiler/sunpro_cc.hpp | 124 +- .../interval/boost/config/compiler/vacpp.hpp | 41 +- .../boost/config/compiler/visualc.hpp | 244 +- .../interval/boost/config/no_tr1/cmath.hpp | 20 +- .../interval/boost/config/no_tr1/complex.hpp | 20 +- .../boost/config/no_tr1/functional.hpp | 20 +- .../interval/boost/config/no_tr1/memory.hpp | 20 +- .../interval/boost/config/no_tr1/utility.hpp | 20 +- .../interval/boost/config/platform/aix.hpp | 12 +- .../boost/config/platform/amigaos.hpp | 8 +- .../interval/boost/config/platform/beos.hpp | 11 +- .../interval/boost/config/platform/bsd.hpp | 34 +- .../interval/boost/config/platform/cygwin.hpp | 31 +- .../interval/boost/config/platform/hpux.hpp | 69 +- .../interval/boost/config/platform/irix.hpp | 17 +- .../interval/boost/config/platform/linux.hpp | 94 +- .../interval/boost/config/platform/macos.hpp | 69 +- .../interval/boost/config/platform/qnxnto.hpp | 17 +- .../boost/config/platform/solaris.hpp | 14 +- .../boost/config/platform/vxworks.hpp | 9 +- .../interval/boost/config/platform/win32.hpp | 33 +- .../interval/boost/config/posix_features.hpp | 142 +- .../boost/config/requires_threads.hpp | 54 +- .../boost/config/select_compiler_config.hpp | 85 +- .../boost/config/select_platform_config.hpp | 66 +- .../boost/config/select_stdlib_config.hpp | 37 +- .../boost/config/stdlib/dinkumware.hpp | 151 +- .../interval/boost/config/stdlib/libcomo.hpp | 70 +- .../boost/config/stdlib/libstdcpp3.hpp | 120 +- .../interval/boost/config/stdlib/modena.hpp | 63 +- .../interval/boost/config/stdlib/msl.hpp | 95 +- .../boost/config/stdlib/roguewave.hpp | 136 +- .../interval/boost/config/stdlib/sgi.hpp | 114 +- .../interval/boost/config/stdlib/stlport.hpp | 183 +- .../interval/boost/config/stdlib/vacpp.hpp | 55 +- .../interval/boost/config/suffix.hpp | 391 +- .../interval/boost/config/user.hpp | 87 +- .../interval/boost/config/warning_disable.hpp | 22 +- .../interval/boost/limits.hpp | 190 +- .../interval/boost/numeric/interval.hpp | 21 +- .../interval/boost/numeric/interval/arith.hpp | 505 +- .../boost/numeric/interval/arith2.hpp | 425 +- .../boost/numeric/interval/arith3.hpp | 52 +- .../boost/numeric/interval/checking.hpp | 149 +- .../boost/numeric/interval/compare.hpp | 2 +- .../numeric/interval/compare/certain.hpp | 102 +- .../numeric/interval/compare/explicit.hpp | 194 +- .../interval/compare/lexicographic.hpp | 122 +- .../numeric/interval/compare/possible.hpp | 102 +- .../boost/numeric/interval/compare/set.hpp | 66 +- .../numeric/interval/compare/tribool.hpp | 176 +- .../boost/numeric/interval/constants.hpp | 80 +- .../detail/alpha_rounding_control.hpp | 93 +- .../interval/detail/bcc_rounding_control.hpp | 31 +- .../boost/numeric/interval/detail/bugs.hpp | 79 +- .../interval/detail/c99_rounding_control.hpp | 27 +- .../detail/c99sub_rounding_control.hpp | 24 +- .../numeric/interval/detail/division.hpp | 284 +- .../interval/detail/ia64_rounding_control.hpp | 58 +- .../interval/detail/interval_prototype.hpp | 16 +- .../interval/detail/msvc_rounding_control.hpp | 169 +- .../interval/detail/ppc_rounding_control.hpp | 74 +- .../detail/sparc_rounding_control.hpp | 105 +- .../numeric/interval/detail/test_input.hpp | 61 +- .../interval/detail/x86_rounding_control.hpp | 82 +- .../detail/x86gcc_rounding_control.hpp | 25 
+- .../boost/numeric/interval/ext/integer.hpp | 40 +- .../ext/x86_fast_rounding_control.hpp | 48 +- .../boost/numeric/interval/hw_rounding.hpp | 41 +- .../boost/numeric/interval/interval.hpp | 616 ++- .../interval/boost/numeric/interval/io.hpp | 15 +- .../boost/numeric/interval/limits.hpp | 33 +- .../boost/numeric/interval/policies.hpp | 45 +- .../boost/numeric/interval/rounded_arith.hpp | 208 +- .../boost/numeric/interval/rounded_transc.hpp | 248 +- .../boost/numeric/interval/rounding.hpp | 78 +- .../boost/numeric/interval/transc.hpp | 299 +- .../boost/numeric/interval/utility.hpp | 418 +- .../interval/cpu_interval.h | 393 +- .../interval/cuda_interval.h | 443 +- .../interval/cuda_interval_lib.h | 459 +- .../interval/cuda_interval_rounded_arith.h | 160 +- .../interval/interval.cu | 167 +- .../interval/interval.h | 28 +- .../particles/particleSystem.cpp | 529 +- .../particles/particleSystem.cuh | 64 +- .../particles/particleSystem.h | 154 +- .../particles/particleSystem_cuda.cu | 295 +- .../particles/particles.cpp | 1036 ++-- .../particles/particles_kernel.cuh | 35 +- .../particles/particles_kernel_impl.cuh | 426 +- .../particles/render_particles.cpp | 184 +- .../particles/render_particles.h | 66 +- .../particles/shaders.cpp | 41 +- .../radixSortThrust/doc/readme.txt | 2 +- .../radixSortThrust/radixSortThrust.cu | 318 +- .../reduction/reduction.cpp | 727 +-- .../reduction/reduction.h | 4 +- .../reduction/reduction_kernel.cu | 1356 +++-- .../reductionMultiBlockCG.cu | 419 +- .../scalarProd/scalarProd.cu | 164 +- .../scalarProd/scalarProd_cpu.cpp | 20 +- .../scalarProd/scalarProd_kernel.cuh | 83 +- .../2_Concepts_and_Techniques/scan/main.cpp | 264 +- .../2_Concepts_and_Techniques/scan/scan.cu | 295 +- .../scan/scan_common.h | 9 +- .../scan/scan_gold.cpp | 13 +- .../segmentationTreeThrust/common.cuh | 7 +- .../segmentationTreeThrust/kernels.cuh | 139 +- .../segmentationTree.cu | 1408 +++-- .../shfl_scan/shfl_integral_image.cuh | 449 +- .../shfl_scan/shfl_scan.cu | 563 +- .../shfl_scan/util.h | 54 +- .../sortingNetworks/bitonicSort.cu | 361 +- .../sortingNetworks/main.cpp | 187 +- .../sortingNetworks/oddEvenMergeSort.cu | 220 +- .../sortingNetworks_common.cuh | 24 +- .../sortingNetworks/sortingNetworks_common.h | 22 +- .../sortingNetworks_validate.cpp | 166 +- .../streamOrderedAllocation.cu | 306 +- .../streamOrderedAllocationIPC/README.md | 1 - .../streamOrderedAllocationIPC.cu | 707 ++- .../streamOrderedAllocationP2P.cu | 355 +- .../threadFenceReduction.cu | 600 +- .../threadFenceReduction_kernel.cuh | 543 +- .../threadMigration/threadMigration.cpp | 581 +- .../threadMigration/threadMigration_kernel.cu | 4 +- Samples/3_CUDA_Features/README.md | 1 - .../StreamPriorities/README.md | 1 - .../StreamPriorities/StreamPriorities.cu | 244 +- .../bf16TensorCoreGemm/bf16TensorCoreGemm.cu | 330 +- .../binaryPartitionCG/binaryPartitionCG.cu | 157 +- .../3_CUDA_Features/bindlessTexture/README.md | 1 - .../bindlessTexture/bindlessTexture.cpp | 513 +- .../bindlessTexture/bindlessTexture.h | 37 +- .../bindlessTexture/bindlessTexture_kernel.cu | 506 +- .../cdpAdvancedQuicksort.cu | 844 ++- .../cdpAdvancedQuicksort/cdpBitonicSort.cu | 328 +- .../cdpAdvancedQuicksort/cdpQuicksort.h | 64 +- .../cdpBezierTessellation/BezierLineCDP.cu | 264 +- .../cdpQuadtree/cdpQuadtree.cu | 1027 ++-- .../cdpSimplePrint/cdpSimplePrint.cu | 182 +- .../cdpSimpleQuicksort/cdpSimpleQuicksort.cu | 314 +- .../cudaCompressibleMemory/compMalloc.cpp | 27 +- .../cudaCompressibleMemory/saxpy.cu | 100 +- 
.../cudaTensorCoreGemm/cudaTensorCoreGemm.cu | 821 ++- .../dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu | 332 +- .../globalToShmemAsyncCopy.cu | 1483 +++-- .../graphConditionalNodes.cu | 135 +- .../graphMemoryFootprint.cu | 603 +- .../graphMemoryNodes/graphMemoryNodes.cu | 711 +-- .../immaTensorCoreGemm/immaTensorCoreGemm.cu | 813 ++- .../jacobiCudaGraphs/jacobi.cu | 620 +-- .../3_CUDA_Features/jacobiCudaGraphs/jacobi.h | 2 +- .../3_CUDA_Features/jacobiCudaGraphs/main.cpp | 285 +- .../memMapIPCDrv/memMapIpc.cpp | 948 ++-- .../3_CUDA_Features/newdelete/container.hpp | 100 +- .../3_CUDA_Features/newdelete/newdelete.cu | 336 +- Samples/3_CUDA_Features/ptxjit/ptxjit.cpp | 364 +- .../3_CUDA_Features/ptxjit/ptxjit_kernel.cu | 7 +- .../simpleCudaGraphs/simpleCudaGraphs.cu | 650 ++- .../tf32TensorCoreGemm/tf32TensorCoreGemm.cu | 344 +- .../warpAggregatedAtomicsCG.cu | 424 +- .../FilterBorderControlNPP.cpp | 1000 ++-- .../MersenneTwister.cpp | 251 +- Samples/4_CUDA_Libraries/README.md | 1 - .../batchCUBLAS/batchCUBLAS.cpp | 1040 ++-- .../batchCUBLAS/batchCUBLAS.h | 152 +- .../boxFilterNPP/boxFilterNPP.cpp | 323 +- .../cannyEdgeDetectorNPP.cpp | 398 +- .../conjugateGradient/main.cpp | 426 +- .../conjugateGradientCudaGraphs.cu | 674 +-- .../conjugateGradientMultiBlockCG.cu | 797 +-- .../conjugateGradientMultiDeviceCG.cu | 1267 ++--- .../conjugateGradientPrecond/main.cpp | 468 +- .../conjugateGradientUM/main.cpp | 429 +- .../cuSolverDn_LinearSolver.cpp | 891 +-- .../cuSolverDn_LinearSolver/mmio.c | 434 +- .../cuSolverDn_LinearSolver/mmio.h | 181 +- .../cuSolverDn_LinearSolver/mmio_wrapper.cpp | 558 +- .../cuSolverRf/cuSolverRf.cpp | 1550 +++--- Samples/4_CUDA_Libraries/cuSolverRf/mmio.c | 403 +- Samples/4_CUDA_Libraries/cuSolverRf/mmio.h | 181 +- .../cuSolverRf/mmio_wrapper.cpp | 807 ++- .../cuSolverSp_LinearSolver.cpp | 1188 ++-- .../cuSolverSp_LinearSolver/mmio.c | 414 +- .../cuSolverSp_LinearSolver/mmio.h | 181 +- .../cuSolverSp_LinearSolver/mmio_wrapper.cpp | 558 +- .../cuSolverSp_LowlevelCholesky.cpp | 414 +- .../cuSolverSp_LowlevelCholesky/mmio.c | 414 +- .../cuSolverSp_LowlevelCholesky/mmio.h | 181 +- .../mmio_wrapper.cpp | 558 +- .../cuSolverSp_LowlevelQR.cpp | 839 +-- .../cuSolverSp_LowlevelQR/mmio.c | 414 +- .../cuSolverSp_LowlevelQR/mmio.h | 181 +- .../cuSolverSp_LowlevelQR/mmio_wrapper.cpp | 807 ++- Samples/4_CUDA_Libraries/cudaNvSci/README.md | 1 - .../4_CUDA_Libraries/cudaNvSci/cudaNvSci.cpp | 1038 ++-- .../4_CUDA_Libraries/cudaNvSci/cudaNvSci.h | 113 +- .../cudaNvSci/imageKernels.cu | 127 +- Samples/4_CUDA_Libraries/cudaNvSci/main.cpp | 111 +- .../freeImageInteropNPP.cpp | 477 +- .../histEqualizationNPP.cpp | 427 +- Samples/4_CUDA_Libraries/jitLto/jitLto.cpp | 382 +- .../lineOfSight/lineOfSight.cu | 409 +- .../matrixMulCUBLAS/matrixMulCUBLAS.cpp | 433 +- Samples/4_CUDA_Libraries/nvJPEG/nvJPEG.cpp | 1069 ++-- .../nvJPEG_encoder/nvJPEG_encoder.cpp | 470 +- Samples/4_CUDA_Libraries/oceanFFT/README.md | 1 - .../4_CUDA_Libraries/oceanFFT/oceanFFT.cpp | 1120 ++-- .../oceanFFT/oceanFFT_kernel.cu | 165 +- Samples/4_CUDA_Libraries/randomFog/README.md | 1 - .../4_CUDA_Libraries/randomFog/randomFog.cpp | 1040 ++-- Samples/4_CUDA_Libraries/randomFog/rng.cpp | 425 +- Samples/4_CUDA_Libraries/randomFog/rng.h | 62 +- .../simpleCUBLAS/simpleCUBLAS.cpp | 320 +- .../simpleCUBLASXT/simpleCUBLASXT.cpp | 403 +- .../simpleCUBLAS_LU/simpleCUBLAS_LU.cpp | 529 +- .../simpleCUFFT/simpleCUFFT.cu | 315 +- .../simpleCUFFT_2d_MGPU.cu | 537 +- .../simpleCUFFT_MGPU/simpleCUFFT_MGPU.cu | 511 +- .../simpleCUFFT_callback.cu 
| 383 +- .../watershedSegmentationNPP.cpp | 462 +- .../BlackScholes/BlackScholes.cu | 331 +- .../BlackScholes/BlackScholes_gold.cpp | 86 +- .../BlackScholes/BlackScholes_kernel.cuh | 125 +- .../BlackScholes_nvrtc/BlackScholes.cpp | 374 +- .../BlackScholes_nvrtc/BlackScholes_gold.cpp | 86 +- .../BlackScholes_kernel.cuh | 119 +- Samples/5_Domain_Specific/FDTD3d/inc/FDTD3d.h | 10 +- .../5_Domain_Specific/FDTD3d/inc/FDTD3dGPU.h | 21 +- .../FDTD3d/inc/FDTD3dGPUKernel.cuh | 176 +- .../FDTD3d/inc/FDTD3dReference.h | 35 +- .../5_Domain_Specific/FDTD3d/src/FDTD3d.cpp | 323 +- .../5_Domain_Specific/FDTD3d/src/FDTD3dGPU.cu | 335 +- .../FDTD3d/src/FDTD3dReference.cpp | 277 +- .../HSOpticalFlow/addKernel.cuh | 20 +- .../5_Domain_Specific/HSOpticalFlow/common.h | 34 +- .../HSOpticalFlow/derivativesKernel.cuh | 145 +- .../HSOpticalFlow/downscaleKernel.cuh | 75 +- .../HSOpticalFlow/flowCUDA.cu | 309 +- .../HSOpticalFlow/flowCUDA.h | 23 +- .../HSOpticalFlow/flowGold.cpp | 581 +- .../HSOpticalFlow/flowGold.h | 23 +- .../5_Domain_Specific/HSOpticalFlow/main.cpp | 254 +- .../HSOpticalFlow/solverKernel.cuh | 212 +- .../HSOpticalFlow/upscaleKernel.cuh | 76 +- .../HSOpticalFlow/warpingKernel.cuh | 71 +- .../Mandelbrot/Mandelbrot.cpp | 1801 +++--- .../Mandelbrot/Mandelbrot_cuda.cu | 719 ++- .../Mandelbrot/Mandelbrot_gold.cpp | 440 +- .../Mandelbrot/Mandelbrot_gold.h | 80 +- .../Mandelbrot/Mandelbrot_kernel.cuh | 644 +-- .../Mandelbrot/Mandelbrot_kernel.h | 48 +- .../5_Domain_Specific/Mandelbrot/README.md | 1 - .../MonteCarloMultiGPU/MonteCarloMultiGPU.cpp | 702 ++- .../MonteCarloMultiGPU/MonteCarlo_common.h | 83 +- .../MonteCarloMultiGPU/MonteCarlo_gold.cpp | 143 +- .../MonteCarloMultiGPU/MonteCarlo_kernel.cu | 258 +- .../MonteCarlo_reduction.cuh | 58 +- .../MonteCarloMultiGPU/multithreading.cpp | 41 +- .../MonteCarloMultiGPU/multithreading.h | 21 +- .../MonteCarloMultiGPU/realtype.h | 2 +- .../NV12toBGRandResize/bgr_resize.cu | 142 +- .../NV12toBGRandResize/nv12_resize.cu | 130 +- .../NV12toBGRandResize/nv12_to_bgr_planar.cu | 191 +- .../NV12toBGRandResize/resize_convert.h | 53 +- .../resize_convert_main.cpp | 610 ++- .../NV12toBGRandResize/utils.cu | 172 +- .../NV12toBGRandResize/utils.h | 7 +- Samples/5_Domain_Specific/README.md | 3 +- .../5_Domain_Specific/SobelFilter/README.md | 1 - .../SobelFilter/SobelFilter.cpp | 609 ++- .../SobelFilter/SobelFilter_kernels.cu | 379 +- .../SobelFilter/SobelFilter_kernels.h | 9 +- Samples/5_Domain_Specific/SobolQRNG/sobol.cpp | 423 +- .../SobolQRNG/sobol_gold.cpp | 179 +- .../5_Domain_Specific/SobolQRNG/sobol_gold.h | 3 +- .../5_Domain_Specific/SobolQRNG/sobol_gpu.cu | 243 +- .../5_Domain_Specific/SobolQRNG/sobol_gpu.h | 3 +- .../SobolQRNG/sobol_primitives.cpp | 3 +- .../SobolQRNG/sobol_primitives.h | 11 +- .../bicubicTexture/README.md | 1 - .../bicubicTexture/bicubicTexture.cpp | 921 ++-- .../bicubicTexture/bicubicTexture_cuda.cu | 130 +- .../bicubicTexture/bicubicTexture_kernel.cuh | 422 +- .../bilateralFilter/README.md | 1 - .../bilateralFilter/bilateralFilter.cpp | 858 +-- .../bilateralFilter/bilateralFilter_cpu.cpp | 229 +- .../bilateralFilter/bilateral_kernel.cu | 268 +- .../bilateralFilter/bmploader.cpp | 170 +- .../binomialOptions/binomialOptions.cpp | 195 +- .../binomialOptions/binomialOptions_common.h | 13 +- .../binomialOptions/binomialOptions_gold.cpp | 134 +- .../binomialOptions/binomialOptions_kernel.cu | 152 +- .../binomialOptions/realtype.h | 2 +- .../binomialOptions_nvrtc/binomialOptions.cpp | 190 +- .../binomialOptions_common.h | 13 +- 
.../binomialOptions_gold.cpp | 134 +- .../binomialOptions_gpu.cpp | 133 +- .../binomialOptions_kernel.cu | 85 +- .../binomialOptions_nvrtc/common_gpu_header.h | 10 +- .../binomialOptions_nvrtc/realtype.h | 2 +- .../convolutionFFT2D/convolutionFFT2D.cu | 387 +- .../convolutionFFT2D/convolutionFFT2D.cuh | 543 +- .../convolutionFFT2D_common.h | 59 +- .../convolutionFFT2D_gold.cpp | 50 +- .../convolutionFFT2D/main.cpp | 898 ++- .../5_Domain_Specific/dwtHaar1D/dwtHaar1D.cu | 487 +- .../dwtHaar1D/dwtHaar1D_kernel.cuh | 218 +- Samples/5_Domain_Specific/dxtc/CudaMath.h | 125 +- Samples/5_Domain_Specific/dxtc/dds.h | 82 +- Samples/5_Domain_Specific/dxtc/dxtc.cu | 1132 ++-- Samples/5_Domain_Specific/dxtc/permutations.h | 190 +- .../fastWalshTransform/fastWalshTransform.cu | 155 +- .../fastWalshTransform_gold.cpp | 84 +- .../fastWalshTransform_kernel.cuh | 218 +- Samples/5_Domain_Specific/fluidsGL/defines.h | 27 +- .../5_Domain_Specific/fluidsGL/fluidsGL.cpp | 637 ++- .../fluidsGL/fluidsGL_kernels.cu | 432 +- .../fluidsGL/fluidsGL_kernels.cuh | 17 +- .../fluidsGL/fluidsGL_kernels.h | 17 +- .../5_Domain_Specific/marchingCubes/README.md | 1 - .../5_Domain_Specific/marchingCubes/defines.h | 2 +- .../marchingCubes/marchingCubes.cpp | 1205 ++-- .../marchingCubes/marchingCubes_kernel.cu | 1010 ++-- .../5_Domain_Specific/marchingCubes/tables.h | 448 +- Samples/5_Domain_Specific/nbody/bodysystem.h | 365 +- .../5_Domain_Specific/nbody/bodysystemcpu.h | 58 +- .../nbody/bodysystemcpu_impl.h | 378 +- .../5_Domain_Specific/nbody/bodysystemcuda.cu | 338 +- .../5_Domain_Specific/nbody/bodysystemcuda.h | 98 +- .../nbody/bodysystemcuda_impl.h | 660 +-- Samples/5_Domain_Specific/nbody/nbody.cpp | 2140 ++++---- .../nbody/render_particles.cpp | 534 +- .../nbody/render_particles.h | 76 +- Samples/5_Domain_Specific/nbody/tipsy.h | 126 +- .../p2pBandwidthLatencyTest.cu | 1230 ++--- .../5_Domain_Specific/postProcessGL/README.md | 1 - .../5_Domain_Specific/postProcessGL/main.cpp | 1140 ++-- .../postProcessGL/postProcessGL.cu | 281 +- .../quasirandomGenerator.cpp | 215 +- .../quasirandomGenerator_common.h | 2 +- .../quasirandomGenerator_gold.cpp | 457 +- .../quasirandomGenerator_kernel.cu | 199 +- .../quasirandomGenerator.cpp | 213 +- .../quasirandomGenerator_common.h | 2 +- .../quasirandomGenerator_gold.cpp | 463 +- .../quasirandomGenerator_gpu.cuh | 85 +- .../quasirandomGenerator_kernel.cu | 180 +- .../recursiveGaussian/README.md | 1 - .../recursiveGaussian/recursiveGaussian.cpp | 611 +-- .../recursiveGaussian_cuda.cu | 101 +- .../recursiveGaussian_kernel.cuh | 254 +- .../5_Domain_Specific/simpleD3D11/README.md | 1 - .../simpleD3D11/ShaderStructs.h | 15 +- .../simpleD3D11/simpleD3D11.cpp | 380 +- .../simpleD3D11/sinewave_cuda.cu | 74 +- .../simpleD3D11/sinewave_cuda.h | 16 +- .../simpleD3D11Texture/README.md | 1 - .../d3dx11effect/d3dx11effect.h | 642 ++- .../simpleD3D11Texture/simpleD3D11Texture.cpp | 1899 +++---- .../simpleD3D11Texture/texture_2d.cu | 58 +- .../simpleD3D11Texture/texture_3d.cu | 70 +- .../simpleD3D11Texture/texture_cube.cu | 79 +- .../simpleD3D12/DX12CudaSample.cpp | 126 +- .../simpleD3D12/DX12CudaSample.h | 64 +- .../simpleD3D12/DXSampleHelper.h | 184 +- .../5_Domain_Specific/simpleD3D12/Main.cpp | 10 +- .../5_Domain_Specific/simpleD3D12/README.md | 1 - .../simpleD3D12/ShaderStructs.h | 20 +- .../simpleD3D12/Win32Application.cpp | 143 +- .../simpleD3D12/Win32Application.h | 18 +- .../5_Domain_Specific/simpleD3D12/d3dx12.h | 1865 +++---- .../simpleD3D12/simpleD3D12.cpp | 869 ++- 
.../simpleD3D12/simpleD3D12.h | 146 +- .../simpleD3D12/sinewave_cuda.cu | 66 +- .../5_Domain_Specific/simpleD3D12/stdafx.h | 11 +- Samples/5_Domain_Specific/simpleGL/README.md | 1 - .../5_Domain_Specific/simpleGL/simpleGL.cu | 207 +- .../simpleVulkan/Build_instructions.txt | 2 +- .../5_Domain_Specific/simpleVulkan/README.md | 1 - .../simpleVulkan/SineWaveSimulation.cu | 168 +- .../simpleVulkan/SineWaveSimulation.h | 32 +- .../simpleVulkan/VulkanBaseApp.cpp | 3165 ++++++----- .../simpleVulkan/VulkanBaseApp.h | 219 +- .../5_Domain_Specific/simpleVulkan/linmath.h | 287 +- .../5_Domain_Specific/simpleVulkan/main.cpp | 883 ++- .../simpleVulkanMMAP/Build_instructions.txt | 2 +- .../simpleVulkanMMAP/MonteCarloPi.cu | 409 +- .../simpleVulkanMMAP/MonteCarloPi.h | 83 +- .../simpleVulkanMMAP/README.md | 1 - .../simpleVulkanMMAP/VulkanBaseApp.cpp | 3067 +++++------ .../simpleVulkanMMAP/VulkanBaseApp.h | 217 +- .../simpleVulkanMMAP/VulkanCudaInterop.h | 66 +- .../simpleVulkanMMAP/main.cpp | 558 +- .../smokeParticles/GLSLProgram.cpp | 351 +- .../smokeParticles/GLSLProgram.h | 53 +- .../smokeParticles/GpuArray.h | 424 +- .../smokeParticles/ParticleSystem.cpp | 442 +- .../smokeParticles/ParticleSystem.cuh | 24 +- .../smokeParticles/ParticleSystem.h | 126 +- .../smokeParticles/ParticleSystem_cuda.cu | 193 +- .../smokeParticles/SmokeRenderer.cpp | 857 +-- .../smokeParticles/SmokeRenderer.h | 196 +- .../smokeParticles/SmokeShaders.cpp | 372 +- .../smokeParticles/framebufferObject.cpp | 415 +- .../smokeParticles/framebufferObject.h | 140 +- .../5_Domain_Specific/smokeParticles/nvMath.h | 28 +- .../smokeParticles/nvMatrix.h | 648 +-- .../smokeParticles/nvQuaternion.h | 645 +-- .../smokeParticles/nvVector.h | 1335 ++--- .../smokeParticles/particleDemo.cpp | 1275 ++--- .../smokeParticles/particles_kernel.cuh | 22 +- .../particles_kernel_device.cuh | 105 +- .../smokeParticles/renderbuffer.cpp | 36 +- .../smokeParticles/renderbuffer.h | 27 +- .../stereoDisparity/stereoDisparity.cu | 339 +- .../stereoDisparity_kernel.cuh | 300 +- .../volumeFiltering/README.md | 1 - .../volumeFiltering/volume.cpp | 90 +- .../volumeFiltering/volume.h | 63 +- .../volumeFiltering/volumeFilter.h | 14 +- .../volumeFiltering/volumeFilter_kernel.cu | 144 +- .../volumeFiltering/volumeFiltering.cpp | 515 +- .../volumeFiltering/volumeRender.h | 28 +- .../volumeFiltering/volumeRender_kernel.cu | 986 ++-- .../5_Domain_Specific/volumeRender/README.md | 1 - .../volumeRender/volumeRender.cpp | 781 +-- .../volumeRender/volumeRender_kernel.cu | 496 +- .../vulkanImageCUDA/Build_instructions.txt | 2 +- .../vulkanImageCUDA/README.md | 1 - .../vulkanImageCUDA/linmath.h | 287 +- .../vulkanImageCUDA/vulkanImageCUDA.cu | 4860 ++++++++--------- .../LargeKernelParameter.cu | 205 +- .../UnifiedMemoryPerf/commonDefs.hpp | 61 +- .../UnifiedMemoryPerf/commonKernels.cu | 7 +- .../UnifiedMemoryPerf/helperFunctions.cpp | 484 +- .../UnifiedMemoryPerf/matrixMultiplyPerf.cu | 1076 ++-- .../alignedTypes/alignedTypes.cu | 305 +- .../alignedTypes/doc/alignedTypes.txt | 4 +- .../cudaGraphsPerfScaling/README.md | 1 - .../cudaGraphPerfScaling.cu | 194 +- Samples/6_Performance/transpose/transpose.cu | 839 +-- Samples/7_libNVVM/common/include/DDSWriter.h | 130 +- .../cuda-c-linking/cuda-c-linking.cpp | 456 +- .../7_libNVVM/cuda-c-linking/math-funcs.cu | 77 +- .../cuda-shared-memory/CMakeLists.txt | 2 +- .../7_libNVVM/device-side-launch/README.md | 20 +- Samples/7_libNVVM/device-side-launch/dsl.c | 361 +- Samples/7_libNVVM/ptxgen/ptxgen.c | 392 +- 
Samples/7_libNVVM/simple/simple.c | 375 +- Samples/7_libNVVM/syscalls/CMakeLists.txt | 2 +- Samples/7_libNVVM/uvmlite/README.md | 2 +- Samples/7_libNVVM/uvmlite/uvmlite.c | 463 +- .../EGLSync_CUDAEvent_Interop.cu | 1158 ++-- .../Tegra/EGLSync_CUDAEvent_Interop/README.md | 1 - .../EGLSync_CUDAEvent_Interop/egl_common.h | 23 +- .../graphics_interface.h | 265 +- .../Tegra/cuDLAErrorReporting/README.md | 1 - .../Tegra/cuDLAErrorReporting/main.cu | 730 ++- .../Tegra/cuDLAHybridMode/README.md | 1 - .../Tegra/cuDLAHybridMode/main.cu | 835 ++- .../Tegra/cuDLALayerwiseStatsHybrid/README.md | 1 - .../Tegra/cuDLALayerwiseStatsHybrid/main.cu | 307 +- .../cuDLALayerwiseStatsStandalone/README.md | 1 - .../cuDLALayerwiseStatsStandalone/main.cpp | 588 +- .../Tegra/cuDLAStandaloneMode/README.md | 1 - .../Tegra/cuDLAStandaloneMode/main.cpp | 1888 ++++--- .../Tegra/cudaNvSciBufMultiplanar/README.md | 1 - .../cudaNvSciBufMultiplanar.cpp | 382 +- .../cudaNvSciBufMultiplanar.h | 110 +- .../cudaNvSciBufMultiplanar/imageKernels.cu | 28 +- .../Tegra/cudaNvSciBufMultiplanar/main.cpp | 68 +- .../Tegra/cudaNvSciNvMedia/README.md | 1 - .../Tegra/cudaNvSciNvMedia/cuda_consumer.cu | 608 +-- .../Tegra/cudaNvSciNvMedia/cuda_consumer.h | 55 +- .../Tegra/cudaNvSciNvMedia/main.cpp | 333 +- .../cudaNvSciNvMedia/nvmedia_producer.cpp | 721 +-- .../Tegra/cudaNvSciNvMedia/nvmedia_producer.h | 34 +- .../nvmedia_utils/cmdline.cpp | 126 +- .../cudaNvSciNvMedia/nvmedia_utils/cmdline.h | 68 +- .../nvmedia_utils/config_parser.cpp | 730 +-- .../nvmedia_utils/config_parser.h | 111 +- .../nvmedia_utils/image_utils.cpp | 681 ++- .../nvmedia_utils/image_utils.h | 160 +- .../nvmedia_utils/log_utils.cpp | 100 +- .../nvmedia_utils/log_utils.h | 124 +- .../nvmedia_utils/misc_utils.cpp | 21 +- .../nvmedia_utils/misc_utils.h | 52 +- .../Tegra/cudaNvSciNvMedia/nvsci_setup.cpp | 197 +- .../Tegra/cudaNvSciNvMedia/nvsci_setup.h | 14 +- .../Tegra/fluidsGLES/README.md | 1 - .../Tegra/fluidsGLES/defines.h | 27 +- .../Tegra/fluidsGLES/fluidsGLES.cpp | 381 +- .../Tegra/fluidsGLES/fluidsGLES_kernels.cu | 430 +- .../Tegra/fluidsGLES/fluidsGLES_kernels.cuh | 17 +- .../Tegra/fluidsGLES/fluidsGLES_kernels.h | 19 +- .../Tegra/fluidsGLES/graphics_interface.h | 252 +- .../Tegra/nbody_opengles/README.md | 1 - .../Tegra/nbody_opengles/bodysystem.h | 365 +- .../Tegra/nbody_opengles/bodysystemcpu.h | 58 +- .../Tegra/nbody_opengles/bodysystemcpu_impl.h | 378 +- .../Tegra/nbody_opengles/bodysystemcuda.cu | 337 +- .../Tegra/nbody_opengles/bodysystemcuda.h | 91 +- .../nbody_opengles/bodysystemcuda_impl.h | 565 +- .../Tegra/nbody_opengles/nbody_opengles.cpp | 1822 +++--- .../Tegra/nbody_opengles/render_particles.cpp | 571 +- .../Tegra/nbody_opengles/render_particles.h | 104 +- .../Tegra/nbody_opengles/tipsy.h | 126 +- .../Tegra/simpleGLES/README.md | 1 - .../Tegra/simpleGLES/graphics_interface.c | 272 +- .../Tegra/simpleGLES/simpleGLES.cu | 734 +-- .../Tegra/simpleGLES_EGLOutput/README.md | 1 - .../graphics_interface_egloutput_via_egl.c | 937 ++-- .../simpleGLES_EGLOutput.cu | 612 +-- bin/x86_64/linux/release/APM_BlackScholes.txt | 4 +- .../linux/release/APM_BlackScholes_nvrtc.txt | 4 +- .../linux/release/APM_UnifiedMemoryPerf.txt | 2 +- bin/x86_64/linux/release/APM_batchCUBLAS.txt | 8 +- .../linux/release/APM_binaryPartitionCG.txt | 1 - .../linux/release/APM_binomialOptions.txt | 6 +- .../release/APM_binomialOptions_nvrtc.txt | 6 +- .../APM_conjugateGradientMultiBlockCG.txt | 2 +- .../APM_conjugateGradientMultiDeviceCG.txt | 2 +- 
.../release/APM_conjugateGradientPrecond.txt | 15 +- bin/x86_64/linux/release/APM_cppOverload.txt | 1 - .../release/APM_cuSolverDn_LinearSolver.txt | 10 +- bin/x86_64/linux/release/APM_cuSolverRf.txt | 48 +- .../release/APM_cuSolverSp_LinearSolver.txt | 32 +- .../APM_cuSolverSp_LowlevelCholesky.txt | 24 +- .../release/APM_cuSolverSp_LowlevelQR.txt | 24 +- .../linux/release/APM_cudaTensorCoreGemm.txt | 2 +- bin/x86_64/linux/release/APM_dct8x8.txt | 6 +- .../linux/release/APM_deviceQueryDrv.txt | 2 +- .../linux/release/APM_fp16ScalarProduct.txt | 4 +- .../linux/release/APM_immaTensorCoreGemm.txt | 2 +- bin/x86_64/linux/release/APM_interval.txt | 1 - .../release/APM_p2pBandwidthLatencyTest.txt | 32 +- .../release/APM_segmentationTreeThrust.txt | 1 - bin/x86_64/linux/release/APM_shfl_scan.txt | 2 +- .../linux/release/APM_simpleCUBLASXT.txt | 2 +- .../release/APM_simpleCooperativeGroups.txt | 1 - .../linux/release/APM_simpleCudaGraphs.txt | 4 +- .../linux/release/APM_simpleMultiCopy.txt | 2 +- .../linux/release/APM_simpleMultiGPU.txt | 3 +- .../linux/release/APM_simpleOccupancy.txt | 1 - .../linux/release/APM_systemWideAtomics.txt | 2 +- .../linux/release/APM_threadMigration.txt | 1 - .../release/APM_warpAggregatedAtomicsCG.txt | 2 +- 782 files changed, 107230 insertions(+), 106548 deletions(-) create mode 100644 .clang-format create mode 100644 .pre-commit-config.yaml mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.cpp mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.h mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/DXSampleHelper.h mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/Main.cpp mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/ShaderStructs.h mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/Win32Application.cpp mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/Win32Application.h mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/d3dx12.h mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.cpp mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.h mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/sinewave_cuda.cu mode change 100755 => 100644 Samples/5_Domain_Specific/simpleD3D12/stdafx.h diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..11b62b76 --- /dev/null +++ b/.clang-format @@ -0,0 +1,49 @@ +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: Consecutive +AlignConsecutiveDeclarations: Consecutive +AlignConsecutiveMacros: Consecutive +AlignEscapedNewlines: Left +AlignOperands: AlignAfterOperator +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: true + AfterControlStatement: false + AfterExternBlock: true + AfterFunction: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false +BreakBeforeBraces: Custom +BreakBeforeConceptDeclarations: true +BreakBeforeBinaryOperators: NonAssignment +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakInheritanceList: BeforeComma +ColumnLimit: 120 +DerivePointerAlignment: false +FixNamespaceComments: true +IncludeCategories: + - Regex: '^<.*>' + Priority: 1 + - Regex: '^".*"' + Priority: 2 +SortIncludes: true +IncludeBlocks: Regroup +IndentWidth: 4 
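+# The options below keep pointers bound to the variable name ("int *p") +# and allow at most two consecutive blank lines.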
+MaxEmptyLinesToKeep: 2 +PointerAlignment: Right +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +Standard: c++17 +TabWidth: 4 +UseTab: Never +... diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..0388a904 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +ci: + autofix_commit_msg: | + [pre-commit.ci] auto code formatting + autofix_prs: false + autoupdate_branch: '' + autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' + autoupdate_schedule: quarterly + skip: [] + submodules: false + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: end-of-file-fixer + exclude: | + (?x)^( + .*\.raw$| + .*\.bin$| + .*\.dat$| + .*\.nv12$| + data/.*| + Common/.* + ) + files: | + (?x)^( + .*\.txt$| + .*\.md$| + .*\.cpp$| + .*\.cxx$| + .*\.hpp$| + .*\.h$| + .*\.cu$| + .*\.cuh$ + ) + - id: mixed-line-ending + exclude: | + (?x)^( + .*\.raw$| + .*\.bin$| + .*\.dat$| + .*\.nv12$| + data/.*| + Common/.* + ) + files: | + (?x)^( + .*\.txt$| + .*\.md$| + .*\.cpp$| + .*\.cxx$| + .*\.hpp$| + .*\.h$| + .*\.cu$| + .*\.cuh$ + ) + - id: trailing-whitespace + exclude: | + (?x)^( + .*\.raw$| + .*\.bin$| + .*\.dat$| + .*\.nv12$| + data/.*| + Common/.* + ) + files: | + (?x)^( + .*\.txt$| + .*\.md$| + .*\.cpp$| + .*\.cxx$| + .*\.hpp$| + .*\.h$| + .*\.cu$| + .*\.cuh$ + ) + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v19.1.6 + hooks: + - id: clang-format + types_or: [file] + files: | + (?x)^( + ^.*\.c$| + ^.*\.cpp$| + ^.*\.cu$| + ^.*\.cuh$| + ^.*\.cxx$| + ^.*\.h$| + ^.*\.hpp$| + ^.*\.inl$| + ^.*\.mm$ + ) + exclude: | + (?x)^( + Common/.* + ) + args: ["-fallback-style=none", "-style=file", "-i"] diff --git a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu index 2bf0ac9f..ca5cfcbd 100644 --- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu +++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu @@ -31,10 +31,10 @@ */ // system includes +#include #include #include #include -#include #ifdef USE_PTHREADS #include #else @@ -51,291 +51,287 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) // SRAND48 and DRAND48 don't exist on windows, but these are the equivalent // functions -void srand48(long seed) { srand((unsigned int)seed); } +void srand48(long seed) { srand((unsigned int)seed); } double drand48() { return double(rand()) / RAND_MAX; } #endif const char *sSDKname = "UnifiedMemoryStreams"; // simple task -template -struct Task { - unsigned int size, id; - T *data; - T *result; - T *vector; +template struct Task +{ + unsigned int size, id; + T *data; + T *result; + T *vector; - Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){}; - Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) { - // allocate unified memory -- the operation performed in this example will - // be a DGEMV - checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size)); - checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size)); - checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size)); - checkCudaErrors(cudaDeviceSynchronize()); - } - - ~Task() { - // ensure all memory is deallocated - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaFree(data)); - checkCudaErrors(cudaFree(result)); - 
checkCudaErrors(cudaFree(vector)); - } - - void allocate(const unsigned int s, const unsigned int unique_id) { - // allocate unified memory outside of constructor - id = unique_id; - size = s; - checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size)); - checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size)); - checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size)); - checkCudaErrors(cudaDeviceSynchronize()); - - // populate data with random elements - for (unsigned int i = 0; i < size * size; i++) { - data[i] = drand48(); + Task() + : size(0) + , id(0) + , data(NULL) + , result(NULL) + , vector(NULL) {}; + Task(unsigned int s) + : size(s) + , id(0) + , data(NULL) + , result(NULL) + { + // allocate unified memory -- the operation performed in this example will + // be a DGEMV + checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size)); + checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size)); + checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size)); + checkCudaErrors(cudaDeviceSynchronize()); } - for (unsigned int i = 0; i < size; i++) { - result[i] = 0.; - vector[i] = drand48(); + ~Task() + { + // ensure all memory is deallocated + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaFree(data)); + checkCudaErrors(cudaFree(result)); + checkCudaErrors(cudaFree(vector)); + } + + void allocate(const unsigned int s, const unsigned int unique_id) + { + // allocate unified memory outside of constructor + id = unique_id; + size = s; + checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size)); + checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size)); + checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size)); + checkCudaErrors(cudaDeviceSynchronize()); + + // populate data with random elements + for (unsigned int i = 0; i < size * size; i++) { + data[i] = drand48(); + } + + for (unsigned int i = 0; i < size; i++) { + result[i] = 0.; + vector[i] = drand48(); + } } - } }; #ifdef USE_PTHREADS -struct threadData_t { - int tid; - Task *TaskListPtr; - cudaStream_t *streams; - cublasHandle_t *handles; - int taskSize; +struct threadData_t +{ + int tid; + Task *TaskListPtr; + cudaStream_t *streams; + cublasHandle_t *handles; + int taskSize; }; typedef struct threadData_t threadData; #endif // simple host dgemv: assume data is in row-major format and square -template -void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) { - // rows - for (int i = 0; i < n; i++) { - result[i] *= beta; +template void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) +{ + // rows + for (int i = 0; i < n; i++) { + result[i] *= beta; - for (int j = 0; j < n; j++) { - result[i] += A[i * n + j] * x[j]; + for (int j = 0; j < n; j++) { + result[i] += A[i * n + j] * x[j]; + } } - } } // execute a single task on either host or device depending on size #ifdef USE_PTHREADS -void *execute(void *inpArgs) { - threadData *dataPtr = (threadData *)inpArgs; - cudaStream_t *stream = dataPtr->streams; - cublasHandle_t *handle = dataPtr->handles; - int tid = dataPtr->tid; +void *execute(void *inpArgs) +{ + threadData *dataPtr = (threadData *)inpArgs; + cudaStream_t *stream = dataPtr->streams; + cublasHandle_t *handle = dataPtr->handles; + int tid = dataPtr->tid; - for (int i = 0; i < dataPtr->taskSize; i++) { - Task &t = dataPtr->TaskListPtr[i]; + for (int i = 0; i < dataPtr->taskSize; i++) { + Task &t = dataPtr->TaskListPtr[i]; - if (t.size < 100) { - // perform on host - printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, - 
t.size); + if (t.size < 100) { + // perform on host + printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size); - // attach managed memory to a (dummy) stream to allow host access while - // the device is running - checkCudaErrors( - cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost)); - checkCudaErrors( - cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost)); - checkCudaErrors( - cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost)); - // necessary to ensure Async cudaStreamAttachMemAsync calls have finished - checkCudaErrors(cudaStreamSynchronize(stream[0])); - // call the host operation - gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result); - } else { - // perform on device - printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, - t.size); - double one = 1.0; - double zero = 0.0; + // attach managed memory to a (dummy) stream to allow host access while + // the device is running + checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost)); + // necessary to ensure Async cudaStreamAttachMemAsync calls have finished + checkCudaErrors(cudaStreamSynchronize(stream[0])); + // call the host operation + gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result); + } + else { + // perform on device + printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size); + double one = 1.0; + double zero = 0.0; - // attach managed memory to my stream - checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1])); - checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, - cudaMemAttachSingle)); - checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, - cudaMemAttachSingle)); - checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, - cudaMemAttachSingle)); - // call the device operation - checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size, - &one, t.data, t.size, t.vector, 1, &zero, - t.result, 1)); + // attach managed memory to my stream + checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1])); + checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle)); + // call the device operation + checkCudaErrors(cublasDgemv( + handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1)); + } } - } - pthread_exit(NULL); + pthread_exit(NULL); } #else -template -void execute(Task &t, cublasHandle_t *handle, cudaStream_t *stream, - int tid) { - if (t.size < 100) { - // perform on host - printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, - t.size); +template void execute(Task &t, cublasHandle_t *handle, cudaStream_t *stream, int tid) +{ + if (t.size < 100) { + // perform on host + printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size); - // attach managed memory to a (dummy) stream to allow host access while the - // device is running - checkCudaErrors( - cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost)); - checkCudaErrors( - cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost)); - checkCudaErrors( - 
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost)); - // necessary to ensure Async cudaStreamAttachMemAsync calls have finished - checkCudaErrors(cudaStreamSynchronize(stream[0])); - // call the host operation - gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result); - } else { - // perform on device - printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, - t.size); - double one = 1.0; - double zero = 0.0; + // attach managed memory to a (dummy) stream to allow host access while the + // device is running + checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost)); + // necessary to ensure Async cudaStreamAttachMemAsync calls have finished + checkCudaErrors(cudaStreamSynchronize(stream[0])); + // call the host operation + gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result); + } + else { + // perform on device + printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size); + double one = 1.0; + double zero = 0.0; + // attach managed memory to my stream + checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1])); + checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle)); + checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle)); + // call the device operation + checkCudaErrors(cublasDgemv( + handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1)); + } } #endif // populate a list of tasks with random sizes -template -void initialise_tasks(std::vector > &TaskList) { - for (unsigned int i = 0; i < TaskList.size(); i++) { - // generate random size - int size; - size = std::max((int)(drand48() * 1000.0), 64); - TaskList[i].allocate(size, i); - } +template void initialise_tasks(std::vector> &TaskList) +{ + for (unsigned int i = 0; i < TaskList.size(); i++) { + // generate random size + int size; + size = std::max((int)(drand48() * 1000.0), 64); + TaskList[i].allocate(size, i); + } } -int main(int argc, char **argv) { - // set device - cudaDeviceProp device_prop; - int dev_id = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); +int main(int argc, char **argv) +{ + // set device + cudaDeviceProp device_prop; + int dev_id = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); - if (!device_prop.managedMemory) { - // This samples requires being run on a device that supports Unified Memory - fprintf(stderr, "Unified Memory not supported on this device\n"); + if (!device_prop.managedMemory) { + // This sample requires being run on a device that 
supports Unified Memory
+        fprintf(stderr, "Unified Memory not supported on this device\n");

-    exit(EXIT_WAIVED);
-  }
+        exit(EXIT_WAIVED);
+    }

-  if (device_prop.computeMode == cudaComputeModeProhibited) {
-    // This sample requires being run with a default or process exclusive mode
-    fprintf(stderr,
-            "This sample requires a device in either default or process "
-            "exclusive mode\n");
+    if (device_prop.computeMode == cudaComputeModeProhibited) {
+        // This sample requires being run with a default or process exclusive mode
+        fprintf(stderr,
+                "This sample requires a device in either default or process "
+                "exclusive mode\n");

-    exit(EXIT_WAIVED);
-  }
+        exit(EXIT_WAIVED);
+    }

-  // randomise task sizes
-  int seed = (int)time(NULL);
-  srand48(seed);
+    // randomise task sizes
+    int seed = (int)time(NULL);
+    srand48(seed);

-  // set number of threads
-  const int nthreads = 4;
+    // set number of threads
+    const int nthreads = 4;

-  // number of streams = number of threads
-  cudaStream_t *streams = new cudaStream_t[nthreads + 1];
-  cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
+    // number of streams = number of threads
+    cudaStream_t *streams = new cudaStream_t[nthreads + 1];
+    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];

-  for (int i = 0; i < nthreads + 1; i++) {
-    checkCudaErrors(cudaStreamCreate(&streams[i]));
-    checkCudaErrors(cublasCreate(&handles[i]));
-  }
+    for (int i = 0; i < nthreads + 1; i++) {
+        checkCudaErrors(cudaStreamCreate(&streams[i]));
+        checkCudaErrors(cublasCreate(&handles[i]));
+    }

-  // create list of N tasks
-  unsigned int N = 40;
-  std::vector<Task<double> > TaskList(N);
-  initialise_tasks(TaskList);
+    // create list of N tasks
+    unsigned int N = 40;
+    std::vector<Task<double>> TaskList(N);
+    initialise_tasks(TaskList);

-  printf("Executing tasks on host / device\n");
+    printf("Executing tasks on host / device\n");

 // run through all tasks using threads and streams
 #ifdef USE_PTHREADS
-  pthread_t threads[nthreads];
-  threadData *InputToThreads = new threadData[nthreads];
+    pthread_t threads[nthreads];
+    threadData *InputToThreads = new threadData[nthreads];

-  for (int i = 0; i < nthreads; i++) {
-    checkCudaErrors(cudaSetDevice(dev_id));
-    InputToThreads[i].tid = i;
-    InputToThreads[i].streams = streams;
-    InputToThreads[i].handles = handles;
+    for (int i = 0; i < nthreads; i++) {
+        checkCudaErrors(cudaSetDevice(dev_id));
+        InputToThreads[i].tid = i;
+        InputToThreads[i].streams = streams;
+        InputToThreads[i].handles = handles;

-    if ((TaskList.size() / nthreads) == 0) {
-      InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-      InputToThreads[i].TaskListPtr =
-          &TaskList[i * (TaskList.size() / nthreads)];
-    } else {
-      if (i == nthreads - 1) {
-        InputToThreads[i].taskSize =
-            (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
-        InputToThreads[i].TaskListPtr =
-            &TaskList[i * (TaskList.size() / nthreads) +
-                      (TaskList.size() % nthreads)];
-      } else {
-        InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-        InputToThreads[i].TaskListPtr =
-            &TaskList[i * (TaskList.size() / nthreads)];
-      }
+        if ((TaskList.size() / nthreads) == 0) {
+            InputToThreads[i].taskSize = (TaskList.size() / nthreads);
+            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+        }
+        else {
+            if (i == nthreads - 1) {
+                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+                InputToThreads[i].TaskListPtr =
+                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
+            }
+            else {
+                InputToThreads[i].taskSize = (TaskList.size() / nthreads);
+                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+            }
+        }
+
+        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
+    }
+    for (int i = 0; i < nthreads; i++) {
+        pthread_join(threads[i], NULL);
    }
-
-    pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
-  }
-  for (int i = 0; i < nthreads; i++) {
-    pthread_join(threads[i], NULL);
-  }
 #else
-  omp_set_num_threads(nthreads);
+    omp_set_num_threads(nthreads);
 #pragma omp parallel for schedule(dynamic)
-  for (int i = 0; i < TaskList.size(); i++) {
-    checkCudaErrors(cudaSetDevice(dev_id));
-    int tid = omp_get_thread_num();
-    execute(TaskList[i], handles, streams, tid);
-  }
+    for (int i = 0; i < TaskList.size(); i++) {
+        checkCudaErrors(cudaSetDevice(dev_id));
+        int tid = omp_get_thread_num();
+        execute(TaskList[i], handles, streams, tid);
+    }
 #endif
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();

-  // Destroy CUDA Streams, cuBlas handles
-  for (int i = 0; i < nthreads + 1; i++) {
-    cudaStreamDestroy(streams[i]);
-    cublasDestroy(handles[i]);
-  }
+    // Destroy CUDA Streams, cuBlas handles
+    for (int i = 0; i < nthreads + 1; i++) {
+        cudaStreamDestroy(streams[i]);
+        cublasDestroy(handles[i]);
+    }

-  // Free TaskList
-  std::vector<Task<double> >().swap(TaskList);
+    // Free TaskList
+    std::vector<Task<double>>().swap(TaskList);

-  printf("All Done!\n");
-  exit(EXIT_SUCCESS);
+    printf("All Done!\n");
+    exit(EXIT_SUCCESS);
 }
diff --git a/Samples/0_Introduction/asyncAPI/asyncAPI.cu b/Samples/0_Introduction/asyncAPI/asyncAPI.cu
index ab43c1dc..5a38d3ab 100644
--- a/Samples/0_Introduction/asyncAPI/asyncAPI.cu
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI.cu
@@ -38,105 +38,107 @@
 #include <stdio.h>

 // includes CUDA Runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>

 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h>  // helper utility functions
+#include <helper_functions.h> // helper utility functions

-__global__ void increment_kernel(int *g_data, int inc_value) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  g_data[idx] = g_data[idx] + inc_value;
+__global__ void increment_kernel(int *g_data, int inc_value)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    g_data[idx] = g_data[idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x) {
-  for (int i = 0; i < n; i++)
-    if (data[i] != x) {
-      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
-      return false;
+bool correct_output(int *data, const int n, const int x)
+{
+    for (int i = 0; i < n; i++)
+        if (data[i] != x) {
+            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+            return false;
+        }
+
+    return true;
+}
+
+int main(int argc, char *argv[])
+{
+    int devID;
+    cudaDeviceProp deviceProps;
+
+    printf("[%s] - Starting...\n", argv[0]);
+
+    // This will pick the best possible CUDA capable device
+    devID = findCudaDevice(argc, (const char **)argv);
+
+    // get device name
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+    printf("CUDA device [%s]\n", deviceProps.name);
+
+    int n = 16 * 1024 * 1024;
+    int nbytes = n * sizeof(int);
+    int value = 26;
+
+    // allocate host memory
+    int *a = 0;
+    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+    memset(a, 0, nbytes);
+
+    // allocate device memory
+    int *d_a = 0;
+    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+    checkCudaErrors(cudaMemset(d_a, 255, nbytes));
+
+    // set kernel launch configuration
+    dim3 threads = dim3(512, 1);
+    dim3 blocks = dim3(n / threads.x, 1);
+
+    // create cuda event handles
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);
+    sdkResetTimer(&timer);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    float gpu_time = 0.0f;
+
+    // asynchronously issue work to the GPU (all to stream 0)
+    checkCudaErrors(cudaProfilerStart());
+    sdkStartTimer(&timer);
+    cudaEventRecord(start, 0);
+    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
+    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
+    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
+    cudaEventRecord(stop, 0);
+    sdkStopTimer(&timer);
+    checkCudaErrors(cudaProfilerStop());
+
+    // have CPU do some work while waiting for stage 1 to finish
+    unsigned long int counter = 0;
+
+    while (cudaEventQuery(stop) == cudaErrorNotReady) {
+        counter++;
    }

-  return true;
-}
-
-int main(int argc, char *argv[]) {
-  int devID;
-  cudaDeviceProp deviceProps;
-
-  printf("[%s] - Starting...\n", argv[0]);
-
-  // This will pick the best possible CUDA capable device
-  devID = findCudaDevice(argc, (const char **)argv);
-
-  // get device name
-  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-  printf("CUDA device [%s]\n", deviceProps.name);
-
-  int n = 16 * 1024 * 1024;
-  int nbytes = n * sizeof(int);
-  int value = 26;
-
-  // allocate host memory
-  int *a = 0;
-  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
-  memset(a, 0, nbytes);
-
-  // allocate device memory
-  int *d_a = 0;
-  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
-  checkCudaErrors(cudaMemset(d_a, 255, nbytes));
-
-  // set kernel launch configuration
-  dim3 threads = dim3(512, 1);
-  dim3 blocks = dim3(n / threads.x, 1);
-
-  // create cuda event handles
-  cudaEvent_t start, stop;
-  checkCudaErrors(cudaEventCreate(&start));
-  checkCudaErrors(cudaEventCreate(&stop));
-
-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
-  sdkResetTimer(&timer);
-
-  checkCudaErrors(cudaDeviceSynchronize());
-  float gpu_time = 0.0f;
-
-  // asynchronously issue work to the GPU (all to stream 0)
-  checkCudaErrors(cudaProfilerStart());
-  sdkStartTimer(&timer);
-  cudaEventRecord(start, 0);
-  cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
-  increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
-  cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
-  cudaEventRecord(stop, 0);
-  sdkStopTimer(&timer);
-  checkCudaErrors(cudaProfilerStop());
-
-  // have CPU do some work while waiting for stage 1 to finish
-  unsigned long int counter = 0;
-
-  while (cudaEventQuery(stop) == cudaErrorNotReady) {
-    counter++;
-  }
-
-  
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); - - // print the cpu and gpu times - printf("time spent executing by the GPU: %.2f\n", gpu_time); - printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); - printf("CPU executed %lu iterations while waiting for GPU to finish\n", - counter); - - // check the output for correctness - bool bFinalResults = correct_output(a, n, value); - - // release resources - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaFreeHost(a)); - checkCudaErrors(cudaFree(d_a)); - - exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE); + checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); + + // print the cpu and gpu times + printf("time spent executing by the GPU: %.2f\n", gpu_time); + printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); + printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); + + // check the output for correctness + bool bFinalResults = correct_output(a, n, value); + + // release resources + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaFreeHost(a)); + checkCudaErrors(cudaFree(d_a)); + + exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/clock/clock.cu b/Samples/0_Introduction/clock/clock.cu index e400b4f3..f9fa5ae5 100644 --- a/Samples/0_Introduction/clock/clock.cu +++ b/Samples/0_Introduction/clock/clock.cu @@ -48,43 +48,46 @@ // This kernel computes a standard parallel reduction and evaluates the // time it takes to do that for each block. The timing results are stored // in device memory. -__global__ static void timedReduction(const float *input, float *output, - clock_t *timer) { - // __shared__ float shared[2 * blockDim.x]; - extern __shared__ float shared[]; +__global__ static void timedReduction(const float *input, float *output, clock_t *timer) +{ + // __shared__ float shared[2 * blockDim.x]; + extern __shared__ float shared[]; - const int tid = threadIdx.x; - const int bid = blockIdx.x; + const int tid = threadIdx.x; + const int bid = blockIdx.x; - if (tid == 0) timer[bid] = clock(); + if (tid == 0) + timer[bid] = clock(); - // Copy input. - shared[tid] = input[tid]; - shared[tid + blockDim.x] = input[tid + blockDim.x]; + // Copy input. + shared[tid] = input[tid]; + shared[tid + blockDim.x] = input[tid + blockDim.x]; + + // Perform reduction to find minimum. + for (int d = blockDim.x; d > 0; d /= 2) { + __syncthreads(); + + if (tid < d) { + float f0 = shared[tid]; + float f1 = shared[tid + d]; + + if (f1 < f0) { + shared[tid] = f1; + } + } + } + + // Write result. + if (tid == 0) + output[bid] = shared[0]; - // Perform reduction to find minimum. - for (int d = blockDim.x; d > 0; d /= 2) { __syncthreads(); - if (tid < d) { - float f0 = shared[tid]; - float f1 = shared[tid + d]; - - if (f1 < f0) { - shared[tid] = f1; - } - } - } - - // Write result. - if (tid == 0) output[bid] = shared[0]; - - __syncthreads(); - - if (tid == 0) timer[bid + gridDim.x] = clock(); + if (tid == 0) + timer[bid + gridDim.x] = clock(); } -#define NUM_BLOCKS 64 +#define NUM_BLOCKS 64 #define NUM_THREADS 256 // It's interesting to change the number of blocks and the number of threads to @@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output, // the memory. With more than 32 the speed scales linearly. 
// Start the main CUDA Sample here
-int main(int argc, char **argv) {
-  printf("CUDA Clock sample\n");
+int main(int argc, char **argv)
+{
+    printf("CUDA Clock sample\n");

-  // This will pick the best possible CUDA capable device
-  int dev = findCudaDevice(argc, (const char **)argv);
+    // This will pick the best possible CUDA capable device
+    int dev = findCudaDevice(argc, (const char **)argv);

-  float *dinput = NULL;
-  float *doutput = NULL;
-  clock_t *dtimer = NULL;
+    float *dinput = NULL;
+    float *doutput = NULL;
+    clock_t *dtimer = NULL;

-  clock_t timer[NUM_BLOCKS * 2];
-  float input[NUM_THREADS * 2];
+    clock_t timer[NUM_BLOCKS * 2];
+    float input[NUM_THREADS * 2];

-  for (int i = 0; i < NUM_THREADS * 2; i++) {
-    input[i] = (float)i;
-  }
+    for (int i = 0; i < NUM_THREADS * 2; i++) {
+        input[i] = (float)i;
+    }

-  checkCudaErrors(
-      cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
-  checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-  checkCudaErrors(
-      cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-  checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                             cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

-  timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-      dinput, doutput, dtimer);
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

-  checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                             cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

-  checkCudaErrors(cudaFree(dinput));
-  checkCudaErrors(cudaFree(doutput));
-  checkCudaErrors(cudaFree(dtimer));
+    checkCudaErrors(cudaFree(dinput));
+    checkCudaErrors(cudaFree(doutput));
+    checkCudaErrors(cudaFree(dtimer));

-  long double avgElapsedClocks = 0;
+    long double avgElapsedClocks = 0;

-  for (int i = 0; i < NUM_BLOCKS; i++) {
-    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
-  }
+    for (int i = 0; i < NUM_BLOCKS; i++) {
+        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
+    }

-  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
-  printf("Average clocks/block = %Lf\n", avgElapsedClocks);
+    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
+    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

-  return EXIT_SUCCESS;
+    return EXIT_SUCCESS;
 }
diff --git a/Samples/0_Introduction/clock_nvrtc/clock.cpp b/Samples/0_Introduction/clock_nvrtc/clock.cpp
index 45269e8e..d301703c 100644
--- a/Samples/0_Introduction/clock_nvrtc/clock.cpp
+++ b/Samples/0_Introduction/clock_nvrtc/clock.cpp
@@ -34,12 +34,11 @@
 */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>
-
 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@@ -71,64 +70,68 @@

 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
-  printf("CUDA Clock sample\n");
+int main(int argc, char **argv)
+{
+    printf("CUDA Clock sample\n");

-  typedef long clock_t;
+    typedef long clock_t;

-  clock_t timer[NUM_BLOCKS * 2];
+    clock_t timer[NUM_BLOCKS * 2];

-  float input[NUM_THREADS * 2];
+    float input[NUM_THREADS * 2];

-  for (int i = 0; i < NUM_THREADS * 2; i++) {
-    input[i] = (float)i;
-  }
+    for (int i = 0; i < NUM_THREADS * 2; i++) {
+        input[i] = (float)i;
+ } - char *cubin, *kernel_file; - size_t cubinSize; + char *cubin, *kernel_file; + size_t cubinSize; - kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - CUmodule module = loadCUBIN(cubin, argc, argv); - CUfunction kernel_addr; + CUmodule module = loadCUBIN(cubin, argc, argv); + CUfunction kernel_addr; - checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction")); + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction")); - dim3 cudaBlockSize(NUM_THREADS, 1, 1); - dim3 cudaGridSize(NUM_BLOCKS, 1, 1); + dim3 cudaBlockSize(NUM_THREADS, 1, 1); + dim3 cudaGridSize(NUM_BLOCKS, 1, 1); - CUdeviceptr dinput, doutput, dtimer; - checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2)); - checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS)); - checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2)); - checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2)); + CUdeviceptr dinput, doutput, dtimer; + checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2)); + checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS)); + checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2)); + checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2)); - void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer}; + void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer}; - checkCudaErrors(cuLaunchKernel( - kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ - sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */ - &arr[0], /* arguments */ - 0)); + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, + cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + sizeof(float) * 2 * NUM_THREADS, + 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); - checkCudaErrors(cuCtxSynchronize()); - checkCudaErrors( - cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2)); - checkCudaErrors(cuMemFree(dinput)); - checkCudaErrors(cuMemFree(doutput)); - checkCudaErrors(cuMemFree(dtimer)); + checkCudaErrors(cuCtxSynchronize()); + checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2)); + checkCudaErrors(cuMemFree(dinput)); + checkCudaErrors(cuMemFree(doutput)); + checkCudaErrors(cuMemFree(dtimer)); - long double avgElapsedClocks = 0; + long double avgElapsedClocks = 0; - for (int i = 0; i < NUM_BLOCKS; i++) { - avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]); - } + for (int i = 0; i < NUM_BLOCKS; i++) { + avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]); + } - avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS; - printf("Average clocks/block = %Lf\n", avgElapsedClocks); + avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS; + printf("Average clocks/block = %Lf\n", avgElapsedClocks); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu b/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu index bcde872c..4bc854b9 100644 --- a/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu +++ b/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu @@ -37,38 +37,41 @@ // 
time it takes to do that for each block. The timing results are stored // in device memory. -extern "C" __global__ void timedReduction(const float *input, float *output, - clock_t *timer) { - // __shared__ float shared[2 * blockDim.x]; - extern __shared__ float shared[]; +extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer) +{ + // __shared__ float shared[2 * blockDim.x]; + extern __shared__ float shared[]; - const int tid = threadIdx.x; - const int bid = blockIdx.x; + const int tid = threadIdx.x; + const int bid = blockIdx.x; - if (tid == 0) timer[bid] = clock(); + if (tid == 0) + timer[bid] = clock(); - // Copy input. - shared[tid] = input[tid]; - shared[tid + blockDim.x] = input[tid + blockDim.x]; + // Copy input. + shared[tid] = input[tid]; + shared[tid + blockDim.x] = input[tid + blockDim.x]; + + // Perform reduction to find minimum. + for (int d = blockDim.x; d > 0; d /= 2) { + __syncthreads(); + + if (tid < d) { + float f0 = shared[tid]; + float f1 = shared[tid + d]; + + if (f1 < f0) { + shared[tid] = f1; + } + } + } + + // Write result. + if (tid == 0) + output[bid] = shared[0]; - // Perform reduction to find minimum. - for (int d = blockDim.x; d > 0; d /= 2) { __syncthreads(); - if (tid < d) { - float f0 = shared[tid]; - float f1 = shared[tid + d]; - - if (f1 < f0) { - shared[tid] = f1; - } - } - } - - // Write result. - if (tid == 0) output[bid] = shared[0]; - - __syncthreads(); - - if (tid == 0) timer[bid + gridDim.x] = clock(); + if (tid == 0) + timer[bid + gridDim.x] = clock(); } diff --git a/Samples/0_Introduction/cudaOpenMP/cudaOpenMP.cu b/Samples/0_Introduction/cudaOpenMP/cudaOpenMP.cu index 45967fa4..423c0194 100644 --- a/Samples/0_Introduction/cudaOpenMP/cudaOpenMP.cu +++ b/Samples/0_Introduction/cudaOpenMP/cudaOpenMP.cu @@ -32,128 +32,125 @@ #include #include -#include // stdio functions are used since C++ streams aren't necessarily thread safe +#include // stdio functions are used since C++ streams aren't necessarily thread safe using namespace std; // a simple kernel that simply increments each array element by b -__global__ void kernelAddConstant(int *g_a, const int b) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - g_a[idx] += b; +__global__ void kernelAddConstant(int *g_a, const int b) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + g_a[idx] += b; } // a predicate that checks whether each array element is set to its index plus b -int correctResult(int *data, const int n, const int b) { - for (int i = 0; i < n; i++) - if (data[i] != i + b) return 0; +int correctResult(int *data, const int n, const int b) +{ + for (int i = 0; i < n; i++) + if (data[i] != i + b) + return 0; - return 1; + return 1; } -int main(int argc, char *argv[]) { - int num_gpus = 0; // number of CUDA GPUs +int main(int argc, char *argv[]) +{ + int num_gpus = 0; // number of CUDA GPUs - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - ///////////////////////////////////////////////////////////////// - // determine the number of CUDA capable GPUs - // - cudaGetDeviceCount(&num_gpus); + ///////////////////////////////////////////////////////////////// + // determine the number of CUDA capable GPUs + // + cudaGetDeviceCount(&num_gpus); - if (num_gpus < 1) { - printf("no CUDA capable devices were detected\n"); - return 1; - } + if (num_gpus < 1) { + printf("no CUDA capable devices were detected\n"); + return 1; + } - ///////////////////////////////////////////////////////////////// - // display CPU and GPU 
configuration - // - printf("number of host CPUs:\t%d\n", omp_get_num_procs()); - printf("number of CUDA devices:\t%d\n", num_gpus); + ///////////////////////////////////////////////////////////////// + // display CPU and GPU configuration + // + printf("number of host CPUs:\t%d\n", omp_get_num_procs()); + printf("number of CUDA devices:\t%d\n", num_gpus); - for (int i = 0; i < num_gpus; i++) { - cudaDeviceProp dprop; - cudaGetDeviceProperties(&dprop, i); - printf(" %d: %s\n", i, dprop.name); - } + for (int i = 0; i < num_gpus; i++) { + cudaDeviceProp dprop; + cudaGetDeviceProperties(&dprop, i); + printf(" %d: %s\n", i, dprop.name); + } - printf("---------------------------\n"); + printf("---------------------------\n"); - ///////////////////////////////////////////////////////////////// - // initialize data - // - unsigned int n = num_gpus * 8192; - unsigned int nbytes = n * sizeof(int); - int *a = 0; // pointer to data on the CPU - int b = 3; // value by which the array is incremented - a = (int *)malloc(nbytes); + ///////////////////////////////////////////////////////////////// + // initialize data + // + unsigned int n = num_gpus * 8192; + unsigned int nbytes = n * sizeof(int); + int *a = 0; // pointer to data on the CPU + int b = 3; // value by which the array is incremented + a = (int *)malloc(nbytes); - if (0 == a) { - printf("couldn't allocate CPU memory\n"); - return 1; - } + if (0 == a) { + printf("couldn't allocate CPU memory\n"); + return 1; + } - for (unsigned int i = 0; i < n; i++) a[i] = i; + for (unsigned int i = 0; i < n; i++) + a[i] = i; - //////////////////////////////////////////////////////////////// - // run as many CPU threads as there are CUDA devices - // each CPU thread controls a different device, processing its - // portion of the data. It's possible to use more CPU threads - // than there are CUDA devices, in which case several CPU - // threads will be allocating resources and launching kernels - // on the same device. For example, try omp_set_num_threads(2*num_gpus); - // Recall that all variables declared inside an "omp parallel" scope are - // local to each CPU thread - // - omp_set_num_threads( - num_gpus); // create as many CPU threads as there are CUDA devices + //////////////////////////////////////////////////////////////// + // run as many CPU threads as there are CUDA devices + // each CPU thread controls a different device, processing its + // portion of the data. It's possible to use more CPU threads + // than there are CUDA devices, in which case several CPU + // threads will be allocating resources and launching kernels + // on the same device. 
For example, try omp_set_num_threads(2*num_gpus); + // Recall that all variables declared inside an "omp parallel" scope are + // local to each CPU thread + // + omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there // are CUDA devices #pragma omp parallel - { - unsigned int cpu_thread_id = omp_get_thread_num(); - unsigned int num_cpu_threads = omp_get_num_threads(); + { + unsigned int cpu_thread_id = omp_get_thread_num(); + unsigned int num_cpu_threads = omp_get_num_threads(); - // set and check the CUDA device for this CPU thread - int gpu_id = -1; - checkCudaErrors(cudaSetDevice( - cpu_thread_id % - num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices - checkCudaErrors(cudaGetDevice(&gpu_id)); - printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, - num_cpu_threads, gpu_id); + // set and check the CUDA device for this CPU thread + int gpu_id = -1; + checkCudaErrors( + cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices + checkCudaErrors(cudaGetDevice(&gpu_id)); + printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id); - int *d_a = - 0; // pointer to memory on the device associated with this CPU thread - int *sub_a = - a + - cpu_thread_id * n / - num_cpu_threads; // pointer to this CPU thread's portion of data - unsigned int nbytes_per_kernel = nbytes / num_cpu_threads; - dim3 gpu_threads(128); // 128 threads per block - dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads)); + int *d_a = 0; // pointer to memory on the device associated with this CPU thread + int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data + unsigned int nbytes_per_kernel = nbytes / num_cpu_threads; + dim3 gpu_threads(128); // 128 threads per block + dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads)); - checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel)); - checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel)); - checkCudaErrors( - cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice)); - kernelAddConstant<<>>(d_a, b); + checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel)); + checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel)); + checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice)); + kernelAddConstant<<>>(d_a, b); - checkCudaErrors( - cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaFree(d_a)); - } - printf("---------------------------\n"); + checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaFree(d_a)); + } + printf("---------------------------\n"); - if (cudaSuccess != cudaGetLastError()) - printf("%s\n", cudaGetErrorString(cudaGetLastError())); + if (cudaSuccess != cudaGetLastError()) + printf("%s\n", cudaGetErrorString(cudaGetLastError())); - //////////////////////////////////////////////////////////////// - // check the result - // - bool bResult = correctResult(a, n, b); + //////////////////////////////////////////////////////////////// + // check the result + // + bool bResult = correctResult(a, n, b); - if (a) free(a); // free CPU memory + if (a) + free(a); // free CPU memory - exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/fp16ScalarProduct/fp16ScalarProduct.cu b/Samples/0_Introduction/fp16ScalarProduct/fp16ScalarProduct.cu index a8a502e9..297095fd 100644 --- a/Samples/0_Introduction/fp16ScalarProduct/fp16ScalarProduct.cu +++ b/Samples/0_Introduction/fp16ScalarProduct/fp16ScalarProduct.cu @@ -25,191 +25,188 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cuda_fp16.h" -#include "helper_cuda.h" - #include #include #include -#define NUM_OF_BLOCKS 128 +#include "cuda_fp16.h" +#include "helper_cuda.h" + +#define NUM_OF_BLOCKS 128 #define NUM_OF_THREADS 128 -__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) { - if (threadIdx.x < 64) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]); - __syncthreads(); - if (threadIdx.x < 32) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]); - __syncthreads(); - if (threadIdx.x < 16) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]); - __syncthreads(); - if (threadIdx.x < 8) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]); - __syncthreads(); - if (threadIdx.x < 4) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]); - __syncthreads(); - if (threadIdx.x < 2) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]); - __syncthreads(); - if (threadIdx.x < 1) - v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]); - __syncthreads(); +__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) +{ + if (threadIdx.x < 64) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]); + __syncthreads(); + if (threadIdx.x < 32) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]); + __syncthreads(); + if (threadIdx.x < 16) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]); + __syncthreads(); + if (threadIdx.x < 8) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]); + __syncthreads(); + if (threadIdx.x < 4) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]); + __syncthreads(); + if (threadIdx.x < 2) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]); + __syncthreads(); + if (threadIdx.x < 1) + v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]); + __syncthreads(); } -__forceinline__ __device__ void reduceInShared_native(half2 *const v) { - if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64]; - __syncthreads(); - if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32]; - __syncthreads(); - if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16]; - __syncthreads(); - if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8]; - __syncthreads(); - if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4]; - __syncthreads(); - if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2]; - __syncthreads(); - if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1]; - __syncthreads(); +__forceinline__ __device__ void reduceInShared_native(half2 *const v) +{ + if (threadIdx.x < 64) + v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64]; + __syncthreads(); + if (threadIdx.x < 32) + v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32]; + __syncthreads(); + if (threadIdx.x < 16) + v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16]; + __syncthreads(); + if (threadIdx.x < 8) + v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8]; + __syncthreads(); + if (threadIdx.x < 4) + 
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
+    __syncthreads();
+    if (threadIdx.x < 2)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
+    __syncthreads();
+    if (threadIdx.x < 1)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
+    __syncthreads();
 }

-__global__ void scalarProductKernel_intrinsics(half2 const *const a,
-                                               half2 const *const b,
-                                               float *const results,
-                                               size_t const size) {
-  const int stride = gridDim.x * blockDim.x;
-  __shared__ half2 shArray[NUM_OF_THREADS];
+__global__ void
+scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
+    const int stride = gridDim.x * blockDim.x;
+    __shared__ half2 shArray[NUM_OF_THREADS];

-  shArray[threadIdx.x] = __float2half2_rn(0.f);
-  half2 value = __float2half2_rn(0.f);
+    shArray[threadIdx.x] = __float2half2_rn(0.f);
+    half2 value = __float2half2_rn(0.f);

-  for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
-    value = __hfma2(a[i], b[i], value);
-  }
+    for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
+        value = __hfma2(a[i], b[i], value);
+    }

-  shArray[threadIdx.x] = value;
-  __syncthreads();
-  reduceInShared_intrinsics(shArray);
+    shArray[threadIdx.x] = value;
+    __syncthreads();
+    reduceInShared_intrinsics(shArray);

-  if (threadIdx.x == 0) {
-    half2 result = shArray[0];
-    float f_result = __low2float(result) + __high2float(result);
-    results[blockIdx.x] = f_result;
-  }
+    if (threadIdx.x == 0) {
+        half2 result = shArray[0];
+        float f_result = __low2float(result) + __high2float(result);
+        results[blockIdx.x] = f_result;
+    }
 }

-__global__ void scalarProductKernel_native(half2 const *const a,
-                                           half2 const *const b,
-                                           float *const results,
-                                           size_t const size) {
-  const int stride = gridDim.x * blockDim.x;
-  __shared__ half2 shArray[NUM_OF_THREADS];
+__global__ void
+scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
+    const int stride = gridDim.x * blockDim.x;
+    __shared__ half2 shArray[NUM_OF_THREADS];

-  half2 value(0.f, 0.f);
-  shArray[threadIdx.x] = value;
+    half2 value(0.f, 0.f);
+    shArray[threadIdx.x] = value;

-  for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
-    value = a[i] * b[i] + value;
-  }
+    for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
+        value = a[i] * b[i] + value;
+    }

-  shArray[threadIdx.x] = value;
-  __syncthreads();
-  reduceInShared_native(shArray);
+    shArray[threadIdx.x] = value;
+    __syncthreads();
+    reduceInShared_native(shArray);

-  if (threadIdx.x == 0) {
-    half2 result = shArray[0];
-    float f_result = (float)result.y + (float)result.x;
-    results[blockIdx.x] = f_result;
-  }
+    if (threadIdx.x == 0) {
+        half2 result = shArray[0];
+        float f_result = (float)result.y + (float)result.x;
+        results[blockIdx.x] = f_result;
+    }
 }

-void generateInput(half2 *a, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    half2 temp;
-    temp.x = static_cast<float>(rand() % 4);
-    temp.y = static_cast<float>(rand() % 2);
-    a[i] = temp;
-  }
+void generateInput(half2 *a, size_t size)
+{
+    for (size_t i = 0; i < size; ++i) {
+        half2 temp;
+        temp.x = static_cast<float>(rand() % 4);
+        temp.y = static_cast<float>(rand() % 2);
+        a[i] = temp;
+    }
 }

-int main(int argc, char *argv[]) {
-  srand((unsigned int)time(NULL));
-  size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
+int main(int argc, char *argv[])
+{
+    srand((unsigned int)time(NULL));
+    size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

-  half2 *vec[2];
-  half2 *devVec[2];
+    half2 *vec[2];
+    half2 *devVec[2];

-  float *results;
-  float *devResults;
+    float *results;
+    float *devResults;

-  int devID = findCudaDevice(argc, (const char **)argv);
+    int devID = findCudaDevice(argc, (const char **)argv);

-  cudaDeviceProp devProp;
-  checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
+    cudaDeviceProp devProp;
+    checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

-  if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
-    printf(
-        "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
-        "higher.\n");
-    return EXIT_WAIVED;
-  }
+    if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
+        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
+               "higher.\n");
+        return EXIT_WAIVED;
+    }

-  for (int i = 0; i < 2; ++i) {
-    checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
-    checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
-  }
+    for (int i = 0; i < 2; ++i) {
+        checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
+        checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
+    }

-  checkCudaErrors(
-      cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
-  checkCudaErrors(
-      cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
+    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
+    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

-  for (int i = 0; i < 2; ++i) {
-    generateInput(vec[i], size);
-    checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
-                               cudaMemcpyHostToDevice));
-  }
+    for (int i = 0; i < 2; ++i) {
+        generateInput(vec[i], size);
+        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
+    }

-  scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-      devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-  checkCudaErrors(cudaMemcpy(results, devResults,
-                             NUM_OF_BLOCKS * sizeof *results,
-                             cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

-  float result_native = 0;
-  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
-    result_native += results[i];
-  }
-  printf("Result native operators\t: %f \n", result_native);
+    float result_native = 0;
+    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
+        result_native += results[i];
+    }
+    printf("Result native operators\t: %f \n", result_native);

-  scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-      devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-  checkCudaErrors(cudaMemcpy(results, devResults,
-                             NUM_OF_BLOCKS * sizeof *results,
-                             cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

-  float result_intrinsics = 0;
-  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
-    result_intrinsics += results[i];
-  }
-  printf("Result intrinsics\t: %f \n", result_intrinsics);
+    float result_intrinsics = 0;
+    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
+        result_intrinsics += results[i];
+    }
+    printf("Result intrinsics\t: %f \n", result_intrinsics);

-  printf("&&&& fp16ScalarProduct %s\n",
-         (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
-                                                             : "FAILED");
+    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? 
"PASSED" : "FAILED"); - for (int i = 0; i < 2; ++i) { - checkCudaErrors(cudaFree(devVec[i])); - checkCudaErrors(cudaFreeHost(vec[i])); - } + for (int i = 0; i < 2; ++i) { + checkCudaErrors(cudaFree(devVec[i])); + checkCudaErrors(cudaFreeHost(vec[i])); + } - checkCudaErrors(cudaFree(devResults)); - checkCudaErrors(cudaFreeHost(results)); + checkCudaErrors(cudaFree(devResults)); + checkCudaErrors(cudaFreeHost(results)); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/Samples/0_Introduction/matrixMul/matrixMul.cu b/Samples/0_Introduction/matrixMul/matrixMul.cu index 98c4184c..957be0f7 100644 --- a/Samples/0_Introduction/matrixMul/matrixMul.cu +++ b/Samples/0_Introduction/matrixMul/matrixMul.cu @@ -40,314 +40,303 @@ */ // System includes -#include #include +#include // CUDA runtime -#include #include +#include // Helper functions and utilities to work with CUDA -#include #include +#include /** * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ -template __global__ void MatrixMulCUDA(float *C, float *A, - float *B, int wA, - int wB) { - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; +template __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB) +{ + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * by; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * by; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * bx; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * bx; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Load the matrices from device memory - // to shared 
memory; each thread loads - // one element of each matrix - As[ty][tx] = A[a + wA * ty + tx]; - Bs[ty][tx] = B[b + wB * ty + tx]; + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[ty][tx] = A[a + wA * ty + tx]; + Bs[ty][tx] = B[b + wB * ty + tx]; - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[ty][k] * Bs[k][tx]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[ty][k] * Bs[k][tx]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); } - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; - C[c + wB * ty + tx] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; + C[c + wB * ty + tx] = Csub; } -void ConstantInit(float *data, int size, float val) { - for (int i = 0; i < size; ++i) { - data[i] = val; - } +void ConstantInit(float *data, int size, float val) +{ + for (int i = 0; i < size; ++i) { + data[i] = val; + } } /** * Run a simple test of matrix multiplication using CUDA */ -int MatrixMultiply(int argc, char **argv, - int block_size, const dim3 &dimsA, - const dim3 &dimsB) { - // Allocate host memory for matrices A and B - unsigned int size_A = dimsA.x * dimsA.y; - unsigned int mem_size_A = sizeof(float) * size_A; - float *h_A; - checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); - unsigned int size_B = dimsB.x * dimsB.y; - unsigned int mem_size_B = sizeof(float) * size_B; - float *h_B; - checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); - cudaStream_t stream; +int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB) +{ + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A; + checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); + unsigned int size_B = dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B; + checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); + cudaStream_t stream; - // Initialize host memory - const float valB = 0.01f; - ConstantInit(h_A, size_A, 1.0f); - ConstantInit(h_B, size_B, valB); + // Initialize host memory + const float valB = 0.01f; + ConstantInit(h_A, size_A, 1.0f); + ConstantInit(h_B, size_B, valB); - // Allocate device memory - float *d_A, *d_B, *d_C; + // Allocate device memory + float *d_A, *d_B, *d_C; - // Allocate host matrix C - dim3 dimsC(dimsB.x, dimsA.y, 1); - unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); - float *h_C; - checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C; + checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); - if 
(h_C == NULL) {
-    fprintf(stderr, "Failed to allocate host matrix C!\n");
-    exit(EXIT_FAILURE);
-  }
+    if (h_C == NULL) {
+        fprintf(stderr, "Failed to allocate host matrix C!\n");
+        exit(EXIT_FAILURE);
+    }

-  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
-  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
-  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
-  // Allocate CUDA events that we'll use for timing
-  cudaEvent_t start, stop;
-  checkCudaErrors(cudaEventCreate(&start));
-  checkCudaErrors(cudaEventCreate(&stop));
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
+    // Allocate CUDA events that we'll use for timing
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));

-  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

-  // copy host memory to device
-  checkCudaErrors(
-      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(
-      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
+    // copy host memory to device
+    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

-  // Setup execution parameters
-  dim3 threads(block_size, block_size);
-  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
+    // Setup execution parameters
+    dim3 threads(block_size, block_size);
+    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

-  // Create and start timer
-  printf("Computing result using CUDA Kernel...\n");
+    // Create and start timer
+    printf("Computing result using CUDA Kernel...\n");

-  // Performs warmup operation using matrixMul CUDA kernel
-  if (block_size == 16) {
-    MatrixMulCUDA<16>
-        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-  } else {
-    MatrixMulCUDA<32>
-        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-  }
-
-  printf("done\n");
-  checkCudaErrors(cudaStreamSynchronize(stream));
-
-  // Record the start event
-  checkCudaErrors(cudaEventRecord(start, stream));
-
-  // Execute the kernel
-  int nIter = 300;
-
-  for (int j = 0; j < nIter; j++) {
+    // Performs warmup operation using matrixMul CUDA kernel
     if (block_size == 16) {
-      MatrixMulCUDA<16>
-          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-    } else {
-      MatrixMulCUDA<32>
-          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     }
-  }
-
-  // Record the stop event
-  checkCudaErrors(cudaEventRecord(stop, stream));
-
-  // Wait for the stop event to complete
-  checkCudaErrors(cudaEventSynchronize(stop));
-
-  float msecTotal = 0.0f;
-  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
-
-  // Compute and print the performance
-  float msecPerMatrixMul = msecTotal / nIter;
-  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
-                             static_cast<double>(dimsA.y) *
-                             static_cast<double>(dimsB.x);
-  double gigaFlops =
-      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
-  printf(
-      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
-      " WorkgroupSize= %u threads/block\n",
-      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
-
-  // Copy result from device to host
-  checkCudaErrors(
-      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
-  checkCudaErrors(cudaStreamSynchronize(stream));
-
-  printf("Checking computed result for correctness: ");
-  bool correct = true;
-
-  // test relative error by the formula
-  //   |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|> < eps
-  double eps = 1.e-6;  // machine zero
-
-  for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
-    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
-    double dot_length = dimsA.x;
-    double abs_val = fabs(h_C[i]);
-    double rel_err = abs_err / abs_val / dot_length;
-
-    if (rel_err > eps) {
-      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
-             i, h_C[i], dimsA.x * valB, eps);
-      correct = false;
+    else {
+        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     }
-  }
-  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
+    printf("done\n");
+    checkCudaErrors(cudaStreamSynchronize(stream));

-  // Clean up memory
-  checkCudaErrors(cudaFreeHost(h_A));
-  checkCudaErrors(cudaFreeHost(h_B));
-  checkCudaErrors(cudaFreeHost(h_C));
-  checkCudaErrors(cudaFree(d_A));
-  checkCudaErrors(cudaFree(d_B));
-  checkCudaErrors(cudaFree(d_C));
-  checkCudaErrors(cudaEventDestroy(start));
-  checkCudaErrors(cudaEventDestroy(stop));
-  printf(
-      "\nNOTE: The CUDA Samples are not meant for performance "
-      "measurements. Results may vary when GPU Boost is enabled.\n");
+    // Record the start event
+    checkCudaErrors(cudaEventRecord(start, stream));

-  if (correct) {
-    return EXIT_SUCCESS;
-  } else {
-    return EXIT_FAILURE;
-  }
+    // Execute the kernel
+    int nIter = 300;
+
+    for (int j = 0; j < nIter; j++) {
+        if (block_size == 16) {
+            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        }
+        else {
+            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        }
+    }
+
+    // Record the stop event
+    checkCudaErrors(cudaEventRecord(stop, stream));
+
+    // Wait for the stop event to complete
+    checkCudaErrors(cudaEventSynchronize(stop));
+
+    float msecTotal = 0.0f;
+    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
+
+    // Compute and print the performance
+    float msecPerMatrixMul = msecTotal / nIter;
+    double flopsPerMatrixMul =
+        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
+    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
+           " WorkgroupSize= %u threads/block\n",
+           gigaFlops,
+           msecPerMatrixMul,
+           flopsPerMatrixMul,
+           threads.x * threads.y);
+
+    // Copy result from device to host
+    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaStreamSynchronize(stream));
+
+    printf("Checking computed result for correctness: ");
+    bool correct = true;
+
+    // test relative error by the formula
+    //   |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|> < eps
+    double eps = 1.e-6; // machine zero
+
+    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
+        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
+        double dot_length = dimsA.x;
+        double abs_val = fabs(h_C[i]);
+        double rel_err = abs_err / abs_val / dot_length;
+
+        if (rel_err > eps) {
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
+            correct = false;
+        }
+    }
+
+    printf("%s\n", correct ? 
"Result = PASS" : "Result = FAIL"); + + // Clean up memory + checkCudaErrors(cudaFreeHost(h_A)); + checkCudaErrors(cudaFreeHost(h_B)); + checkCudaErrors(cudaFreeHost(h_C)); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + printf("\nNOTE: The CUDA Samples are not meant for performance " + "measurements. Results may vary when GPU Boost is enabled.\n"); + + if (correct) { + return EXIT_SUCCESS; + } + else { + return EXIT_FAILURE; + } } /** * Program main */ -int main(int argc, char **argv) { - printf("[Matrix Multiply Using CUDA] - Starting...\n"); +int main(int argc, char **argv) +{ + printf("[Matrix Multiply Using CUDA] - Starting...\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) { - printf("Usage -device=n (n >= 0 for deviceID)\n"); - printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); - printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); - printf(" Note: Outer matrix dimensions of A & B matrices" \ - " must be equal.\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); + printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); + printf(" Note: Outer matrix dimensions of A & B matrices" + " must be equal.\n"); - exit(EXIT_SUCCESS); - } + exit(EXIT_SUCCESS); + } - // This will pick the best possible CUDA capable device, otherwise - // override the device ID based on input provided at the command line - int dev = findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device, otherwise + // override the device ID based on input provided at the command line + int dev = findCudaDevice(argc, (const char **)argv); - int block_size = 32; + int block_size = 32; - dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); - dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); + dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); + dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); - // width of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { - dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); - } + // width of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { + dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + } - // height of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { - dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); - } + // height of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { + dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + } - // width of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { - dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); - } + // width of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { + dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + } - // height of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { - dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); - } + // height of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { + dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + } - if (dimsA.x != dimsB.y) { - 
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", - dimsA.x, dimsB.y); - exit(EXIT_FAILURE); - } + if (dimsA.x != dimsB.y) { + printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y); + exit(EXIT_FAILURE); + } - printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, - dimsB.x, dimsB.y); + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); - checkCudaErrors(cudaProfilerStart()); - int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB); - checkCudaErrors(cudaProfilerStop()); + checkCudaErrors(cudaProfilerStart()); + int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB); + checkCudaErrors(cudaProfilerStop()); - exit(matrix_result); + exit(matrix_result); } diff --git a/Samples/0_Introduction/matrixMulDrv/matrixMul.h b/Samples/0_Introduction/matrixMulDrv/matrixMul.h index 55e1f013..18cecda4 100644 --- a/Samples/0_Introduction/matrixMulDrv/matrixMul.h +++ b/Samples/0_Introduction/matrixMulDrv/matrixMul.h @@ -30,11 +30,11 @@ // Matrix dimensions // (chosen as multiples of the thread block size for simplicity) -#define WA (4 * block_size) // Matrix A width -#define HA (6 * block_size) // Matrix A height -#define WB (4 * block_size) // Matrix B width -#define HB WA // Matrix B height -#define WC WB // Matrix C width -#define HC HA // Matrix C height +#define WA (4 * block_size) // Matrix A width +#define HA (6 * block_size) // Matrix A height +#define WB (4 * block_size) // Matrix B width +#define HB WA // Matrix B height +#define WC WB // Matrix C width +#define HC HA // Matrix C height -#endif // _MATRIXMUL_H_ +#endif // _MATRIXMUL_H_ diff --git a/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp b/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp index d99b5eba..8f3c83d3 100644 --- a/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp +++ b/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp @@ -46,23 +46,23 @@ // includes, system #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include // includes, project, CUDA +#include #include #include #include #include #include - -#include #include #include + #include "matrixMul.h" @@ -71,11 +71,9 @@ void runTest(int argc, char **argv); void randomInit(float *, int); -extern "C" void computeGold(float *, const float *, const float *, unsigned int, - unsigned int, unsigned int); +extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int); -static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, - int *blk_size); +static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size); #ifndef FATBIN_FILE #define FATBIN_FILE "matrixMul_kernel64.fatbin" @@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, //////////////////////////////////////////////////////////////////////////////// // Globals //////////////////////////////////////////////////////////////////////////////// -CUdevice cuDevice; +CUdevice cuDevice; CUcontext cuContext; -CUmodule cuModule; -size_t totalGlobalMem; +CUmodule cuModule; +size_t totalGlobalMem; const char *sSDKsample = "matrixMulDrv (Driver API)"; -void constantInit(float *data, int size, float val) { - for (int i = 0; i < size; ++i) { - data[i] = val; - } +void constantInit(float *data, int size, float val) +{ + for (int i = 0; i < size; ++i) { + data[i] = val; + } } 
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("[ %s ]\n", sSDKsample); +int main(int argc, char **argv) +{ + printf("[ %s ]\n", sSDKsample); - runTest(argc, argv); + runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - // initialize CUDA - CUfunction matrixMul = NULL; - int block_size = 0; +void runTest(int argc, char **argv) +{ + // initialize CUDA + CUfunction matrixMul = NULL; + int block_size = 0; - initCUDA(argc, argv, &matrixMul, &block_size); + initCUDA(argc, argv, &matrixMul, &block_size); - // set seed for rand() - srand(2006); + // set seed for rand() + srand(2006); - // allocate host memory for matrices A and B - unsigned int size_A = WA * HA; - unsigned int mem_size_A = sizeof(float) * size_A; - float *h_A = reinterpret_cast(malloc(mem_size_A)); - unsigned int size_B = WB * HB; - unsigned int mem_size_B = sizeof(float) * size_B; - float *h_B = reinterpret_cast(malloc(mem_size_B)); + // allocate host memory for matrices A and B + unsigned int size_A = WA * HA; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = reinterpret_cast(malloc(mem_size_A)); + unsigned int size_B = WB * HB; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = reinterpret_cast(malloc(mem_size_B)); - // initialize host memory - const float valB = 0.01f; - constantInit(h_A, size_A, 1.0f); - constantInit(h_B, size_B, valB); + // initialize host memory + const float valB = 0.01f; + constantInit(h_A, size_A, 1.0f); + constantInit(h_B, size_B, valB); - // allocate device memory - CUdeviceptr d_A; - checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); - CUdeviceptr d_B; - checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); + // allocate device memory + CUdeviceptr d_A; + checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); + CUdeviceptr d_B; + checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); - // copy host memory to device - checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); - checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); + // copy host memory to device + checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); + checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); - // allocate device memory for result - size_t size_C = WC * HC; - size_t mem_size_C = sizeof(float) * size_C; + // allocate device memory for result + size_t size_C = WC * HC; + size_t mem_size_C = sizeof(float) * size_C; - CUdeviceptr d_C; - checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); + CUdeviceptr d_C; + checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); - // allocate mem for the result on host side - float *h_C = reinterpret_cast(malloc(mem_size_C)); + // allocate mem for the result on host side + float *h_C = reinterpret_cast(malloc(mem_size_C)); - // create and start timer - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + // create and start timer + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // start the timer - sdkStartTimer(&timer); + // start the timer + sdkStartTimer(&timer); - // There are two ways to launch CUDA kernels via the Driver API. - // In this CUDA Sample, we illustrate both ways to pass parameters - // and specify parameters. By default we use the simpler method. 
-  dim3 block(block_size, block_size, 1);
-  dim3 grid(WC / block_size, HC / block_size, 1);
+    // There are two ways to launch CUDA kernels via the Driver API.
+    // In this CUDA Sample, we illustrate both ways to pass parameters
+    // and specify parameters. By default we use the simpler method.
+    dim3 block(block_size, block_size, 1);
+    dim3 grid(WC / block_size, HC / block_size, 1);

-  if (1) {
-    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
-    // Launching (simplier method)
-    size_t Matrix_Width_A = (size_t)WA;
-    size_t Matrix_Width_B = (size_t)WB;
-    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
-    // new CUDA 4.0 Driver API Kernel launch call
-    checkCudaErrors(cuLaunchKernel(
-        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
-        2 * block_size * block_size * sizeof(float), NULL, args, NULL));
-  } else {
-    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
-    // Launching (advanced method)
-    int offset = 0;
-    char argBuffer[256];
-
-    // pass in launch parameters (not actually de-referencing CUdeviceptr).
-    // CUdeviceptr is storing the value of the parameters
-    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
-    offset += sizeof(d_C);
-    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
-    offset += sizeof(d_A);
-    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
-    offset += sizeof(d_B);
-
-    size_t Matrix_Width_A = (size_t)WA;
-    size_t Matrix_Width_B = (size_t)WB;
-
-    *(reinterpret_cast<size_t *>(&argBuffer[offset])) = Matrix_Width_A;
-    offset += sizeof(Matrix_Width_A);
-    *(reinterpret_cast<size_t *>(&argBuffer[offset])) = Matrix_Width_B;
-    offset += sizeof(Matrix_Width_B);
-
-    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
-                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
-                                     CU_LAUNCH_PARAM_END};
-
-    // new CUDA 4.0 Driver API Kernel launch call
-    checkCudaErrors(cuLaunchKernel(
-        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
-        2 * block_size * block_size * sizeof(float), NULL, NULL,
-        reinterpret_cast<void **>(&kernel_launch_config)));
-  }
-
-  // copy result from device to host
-  checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
-
-  // stop and destroy timer
-  sdkStopTimer(&timer);
-  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
-  sdkDeleteTimer(&timer);
-
-  printf("Checking computed result for correctness: ");
-  bool correct = true;
-
-  for (int i = 0; i < static_cast<int>(WC * HC); i++) {
-    if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
-      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
-             h_C[i], WA * valB);
-      correct = false;
+    if (1) {
+        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
+        // Launching (simpler method)
+        size_t Matrix_Width_A = (size_t)WA;
+        size_t Matrix_Width_B = (size_t)WB;
+        void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
+        // new CUDA 4.0 Driver API Kernel launch call
+        checkCudaErrors(cuLaunchKernel(matrixMul,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z,
+                                       block.x,
+                                       block.y,
+                                       block.z,
+                                       2 * block_size * block_size * sizeof(float),
+                                       NULL,
+                                       args,
+                                       NULL));
    }
-  }
+    else {
+        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
+        // Launching (advanced method)
+        int offset = 0;
+        char argBuffer[256];

-  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
+        // pass in launch parameters (not actually de-referencing CUdeviceptr).
+        // CUdeviceptr is storing the value of the parameters
+        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
+        offset += sizeof(d_C);
+        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
+        offset += sizeof(d_A);
+        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
+        offset += sizeof(d_B);

-  printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
-         "Results may vary when GPU Boost is enabled.\n");
+        size_t Matrix_Width_A = (size_t)WA;
+        size_t Matrix_Width_B = (size_t)WB;

-  // clean up memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  checkCudaErrors(cuMemFree(d_A));
-  checkCudaErrors(cuMemFree(d_B));
-  checkCudaErrors(cuMemFree(d_C));
-  checkCudaErrors(cuCtxDestroy(cuContext));
+        *(reinterpret_cast<size_t *>(&argBuffer[offset])) = Matrix_Width_A;
+        offset += sizeof(Matrix_Width_A);
+        *(reinterpret_cast<size_t *>(&argBuffer[offset])) = Matrix_Width_B;
+        offset += sizeof(Matrix_Width_B);
+
+        void *kernel_launch_config[5] = {
+            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
+
+        // new CUDA 4.0 Driver API Kernel launch call
+        checkCudaErrors(cuLaunchKernel(matrixMul,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z,
+                                       block.x,
+                                       block.y,
+                                       block.z,
+                                       2 * block_size * block_size * sizeof(float),
+                                       NULL,
+                                       NULL,
+                                       reinterpret_cast<void **>(&kernel_launch_config)));
+    }
+
+    // copy result from device to host
+    checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
+
+    // stop and destroy timer
+    sdkStopTimer(&timer);
+    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
+    sdkDeleteTimer(&timer);
+
+    printf("Checking computed result for correctness: ");
+    bool correct = true;
+
+    for (int i = 0; i < static_cast<int>(WC * HC); i++) {
+        if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
+            correct = false;
+        }
+    }
+
+    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
+
+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n");
+
+    // clean up memory
+    free(h_A);
+    free(h_B);
+    free(h_C);
+    checkCudaErrors(cuMemFree(d_A));
+    checkCudaErrors(cuMemFree(d_B));
+    checkCudaErrors(cuMemFree(d_C));
+    checkCudaErrors(cuCtxDestroy(cuContext));
}

// Allocates a matrix with random float entries.
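The check above compares every element against the analytic value WA * valB (A is all ones, B is all valB, so each element is a dot product of WA terms of 1.0f * valB) with a fixed absolute tolerance of 1e-5. As a hedged aside, not part of the sample: the same test can be phrased as a relative error so the threshold stays meaningful if the dot-product length grows, which is the approach the runtime-API matrixMul sample reformatted earlier in this patch takes. All values below are hypothetical stand-ins:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int   widthA   = 128;        // stand-in for WA (the dot-product length)
        const float valB     = 0.01f;      // as in runTest above
        const float ref      = widthA * valB;
        const float computed = 1.2799999f; // pretend kernel output for one element

        // scale the error by the reference magnitude and by the number of
        // accumulated terms, then compare against a machine-epsilon-scale bound
        const float eps    = 1.0e-6f;
        const float relErr = std::fabs(computed - ref) / (std::fabs(ref) * widthA);
        printf("rel err = %e -> %s\n", relErr, relErr < eps ? "PASS" : "FAIL");
        return 0;
    }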
-void randomInit(float *data, int size) {
-  for (int i = 0; i < size; ++i) {
-    data[i] = rand() / static_cast<float>(RAND_MAX);
-  }
-}
-
-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size) {
-  CUfunction cuFunction = 0;
-  int major = 0, minor = 0;
-  char deviceName[100];
-
-  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
-
-  // get compute capabilities and the devicename
-  checkCudaErrors(cuDeviceGetAttribute(
-      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-  checkCudaErrors(cuDeviceGetAttribute(
-      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
-  checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
-  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
-
-  checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
-  printf(" Total amount of global memory: %llu bytes\n",
-         (long long unsigned int)totalGlobalMem);
-
-  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
-
-  // first search for the module path before we load the results
-  std::string module_path;
-  std::ostringstream fatbin;
-
-  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
-    exit(EXIT_FAILURE);
-  } else {
-    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
-  }
-
-  if (!fatbin.str().size()) {
-    printf("fatbin file empty. exiting..\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // Create module from binary file (FATBIN)
-  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
-
-  // select the suitable kernel function
-  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
-                           "matrixMul_bs8_64bit"};
-
-  int idx = 0;
-  int block_size = 32;
-  while (idx < 3) {
-    int threadsPerBlock = 0;
-    int blocksPerGrid = 0;
-
-    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
-    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
-        &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
-        2 * block_size * block_size * sizeof(float), 0));
-    if (block_size * block_size <= threadsPerBlock) {
-      printf("> %d block size selected\n", block_size);
-      break;
-    } else {
-      block_size /= 2;
+void randomInit(float *data, int size)
+{
+    for (int i = 0; i < size; ++i) {
+        data[i] = rand() / static_cast<float>(RAND_MAX);
    }
-    idx++;
-  }
-
-  *pMatrixMul = cuFunction;
-  *blk_size = block_size;
-
-  return 0;
+}
+
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
+{
+    CUfunction cuFunction = 0;
+    int major = 0, minor = 0;
+    char deviceName[100];
+
+    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
+
+    // get compute capabilities and the device name
+    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
+    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
+
+    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
+    printf(" Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem);
+
+    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
+
+    // first search for the module path before we load the results
+    std::string module_path;
+    std::ostringstream fatbin;
+
+    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
+        exit(EXIT_FAILURE);
+    }
+    else {
+        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
+    }
+
+    if (!fatbin.str().size()) {
+ printf("fatbin file empty. exiting..\n"); + exit(EXIT_FAILURE); + } + + // Create module from binary file (FATBIN) + checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); + + // select the suitable kernel function + const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"}; + + int idx = 0; + int block_size = 32; + while (idx < 3) { + int threadsPerBlock = 0; + int blocksPerGrid = 0; + + checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx])); + checkCudaErrors(cuOccupancyMaxPotentialBlockSize( + &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0)); + if (block_size * block_size <= threadsPerBlock) { + printf("> %d block size selected\n", block_size); + break; + } + else { + block_size /= 2; + } + idx++; + } + + *pMatrixMul = cuFunction; + *blk_size = block_size; + + return 0; } diff --git a/Samples/0_Introduction/matrixMulDrv/matrixMul_kernel.cu b/Samples/0_Introduction/matrixMulDrv/matrixMul_kernel.cu index cc29cb1d..3aba650a 100644 --- a/Samples/0_Introduction/matrixMulDrv/matrixMul_kernel.cu +++ b/Samples/0_Introduction/matrixMulDrv/matrixMul_kernel.cu @@ -42,86 +42,87 @@ //! wA is A's width and wB is B's width //////////////////////////////////////////////////////////////////////////////// template -__device__ void matrixMul(float *C, float *A, float *B, size_type wA, - size_type wB) { - // Block index - size_type bx = blockIdx.x; - size_type by = blockIdx.y; +__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB) +{ + // Block index + size_type bx = blockIdx.x; + size_type by = blockIdx.y; - // Thread index - size_type tx = threadIdx.x; - size_type ty = threadIdx.y; + // Thread index + size_type tx = threadIdx.x; + size_type ty = threadIdx.y; - // Index of the first sub-matrix of A processed by the block - size_type aBegin = wA * block_size * by; + // Index of the first sub-matrix of A processed by the block + size_type aBegin = wA * block_size * by; - // Index of the last sub-matrix of A processed by the block - size_type aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + size_type aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - size_type aStep = block_size; + // Step size used to iterate through the sub-matrices of A + size_type aStep = block_size; - // Index of the first sub-matrix of B processed by the block - size_type bBegin = block_size * bx; + // Index of the first sub-matrix of B processed by the block + size_type bBegin = block_size * bx; - // Step size used to iterate through the sub-matrices of B - size_type bStep = block_size * wB; + // Step size used to iterate through the sub-matrices of B + size_type bStep = block_size * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[block_size][block_size]; + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Declaration of the shared 
memory array As used to + // store the sub-matrix of A + __shared__ float As[block_size][block_size]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[block_size][block_size]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[block_size][block_size]; - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - AS(ty, tx) = A[a + wA * ty + tx]; - BS(ty, tx) = B[b + wB * ty + tx]; + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + AS(ty, tx) = A[a + wA * ty + tx]; + BS(ty, tx) = B[b + wB * ty + tx]; - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix #pragma unroll - for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx); + for (size_type k = 0; k < block_size; ++k) + Csub += AS(ty, k) * BS(k, tx); - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - } + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } - // Write the block sub-matrix to device memory; - // each thread writes one element - size_type c = wB * block_size * by + block_size * bx; - C[c + wB * ty + tx] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes one element + size_type c = wB * block_size * by + block_size * bx; + C[c + wB * ty + tx] = Csub; } // C wrappers around our template kernel -extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, - size_t wA, size_t wB) { - matrixMul<8, size_t>(C, A, B, wA, wB); +extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB) +{ + matrixMul<8, size_t>(C, A, B, wA, wB); } -extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, - size_t wA, size_t wB) { - matrixMul<16, size_t>(C, A, B, wA, wB); +extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB) +{ + matrixMul<16, size_t>(C, A, B, wA, wB); } -extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, - size_t wA, size_t wB) { - matrixMul<32, size_t>(C, A, B, wA, wB); +extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB) +{ + matrixMul<32, size_t>(C, A, B, wA, wB); } -#endif // #ifndef _MATRIXMUL_KERNEL_H_ +#endif // #ifndef _MATRIXMUL_KERNEL_H_ diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c index c62f9a77..8adc4d87 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c @@ -15,210 +15,211 @@ // With these flags defined, this source file will dynamically // load the corresponding functions. Disabled by default. 
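Before the reformatted declarations that follow: this file's whole job is to load the CUDA driver library at runtime and resolve every cuXxx entry point into a like-named global function pointer, which is what the long table of tcuXxx declarations below is for. A minimal sketch of the technique on the POSIX side (the real code also covers Windows, optional symbols, and the _v2/_v3 suffix variants through its GET_PROC_* macros); the library SONAME and the simplified return type are assumptions of the sketch:

    #include <dlfcn.h>
    #include <stdio.h>

    // CUresult simplified to int for the sketch
    typedef int tcuDriverGetVersion(int *driverVersion);
    static tcuDriverGetVersion *pcuDriverGetVersion = NULL;

    int main(void)
    {
        void *lib = dlopen("libcuda.so.1", RTLD_NOW); // assumed library name
        if (lib == NULL) {
            printf("dlopen failed: %s\n", dlerror());
            return 1;
        }

        // one dlsym per entry point; the GET_PROC_* macros below boil down to this
        pcuDriverGetVersion = (tcuDriverGetVersion *)dlsym(lib, "cuDriverGetVersion");
        if (pcuDriverGetVersion == NULL) {
            printf("required symbol missing\n");
            return 1;
        }

        int version = 0;
        pcuDriverGetVersion(&version); // cuDriverGetVersion needs no prior cuInit
        printf("driver reports version %d\n", version);
        return 0;
    }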
-//#define CUDA_INIT_D3D9 -//#define CUDA_INIT_D3D10 -//#define CUDA_INIT_D3D11 -//#define CUDA_INIT_OPENGL +// #define CUDA_INIT_D3D9 +// #define CUDA_INIT_D3D10 +// #define CUDA_INIT_D3D11 +// #define CUDA_INIT_OPENGL -#include #include "cuda_drvapi_dynlink.h" -tcuInit *_cuInit; -tcuDriverGetVersion *cuDriverGetVersion; -tcuDeviceGet *cuDeviceGet; -tcuDeviceGetCount *cuDeviceGetCount; -tcuDeviceGetName *cuDeviceGetName; -tcuDeviceComputeCapability *cuDeviceComputeCapability; -tcuDeviceTotalMem *cuDeviceTotalMem; -tcuDeviceGetProperties *cuDeviceGetProperties; -tcuDeviceGetAttribute *cuDeviceGetAttribute; -tcuGetErrorString *cuGetErrorString; -tcuCtxCreate *cuCtxCreate; -tcuCtxDestroy *cuCtxDestroy; -tcuCtxAttach *cuCtxAttach; -tcuCtxDetach *cuCtxDetach; -tcuCtxPushCurrent *cuCtxPushCurrent; -tcuCtxPopCurrent *cuCtxPopCurrent; -tcuCtxGetCurrent *cuCtxGetCurrent; -tcuCtxSetCurrent *cuCtxSetCurrent; -tcuCtxGetDevice *cuCtxGetDevice; -tcuCtxSynchronize *cuCtxSynchronize; -tcuModuleLoad *cuModuleLoad; -tcuModuleLoadData *cuModuleLoadData; -tcuModuleLoadDataEx *cuModuleLoadDataEx; -tcuModuleLoadFatBinary *cuModuleLoadFatBinary; -tcuModuleUnload *cuModuleUnload; -tcuModuleGetFunction *cuModuleGetFunction; -tcuModuleGetGlobal *cuModuleGetGlobal; -tcuModuleGetTexRef *cuModuleGetTexRef; -tcuModuleGetSurfRef *cuModuleGetSurfRef; -tcuMemGetInfo *cuMemGetInfo; -tcuMemAlloc *cuMemAlloc; -tcuMemAllocPitch *cuMemAllocPitch; -tcuMemFree *cuMemFree; -tcuMemGetAddressRange *cuMemGetAddressRange; -tcuMemAllocHost *cuMemAllocHost; -tcuMemFreeHost *cuMemFreeHost; -tcuMemHostAlloc *cuMemHostAlloc; -tcuMemHostGetFlags *cuMemHostGetFlags; +#include -tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer; -tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId; -tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId; -tcuIpcGetEventHandle *cuIpcGetEventHandle; -tcuIpcOpenEventHandle *cuIpcOpenEventHandle; -tcuIpcGetMemHandle *cuIpcGetMemHandle; -tcuIpcOpenMemHandle *cuIpcOpenMemHandle; -tcuIpcCloseMemHandle *cuIpcCloseMemHandle; +tcuInit *_cuInit; +tcuDriverGetVersion *cuDriverGetVersion; +tcuDeviceGet *cuDeviceGet; +tcuDeviceGetCount *cuDeviceGetCount; +tcuDeviceGetName *cuDeviceGetName; +tcuDeviceComputeCapability *cuDeviceComputeCapability; +tcuDeviceTotalMem *cuDeviceTotalMem; +tcuDeviceGetProperties *cuDeviceGetProperties; +tcuDeviceGetAttribute *cuDeviceGetAttribute; +tcuGetErrorString *cuGetErrorString; +tcuCtxCreate *cuCtxCreate; +tcuCtxDestroy *cuCtxDestroy; +tcuCtxAttach *cuCtxAttach; +tcuCtxDetach *cuCtxDetach; +tcuCtxPushCurrent *cuCtxPushCurrent; +tcuCtxPopCurrent *cuCtxPopCurrent; +tcuCtxGetCurrent *cuCtxGetCurrent; +tcuCtxSetCurrent *cuCtxSetCurrent; +tcuCtxGetDevice *cuCtxGetDevice; +tcuCtxSynchronize *cuCtxSynchronize; +tcuModuleLoad *cuModuleLoad; +tcuModuleLoadData *cuModuleLoadData; +tcuModuleLoadDataEx *cuModuleLoadDataEx; +tcuModuleLoadFatBinary *cuModuleLoadFatBinary; +tcuModuleUnload *cuModuleUnload; +tcuModuleGetFunction *cuModuleGetFunction; +tcuModuleGetGlobal *cuModuleGetGlobal; +tcuModuleGetTexRef *cuModuleGetTexRef; +tcuModuleGetSurfRef *cuModuleGetSurfRef; +tcuMemGetInfo *cuMemGetInfo; +tcuMemAlloc *cuMemAlloc; +tcuMemAllocPitch *cuMemAllocPitch; +tcuMemFree *cuMemFree; +tcuMemGetAddressRange *cuMemGetAddressRange; +tcuMemAllocHost *cuMemAllocHost; +tcuMemFreeHost *cuMemFreeHost; +tcuMemHostAlloc *cuMemHostAlloc; +tcuMemHostGetFlags *cuMemHostGetFlags; -tcuMemHostRegister *cuMemHostRegister; -tcuMemHostUnregister *cuMemHostUnregister; -tcuMemcpyHtoD *cuMemcpyHtoD; -tcuMemcpyDtoH *cuMemcpyDtoH; 
-tcuMemcpyDtoD *cuMemcpyDtoD; -tcuMemcpyDtoA *cuMemcpyDtoA; -tcuMemcpyAtoD *cuMemcpyAtoD; -tcuMemcpyHtoA *cuMemcpyHtoA; -tcuMemcpyAtoH *cuMemcpyAtoH; -tcuMemcpyAtoA *cuMemcpyAtoA; -tcuMemcpy2D *cuMemcpy2D; -tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned; -tcuMemcpy3D *cuMemcpy3D; -tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync; -tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync; -tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync; -tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync; -tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync; -tcuMemcpy2DAsync *cuMemcpy2DAsync; -tcuMemcpy3DAsync *cuMemcpy3DAsync; -tcuMemcpy *cuMemcpy; -tcuMemcpyPeer *cuMemcpyPeer; -tcuMemsetD8 *cuMemsetD8; -tcuMemsetD16 *cuMemsetD16; -tcuMemsetD32 *cuMemsetD32; -tcuMemsetD2D8 *cuMemsetD2D8; -tcuMemsetD2D16 *cuMemsetD2D16; -tcuMemsetD2D32 *cuMemsetD2D32; -tcuFuncSetBlockShape *cuFuncSetBlockShape; -tcuFuncSetSharedSize *cuFuncSetSharedSize; -tcuFuncGetAttribute *cuFuncGetAttribute; -tcuFuncSetCacheConfig *cuFuncSetCacheConfig; -tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig; -tcuLaunchKernel *cuLaunchKernel; -tcuArrayCreate *cuArrayCreate; -tcuArrayGetDescriptor *cuArrayGetDescriptor; -tcuArrayDestroy *cuArrayDestroy; -tcuArray3DCreate *cuArray3DCreate; -tcuArray3DGetDescriptor *cuArray3DGetDescriptor; -tcuTexRefCreate *cuTexRefCreate; -tcuTexRefDestroy *cuTexRefDestroy; -tcuTexRefSetArray *cuTexRefSetArray; -tcuTexRefSetAddress *cuTexRefSetAddress; -tcuTexRefSetAddress2D *cuTexRefSetAddress2D; -tcuTexRefSetFormat *cuTexRefSetFormat; -tcuTexRefSetAddressMode *cuTexRefSetAddressMode; -tcuTexRefSetFilterMode *cuTexRefSetFilterMode; -tcuTexRefSetFlags *cuTexRefSetFlags; -tcuTexRefGetAddress *cuTexRefGetAddress; -tcuTexRefGetArray *cuTexRefGetArray; -tcuTexRefGetAddressMode *cuTexRefGetAddressMode; -tcuTexRefGetFilterMode *cuTexRefGetFilterMode; -tcuTexRefGetFormat *cuTexRefGetFormat; -tcuTexRefGetFlags *cuTexRefGetFlags; -tcuSurfRefSetArray *cuSurfRefSetArray; -tcuSurfRefGetArray *cuSurfRefGetArray; -tcuParamSetSize *cuParamSetSize; -tcuParamSeti *cuParamSeti; -tcuParamSetf *cuParamSetf; -tcuParamSetv *cuParamSetv; -tcuParamSetTexRef *cuParamSetTexRef; -tcuLaunch *cuLaunch; -tcuLaunchGrid *cuLaunchGrid; -tcuLaunchGridAsync *cuLaunchGridAsync; -tcuEventCreate *cuEventCreate; -tcuEventRecord *cuEventRecord; -tcuEventQuery *cuEventQuery; -tcuEventSynchronize *cuEventSynchronize; -tcuEventDestroy *cuEventDestroy; -tcuEventElapsedTime *cuEventElapsedTime; -tcuStreamCreate *cuStreamCreate; -tcuStreamWaitEvent *cuStreamWaitEvent; -tcuStreamAddCallback *cuStreamAddCallback; -tcuStreamQuery *cuStreamQuery; -tcuStreamSynchronize *cuStreamSynchronize; -tcuStreamDestroy *cuStreamDestroy; -tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource; -tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray; -tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer; -tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags; -tcuGraphicsMapResources *cuGraphicsMapResources; -tcuGraphicsUnmapResources *cuGraphicsUnmapResources; -tcuGetExportTable *cuGetExportTable; -tcuCtxSetLimit *cuCtxSetLimit; -tcuCtxGetLimit *cuCtxGetLimit; -tcuCtxGetCacheConfig *cuCtxGetCacheConfig; -tcuCtxSetCacheConfig *cuCtxSetCacheConfig; -tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig; -tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig; -tcuCtxGetApiVersion *cuCtxGetApiVersion; +tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer; +tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId; +tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId; +tcuIpcGetEventHandle *cuIpcGetEventHandle; 
+tcuIpcOpenEventHandle *cuIpcOpenEventHandle; +tcuIpcGetMemHandle *cuIpcGetMemHandle; +tcuIpcOpenMemHandle *cuIpcOpenMemHandle; +tcuIpcCloseMemHandle *cuIpcCloseMemHandle; -tcuMipmappedArrayCreate *cuMipmappedArrayCreate; -tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel; -tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy; +tcuMemHostRegister *cuMemHostRegister; +tcuMemHostUnregister *cuMemHostUnregister; +tcuMemcpyHtoD *cuMemcpyHtoD; +tcuMemcpyDtoH *cuMemcpyDtoH; +tcuMemcpyDtoD *cuMemcpyDtoD; +tcuMemcpyDtoA *cuMemcpyDtoA; +tcuMemcpyAtoD *cuMemcpyAtoD; +tcuMemcpyHtoA *cuMemcpyHtoA; +tcuMemcpyAtoH *cuMemcpyAtoH; +tcuMemcpyAtoA *cuMemcpyAtoA; +tcuMemcpy2D *cuMemcpy2D; +tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned; +tcuMemcpy3D *cuMemcpy3D; +tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync; +tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync; +tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync; +tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync; +tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync; +tcuMemcpy2DAsync *cuMemcpy2DAsync; +tcuMemcpy3DAsync *cuMemcpy3DAsync; +tcuMemcpy *cuMemcpy; +tcuMemcpyPeer *cuMemcpyPeer; +tcuMemsetD8 *cuMemsetD8; +tcuMemsetD16 *cuMemsetD16; +tcuMemsetD32 *cuMemsetD32; +tcuMemsetD2D8 *cuMemsetD2D8; +tcuMemsetD2D16 *cuMemsetD2D16; +tcuMemsetD2D32 *cuMemsetD2D32; +tcuFuncSetBlockShape *cuFuncSetBlockShape; +tcuFuncSetSharedSize *cuFuncSetSharedSize; +tcuFuncGetAttribute *cuFuncGetAttribute; +tcuFuncSetCacheConfig *cuFuncSetCacheConfig; +tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig; +tcuLaunchKernel *cuLaunchKernel; +tcuArrayCreate *cuArrayCreate; +tcuArrayGetDescriptor *cuArrayGetDescriptor; +tcuArrayDestroy *cuArrayDestroy; +tcuArray3DCreate *cuArray3DCreate; +tcuArray3DGetDescriptor *cuArray3DGetDescriptor; +tcuTexRefCreate *cuTexRefCreate; +tcuTexRefDestroy *cuTexRefDestroy; +tcuTexRefSetArray *cuTexRefSetArray; +tcuTexRefSetAddress *cuTexRefSetAddress; +tcuTexRefSetAddress2D *cuTexRefSetAddress2D; +tcuTexRefSetFormat *cuTexRefSetFormat; +tcuTexRefSetAddressMode *cuTexRefSetAddressMode; +tcuTexRefSetFilterMode *cuTexRefSetFilterMode; +tcuTexRefSetFlags *cuTexRefSetFlags; +tcuTexRefGetAddress *cuTexRefGetAddress; +tcuTexRefGetArray *cuTexRefGetArray; +tcuTexRefGetAddressMode *cuTexRefGetAddressMode; +tcuTexRefGetFilterMode *cuTexRefGetFilterMode; +tcuTexRefGetFormat *cuTexRefGetFormat; +tcuTexRefGetFlags *cuTexRefGetFlags; +tcuSurfRefSetArray *cuSurfRefSetArray; +tcuSurfRefGetArray *cuSurfRefGetArray; +tcuParamSetSize *cuParamSetSize; +tcuParamSeti *cuParamSeti; +tcuParamSetf *cuParamSetf; +tcuParamSetv *cuParamSetv; +tcuParamSetTexRef *cuParamSetTexRef; +tcuLaunch *cuLaunch; +tcuLaunchGrid *cuLaunchGrid; +tcuLaunchGridAsync *cuLaunchGridAsync; +tcuEventCreate *cuEventCreate; +tcuEventRecord *cuEventRecord; +tcuEventQuery *cuEventQuery; +tcuEventSynchronize *cuEventSynchronize; +tcuEventDestroy *cuEventDestroy; +tcuEventElapsedTime *cuEventElapsedTime; +tcuStreamCreate *cuStreamCreate; +tcuStreamWaitEvent *cuStreamWaitEvent; +tcuStreamAddCallback *cuStreamAddCallback; +tcuStreamQuery *cuStreamQuery; +tcuStreamSynchronize *cuStreamSynchronize; +tcuStreamDestroy *cuStreamDestroy; +tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource; +tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray; +tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer; +tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags; +tcuGraphicsMapResources *cuGraphicsMapResources; +tcuGraphicsUnmapResources *cuGraphicsUnmapResources; +tcuGetExportTable *cuGetExportTable; +tcuCtxSetLimit 
*cuCtxSetLimit; +tcuCtxGetLimit *cuCtxGetLimit; +tcuCtxGetCacheConfig *cuCtxGetCacheConfig; +tcuCtxSetCacheConfig *cuCtxSetCacheConfig; +tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig; +tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig; +tcuCtxGetApiVersion *cuCtxGetApiVersion; -tcuProfilerStop *cuProfilerStop; +tcuMipmappedArrayCreate *cuMipmappedArrayCreate; +tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel; +tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy; + +tcuProfilerStop *cuProfilerStop; #ifdef CUDA_INIT_D3D9 // D3D9/CUDA interop (CUDA 1.x compatible API). These functions // are deprecated; please use the ones below -tcuD3D9Begin *cuD3D9Begin; -tcuD3D9End *cuD3DEnd; -tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer; -tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer; -tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer; -tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer; +tcuD3D9Begin *cuD3D9Begin; +tcuD3D9End *cuD3DEnd; +tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer; +tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer; +tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer; +tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer; // D3D9/CUDA interop (CUDA 2.x compatible) -tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice; -tcuD3D9RegisterResource *cuD3D9RegisterResource; -tcuD3D9UnregisterResource *cuD3D9UnregisterResource; -tcuD3D9MapResources *cuD3D9MapResources; -tcuD3D9UnmapResources *cuD3D9UnmapResources; -tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags; -tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions; -tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray; -tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer; -tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize; -tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch; +tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice; +tcuD3D9RegisterResource *cuD3D9RegisterResource; +tcuD3D9UnregisterResource *cuD3D9UnregisterResource; +tcuD3D9MapResources *cuD3D9MapResources; +tcuD3D9UnmapResources *cuD3D9UnmapResources; +tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags; +tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions; +tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray; +tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer; +tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize; +tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch; // D3D9/CUDA interop (CUDA 2.0+) -tcuD3D9GetDevice *cuD3D9GetDevice; -tcuD3D9CtxCreate *cuD3D9CtxCreate; -tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource; +tcuD3D9GetDevice *cuD3D9GetDevice; +tcuD3D9CtxCreate *cuD3D9CtxCreate; +tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource; #endif #ifdef CUDA_INIT_D3D10 // D3D10/CUDA interop (CUDA 3.0+) -tcuD3D10GetDevice *cuD3D10GetDevice; -tcuD3D10CtxCreate *cuD3D10CtxCreate; -tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource; +tcuD3D10GetDevice *cuD3D10GetDevice; +tcuD3D10CtxCreate *cuD3D10CtxCreate; +tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource; #endif #ifdef CUDA_INIT_D3D11 // D3D11/CUDA interop (CUDA 3.0+) -tcuD3D11GetDevice *cuD3D11GetDevice; -tcuD3D11CtxCreate *cuD3D11CtxCreate; -tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource; +tcuD3D11GetDevice *cuD3D11GetDevice; +tcuD3D11CtxCreate *cuD3D11CtxCreate; +tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource; #endif // GL/CUDA interop #ifdef CUDA_INIT_OPENGL -tcuGLCtxCreate 
*cuGLCtxCreate; -tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer; -tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage; +tcuGLCtxCreate *cuGLCtxCreate; +tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer; +tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -tcuWGLGetDevice *cuWGLGetDevice; +tcuWGLGetDevice *cuWGLGetDevice; #endif #endif @@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) { *pInstance = LoadLibrary(__CudaLibName); - if (*pInstance == NULL) - { + if (*pInstance == NULL) { printf("LoadLibrary \"%s\" failed!\n", __CudaLibName); return CUDA_ERROR_UNKNOWN; } @@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) return CUDA_SUCCESS; } -#define GET_PROC_EX(name, alias, required) \ - alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \ - if (alias == NULL && required) { \ - printf("Failed to find required function \"%s\" in %s\n", \ - #name, __CudaLibName); \ - return CUDA_ERROR_UNKNOWN; \ +#define GET_PROC_EX(name, alias, required) \ + alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \ + if (alias == NULL && required) { \ + printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \ + return CUDA_ERROR_UNKNOWN; \ } -#define GET_PROC_EX_V2(name, alias, required) \ - alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\ - if (alias == NULL && required) { \ - printf("Failed to find required function \"%s\" in %s\n", \ - STRINGIFY(name##_v2), __CudaLibName); \ - return CUDA_ERROR_UNKNOWN; \ +#define GET_PROC_EX_V2(name, alias, required) \ + alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2)); \ + if (alias == NULL && required) { \ + printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \ + return CUDA_ERROR_UNKNOWN; \ } -#define GET_PROC_EX_V3(name, alias, required) \ - alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\ - if (alias == NULL && required) { \ - printf("Failed to find required function \"%s\" in %s\n", \ - STRINGIFY(name##_v3), __CudaLibName); \ - return CUDA_ERROR_UNKNOWN; \ +#define GET_PROC_EX_V3(name, alias, required) \ + alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3)); \ + if (alias == NULL && required) { \ + printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \ + return CUDA_ERROR_UNKNOWN; \ } -#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX) +#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX) #include #if defined(__APPLE__) || defined(__MACOSX) static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib"; #elif defined(__ANDROID__) -#if defined (__aarch64__) +#if defined(__aarch64__) static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so"; #elif defined(__arm__) static char __CudaLibName[] = "/system/vendor/lib/libcuda.so"; @@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) { *pInstance = dlopen(__CudaLibName, RTLD_NOW); - if (*pInstance == NULL) - { + if (*pInstance == NULL) { printf("dlopen \"%s\" failed!\n", __CudaLibName); return CUDA_ERROR_UNKNOWN; } @@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) return CUDA_SUCCESS; } -#define GET_PROC_EX(name, alias, required) \ - alias = (t##name *)dlsym(CudaDrvLib, #name); \ - if (alias == NULL && required) { \ - printf("Failed to find required function \"%s\" in %s\n", \ - 
#name, __CudaLibName); \ - return CUDA_ERROR_UNKNOWN; \ +#define GET_PROC_EX(name, alias, required) \ + alias = (t##name *)dlsym(CudaDrvLib, #name); \ + if (alias == NULL && required) { \ + printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \ + return CUDA_ERROR_UNKNOWN; \ } -#define GET_PROC_EX_V2(name, alias, required) \ - alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \ - if (alias == NULL && required) { \ - printf("Failed to find required function \"%s\" in %s\n", \ - STRINGIFY(name##_v2), __CudaLibName); \ - return CUDA_ERROR_UNKNOWN; \ +#define GET_PROC_EX_V2(name, alias, required) \ + alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \ + if (alias == NULL && required) { \ + printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \ + return CUDA_ERROR_UNKNOWN; \ } -#define GET_PROC_EX_V3(name, alias, required) \ - alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \ - if (alias == NULL && required) { \ - printf("Failed to find required function \"%s\" in %s\n", \ - STRINGIFY(name##_v3), __CudaLibName); \ - return CUDA_ERROR_UNKNOWN; \ +#define GET_PROC_EX_V3(name, alias, required) \ + alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \ + if (alias == NULL && required) { \ + printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \ + return CUDA_ERROR_UNKNOWN; \ } #else #error unsupported platform #endif -#define CHECKED_CALL(call) \ - do { \ - CUresult result = (call); \ - if (CUDA_SUCCESS != result) { \ - return result; \ - } \ - } while(0) +#define CHECKED_CALL(call) \ + do { \ + CUresult result = (call); \ + if (CUDA_SUCCESS != result) { \ + return result; \ + } \ + } while (0) -#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1) -#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0) +#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1) +#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0) #define GET_PROC(name) GET_PROC_REQUIRED(name) -#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1) -#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1) +#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1) +#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1) CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) { CUDADRIVER CudaDrvLib; - int driverVer = 1000; + int driverVer = 1000; CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib)); @@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) // available since 2.2. 
if not present, version 1.0 is assumed GET_PROC_OPTIONAL(cuDriverGetVersion); - if (cuDriverGetVersion) - { + if (cuDriverGetVersion) { CHECKED_CALL(cuDriverGetVersion(&driverVer)); } @@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC(cuStreamDestroy); // These are CUDA 5.0 new functions - if (driverVer >= 5000) - { + if (driverVer >= 5000) { GET_PROC(cuMipmappedArrayCreate); GET_PROC(cuMipmappedArrayDestroy); GET_PROC(cuMipmappedArrayGetLevel); } // These are CUDA 4.2 new functions - if (driverVer >= 4020) - { + if (driverVer >= 4020) { GET_PROC(cuFuncSetSharedMemConfig); GET_PROC(cuCtxGetSharedMemConfig); GET_PROC(cuCtxSetSharedMemConfig); } // These are CUDA 4.1 new functions - if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) - { + if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) { GET_PROC(cuDeviceGetByPCIBusId); GET_PROC(cuDeviceGetPCIBusId); GET_PROC(cuIpcGetEventHandle); @@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) } // These could be _v2 interfaces - if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) - { + if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) { GET_PROC_V2(cuCtxDestroy); GET_PROC_V2(cuCtxPopCurrent); GET_PROC_V2(cuCtxPushCurrent); @@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC_V2(cuEventDestroy); } - if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) - { + if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) { GET_PROC_V2(cuDeviceTotalMem); GET_PROC_V2(cuCtxCreate); GET_PROC_V2(cuModuleGetGlobal); @@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC_V2(cuTexRefSetAddress); GET_PROC_V2(cuTexRefGetAddress); - if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) - { + if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) { GET_PROC_V3(cuTexRefSetAddress2D); } - else - { + else { GET_PROC_V2(cuTexRefSetAddress2D); } } - else - { + else { // versions earlier than 3020 GET_PROC(cuDeviceTotalMem); GET_PROC(cuCtxCreate); @@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) } // The following functions are specific to CUDA versions - if (driverVer >= 4000) - { + if (driverVer >= 4000) { GET_PROC(cuCtxSetCurrent); GET_PROC(cuCtxGetCurrent); GET_PROC(cuMemHostRegister); @@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC(cuProfilerStop); } - if (driverVer >= 3010) - { + if (driverVer >= 3010) { GET_PROC(cuModuleGetSurfRef); GET_PROC(cuSurfRefSetArray); GET_PROC(cuSurfRefGetArray); @@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC(cuCtxGetLimit); } - if (driverVer >= 3000) - { + if (driverVer >= 3000) { GET_PROC(cuMemcpyDtoDAsync); GET_PROC(cuFuncSetCacheConfig); #ifdef CUDA_INIT_D3D11 @@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC(cuGraphicsUnregisterResource); GET_PROC(cuGraphicsSubResourceGetMappedArray); - if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) - { + if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) { GET_PROC_V2(cuGraphicsResourceGetMappedPointer); } - else - { + else { GET_PROC(cuGraphicsResourceGetMappedPointer); } @@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC(cuGetExportTable); } - if (driverVer >= 2030) - { + if (driverVer >= 2030) { GET_PROC(cuMemHostGetFlags); #ifdef CUDA_INIT_D3D10 GET_PROC(cuD3D10GetDevice); @@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int 
Flags, int cudaVersion) #endif } - if (driverVer >= 2010) - { + if (driverVer >= 2010) { GET_PROC(cuModuleLoadDataEx); GET_PROC(cuModuleLoadFatBinary); #ifdef CUDA_INIT_OPENGL GET_PROC(cuGLCtxCreate); GET_PROC(cuGraphicsGLRegisterBuffer); GET_PROC(cuGraphicsGLRegisterImage); -# ifdef WIN32 +#ifdef WIN32 GET_PROC(cuWGLGetDevice); -# endif +#endif #endif #ifdef CUDA_INIT_D3D9 GET_PROC(cuD3D9GetDevice); diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h index fea51a6e..5938d18d 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h @@ -43,7 +43,8 @@ #define CUDA_VERSION 3020 /* 3.2 */ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /** @@ -52,773 +53,761 @@ extern "C" { #if __CUDA_API_VERSION >= 3020 #if defined(_WIN64) || defined(__LP64__) -typedef unsigned long long CUdeviceptr; + typedef unsigned long long CUdeviceptr; #else -typedef unsigned int CUdeviceptr; + typedef unsigned int CUdeviceptr; #endif #endif /* __CUDA_API_VERSION >= 3020 */ -typedef int CUdevice; /**< CUDA device */ -typedef struct CUctx_st *CUcontext; /**< CUDA context */ -typedef struct CUmod_st *CUmodule; /**< CUDA module */ -typedef struct CUfunc_st *CUfunction; /**< CUDA function */ -typedef struct CUarray_st *CUarray; /**< CUDA array */ -typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ -typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ -typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ -typedef struct CUevent_st *CUevent; /**< CUDA event */ -typedef struct CUstream_st *CUstream; /**< CUDA stream */ -typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ -typedef unsigned long long CUtexObject; /**< CUDA texture object */ -typedef unsigned long long CUsurfObject; /**< CUDA surface object */ + typedef int CUdevice; /**< CUDA device */ + typedef struct CUctx_st *CUcontext; /**< CUDA context */ + typedef struct CUmod_st *CUmodule; /**< CUDA module */ + typedef struct CUfunc_st *CUfunction; /**< CUDA function */ + typedef struct CUarray_st *CUarray; /**< CUDA array */ + typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ + typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ + typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ + typedef struct CUevent_st *CUevent; /**< CUDA event */ + typedef struct CUstream_st *CUstream; /**< CUDA stream */ + typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ + typedef unsigned long long CUtexObject; /**< CUDA texture object */ + typedef unsigned long long CUsurfObject; /**< CUDA surface object */ -typedef struct CUuuid_st /**< CUDA definition of UUID */ -{ - char bytes[16]; -} CUuuid; + typedef struct CUuuid_st /**< CUDA definition of UUID */ + { + char bytes[16]; + } CUuuid; -/** - * Context creation flags - */ -typedef enum CUctx_flags_enum -{ - CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ - CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ - CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ - CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ - CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */ - CU_CTX_MAP_HOST = 
0x08, /**< Support mapped pinned allocations */ - CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + /** + * Context creation flags + */ + typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */ + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ #if __CUDA_API_VERSION < 4000 - CU_CTX_SCHED_MASK = 0x03, - CU_CTX_FLAGS_MASK = 0x1f + CU_CTX_SCHED_MASK = 0x03, + CU_CTX_FLAGS_MASK = 0x1f #else - CU_CTX_SCHED_MASK = 0x07, - CU_CTX_PRIMARY = 0x20, /**< Initialize and return the primary context */ - CU_CTX_FLAGS_MASK = 0x3f + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_PRIMARY = 0x20, /**< Initialize and return the primary context */ + CU_CTX_FLAGS_MASK = 0x3f #endif -} CUctx_flags; + } CUctx_flags; -/** - * Event creation flags - */ -typedef enum CUevent_flags_enum -{ - CU_EVENT_DEFAULT = 0, /**< Default event flag */ - CU_EVENT_BLOCKING_SYNC = 1, /**< Event uses blocking synchronization */ - CU_EVENT_DISABLE_TIMING = 2 /**< Event will not record timing data */ -} CUevent_flags; + /** + * Event creation flags + */ + typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 2 /**< Event will not record timing data */ + } CUevent_flags; -/** - * Array formats - */ -typedef enum CUarray_format_enum -{ - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ - CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ - CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ - CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ - CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ -} CUarray_format; + /** + * Array formats + */ + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ + } CUarray_format; -/** - * Texture reference addressing modes - */ -typedef enum CUaddress_mode_enum -{ - CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ - CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ - CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ - CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ -} CUaddress_mode; + /** + * Texture reference addressing modes + */ + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address 
mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ + } CUaddress_mode; -/** - * Texture reference filtering modes - */ -typedef enum CUfilter_mode_enum -{ - CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ - CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ -} CUfilter_mode; + /** + * Texture reference filtering modes + */ + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ + } CUfilter_mode; -/** - * Device properties - */ -typedef enum CUdevice_attribute_enum -{ - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently */ - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Maximum texture array width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Maximum texture array height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture 
array */ - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 /**< Minor compute capability version number */ + /** + * Device properties + */ + typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = + 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = + 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = + 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Maximum texture array width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT 
= 28, /**< Maximum texture array height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 /**< Minor compute capability version number */ #if __CUDA_API_VERSION >= 4000 - , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ - CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device uses shares a unified address space with the host */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43 /**< Maximum layers in a 1D layered texture */ + , + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device uses shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43 /**< Maximum layers in a 1D layered texture */ #endif -} CUdevice_attribute; - -/** - * Legacy device properties - */ -typedef struct CUdevprop_st -{ - int maxThreadsPerBlock; /**< Maximum number of threads per block */ - int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ - int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ - int sharedMemPerBlock; /**< Shared memory available per block in bytes */ - int totalConstantMemory; /**< Constant memory available on device in bytes */ - int SIMDWidth; /**< Warp size in threads */ - int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ - int regsPerBlock; /**< 32-bit registers available per block */ - int clockRate; /**< Clock frequency in kilohertz */ - int textureAlign; /**< Alignment requirement for textures */ -} CUdevprop; - -/** - * Function properties - */ -typedef enum CUfunction_attribute_enum -{ - /** - * The maximum number of threads per block, beyond which a launch of the - * function would fail. This number depends on both the function and the - * device on which the function is currently loaded. 
- */ - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + } CUdevice_attribute; /** - * The size in bytes of statically-allocated shared memory required by - * this function. This does not include dynamically-allocated shared - * memory requested by the user at runtime. + * Legacy device properties */ - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + typedef struct CUdevprop_st + { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ + } CUdevprop; /** - * The size in bytes of user-allocated constant memory required by this - * function. + * Function properties */ - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + CU_FUNC_ATTRIBUTE_MAX + } CUfunction_attribute; /** - * The size in bytes of local memory used by each thread of this function. + * Function cache configurations */ - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02 /**< prefer larger L1 cache and smaller shared memory */ + } CUfunc_cache; /** - * The number of registers used by each thread of this function. 
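The CUfunction_attribute values above are read back per kernel with cuFuncGetAttribute(); a small hypothetical helper (again not part of this patch, and `func` is assumed to come from cuModuleGetFunction()):

    #include <cuda.h>
    #include <stdio.h>

    static void printKernelResources(CUfunction func)
    {
        int regs = 0, smem = 0, ptx = 0;
        cuFuncGetAttribute(&regs, CU_FUNC_ATTRIBUTE_NUM_REGS, func);
        cuFuncGetAttribute(&smem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func);
        cuFuncGetAttribute(&ptx, CU_FUNC_ATTRIBUTE_PTX_VERSION, func);
        /* PTX version is encoded as major*10 + minor, e.g. 13 means PTX 1.3. */
        printf("regs/thread=%d, static smem=%d bytes, PTX %d.%d\n",
               regs, smem, ptx / 10, ptx % 10);
    }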
+ * Shared memory configurations */ - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ + } CUsharedconfig; /** - * The PTX virtual architecture version for which the function was - * compiled. This value is the major PTX version * 10 + the minor PTX - * version, so a PTX version 1.3 function would return the value 13. - * Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. + * Memory types */ - CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, - - /** - * The binary architecture version for which the function was compiled. - * This value is the major binary version * 10 + the minor binary version, - * so a binary version 1.3 function would return the value 13. Note that - * this will return a value of 10 for legacy cubins that do not have a - * properly-encoded binary architecture version. - */ - CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - - CU_FUNC_ATTRIBUTE_MAX -} CUfunction_attribute; - -/** - * Function cache configurations - */ -typedef enum CUfunc_cache_enum -{ - CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ - CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ - CU_FUNC_CACHE_PREFER_L1 = 0x02 /**< prefer larger L1 cache and smaller shared memory */ -} CUfunc_cache; - -/** - * Shared memory configurations - */ -typedef enum CUsharedconfig_enum -{ - CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ - CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ - CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ -} CUsharedconfig; - -/** - * Memory types - */ -typedef enum CUmemorytype_enum -{ - CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ - CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ - CU_MEMORYTYPE_ARRAY = 0x03 /**< Array memory */ + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03 /**< Array memory */ #if __CUDA_API_VERSION >= 4000 - , CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ + , + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ #endif -} CUmemorytype; + } CUmemorytype; -/** - * Compute Modes - */ -typedef enum CUcomputemode_enum -{ - CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ - CU_COMPUTEMODE_PROHIBITED = 2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + /** + * Compute Modes + */ + typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_PROHIBITED = + 2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ #if __CUDA_API_VERSION >= 4000 - , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ + , + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single + process can be present on this device 
at a time) */ #endif -} CUcomputemode; + } CUcomputemode; -/** - * Online compiler options - */ -typedef enum CUjit_option_enum -{ /** - * Max number of registers that a thread may use.\n - * Option type: unsigned int + * Online compiler options */ - CU_JIT_MAX_REGISTERS = 0, + typedef enum CUjit_option_enum { + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization of the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Option type: unsigned int + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Returns a float value in the option of the wall clock time, in + * milliseconds, spent creating the cubin\n + * Option type: float + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages from PTXAS + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n + * Option type: char* + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages from PTXAS that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char* + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_target_enum + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback_enum + */ + CU_JIT_FALLBACK_STRATEGY + + } CUjit_option; /** - * IN: Specifies minimum number of threads per block to target compilation - * for\n - * OUT: Returns the number of threads the compiler actually targeted. - * This restricts the resource utilization fo the compiler (e.g. max - * registers) such that a block with the given number of threads should be - * able to launch based on register limitations.
Note, this option does not - * currently take into account any other resource limitations, such as - * shared memory utilization.\n - * Option type: unsigned int + * Online compilation targets */ - CU_JIT_THREADS_PER_BLOCK, + typedef enum CUjit_target_enum { + CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ + CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ + CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ + CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ + CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ + CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ + CU_TARGET_COMPUTE_70 = 70 /**< Compute device class 7.0.*/ + } CUjit_target; /** - * Returns a float value in the option of the wall clock time, in - * milliseconds, spent creating the cubin\n - * Option type: float + * Cubin matching fallback strategies */ - CU_JIT_WALL_TIME, + typedef enum CUjit_fallback_enum { + CU_PREFER_PTX = 0, /**< Prefer to compile ptx */ + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */ + } CUjit_fallback; /** - * Pointer to a buffer in which to print any log messsages from PTXAS - * that are informational in nature (the buffer size is specified via - * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n - * Option type: char* + * Flags to register a graphics resource */ - CU_JIT_INFO_LOG_BUFFER, + typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04 + } CUgraphicsRegisterFlags; /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int + * Flags for mapping and unmapping interop resources */ - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 + } CUgraphicsMapResourceFlags; /** - * Pointer to a buffer in which to print any log messages from PTXAS that - * reflect errors (the buffer size is specified via option - * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char* + * Array indices for cube faces */ - CU_JIT_ERROR_LOG_BUFFER, + typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ + } CUarray_cubemap_face; /** - * IN: Log buffer size in bytes. 
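These CU_JIT_* options and CUjit_target values are the kind a JIT loader such as the matrixMulDynlinkJIT sample passes to cuModuleLoadDataEx() when compiling embedded PTX. A hedged sketch of the usual pattern, where options and their values travel in parallel arrays and scalar values are smuggled through the void* slot:

    #include <cuda.h>
    #include <stdio.h>

    /* Hypothetical helper: JIT a NUL-terminated PTX image, capturing PTXAS errors. */
    static CUmodule loadPtxModule(const char *ptxImage)
    {
        static char errorLog[8192];
        CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER,
                                  CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                                  CU_JIT_TARGET_FROM_CUCONTEXT};
        /* The buffer size is an unsigned int passed by value through the pointer slot;
         * CU_JIT_TARGET_FROM_CUCONTEXT takes no value. */
        void    *values[] = {errorLog, (void *)(size_t)sizeof(errorLog), NULL};
        CUmodule module   = NULL;
        if (cuModuleLoadDataEx(&module, ptxImage, 3, options, values) != CUDA_SUCCESS)
            fprintf(stderr, "PTX JIT failed:\n%s\n", errorLog);
        return module;
    }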
Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int + * Limits */ - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */ + } CUlimit; /** - * Level of optimizations to apply to generated code (0 - 4), with 4 - * being the default and highest level of optimizations.\n - * Option type: unsigned int + * Resource types */ - CU_JIT_OPTIMIZATION_LEVEL, + typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ + } CUresourcetype; /** - * No option value required. Determines the target based on the current - * attached context (default)\n - * Option type: No option value needed + * Error codes */ - CU_JIT_TARGET_FROM_CUCONTEXT, + typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, - /** - * Target is chosen based on supplied ::CUjit_target_enum.\n - * Option type: unsigned int for enumerated type ::CUjit_target_enum - */ - CU_JIT_TARGET, + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, - /** - * Specifies choice of fallback strategy if matching cubin is not found. - * Choice is based on supplied ::CUjit_fallback_enum.\n - * Option type: unsigned int for enumerated type ::CUjit_fallback_enum - */ - CU_JIT_FALLBACK_STRATEGY + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, -} CUjit_option; + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, -/** - * Online compilation targets - */ -typedef enum CUjit_target_enum -{ - CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ - CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ - CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ - CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ - CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ - CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ - CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ - CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ - CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ - CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ - CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ - CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ - CU_TARGET_COMPUTE_70 = 70 /**< Compute device class 7.0.*/ -} CUjit_target; + /** + * This indicates that the CUDA driver is in the process of shutting down.
+ */ + CUDA_ERROR_DEINITIALIZED = 4, -/** - * Cubin matching fallback strategies - */ -typedef enum CUjit_fallback_enum -{ - CU_PREFER_PTX = 0, /**< Prefer to compile ptx */ - CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */ -} CUjit_fallback; + /** + * This indicates profiling APIs are called while application is running + * in visual profiler mode. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + /** + * This indicates profiling has not been initialized for this context. + * Call cuProfilerInitialize() to resolve this. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + /** + * This indicates profiler has already been started and probably + * cuProfilerStart() is incorrectly called. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + /** + * This indicates profiler has already been stopped and probably + * cuProfilerStop() is incorrectly called. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, -/** - * Flags to register a graphics resource - */ -typedef enum CUgraphicsRegisterFlags_enum -{ - CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, - CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, - CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04 -} CUgraphicsRegisterFlags; - -/** - * Flags for mapping and unmapping interop resources - */ -typedef enum CUgraphicsMapResourceFlags_enum -{ - CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 -} CUgraphicsMapResourceFlags; - -/** - * Array indices for cube faces - */ -typedef enum CUarray_cubemap_face_enum -{ - CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ -} CUarray_cubemap_face; - -/** - * Limits - */ -typedef enum CUlimit_enum -{ - CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ - CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ - CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */ -} CUlimit; - -/** - * Resource types - */ -typedef enum CUresourcetype_enum -{ - CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ - CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ - CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ -} CUresourcetype; - -/** - * Error codes - */ -typedef enum cudaError_enum -{ - /** - * The API call returned with no errors. In the case of query calls, this - * can also mean that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). - */ - CUDA_SUCCESS = 0, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - CUDA_ERROR_INVALID_VALUE = 1, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - CUDA_ERROR_OUT_OF_MEMORY = 2, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. 
- */ - CUDA_ERROR_NOT_INITIALIZED = 3, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - CUDA_ERROR_DEINITIALIZED = 4, - - /** - * This indicates profiling APIs are called while application is running - * in visual profiler mode. - */ - CUDA_ERROR_PROFILER_DISABLED = 5, - /** - * This indicates profiling has not been initialized for this context. - * Call cuProfilerInitialize() to resolve this. - */ - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, - /** - * This indicates profiler has already been started and probably - * cuProfilerStart() is incorrectly called. - */ - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, - /** - * This indicates profiler has already been stopped and probably - * cuProfilerStop() is incorrectly called. - */ - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - CUDA_ERROR_NO_DEVICE = 100, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. - */ - CUDA_ERROR_INVALID_DEVICE = 101, + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - CUDA_ERROR_INVALID_IMAGE = 200, + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - CUDA_ERROR_INVALID_CONTEXT = 201, + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - /** - * This indicates that a map or register operation has failed. - */ - CUDA_ERROR_MAP_FAILED = 205, + /** + * This indicates that a map or register operation has failed. + */ + CUDA_ERROR_MAP_FAILED = 205, - /** - * This indicates that an unmap or unregister operation has failed. 
- */ - CUDA_ERROR_UNMAP_FAILED = 206, + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - CUDA_ERROR_ARRAY_IS_MAPPED = 207, + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, - /** - * This indicates that the resource is already mapped. - */ - CUDA_ERROR_ALREADY_MAPPED = 208, + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - /** - * This indicates that a resource has already been acquired. - */ - CUDA_ERROR_ALREADY_ACQUIRED = 210, + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, - /** - * This indicates that a resource is not mapped. - */ - CUDA_ERROR_NOT_MAPPED = 211, + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - CUDA_ERROR_ECC_UNCORRECTABLE = 214, + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, - /** - * This indicates that the device kernel source is invalid. - */ - CUDA_ERROR_INVALID_SOURCE = 300, + /** + * This indicates that the device kernel source is invalid. + */ + CUDA_ERROR_INVALID_SOURCE = 300, - /** - * This indicates that the file specified was not found. 
- */ - CUDA_ERROR_FILE_NOT_FOUND = 301, + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, - /** - * This indicates that a link to a shared object failed to resolve. - */ - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - /** - * This indicates that initialization of a shared object failed. - */ - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - /** - * This indicates that an OS call failed. - */ - CUDA_ERROR_OPERATING_SYSTEM = 304, + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. - */ - CUDA_ERROR_INVALID_HANDLE = 400, + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, texture names, and surface names. - */ - CUDA_ERROR_NOT_FOUND = 500, + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. + */ + CUDA_ERROR_NOT_FOUND = 500, - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - CUDA_ERROR_NOT_READY = 600, + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. The context cannot be used, so it must - * be destroyed (and a new one should be created). All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - CUDA_ERROR_LAUNCH_FAILED = 700, + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The context cannot be used, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_LAUNCH_FAILED = 700, - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. 
Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The - * context cannot be used (and must be destroyed similar to - * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from - * this context are invalid and must be reconstructed if the program is to - * continue using CUDA. - */ - CUDA_ERROR_LAUNCH_TIMEOUT = 702, + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, - /** - * This error indicates that a call to ::cuMemPeerRegister is trying to - * register memory from a context which has not had peer access - * enabled yet via ::cuCtxEnablePeerAccess(), or that - * ::cuCtxDisablePeerAccess() is trying to disable peer access - * which has not been enabled yet. - */ - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + /** + * This error indicates that a call to ::cuMemPeerRegister is trying to + * register memory from a context which has not had peer access + * enabled yet via ::cuCtxEnablePeerAccess(), or that + * ::cuCtxDisablePeerAccess() is trying to disable peer access + * which has not been enabled yet. + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, - /** - * This error indicates that a call to ::cuMemPeerRegister is trying to - * register already-registered memory. - */ - CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706, + /** + * This error indicates that a call to ::cuMemPeerRegister is trying to + * register already-registered memory. 
+ */ + CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706, - /** - * This error indicates that a call to ::cuMemPeerUnregister is trying to - * unregister memory that has not been registered. - */ - CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED = 707, + /** + * This error indicates that a call to ::cuMemPeerUnregister is trying to + * unregister memory that has not been registered. + */ + CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED = 707, - /** - * This error indicates that ::cuCtxCreate was called with the flag - * ::CU_CTX_PRIMARY on a device which already has initialized its - * primary context. - */ - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + /** + * This error indicates that ::cuCtxCreate was called with the flag + * ::CU_CTX_PRIMARY on a device which already has initialized its + * primary context. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy, or is a primary context which - * has not yet been initialized. - */ - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - CUDA_ERROR_ASSERT = 710, + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_TOO_MANY_PEERS = 711, + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, - /** - * This indicates that an unknown internal error has occurred. - */ - CUDA_ERROR_UNKNOWN = 999 -} CUresult; + /** + * This indicates that an unknown internal error has occurred. 
+ */ + CUDA_ERROR_UNKNOWN = 999 + } CUresult; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #define CUDA_CB __stdcall @@ -826,27 +815,27 @@ typedef enum cudaError_enum #define CUDA_CB #endif -/** - * CUDA stream callback - * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. - * \param status ::CUDA_SUCCESS or any persistent error on the stream. - * \param userData User parameter provided at registration. - */ -typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); + /** + * CUDA stream callback + * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + * \param status ::CUDA_SUCCESS or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ + typedef void(CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); #if __CUDA_API_VERSION >= 4000 /** * If set, host memory is portable between CUDA contexts. * Flag for ::cuMemHostAlloc() */ -#define CU_MEMHOSTALLOC_PORTABLE 0x01 +#define CU_MEMHOSTALLOC_PORTABLE 0x01 /** * If set, host memory is mapped into CUDA address space and * ::cuMemHostGetDevicePointer() may be called on the host pointer. * Flag for ::cuMemHostAlloc() */ -#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 /** * If set, host memory is allocated as write-combined - fast to write, @@ -854,300 +843,298 @@ typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void * (MOVNTDQA). * Flag for ::cuMemHostAlloc() */ -#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 /** * If set, host memory is portable between CUDA contexts. * Flag for ::cuMemHostRegister() */ -#define CU_MEMHOSTREGISTER_PORTABLE 0x01 +#define CU_MEMHOSTREGISTER_PORTABLE 0x01 /** * If set, host memory is mapped into CUDA address space and * ::cuMemHostGetDevicePointer() may be called on the host pointer. * Flag for ::cuMemHostRegister() */ -#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 /** * If set, peer memory is mapped into CUDA address space and * ::cuMemPeerGetDevicePointer() may be called on the host pointer. 
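Every driver API entry point returns one of the CUresult codes above, so callers typically wrap calls in a checking macro. A minimal sketch; note that this legacy dynlink header predates cuGetErrorString(), so only the numeric code is printed:

    #include <cuda.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define CHECK_CU(call)                                                            \
        do {                                                                          \
            CUresult status_ = (call);                                                \
            if (status_ != CUDA_SUCCESS) {                                            \
                fprintf(stderr, "%s failed with CUresult %d\n", #call, (int)status_); \
                exit(EXIT_FAILURE);                                                   \
            }                                                                         \
        } while (0)

One caveat drawn from the codes themselves: CUDA_ERROR_NOT_READY (600) is not actually a failure, it is what cuEventQuery()/cuStreamQuery() return for still-running work, so a macro like this should not wrap those calls.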
* Flag for ::cuMemPeerRegister() */ -#define CU_MEMPEERREGISTER_DEVICEMAP 0x02 +#define CU_MEMPEERREGISTER_DEVICEMAP 0x02 #endif #if __CUDA_API_VERSION >= 3020 -/** - * 2D memory copy parameters - */ -typedef struct CUDA_MEMCPY2D_st -{ - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ + /** + * 2D memory copy parameters + */ + typedef struct CUDA_MEMCPY2D_st + { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ - size_t Height; /**< Height of 2D memory copy */ -} CUDA_MEMCPY2D; + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ + size_t Height; /**< Height of 2D memory copy */ + } CUDA_MEMCPY2D; -/** - * 3D memory copy parameters - */ -typedef struct CUDA_MEMCPY3D_st -{ - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + /** + * 3D memory copy parameters + */ + typedef struct CUDA_MEMCPY3D_st + { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - size_t dstXInBytes; /**< Destination X in bytes */ - 
size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D; + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ + } CUDA_MEMCPY3D; -/** - * 3D memory cross-context copy parameters - */ -typedef struct CUDA_MEMCPY3D_PEER_st -{ - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + /** + * 3D memory cross-context copy parameters + */ + typedef struct CUDA_MEMCPY3D_PEER_st + { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + CUcontext srcContext; /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is
::CU_MEMORYTYPE_ARRAY) */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + CUcontext dstContext; /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_PEER; + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ + } CUDA_MEMCPY3D_PEER; -/** - * Array descriptor - */ -typedef struct CUDA_ARRAY_DESCRIPTOR_st -{ - size_t Width; /**< Width of array */ - size_t Height; /**< Height of array */ - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ -} CUDA_ARRAY_DESCRIPTOR; + /** + * Array descriptor + */ + typedef struct CUDA_ARRAY_DESCRIPTOR_st + { + size_t Width; /**< Width of array */ + size_t Height; /**< Height of array */ + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + } CUDA_ARRAY_DESCRIPTOR; -/** - * 3D array descriptor - */ -typedef struct CUDA_ARRAY3D_DESCRIPTOR_st -{ - size_t Width; /**< Width of 3D array */ - size_t Height; /**< Height of 3D array */ - size_t Depth; /**< Depth of 3D array */ - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ -} CUDA_ARRAY3D_DESCRIPTOR; + /** + * 3D array descriptor + */ + typedef struct CUDA_ARRAY3D_DESCRIPTOR_st + { + size_t Width; /**< Width of 3D array */ + size_t Height; /**< Height of 3D array */ + size_t Depth; /**< Depth of 3D array */ + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ + } CUDA_ARRAY3D_DESCRIPTOR; #endif /* __CUDA_API_VERSION >= 3020 */ #if __CUDA_API_VERSION >= 5000 -/** - * CUDA Resource descriptor - */ -typedef struct CUDA_RESOURCE_DESC_st -{ - CUresourcetype resType; /**< Resource type */ - - union + /** + * CUDA Resource descriptor + */ + typedef struct CUDA_RESOURCE_DESC_st { - struct - { - CUarray hArray; /**< CUDA array */ - } array; - struct - { - CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ - } mipmap; - struct - { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t sizeInBytes; /**< Size in bytes */ - } linear; - struct - { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */
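The memcpy descriptors above are passed to cuMemcpy2D()/cuMemcpy3D(). A hypothetical device-to-host sketch, assuming the >= 3020 size_t definitions; zero-initializing the struct matters because fields that do not apply to the chosen memory types must be 0/NULL:

    #include <cuda.h>
    #include <string.h>

    /* Copies `rows` rows of `widthBytes` each from a pitched device buffer
     * (as allocated by cuMemAllocPitch) into tightly packed host memory. */
    static CUresult copyPitched2DToHost(void *hostDst, CUdeviceptr devSrc,
                                        size_t devPitch, size_t widthBytes, size_t rows)
    {
        CUDA_MEMCPY2D copy;
        memset(&copy, 0, sizeof(copy));
        copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        copy.srcDevice     = devSrc;
        copy.srcPitch      = devPitch;   /* pitch in bytes from cuMemAllocPitch() */
        copy.dstMemoryType = CU_MEMORYTYPE_HOST;
        copy.dstHost       = hostDst;
        copy.dstPitch      = widthBytes; /* tightly packed rows on the host */
        copy.WidthInBytes  = widthBytes;
        copy.Height        = rows;
        return cuMemcpy2D(&copy);
    }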
- size_t pitchInBytes; /**< Pitch between two rows in bytes */ - } pitch2D; - struct - { - int reserved[32]; - } __reserved; - } res; + CUresourcetype resType; /**< Resource type */ - unsigned int flags; /**< Flags (must be zero) */ -} CUDA_RESOURCE_DESC; + union + { + struct + { + CUarray hArray; /**< CUDA array */ + } array; + struct + { + CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ + } mipmap; + struct + { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct + { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct + { + int reserved[32]; + } __reserved; + } res; -/** - * Texture descriptor - */ -typedef struct CUDA_TEXTURE_DESC_st -{ - CUaddress_mode addressMode[3]; /**< Address modes */ - CUfilter_mode filterMode; /**< Filter mode */ - unsigned int flags; /**< Flags */ - unsigned int maxAnisotropy; /**< Maximum anistropy ratio */ - CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ - float mipmapLevelBias; /**< Mipmap level bias */ - float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ - float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ - int _reserved[16]; -} CUDA_TEXTURE_DESC; + unsigned int flags; /**< Flags (must be zero) */ + } CUDA_RESOURCE_DESC; -/** - * Resource view format - */ -typedef enum CUresourceViewFormat_enum -{ - CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ - CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ - 
CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ - CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ -} CUresourceViewFormat; + /** + * Texture descriptor + */ + typedef struct CUDA_TEXTURE_DESC_st + { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + int _reserved[16]; + } CUDA_TEXTURE_DESC; -/** - * Resource view format - */ -typedef enum CUresourceViewFormat_enum -{ - CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ - CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit
integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ + } CUresourceViewFormat; -/** - * GPU Direct v3 tokens - */ -typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st -{ - unsigned long long p2pToken; - unsigned int vaSpaceToken; -} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + /** + * Resource view descriptor + */ + typedef struct CUDA_RESOURCE_VIEW_DESC_st + { + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int _reserved[16]; + } CUDA_RESOURCE_VIEW_DESC; + + /** + * GPU Direct v3 tokens + */ + typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st + { + unsigned long long p2pToken; + unsigned int vaSpaceToken; + } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; #endif - /** * If set, the CUDA array is a collection of layers, where each layer is either a 1D * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number * of layers, not the depth of a 3D array. */ -#define CUDA_ARRAY3D_LAYERED 0x01 +#define CUDA_ARRAY3D_LAYERED 0x01 /** * Deprecated, use CUDA_ARRAY3D_LAYERED */ -#define CUDA_ARRAY3D_2DARRAY 0x01 +#define CUDA_ARRAY3D_2DARRAY 0x01 /** * This flag must be set in order to bind a surface reference * to the CUDA array */ -#define CUDA_ARRAY3D_SURFACE_LDST 0x02 +#define CUDA_ARRAY3D_SURFACE_LDST 0x02 /** * Override the texref format with a format inferred from the array. @@ -1160,25 +1147,25 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st * in the range [0,1]. * Flag for ::cuTexRefSetFlags() */ -#define CU_TRSF_READ_AS_INTEGER 0x01 +#define CU_TRSF_READ_AS_INTEGER 0x01 /** * Use normalized texture coordinates in the range [0,1) instead of [0,dim). 
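A usage note on the CUDA_RESOURCE_DESC reformatted above: the struct is meant to be zero-initialized and then filled in for exactly one resource type, with the unused union members left at zero. A minimal sketch for describing a linear device buffer; d_buf and numElements are hypothetical, and CU_RESOURCE_TYPE_LINEAR / CU_AD_FORMAT_FLOAT are assumed to be the usual driver-API enumerators declared elsewhere in this header:

    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc)); /* unused union members must stay zero */
    resDesc.resType                = CU_RESOURCE_TYPE_LINEAR;
    resDesc.res.linear.devPtr      = d_buf; /* hypothetical CUdeviceptr from cuMemAlloc */
    resDesc.res.linear.format      = CU_AD_FORMAT_FLOAT;
    resDesc.res.linear.numChannels = 1;
    resDesc.res.linear.sizeInBytes = numElements * sizeof(float);
    resDesc.flags                  = 0; /* flags must be zero */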
-
 /**
  * If set, the CUDA array is a collection of layers, where each layer is either a 1D
  * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
  * of layers, not the depth of a 3D array.
  */
-#define CUDA_ARRAY3D_LAYERED        0x01
+#define CUDA_ARRAY3D_LAYERED 0x01

 /**
  * Deprecated, use CUDA_ARRAY3D_LAYERED
  */
-#define CUDA_ARRAY3D_2DARRAY        0x01
+#define CUDA_ARRAY3D_2DARRAY 0x01

 /**
  * This flag must be set in order to bind a surface reference
  * to the CUDA array
  */
-#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+#define CUDA_ARRAY3D_SURFACE_LDST 0x02

 /**
  * Override the texref format with a format inferred from the array.
@@ -1160,25 +1147,25 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
  * in the range [0,1].
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_READ_AS_INTEGER     0x01
+#define CU_TRSF_READ_AS_INTEGER 0x01

 /**
  * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02

 /**
  * Perform sRGB->linear conversion during texture read.
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_SRGB                0x10
+#define CU_TRSF_SRGB 0x10

 /**
  * End of array terminator for the \p extra parameter to
  * ::cuLaunchKernel
  */
-#define CU_LAUNCH_PARAM_END         ((void*)0x00)
+#define CU_LAUNCH_PARAM_END ((void *)0x00)

 /**
  * Indicator that the next value in the \p extra parameter to
@@ -1189,7 +1176,7 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
  * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
  * effect.
  */
-#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *)0x01)

 /**
  * Indicator that the next value in the \p extra parameter to
@@ -1199,7 +1186,7 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
  * in the \p extra array if the value associated with
  * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
  */
-#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
+#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *)0x02)

 /**
  * For texture references loaded into the module, use default texunit from
@@ -1211,93 +1198,93 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
  * CUDA API made obselete at API version 3020
  */
 #if defined(__CUDA_API_VERSION_INTERNAL)
-#define CUdeviceptr CUdeviceptr_v1
-#define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st
-#define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1
-#define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st
-#define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1
-#define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st
-#define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1
-#define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
-#define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1
+#define CUdeviceptr                CUdeviceptr_v1
+#define CUDA_MEMCPY2D_st           CUDA_MEMCPY2D_v1_st
+#define CUDA_MEMCPY2D              CUDA_MEMCPY2D_v1
+#define CUDA_MEMCPY3D_st           CUDA_MEMCPY3D_v1_st
+#define CUDA_MEMCPY3D              CUDA_MEMCPY3D_v1
+#define CUDA_ARRAY_DESCRIPTOR_st   CUDA_ARRAY_DESCRIPTOR_v1_st
+#define CUDA_ARRAY_DESCRIPTOR      CUDA_ARRAY_DESCRIPTOR_v1
+#define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
+#define CUDA_ARRAY3D_DESCRIPTOR    CUDA_ARRAY3D_DESCRIPTOR_v1
 #endif /* CUDA_FORCE_LEGACY32_INTERNAL */

 #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
-typedef unsigned int CUdeviceptr;
+    typedef unsigned int CUdeviceptr;

-typedef struct CUDA_MEMCPY2D_st
-{
-    unsigned int srcXInBytes; /**< Source X in bytes */
-    unsigned int srcY; /**< Source Y */
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost; /**< Source host pointer */
-    CUdeviceptr srcDevice; /**< Source device pointer */
-    CUarray srcArray; /**< Source array reference */
-    unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
+    typedef struct CUDA_MEMCPY2D_st
+    {
+        unsigned int srcXInBytes; /**< Source X in bytes */
+        unsigned int srcY; /**< Source Y */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost; /**< Source host pointer */
+        CUdeviceptr srcDevice; /**< Source device pointer */
+        CUarray srcArray; /**< Source array reference */
+        unsigned int srcPitch; /**< Source pitch (ignored when src is array) */

-    unsigned int dstXInBytes; /**< Destination X in bytes */
-    unsigned int dstY; /**< Destination Y */
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost; /**< Destination host pointer */
-    CUdeviceptr dstDevice; /**< Destination device pointer */
-    CUarray dstArray; /**< Destination array reference */
-    unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
+        unsigned int dstXInBytes; /**< Destination X in bytes */
+        unsigned int dstY; /**< Destination Y */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost; /**< Destination host pointer */
+        CUdeviceptr dstDevice; /**< Destination device pointer */
+        CUarray dstArray; /**< Destination array reference */
+        unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */

-    unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
-    unsigned int Height; /**< Height of 2D memory copy */
-} CUDA_MEMCPY2D;
+        unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
+        unsigned int Height; /**< Height of 2D memory copy */
+    } CUDA_MEMCPY2D;

-typedef struct CUDA_MEMCPY3D_st
-{
-    unsigned int srcXInBytes; /**< Source X in bytes */
-    unsigned int srcY; /**< Source Y */
-    unsigned int srcZ; /**< Source Z */
-    unsigned int srcLOD; /**< Source LOD */
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost; /**< Source host pointer */
-    CUdeviceptr srcDevice; /**< Source device pointer */
-    CUarray srcArray; /**< Source array reference */
-    void *reserved0; /**< Must be NULL */
-    unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
-    unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+    typedef struct CUDA_MEMCPY3D_st
+    {
+        unsigned int srcXInBytes; /**< Source X in bytes */
+        unsigned int srcY; /**< Source Y */
+        unsigned int srcZ; /**< Source Z */
+        unsigned int srcLOD; /**< Source LOD */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost; /**< Source host pointer */
+        CUdeviceptr srcDevice; /**< Source device pointer */
+        CUarray srcArray; /**< Source array reference */
+        void *reserved0; /**< Must be NULL */
+        unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
+        unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */

-    unsigned int dstXInBytes; /**< Destination X in bytes */
-    unsigned int dstY; /**< Destination Y */
-    unsigned int dstZ; /**< Destination Z */
-    unsigned int dstLOD; /**< Destination LOD */
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost; /**< Destination host pointer */
-    CUdeviceptr dstDevice; /**< Destination device pointer */
-    CUarray dstArray; /**< Destination array reference */
-    void *reserved1; /**< Must be NULL */
-    unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
-    unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+        unsigned int dstXInBytes; /**< Destination X in bytes */
+        unsigned int dstY; /**< Destination Y */
+        unsigned int dstZ; /**< Destination Z */
+        unsigned int dstLOD; /**< Destination LOD */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost; /**< Destination host pointer */
+        CUdeviceptr dstDevice; /**< Destination device pointer */
+        CUarray dstArray; /**< Destination array reference */
+        void *reserved1; /**< Must be NULL */
+        unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
+        unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

-    unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
-    unsigned int Height; /**< Height of 3D memory copy */
-    unsigned int Depth; /**< Depth of 3D memory copy */
-} CUDA_MEMCPY3D;
+        unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
+        unsigned int Height; /**< Height of 3D memory copy */
+        unsigned int Depth; /**< Depth of 3D memory copy */
+    } CUDA_MEMCPY3D;

-typedef struct CUDA_ARRAY_DESCRIPTOR_st
-{
-    unsigned int Width; /**< Width of array */
-    unsigned int Height; /**< Height of array */
+    typedef struct CUDA_ARRAY_DESCRIPTOR_st
+    {
+        unsigned int Width; /**< Width of array */
+        unsigned int Height; /**< Height of array */

-    CUarray_format Format; /**< Array format */
-    unsigned int NumChannels; /**< Channels per array element */
-} CUDA_ARRAY_DESCRIPTOR;
+        CUarray_format Format; /**< Array format */
+        unsigned int NumChannels; /**< Channels per array element */
+    } CUDA_ARRAY_DESCRIPTOR;

-typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
-{
-    unsigned int Width; /**< Width of 3D array */
-    unsigned int Height; /**< Height of 3D array */
-    unsigned int Depth; /**< Depth of 3D array */
+    typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+    {
+        unsigned int Width; /**< Width of 3D array */
+        unsigned int Height; /**< Height of 3D array */
+        unsigned int Depth; /**< Depth of 3D array */

-    CUarray_format Format; /**< Array format */
-    unsigned int NumChannels; /**< Channels per array element */
-    unsigned int Flags; /**< Flags */
-} CUDA_ARRAY3D_DESCRIPTOR;
+        CUarray_format Format; /**< Array format */
+        unsigned int NumChannels; /**< Channels per array element */
+        unsigned int Flags; /**< Flags */
+    } CUDA_ARRAY3D_DESCRIPTOR;

 #endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */
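For context on the CUDA_MEMCPY2D struct reformatted above: it describes one complete 2D copy, and only the source and destination fields selected by srcMemoryType / dstMemoryType are consulted. A minimal host-to-device sketch against this legacy (pre-3020) variant, where h_src, d_dst, devPitch, width, and height are all hypothetical:

    CUDA_MEMCPY2D cpy;
    memset(&cpy, 0, sizeof(cpy));
    cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cpy.srcHost       = h_src; /* hypothetical host buffer */
    cpy.srcPitch      = width * sizeof(float);
    cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cpy.dstDevice     = d_dst;
    cpy.dstPitch      = devPitch; /* pitch returned by cuMemAllocPitch */
    cpy.WidthInBytes  = width * sizeof(float);
    cpy.Height        = height;
    cuMemcpy2D(&cpy); /* via the dynlink pointer declared later in this header */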

@@ -1306,13 +1293,13 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
  * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
  * the number of slices, not the depth of a 3D array.
  */
-#define CUDA_ARRAY3D_2DARRAY        0x01
+#define CUDA_ARRAY3D_2DARRAY 0x01

 /**
  * This flag must be set in order to bind a surface reference
  * to the CUDA array
  */
-#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+#define CUDA_ARRAY3D_SURFACE_LDST 0x02

 /**
  * Override the texref format with a format inferred from the array.
@@ -1325,19 +1312,19 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
  * in the range [0,1].
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_READ_AS_INTEGER     0x01
+#define CU_TRSF_READ_AS_INTEGER 0x01

 /**
  * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02

 /**
  * Perform sRGB->linear conversion during texture read.
  * Flag for ::cuTexRefSetFlags()
  */
-#define CU_TRSF_SRGB                0x10
+#define CU_TRSF_SRGB 0x10

 /**
  * For texture references loaded into the module, use default texunit from
@@ -1345,7 +1332,7 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
  */
 #define CU_PARAM_TR_DEFAULT -1

-/** @} */ /* END CUDA_TYPES */
+    /** @} */ /* END CUDA_TYPES */

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 #define CUDAAPI __stdcall
@@ -1353,86 +1340,90 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
 #define CUDAAPI
 #endif

-/**
- * \defgroup CUDA_INITIALIZE Initialization
- *
- * This section describes the initialization functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
+    /**
+     * \defgroup CUDA_INITIALIZE Initialization
+     *
+     * This section describes the initialization functions of the low-level CUDA
+     * driver application programming interface.
+     *
+     * @{
+     */

-/*********************************
- ** Initialization
- *********************************/
-typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
+    /*********************************
+     ** Initialization
+     *********************************/
+    typedef CUresult CUDAAPI tcuInit(unsigned int Flags);

-/*********************************
- ** Driver Version Query
- *********************************/
-typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
+    /*********************************
+     ** Driver Version Query
+     *********************************/
+    typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);

-/************************************
- **
- ** Device management
- **
- ***********************************/
+    /************************************
+     **
+     ** Device management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
-typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
-typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
-typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+    typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
+    typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
+    typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
+    typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
+    typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
 #else
-typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
+typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
 #endif
-typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
-typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
-typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char **pStr);
+    typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+    typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+    typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char **pStr);

-/************************************
- **
- ** Context management
- **
- ***********************************/
+    /************************************
+     **
+     ** Context management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
-typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
-typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
-typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
-typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
+    typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+    typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
+    typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
+    typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
+    typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
+    typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);

-typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
-typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
+    typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
+    typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);

-typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
-typedef CUresult CUDAAPI tcuCtxSynchronize(void);
+    typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
+    typedef CUresult CUDAAPI tcuCtxSynchronize(void);

-/************************************
- **
- ** Module management
- **
- ***********************************/
+    /************************************
+     **
+     ** Module management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
-typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
-typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
-typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
-typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+    typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
+    typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
+    typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module,
                                                  const void *image,
                                                  unsigned int numOptions,
                                                  CUjit_option *options,
                                                  void **optionValues);
+    typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+    typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
+    typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+    typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
 #else
-typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
 #endif
-typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
-typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+    typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+    typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
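The tcuModuleLoadDataEx signature reflowed above is the entry point behind this sample's JIT path: PTX text is handed to the driver together with CUjit_option error-log buffers so compile failures can be reported. A hedged sketch of that pattern; ptxSource and the kernel name are hypothetical, while the option enumerators are the standard CUjit_option values:

    CUjit_option options[2]      = {CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
    char         errorLog[8192];
    void        *optionValues[2] = {errorLog, (void *)(size_t)sizeof(errorLog)};
    CUmodule     module;
    CUfunction   kernel;

    if (cuModuleLoadDataEx(&module, ptxSource, 2, options, optionValues) != CUDA_SUCCESS) {
        fprintf(stderr, "PTX JIT failed:\n%s\n", errorLog); /* log filled in by the driver */
    }
    cuModuleGetFunction(&kernel, module, "matrixMul"); /* hypothetical kernel name */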

 /************************************
 **
@@ -1440,44 +1431,42 @@ typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod
 **
 ***********************************/
 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
-typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
-typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
-                                          size_t *pPitch,
-                                          size_t WidthInBytes,
-                                          size_t Height,
-                                          // size of biggest r/w to be performed by kernels on this memory
-                                          // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes
-                                         );
+    typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
+    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
+    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                               size_t *pPitch,
                                               size_t WidthInBytes,
                                               size_t Height,
                                               // size of biggest r/w to be performed by kernels on this memory
                                               // 4, 8 or 16 bytes
                                               unsigned int ElementSizeBytes);
 #else
 typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
 typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
 typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
+typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                           unsigned int *pPitch,
-                                          unsigned int WidthInBytes,
-                                          unsigned int Height,
+                                          unsigned int WidthInBytes,
+                                          unsigned int Height,
                                           // size of biggest r/w to be performed by kernels on this memory
                                           // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes
-                                         );
+                                          unsigned int ElementSizeBytes);
 #endif
-typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);

 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
-typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
+    typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
 #else
 typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
 #endif
-typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
-typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
+    typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);

-typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
+    typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);

 #if __CUDA_API_VERSION >= 4010

 /**
@@ -1485,34 +1474,39 @@ typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
 */
 #define CU_IPC_HANDLE_SIZE 64

-typedef struct CUipcEventHandle_st
-{
-    char reserved[CU_IPC_HANDLE_SIZE];
-} CUipcEventHandle;
+    typedef struct CUipcEventHandle_st
+    {
+        char reserved[CU_IPC_HANDLE_SIZE];
+    } CUipcEventHandle;

-typedef struct CUipcMemHandle_st
-{
-    char reserved[CU_IPC_HANDLE_SIZE];
-} CUipcMemHandle;
+    typedef struct CUipcMemHandle_st
+    {
+        char reserved[CU_IPC_HANDLE_SIZE];
+    } CUipcMemHandle;

-typedef enum CUipcMem_flags_enum
-{
-    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
-} CUipcMem_flags;
+    typedef enum CUipcMem_flags_enum {
+        CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS =
+            0x1 /**< Automatically enable peer access between remote devices as needed */
+    } CUipcMem_flags;

-typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
-typedef CUresult CUDAAPI tcuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
-typedef CUresult CUDAAPI tcuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
-typedef CUresult CUDAAPI tcuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
-typedef CUresult CUDAAPI tcuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
-typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
+    typedef CUresult CUDAAPI tcuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
+    typedef CUresult CUDAAPI tcuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
+    typedef CUresult CUDAAPI tcuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
+    typedef CUresult CUDAAPI tcuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
 #endif

-typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
-typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+    typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
+    ;
+    typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+    typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice,
                                            CUcontext dstContext,
                                            CUdeviceptr srcDevice,
                                            CUcontext srcContext,
                                            size_t ByteCount);
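The ElementSizeBytes parameter of tcuMemAllocPitch above is easy to misread: it is not the element size of the stored data as such, but the widest single load or store (4, 8 or 16 bytes) any kernel will perform on the allocation, which lets the driver choose a pitch that keeps every row aligned. A minimal sketch, with width and height hypothetical:

    CUdeviceptr d_img;
    size_t      pitch; /* driver-chosen row stride in bytes, >= width * sizeof(float) */
    cuMemAllocPitch(&d_img, &pitch, width * sizeof(float), height, sizeof(float));
    /* row y starts at d_img + y * pitch */
    cuMemFree(d_img);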

 /************************************
 **
@@ -1525,51 +1519,65 @@ typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstConte
 **
 ***********************************/

 // 1D functions
 #if __CUDA_API_VERSION >= 3020
-// system <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+    // system <-> device memory
+    typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+    typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);

-// device <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+    // device <-> device memory
+    typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);

-// device <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    // device <-> array memory
+    typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+    typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);

-// system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    // system <-> array memory
+    typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+    typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);

-// array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    // array <-> array memory
+    typedef CUresult CUDAAPI
+    tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
 #else
 // system <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);

 // device <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

 // device <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray,
                                        unsigned int dstOffset,
                                        CUdeviceptr srcDevice,
                                        unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice,
                                        CUarray srcArray,
                                        unsigned int srcOffset,
                                        unsigned int ByteCount);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray,
                                        unsigned int dstOffset,
                                        const void *srcHost,
                                        unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

 // array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray,
                                        unsigned int dstOffset,
                                        CUarray srcArray,
                                        unsigned int srcOffset,
                                        unsigned int ByteCount);
 #endif

-// 2D memcpy
+    // 2D memcpy

-typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
-typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+    typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
+    typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);

-// 3D memcpy
+    // 3D memcpy

-typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+    typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);

 /************************************
 **
@@ -1584,382 +1592,439 @@ typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);

 // 1D functions
 #if __CUDA_API_VERSION >= 3020
-// system <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void *srcHost, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    // system <-> device memory
+    typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
                                                 const void *srcHost,
                                                 size_t ByteCount,
                                                 CUstream hStream);
+    typedef CUresult CUDAAPI
    tcuMemcpyDtoHAsync(void *dstHost,
                       CUdeviceptr srcDevice,
                       size_t ByteCount,
                       CUstream hStream);

-// device <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    // device <-> device memory
+    typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
                                                 CUdeviceptr srcDevice,
                                                 size_t ByteCount,
                                                 CUstream hStream);

-// system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                            const void *srcHost, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
-                                            size_t ByteCount, CUstream hStream);
+    // system <-> array memory
+    typedef CUresult CUDAAPI
+    tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+    typedef CUresult CUDAAPI
+    tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
 #else
 // system <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void *srcHost, unsigned int ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
                                             const void *srcHost,
                                             unsigned int ByteCount,
                                             CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
                                             CUdeviceptr srcDevice,
                                             unsigned int ByteCount,
                                             CUstream hStream);

 // device <-> device memory
-typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
                                             CUdeviceptr srcDevice,
                                             unsigned int ByteCount,
                                             CUstream hStream);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
-                                            const void *srcHost, unsigned int ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
-                                            unsigned int ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray,
                                             unsigned int dstOffset,
                                             const void *srcHost,
                                             unsigned int ByteCount,
                                             CUstream hStream);
+typedef CUresult CUDAAPI
tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
 #endif

-// 2D memcpy
-typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+    // 2D memcpy
+    typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);

-// 3D memcpy
-typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+    // 3D memcpy
+    typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
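One practical note on the Async copy typedefs just reformatted: they only overlap with host work when the host buffer is page-locked (allocated with cuMemAllocHost or cuMemHostAlloc, or registered with cuMemHostRegister). A minimal sketch, with d_data, h_pinned, and numBytes hypothetical:

    CUstream stream;
    cuStreamCreate(&stream, 0);
    cuMemcpyHtoDAsync(d_data, h_pinned, numBytes, stream); /* h_pinned must be page-locked */
    /* ... queue kernels on the same stream here ... */
    cuStreamSynchronize(stream); /* wait for everything queued on the stream */
    cuStreamDestroy(stream);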

-/************************************
- **
- ** Memset
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
-typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
-typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
+    /************************************
+     **
+     ** Memset
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
+    typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
+    typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);

 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
+    typedef CUresult CUDAAPI
+    tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
+    typedef CUresult CUDAAPI
+    tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
+    typedef CUresult CUDAAPI
+    tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
 #else
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+typedef CUresult CUDAAPI
tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice,
                                         unsigned int dstPitch,
                                         unsigned short us,
                                         unsigned int Width,
                                         unsigned int Height);
+typedef CUresult CUDAAPI
tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
 #endif

-/************************************
- **
- ** Function management
- **
- ***********************************/
+    /************************************
+     **
+     ** Function management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
-typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
-typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
-typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
-typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
+    typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+    typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+    typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+    typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+    typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);

-typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
-                                         unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
-                                         unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
-                                         unsigned int sharedMemBytes,
-                                         CUstream hStream, void **kernelParams, void **extra);
+    typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
                                              unsigned int gridDimX,
                                              unsigned int gridDimY,
                                              unsigned int gridDimZ,
                                              unsigned int blockDimX,
                                              unsigned int blockDimY,
                                              unsigned int blockDimZ,
                                              unsigned int sharedMemBytes,
                                              CUstream hStream,
                                              void **kernelParams,
                                              void **extra);
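The tcuLaunchKernel signature above accepts kernel arguments either as an array of pointers (kernelParams) or as a packed buffer described by the CU_LAUNCH_PARAM_* markers in extra; the two mechanisms are mutually exclusive. A minimal sketch of the common kernelParams form, where kernel, d_A, d_B, d_C, n, and the grid dimensions are hypothetical:

    void *params[] = {&d_A, &d_B, &d_C, &n}; /* addresses of the arguments, in declaration order */
    cuLaunchKernel(kernel,
                   gridDimX, gridDimY, 1, /* grid dimensions */
                   16, 16, 1,             /* block dimensions */
                   0,                     /* dynamic shared memory in bytes */
                   NULL,                  /* default stream */
                   params,                /* kernelParams */
                   NULL);                 /* extra unused */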

-/************************************
- **
- ** Array management
- **
- ***********************************/
+    /************************************
+     **
+     ** Array management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
-typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);
+    typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+    typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+    typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);

-typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
-typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+    typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+    typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);

 #if __CUDA_API_VERSION >= 5000
-typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
-typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
-typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
+    typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle,
                                                      const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
                                                      unsigned int numMipmapLevels);
+    typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray,
                                                        CUmipmappedArray hMipmappedArray,
                                                        unsigned int level);
+    typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
 #endif

-/************************************
- **
- ** Texture reference management
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
-typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
+    /************************************
+     **
+     ** Texture reference management
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
+    typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);

-typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);

 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
-typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+    typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+    typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef,
                                                    const CUDA_ARRAY_DESCRIPTOR *desc,
                                                    CUdeviceptr dptr,
                                                    size_t Pitch);
 #else
-typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
-typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
+typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset,
                                              CUtexref hTexRef,
                                              CUdeviceptr dptr,
                                              unsigned int bytes);
+typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef,
                                                const CUDA_ARRAY_DESCRIPTOR *desc,
                                                CUdeviceptr dptr,
                                                unsigned int Pitch);
 #endif

-typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
-typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
-typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
-typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+    typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+    typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+    typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);

-typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
-typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+    typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+    typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+    typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+    typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+    typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+    typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);

-/************************************
- **
- ** Surface reference management
- **
- ***********************************/
+    /************************************
+     **
+     ** Surface reference management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
-typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+    typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);

-/************************************
- **
- ** Parameter management
- **
- ***********************************/
+    /************************************
+     **
+     ** Parameter management
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
-typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
-typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
-typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
-typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+    typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+    typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+    typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
+    typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+    typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);

-/************************************
- **
- ** Launch functions
- **
- ***********************************/
+    /************************************
+     **
+     ** Launch functions
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
-typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
-typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+    typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
+    typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+    typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);

-/************************************
- **
- ** Events
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
-typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
-typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
-typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
-typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
-typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+    /************************************
+     **
+     ** Events
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
+    typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
+    typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
+    typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
+    typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);

-/************************************
- **
- ** Streams
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
-typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
-typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+    /************************************
+     **
+     ** Streams
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+    typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream,
                                                   CUstreamCallback callback,
                                                   void *userData,
                                                   unsigned int flags);

-typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
-typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
-typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
+    typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
+    typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
+    typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
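The event typedefs above are also the driver API's portable timing mechanism: cuEventElapsedTime reports the milliseconds between two recorded events. A minimal sketch using the dynlink pointers declared below:

    CUevent start, stop;
    float   ms;
    cuEventCreate(&start, 0); /* 0 == CU_EVENT_DEFAULT */
    cuEventCreate(&stop, 0);
    cuEventRecord(start, 0); /* record on the default stream */
    /* ... launches and async copies to be timed ... */
    cuEventRecord(stop, 0);
    cuEventSynchronize(stop); /* block until the stop event has completed */
    cuEventElapsedTime(&ms, start, stop);
    cuEventDestroy(start);
    cuEventDestroy(stop);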

-/************************************
- **
- ** Graphics interop
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+    /************************************
+     **
+     ** Graphics interop
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
+    typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray,
                                                                   CUgraphicsResource resource,
                                                                   unsigned int arrayIndex,
                                                                   unsigned int mipLevel);

 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr,
                                                                  size_t *pSize,
                                                                  CUgraphicsResource resource);
 #else
-typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr,
                                                              unsigned int *pSize,
                                                              CUgraphicsResource resource);
 #endif
-typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
-typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+    typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count,
                                                      CUgraphicsResource *resources,
                                                      CUstream hStream);
+    typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count,
                                                        CUgraphicsResource *resources,
                                                        CUstream hStream);

-/************************************
- **
- ** Export tables
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+    /************************************
+     **
+     ** Export tables
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);

-/************************************
- **
- ** Limits
- **
- ***********************************/
+    /************************************
+     **
+     ** Limits
+     **
+     ***********************************/

-typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
-typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
-typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig);
-typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
-typedef CUresult CUDAAPI tcuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
-typedef CUresult CUDAAPI tcuCtxSetSharedMemConfig(CUsharedconfig config);
-typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
+    typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
+    typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
+    typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig);
+    typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
+    typedef CUresult CUDAAPI tcuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+    typedef CUresult CUDAAPI tcuCtxSetSharedMemConfig(CUsharedconfig config);
+    typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version);

-/************************************
- **
- ** Profiler
- **
- ***********************************/
-typedef CUresult CUDAAPI tcuProfilerStop(void);
+    /************************************
+     **
+     ** Profiler
+     **
+     ***********************************/
+    typedef CUresult CUDAAPI tcuProfilerStop(void);

-/************************************
- ************************************/
+    /************************************
+     ************************************/

-extern CUresult CUDAAPI cuInit(unsigned int, int cudaVersion);
+    extern CUresult CUDAAPI cuInit(unsigned int, int cudaVersion);

-extern tcuDriverGetVersion *cuDriverGetVersion;
-extern tcuDeviceGet *cuDeviceGet;
-extern tcuDeviceGetCount *cuDeviceGetCount;
-extern tcuDeviceGetName *cuDeviceGetName;
-extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
-extern tcuDeviceGetProperties *cuDeviceGetProperties;
-extern tcuDeviceGetAttribute *cuDeviceGetAttribute;
-extern tcuGetErrorString *cuGetErrorString;
-extern tcuCtxDestroy *cuCtxDestroy;
-extern tcuCtxAttach *cuCtxAttach;
-extern tcuCtxDetach *cuCtxDetach;
-extern tcuCtxPushCurrent *cuCtxPushCurrent;
-extern tcuCtxPopCurrent *cuCtxPopCurrent;
+    extern tcuDriverGetVersion *cuDriverGetVersion;
+    extern tcuDeviceGet *cuDeviceGet;
+    extern tcuDeviceGetCount *cuDeviceGetCount;
+    extern tcuDeviceGetName *cuDeviceGetName;
+    extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
+    extern tcuDeviceGetProperties *cuDeviceGetProperties;
+    extern tcuDeviceGetAttribute *cuDeviceGetAttribute;
+    extern tcuGetErrorString *cuGetErrorString;
+    extern tcuCtxDestroy *cuCtxDestroy;
+    extern tcuCtxAttach *cuCtxAttach;
+    extern tcuCtxDetach *cuCtxDetach;
+    extern tcuCtxPushCurrent *cuCtxPushCurrent;
+    extern tcuCtxPopCurrent *cuCtxPopCurrent;

-extern tcuCtxSetCurrent *cuCtxSetCurrent;
-extern tcuCtxGetCurrent *cuCtxGetCurrent;
+    extern tcuCtxSetCurrent *cuCtxSetCurrent;
+    extern tcuCtxGetCurrent *cuCtxGetCurrent;

-extern tcuCtxGetDevice *cuCtxGetDevice;
-extern tcuCtxSynchronize *cuCtxSynchronize;
-extern tcuModuleLoad *cuModuleLoad;
-extern tcuModuleLoadData *cuModuleLoadData;
-extern tcuModuleLoadDataEx *cuModuleLoadDataEx;
-extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
-extern tcuModuleUnload *cuModuleUnload;
-extern tcuModuleGetFunction *cuModuleGetFunction;
-extern tcuModuleGetTexRef *cuModuleGetTexRef;
-extern tcuModuleGetSurfRef *cuModuleGetSurfRef;
-extern tcuMemFreeHost *cuMemFreeHost;
-extern tcuMemHostAlloc *cuMemHostAlloc;
-extern tcuMemHostGetFlags *cuMemHostGetFlags;
+    extern tcuCtxGetDevice *cuCtxGetDevice;
+    extern tcuCtxSynchronize *cuCtxSynchronize;
+    extern tcuModuleLoad *cuModuleLoad;
+    extern tcuModuleLoadData *cuModuleLoadData;
+    extern tcuModuleLoadDataEx *cuModuleLoadDataEx;
+    extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
+    extern tcuModuleUnload *cuModuleUnload;
+    extern tcuModuleGetFunction *cuModuleGetFunction;
+    extern tcuModuleGetTexRef *cuModuleGetTexRef;
+    extern tcuModuleGetSurfRef *cuModuleGetSurfRef;
+    extern tcuMemFreeHost *cuMemFreeHost;
+    extern tcuMemHostAlloc *cuMemHostAlloc;
+    extern tcuMemHostGetFlags *cuMemHostGetFlags;

-extern tcuMemHostRegister *cuMemHostRegister;
-extern tcuMemHostUnregister *cuMemHostUnregister;
-extern tcuMemcpy *cuMemcpy;
-extern tcuMemcpyPeer *cuMemcpyPeer;
+    extern tcuMemHostRegister *cuMemHostRegister;
+    extern tcuMemHostUnregister *cuMemHostUnregister;
+    extern tcuMemcpy *cuMemcpy;
+    extern tcuMemcpyPeer *cuMemcpyPeer;

-extern tcuDeviceTotalMem *cuDeviceTotalMem;
-extern tcuCtxCreate *cuCtxCreate;
-extern tcuModuleGetGlobal *cuModuleGetGlobal;
-extern tcuMemGetInfo *cuMemGetInfo;
-extern tcuMemAlloc *cuMemAlloc;
-extern tcuMemAllocPitch *cuMemAllocPitch;
-extern tcuMemFree *cuMemFree;
-extern tcuMemGetAddressRange *cuMemGetAddressRange;
-extern tcuMemAllocHost *cuMemAllocHost;
-extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
-extern tcuFuncSetBlockShape *cuFuncSetBlockShape;
-extern tcuFuncSetSharedSize *cuFuncSetSharedSize;
-extern tcuFuncGetAttribute *cuFuncGetAttribute;
-extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
-extern tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
-extern tcuLaunchKernel *cuLaunchKernel;
-extern tcuArrayDestroy *cuArrayDestroy;
-extern tcuTexRefCreate *cuTexRefCreate;
-extern tcuTexRefDestroy *cuTexRefDestroy;
-extern tcuTexRefSetArray *cuTexRefSetArray;
-extern tcuTexRefSetFormat *cuTexRefSetFormat;
-extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
-extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
-extern tcuTexRefSetFlags *cuTexRefSetFlags;
-extern tcuTexRefGetArray *cuTexRefGetArray;
-extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
-extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
-extern tcuTexRefGetFormat *cuTexRefGetFormat;
-extern tcuTexRefGetFlags *cuTexRefGetFlags;
-extern tcuSurfRefSetArray *cuSurfRefSetArray;
-extern tcuSurfRefGetArray *cuSurfRefGetArray;
-extern tcuParamSetSize *cuParamSetSize;
-extern tcuParamSeti *cuParamSeti;
-extern tcuParamSetf *cuParamSetf;
-extern tcuParamSetv *cuParamSetv;
-extern tcuParamSetTexRef *cuParamSetTexRef;
-extern tcuLaunch *cuLaunch;
-extern tcuLaunchGrid *cuLaunchGrid;
-extern tcuLaunchGridAsync *cuLaunchGridAsync;
-extern tcuEventCreate *cuEventCreate;
-extern tcuEventRecord *cuEventRecord;
-extern tcuEventQuery *cuEventQuery;
-extern tcuEventSynchronize *cuEventSynchronize;
-extern tcuEventDestroy *cuEventDestroy;
-extern tcuEventElapsedTime *cuEventElapsedTime;
-extern tcuStreamCreate *cuStreamCreate;
-extern tcuStreamQuery *cuStreamQuery;
-extern tcuStreamWaitEvent *cuStreamWaitEvent;
-extern tcuStreamAddCallback *cuStreamAddCallback;
-extern tcuStreamSynchronize *cuStreamSynchronize;
-extern tcuStreamDestroy *cuStreamDestroy;
-extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
-extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
-extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
-extern tcuGraphicsMapResources *cuGraphicsMapResources;
-extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
-extern tcuGetExportTable *cuGetExportTable;
-extern tcuCtxSetLimit *cuCtxSetLimit;
-extern tcuCtxGetLimit *cuCtxGetLimit;
+    extern tcuDeviceTotalMem *cuDeviceTotalMem;
+    extern tcuCtxCreate *cuCtxCreate;
+    extern tcuModuleGetGlobal *cuModuleGetGlobal;
+    extern tcuMemGetInfo *cuMemGetInfo;
+    extern tcuMemAlloc *cuMemAlloc;
+    extern tcuMemAllocPitch *cuMemAllocPitch;
+    extern tcuMemFree *cuMemFree;
+    extern tcuMemGetAddressRange *cuMemGetAddressRange;
+    extern tcuMemAllocHost *cuMemAllocHost;
+    extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
+    extern tcuFuncSetBlockShape *cuFuncSetBlockShape;
+    extern tcuFuncSetSharedSize *cuFuncSetSharedSize;
+    extern tcuFuncGetAttribute *cuFuncGetAttribute;
+    extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
+    extern tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
+    extern tcuLaunchKernel *cuLaunchKernel;
+    extern tcuArrayDestroy *cuArrayDestroy;
+    extern tcuTexRefCreate *cuTexRefCreate;
+    extern tcuTexRefDestroy *cuTexRefDestroy;
+    extern tcuTexRefSetArray *cuTexRefSetArray;
+    extern tcuTexRefSetFormat *cuTexRefSetFormat;
+    extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
+    extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
+    extern tcuTexRefSetFlags *cuTexRefSetFlags;
+    extern tcuTexRefGetArray *cuTexRefGetArray;
+    extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
+    extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
+    extern tcuTexRefGetFormat *cuTexRefGetFormat;
+    extern tcuTexRefGetFlags *cuTexRefGetFlags;
+    extern tcuSurfRefSetArray *cuSurfRefSetArray;
+    extern tcuSurfRefGetArray *cuSurfRefGetArray;
+    extern tcuParamSetSize *cuParamSetSize;
+    extern tcuParamSeti *cuParamSeti;
+    extern tcuParamSetf *cuParamSetf;
+    extern tcuParamSetv *cuParamSetv;
+    extern tcuParamSetTexRef *cuParamSetTexRef;
+    extern tcuLaunch *cuLaunch;
+    extern tcuLaunchGrid *cuLaunchGrid;
+    extern tcuLaunchGridAsync *cuLaunchGridAsync;
+    extern tcuEventCreate *cuEventCreate;
+    extern tcuEventRecord *cuEventRecord;
+    extern tcuEventQuery *cuEventQuery;
+    extern tcuEventSynchronize *cuEventSynchronize;
+    extern tcuEventDestroy *cuEventDestroy;
+    extern tcuEventElapsedTime *cuEventElapsedTime;
+    extern tcuStreamCreate *cuStreamCreate;
+    extern tcuStreamQuery *cuStreamQuery;
+    extern tcuStreamWaitEvent *cuStreamWaitEvent;
+    extern tcuStreamAddCallback *cuStreamAddCallback;
+    extern tcuStreamSynchronize *cuStreamSynchronize;
+    extern tcuStreamDestroy *cuStreamDestroy;
+    extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
+    extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
+    extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
+    extern tcuGraphicsMapResources *cuGraphicsMapResources;
+    extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
+    extern tcuGetExportTable *cuGetExportTable;
+    extern tcuCtxSetLimit *cuCtxSetLimit;
+    extern tcuCtxGetLimit *cuCtxGetLimit;

-// These functions could be using the CUDA 3.2 interface (_v2)
-extern tcuMemcpyHtoD *cuMemcpyHtoD;
-extern tcuMemcpyDtoH *cuMemcpyDtoH;
-extern tcuMemcpyDtoD *cuMemcpyDtoD;
-extern tcuMemcpyDtoA *cuMemcpyDtoA;
-extern tcuMemcpyAtoD *cuMemcpyAtoD;
-extern tcuMemcpyHtoA *cuMemcpyHtoA;
-extern tcuMemcpyAtoH *cuMemcpyAtoH;
-extern tcuMemcpyAtoA *cuMemcpyAtoA;
-extern tcuMemcpy2D *cuMemcpy2D;
-extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
-extern tcuMemcpy3D *cuMemcpy3D;
-extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
-extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
-extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
-extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
-extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
-extern tcuMemcpy2DAsync *cuMemcpy2DAsync;
-extern tcuMemcpy3DAsync *cuMemcpy3DAsync;
-extern tcuMemsetD8 *cuMemsetD8;
-extern tcuMemsetD16 *cuMemsetD16;
-extern tcuMemsetD32 *cuMemsetD32;
-extern tcuMemsetD2D8 *cuMemsetD2D8;
-extern tcuMemsetD2D16 *cuMemsetD2D16;
-extern tcuMemsetD2D32 *cuMemsetD2D32;
-extern tcuArrayCreate *cuArrayCreate;
-extern tcuArrayGetDescriptor *cuArrayGetDescriptor;
-extern tcuArray3DCreate *cuArray3DCreate;
-extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
-extern tcuTexRefSetAddress *cuTexRefSetAddress;
-extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
-extern tcuTexRefGetAddress *cuTexRefGetAddress;
-extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
+    // These functions could be using the CUDA 3.2 interface (_v2)
+    extern tcuMemcpyHtoD *cuMemcpyHtoD;
+    extern tcuMemcpyDtoH *cuMemcpyDtoH;
+    extern tcuMemcpyDtoD *cuMemcpyDtoD;
+    extern tcuMemcpyDtoA *cuMemcpyDtoA;
+    extern tcuMemcpyAtoD *cuMemcpyAtoD;
+    extern tcuMemcpyHtoA *cuMemcpyHtoA;
+    extern tcuMemcpyAtoH *cuMemcpyAtoH;
+    extern tcuMemcpyAtoA *cuMemcpyAtoA;
+    extern tcuMemcpy2D *cuMemcpy2D;
+    extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
+    extern tcuMemcpy3D *cuMemcpy3D;
+    extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
+    extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
+    extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
+    extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
+    extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
+    extern tcuMemcpy2DAsync *cuMemcpy2DAsync;
+    extern tcuMemcpy3DAsync *cuMemcpy3DAsync;
+    extern tcuMemsetD8 *cuMemsetD8;
+    extern tcuMemsetD16 *cuMemsetD16;
+    extern tcuMemsetD32 *cuMemsetD32;
+    extern tcuMemsetD2D8 *cuMemsetD2D8;
+    extern tcuMemsetD2D16 *cuMemsetD2D16;
+    extern tcuMemsetD2D32 *cuMemsetD2D32;
+    extern tcuArrayCreate *cuArrayCreate;
+    extern tcuArrayGetDescriptor *cuArrayGetDescriptor;
+    extern tcuArray3DCreate *cuArray3DCreate;
+    extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
+    extern tcuTexRefSetAddress *cuTexRefSetAddress;
+    extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
+    extern tcuTexRefGetAddress *cuTexRefGetAddress;
+    extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;

-extern tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
-extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
-extern tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
+    extern tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
+    extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
+    extern tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;

-extern tcuProfilerStop *cuProfilerStop;
+    extern tcuProfilerStop *cuProfilerStop;

 #ifdef __cplusplus
 }
 #endif

-//#undef __CUDA_API_VERSION
+// #undef __CUDA_API_VERSION

 #endif //__cuda_drvapi_dynlink_cuda_h__
extern tcuMemcpyDtoD *cuMemcpyDtoD; + extern tcuMemcpyDtoA *cuMemcpyDtoA; + extern tcuMemcpyAtoD *cuMemcpyAtoD; + extern tcuMemcpyHtoA *cuMemcpyHtoA; + extern tcuMemcpyAtoH *cuMemcpyAtoH; + extern tcuMemcpyAtoA *cuMemcpyAtoA; + extern tcuMemcpy2D *cuMemcpy2D; + extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned; + extern tcuMemcpy3D *cuMemcpy3D; + extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync; + extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync; + extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync; + extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync; + extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync; + extern tcuMemcpy2DAsync *cuMemcpy2DAsync; + extern tcuMemcpy3DAsync *cuMemcpy3DAsync; + extern tcuMemsetD8 *cuMemsetD8; + extern tcuMemsetD16 *cuMemsetD16; + extern tcuMemsetD32 *cuMemsetD32; + extern tcuMemsetD2D8 *cuMemsetD2D8; + extern tcuMemsetD2D16 *cuMemsetD2D16; + extern tcuMemsetD2D32 *cuMemsetD2D32; + extern tcuArrayCreate *cuArrayCreate; + extern tcuArrayGetDescriptor *cuArrayGetDescriptor; + extern tcuArray3DCreate *cuArray3DCreate; + extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor; + extern tcuTexRefSetAddress *cuTexRefSetAddress; + extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D; + extern tcuTexRefGetAddress *cuTexRefGetAddress; + extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer; -extern tcuMipmappedArrayCreate *cuMipmappedArrayCreate; -extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel; -extern tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy; + extern tcuMipmappedArrayCreate *cuMipmappedArrayCreate; + extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel; + extern tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy; -extern tcuProfilerStop *cuProfilerStop; + extern tcuProfilerStop *cuProfilerStop; #ifdef __cplusplus } #endif -//#undef __CUDA_API_VERSION +// #undef __CUDA_API_VERSION #endif //__cuda_drvapi_dynlink_cuda_h__ diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h b/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h index 4ca66fde..7c61ff2a 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h @@ -14,21 +14,17 @@ #ifndef HELPER_CUDA_DRVAPI_H #define HELPER_CUDA_DRVAPI_H +#include #include #include #include -#include - #ifndef MAX #define MAX(a, b) (a > b ? a : b) #endif #ifndef HELPER_CUDA_DRVAPI_H -inline int ftoi(float value) { - return (value >= 0 ? static_cast<int>(value + 0.5) - : static_cast<int>(value - 0.5)); -} +inline int ftoi(float value) { return (value >= 0 ? 
static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); } #endif #ifndef EXIT_WAIVED @@ -47,311 +43,302 @@ inline int ftoi(float value) { #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) // These are the inline versions for all of the SDK helper functions -inline void __checkCudaErrors(CUresult err, const char *file, const int line) { - if (CUDA_SUCCESS != err) { - const char *errorStr = NULL; - cuGetErrorString(err, &errorStr); - fprintf(stderr, - "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " - "line %i.\n", - err, errorStr, file, line); - exit(EXIT_FAILURE); - } +inline void __checkCudaErrors(CUresult err, const char *file, const int line) +{ + if (CUDA_SUCCESS != err) { + const char *errorStr = NULL; + cuGetErrorString(err, &errorStr); + fprintf(stderr, + "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " + "line %i.\n", + err, + errorStr, + file, + line); + exit(EXIT_FAILURE); + } } #endif // This function wraps the CUDA Driver API into a template function -template <class T> -inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, - int device) { - checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device)); +template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) +{ + checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device)); } #endif // Beginning of GPU Architecture definitions -inline int _ConvertSMVer2CoresDRV(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine the # - // of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM - // minor version - int Cores; - } sSMtoCores; +inline int _ConvertSMVer2CoresDRV(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # + // of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM + // minor version + int Cores; + } sSMtoCores; - sSMtoCores nGpuArchCoresPerSM[] = { - {0x30, 192}, - {0x32, 192}, - {0x35, 192}, - {0x37, 192}, - {0x50, 128}, - {0x52, 128}, - {0x53, 128}, - {0x60, 64}, - {0x61, 128}, - {0x62, 128}, - {0x70, 64}, - {0x72, 64}, - {0x75, 64}, - {0x80, 64}, - {0x86, 128}, - {0x87, 128}, - {0x90, 128}, - {-1, -1}}; + sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x90, 128}, + {-1, -1}}; - int index = 0; + int index = 0; - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; } - index++; - } - - // If we don't find the values, we default use the previous one to run - // properly - printf( - "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); - return nGpuArchCoresPerSM[index - 1].Cores; + // If we don't find the values, we default use the previous one to run + // properly + printf("MapSMtoCores for SM %d.%d is undefined. 
Default to use %d Cores/SM\n", + major, + minor, + nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; } - // end of GPU Architecture definitions +// end of GPU Architecture definitions #ifdef __cuda_cuda_h__ // General GPU Device CUDA Initialization -inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) { - int cuDevice = 0; - int deviceCount = 0; - checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); +inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) +{ + int cuDevice = 0; + int deviceCount = 0; + checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); - checkCudaErrors(cuDeviceGetCount(&deviceCount)); + checkCudaErrors(cuDeviceGetCount(&deviceCount)); - if (deviceCount == 0) { - fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); - exit(EXIT_FAILURE); - } + if (deviceCount == 0) { + fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } - int dev = 0; - dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device="); + int dev = 0; + dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device="); - if (dev < 0) { - dev = 0; - } + if (dev < 0) { + dev = 0; + } - if (dev > deviceCount - 1) { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", - deviceCount); - fprintf(stderr, - ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", - dev); - fprintf(stderr, "\n"); - return -dev; - } + if (dev > deviceCount - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); + fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev); + fprintf(stderr, "\n"); + return -dev; + } - checkCudaErrors(cuDeviceGet(&cuDevice, dev)); - char name[100]; - checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); + checkCudaErrors(cuDeviceGet(&cuDevice, dev)); + char name[100]; + checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); - int computeMode; - getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); + int computeMode; + getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); - if (computeMode == CU_COMPUTEMODE_PROHIBITED) { - fprintf(stderr, - "Error: device is running in <Compute Mode Prohibited>, no " - "threads can use this CUDA Device.\n"); - return -1; - } + if (computeMode == CU_COMPUTEMODE_PROHIBITED) { + fprintf(stderr, + "Error: device is running in <Compute Mode Prohibited>, no " + "threads can use this CUDA Device.\n"); + return -1; + } - if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) { - printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); - } + if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) { + printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); + } - return dev; + return dev; } // This function returns the best GPU based on performance -inline int gpuGetMaxGflopsDeviceIdDRV() { - CUdevice current_device = 0; - CUdevice max_perf_device = 0; - int device_count = 0; - int sm_per_multiproc = 0; - unsigned long long max_compute_perf = 0; - int major = 0; - int minor = 0; - int multiProcessorCount; - int clockRate; - int devices_prohibited = 0; +inline int gpuGetMaxGflopsDeviceIdDRV() +{ + CUdevice current_device = 0; + CUdevice max_perf_device = 0; + int device_count = 0; + int sm_per_multiproc = 0; + unsigned long long max_compute_perf = 0; + int major = 0; + int minor = 0; + int multiProcessorCount; + int clockRate; + int devices_prohibited = 0; - cuInit(0, __CUDA_API_VERSION); - 
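gpuGetMaxGflopsDeviceIdDRV, whose body continues below, picks a device by a static throughput proxy rather than a measured benchmark: multiprocessor count times cores per SM (from _ConvertSMVer2CoresDRV above) times core clock. A hedged restatement with made-up numbers, noting that CU_DEVICE_ATTRIBUTE_CLOCK_RATE reports kHz:

/* Illustrative sketch only -- not part of the patch. */
static unsigned long long approxPerf(int smCount, int coresPerSM, int clockKHz)
{
    return (unsigned long long)smCount * coresPerSM * clockKHz;
}

/* approxPerf(80, 64, 1400000)  == 7,168,000,000
   approxPerf(28, 128, 1700000) == 6,092,800,000
   so the wider, lower-clocked device wins the comparison below. */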
checkCudaErrors(cuDeviceGetCount(&device_count)); + cuInit(0, __CUDA_API_VERSION); + checkCudaErrors(cuDeviceGetCount(&device_count)); - if (device_count == 0) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); - exit(EXIT_FAILURE); - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) { - checkCudaErrors(cuDeviceGetAttribute( - &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - current_device)); - checkCudaErrors(cuDeviceGetAttribute( - &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device)); - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); - - int computeMode; - getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, - current_device); - - if (computeMode != CU_COMPUTEMODE_PROHIBITED) { - if (major == 9999 && minor == 9999) { - sm_per_multiproc = 1; - } else { - sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); - } - - unsigned long long compute_perf = - (unsigned long long)(multiProcessorCount * sm_per_multiproc * - clockRate); - - if (compute_perf > max_compute_perf) { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } else { - devices_prohibited++; + if (device_count == 0) { + fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); } - ++current_device; - } + // Find the best CUDA capable GPU device + current_device = 0; - if (devices_prohibited == device_count) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode " - "prohibited.\n"); - exit(EXIT_FAILURE); - } + while (current_device < device_count) { + checkCudaErrors( + cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device)); + checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device)); + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); - return max_perf_device; + int computeMode; + getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); + + if (computeMode != CU_COMPUTEMODE_PROHIBITED) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } + else { + sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); + } + + unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate); + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode " + "prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; } // General initialization call to pick the best CUDA Device -inline CUdevice findCudaDeviceDRV(int argc, const char **argv) { - CUdevice cuDevice; - int devID = 0; +inline CUdevice findCudaDeviceDRV(int argc, const char **argv) +{ + CUdevice cuDevice; + int devID = 0; - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - devID = 
gpuDeviceInitDRV(argc, argv); + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + devID = gpuDeviceInitDRV(argc, argv); - if (devID < 0) { - printf("exiting...\n"); - exit(EXIT_SUCCESS); + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } + } + else { + // Otherwise pick the device with highest Gflops/s + char name[100]; + devID = gpuGetMaxGflopsDeviceIdDRV(); + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + cuDeviceGetName(name, 100, cuDevice); + printf("> Using CUDA Device [%d]: %s\n", devID, name); } - } else { - // Otherwise pick the device with highest Gflops/s - char name[100]; - devID = gpuGetMaxGflopsDeviceIdDRV(); - checkCudaErrors(cuDeviceGet(&cuDevice, devID)); - cuDeviceGetName(name, 100, cuDevice); - printf("> Using CUDA Device [%d]: %s\n", devID, name); - } - cuDeviceGet(&cuDevice, devID); + cuDeviceGet(&cuDevice, devID); - return cuDevice; + return cuDevice; } -inline CUdevice findIntegratedGPUDrv() { - CUdevice current_device = 0; - int device_count = 0; - int devices_prohibited = 0; - int isIntegrated; +inline CUdevice findIntegratedGPUDrv() +{ + CUdevice current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + int isIntegrated; - cuInit(0, __CUDA_API_VERSION); - checkCudaErrors(cuDeviceGetCount(&device_count)); + cuInit(0, __CUDA_API_VERSION); + checkCudaErrors(cuDeviceGetCount(&device_count)); - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the integrated GPU which is compute capable - while (current_device < device_count) { - int computeMode = -1; - checkCudaErrors(cuDeviceGetAttribute( - &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device)); - checkCudaErrors(cuDeviceGetAttribute( - &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device)); - - // If GPU is integrated and is not running on Compute Mode prohibited use - // that - if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) { - int major = 0, minor = 0; - char deviceName[256]; - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - current_device)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - current_device)); - checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, deviceName, major, minor); - - return current_device; - } else { - devices_prohibited++; + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); } - current_device++; - } + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1; + checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device)); + checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device)); - if (devices_prohibited == device_count) { - fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n"); - exit(EXIT_FAILURE); - } + // If GPU is integrated and is not running on Compute Mode prohibited use + // that + if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) { + int major = 0, minor = 0; + char deviceName[256]; + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device)); + 
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device)); + checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor); - return -1; + return current_device; + } + else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; } // General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, - int devID) { - CUdevice cuDevice; - char name[256]; - int major = 0, minor = 0; +inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID) +{ + CUdevice cuDevice; + char name[256]; + int major = 0, minor = 0; - checkCudaErrors(cuDeviceGet(&cuDevice, devID)); - checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - if ((major > major_version) || - (major == major_version && minor >= minor_version)) { - printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, - major, minor); - return true; - } else { - printf( - "No GPU device was found that can support CUDA compute capability " - "%d.%d.\n", - major_version, minor_version); - return false; - } + if ((major > major_version) || (major == major_version && minor >= minor_version)) { + printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor); + return true; + } + else { + printf("No GPU device was found that can support CUDA compute capability " + "%d.%d.\n", + major_version, + minor_version); + return false; + } } #endif - // end of CUDA Helper Functions - -#endif // HELPER_CUDA_DRVAPI_H +// end of CUDA Helper Functions +#endif // HELPER_CUDA_DRVAPI_H diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul.h b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul.h index a6fa9300..f5f98c1d 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul.h +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul.h @@ -34,8 +34,8 @@ #define WA (4 * block_size) // Matrix A width #define HA (6 * block_size) // Matrix A height #define WB (4 * block_size) // Matrix B width -#define HB WA // Matrix B height -#define WC WB // Matrix C width -#define HC HA // Matrix C height +#define HB WA // Matrix B height +#define WC WB // Matrix C width +#define HC HA // Matrix C height #endif // _MATRIXMUL_H_ diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMulDynlinkJIT.cpp b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMulDynlinkJIT.cpp index 4b7f6b61..3260a2ee 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMulDynlinkJIT.cpp +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMulDynlinkJIT.cpp @@ -43,10 +43,10 @@ */ // includes, system -#include -#include -#include #include +#include +#include +#include // includes, CUDA #include 
"cuda_drvapi_dynlink.h" @@ -60,7 +60,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int); #if defined _MSC_VER -#pragma warning (disable : 4312) +#pragma warning(disable : 4312) #endif @@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int, // Globals //////////////////////////////////////////////////////////////////////////////// CUcontext g_cuContext; -bool noprompt = false; +bool noprompt = false; static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)"; @@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)"; //////////////////////////////////////////////////////////////////////////////// void randomInit(float *data, size_t size) { - for (size_t i = 0; i < size; ++i) - { + for (size_t i = 0; i < size; ++i) { data[i] = rand() / (float)RAND_MAX; } } @@ -89,33 +88,29 @@ void randomInit(float *data, size_t size) //////////////////////////////////////////////////////////////////////////////// CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out) { - CUresult status; - CUdevice cuDevice; - CUmodule cuModule; + CUresult status; + CUdevice cuDevice; + CUmodule cuModule; CUfunction cuFunction; - int major, minor, block_size, devID = 0; - char deviceName[256]; + int major, minor, block_size, devID = 0; + char deviceName[256]; // link to cuda driver dynamically checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); // This assumes that the user is attempting to specify a explicit device -device=n - if (argc > 1) - { + if (argc > 1) { bool bFound = false; - for (int param=0; param < argc; param++) - { - if (!strncmp(argv[param], "-device", 7)) - { - int i=(int)strlen(argv[1]); + for (int param = 0; param < argc; param++) { + if (!strncmp(argv[param], "-device", 7)) { + int i = (int)strlen(argv[1]); - while (argv[1][i] != '=') - { + while (argv[1][i] != '=') { i--; } - devID = atoi(&argv[1][++i]); + devID = atoi(&argv[1][++i]); bFound = true; } @@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size int deviceCount = 0; checkCudaErrors(cuDeviceGetCount(&deviceCount)); - if (deviceCount == 0) - { + if (deviceCount == 0) { fprintf(stderr, "No devices supporting CUDA detected, exiting...\n"); exit(EXIT_SUCCESS); } - if (devID < 0) devID = 0; + if (devID < 0) + devID = 0; - if (devID > deviceCount -1) - { + if (devID > deviceCount - 1) { fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. 
%d GPU device(s) detected.\n\n", devID, deviceCount); status = CUDA_ERROR_NOT_FOUND; @@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice)); printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor); - block_size = 32; + block_size = 32; *block_size_out = block_size; // create context for picked device status = cuCtxCreate(&g_cuContext, 0, cuDevice); - if (CUDA_SUCCESS != status) - { + if (CUDA_SUCCESS != status) { cuCtxDestroy(g_cuContext); exit(EXIT_SUCCESS); } @@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size { // in this branch we use compilation with parameters const unsigned int jitNumOptions = 3; - CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; - void **jitOptVals = new void *[jitNumOptions]; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void *[jitNumOptions]; // set up size of compilation log buffer - jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; int jitLogBufferSize = 1024; - jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; // set up pointer to the compilation log buffer - jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; char *jitLogBuffer = new char[jitLogBufferSize]; - jitOptVals[1] = jitLogBuffer; + jitOptVals[1] = jitLogBuffer; // set up pointer to set the Maximum # of registers for a particular kernel - jitOptions[2] = CU_JIT_MAX_REGISTERS; + jitOptions[2] = CU_JIT_MAX_REGISTERS; int jitRegCount = 32; - jitOptVals[2] = (void *)(size_t)jitRegCount; + jitOptVals[2] = (void *)(size_t)jitRegCount; // compile with set parameters printf("> Compiling CUDA module\n"); #if defined(_WIN64) || defined(__LP64__) - status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); + status = + cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); #else - status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); + status = + cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); #endif printf("> PTX JIT log:\n%s\n", jitLogBuffer); - delete [] jitOptions; - delete [] jitOptVals; - delete [] jitLogBuffer; + delete[] jitOptions; + delete[] jitOptVals; + delete[] jitLogBuffer; } - if (CUDA_SUCCESS != status) - { + if (CUDA_SUCCESS != status) { printf("Error while compiling PTX\n"); cuCtxDestroy(g_cuContext); exit(EXIT_FAILURE); } // retrieve CUDA function from the compiled module - status = cuModuleGetFunction(&cuFunction, cuModule, - (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); + status = cuModuleGetFunction( + &cuFunction, cuModule, (block_size == 16) ? 
"matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); - if (CUDA_SUCCESS != status) - { + if (CUDA_SUCCESS != status) { cuCtxDestroy(g_cuContext); exit(EXIT_FAILURE); } @@ -233,21 +226,21 @@ int main(int argc, char **argv) printf("[ %s ]\n", sSDKsample); // initialize CUDA - CUfunction matrixMul = NULL; - int block_size = 0; + CUfunction matrixMul = NULL; + int block_size = 0; checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size)); // set seed for rand() srand(2006); // allocate host memory for matrices A and B - size_t size_A = WA * HA; - size_t mem_size_A = sizeof(float) * size_A; - size_t size_B = WB * HB; - size_t mem_size_B = sizeof(float) * size_B; + size_t size_A = WA * HA; + size_t mem_size_A = sizeof(float) * size_A; + size_t size_B = WB * HB; + size_t mem_size_B = sizeof(float) * size_B; - float *h_A = (float *) malloc(mem_size_A); - float *h_B = (float *) malloc(mem_size_B); + float *h_A = (float *)malloc(mem_size_A); + float *h_B = (float *)malloc(mem_size_B); // initialize host memory randomInit(h_A, size_A); @@ -264,26 +257,24 @@ int main(int argc, char **argv) checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); // allocate device memory for result - size_t size_C = WC * HC; - size_t mem_size_C = sizeof(float) * size_C; + size_t size_C = WC * HC; + size_t mem_size_C = sizeof(float) * size_C; CUdeviceptr d_C; checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); // allocate mem for the result on host side - float *h_C = (float *) malloc(mem_size_C); + float *h_C = (float *)malloc(mem_size_C); #if __CUDA_API_VERSION >= 4000 { // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) - int Matrix_Width_A = WA; - int Matrix_Width_B = WB; - void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B }; + int Matrix_Width_A = WA; + int Matrix_Width_B = WB; + void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; - checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1, - block_size , block_size , 1, - 0, - NULL, args, NULL)); + checkCudaErrors(cuLaunchKernel( + matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL)); } #else // __CUDA_API_VERSION <= 3020 { @@ -312,7 +303,7 @@ int main(int argc, char **argv) checkCudaErrors(cuParamSetSize(matrixMul, offset)); checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1)); - checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float))); + checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float))); // set execution configuration for the CUDA kernel checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size)); @@ -322,19 +313,18 @@ int main(int argc, char **argv) checkCudaErrors(cuCtxSynchronize()); // copy result from device to host - checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C)); + checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C)); // compute reference solution - float *reference = (float *) malloc(mem_size_C); + float *reference = (float *)malloc(mem_size_C); computeGold(reference, h_A, h_B, HA, WA, WB); // check result - float diff=0.0f; + float diff = 0.0f; - for (unsigned int i=0; i #include +#include // CUDA runtime #include + #include "nvrtc_helper.h" // Helper functions and utilities to work with CUDA #include -void constantInit(float *data, int size, float val) { - for (int i = 0; i < size; ++i) { - data[i] = val; - } +void constantInit(float *data, int size, float val) +{ + for (int i = 
0; i < size; ++i) { + data[i] = val; + } } /** * Run a simple test of matrix multiplication using CUDA */ -int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, - dim3 &dimsB) { - // Allocate host memory for matrices A and B - unsigned int size_A = dimsA.x * dimsA.y; - unsigned int mem_size_A = sizeof(float) * size_A; - float *h_A = (float *)malloc(mem_size_A); - unsigned int size_B = dimsB.x * dimsB.y; - unsigned int mem_size_B = sizeof(float) * size_B; - float *h_B = (float *)malloc(mem_size_B); +int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) +{ + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = (float *)malloc(mem_size_A); + unsigned int size_B = dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = (float *)malloc(mem_size_B); - // Initialize host memory - const float valB = 0.01f; - constantInit(h_A, size_A, 1.0f); - constantInit(h_B, size_B, valB); + // Initialize host memory + const float valB = 0.01f; + constantInit(h_A, size_A, 1.0f); + constantInit(h_B, size_B, valB); - // Allocate device memory - CUdeviceptr d_A, d_B, d_C; + // Allocate device memory + CUdeviceptr d_A, d_B, d_C; - char *cubin, *kernel_file; - size_t cubinSize; + char *cubin, *kernel_file; + size_t cubinSize; - kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); + kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); - CUmodule module = loadCUBIN(cubin, argc, argv); + CUmodule module = loadCUBIN(cubin, argc, argv); - // Allocate host matrix C - dim3 dimsC(dimsB.x, dimsA.y, 1); - unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); - float *h_C = (float *)malloc(mem_size_C); + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C = (float *)malloc(mem_size_C); - if (h_C == NULL) { - fprintf(stderr, "Failed to allocate host matrix C!\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); - checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); - checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); - - // copy host memory to device - checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); - checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); - - // Setup execution parameters - dim3 threads(block_size, block_size); - dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); - - // Create and start timer - printf("Computing result using CUDA Kernel...\n"); - - CUfunction kernel_addr; - if (block_size == 16) { - checkCudaErrors( - cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); - } else { - checkCudaErrors( - cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32")); - } - - void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, - (void *)&dimsB.x}; - - // Execute the kernel - int nIter = 300; - - for (int j = 0; j < nIter; j++) { - checkCudaErrors( - cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */ - threads.x, threads.y, threads.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &arr[0], /* arguments */ - 0)); - - checkCudaErrors(cuCtxSynchronize()); - } - - // Copy result from device to host - checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C)); - - printf("Checking computed result for correctness: "); - 
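The cuLaunchKernel call above (and its reformatted counterpart further down) follows the standard driver-API convention: kernel parameters are passed as an array of pointers to the argument values, in signature order, alongside explicit grid and block dimensions. A condensed, hedged restatement, where the sizes are made up and kernel_addr is the CUfunction retrieved via cuModuleGetFunction above:

/* Illustrative sketch only -- not part of the patch. */
CUdeviceptr d_C, d_A, d_B;  /* assume allocated with cuMemAlloc and filled */
int widthA = 320, heightA = 320, widthB = 640;
void *args[] = {&d_C, &d_A, &d_B, &widthA, &widthB};
checkCudaErrors(cuLaunchKernel(kernel_addr,
                               widthB / 32, heightA / 32, 1, /* grid dim */
                               32, 32, 1,                    /* block dim */
                               0, NULL,                      /* shared mem, stream */
                               args, NULL));                 /* params, extra */
checkCudaErrors(cuCtxSynchronize());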
- bool correct = true; - - // test relative error by the formula - // |_cpu - _gpu|/<|x|, |y|> < eps - - double eps = 1.e-6; // machine zero - - for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) { - double abs_err = fabs(h_C[i] - (dimsA.x * valB)); - double dot_length = dimsA.x; - double abs_val = fabs(h_C[i]); - double rel_err = abs_err / abs_val / dot_length; - - if (rel_err > eps) { - printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, - h_C[i], dimsA.x * valB, eps); - correct = false; + if (h_C == NULL) { + fprintf(stderr, "Failed to allocate host matrix C!\n"); + exit(EXIT_FAILURE); } - } - printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); + checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); + checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); + // copy host memory to device + checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); + checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); - // Clean up memory - free(h_A); - free(h_B); - free(h_C); + // Setup execution parameters + dim3 threads(block_size, block_size); + dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); - checkCudaErrors(cuMemFree(d_A)); - checkCudaErrors(cuMemFree(d_B)); - checkCudaErrors(cuMemFree(d_C)); + // Create and start timer + printf("Computing result using CUDA Kernel...\n"); - if (correct) { - return EXIT_SUCCESS; - } else { - return EXIT_FAILURE; - } + CUfunction kernel_addr; + if (block_size == 16) { + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); + } + else { + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32")); + } + + void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x}; + + // Execute the kernel + int nIter = 300; + + for (int j = 0; j < nIter; j++) { + checkCudaErrors(cuLaunchKernel(kernel_addr, + grid.x, + grid.y, + grid.z, /* grid dim */ + threads.x, + threads.y, + threads.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); + + checkCudaErrors(cuCtxSynchronize()); + } + + // Copy result from device to host + checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C)); + + printf("Checking computed result for correctness: "); + + bool correct = true; + + // test relative error by the formula + // |_cpu - _gpu|/<|x|, |y|> < eps + + double eps = 1.e-6; // machine zero + + for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) { + double abs_err = fabs(h_C[i] - (dimsA.x * valB)); + double dot_length = dimsA.x; + double abs_val = fabs(h_C[i]); + double rel_err = abs_err / abs_val / dot_length; + + if (rel_err > eps) { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps); + correct = false; + } + } + + printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n"); + + // Clean up memory + free(h_A); + free(h_B); + free(h_C); + + checkCudaErrors(cuMemFree(d_A)); + checkCudaErrors(cuMemFree(d_B)); + checkCudaErrors(cuMemFree(d_C)); + + if (correct) { + return EXIT_SUCCESS; + } + else { + return EXIT_FAILURE; + } } /** * Program main */ -int main(int argc, char **argv) { - printf("[Matrix Multiply Using CUDA] - Starting...\n"); +int main(int argc, char **argv) +{ + printf("[Matrix Multiply Using CUDA] - Starting...\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) { - printf("Usage -device=n (n >= 0 for deviceID)\n"); - printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); - printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); - printf( - " Note: Outer matrix dimensions of A & B matrices must be equal.\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); + printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); + printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); - exit(EXIT_SUCCESS); - } + exit(EXIT_SUCCESS); + } - int block_size = 32; + int block_size = 32; - // original: - dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); - dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); + // original: + dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); + dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); - // reduce sizes to avoid running out of memory - // dim3 dimsA(32,32, 1); - // dim3 dimsB(32,32,1); + // reduce sizes to avoid running out of memory + // dim3 dimsA(32,32, 1); + // dim3 dimsB(32,32,1); - // width of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { - dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); - } + // width of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { + dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + } - // height of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { - dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); - } + // height of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { + dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + } - // width of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { - dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); - } + // width of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { + dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + } - // height of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { - dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); - } + // height of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { + dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + } - if (dimsA.x != dimsB.y) { - printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", - dimsA.x, dimsB.y); - exit(EXIT_FAILURE); - } + if (dimsA.x != dimsB.y) { + printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", dimsA.x, dimsB.y); + exit(EXIT_FAILURE); + } - printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, - dimsB.y); + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); - int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); + int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); - exit(matrix_result); + exit(matrix_result); } diff --git a/Samples/0_Introduction/matrixMul_nvrtc/matrixMul_kernel.cu b/Samples/0_Introduction/matrixMul_nvrtc/matrixMul_kernel.cu index bd541cf9..0b8ce226 100644 --- a/Samples/0_Introduction/matrixMul_nvrtc/matrixMul_kernel.cu +++ b/Samples/0_Introduction/matrixMul_nvrtc/matrixMul_kernel.cu @@ -48,84 +48,83 @@ #include -template <int BLOCK_SIZE> -__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) { - // Handle to thread block group - cooperative_groups::thread_block cta = - cooperative_groups::this_thread_block(); - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; +template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) +{ + // Handle to thread block group + cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * by; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * by; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * bx; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * bx; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of 
each matrix - As[ty][tx] = A[a + wA * ty + tx]; - Bs[ty][tx] = B[b + wB * ty + tx]; + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[ty][tx] = A[a + wA * ty + tx]; + Bs[ty][tx] = B[b + wB * ty + tx]; - // Synchronize to make sure the matrices are loaded - cooperative_groups::sync(cta); + // Synchronize to make sure the matrices are loaded + cooperative_groups::sync(cta); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[ty][k] * Bs[k][tx]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[ty][k] * Bs[k][tx]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + cooperative_groups::sync(cta); } - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - cooperative_groups::sync(cta); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; - C[c + wB * ty + tx] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; + C[c + wB * ty + tx] = Csub; } -extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, - int wA, int wB) { - matrixMulCUDA<16>(C, A, B, wA, wB); +extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB) +{ + matrixMulCUDA<16>(C, A, B, wA, wB); } -extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, - int wA, int wB) { - matrixMulCUDA<32>(C, A, B, wA, wB); +extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB) +{ + matrixMulCUDA<32>(C, A, B, wA, wB); } diff --git a/Samples/0_Introduction/mergeSort/bitonic.cu b/Samples/0_Introduction/mergeSort/bitonic.cu index 56fbae12..144e55e4 100644 --- a/Samples/0_Introduction/mergeSort/bitonic.cu +++ b/Samples/0_Introduction/mergeSort/bitonic.cu @@ -28,252 +28,254 @@ #include namespace cg = cooperative_groups; -#include #include +#include + #include "mergeSort_common.h" -inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, - uint &valB, uint arrowDir) { - uint t; +inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir) +{ + uint t; - if ((keyA > keyB) == arrowDir) { - t = keyA; - keyA = keyB; - keyB = t; - t = valA; - valA = valB; - valB = t; - } + if ((keyA > keyB) == arrowDir) { + t = keyA; + keyA = keyB; + keyB = t; + t = valA; + valA = valB; + valB = t; + } } -__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength, uint sortDir) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Shared memory storage for one or more short vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; +__global__ void +bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Shared memory storage for one or more short vectors + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; - // Offset to 
the beginning of subbatch and load data - d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = d_SrcKey[0]; - s_val[threadIdx.x + 0] = d_SrcVal[0]; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + // Offset to the beginning of subbatch and load data + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[0]; + s_val[threadIdx.x + 0] = d_SrcVal[0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; - for (uint size = 2; size < arrayLength; size <<= 1) { - // Bitonic merge - uint dir = (threadIdx.x & (size / 2)) != 0; + for (uint size = 2; size < arrayLength; size <<= 1) { + // Bitonic merge + uint dir = (threadIdx.x & (size / 2)) != 0; - for (uint stride = size / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], dir); + for (uint stride = size / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir); + } } - } - // ddd == sortDir for the last bitonic merge step - { - for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], sortDir); + // ddd == sortDir for the last bitonic merge step + { + for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir); + } } - } - cg::sync(cta); - d_DstKey[0] = s_key[threadIdx.x + 0]; - d_DstVal[0] = s_val[threadIdx.x + 0]; - d_DstKey[(SHARED_SIZE_LIMIT / 2)] = - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - d_DstVal[(SHARED_SIZE_LIMIT / 2)] = - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + cg::sync(cta); + d_DstKey[0] = s_key[threadIdx.x + 0]; + d_DstVal[0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; } // Helper function (also used by odd-even merge sort) -extern "C" uint factorRadix2(uint *log2L, uint L) { - if (!L) { - *log2L = 0; - return 0; - } else { - for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) - ; +extern "C" uint factorRadix2(uint *log2L, uint L) +{ + if (!L) { + *log2L = 0; + return 0; + } + else { + for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) + ; - return L; - } + return L; + } } -extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint batchSize, uint arrayLength, - uint sortDir) { - // Nothing to sort - if (arrayLength < 2) { - return; - } 
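factorRadix2 above is meant to split L into an odd factor times a power of two and return the odd part; bitonicSortShared, which follows, asserts that this odd factor is 1, i.e. that arrayLength is a power of two. A hedged usage sketch mirroring how the sample itself calls it:

/* Illustrative sketch only -- not part of the patch. */
uint log2L;
uint odd1 = factorRadix2(&log2L, 1024); /* 1024 = 1 * 2^10: returns 1, accepted */
uint odd2 = factorRadix2(&log2L, 768);  /* 768 = 3 * 2^8: returns 3, fails the assert below */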
+extern "C" void bitonicSortShared(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint batchSize, + uint arrayLength, + uint sortDir) +{ + // Nothing to sort + if (arrayLength < 2) { + return; + } - // Only power-of-two array lengths are supported by this implementation - uint log2L; - uint factorizationRemainder = factorRadix2(&log2L, arrayLength); - assert(factorizationRemainder == 1); + // Only power-of-two array lengths are supported by this implementation + uint log2L; + uint factorizationRemainder = factorRadix2(&log2L, arrayLength); + assert(factorizationRemainder == 1); - uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; - uint threadCount = SHARED_SIZE_LIMIT / 2; + uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; + uint threadCount = SHARED_SIZE_LIMIT / 2; - assert(arrayLength <= SHARED_SIZE_LIMIT); - assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0); + assert(arrayLength <= SHARED_SIZE_LIMIT); + assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0); - bitonicSortSharedKernel<<>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir); - getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n"); + bitonicSortSharedKernel<<>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir); + getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n"); } //////////////////////////////////////////////////////////////////////////////// // Merge step 3: merge elementary intervals //////////////////////////////////////////////////////////////////////////////// -static inline __host__ __device__ uint iDivUp(uint a, uint b) { - return ((a % b) == 0) ? (a / b) : (a / b + 1); -} +static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); } -static inline __host__ __device__ uint getSampleCount(uint dividend) { - return iDivUp(dividend, SAMPLE_STRIDE); +static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); } + +template +static inline __device__ void +ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir) +{ + uint t; + + if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1)) + || ((arrowDir != sortDir) && (flagB == 1))) { + t = keyA; + keyA = keyB; + keyB = t; + t = valA; + valA = valB; + valB = t; + t = flagA; + flagA = flagB; + flagB = t; + } } template -static inline __device__ void ComparatorExtended(uint &keyA, uint &valA, - uint &flagA, uint &keyB, - uint &valB, uint &flagB, - uint arrowDir) { - uint t; +__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint *d_LimitsA, + uint *d_LimitsB, + uint stride, + uint N) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint s_key[2 * SAMPLE_STRIDE]; + __shared__ uint s_val[2 * SAMPLE_STRIDE]; + __shared__ uint s_inf[2 * SAMPLE_STRIDE]; - if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || - ((arrowDir == sortDir) && (flagA == 1)) || - ((arrowDir != sortDir) && (flagB == 1))) { - t = keyA; - keyA = keyB; - keyB = t; - t = valA; - valA = valB; - valB = t; - t = flagA; - flagA = flagB; - flagB = t; - } -} + const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); + const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; + d_SrcKey += segmentBase; + d_SrcVal += segmentBase; + d_DstKey += segmentBase; + d_DstVal += 
segmentBase; -template <uint sortDir> -__global__ void bitonicMergeElementaryIntervalsKernel( - uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, - uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ uint s_key[2 * SAMPLE_STRIDE]; - __shared__ uint s_val[2 * SAMPLE_STRIDE]; - __shared__ uint s_inf[2 * SAMPLE_STRIDE]; + // Set up threadblock-wide parameters + __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst; - const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); - const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; - d_SrcKey += segmentBase; - d_SrcVal += segmentBase; - d_DstKey += segmentBase; - d_DstVal += segmentBase; + if (threadIdx.x == 0) { + uint segmentElementsA = stride; + uint segmentElementsB = umin(stride, N - segmentBase - stride); + uint segmentSamplesA = stride / SAMPLE_STRIDE; + uint segmentSamplesB = getSampleCount(segmentElementsB); + uint segmentSamples = segmentSamplesA + segmentSamplesB; - // Set up threadblock-wide parameters - __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst; + startSrcA = d_LimitsA[blockIdx.x]; + startSrcB = d_LimitsB[blockIdx.x]; + startDst = startSrcA + startSrcB; - if (threadIdx.x == 0) { - uint segmentElementsA = stride; - uint segmentElementsB = umin(stride, N - segmentBase - stride); - uint segmentSamplesA = stride / SAMPLE_STRIDE; - uint segmentSamplesB = getSampleCount(segmentElementsB); - uint segmentSamples = segmentSamplesA + segmentSamplesB; + uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA; + uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB; + lenSrcA = endSrcA - startSrcA; + lenSrcB = endSrcB - startSrcB; + } - startSrcA = d_LimitsA[blockIdx.x]; - startSrcB = d_LimitsB[blockIdx.x]; - startDst = startSrcA + startSrcB; + s_inf[threadIdx.x + 0] = 1; + s_inf[threadIdx.x + SAMPLE_STRIDE] = 1; - uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] - : segmentElementsA; - uint endSrcB = (intervalI + 1 < segmentSamples) ? 
d_LimitsB[blockIdx.x + 1] - : segmentElementsB; - lenSrcA = endSrcA - startSrcA; - lenSrcB = endSrcB - startSrcB; - } - - s_inf[threadIdx.x + 0] = 1; - s_inf[threadIdx.x + SAMPLE_STRIDE] = 1; - - // Load input data - cg::sync(cta); - - if (threadIdx.x < lenSrcA) { - s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x]; - s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x]; - s_inf[threadIdx.x] = 0; - } - - // Prepare for bitonic merge by inversing the ordering - if (threadIdx.x < lenSrcB) { - s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = - d_SrcKey[stride + startSrcB + threadIdx.x]; - s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = - d_SrcVal[stride + startSrcB + threadIdx.x]; - s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0; - } - - //"Extended" bitonic merge - for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) { + // Load input data cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0], - s_key[pos + stride], s_val[pos + stride], - s_inf[pos + stride], sortDir); - } - // Store sorted data - cg::sync(cta); - d_DstKey += startDst; - d_DstVal += startDst; + if (threadIdx.x < lenSrcA) { + s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x]; + s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x]; + s_inf[threadIdx.x] = 0; + } - if (threadIdx.x < lenSrcA) { - d_DstKey[threadIdx.x] = s_key[threadIdx.x]; - d_DstVal[threadIdx.x] = s_val[threadIdx.x]; - } + // Prepare for bitonic merge by inversing the ordering + if (threadIdx.x < lenSrcB) { + s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x]; + s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x]; + s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0; + } - if (threadIdx.x < lenSrcB) { - d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; - d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x]; - } + //"Extended" bitonic merge + for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + ComparatorExtended<sortDir>(s_key[pos + 0], + s_val[pos + 0], + s_inf[pos + 0], + s_key[pos + stride], + s_val[pos + stride], + s_inf[pos + stride], + sortDir); + } + + // Store sorted data + cg::sync(cta); + d_DstKey += startDst; + d_DstVal += startDst; + + if (threadIdx.x < lenSrcA) { + d_DstKey[threadIdx.x] = s_key[threadIdx.x]; + d_DstVal[threadIdx.x] = s_val[threadIdx.x]; + } + + if (threadIdx.x < lenSrcB) { + d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; + d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x]; + } } -extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, +extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, uint *d_LimitsA, - uint *d_LimitsB, uint stride, - uint N, uint sortDir) { - uint lastSegmentElements = N % (2 * stride); + uint *d_LimitsB, + uint stride, + uint N, + uint sortDir) +{ + uint lastSegmentElements = N % (2 * stride); - uint mergePairs = (lastSegmentElements > stride) - ? getSampleCount(N) - : (N - lastSegmentElements) / SAMPLE_STRIDE; + uint mergePairs = (lastSegmentElements > stride) ? 
getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; - if (sortDir) { - bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, - N); - getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n"); - } else { - bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, - N); - getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n"); - } + if (sortDir) { + bitonicMergeElementaryIntervalsKernel<1U> + <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N); + getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n"); + } + else { + bitonicMergeElementaryIntervalsKernel<0U> + <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N); + getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n"); + } } diff --git a/Samples/0_Introduction/mergeSort/main.cpp b/Samples/0_Introduction/mergeSort/main.cpp index 0f172194..8eb50184 100644 --- a/Samples/0_Introduction/mergeSort/main.cpp +++ b/Samples/0_Introduction/mergeSort/main.cpp @@ -26,96 +26,94 @@ */ #include <assert.h> +#include <cuda_runtime.h> +#include <helper_cuda.h> +#include <helper_functions.h> #include <stdio.h> #include <stdlib.h> -#include <cuda_runtime.h> -#include <helper_cuda.h> -#include <helper_functions.h> + #include "mergeSort_common.h" //////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; - uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; - StopWatchInterface *hTimer = NULL; +int main(int argc, char **argv) +{ + uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; + uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; + StopWatchInterface *hTimer = NULL; - const uint N = 4 * 1048576; - const uint DIR = 1; - const uint numValues = 65536; + const uint N = 4 * 1048576; + const uint DIR = 1; + const uint numValues = 65536; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - int dev = findCudaDevice(argc, (const char **)argv); + int dev = findCudaDevice(argc, (const char **)argv); - if (dev == -1) { - return EXIT_FAILURE; - } + if (dev == -1) { + return EXIT_FAILURE; + } - printf("Allocating and initializing host arrays...\n\n"); - sdkCreateTimer(&hTimer); - h_SrcKey = (uint *)malloc(N * sizeof(uint)); - h_SrcVal = (uint *)malloc(N * sizeof(uint)); - h_DstKey = (uint *)malloc(N * sizeof(uint)); - h_DstVal = (uint *)malloc(N * sizeof(uint)); + printf("Allocating and initializing host arrays...\n\n"); + sdkCreateTimer(&hTimer); + h_SrcKey = (uint *)malloc(N * sizeof(uint)); + h_SrcVal = (uint *)malloc(N * sizeof(uint)); + h_DstKey = (uint *)malloc(N * sizeof(uint)); + h_DstVal = (uint *)malloc(N * sizeof(uint)); - srand(2009); + srand(2009); - for (uint i = 0; i < N; i++) { - h_SrcKey[i] = rand() % numValues; - } + for (uint i = 0; i < N; i++) { + h_SrcKey[i] = rand() % numValues; + } - fillValues(h_SrcVal, N); + fillValues(h_SrcVal, N); - printf("Allocating and initializing CUDA arrays...\n\n"); - checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint))); - checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint))); - checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint))); - checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint))); - checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint))); - checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint))); - 
checkCudaErrors( - cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice)); + printf("Allocating and initializing CUDA arrays...\n\n"); + checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint))); + checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice)); - printf("Initializing GPU merge sort...\n"); - initMergeSort(); + printf("Initializing GPU merge sort...\n"); + initMergeSort(); - printf("Running GPU merge sort...\n"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - printf("Time: %f ms\n", sdkGetTimerValue(&hTimer)); + printf("Running GPU merge sort...\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + printf("Time: %f ms\n", sdkGetTimerValue(&hTimer)); - printf("Reading back GPU merge sort results...\n"); - checkCudaErrors( - cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost)); + printf("Reading back GPU merge sort results...\n"); + checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost)); - printf("Inspecting the results...\n"); - uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR); + printf("Inspecting the results...\n"); + uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR); - uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N); + uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N); - printf("Shutting down...\n"); - closeMergeSort(); - sdkDeleteTimer(&hTimer); - checkCudaErrors(cudaFree(d_SrcVal)); - checkCudaErrors(cudaFree(d_SrcKey)); - checkCudaErrors(cudaFree(d_BufVal)); - checkCudaErrors(cudaFree(d_BufKey)); - checkCudaErrors(cudaFree(d_DstVal)); - checkCudaErrors(cudaFree(d_DstKey)); - free(h_DstVal); - free(h_DstKey); - free(h_SrcVal); - free(h_SrcKey); + printf("Shutting down...\n"); + closeMergeSort(); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cudaFree(d_SrcVal)); + checkCudaErrors(cudaFree(d_SrcKey)); + checkCudaErrors(cudaFree(d_BufVal)); + checkCudaErrors(cudaFree(d_BufKey)); + checkCudaErrors(cudaFree(d_DstVal)); + checkCudaErrors(cudaFree(d_DstKey)); + free(h_DstVal); + free(h_DstKey); + free(h_SrcVal); + free(h_SrcKey); - exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE); + exit((keysFlag && valuesFlag) ? 
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/mergeSort/mergeSort.cu b/Samples/0_Introduction/mergeSort/mergeSort.cu index c3847a93..d2048a17 100644 --- a/Samples/0_Introduction/mergeSort/mergeSort.cu +++ b/Samples/0_Introduction/mergeSort/mergeSort.cu @@ -39,491 +39,499 @@ namespace cg = cooperative_groups; #include <helper_cuda.h> + #include "mergeSort_common.h" //////////////////////////////////////////////////////////////////////////////// // Helper functions //////////////////////////////////////////////////////////////////////////////// -static inline __host__ __device__ uint iDivUp(uint a, uint b) { - return ((a % b) == 0) ? (a / b) : (a / b + 1); -} +static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); } -static inline __host__ __device__ uint getSampleCount(uint dividend) { - return iDivUp(dividend, SAMPLE_STRIDE); -} +static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); } #define W (sizeof(uint) * 8) -static inline __device__ uint nextPowerOfTwo(uint x) { - /* - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; - */ - return 1U << (W - __clz(x - 1)); +static inline __device__ uint nextPowerOfTwo(uint x) +{ + /* + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; + */ + return 1U << (W - __clz(x - 1)); } -template <uint sortDir> -static inline __device__ uint binarySearchInclusive(uint val, uint *data, - uint L, uint stride) { - if (L == 0) { - return 0; - } - - uint pos = 0; - - for (; stride > 0; stride >>= 1) { - uint newPos = umin(pos + stride, L); - - if ((sortDir && (data[newPos - 1] <= val)) || - (!sortDir && (data[newPos - 1] >= val))) { - pos = newPos; +template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride) +{ + if (L == 0) { + return 0; } - } - return pos; + uint pos = 0; + + for (; stride > 0; stride >>= 1) { + uint newPos = umin(pos + stride, L); + + if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) { + pos = newPos; + } + } + + return pos; } -template <uint sortDir> -static inline __device__ uint binarySearchExclusive(uint val, uint *data, - uint L, uint stride) { - if (L == 0) { - return 0; - } - - uint pos = 0; - - for (; stride > 0; stride >>= 1) { - uint newPos = umin(pos + stride, L); - - if ((sortDir && (data[newPos - 1] < val)) || - (!sortDir && (data[newPos - 1] > val))) { - pos = newPos; +template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride) +{ + if (L == 0) { + return 0; } - } - return pos; + uint pos = 0; + + for (; stride > 0; stride >>= 1) { + uint newPos = umin(pos + stride, L); + + if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) { + pos = newPos; + } + } + + return pos; } //////////////////////////////////////////////////////////////////////////////// // Bottom-level merge sort (binary search-based) //////////////////////////////////////////////////////////////////////////////// template <uint sortDir> -__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; +__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength) +{ + 
// Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; - d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = d_SrcKey[0]; - s_val[threadIdx.x + 0] = d_SrcVal[0]; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[0]; + s_val[threadIdx.x + 0] = d_SrcVal[0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; - for (uint stride = 1; stride < arrayLength; stride <<= 1) { - uint lPos = threadIdx.x & (stride - 1); - uint *baseKey = s_key + 2 * (threadIdx.x - lPos); - uint *baseVal = s_val + 2 * (threadIdx.x - lPos); + for (uint stride = 1; stride < arrayLength; stride <<= 1) { + uint lPos = threadIdx.x & (stride - 1); + uint *baseKey = s_key + 2 * (threadIdx.x - lPos); + uint *baseVal = s_val + 2 * (threadIdx.x - lPos); + + cg::sync(cta); + uint keyA = baseKey[lPos + 0]; + uint valA = baseVal[lPos + 0]; + uint keyB = baseKey[lPos + stride]; + uint valB = baseVal[lPos + stride]; + uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos; + uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos; + + cg::sync(cta); + baseKey[posA] = keyA; + baseVal[posA] = valA; + baseKey[posB] = keyB; + baseVal[posB] = valB; + } cg::sync(cta); - uint keyA = baseKey[lPos + 0]; - uint valA = baseVal[lPos + 0]; - uint keyB = baseKey[lPos + stride]; - uint valB = baseVal[lPos + stride]; - uint posA = - binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + - lPos; - uint posB = - binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + - lPos; - - cg::sync(cta); - baseKey[posA] = keyA; - baseVal[posA] = valA; - baseKey[posB] = keyB; - baseVal[posB] = valB; - } - - cg::sync(cta); - d_DstKey[0] = s_key[threadIdx.x + 0]; - d_DstVal[0] = s_val[threadIdx.x + 0]; - d_DstKey[(SHARED_SIZE_LIMIT / 2)] = - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - d_DstVal[(SHARED_SIZE_LIMIT / 2)] = - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstKey[0] = s_key[threadIdx.x + 0]; + d_DstVal[0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; } -static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, - uint *d_SrcVal, uint batchSize, uint arrayLength, - uint sortDir) { - if (arrayLength < 2) { - return; - } +static void mergeSortShared(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint batchSize, + uint arrayLength, + uint sortDir) +{ + if (arrayLength < 2) { + return; + } - assert(SHARED_SIZE_LIMIT % arrayLength == 0); - assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0); - uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; - uint threadCount 
= SHARED_SIZE_LIMIT / 2; + assert(SHARED_SIZE_LIMIT % arrayLength == 0); + assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0); + uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; + uint threadCount = SHARED_SIZE_LIMIT / 2; - if (sortDir) { - mergeSortSharedKernel<1U><<<blockCount, threadCount>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength); - getLastCudaError("mergeSortShared<1><<<>>> failed\n"); - } else { - mergeSortSharedKernel<0U><<<blockCount, threadCount>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength); - getLastCudaError("mergeSortShared<0><<<>>> failed\n"); - } + if (sortDir) { + mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength); + getLastCudaError("mergeSortShared<1><<<>>> failed\n"); + } + else { + mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength); + getLastCudaError("mergeSortShared<0><<<>>> failed\n"); + } } //////////////////////////////////////////////////////////////////////////////// // Merge step 1: generate sample ranks //////////////////////////////////////////////////////////////////////////////// template <uint sortDir> -__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, - uint *d_SrcKey, uint stride, uint N, - uint threadCount) { - uint pos = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void +generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount) +{ + uint pos = blockIdx.x * blockDim.x + threadIdx.x; - if (pos >= threadCount) { - return; - } + if (pos >= threadCount) { + return; + } - const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); - const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); - d_SrcKey += segmentBase; - d_RanksA += segmentBase / SAMPLE_STRIDE; - d_RanksB += segmentBase / SAMPLE_STRIDE; + const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); + const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + d_SrcKey += segmentBase; + d_RanksA += segmentBase / SAMPLE_STRIDE; + d_RanksB += segmentBase / SAMPLE_STRIDE; - const uint segmentElementsA = stride; - const uint segmentElementsB = umin(stride, N - segmentBase - stride); - const uint segmentSamplesA = getSampleCount(segmentElementsA); - const uint segmentSamplesB = getSampleCount(segmentElementsB); + const uint segmentElementsA = stride; + const uint segmentElementsB = umin(stride, N - segmentBase - stride); + const uint segmentSamplesA = getSampleCount(segmentElementsA); + const uint segmentSamplesB = getSampleCount(segmentElementsB); - if (i < segmentSamplesA) { - d_RanksA[i] = i * SAMPLE_STRIDE; - d_RanksB[i] = binarySearchExclusive<sortDir>( - d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, - nextPowerOfTwo(segmentElementsB)); - } + if (i < segmentSamplesA) { + d_RanksA[i] = i * SAMPLE_STRIDE; + d_RanksB[i] = binarySearchExclusive<sortDir>( + d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB)); + } - if (i < segmentSamplesB) { - d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; - d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>( - d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, - nextPowerOfTwo(segmentElementsA)); - } + if (i < segmentSamplesB) { + d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; + d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>( + d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA)); + } } -static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, - uint 
stride, uint N, uint sortDir) { - uint lastSegmentElements = N % (2 * stride); - uint threadCount = - (lastSegmentElements > stride) - ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) - : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); +static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir) +{ + uint lastSegmentElements = N % (2 * stride); + uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) + : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - if (sortDir) { - generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>( - d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); - getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n"); - } else { - generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>( - d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); - getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n"); - } + if (sortDir) { + generateSampleRanksKernel<1U> + <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); + getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n"); + } + else { + generateSampleRanksKernel<0U> + <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); + getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n"); + } } //////////////////////////////////////////////////////////////////////////////// // Merge step 2: generate sample ranks and indices //////////////////////////////////////////////////////////////////////////////// -__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, - uint stride, uint N, - uint threadCount) { - uint pos = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount) +{ + uint pos = blockIdx.x * blockDim.x + threadIdx.x; - if (pos >= threadCount) { - return; - } + if (pos >= threadCount) { + return; + } - const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); - const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); - d_Ranks += (pos - i) * 2; - d_Limits += (pos - i) * 2; + const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); + const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + d_Ranks += (pos - i) * 2; + d_Limits += (pos - i) * 2; - const uint segmentElementsA = stride; - const uint segmentElementsB = umin(stride, N - segmentBase - stride); - const uint segmentSamplesA = getSampleCount(segmentElementsA); - const uint segmentSamplesB = getSampleCount(segmentElementsB); + const uint segmentElementsA = stride; + const uint segmentElementsB = umin(stride, N - segmentBase - stride); + const uint segmentSamplesA = getSampleCount(segmentElementsA); + const uint segmentSamplesB = getSampleCount(segmentElementsB); - if (i < segmentSamplesA) { - uint dstPos = binarySearchExclusive<1U>( - d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, - nextPowerOfTwo(segmentSamplesB)) + - i; - d_Limits[dstPos] = d_Ranks[i]; - } + if (i < segmentSamplesA) { + uint dstPos = binarySearchExclusive<1U>( + d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + + i; + d_Limits[dstPos] = d_Ranks[i]; + } - if (i < segmentSamplesB) { - uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i], - d_Ranks, segmentSamplesA, - nextPowerOfTwo(segmentSamplesA)) + - i; - d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; - } + if (i < segmentSamplesB) { + uint dstPos = binarySearchInclusive<1U>( + d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, 
nextPowerOfTwo(segmentSamplesA)) + + i; + d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; + } } -static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, - uint *d_RanksA, uint *d_RanksB, uint stride, - uint N) { - uint lastSegmentElements = N % (2 * stride); - uint threadCount = - (lastSegmentElements > stride) - ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) - : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); +static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N) +{ + uint lastSegmentElements = N % (2 * stride); + uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) + : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>( - d_LimitsA, d_RanksA, stride, N, threadCount); - getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n"); + mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount); + getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n"); - mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>( - d_LimitsB, d_RanksB, stride, N, threadCount); - getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n"); + mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount); + getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n"); } //////////////////////////////////////////////////////////////////////////////// // Merge step 3: merge elementary intervals //////////////////////////////////////////////////////////////////////////////// template <uint sortDir> -inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey, - uint *srcAVal, uint *srcBKey, uint *srcBVal, - uint lenA, uint nPowTwoLenA, uint lenB, - uint nPowTwoLenB, cg::thread_block cta) { - uint keyA, valA, keyB, valB, dstPosA, dstPosB; +inline __device__ void merge(uint *dstKey, + uint *dstVal, + uint *srcAKey, + uint *srcAVal, + uint *srcBKey, + uint *srcBVal, + uint lenA, + uint nPowTwoLenA, + uint lenB, + uint nPowTwoLenB, + cg::thread_block cta) +{ + uint keyA, valA, keyB, valB, dstPosA, dstPosB; - if (threadIdx.x < lenA) { - keyA = srcAKey[threadIdx.x]; - valA = srcAVal[threadIdx.x]; - dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + - threadIdx.x; - } + if (threadIdx.x < lenA) { + keyA = srcAKey[threadIdx.x]; + valA = srcAVal[threadIdx.x]; + dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x; + } - if (threadIdx.x < lenB) { - keyB = srcBKey[threadIdx.x]; - valB = srcBVal[threadIdx.x]; - dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + - threadIdx.x; - } + if (threadIdx.x < lenB) { + keyB = srcBKey[threadIdx.x]; + valB = srcBVal[threadIdx.x]; + dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x; + } - cg::sync(cta); + cg::sync(cta); - if (threadIdx.x < lenA) { - dstKey[dstPosA] = keyA; - dstVal[dstPosA] = valA; - } + if (threadIdx.x < lenA) { + dstKey[dstPosA] = keyA; + dstVal[dstPosA] = valA; + } - if (threadIdx.x < lenB) { - dstKey[dstPosB] = keyB; - dstVal[dstPosB] = valB; - } + if (threadIdx.x < lenB) { + dstKey[dstPosB] = keyB; + dstVal[dstPosB] = valB; + } } template <uint sortDir> -__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint *d_LimitsA, uint *d_LimitsB, - uint stride, uint N) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ uint s_key[2 * SAMPLE_STRIDE]; - __shared__ uint s_val[2 * 
SAMPLE_STRIDE]; +__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint *d_LimitsA, + uint *d_LimitsB, + uint stride, + uint N) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint s_key[2 * SAMPLE_STRIDE]; + __shared__ uint s_val[2 * SAMPLE_STRIDE]; - const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); - const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; - d_SrcKey += segmentBase; - d_SrcVal += segmentBase; - d_DstKey += segmentBase; - d_DstVal += segmentBase; + const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); + const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; + d_SrcKey += segmentBase; + d_SrcVal += segmentBase; + d_DstKey += segmentBase; + d_DstVal += segmentBase; - // Set up threadblock-wide parameters - __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; + // Set up threadblock-wide parameters + __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; - if (threadIdx.x == 0) { - uint segmentElementsA = stride; - uint segmentElementsB = umin(stride, N - segmentBase - stride); - uint segmentSamplesA = getSampleCount(segmentElementsA); - uint segmentSamplesB = getSampleCount(segmentElementsB); - uint segmentSamples = segmentSamplesA + segmentSamplesB; + if (threadIdx.x == 0) { + uint segmentElementsA = stride; + uint segmentElementsB = umin(stride, N - segmentBase - stride); + uint segmentSamplesA = getSampleCount(segmentElementsA); + uint segmentSamplesB = getSampleCount(segmentElementsB); + uint segmentSamples = segmentSamplesA + segmentSamplesB; - startSrcA = d_LimitsA[blockIdx.x]; - startSrcB = d_LimitsB[blockIdx.x]; - uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] - : segmentElementsA; - uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] - : segmentElementsB; - lenSrcA = endSrcA - startSrcA; - lenSrcB = endSrcB - startSrcB; - startDstA = startSrcA + startSrcB; - startDstB = startDstA + lenSrcA; - } - - // Load main input data - cg::sync(cta); - - if (threadIdx.x < lenSrcA) { - s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x]; - s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x]; - } - - if (threadIdx.x < lenSrcB) { - s_key[threadIdx.x + SAMPLE_STRIDE] = - d_SrcKey[stride + startSrcB + threadIdx.x]; - s_val[threadIdx.x + SAMPLE_STRIDE] = - d_SrcVal[stride + startSrcB + threadIdx.x]; - } - - // Merge data in shared memory - cg::sync(cta); - merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE, - s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB, - SAMPLE_STRIDE, cta); - - // Store merged data - cg::sync(cta); - - if (threadIdx.x < lenSrcA) { - d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x]; - d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x]; - } - - if (threadIdx.x < lenSrcB) { - d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; - d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x]; - } -} -static void mergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint *d_LimitsA, uint *d_LimitsB, - uint stride, uint N, uint sortDir) { - uint lastSegmentElements = N % (2 * stride); - uint mergePairs = (lastSegmentElements > stride) - ? 
getSampleCount(N) - : (N - lastSegmentElements) / SAMPLE_STRIDE; - - if (sortDir) { - mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, - N); - getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n"); - } else { - mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, - N); - getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n"); - } -} -extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint batchSize, uint arrayLength, - uint sortDir); -extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint *d_LimitsA, - uint *d_LimitsB, uint stride, - uint N, uint sortDir); -static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB; -static const uint MAX_SAMPLE_COUNT = 32768; -extern "C" void initMergeSort(void) { - checkCudaErrors( - cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint))); - checkCudaErrors( - cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint))); - checkCudaErrors( - cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint))); - checkCudaErrors( - cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint))); -} -extern "C" void closeMergeSort(void) { - checkCudaErrors(cudaFree(d_RanksA)); - checkCudaErrors(cudaFree(d_RanksB)); - checkCudaErrors(cudaFree(d_LimitsB)); - checkCudaErrors(cudaFree(d_LimitsA)); -} -extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey, - uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal, - uint N, uint sortDir) { - uint stageCount = 0; - - for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) - ; - - uint *ikey, *ival, *okey, *oval; - - if (stageCount & 1) { - ikey = d_BufKey; - ival = d_BufVal; - okey = d_DstKey; - oval = d_DstVal; - } else { - ikey = d_DstKey; - ival = d_DstVal; - okey = d_BufKey; - oval = d_BufVal; - } - - assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT)); - assert(N % SHARED_SIZE_LIMIT == 0); - mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, - SHARED_SIZE_LIMIT, sortDir); - - for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) { - uint lastSegmentElements = N % (2 * stride); - - // Find sample ranks and prepare for limiters merge - generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir); - - // Merge ranks and indices - mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N); - - // Merge elementary intervals - mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, - stride, N, sortDir); - - if (lastSegmentElements <= stride) { - // Last merge segment consists of a single array which just needs to be - // passed through - checkCudaErrors(cudaMemcpy( - okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), - lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaMemcpy( - oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), - lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice)); + startSrcA = d_LimitsA[blockIdx.x]; + startSrcB = d_LimitsB[blockIdx.x]; + uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA; + uint endSrcB = (intervalI + 1 < segmentSamples) ? 
d_LimitsB[blockIdx.x + 1] : segmentElementsB; + lenSrcA = endSrcA - startSrcA; + lenSrcB = endSrcB - startSrcB; + startDstA = startSrcA + startSrcB; + startDstB = startDstA + lenSrcA; } + // Load main input data + cg::sync(cta); + + if (threadIdx.x < lenSrcA) { + s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x]; + s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x]; + } + + if (threadIdx.x < lenSrcB) { + s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x]; + s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x]; + } + + // Merge data in shared memory + cg::sync(cta); + merge<sortDir>(s_key, + s_val, + s_key + 0, + s_val + 0, + s_key + SAMPLE_STRIDE, + s_val + SAMPLE_STRIDE, + lenSrcA, + SAMPLE_STRIDE, + lenSrcB, + SAMPLE_STRIDE, + cta); + + // Store merged data + cg::sync(cta); + + if (threadIdx.x < lenSrcA) { + d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x]; + d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x]; + } + + if (threadIdx.x < lenSrcB) { + d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; + d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x]; + } } -extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, +static void mergeElementaryIntervals(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint *d_LimitsA, + uint *d_LimitsB, + uint stride, + uint N, + uint sortDir) +{ + uint lastSegmentElements = N % (2 * stride); + uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + + if (sortDir) { + mergeElementaryIntervalsKernel<1U> + <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N); + getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n"); + } + else { + mergeElementaryIntervalsKernel<0U> + <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N); + getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n"); + } } +extern "C" void bitonicSortShared(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint batchSize, + uint arrayLength, + uint sortDir); +extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint *d_LimitsA, + uint *d_LimitsB, + uint stride, + uint N, + uint sortDir); +static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB; +static const uint MAX_SAMPLE_COUNT = 32768; +extern "C" void initMergeSort(void) +{ + checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint))); } +extern "C" void closeMergeSort(void) +{ + checkCudaErrors(cudaFree(d_RanksA)); + checkCudaErrors(cudaFree(d_RanksB)); + checkCudaErrors(cudaFree(d_LimitsB)); + checkCudaErrors(cudaFree(d_LimitsA)); } +extern "C" void mergeSort(uint *d_DstKey, + uint *d_DstVal, + uint *d_BufKey, + uint *d_BufVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint N, + uint sortDir) +{ + uint stageCount = 0; + + for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) + ; + + uint *ikey, *ival, *okey, *oval; + + if (stageCount & 1) { + ikey = d_BufKey; + ival = d_BufVal; + okey = d_DstKey; + oval = d_DstVal; + } + else { + ikey = d_DstKey; + ival = 
d_DstVal; + okey = d_BufKey; + oval = d_BufVal; + } + + assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT)); + assert(N % SHARED_SIZE_LIMIT == 0); + mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir); + + for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) { + uint lastSegmentElements = N % (2 * stride); + + // Find sample ranks and prepare for limiters merge + generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir); + + // Merge ranks and indices + mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N); + + // Merge elementary intervals + mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir); + + if (lastSegmentElements <= stride) { + // Last merge segment consists of a single array which just needs to be + // passed through + checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements), + ikey + (N - lastSegmentElements), + lastSegmentElements * sizeof(uint), + cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements), + ival + (N - lastSegmentElements), + lastSegmentElements * sizeof(uint), + cudaMemcpyDeviceToDevice)); + } + + uint *t; + t = ikey; + ikey = okey; + okey = t; + t = ival; + ival = oval; + oval = t; + } } diff --git a/Samples/0_Introduction/mergeSort/mergeSort_common.h b/Samples/0_Introduction/mergeSort/mergeSort_common.h index d59d1baf..10835ff4 100644 --- a/Samples/0_Introduction/mergeSort/mergeSort_common.h +++ b/Samples/0_Introduction/mergeSort/mergeSort_common.h @@ -31,19 +31,17 @@ typedef unsigned int uint; #define SHARED_SIZE_LIMIT 1024U -#define SAMPLE_STRIDE 128 +#define SAMPLE_STRIDE 128 //////////////////////////////////////////////////////////////////////////////// // Extensive sort validation routine //////////////////////////////////////////////////////////////////////////////// -extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, - uint arrayLength, uint numValues, - uint sortDir); +extern "C" uint +validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir); extern "C" void fillValues(uint *val, uint N); -extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, - uint batchSize, uint arrayLength); +extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength); //////////////////////////////////////////////////////////////////////////////// // CUDA merge sort @@ -52,13 +50,11 @@ extern "C" void initMergeSort(void); extern "C" void closeMergeSort(void); -extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, - uint *bufVal, uint *srcKey, uint *srcVal, uint N, - uint sortDir); +extern "C" void +mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir); //////////////////////////////////////////////////////////////////////////////// // CPU "emulation" //////////////////////////////////////////////////////////////////////////////// -extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, - uint *bufVal, uint *srcKey, uint *srcVal, uint N, - uint sortDir); +extern "C" void +mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir); diff --git a/Samples/0_Introduction/mergeSort/mergeSort_host.cpp b/Samples/0_Introduction/mergeSort/mergeSort_host.cpp index 1006cf1f..4a322f14 100644 --- 
a/Samples/0_Introduction/mergeSort/mergeSort_host.cpp +++ b/Samples/0_Introduction/mergeSort/mergeSort_host.cpp @@ -29,329 +29,335 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> + #include "mergeSort_common.h" //////////////////////////////////////////////////////////////////////////////// // Helper functions //////////////////////////////////////////////////////////////////////////////// -static void checkOrder(uint *data, uint N, uint sortDir) { - if (N <= 1) { - return; - } - - for (uint i = 0; i < N - 1; i++) - if ((sortDir && (data[i] > data[i + 1])) || - (!sortDir && (data[i] < data[i + 1]))) { - fprintf(stderr, "checkOrder() failed!!!\n"); - exit(EXIT_FAILURE); +static void checkOrder(uint *data, uint N, uint sortDir) +{ + if (N <= 1) { + return; } + + for (uint i = 0; i < N - 1; i++) + if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) { + fprintf(stderr, "checkOrder() failed!!!\n"); + exit(EXIT_FAILURE); + } } static uint umin(uint a, uint b) { return (a <= b) ? a : b; } -static uint getSampleCount(uint dividend) { - return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) - : (dividend / SAMPLE_STRIDE); +static uint getSampleCount(uint dividend) +{ + return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE); } -static uint nextPowerOfTwo(uint x) { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; +static uint nextPowerOfTwo(uint x) +{ + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; } -static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) { - if (L == 0) { - return 0; - } - - uint pos = 0; - - for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) { - uint newPos = umin(pos + stride, L); - - if ((sortDir && (data[newPos - 1] <= val)) || - (!sortDir && (data[newPos - 1] >= val))) { - pos = newPos; +static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) +{ + if (L == 0) { + return 0; } - } - return pos; + uint pos = 0; + + for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) { + uint newPos = umin(pos + stride, L); + + if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) { + pos = newPos; + } + } + + return pos; } -static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) { - if (L == 0) { - return 0; - } - - uint pos = 0; - - for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) { - uint newPos = umin(pos + stride, L); - - if ((sortDir && (data[newPos - 1] < val)) || - (!sortDir && (data[newPos - 1] > val))) { - pos = newPos; +static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) +{ + if (L == 0) { + return 0; } + } - return pos; + uint pos = 0; + + for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) { + uint newPos = umin(pos + stride, L); + + if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) { + pos = newPos; + } + } + + return pos; } //////////////////////////////////////////////////////////////////////////////// // Merge step 1: find sample ranks in each segment //////////////////////////////////////////////////////////////////////////////// -static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, - uint stride, uint N, uint sortDir) { - uint lastSegmentElements = N % (2 * stride); - uint sampleCount = - (lastSegmentElements > stride) - ? 
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) - : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); +static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir) +{ + uint lastSegmentElements = N % (2 * stride); + uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) + : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - for (uint pos = 0; pos < sampleCount; pos++) { - const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); - const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + for (uint pos = 0; pos < sampleCount; pos++) { + const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); + const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); - const uint lenA = stride; - const uint lenB = umin(stride, N - segmentBase - stride); - const uint nA = stride / SAMPLE_STRIDE; - const uint nB = getSampleCount(lenB); + const uint lenA = stride; + const uint lenB = umin(stride, N - segmentBase - stride); + const uint nA = stride / SAMPLE_STRIDE; + const uint nB = getSampleCount(lenB); - if (i < nA) { - ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE; - ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = - binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE], - srcKey + segmentBase + stride, lenB, sortDir); + if (i < nA) { + ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE; + ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive( + srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir); + } + + if (i < nB) { + ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE; + ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive( + srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir); + } } - - if (i < nB) { - ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE; - ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = - binarySearchInclusive( - srcKey[segmentBase + stride + i * SAMPLE_STRIDE], - srcKey + segmentBase, lenA, sortDir); - } - } } //////////////////////////////////////////////////////////////////////////////// // Merge step 2: merge ranks and indices to derive elementary intervals //////////////////////////////////////////////////////////////////////////////// -static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, - uint N) { - uint lastSegmentElements = N % (2 * stride); - uint sampleCount = - (lastSegmentElements > stride) - ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) - : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); +static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N) +{ + uint lastSegmentElements = N % (2 * stride); + uint sampleCount = (lastSegmentElements > stride) ? 
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) + : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - for (uint pos = 0; pos < sampleCount; pos++) { - const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); - const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + for (uint pos = 0; pos < sampleCount; pos++) { + const uint i = pos & ((stride / SAMPLE_STRIDE) - 1); + const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); - const uint lenA = stride; - const uint lenB = umin(stride, N - segmentBase - stride); - const uint nA = stride / SAMPLE_STRIDE; - const uint nB = getSampleCount(lenB); + const uint lenA = stride; + const uint lenB = umin(stride, N - segmentBase - stride); + const uint nA = stride / SAMPLE_STRIDE; + const uint nB = getSampleCount(lenB); - if (i < nA) { - uint dstPosA = - binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], - ranks + (segmentBase + stride) / SAMPLE_STRIDE, - nB, 1) + - i; - assert(dstPosA < nA + nB); - limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = - ranks[(segmentBase + 0) / SAMPLE_STRIDE + i]; + if (i < nA) { + uint dstPosA = + binarySearchExclusive( + ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1) + + i; + assert(dstPosA < nA + nB); + limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i]; + } + + if (i < nB) { + uint dstPosA = + binarySearchInclusive( + ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) + + i; + assert(dstPosA < nA + nB); + limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i]; + } } - - if (i < nB) { - uint dstPosA = binarySearchInclusive( - ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], - ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) + - i; - assert(dstPosA < nA + nB); - limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = - ranks[(segmentBase + stride) / SAMPLE_STRIDE + i]; - } - } } //////////////////////////////////////////////////////////////////////////////// // Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE) //////////////////////////////////////////////////////////////////////////////// -static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal, - uint *srcBKey, uint *srcBVal, uint lenA, uint lenB, - uint sortDir) { - checkOrder(srcAKey, lenA, sortDir); - checkOrder(srcBKey, lenB, sortDir); +static void merge(uint *dstKey, + uint *dstVal, + uint *srcAKey, + uint *srcAVal, + uint *srcBKey, + uint *srcBVal, + uint lenA, + uint lenB, + uint sortDir) +{ + checkOrder(srcAKey, lenA, sortDir); + checkOrder(srcBKey, lenB, sortDir); - for (uint i = 0; i < lenA; i++) { - uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i; - assert(dstPos < lenA + lenB); - dstKey[dstPos] = srcAKey[i]; - dstVal[dstPos] = srcAVal[i]; - } + for (uint i = 0; i < lenA; i++) { + uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i; + assert(dstPos < lenA + lenB); + dstKey[dstPos] = srcAKey[i]; + dstVal[dstPos] = srcAVal[i]; + } - for (uint i = 0; i < lenB; i++) { - uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i; - assert(dstPos < lenA + lenB); - dstKey[dstPos] = srcBKey[i]; - dstVal[dstPos] = srcBVal[i]; - } + for (uint i = 0; i < lenB; i++) { + uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i; + assert(dstPos < lenA + lenB); + dstKey[dstPos] = srcBKey[i]; + dstVal[dstPos] = srcBVal[i]; 
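The exclusive/inclusive search pair above is what keeps the merge stable for equal keys: an element a[i] lands at i plus the number of B elements strictly below it, while an equal b[j] lands at j plus the number of A elements at or below it, so ties from A always precede ties from B and no two destinations collide. A self-contained host sketch of the same placement rule for the ascending (sortDir == 1) case, with linear scans standing in for the power-of-two binary searches (hypothetical demo code, not part of this patch):

#include <stdio.h>

// Rank of v among d[0..n): exclusive counts keys strictly below v,
// inclusive also counts keys equal to v.
static unsigned int rankExclusive(unsigned int v, const unsigned int *d, unsigned int n)
{
    unsigned int r = 0;
    while (r < n && d[r] < v)
        r++;
    return r;
}

static unsigned int rankInclusive(unsigned int v, const unsigned int *d, unsigned int n)
{
    unsigned int r = 0;
    while (r < n && d[r] <= v)
        r++;
    return r;
}

int main(void)
{
    const unsigned int a[3] = {1, 3, 3}, b[3] = {2, 3, 5};
    unsigned int out[6];
    for (unsigned int i = 0; i < 3; i++)
        out[i + rankExclusive(a[i], b, 3)] = a[i]; // A's ties are placed first
    for (unsigned int j = 0; j < 3; j++)
        out[j + rankInclusive(b[j], a, 3)] = b[j]; // B's ties land after A's
    for (unsigned int k = 0; k < 6; k++)
        printf("%u ", out[k]); // prints: 1 2 3 3 3 5
    printf("\n");
    return 0;
}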
+ } } -static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey, - uint *srcVal, uint *limitsA, uint *limitsB, - uint stride, uint N, uint sortDir) { - uint lastSegmentElements = N % (2 * stride); - uint mergePairs = (lastSegmentElements > stride) - ? getSampleCount(N) - : (N - lastSegmentElements) / SAMPLE_STRIDE; +static void mergeElementaryIntervals(uint *dstKey, + uint *dstVal, + uint *srcKey, + uint *srcVal, + uint *limitsA, + uint *limitsB, + uint stride, + uint N, + uint sortDir) +{ + uint lastSegmentElements = N % (2 * stride); + uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; - for (uint pos = 0; pos < mergePairs; pos++) { - uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1); - uint segmentBase = (pos - i) * SAMPLE_STRIDE; + for (uint pos = 0; pos < mergePairs; pos++) { + uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1); + uint segmentBase = (pos - i) * SAMPLE_STRIDE; - const uint lenA = stride; - const uint lenB = umin(stride, N - segmentBase - stride); - const uint nA = stride / SAMPLE_STRIDE; - const uint nB = getSampleCount(lenB); - const uint n = nA + nB; + const uint lenA = stride; + const uint lenB = umin(stride, N - segmentBase - stride); + const uint nA = stride / SAMPLE_STRIDE; + const uint nB = getSampleCount(lenB); + const uint n = nA + nB; - const uint startPosA = limitsA[pos]; - const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA; - const uint startPosB = limitsB[pos]; - const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB; - const uint startPosDst = startPosA + startPosB; + const uint startPosA = limitsA[pos]; + const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA; + const uint startPosB = limitsB[pos]; + const uint endPosB = (i + 1 < n) ? 
limitsB[pos + 1] : lenB; + const uint startPosDst = startPosA + startPosB; - assert(startPosA <= endPosA && endPosA <= lenA); - assert(startPosB <= endPosB && endPosB <= lenB); - assert((endPosA - startPosA) <= SAMPLE_STRIDE); - assert((endPosB - startPosB) <= SAMPLE_STRIDE); + assert(startPosA <= endPosA && endPosA <= lenA); + assert(startPosB <= endPosB && endPosB <= lenB); + assert((endPosA - startPosA) <= SAMPLE_STRIDE); + assert((endPosB - startPosB) <= SAMPLE_STRIDE); - merge(dstKey + segmentBase + startPosDst, - dstVal + segmentBase + startPosDst, - (srcKey + segmentBase + 0) + startPosA, - (srcVal + segmentBase + 0) + startPosA, - (srcKey + segmentBase + stride) + startPosB, - (srcVal + segmentBase + stride) + startPosB, endPosA - startPosA, - endPosB - startPosB, sortDir); - } + merge(dstKey + segmentBase + startPosDst, + dstVal + segmentBase + startPosDst, + (srcKey + segmentBase + 0) + startPosA, + (srcVal + segmentBase + 0) + startPosA, + (srcKey + segmentBase + stride) + startPosB, + (srcVal + segmentBase + stride) + startPosB, + endPosA - startPosA, + endPosB - startPosB, + sortDir); + } } //////////////////////////////////////////////////////////////////////////////// // Retarded bubble sort //////////////////////////////////////////////////////////////////////////////// -static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) { - if (N <= 1) { - return; - } - - for (uint bottom = 0; bottom < N - 1; bottom++) { - uint savePos = bottom; - uint saveKey = key[bottom]; - - for (uint i = bottom + 1; i < N; i++) - if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) { - savePos = i; - saveKey = key[i]; - } - - if (savePos != bottom) { - uint t; - t = key[savePos]; - key[savePos] = key[bottom]; - key[bottom] = t; - t = val[savePos]; - val[savePos] = val[bottom]; - val[bottom] = t; +static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) +{ + if (N <= 1) { + return; + } + + for (uint bottom = 0; bottom < N - 1; bottom++) { + uint savePos = bottom; + uint saveKey = key[bottom]; + + for (uint i = bottom + 1; i < N; i++) + if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) { + savePos = i; + saveKey = key[i]; + } + + if (savePos != bottom) { + uint t; + t = key[savePos]; + key[savePos] = key[bottom]; + key[bottom] = t; + t = val[savePos]; + val[savePos] = val[bottom]; + val[bottom] = t; + } } - } } //////////////////////////////////////////////////////////////////////////////// // Interface function //////////////////////////////////////////////////////////////////////////////// -extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, - uint *bufVal, uint *srcKey, uint *srcVal, uint N, - uint sortDir) { - uint *ikey, *ival, *okey, *oval; - uint stageCount = 0; +extern "C" void +mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir) +{ + uint *ikey, *ival, *okey, *oval; + uint stageCount = 0; - for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) - ; + for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) + ; - if (stageCount & 1) { - ikey = bufKey; - ival = bufVal; - okey = dstKey; - oval = dstVal; - } else { - ikey = dstKey; - ival = dstVal; - okey = bufKey; - oval = bufVal; - } - - printf("Bottom-level sort...\n"); - memcpy(ikey, srcKey, N * sizeof(uint)); - memcpy(ival, srcVal, N * sizeof(uint)); - - for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) { - bubbleSort(ikey + pos, ival + pos, 
umin(SHARED_SIZE_LIMIT, N - pos), - sortDir); - } - - printf("Merge...\n"); - uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint)); - uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint)); - uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint)); - uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint)); - memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint)); - memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint)); - memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint)); - memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint)); - - for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) { - uint lastSegmentElements = N % (2 * stride); - - // Find sample ranks and prepare for limiters merge - generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir); - - // Merge ranks and indices - mergeRanksAndIndices(limitsA, ranksA, stride, N); - mergeRanksAndIndices(limitsB, ranksB, stride, N); - - // Merge elementary intervals - mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, - N, sortDir); - - if (lastSegmentElements <= stride) { - // Last merge segment consists of a single array which just needs to be - // passed through - memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), - lastSegmentElements * sizeof(uint)); - memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), - lastSegmentElements * sizeof(uint)); + if (stageCount & 1) { + ikey = bufKey; + ival = bufVal; + okey = dstKey; + oval = dstVal; + } + else { + ikey = dstKey; + ival = dstVal; + okey = bufKey; + oval = bufVal; } - uint *t; - t = ikey; - ikey = okey; - okey = t; - t = ival; - ival = oval; - oval = t; - } + printf("Bottom-level sort...\n"); + memcpy(ikey, srcKey, N * sizeof(uint)); + memcpy(ival, srcVal, N * sizeof(uint)); - free(limitsB); - free(limitsA); - free(ranksB); - free(ranksA); + for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) { + bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir); + } + + printf("Merge...\n"); + uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint)); + uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint)); + uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint)); + uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint)); + memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint)); + memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint)); + memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint)); + memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint)); + + for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) { + uint lastSegmentElements = N % (2 * stride); + + // Find sample ranks and prepare for limiters merge + generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir); + + // Merge ranks and indices + mergeRanksAndIndices(limitsA, ranksA, stride, N); + mergeRanksAndIndices(limitsB, ranksB, stride, N); + + // Merge elementary intervals + mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir); + + if (lastSegmentElements <= stride) { + // Last merge segment consists of a single array which just needs to be + // passed through + memcpy( + okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint)); + memcpy( + oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint)); + } + + uint *t; + t = ikey; + ikey = okey; + okey = t; + t = ival; + ival = oval; + oval = t; + } + + 
free(limitsB); + free(limitsA); + free(ranksB); + free(ranksA); } diff --git a/Samples/0_Introduction/mergeSort/mergeSort_validate.cpp b/Samples/0_Introduction/mergeSort/mergeSort_validate.cpp index cba97641..4eac52a7 100644 --- a/Samples/0_Introduction/mergeSort/mergeSort_validate.cpp +++ b/Samples/0_Introduction/mergeSort/mergeSort_validate.cpp @@ -29,104 +29,100 @@ #include #include #include + #include "mergeSort_common.h" //////////////////////////////////////////////////////////////////////////////// // Validate sorted keys array (check for integrity and proper order) //////////////////////////////////////////////////////////////////////////////// -extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, - uint arrayLength, uint numValues, - uint sortDir) { - uint *srcHist; - uint *resHist; +extern "C" uint +validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir) +{ + uint *srcHist; + uint *resHist; - if (arrayLength < 2) { - printf("validateSortedKeys(): arrays too short, exiting...\n"); - return 1; - } - - printf("...inspecting keys array: "); - srcHist = (uint *)malloc(numValues * sizeof(uint)); - resHist = (uint *)malloc(numValues * sizeof(uint)); - - int flag = 1; - - for (uint j = 0; j < batchSize; - j++, srcKey += arrayLength, resKey += arrayLength) { - // Build histograms for keys arrays - memset(srcHist, 0, numValues * sizeof(uint)); - memset(resHist, 0, numValues * sizeof(uint)); - - for (uint i = 0; i < arrayLength; i++) { - if ((srcKey[i] < numValues) && (resKey[i] < numValues)) { - srcHist[srcKey[i]]++; - resHist[resKey[i]]++; - } else { - fprintf( - stderr, - "***Set %u source/result key arrays are not limited properly***\n", - j); - flag = 0; - goto brk; - } + if (arrayLength < 2) { + printf("validateSortedKeys(): arrays too short, exiting...\n"); + return 1; } - // Compare the histograms - for (uint i = 0; i < numValues; i++) - if (srcHist[i] != resHist[i]) { - fprintf(stderr, - "***Set %u source/result keys histograms do not match***\n", j); - flag = 0; - goto brk; - } + printf("...inspecting keys array: "); + srcHist = (uint *)malloc(numValues * sizeof(uint)); + resHist = (uint *)malloc(numValues * sizeof(uint)); - // Finally check the ordering - for (uint i = 0; i < arrayLength - 1; i++) - if ((sortDir && (resKey[i] > resKey[i + 1])) || - (!sortDir && (resKey[i] < resKey[i + 1]))) { - fprintf(stderr, - "***Set %u result key array is not ordered properly***\n", j); - flag = 0; - goto brk; - } - } + int flag = 1; + + for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) { + // Build histograms for keys arrays + memset(srcHist, 0, numValues * sizeof(uint)); + memset(resHist, 0, numValues * sizeof(uint)); + + for (uint i = 0; i < arrayLength; i++) { + if ((srcKey[i] < numValues) && (resKey[i] < numValues)) { + srcHist[srcKey[i]]++; + resHist[resKey[i]]++; + } + else { + fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j); + flag = 0; + goto brk; + } + } + + // Compare the histograms + for (uint i = 0; i < numValues; i++) + if (srcHist[i] != resHist[i]) { + fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j); + flag = 0; + goto brk; + } + + // Finally check the ordering + for (uint i = 0; i < arrayLength - 1; i++) + if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) { + fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j); + flag = 0; + goto 
brk; + } + } brk: - free(resHist); - free(srcHist); + free(resHist); + free(srcHist); - if (flag) printf("OK\n"); + if (flag) + printf("OK\n"); - return flag; + return flag; } //////////////////////////////////////////////////////////////////////////////// // Value validation / stability check routines //////////////////////////////////////////////////////////////////////////////// -extern "C" void fillValues(uint *val, uint N) { - for (uint i = 0; i < N; i++) val[i] = i; +extern "C" void fillValues(uint *val, uint N) +{ + for (uint i = 0; i < N; i++) + val[i] = i; } -extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, - uint batchSize, uint arrayLength) { - int correctFlag = 1, stableFlag = 1; +extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength) +{ + int correctFlag = 1, stableFlag = 1; - printf("...inspecting keys and values array: "); + printf("...inspecting keys and values array: "); - for (uint i = 0; i < batchSize; - i++, resKey += arrayLength, resVal += arrayLength) { - for (uint j = 0; j < arrayLength; j++) { - if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0; + for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) { + for (uint j = 0; j < arrayLength; j++) { + if (resKey[j] != srcKey[resVal[j]]) + correctFlag = 0; - if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && - (resVal[j] > resVal[j + 1])) - stableFlag = 0; + if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1])) + stableFlag = 0; + } } - } - printf(correctFlag ? "OK\n" : "***corrupted!!!***\n"); - printf(stableFlag ? "...stability property: stable!\n" - : "...stability property: NOT stable\n"); + printf(correctFlag ? "OK\n" : "***corrupted!!!***\n"); + printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n"); - return correctFlag; + return correctFlag; } diff --git a/Samples/0_Introduction/simpleAWBarrier/simpleAWBarrier.cu b/Samples/0_Introduction/simpleAWBarrier/simpleAWBarrier.cu index b0dcafab..46ffc886 100644 --- a/Samples/0_Introduction/simpleAWBarrier/simpleAWBarrier.cu +++ b/Samples/0_Introduction/simpleAWBarrier/simpleAWBarrier.cu @@ -29,106 +29,105 @@ #include // Includes CUDA -#include -#include #include +#include +#include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check +#include // helper functions for CUDA error check namespace cg = cooperative_groups; #if __CUDA_ARCH__ >= 700 template -__device__ void reduceBlockData( - cuda::barrier &barrier, - cg::thread_block_tile<32> &tile32, double &threadSum, double *result) { - extern __shared__ double tmp[]; - -#pragma unroll - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - threadSum += tile32.shfl_down(threadSum, offset); - } - if (tile32.thread_rank() == 0) { - tmp[tile32.meta_group_rank()] = threadSum; - } - - auto token = barrier.arrive(); - - barrier.wait(std::move(token)); - - // The warp 0 will perform last round of reduction - if (tile32.meta_group_rank() == 0) { - double beta = tile32.thread_rank() < tile32.meta_group_size() - ? 
tmp[tile32.thread_rank()] - : 0.0; +__device__ void reduceBlockData(cuda::barrier &barrier, + cg::thread_block_tile<32> &tile32, + double &threadSum, + double *result) +{ + extern __shared__ double tmp[]; #pragma unroll for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - beta += tile32.shfl_down(beta, offset); + threadSum += tile32.shfl_down(threadSum, offset); + } + if (tile32.thread_rank() == 0) { + tmp[tile32.meta_group_rank()] = threadSum; } - if (tile32.thread_rank() == 0) { - if (writeSquareRoot) - *result = sqrt(beta); - else - *result = beta; + auto token = barrier.arrive(); + + barrier.wait(std::move(token)); + + // The warp 0 will perform last round of reduction + if (tile32.meta_group_rank() == 0) { + double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0; + +#pragma unroll + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + beta += tile32.shfl_down(beta, offset); + } + + if (tile32.thread_rank() == 0) { + if (writeSquareRoot) + *result = sqrt(beta); + else + *result = beta; + } } - } } #endif -__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, - double *partialResults, int size) { +__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size) +{ #if __CUDA_ARCH__ >= 700 #pragma diag_suppress static_var_with_dynamic_init - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid(); - ; - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + ; + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - __shared__ cuda::barrier barrier; + __shared__ cuda::barrier barrier; - if (threadIdx.x == 0) { - init(&barrier, blockDim.x); - } - - cg::sync(cta); - - double threadSum = 0.0; - for (int i = grid.thread_rank(); i < size; i += grid.size()) { - threadSum += (double)(vecA[i] * vecB[i]); - } - - // Each thread block performs reduction of partial dotProducts and writes to - // global mem. - reduceBlockData(barrier, tile32, threadSum, - &partialResults[blockIdx.x]); - - cg::sync(grid); - - // One block performs the final summation of partial dot products - // of all the thread blocks and writes the sqrt of final dot product. - if (blockIdx.x == 0) { - threadSum = 0.0; - for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) { - threadSum += partialResults[i]; + if (threadIdx.x == 0) { + init(&barrier, blockDim.x); } - reduceBlockData(barrier, tile32, threadSum, &partialResults[0]); - } - cg::sync(grid); + cg::sync(cta); - const double finalValue = partialResults[0]; + double threadSum = 0.0; + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + threadSum += (double)(vecA[i] * vecB[i]); + } - // Perform normalization of vecA & vecB. - for (int i = grid.thread_rank(); i < size; i += grid.size()) { - vecA[i] = (float)vecA[i] / finalValue; - vecB[i] = (float)vecB[i] / finalValue; - } + // Each thread block performs reduction of partial dotProducts and writes to + // global mem. + reduceBlockData(barrier, tile32, threadSum, &partialResults[blockIdx.x]); + + cg::sync(grid); + + // One block performs the final summation of partial dot products + // of all the thread blocks and writes the sqrt of final dot product. 
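// For reference, the warp-shuffle + arrive/wait idiom that reduceBlockData()
// implements can be sketched in a self-contained form, free of the interleaved
// diff markers. Kernel and buffer names here are illustrative placeholders,
// not part of this patch; the sketch assumes SM 7.0+ and 256-thread blocks.
#include <cooperative_groups.h>
#include <cuda/barrier>

namespace cg = cooperative_groups;

__global__ void blockDotSketch(const float *a, const float *b, double *out)
{
#pragma diag_suppress static_var_with_dynamic_init
    __shared__ double warpSums[8]; // one slot per warp (256 / 32)
    __shared__ cuda::barrier<cuda::thread_scope_block> bar;

    cg::thread_block cta = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);

    if (threadIdx.x == 0)
        init(&bar, blockDim.x); // one expected arrival per thread
    cg::sync(cta);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    double v = (double)a[i] * b[i];

    // Warp-level tree reduction in registers, no shared-memory traffic
    for (int off = tile.size() / 2; off > 0; off /= 2)
        v += tile.shfl_down(v, off);
    if (tile.thread_rank() == 0)
        warpSums[tile.meta_group_rank()] = v;

    // Split barrier: arrive now, wait later; independent work could overlap here
    auto token = bar.arrive();
    bar.wait(std::move(token)); // warpSums[] writes are visible past this point

    // The first warp folds the per-warp partial sums
    if (tile.meta_group_rank() == 0) {
        double s = tile.thread_rank() < 8 ? warpSums[tile.thread_rank()] : 0.0;
        for (int off = tile.size() / 2; off > 0; off /= 2)
            s += tile.shfl_down(s, off);
        if (tile.thread_rank() == 0)
            out[blockIdx.x] = s;
    }
}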
+ if (blockIdx.x == 0) { + threadSum = 0.0; + for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) { + threadSum += partialResults[i]; + } + reduceBlockData(barrier, tile32, threadSum, &partialResults[0]); + } + + cg::sync(grid); + + const double finalValue = partialResults[0]; + + // Perform normalization of vecA & vecB. + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + vecA[i] = (float)vecA[i] / finalValue; + vecB[i] = (float)vecB[i] / finalValue; + } #endif } @@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", argv[0]); +int main(int argc, char **argv) +{ + printf("%s starting...\n", argv[0]); - // This will pick the best possible CUDA capable device - int dev = findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device + int dev = findCudaDevice(argc, (const char **)argv); - int major = 0; - checkCudaErrors( - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + int major = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher. - if (major < 7) { - printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n"); - exit(EXIT_WAIVED); - } - - int supportsCooperativeLaunch = 0; - checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, - cudaDevAttrCooperativeLaunch, dev)); - - if (!supportsCooperativeLaunch) { - printf( - "\nSelected GPU (%d) does not support Cooperative Kernel Launch, " - "Waiving the run\n", - dev); - exit(EXIT_WAIVED); - } - - int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); - - printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); -} - -int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) { - float *vecA, *d_vecA; - float *vecB, *d_vecB; - double *d_partialResults; - int size = 10000000; - - checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size)); - checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size)); - - checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size)); - checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size)); - - float baseVal = 2.0; - for (int i = 0; i < size; i++) { - vecA[i] = vecB[i] = baseVal; - } - - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, - cudaMemcpyHostToDevice, stream)); - - // Kernel configuration, where a one-dimensional - // grid and one-dimensional blocks are configured. 
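// The kernel above combines grid-stride loops with cg::sync(grid). A compact
// sketch of why the grid-wide barrier matters (kernel and buffer names are
// placeholders, not part of this patch); it is only legal when the kernel is
// started with cudaLaunchCooperativeKernel():
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

__global__ void scaleByPartialSketch(float *v, double *partials, int n)
{
    cg::grid_group grid = cg::this_grid();

    // Phase 1: each block publishes one value.
    if (threadIdx.x == 0)
        partials[blockIdx.x] = 1.0 + blockIdx.x;

    // Without this barrier a block could read partials[] entries that other
    // blocks have not yet written; __syncthreads() only covers one block.
    cg::sync(grid);

    // Phase 2: a grid-stride loop lets a fixed-size grid cover any n.
    for (int i = grid.thread_rank(); i < n; i += grid.size())
        v[i] = (float)(v[i] / partials[0]);
}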
- int minGridSize = 0, blockSize = 0; - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( - &minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size)); - - int smemSize = ((blockSize / 32) + 1) * sizeof(double); - - int numBlocksPerSm = 0; - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); - - int multiProcessorCount = 0; - checkCudaErrors(cudaDeviceGetAttribute( - &multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); - - minGridSize = multiProcessorCount * numBlocksPerSm; - checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double))); - - printf( - "Launching normVecByDotProductAWBarrier kernel with numBlocks = %d " - "blockSize = %d\n", - minGridSize, blockSize); - - dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); - - void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, - (void *)&d_partialResults, (void *)&size}; - - checkCudaErrors( - cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid, - dimBlock, kernelArgs, smemSize, stream)); - - checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - float expectedResult = (baseVal / sqrt(size * baseVal * baseVal)); - unsigned int matches = 0; - for (int i = 0; i < size; i++) { - if ((vecA[i] - expectedResult) > 0.00001) { - printf("mismatch at i = %d\n", i); - break; - } else { - matches++; + // Arrive-Wait Barriers require a GPU of Volta (SM7X) architecture or higher. + if (major < 7) { + printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n"); + exit(EXIT_WAIVED); } - } - printf("Result = %s\n", matches == size ? "PASSED" : "FAILED"); - checkCudaErrors(cudaFree(d_vecA)); - checkCudaErrors(cudaFree(d_vecB)); - checkCudaErrors(cudaFree(d_partialResults)); + int supportsCooperativeLaunch = 0; + checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev)); - checkCudaErrors(cudaFreeHost(vecA)); - checkCudaErrors(cudaFreeHost(vecB)); - return matches == size; + if (!supportsCooperativeLaunch) { + printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, " + "Waiving the run\n", + dev); + exit(EXIT_WAIVED); + } + + int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev); + + printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); +} + +int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) +{ + float *vecA, *d_vecA; + float *vecB, *d_vecB; + double *d_partialResults; + int size = 10000000; + + checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size)); + checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size)); + + checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size)); + checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size)); + + float baseVal = 2.0; + for (int i = 0; i < size; i++) { + vecA[i] = vecB[i] = baseVal; + } + + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream)); + + // Kernel configuration, where a one-dimensional + // grid and one-dimensional blocks are configured.
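// The replacement code just below derives the launch shape from the occupancy
// API. The same recipe in isolation (someCoopKernel and device are placeholder
// names): a cooperative launch must not request more blocks than can be
// simultaneously resident, or grid-wide sync could deadlock, so the grid is
// capped at blocksPerSm * smCount.
int blockSize = 0, minGridSize = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)someCoopKernel, 0, 0));

// One double of dynamic shared memory per warp, plus one spare slot
size_t smemBytes = ((blockSize / 32) + 1) * sizeof(double);

int blocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, someCoopKernel, blockSize, smemBytes));

int smCount = 0;
checkCudaErrors(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device));

dim3 grid(smCount * blocksPerSm, 1, 1), block(blockSize, 1, 1);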
+ int minGridSize = 0, blockSize = 0; + checkCudaErrors( + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size)); + + int smemSize = ((blockSize / 32) + 1) * sizeof(double); + + int numBlocksPerSm = 0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize)); + + int multiProcessorCount = 0; + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId)); + + minGridSize = multiProcessorCount * numBlocksPerSm; + checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double))); + + printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d " + "blockSize = %d\n", + minGridSize, + blockSize); + + dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1); + + void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size}; + + checkCudaErrors(cudaLaunchCooperativeKernel( + (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream)); + + checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + float expectedResult = (baseVal / sqrt(size * baseVal * baseVal)); + unsigned int matches = 0; + for (int i = 0; i < size; i++) { + if ((vecA[i] - expectedResult) > 0.00001) { + printf("mismatch at i = %d\n", i); + break; + } + else { + matches++; + } + } + + printf("Result = %s\n", matches == size ? "PASSED" : "FAILED"); + checkCudaErrors(cudaFree(d_vecA)); + checkCudaErrors(cudaFree(d_vecB)); + checkCudaErrors(cudaFree(d_partialResults)); + + checkCudaErrors(cudaFreeHost(vecA)); + checkCudaErrors(cudaFreeHost(vecB)); + return matches == size; } diff --git a/Samples/0_Introduction/simpleAssert/simpleAssert.cu b/Samples/0_Introduction/simpleAssert/simpleAssert.cu index 1206da3c..64df5d20 100644 --- a/Samples/0_Introduction/simpleAssert/simpleAssert.cu +++ b/Samples/0_Introduction/simpleAssert/simpleAssert.cu @@ -34,17 +34,17 @@ #endif // Includes, system -#include #include +#include // Includes CUDA #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check +#include // helper functions for CUDA error check const char *sampleName = "simpleAssert"; @@ -58,9 +58,10 @@ bool testResult = true; //! Tests assert function. //! Thread whose id > N will print assertion failed error message. //////////////////////////////////////////////////////////////////////////////// -__global__ void testKernel(int N) { - int gtid = blockIdx.x * blockDim.x + threadIdx.x; - assert(gtid < N); +__global__ void testKernel(int N) +{ + int gtid = blockIdx.x * blockDim.x + threadIdx.x; + assert(gtid < N); } //////////////////////////////////////////////////////////////////////////////// @@ -70,59 +71,60 @@ void runTest(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", sampleName); +int main(int argc, char **argv) +{ + printf("%s starting...\n", sampleName); - runTest(argc, argv); + runTest(argc, argv); - printf("%s completed, returned %s\n", sampleName, - testResult ? 
"OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); } -void runTest(int argc, char **argv) { - int Nblocks = 2; - int Nthreads = 32; - cudaError_t error; +void runTest(int argc, char **argv) +{ + int Nblocks = 2; + int Nthreads = 32; + cudaError_t error; #ifndef _WIN32 - utsname OS_System_Type; - uname(&OS_System_Type); + utsname OS_System_Type; + uname(&OS_System_Type); - printf("OS_System_Type.release = %s\n", OS_System_Type.release); + printf("OS_System_Type.release = %s\n", OS_System_Type.release); - if (!strcasecmp(OS_System_Type.sysname, "Darwin")) { - printf("simpleAssert is not current supported on Mac OSX\n\n"); - exit(EXIT_SUCCESS); - } else { - printf("OS Info: <%s>\n\n", OS_System_Type.version); - } + if (!strcasecmp(OS_System_Type.sysname, "Darwin")) { + printf("simpleAssert is not current supported on Mac OSX\n\n"); + exit(EXIT_SUCCESS); + } + else { + printf("OS Info: <%s>\n\n", OS_System_Type.version); + } #endif - // This will pick the best possible CUDA capable device - findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device + findCudaDevice(argc, (const char **)argv); - // Kernel configuration, where a one-dimensional - // grid and one-dimensional blocks are configured. - dim3 dimGrid(Nblocks); - dim3 dimBlock(Nthreads); + // Kernel configuration, where a one-dimensional + // grid and one-dimensional blocks are configured. + dim3 dimGrid(Nblocks); + dim3 dimBlock(Nthreads); - printf("Launch kernel to generate assertion failures\n"); - testKernel<<>>(60); + printf("Launch kernel to generate assertion failures\n"); + testKernel<<>>(60); - // Synchronize (flushes assert output). - printf("\n-- Begin assert output\n\n"); - error = cudaDeviceSynchronize(); - printf("\n-- End assert output\n\n"); + // Synchronize (flushes assert output). + printf("\n-- Begin assert output\n\n"); + error = cudaDeviceSynchronize(); + printf("\n-- End assert output\n\n"); - // Check for errors and failed asserts in asynchronous kernel launch. - if (error == cudaErrorAssert) { - printf( - "Device assert failed as expected, " - "CUDA error message is: %s\n\n", - cudaGetErrorString(error)); - } + // Check for errors and failed asserts in asynchronous kernel launch. 
+ if (error == cudaErrorAssert) { + printf("Device assert failed as expected, " + "CUDA error message is: %s\n\n", + cudaGetErrorString(error)); + } - testResult = error == cudaErrorAssert; + testResult = error == cudaErrorAssert; } diff --git a/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert.cpp b/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert.cpp index a9497660..409dc7c5 100644 --- a/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert.cpp +++ b/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert.cpp @@ -34,15 +34,16 @@ #endif // Includes, system -#include #include +#include // Includes CUDA #include + #include "nvrtc_helper.h" // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h const char *sampleName = "simpleAssert_nvrtc"; @@ -58,56 +59,63 @@ void runTest(int argc, char **argv); // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", sampleName); +int main(int argc, char **argv) +{ + printf("%s starting...\n", sampleName); - runTest(argc, argv); + runTest(argc, argv); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); } -void runTest(int argc, char **argv) { - int Nblocks = 2; - int Nthreads = 32; +void runTest(int argc, char **argv) +{ + int Nblocks = 2; + int Nthreads = 32; - // Kernel configuration, where a one-dimensional - // grid and one-dimensional blocks are configured. + // Kernel configuration, where a one-dimensional + // grid and one-dimensional blocks are configured. - dim3 dimGrid(Nblocks); - dim3 dimBlock(Nthreads); + dim3 dimGrid(Nblocks); + dim3 dimBlock(Nthreads); - printf("Launch kernel to generate assertion failures\n"); - char *cubin, *kernel_file; - size_t cubinSize; + printf("Launch kernel to generate assertion failures\n"); + char *cubin, *kernel_file; + size_t cubinSize; - kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - CUmodule module = loadCUBIN(cubin, argc, argv); - CUfunction kernel_addr; + CUmodule module = loadCUBIN(cubin, argc, argv); + CUfunction kernel_addr; - checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel")); + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel")); - int count = 60; - void *args[] = {(void *)&count}; + int count = 60; + void *args[] = {(void *)&count}; - checkCudaErrors(cuLaunchKernel( - kernel_addr, dimGrid.x, dimGrid.y, dimGrid.z, /* grid dim */ - dimBlock.x, dimBlock.y, dimBlock.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &args[0], /* arguments */ - 0)); + checkCudaErrors(cuLaunchKernel(kernel_addr, + dimGrid.x, + dimGrid.y, + dimGrid.z, /* grid dim */ + dimBlock.x, + dimBlock.y, + dimBlock.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &args[0], /* arguments */ + 0)); - // Synchronize (flushes assert output). - printf("\n-- Begin assert output\n\n"); - CUresult res = cuCtxSynchronize(); + // Synchronize (flushes assert output). + printf("\n-- Begin assert output\n\n"); + CUresult res = cuCtxSynchronize(); - printf("\n-- End assert output\n\n"); + printf("\n-- End assert output\n\n"); - // Check for errors and failed asserts in asynchronous kernel launch. 
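// The cuLaunchKernel() call above runs a kernel that compileFileToCUBIN() and
// loadCUBIN() (sample helper wrappers) compiled at run time. The raw NVRTC +
// driver-API flow those helpers stand in for looks roughly like this sketch
// (all names are placeholders, error checks are omitted for brevity, and PTX
// is used here where the helpers produce a CUBIN):
#include <cuda.h>
#include <nvrtc.h>

static const char *srcText =
    "extern \"C\" __global__ void fill(int *out)\n"
    "{\n"
    "    out[threadIdx.x] = threadIdx.x;\n"
    "}\n";

int launchRuntimeCompiled(void)
{
    // 1. Compile CUDA C++ source to PTX in-process.
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, srcText, "fill.cu", 0, NULL, NULL);
    nvrtcCompileProgram(prog, 0, NULL);

    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    char *ptx = new char[ptxSize];
    nvrtcGetPTX(prog, ptx);
    nvrtcDestroyProgram(&prog);

    // 2. Load the generated code and launch through the driver API.
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction fn;
    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuModuleLoadData(&mod, ptx);
    cuModuleGetFunction(&fn, mod, "fill");

    CUdeviceptr dOut;
    cuMemAlloc(&dOut, 32 * sizeof(int));
    void *args[] = {(void *)&dOut};
    cuLaunchKernel(fn, 1, 1, 1, /* grid dim */ 32, 1, 1, /* block dim */ 0, 0, /* shared mem, stream */ args, 0);
    cuCtxSynchronize();

    cuMemFree(dOut);
    cuModuleUnload(mod);
    cuCtxDestroy(ctx);
    delete[] ptx;
    return 0;
}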
- if (res == CUDA_ERROR_ASSERT) { - printf("Device assert failed as expected\n"); - } + // Check for errors and failed asserts in asynchronous kernel launch. + if (res == CUDA_ERROR_ASSERT) { + printf("Device assert failed as expected\n"); + } - testResult = res == CUDA_ERROR_ASSERT; + testResult = res == CUDA_ERROR_ASSERT; } diff --git a/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert_kernel.cu b/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert_kernel.cu index 5144d329..ada02586 100644 --- a/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert_kernel.cu +++ b/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert_kernel.cu @@ -32,7 +32,8 @@ //! Thread whose id > N will print assertion failed error message. //////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ void testKernel(int N) { - int gtid = blockIdx.x * blockDim.x + threadIdx.x; - assert(gtid < N); +extern "C" __global__ void testKernel(int N) +{ + int gtid = blockIdx.x * blockDim.x + threadIdx.x; + assert(gtid < N); } diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics.cu b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics.cu index 390c4aac..78fe3ee6 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics.cu +++ b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics.cu @@ -30,10 +30,10 @@ */ // includes, system -#include -#include -#include #include +#include +#include +#include #ifdef _WIN32 #define WINDOWS_LEAN_AND_MEAN @@ -45,10 +45,10 @@ #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check +#include // helper functions for CUDA error check // Includes, kernels #include "simpleAtomicIntrinsics_kernel.cuh" @@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", sampleName); +int main(int argc, char **argv) +{ + printf("%s starting...\n", sampleName); - runTest(argc, argv); + runTest(argc, argv); - printf("%s completed, returned %s\n", sampleName, - testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - cudaStream_t stream; - // This will pick the best possible CUDA capable device - findCudaDevice(argc, (const char **)argv); +void runTest(int argc, char **argv) +{ + cudaStream_t stream; + // This will pick the best possible CUDA capable device + findCudaDevice(argc, (const char **)argv); - StopWatchInterface *timer; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + StopWatchInterface *timer; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - unsigned int numThreads = 256; - unsigned int numBlocks = 64; - unsigned int numData = 11; - unsigned int memSize = sizeof(int) * numData; + unsigned int numThreads = 256; + unsigned int numBlocks = 64; + unsigned int numData = 11; + unsigned int memSize = sizeof(int) * numData; - // allocate mem for the result on host side - int *hOData; - checkCudaErrors(cudaMallocHost(&hOData, memSize)); + // allocate mem for the result on host side + int *hOData; + checkCudaErrors(cudaMallocHost(&hOData, memSize)); - // initialize the memory - for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; + // initialize the memory + for (unsigned int i = 0; i < numData; i++) + hOData[i] = 0; - // To make the AND and XOR tests generate something other than 0... - hOData[8] = hOData[10] = 0xff; + // To make the AND and XOR tests generate something other than 0... + hOData[8] = hOData[10] = 0xff; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - // allocate device memory for result - int *dOData; - checkCudaErrors(cudaMalloc((void **)&dOData, memSize)); - // copy host memory to device to initialize to zero - checkCudaErrors( - cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + // allocate device memory for result + int *dOData; + checkCudaErrors(cudaMalloc((void **)&dOData, memSize)); + // copy host memory to device to initialize to zero + checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream)); - // execute the kernel - testKernel<<>>(dOData); + // execute the kernel + testKernel<<>>(dOData); - // Copy result from device to host - checkCudaErrors( - cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); + // Copy result from device to host + checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); - // Compute reference solution - testResult = computeGold(hOData, numThreads * numBlocks); + // Compute reference solution + testResult = computeGold(hOData, numThreads * numBlocks); - // Cleanup memory - checkCudaErrors(cudaFreeHost(hOData)); - checkCudaErrors(cudaFree(dOData)); + // Cleanup memory + checkCudaErrors(cudaFreeHost(hOData)); + checkCudaErrors(cudaFree(dOData)); } diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_cpu.cpp b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_cpu.cpp index 29e36ff6..92d6622d 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_cpu.cpp +++ 
b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_cpu.cpp @@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len); //! @param idata input data as provided to device //! @param len number of elements in reference / idata //////////////////////////////////////////////////////////////////////////////// -int computeGold(int *gpuData, const int len) { - int val = 0; +int computeGold(int *gpuData, const int len) +{ + int val = 0; - for (int i = 0; i < len; ++i) { - val += 10; - } - - if (val != gpuData[0]) { - printf("atomicAdd failed\n"); - return false; - } - - val = 0; - - for (int i = 0; i < len; ++i) { - val -= 10; - } - - if (val != gpuData[1]) { - printf("atomicSub failed\n"); - return false; - } - - bool found = false; - - for (int i = 0; i < len; ++i) { - // third element should be a member of [0, len) - if (i == gpuData[2]) { - found = true; - break; + for (int i = 0; i < len; ++i) { + val += 10; } - } - if (!found) { - printf("atomicExch failed\n"); - return false; - } - - val = -(1 << 8); - - for (int i = 0; i < len; ++i) { - // fourth element should be len-1 - val = max(val, i); - } - - if (val != gpuData[3]) { - printf("atomicMax failed\n"); - return false; - } - - val = 1 << 8; - - for (int i = 0; i < len; ++i) { - val = min(val, i); - } - - if (val != gpuData[4]) { - printf("atomicMin failed\n"); - return false; - } - - int limit = 17; - val = 0; - - for (int i = 0; i < len; ++i) { - val = (val >= limit) ? 0 : val + 1; - } - - if (val != gpuData[5]) { - printf("atomicInc failed\n"); - return false; - } - - limit = 137; - val = 0; - - for (int i = 0; i < len; ++i) { - val = ((val == 0) || (val > limit)) ? limit : val - 1; - } - - if (val != gpuData[6]) { - printf("atomicDec failed\n"); - return false; - } - - found = false; - - for (int i = 0; i < len; ++i) { - // eighth element should be a member of [0, len) - if (i == gpuData[7]) { - found = true; - break; + if (val != gpuData[0]) { + printf("atomicAdd failed\n"); + return false; } - } - if (!found) { - printf("atomicCAS failed\n"); - return false; - } + val = 0; - val = 0xff; + for (int i = 0; i < len; ++i) { + val -= 10; + } - for (int i = 0; i < len; ++i) { - // 9th element should be 1 - val &= (2 * i + 7); - } + if (val != gpuData[1]) { + printf("atomicSub failed\n"); + return false; + } - if (val != gpuData[8]) { - printf("atomicAnd failed\n"); - return false; - } + bool found = false; - val = 0; + for (int i = 0; i < len; ++i) { + // third element should be a member of [0, len) + if (i == gpuData[2]) { + found = true; + break; + } + } - for (int i = 0; i < len; ++i) { - // 10th element should be 0xff - val |= (1 << i); - } + if (!found) { + printf("atomicExch failed\n"); + return false; + } - if (val != gpuData[9]) { - printf("atomicOr failed\n"); - return false; - } + val = -(1 << 8); - val = 0xff; + for (int i = 0; i < len; ++i) { + // fourth element should be len-1 + val = max(val, i); + } - for (int i = 0; i < len; ++i) { - // 11th element should be 0xff - val ^= i; - } + if (val != gpuData[3]) { + printf("atomicMax failed\n"); + return false; + } - if (val != gpuData[10]) { - printf("atomicXor failed\n"); - return false; - } + val = 1 << 8; - return true; + for (int i = 0; i < len; ++i) { + val = min(val, i); + } + + if (val != gpuData[4]) { + printf("atomicMin failed\n"); + return false; + } + + int limit = 17; + val = 0; + + for (int i = 0; i < len; ++i) { + val = (val >= limit) ? 
0 : val + 1; + } + + if (val != gpuData[5]) { + printf("atomicInc failed\n"); + return false; + } + + limit = 137; + val = 0; + + for (int i = 0; i < len; ++i) { + val = ((val == 0) || (val > limit)) ? limit : val - 1; + } + + if (val != gpuData[6]) { + printf("atomicDec failed\n"); + return false; + } + + found = false; + + for (int i = 0; i < len; ++i) { + // eighth element should be a member of [0, len) + if (i == gpuData[7]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicCAS failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 9th element should be 1 + val &= (2 * i + 7); + } + + if (val != gpuData[8]) { + printf("atomicAnd failed\n"); + return false; + } + + val = 0; + + for (int i = 0; i < len; ++i) { + // 10th element should be 0xff + val |= (1 << i); + } + + if (val != gpuData[9]) { + printf("atomicOr failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 11th element should be 0xff + val ^= i; + } + + if (val != gpuData[10]) { + printf("atomicXor failed\n"); + return false; + } + + return true; } diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_kernel.cuh b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_kernel.cuh index 67c58427..09714d08 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_kernel.cuh +++ b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_kernel.cuh @@ -35,48 +35,49 @@ //! @param g_idata input data in global memory //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void testKernel(int *g_odata) { - // access thread id - const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void testKernel(int *g_odata) +{ + // access thread id + const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; - // Test various atomic instructions + // Test various atomic instructions - // Arithmetic atomic instructions + // Arithmetic atomic instructions - // Atomic addition - atomicAdd(&g_odata[0], 10); + // Atomic addition + atomicAdd(&g_odata[0], 10); - // Atomic subtraction (final should be 0) - atomicSub(&g_odata[1], 10); + // Atomic subtraction (final should be 0) + atomicSub(&g_odata[1], 10); - // Atomic exchange - atomicExch(&g_odata[2], tid); + // Atomic exchange + atomicExch(&g_odata[2], tid); - // Atomic maximum - atomicMax(&g_odata[3], tid); + // Atomic maximum + atomicMax(&g_odata[3], tid); - // Atomic minimum - atomicMin(&g_odata[4], tid); + // Atomic minimum + atomicMin(&g_odata[4], tid); - // Atomic increment (modulo 17+1) - atomicInc((unsigned int *)&g_odata[5], 17); + // Atomic increment (modulo 17+1) + atomicInc((unsigned int *)&g_odata[5], 17); - // Atomic decrement - atomicDec((unsigned int *)&g_odata[6], 137); + // Atomic decrement + atomicDec((unsigned int *)&g_odata[6], 137); - // Atomic compare-and-swap - atomicCAS(&g_odata[7], tid - 1, tid); + // Atomic compare-and-swap + atomicCAS(&g_odata[7], tid - 1, tid); - // Bitwise atomic instructions + // Bitwise atomic instructions - // Atomic AND - atomicAnd(&g_odata[8], 2 * tid + 7); + // Atomic AND + atomicAnd(&g_odata[8], 2 * tid + 7); - // Atomic OR - atomicOr(&g_odata[9], 1 << tid); + // Atomic OR + atomicOr(&g_odata[9], 1 << tid); - // Atomic XOR - atomicXor(&g_odata[10], tid); + // Atomic XOR + atomicXor(&g_odata[10], tid); } -#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_ +#endif // #ifndef 
_SIMPLEATOMICS_KERNEL_H_ diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics.cpp b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics.cpp index c276fdc5..0e716c49 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics.cpp +++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics.cpp @@ -30,10 +30,10 @@ */ // includes, system -#include -#include -#include #include +#include +#include +#include #ifdef _WIN32 #define WINDOWS_LEAN_AND_MEAN @@ -46,7 +46,7 @@ #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h const char *sampleName = "simpleAtomicIntrinsics_nvrtc"; @@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len); // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", sampleName); +int main(int argc, char **argv) +{ + printf("%s starting...\n", sampleName); - runTest(argc, argv); + runTest(argc, argv); - printf("%s completed, returned %s\n", sampleName, - testResult ? "OK" : "ERROR!"); + printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - int dev = 0; +void runTest(int argc, char **argv) +{ + int dev = 0; - char *cubin, *kernel_file; - size_t cubinSize; + char *cubin, *kernel_file; + size_t cubinSize; - kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - CUmodule module = loadCUBIN(cubin, argc, argv); - CUfunction kernel_addr; + CUmodule module = loadCUBIN(cubin, argc, argv); + CUfunction kernel_addr; - checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel")); + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel")); - StopWatchInterface *timer; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + StopWatchInterface *timer; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - unsigned int numThreads = 256; - unsigned int numBlocks = 64; - unsigned int numData = 11; - unsigned int memSize = sizeof(int) * numData; + unsigned int numThreads = 256; + unsigned int numBlocks = 64; + unsigned int numData = 11; + unsigned int memSize = sizeof(int) * numData; - // allocate mem for the result on host side - int *hOData = (int *)malloc(memSize); + // allocate mem for the result on host side + int *hOData = (int *)malloc(memSize); - // initialize the memory - for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; + // initialize the memory + for (unsigned int i = 0; i < numData; i++) + hOData[i] = 0; - // To make the AND and XOR tests generate something other than 0... - hOData[8] = hOData[10] = 0xff; + // To make the AND and XOR tests generate something other than 0... 
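// The tests above exercise the built-in atomics directly. atomicCAS() is also
// the primitive from which unsupported read-modify-write operations are
// usually built; a common illustrative sketch (not part of this sample) is an
// atomic integer multiply via a compare-and-swap retry loop:
__device__ int atomicMulInt(int *addr, int v)
{
    int old = *addr, assumed;
    do {
        assumed = old;
        // Succeeds only if *addr still holds `assumed`; returns the value seen
        old = atomicCAS(addr, assumed, assumed * v);
    } while (assumed != old);
    return old;
}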
+ hOData[8] = hOData[10] = 0xff; - // allocate device memory for result - CUdeviceptr dOData; - checkCudaErrors(cuMemAlloc(&dOData, memSize)); - checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize)); + // allocate device memory for result + CUdeviceptr dOData; + checkCudaErrors(cuMemAlloc(&dOData, memSize)); + checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize)); - // execute the kernel - dim3 cudaBlockSize(numThreads, 1, 1); - dim3 cudaGridSize(numBlocks, 1, 1); + // execute the kernel + dim3 cudaBlockSize(numThreads, 1, 1); + dim3 cudaGridSize(numBlocks, 1, 1); - void *arr[] = {(void *)&dOData}; - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - cudaBlockSize.x, cudaBlockSize.y, - cudaBlockSize.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &arr[0], /* arguments */ - 0)); + void *arr[] = {(void *)&dOData}; + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, + cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); - checkCudaErrors(cuCtxSynchronize()); + checkCudaErrors(cuCtxSynchronize()); - checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize)); + checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize)); - // Copy result from device to host - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); + // Copy result from device to host + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); - // Compute reference solution - testResult = computeGold(hOData, numThreads * numBlocks); + // Compute reference solution + testResult = computeGold(hOData, numThreads * numBlocks); - // Cleanup memory - free(hOData); - checkCudaErrors(cuMemFree(dOData)); + // Cleanup memory + free(hOData); + checkCudaErrors(cuMemFree(dOData)); } diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_cpu.cpp b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_cpu.cpp index 3ccf0144..8510b49f 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_cpu.cpp +++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_cpu.cpp @@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len); //! 
@param len number of elements in reference / idata //////////////////////////////////////////////////////////////////////////////// -int computeGold(int *gpuData, const int len) { - int val = 0; +int computeGold(int *gpuData, const int len) +{ + int val = 0; - for (int i = 0; i < len; ++i) { - val += 10; - } - - if (val != gpuData[0]) { - printf("atomicAdd failed\n"); - return false; - } - - val = 0; - - for (int i = 0; i < len; ++i) { - val -= 10; - } - - if (val != gpuData[1]) { - printf("atomicSub failed\n"); - return false; - } - - bool found = false; - - for (int i = 0; i < len; ++i) { - // third element should be a member of [0, len) - if (i == gpuData[2]) { - found = true; - break; + for (int i = 0; i < len; ++i) { + val += 10; } - } - if (!found) { - printf("atomicExch failed\n"); - return false; - } - - val = -(1 << 8); - - for (int i = 0; i < len; ++i) { - // fourth element should be len-1 - val = max(val, i); - } - - if (val != gpuData[3]) { - printf("atomicMax failed\n"); - return false; - } - - val = 1 << 8; - - for (int i = 0; i < len; ++i) { - val = min(val, i); - } - - if (val != gpuData[4]) { - printf("atomicMin failed\n"); - return false; - } - - int limit = 17; - val = 0; - - for (int i = 0; i < len; ++i) { - val = (val >= limit) ? 0 : val + 1; - } - - if (val != gpuData[5]) { - printf("atomicInc failed\n"); - return false; - } - - limit = 137; - val = 0; - - for (int i = 0; i < len; ++i) { - val = ((val == 0) || (val > limit)) ? limit : val - 1; - } - - if (val != gpuData[6]) { - printf("atomicDec failed\n"); - return false; - } - - found = false; - - for (int i = 0; i < len; ++i) { - // eighth element should be a member of [0, len) - if (i == gpuData[7]) { - found = true; - break; + if (val != gpuData[0]) { + printf("atomicAdd failed\n"); + return false; } - } - if (!found) { - printf("atomicCAS failed\n"); - return false; - } + val = 0; - val = 0xff; - for (int i = 0; i < len; ++i) { - // 9th element should be 1 - val &= (2 * i + 7); - } + for (int i = 0; i < len; ++i) { + val -= 10; + } - if (val != gpuData[8]) { - printf("atomicAnd failed\n"); - return false; - } + if (val != gpuData[1]) { + printf("atomicSub failed\n"); + return false; + } - val = 0; - for (int i = 0; i < len; ++i) { - // 10th element should be 0xff - val |= (1 << i); - } + bool found = false; - if (val != gpuData[9]) { - printf("atomicOr failed\n"); - return false; - } + for (int i = 0; i < len; ++i) { + // third element should be a member of [0, len) + if (i == gpuData[2]) { + found = true; + break; + } + } - val = 0xff; + if (!found) { + printf("atomicExch failed\n"); + return false; + } - for (int i = 0; i < len; ++i) { - // 11th element should be 0xff - val ^= i; - } + val = -(1 << 8); - if (val != gpuData[10]) { - printf("atomicXor failed\n"); - return false; - } + for (int i = 0; i < len; ++i) { + // fourth element should be len-1 + val = max(val, i); + } - return true; + if (val != gpuData[3]) { + printf("atomicMax failed\n"); + return false; + } + + val = 1 << 8; + + for (int i = 0; i < len; ++i) { + val = min(val, i); + } + + if (val != gpuData[4]) { + printf("atomicMin failed\n"); + return false; + } + + int limit = 17; + val = 0; + + for (int i = 0; i < len; ++i) { + val = (val >= limit) ? 0 : val + 1; + } + + if (val != gpuData[5]) { + printf("atomicInc failed\n"); + return false; + } + + limit = 137; + val = 0; + + for (int i = 0; i < len; ++i) { + val = ((val == 0) || (val > limit)) ? 
limit : val - 1; + } + + if (val != gpuData[6]) { + printf("atomicDec failed\n"); + return false; + } + + found = false; + + for (int i = 0; i < len; ++i) { + // eighth element should be a member of [0, len) + if (i == gpuData[7]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicCAS failed\n"); + return false; + } + + val = 0xff; + for (int i = 0; i < len; ++i) { + // 9th element should be 1 + val &= (2 * i + 7); + } + + if (val != gpuData[8]) { + printf("atomicAnd failed\n"); + return false; + } + + val = 0; + for (int i = 0; i < len; ++i) { + // 10th element should be 0xff + val |= (1 << i); + } + + if (val != gpuData[9]) { + printf("atomicOr failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 11th element should be 0xff + val ^= i; + } + + if (val != gpuData[10]) { + printf("atomicXor failed\n"); + return false; + } + + return true; } diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_kernel.cuh b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_kernel.cuh index ca96af08..3c4d284c 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_kernel.cuh +++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_kernel.cuh @@ -36,45 +36,46 @@ //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ void testKernel(int *g_odata) { - // access thread id - const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; +extern "C" __global__ void testKernel(int *g_odata) +{ + // access thread id + const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; - // Test various atomic instructions - // Arithmetic atomic instructions - // Atomic addition - atomicAdd(&g_odata[0], 10); + // Test various atomic instructions + // Arithmetic atomic instructions + // Atomic addition + atomicAdd(&g_odata[0], 10); - // Atomic subtraction (final should be 0) - atomicSub(&g_odata[1], 10); + // Atomic subtraction (final should be 0) + atomicSub(&g_odata[1], 10); - // Atomic exchange - atomicExch(&g_odata[2], tid); + // Atomic exchange + atomicExch(&g_odata[2], tid); - // Atomic maximum - atomicMax(&g_odata[3], tid); + // Atomic maximum + atomicMax(&g_odata[3], tid); - // Atomic minimum - atomicMin(&g_odata[4], tid); + // Atomic minimum + atomicMin(&g_odata[4], tid); - // Atomic increment (modulo 17+1) - atomicInc((unsigned int *)&g_odata[5], 17); + // Atomic increment (modulo 17+1) + atomicInc((unsigned int *)&g_odata[5], 17); - // Atomic decrement - atomicDec((unsigned int *)&g_odata[6], 137); + // Atomic decrement + atomicDec((unsigned int *)&g_odata[6], 137); - // Atomic compare-and-swap - atomicCAS(&g_odata[7], tid - 1, tid); + // Atomic compare-and-swap + atomicCAS(&g_odata[7], tid - 1, tid); - // Bitwise atomic instructions - // Atomic AND - atomicAnd(&g_odata[8], 2 * tid + 7); + // Bitwise atomic instructions + // Atomic AND + atomicAnd(&g_odata[8], 2 * tid + 7); - // Atomic OR - atomicOr(&g_odata[9], 1 << tid); + // Atomic OR + atomicOr(&g_odata[9], 1 << tid); - // Atomic XOR - atomicXor(&g_odata[10], tid); + // Atomic XOR + atomicXor(&g_odata[10], tid); } -#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_ +#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_ diff --git a/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu b/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu index 31084ec3..f0a4c8cc 100644 --- 
a/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu +++ b/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu @@ -26,30 +26,31 @@ */ // includes, system -#include -#include -#include #include +#include +#include +#include // includes CUDA #include // includes, project #include -#include // helper functions for SDK examples +#include // helper functions for SDK examples //////////////////////////////////////////////////////////////////////////////// // declaration, forward void runTest(int argc, char **argv); -cudaAccessPolicyWindow initAccessPolicyWindow(void) { - cudaAccessPolicyWindow accessPolicyWindow = {0}; - accessPolicyWindow.base_ptr = (void *)0; - accessPolicyWindow.num_bytes = 0; - accessPolicyWindow.hitRatio = 0.f; - accessPolicyWindow.hitProp = cudaAccessPropertyNormal; - accessPolicyWindow.missProp = cudaAccessPropertyStreaming; - return accessPolicyWindow; +cudaAccessPolicyWindow initAccessPolicyWindow(void) +{ + cudaAccessPolicyWindow accessPolicyWindow = {0}; + accessPolicyWindow.base_ptr = (void *)0; + accessPolicyWindow.num_bytes = 0; + accessPolicyWindow.hitRatio = 0.f; + accessPolicyWindow.hitProp = cudaAccessPropertyNormal; + accessPolicyWindow.missProp = cudaAccessPropertyStreaming; + return accessPolicyWindow; } //////////////////////////////////////////////////////////////////////////////// @@ -60,35 +61,35 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) { //! @param bigDataSize input bigData size //! @param hitcount how many data access are done within block //////////////////////////////////////////////////////////////////////////////// -static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, - int bigDataSize, int hitCount) { - __shared__ unsigned int hit; - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - int tID = row * blockDim.y + col; - uint32_t psRand = tID; +static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount) +{ + __shared__ unsigned int hit; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int tID = row * blockDim.y + col; + uint32_t psRand = tID; - atomicExch(&hit, 0); - __syncthreads(); - while (hit < hitCount) { - psRand ^= psRand << 13; - psRand ^= psRand >> 17; - psRand ^= psRand << 5; + atomicExch(&hit, 0); + __syncthreads(); + while (hit < hitCount) { + psRand ^= psRand << 13; + psRand ^= psRand >> 17; + psRand ^= psRand << 5; - int idx = tID - psRand; - if (idx < 0) { - idx = -idx; + int idx = tID - psRand; + if (idx < 0) { + idx = -idx; + } + + if ((tID % 2) == 0) { + data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; + } + else { + trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize]; + } + + atomicAdd(&hit, 1); } - - if ((tID % 2) == 0) { - data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize]; - } else { - trash[psRand % bigDataSize] = - trash[psRand % bigDataSize] + trash[idx % bigDataSize]; - } - - atomicAdd(&hit, 1); - } } //////////////////////////////////////////////////////////////////////////////// // Program main @@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - bool bTestResult = true; - cudaAccessPolicyWindow accessPolicyWindow; - cudaDeviceProp deviceProp; - cudaStreamAttrValue streamAttrValue; - cudaStream_t stream; - cudaStreamAttrID streamAttrID; - dim3 threads(32, 32); - int *dataDevicePointer; - int *dataHostPointer; - int dataSize; - int *bigDataDevicePointer; - int *bigDataHostPointer; - int bigDataSize; - StopWatchInterface *timer = 0; +void runTest(int argc, char **argv) +{ + bool bTestResult = true; + cudaAccessPolicyWindow accessPolicyWindow; + cudaDeviceProp deviceProp; + cudaStreamAttrValue streamAttrValue; + cudaStream_t stream; + cudaStreamAttrID streamAttrID; + dim3 threads(32, 32); + int *dataDevicePointer; + int *dataHostPointer; + int dataSize; + int *bigDataDevicePointer; + int *bigDataHostPointer; + int bigDataSize; + StopWatchInterface *timer = 0; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - // Get device properties - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - dim3 blocks(deviceProp.maxGridSize[1], 1); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + // Get device properties + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + dim3 blocks(deviceProp.maxGridSize[1], 1); - // Make sure device the l2 optimization - if (deviceProp.persistingL2CacheMaxSize == 0) { - printf( - "Waiving execution as device %d does not support persisting L2 " - "Caching\n", - devID); - exit(EXIT_WAIVED); - } - - // Create stream to assiocate with window - checkCudaErrors(cudaStreamCreate(&stream)); - - // Set the amount of l2 cache that will be persisting to maximum the device - // can support - checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, - deviceProp.persistingL2CacheMaxSize)); - - // Stream attribute to set - streamAttrID = cudaStreamAttributeAccessPolicyWindow; - - // Default window - streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); - accessPolicyWindow = initAccessPolicyWindow(); - - // Allocate size of both buffers - bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); - dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); - - // Allocate data - checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int))); - checkCudaErrors( - cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int))); - - for (int i = 0; i < bigDataSize; ++i) { - if (i < dataSize) { - dataHostPointer[i] = i; + // Make sure the device supports the L2 optimization + if (deviceProp.persistingL2CacheMaxSize == 0) { + printf("Waiving execution as device %d does not support persisting L2 " + "Caching\n", + devID); + exit(EXIT_WAIVED); } - bigDataHostPointer[bigDataSize - i - 1] = i; - } + // Create stream to associate with window + checkCudaErrors(cudaStreamCreate(&stream)); - checkCudaErrors( - cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int))); - checkCudaErrors( - cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int))); - checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer, - dataSize * sizeof(int), -
cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer, - bigDataSize * sizeof(int), - cudaMemcpyHostToDevice, stream)); + // Set the amount of l2 cache that will be persisting to maximum the device + // can support + checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize)); - // Make a window for the buffer of interest - accessPolicyWindow.base_ptr = (void *)dataDevicePointer; - accessPolicyWindow.num_bytes = dataSize * sizeof(int); - accessPolicyWindow.hitRatio = 1.f; - accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; - accessPolicyWindow.missProp = cudaAccessPropertyNormal; - streamAttrValue.accessPolicyWindow = accessPolicyWindow; + // Stream attribute to set + streamAttrID = cudaStreamAttributeAccessPolicyWindow; - // Assign window to stream - checkCudaErrors( - cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); + // Default window + streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); + accessPolicyWindow = initAccessPolicyWindow(); - // Demote any previous persisting lines - checkCudaErrors(cudaCtxResetPersistingL2Cache()); + // Allocate size of both buffers + bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int); + dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int); - checkCudaErrors(cudaStreamSynchronize(stream)); - kernCacheSegmentTest<<>>( - dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); + // Allocate data + checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int))); + checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int))); - checkCudaErrors(cudaStreamSynchronize(stream)); - // check if kernel execution generated and error - getLastCudaError("Kernel execution failed"); + for (int i = 0; i < bigDataSize; ++i) { + if (i < dataSize) { + dataHostPointer[i] = i; + } - // Free memory - checkCudaErrors(cudaFreeHost(dataHostPointer)); - checkCudaErrors(cudaFreeHost(bigDataHostPointer)); - checkCudaErrors(cudaFree(dataDevicePointer)); - checkCudaErrors(cudaFree(bigDataDevicePointer)); + bigDataHostPointer[bigDataSize - i - 1] = i; + } - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); + checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int))); + checkCudaErrors( + cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync( + bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream)); - exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); + // Make a window for the buffer of interest + accessPolicyWindow.base_ptr = (void *)dataDevicePointer; + accessPolicyWindow.num_bytes = dataSize * sizeof(int); + accessPolicyWindow.hitRatio = 1.f; + accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; + accessPolicyWindow.missProp = cudaAccessPropertyNormal; + streamAttrValue.accessPolicyWindow = accessPolicyWindow; + + // Assign window to stream + checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue)); + + // Demote any previous persisting lines + checkCudaErrors(cudaCtxResetPersistingL2Cache()); + + checkCudaErrors(cudaStreamSynchronize(stream)); + kernCacheSegmentTest<<>>( + dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF); + + checkCudaErrors(cudaStreamSynchronize(stream)); + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); + + // Free memory + checkCudaErrors(cudaFreeHost(dataHostPointer)); + checkCudaErrors(cudaFreeHost(bigDataHostPointer)); + checkCudaErrors(cudaFree(dataDevicePointer)); + checkCudaErrors(cudaFree(bigDataDevicePointer)); + + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); + + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/simpleCUDA2GL/README.md b/Samples/0_Introduction/simpleCUDA2GL/README.md index bd22edcf..8d70b68c 100644 --- a/Samples/0_Introduction/simpleCUDA2GL/README.md +++ b/Samples/0_Introduction/simpleCUDA2GL/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/0_Introduction/simpleCUDA2GL/main.cpp b/Samples/0_Introduction/simpleCUDA2GL/main.cpp index 47261d14..fa1571d0 100644 --- a/Samples/0_Introduction/simpleCUDA2GL/main.cpp +++ b/Samples/0_Introduction/simpleCUDA2GL/main.cpp @@ -50,8 +50,8 @@ #endif // CUDA includes -#include #include +#include // CUDA utilities and system includes #include @@ -59,8 +59,8 @@ #include // Shared Library Test Functions -#define MAX_EPSILON 10 -#define REFRESH_DELAY 10 // ms +#define MAX_EPSILON 10 +#define REFRESH_DELAY 10 // ms const char *sSDKname = "simpleCUDA2GL"; @@ -71,23 +71,23 @@ CheckRender *g_CheckRender = NULL; //////////////////////////////////////////////////////////////////////////////// // constants / global variables -unsigned int window_width = 512; -unsigned int window_height = 512; -unsigned int image_width = 512; -unsigned int image_height = 512; -int iGLUTWindowHandle = 0; // handle to the GLUT window +unsigned int window_width = 512; +unsigned int window_height = 512; +unsigned int image_width = 512; +unsigned int image_height = 512; +int iGLUTWindowHandle = 0; // handle to the GLUT window // pbo and fbo variables #ifdef USE_TEXSUBIMAGE2D -GLuint pbo_dest; +GLuint pbo_dest; struct cudaGraphicsResource *cuda_pbo_dest_resource; #else -unsigned int *cuda_dest_resource; -GLuint shDrawTex; // draws a texture +unsigned int *cuda_dest_resource; +GLuint shDrawTex; // draws a texture struct cudaGraphicsResource *cuda_tex_result_resource; #endif -GLuint fbo_source; +GLuint fbo_source; struct cudaGraphicsResource *cuda_tex_screen_resource; unsigned int size_tex_data; @@ -95,19 +95,19 @@ unsigned int num_texels; unsigned int num_values; // (offscreen) render target fbo variables -GLuint tex_screen; // where we render the image
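// runTest above sets hitRatio = 1.f because its window (l2CacheSize / 4
// bytes) fits within the persisting-L2 budget. A minimal sketch of the
// documented pattern for a window larger than persistingL2CacheMaxSize:
// scale hitRatio down so that hitRatio * num_bytes stays within the budget.
// setWindow is an illustrative helper, not part of the sample.
#include <algorithm>
#include <cuda_runtime.h>

void setWindow(cudaStream_t stream, void *ptr, size_t bytes, const cudaDeviceProp &prop)
{
    cudaStreamAttrValue attr = {};
    attr.accessPolicyWindow.base_ptr  = ptr;
    attr.accessPolicyWindow.num_bytes = bytes;
    // hitRatio is the fraction of accesses in the window that receive hitProp.
    attr.accessPolicyWindow.hitRatio  = std::min(1.0f, (float)prop.persistingL2CacheMaxSize / (float)bytes);
    attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
}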
-GLuint tex_cudaResult; // where we will copy the CUDA result +GLuint tex_screen; // where we render the image +GLuint tex_cudaResult; // where we will copy the CUDA result -char *ref_file = NULL; -bool enable_cuda = true; +char *ref_file = NULL; +bool enable_cuda = true; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; // Timer -static int fpsCount = 0; -static int fpsLimit = 1; -StopWatchInterface *timer = NULL; +static int fpsCount = 0; +static int fpsLimit = 1; +StopWatchInterface *timer = NULL; #ifndef USE_TEXTURE_RGBA8UI #pragma message("Note: Using Texture fmt GL_RGBA16F_ARB") @@ -124,8 +124,7 @@ StopWatchInterface *timer = NULL; GLuint shDraw; //////////////////////////////////////////////////////////////////////////////// -extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, - unsigned int *g_odata, int imgw); +extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw); // Forward declarations void runStdProgram(int argc, char **argv); @@ -140,8 +139,7 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource); void deletePBO(GLuint *pbo); #endif -void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, - unsigned int size_y); +void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y); void deleteTexture(GLuint *tex); // rendering callbacks @@ -155,55 +153,55 @@ void mainMenu(int i); //////////////////////////////////////////////////////////////////////////////// //! Create PBO //////////////////////////////////////////////////////////////////////////////// -void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) { - // set up vertex data parameter - num_texels = image_width * image_height; - num_values = num_texels * 4; - size_tex_data = sizeof(GLubyte) * num_values; - void *data = malloc(size_tex_data); +void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) +{ + // set up vertex data parameter + num_texels = image_width * image_height; + num_values = num_texels * 4; + size_tex_data = sizeof(GLubyte) * num_values; + void *data = malloc(size_tex_data); - // create buffer object - glGenBuffers(1, pbo); - glBindBuffer(GL_ARRAY_BUFFER, *pbo); - glBufferData(GL_ARRAY_BUFFER, size_tex_data, data, GL_DYNAMIC_DRAW); - free(data); + // create buffer object + glGenBuffers(1, pbo); + glBindBuffer(GL_ARRAY_BUFFER, *pbo); + glBufferData(GL_ARRAY_BUFFER, size_tex_data, data, GL_DYNAMIC_DRAW); + free(data); - glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindBuffer(GL_ARRAY_BUFFER, 0); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, - cudaGraphicsMapFlagsNone)); + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, cudaGraphicsMapFlagsNone)); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } -void deletePBO(GLuint *pbo) { - glDeleteBuffers(1, pbo); - SDK_CHECK_ERROR_GL(); - *pbo = 0; +void deletePBO(GLuint *pbo) +{ + glDeleteBuffers(1, pbo); + SDK_CHECK_ERROR_GL(); + *pbo = 0; } #endif -const GLenum fbo_targets[] = { - GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT, - GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT}; +const GLenum fbo_targets[] = {GL_COLOR_ATTACHMENT0_EXT, + GL_COLOR_ATTACHMENT1_EXT, + GL_COLOR_ATTACHMENT2_EXT, + GL_COLOR_ATTACHMENT3_EXT}; #ifndef USE_TEXSUBIMAGE2D -static const char *glsl_drawtex_vertshader_src = - "void main(void)\n" - "{\n" - " gl_Position = gl_Vertex;\n" - " gl_TexCoord[0].xy = 
gl_MultiTexCoord0.xy;\n" - "}\n"; +static const char *glsl_drawtex_vertshader_src = "void main(void)\n" + "{\n" + " gl_Position = gl_Vertex;\n" + " gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n" + "}\n"; -static const char *glsl_drawtex_fragshader_src = - "#version 130\n" - "uniform usampler2D texImage;\n" - "void main()\n" - "{\n" - " vec4 c = texture(texImage, gl_TexCoord[0].xy);\n" - " gl_FragColor = c / 255.0;\n" - "}\n"; +static const char *glsl_drawtex_fragshader_src = "#version 130\n" + "uniform usampler2D texImage;\n" + "void main()\n" + "{\n" + " vec4 c = texture(texImage, gl_TexCoord[0].xy);\n" + " gl_FragColor = c / 255.0;\n" + "}\n"; #endif static const char *glsl_draw_fragshader_src = @@ -227,26 +225,26 @@ static const char *glsl_draw_fragshader_src = #endif // copy image and process using CUDA -void generateCUDAImage() { - // run the Cuda kernel - unsigned int *out_data; +void generateCUDAImage() +{ + // run the Cuda kernel + unsigned int *out_data; #ifdef USE_TEXSUBIMAGE2D - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&out_data, &num_bytes, cuda_pbo_dest_resource)); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&out_data, &num_bytes, cuda_pbo_dest_resource)); // printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n", // num_bytes, size_tex_data); #else - out_data = cuda_dest_resource; + out_data = cuda_dest_resource; #endif - // calculate grid size - dim3 block(16, 16, 1); - // dim3 block(16, 16, 1); - dim3 grid(image_width / block.x, image_height / block.y, 1); - // execute CUDA kernel - launch_cudaProcess(grid, block, 0, out_data, image_width); + // calculate grid size + dim3 block(16, 16, 1); + // dim3 block(16, 16, 1); + dim3 grid(image_width / block.x, image_height / block.y, 1); + // execute CUDA kernel + launch_cudaProcess(grid, block, 0, out_data, image_width); // CUDA generated data in cuda memory or in a mapped PBO made of BGRA 8 bits // 2 solutions, here : @@ -254,168 +252,168 @@ void generateCUDAImage() { // possible hidden conversion // - map the texture and blit the result thanks to CUDA API #ifdef USE_TEXSUBIMAGE2D - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_dest_resource, 0)); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_dest_resource, 0)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); - glBindTexture(GL_TEXTURE_2D, tex_cudaResult); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - SDK_CHECK_ERROR_GL(); - glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + glBindTexture(GL_TEXTURE_2D, tex_cudaResult); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + SDK_CHECK_ERROR_GL(); + glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); #else - // We want to copy cuda_dest_resource data to the texture - // map buffer objects to get CUDA device pointers - cudaArray *texture_ptr; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0)); - checkCudaErrors(cudaGraphicsSubResourceGetMappedArray( - &texture_ptr, cuda_tex_result_resource, 0, 0)); + // We want to copy cuda_dest_resource data to the texture + // map buffer objects to get 
CUDA device pointers + cudaArray *texture_ptr; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0)); + checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&texture_ptr, cuda_tex_result_resource, 0, 0)); - int num_texels = image_width * image_height; - int num_values = num_texels * 4; - int size_tex_data = sizeof(GLubyte) * num_values; - checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, - size_tex_data, cudaMemcpyDeviceToDevice)); + int num_texels = image_width * image_height; + int num_values = num_texels * 4; + int size_tex_data = sizeof(GLubyte) * num_values; + checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0)); #endif } // display image to the screen as textured quad -void displayImage(GLuint texture) { - glBindTexture(GL_TEXTURE_2D, texture); - glEnable(GL_TEXTURE_2D); - glDisable(GL_DEPTH_TEST); - glDisable(GL_LIGHTING); - glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); +void displayImage(GLuint texture) +{ + glBindTexture(GL_TEXTURE_2D, texture); + glEnable(GL_TEXTURE_2D); + glDisable(GL_DEPTH_TEST); + glDisable(GL_LIGHTING); + glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadIdentity(); - glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0); + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glViewport(0, 0, window_width, window_height); + glViewport(0, 0, window_width, window_height); // if the texture is a 8 bits UI, scale the fetch with a GLSL shader #ifndef USE_TEXSUBIMAGE2D - glUseProgram(shDrawTex); - GLint id = glGetUniformLocation(shDrawTex, "texImage"); - glUniform1i(id, 0); // texture unit 0 to "texImage" - SDK_CHECK_ERROR_GL(); + glUseProgram(shDrawTex); + GLint id = glGetUniformLocation(shDrawTex, "texImage"); + glUniform1i(id, 0); // texture unit 0 to "texImage" + SDK_CHECK_ERROR_GL(); #endif - glBegin(GL_QUADS); - glTexCoord2f(0.0, 0.0); - glVertex3f(-1.0, -1.0, 0.5); - glTexCoord2f(1.0, 0.0); - glVertex3f(1.0, -1.0, 0.5); - glTexCoord2f(1.0, 1.0); - glVertex3f(1.0, 1.0, 0.5); - glTexCoord2f(0.0, 1.0); - glVertex3f(-1.0, 1.0, 0.5); - glEnd(); + glBegin(GL_QUADS); + glTexCoord2f(0.0, 0.0); + glVertex3f(-1.0, -1.0, 0.5); + glTexCoord2f(1.0, 0.0); + glVertex3f(1.0, -1.0, 0.5); + glTexCoord2f(1.0, 1.0); + glVertex3f(1.0, 1.0, 0.5); + glTexCoord2f(0.0, 1.0); + glVertex3f(-1.0, 1.0, 0.5); + glEnd(); - glMatrixMode(GL_PROJECTION); - glPopMatrix(); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); - glDisable(GL_TEXTURE_2D); + glDisable(GL_TEXTURE_2D); #ifndef USE_TEXSUBIMAGE2D - glUseProgram(0); + glUseProgram(0); #endif - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - if (enable_cuda) { - generateCUDAImage(); - displayImage(tex_cudaResult); - } - - // NOTE: I needed to add this call so the timing is consistent. 
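// cudaMemcpyToArray, used above to fill the mapped texture array, has been
// deprecated since CUDA 10.1. A sketch of the equivalent copy with
// cudaMemcpy2DToArray, assuming the sample's tightly packed RGBA8 layout
// (row pitch == width * 4 bytes); texture_ptr and cuda_dest_resource are the
// variables from generateCUDAImage above.
size_t pitch = image_width * 4 * sizeof(GLubyte); // bytes per row, no padding
checkCudaErrors(cudaMemcpy2DToArray(texture_ptr, 0, 0, cuda_dest_resource,
                                    pitch, pitch, image_height,
                                    cudaMemcpyDeviceToDevice));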
- // Need to investigate why - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - - // flip backbuffer - glutSwapBuffers(); - - // If specified, Check rendering against reference, - if (ref_file && g_CheckRender && g_CheckRender->IsQAReadback()) { - static int pass = 0; - - if (pass > 0) { - g_CheckRender->readback(window_width, window_height); - char currentOutputPPM[256]; - sprintf(currentOutputPPM, "kilt.ppm"); - g_CheckRender->savePPM(currentOutputPPM, true, NULL); - - if (!g_CheckRender->PPMvsPPM(currentOutputPPM, - sdkFindFilePath(ref_file, pArgv[0]), - MAX_EPSILON, 0.30f)) { - g_TotalErrors++; - } - - Cleanup((g_TotalErrors == 0) ? EXIT_SUCCESS : EXIT_FAILURE); + if (enable_cuda) { + generateCUDAImage(); + displayImage(tex_cudaResult); } - pass++; - } + // NOTE: I needed to add this call so the timing is consistent. + // Need to investigate why + cudaDeviceSynchronize(); + sdkStopTimer(&timer); - // Update fps counter, fps/title display and log - if (++fpsCount == fpsLimit) { - char cTitle[256]; - float fps = 1000.0f / sdkGetAverageTimerValue(&timer); - sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, - window_height, fps); - glutSetWindowTitle(cTitle); - // printf("%s\n", cTitle); - fpsCount = 0; - fpsLimit = (int)((fps > 1.0f) ? fps : 1.0f); - sdkResetTimer(&timer); - } + // flip backbuffer + glutSwapBuffers(); + + // If specified, Check rendering against reference, + if (ref_file && g_CheckRender && g_CheckRender->IsQAReadback()) { + static int pass = 0; + + if (pass > 0) { + g_CheckRender->readback(window_width, window_height); + char currentOutputPPM[256]; + sprintf(currentOutputPPM, "kilt.ppm"); + g_CheckRender->savePPM(currentOutputPPM, true, NULL); + + if (!g_CheckRender->PPMvsPPM(currentOutputPPM, sdkFindFilePath(ref_file, pArgv[0]), MAX_EPSILON, 0.30f)) { + g_TotalErrors++; + } + + Cleanup((g_TotalErrors == 0) ? EXIT_SUCCESS : EXIT_FAILURE); + } + + pass++; + } + + // Update fps counter, fps/title display and log + if (++fpsCount == fpsLimit) { + char cTitle[256]; + float fps = 1000.0f / sdkGetAverageTimerValue(&timer); + sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, window_height, fps); + glutSetWindowTitle(cTitle); + // printf("%s\n", cTitle); + fpsCount = 0; + fpsLimit = (int)((fps > 1.0f) ? fps : 1.0f); + sdkResetTimer(&timer); + } } -void timerEvent(int value) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); +void timerEvent(int value) +{ + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); } //////////////////////////////////////////////////////////////////////////////// //! 
Keyboard events handler //////////////////////////////////////////////////////////////////////////////// -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case (27): - Cleanup(EXIT_SUCCESS); - break; + Cleanup(EXIT_SUCCESS); + break; case ' ': - enable_cuda ^= 1; + enable_cuda ^= 1; #ifdef USE_TEXTURE_RGBA8UI - if (enable_cuda) { - glClearColorIuiEXT(128, 128, 128, 255); - } else { - glClearColor(0.5, 0.5, 0.5, 1.0); - } + if (enable_cuda) { + glClearColorIuiEXT(128, 128, 128, 255); + } + else { + glClearColor(0.5, 0.5, 0.5, 1.0); + } #endif - break; - } + break; + } } -void reshape(int w, int h) { - window_width = w; - window_height = h; +void reshape(int w, int h) +{ + window_width = w; + window_height = h; } void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); } @@ -423,325 +421,328 @@ void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, - unsigned int size_y) { - // create a texture - glGenTextures(1, tex_cudaResult); - glBindTexture(GL_TEXTURE_2D, *tex_cudaResult); +void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y) +{ + // create a texture + glGenTextures(1, tex_cudaResult); + glBindTexture(GL_TEXTURE_2D, *tex_cudaResult); - // set basic parameters - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + // set basic parameters + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); #ifdef USE_TEXSUBIMAGE2D - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - SDK_CHECK_ERROR_GL(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + SDK_CHECK_ERROR_GL(); #else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, - GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL); - SDK_CHECK_ERROR_GL(); - // register this texture with CUDA - checkCudaErrors(cudaGraphicsGLRegisterImage( - &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, - cudaGraphicsMapFlagsWriteDiscard)); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL); + SDK_CHECK_ERROR_GL(); + // register this texture with CUDA + checkCudaErrors(cudaGraphicsGLRegisterImage( + &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard)); #endif } //////////////////////////////////////////////////////////////////////////////// //! 
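// createTextureDst above registers the result texture with
// cudaGraphicsMapFlagsWriteDiscard, which tells CUDA that every map discards
// the previous contents (the sample always rewrites the full image). A
// compact sketch of the register/map/write/unmap cycle, assuming tex is a
// GLuint texture created beforehand:
cudaGraphicsResource *res = NULL;
checkCudaErrors(cudaGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D,
                                            cudaGraphicsRegisterFlagsWriteDiscard));
checkCudaErrors(cudaGraphicsMapResources(1, &res, 0));
cudaArray_t arr;
checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&arr, res, 0, 0));
// ... fill arr, e.g. with cudaMemcpy2DToArray or through a surface object ...
checkCudaErrors(cudaGraphicsUnmapResources(1, &res, 0));
checkCudaErrors(cudaGraphicsUnregisterResource(res));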
//////////////////////////////////////////////////////////////////////////////// -void deleteTexture(GLuint *tex) { - glDeleteTextures(1, tex); - SDK_CHECK_ERROR_GL(); +void deleteTexture(GLuint *tex) +{ + glDeleteTextures(1, tex); + SDK_CHECK_ERROR_GL(); - *tex = 0; + *tex = 0; } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ #if defined(__linux__) - char *Xstatus = getenv("DISPLAY"); - if (Xstatus == NULL) { - printf("Waiving execution as X server is not running\n"); - exit(EXIT_WAIVED); - } - setenv("DISPLAY", ":0", 0); + char *Xstatus = getenv("DISPLAY"); + if (Xstatus == NULL) { + printf("Waiving execution as X server is not running\n"); + exit(EXIT_WAIVED); + } + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + } - pArgc = &argc; - pArgv = argv; + pArgc = &argc; + pArgv = argv; - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n\n"); - printf( - " This sample requires OpenGL. Only -file= are " - "supported\n"); - printf("exiting...\n"); - exit(EXIT_WAIVED); - } + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n\n"); + printf(" This sample requires OpenGL. Only -file= are " + "supported\n"); + printf("exiting...\n"); + exit(EXIT_WAIVED); + } - if (ref_file) { - printf("(Test with OpenGL verification)\n"); - runStdProgram(argc, argv); - } else { - printf("(Interactive OpenGL Demo)\n"); - runStdProgram(argc, argv); - } + if (ref_file) { + printf("(Test with OpenGL verification)\n"); + runStdProgram(argc, argv); + } + else { + printf("(Interactive OpenGL Demo)\n"); + runStdProgram(argc, argv); + } - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// //! 
//////////////////////////////////////////////////////////////////////////////// -void FreeResource() { - sdkDeleteTimer(&timer); +void FreeResource() +{ + sdkDeleteTimer(&timer); // unregister this buffer object with CUDA // checkCudaErrors(cudaGraphicsUnregisterResource(cuda_tex_screen_resource)); #ifdef USE_TEXSUBIMAGE2D - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_dest_resource)); - deletePBO(&pbo_dest); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_dest_resource)); + deletePBO(&pbo_dest); #else - cudaFree(cuda_dest_resource); + cudaFree(cuda_dest_resource); #endif - deleteTexture(&tex_screen); - deleteTexture(&tex_cudaResult); + deleteTexture(&tex_screen); + deleteTexture(&tex_cudaResult); - if (iGLUTWindowHandle) { - glutDestroyWindow(iGLUTWindowHandle); - } + if (iGLUTWindowHandle) { + glutDestroyWindow(iGLUTWindowHandle); + } - // finalize logs and leave - printf("simpleCUDA2GL Exiting...\n"); + // finalize logs and leave + printf("simpleCUDA2GL Exiting...\n"); } -void Cleanup(int iExitCode) { - FreeResource(); - printf("PPM Images are %s\n", - (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching"); - exit(iExitCode); +void Cleanup(int iExitCode) +{ + FreeResource(); + printf("PPM Images are %s\n", (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching"); + exit(iExitCode); } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -GLuint compileGLSLprogram(const char *vertex_shader_src, - const char *fragment_shader_src) { - GLuint v, f, p = 0; +GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_shader_src) +{ + GLuint v, f, p = 0; - p = glCreateProgram(); + p = glCreateProgram(); - if (vertex_shader_src) { - v = glCreateShader(GL_VERTEX_SHADER); - glShaderSource(v, 1, &vertex_shader_src, NULL); - glCompileShader(v); + if (vertex_shader_src) { + v = glCreateShader(GL_VERTEX_SHADER); + glShaderSource(v, 1, &vertex_shader_src, NULL); + glCompileShader(v); - // check if shader compiled - GLint compiled = 0; - glGetShaderiv(v, GL_COMPILE_STATUS, &compiled); + // check if shader compiled + GLint compiled = 0; + glGetShaderiv(v, GL_COMPILE_STATUS, &compiled); - if (!compiled) { - //#ifdef NV_REPORT_COMPILE_ERRORS - char temp[256] = ""; - glGetShaderInfoLog(v, 256, NULL, temp); - printf("Vtx Compile failed:\n%s\n", temp); - //#endif - glDeleteShader(v); - return 0; - } else { - glAttachShader(p, v); + if (!compiled) { + // #ifdef NV_REPORT_COMPILE_ERRORS + char temp[256] = ""; + glGetShaderInfoLog(v, 256, NULL, temp); + printf("Vtx Compile failed:\n%s\n", temp); + // #endif + glDeleteShader(v); + return 0; + } + else { + glAttachShader(p, v); + } } - } - if (fragment_shader_src) { - f = glCreateShader(GL_FRAGMENT_SHADER); - glShaderSource(f, 1, &fragment_shader_src, NULL); - glCompileShader(f); + if (fragment_shader_src) { + f = glCreateShader(GL_FRAGMENT_SHADER); + glShaderSource(f, 1, &fragment_shader_src, NULL); + glCompileShader(f); - // check if shader compiled - GLint compiled = 0; - glGetShaderiv(f, GL_COMPILE_STATUS, &compiled); + // check if shader compiled + GLint compiled = 0; + glGetShaderiv(f, GL_COMPILE_STATUS, &compiled); - if (!compiled) { - //#ifdef NV_REPORT_COMPILE_ERRORS - char temp[256] = ""; - glGetShaderInfoLog(f, 256, NULL, temp); - printf("frag Compile failed:\n%s\n", temp); - //#endif - glDeleteShader(f); - return 0; - } else { - glAttachShader(p, f); + if (!compiled) { + // #ifdef 
NV_REPORT_COMPILE_ERRORS + char temp[256] = ""; + glGetShaderInfoLog(f, 256, NULL, temp); + printf("frag Compile failed:\n%s\n", temp); + // #endif + glDeleteShader(f); + return 0; + } + else { + glAttachShader(p, f); + } } - } - glLinkProgram(p); + glLinkProgram(p); - int infologLength = 0; - int charsWritten = 0; + int infologLength = 0; + int charsWritten = 0; - glGetProgramiv(p, GL_INFO_LOG_LENGTH, (GLint *)&infologLength); + glGetProgramiv(p, GL_INFO_LOG_LENGTH, (GLint *)&infologLength); - if (infologLength > 0) { - char *infoLog = (char *)malloc(infologLength); - glGetProgramInfoLog(p, infologLength, (GLsizei *)&charsWritten, infoLog); - printf("Shader compilation error: %s\n", infoLog); - free(infoLog); - } + if (infologLength > 0) { + char *infoLog = (char *)malloc(infologLength); + glGetProgramInfoLog(p, infologLength, (GLsizei *)&charsWritten, infoLog); + printf("Shader compilation error: %s\n", infoLog); + free(infoLog); + } - return p; + return p; } //////////////////////////////////////////////////////////////////////////////// //! Allocate the "render target" of CUDA //////////////////////////////////////////////////////////////////////////////// #ifndef USE_TEXSUBIMAGE2D -void initCUDABuffers() { - // set up vertex data parameter - num_texels = image_width * image_height; - num_values = num_texels * 4; - size_tex_data = sizeof(GLubyte) * num_values; - checkCudaErrors(cudaMalloc((void **)&cuda_dest_resource, size_tex_data)); - // checkCudaErrors(cudaHostAlloc((void**)&cuda_dest_resource, size_tex_data, - // )); +void initCUDABuffers() +{ + // set up vertex data parameter + num_texels = image_width * image_height; + num_values = num_texels * 4; + size_tex_data = sizeof(GLubyte) * num_values; + checkCudaErrors(cudaMalloc((void **)&cuda_dest_resource, size_tex_data)); + // checkCudaErrors(cudaHostAlloc((void**)&cuda_dest_resource, size_tex_data, + // )); } #endif //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void initGLBuffers() { +void initGLBuffers() +{ // create pbo #ifdef USE_TEXSUBIMAGE2D - createPBO(&pbo_dest, &cuda_pbo_dest_resource); + createPBO(&pbo_dest, &cuda_pbo_dest_resource); #endif - // create texture that will receive the result of CUDA - createTextureDst(&tex_cudaResult, image_width, image_height); - // load shader programs - shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src); + // create texture that will receive the result of CUDA + createTextureDst(&tex_cudaResult, image_width, image_height); + // load shader programs + shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src); #ifndef USE_TEXSUBIMAGE2D - shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, - glsl_drawtex_fragshader_src); + shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src); #endif - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// //! Run standard demo loop with or without GL verification //////////////////////////////////////////////////////////////////////////////// -void runStdProgram(int argc, char **argv) { - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with OpenGL/CUDA - // interop. 
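// A usage sketch for compileGLSLprogram (hypothetical caller, not in the
// sample): the helper returns 0 only when a shader fails to compile, while a
// failed link still returns the program id and only prints the info log, so
// callers should at least check for 0 before binding.
GLuint prog = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src);
if (prog == 0) {
    fprintf(stderr, "shader compilation failed\n");
}
else {
    glUseProgram(prog); // draw with the program, then glUseProgram(0) to unbind
}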
- if (false == initGL(&argc, argv)) { - return; - } +void runStdProgram(int argc, char **argv) +{ + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA + // interop. + if (false == initGL(&argc, argv)) { + return; + } - // Now initialize CUDA context (GL context has been created already) - findCudaDevice(argc, (const char **)argv); + // Now initialize CUDA context (GL context has been created already) + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&timer); - sdkResetTimer(&timer); + sdkCreateTimer(&timer); + sdkResetTimer(&timer); - // register callbacks - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + // register callbacks + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - // create menu - glutCreateMenu(mainMenu); - glutAddMenuEntry("Quit (esc)", '\033'); - glutAttachMenu(GLUT_RIGHT_BUTTON); + // create menu + glutCreateMenu(mainMenu); + glutAddMenuEntry("Quit (esc)", '\033'); + glutAttachMenu(GLUT_RIGHT_BUTTON); - initGLBuffers(); + initGLBuffers(); #ifndef USE_TEXSUBIMAGE2D - initCUDABuffers(); + initCUDABuffers(); #endif - // Creating the Auto-Validation Code - if (ref_file) { - g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); - g_CheckRender->setPixelFormat(GL_RGBA); - g_CheckRender->setExecPath(argv[0]); - g_CheckRender->EnableQAReadback(true); - } + // Creating the Auto-Validation Code + if (ref_file) { + g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); + g_CheckRender->setPixelFormat(GL_RGBA); + g_CheckRender->setExecPath(argv[0]); + g_CheckRender->EnableQAReadback(true); + } - printf( - "\n" - "\tControls\n" - "\t(right click mouse button for Menu)\n" - "\t[esc] - Quit\n\n"); + printf("\n" + "\tControls\n" + "\t(right click mouse button for Menu)\n" + "\t[esc] - Quit\n\n"); - // start rendering mainloop - glutMainLoop(); + // start rendering mainloop + glutMainLoop(); - // Normally unused return path - Cleanup(EXIT_SUCCESS); + // Normally unused return path + Cleanup(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// //! 
Initialize GL //////////////////////////////////////////////////////////////////////////////// -bool initGL(int *argc, char **argv) { - // Create GL context - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); - glutInitWindowSize(window_width, window_height); - iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing"); +bool initGL(int *argc, char **argv) +{ + // Create GL context + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); + glutInitWindowSize(window_width, window_height); + iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing"); - // initialize necessary OpenGL extensions - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_pixel_buffer_object " - "GL_EXT_framebuffer_object")) { - printf("ERROR: Support for necessary OpenGL extensions missing."); - fflush(stderr); - return false; - } + // initialize necessary OpenGL extensions + if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object " + "GL_EXT_framebuffer_object")) { + printf("ERROR: Support for necessary OpenGL extensions missing."); + fflush(stderr); + return false; + } // default initialization #ifndef USE_TEXTURE_RGBA8UI - glClearColor(0.5, 0.5, 0.5, 1.0); + glClearColor(0.5, 0.5, 0.5, 1.0); #else - glClearColorIuiEXT(128, 128, 128, 255); + glClearColorIuiEXT(128, 128, 128, 255); #endif - glDisable(GL_DEPTH_TEST); + glDisable(GL_DEPTH_TEST); - // viewport - glViewport(0, 0, window_width, window_height); + // viewport + glViewport(0, 0, window_width, window_height); - // projection - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, - 10.0f); + // projection + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, 10.0f); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glEnable(GL_LIGHT0); - float red[] = {1.0f, 0.1f, 0.1f, 1.0f}; - float white[] = {1.0f, 1.0f, 1.0f, 1.0f}; - glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red); - glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white); - glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0f); + glEnable(GL_LIGHT0); + float red[] = {1.0f, 0.1f, 0.1f, 1.0f}; + float white[] = {1.0f, 1.0f, 1.0f, 1.0f}; + glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red); + glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white); + glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0f); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); - return true; + return true; } diff --git a/Samples/0_Introduction/simpleCUDA2GL/simpleCUDA2GL.cu b/Samples/0_Introduction/simpleCUDA2GL/simpleCUDA2GL.cu index a080fdd7..7248dcc1 100644 --- a/Samples/0_Introduction/simpleCUDA2GL/simpleCUDA2GL.cu +++ b/Samples/0_Introduction/simpleCUDA2GL/simpleCUDA2GL.cu @@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); } __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); } // convert floating point rgb color to 8-bit integer -__device__ int rgbToInt(float r, float g, float b) { - r = clamp(r, 0.0f, 255.0f); - g = clamp(g, 0.0f, 255.0f); - b = clamp(b, 0.0f, 255.0f); - return (int(b) << 16) | (int(g) << 8) | int(r); +__device__ int rgbToInt(float r, float g, float b) +{ + r = clamp(r, 0.0f, 255.0f); + g = clamp(g, 0.0f, 255.0f); + b = clamp(b, 0.0f, 255.0f); + return (int(b) << 16) | (int(g) << 8) | int(r); } -__global__ void 
cudaProcess(unsigned int *g_odata, int imgw) { - extern __shared__ uchar4 sdata[]; +__global__ void cudaProcess(unsigned int *g_odata, int imgw) +{ + extern __shared__ uchar4 sdata[]; - int tx = threadIdx.x; - int ty = threadIdx.y; - int bw = blockDim.x; - int bh = blockDim.y; - int x = blockIdx.x * bw + tx; - int y = blockIdx.y * bh + ty; + int tx = threadIdx.x; + int ty = threadIdx.y; + int bw = blockDim.x; + int bh = blockDim.y; + int x = blockIdx.x * bw + tx; + int y = blockIdx.y * bh + ty; - uchar4 c4 = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0); - g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x); + uchar4 c4 = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0); + g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x); } -extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, - unsigned int *g_odata, int imgw) { - cudaProcess<<>>(g_odata, imgw); +extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw) +{ + cudaProcess<<>>(g_odata, imgw); } diff --git a/Samples/0_Introduction/simpleCallback/multithreading.cpp b/Samples/0_Introduction/simpleCallback/multithreading.cpp index 97ff820a..d4c89913 100644 --- a/Samples/0_Introduction/simpleCallback/multithreading.cpp +++ b/Samples/0_Introduction/simpleCallback/multithreading.cpp @@ -29,115 +29,124 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) // Create thread -CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) { - return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL); +CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) +{ + return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL); } // Wait for thread to finish -void cutEndThread(CUTThread thread) { - WaitForSingleObject(thread, INFINITE); - CloseHandle(thread); +void cutEndThread(CUTThread thread) +{ + WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); } // Wait for multiple threads -void cutWaitForThreads(const CUTThread *threads, int num) { - WaitForMultipleObjects(num, threads, true, INFINITE); +void cutWaitForThreads(const CUTThread *threads, int num) +{ + WaitForMultipleObjects(num, threads, true, INFINITE); - for (int i = 0; i < num; i++) { - CloseHandle(threads[i]); - } + for (int i = 0; i < num; i++) { + CloseHandle(threads[i]); + } } // Create barrier. -CUTBarrier cutCreateBarrier(int releaseCount) { - CUTBarrier barrier; +CUTBarrier cutCreateBarrier(int releaseCount) +{ + CUTBarrier barrier; - InitializeCriticalSection(&barrier.criticalSection); - barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent")); - barrier.count = 0; - barrier.releaseCount = releaseCount; + InitializeCriticalSection(&barrier.criticalSection); + barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent")); + barrier.count = 0; + barrier.releaseCount = releaseCount; - return barrier; + return barrier; } // Increment barrier. 
(execution continues) -void cutIncrementBarrier(CUTBarrier *barrier) { - int myBarrierCount; - EnterCriticalSection(&barrier->criticalSection); - myBarrierCount = ++barrier->count; - LeaveCriticalSection(&barrier->criticalSection); +void cutIncrementBarrier(CUTBarrier *barrier) +{ + int myBarrierCount; + EnterCriticalSection(&barrier->criticalSection); + myBarrierCount = ++barrier->count; + LeaveCriticalSection(&barrier->criticalSection); - if (myBarrierCount >= barrier->releaseCount) { - SetEvent(barrier->barrierEvent); - } + if (myBarrierCount >= barrier->releaseCount) { + SetEvent(barrier->barrierEvent); + } } // Wait for barrier release. -void cutWaitForBarrier(CUTBarrier *barrier) { - WaitForSingleObject(barrier->barrierEvent, INFINITE); -} +void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); } // Destroy barrier void cutDestroyBarrier(CUTBarrier *barrier) {} #else // Create thread -CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) { - pthread_t thread; - pthread_create(&thread, NULL, func, data); - return thread; +CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) +{ + pthread_t thread; + pthread_create(&thread, NULL, func, data); + return thread; } // Wait for thread to finish void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); } // Wait for multiple threads -void cutWaitForThreads(const CUTThread *threads, int num) { - for (int i = 0; i < num; i++) { - cutEndThread(threads[i]); - } +void cutWaitForThreads(const CUTThread *threads, int num) +{ + for (int i = 0; i < num; i++) { + cutEndThread(threads[i]); + } } // Create barrier. -CUTBarrier cutCreateBarrier(int releaseCount) { - CUTBarrier barrier; +CUTBarrier cutCreateBarrier(int releaseCount) +{ + CUTBarrier barrier; - barrier.count = 0; - barrier.releaseCount = releaseCount; + barrier.count = 0; + barrier.releaseCount = releaseCount; - pthread_mutex_init(&barrier.mutex, 0); - pthread_cond_init(&barrier.conditionVariable, 0); + pthread_mutex_init(&barrier.mutex, 0); + pthread_cond_init(&barrier.conditionVariable, 0); - return barrier; + return barrier; } // Increment barrier. (execution continues) -void cutIncrementBarrier(CUTBarrier *barrier) { - int myBarrierCount; - pthread_mutex_lock(&barrier->mutex); - myBarrierCount = ++barrier->count; - pthread_mutex_unlock(&barrier->mutex); +void cutIncrementBarrier(CUTBarrier *barrier) +{ + int myBarrierCount; + pthread_mutex_lock(&barrier->mutex); + myBarrierCount = ++barrier->count; + pthread_mutex_unlock(&barrier->mutex); - if (myBarrierCount >= barrier->releaseCount) { - pthread_cond_signal(&barrier->conditionVariable); - } + if (myBarrierCount >= barrier->releaseCount) { + pthread_cond_signal(&barrier->conditionVariable); + } } // Wait for barrier release. 
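// A minimal usage sketch of this barrier API (worker and runWorkers are
// illustrative, not part of the sample): the main thread sizes the barrier to
// the number of workers, each worker increments it exactly once, and the main
// thread blocks until the last increment arrives. simpleCallback below uses
// the same pattern with one increment per finished workload.
CUT_THREADPROC worker(void *arg)
{
    CUTBarrier *barrier = (CUTBarrier *)arg;
    // ... do this thread's work ...
    cutIncrementBarrier(barrier); // releases the waiter once count == releaseCount
    CUT_THREADEND;
}

void runWorkers(int n)
{
    CUTBarrier barrier = cutCreateBarrier(n);
    for (int i = 0; i < n; ++i)
        cutStartThread(worker, &barrier);
    cutWaitForBarrier(&barrier); // returns only after all n increments
    cutDestroyBarrier(&barrier);
}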
-void cutWaitForBarrier(CUTBarrier *barrier) { - pthread_mutex_lock(&barrier->mutex); +void cutWaitForBarrier(CUTBarrier *barrier) +{ + pthread_mutex_lock(&barrier->mutex); - while (barrier->count < barrier->releaseCount) { - pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex); - } + while (barrier->count < barrier->releaseCount) { + pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex); + } - pthread_mutex_unlock(&barrier->mutex); + pthread_mutex_unlock(&barrier->mutex); } // Destroy barrier -void cutDestroyBarrier(CUTBarrier *barrier) { - pthread_mutex_destroy(&barrier->mutex); - pthread_cond_destroy(&barrier->conditionVariable); +void cutDestroyBarrier(CUTBarrier *barrier) +{ + pthread_mutex_destroy(&barrier->mutex); + pthread_cond_destroy(&barrier->conditionVariable); } #endif diff --git a/Samples/0_Introduction/simpleCallback/multithreading.h b/Samples/0_Introduction/simpleCallback/multithreading.h index a0895ba7..d9b4b2a0 100644 --- a/Samples/0_Introduction/simpleCallback/multithreading.h +++ b/Samples/0_Introduction/simpleCallback/multithreading.h @@ -37,15 +37,16 @@ typedef HANDLE CUTThread; typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *); -struct CUTBarrier { - CRITICAL_SECTION criticalSection; - HANDLE barrierEvent; - int releaseCount; - int count; +struct CUTBarrier +{ + CRITICAL_SECTION criticalSection; + HANDLE barrierEvent; + int releaseCount; + int count; }; #define CUT_THREADPROC unsigned WINAPI -#define CUT_THREADEND return 0 +#define CUT_THREADEND return 0 #else // POSIX threads. @@ -55,44 +56,46 @@ typedef pthread_t CUTThread; typedef void *(*CUT_THREADROUTINE)(void *); #define CUT_THREADPROC void * -#define CUT_THREADEND return 0 +#define CUT_THREADEND return 0 -struct CUTBarrier { - pthread_mutex_t mutex; - pthread_cond_t conditionVariable; - int releaseCount; - int count; +struct CUTBarrier +{ + pthread_mutex_t mutex; + pthread_cond_t conditionVariable; + int releaseCount; + int count; }; #endif #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -// Create thread. -CUTThread cutStartThread(CUT_THREADROUTINE, void *data); + // Create thread. + CUTThread cutStartThread(CUT_THREADROUTINE, void *data); -// Wait for thread to finish. -void cutEndThread(CUTThread thread); + // Wait for thread to finish. + void cutEndThread(CUTThread thread); -// Wait for multiple threads. -void cutWaitForThreads(const CUTThread *threads, int num); + // Wait for multiple threads. + void cutWaitForThreads(const CUTThread *threads, int num); -// Create barrier. -CUTBarrier cutCreateBarrier(int releaseCount); + // Create barrier. + CUTBarrier cutCreateBarrier(int releaseCount); -// Increment barrier. (execution continues) -void cutIncrementBarrier(CUTBarrier *barrier); + // Increment barrier. (execution continues) + void cutIncrementBarrier(CUTBarrier *barrier); -// Wait for barrier release. -void cutWaitForBarrier(CUTBarrier *barrier); + // Wait for barrier release. 
+ void cutWaitForBarrier(CUTBarrier *barrier); -// Destroy barrier -void cutDestroyBarrier(CUTBarrier *barrier); + // Destroy barrier + void cutDestroyBarrier(CUTBarrier *barrier); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif -#endif // MULTITHREADING_H +#endif // MULTITHREADING_H diff --git a/Samples/0_Introduction/simpleCallback/simpleCallback.cu b/Samples/0_Introduction/simpleCallback/simpleCallback.cu index 088c6741..c8df6ee0 100644 --- a/Samples/0_Introduction/simpleCallback/simpleCallback.cu +++ b/Samples/0_Introduction/simpleCallback/simpleCallback.cu @@ -43,172 +43,173 @@ #include // helper functions and utilities to work with CUDA -#include #include +#include #include "multithreading.h" -const int N_workloads = 8; +const int N_workloads = 8; const int N_elements_per_workload = 100000; CUTBarrier thread_barrier; -void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, - void *data); +void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data); -struct heterogeneous_workload { - int id; - int cudaDeviceID; +struct heterogeneous_workload +{ + int id; + int cudaDeviceID; - int *h_data; - int *d_data; - cudaStream_t stream; + int *h_data; + int *d_data; + cudaStream_t stream; - bool success; + bool success; }; -__global__ void incKernel(int *data, int N) { - int i = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void incKernel(int *data, int N) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N) data[i]++; + if (i < N) + data[i]++; } -CUT_THREADPROC launch(void *void_arg) { - heterogeneous_workload *workload = (heterogeneous_workload *)void_arg; +CUT_THREADPROC launch(void *void_arg) +{ + heterogeneous_workload *workload = (heterogeneous_workload *)void_arg; - // Select GPU for this CPU thread - checkCudaErrors(cudaSetDevice(workload->cudaDeviceID)); + // Select GPU for this CPU thread + checkCudaErrors(cudaSetDevice(workload->cudaDeviceID)); - // Allocate Resources - checkCudaErrors(cudaStreamCreate(&workload->stream)); - checkCudaErrors( - cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int))); - checkCudaErrors(cudaHostAlloc(&workload->h_data, - N_elements_per_workload * sizeof(int), - cudaHostAllocPortable)); + // Allocate Resources + checkCudaErrors(cudaStreamCreate(&workload->stream)); + checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int))); + checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable)); - // CPU thread generates data - for (int i = 0; i < N_elements_per_workload; ++i) { - workload->h_data[i] = workload->id + i; - } - - // Schedule work for GPU in CUDA stream without blocking the CPU thread - // Note: Dedicated streams enable concurrent execution of workloads on the GPU - dim3 block(512); - dim3 grid((N_elements_per_workload + block.x - 1) / block.x); - - checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data, - N_elements_per_workload * sizeof(int), - cudaMemcpyHostToDevice, workload->stream)); - incKernel<<stream>>>(workload->d_data, - N_elements_per_workload); - checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data, - N_elements_per_workload * sizeof(int), - cudaMemcpyDeviceToHost, workload->stream)); - - // New in CUDA 5.0: Add a CPU callback which is called once all currently - // pending operations in the CUDA stream have finished - checkCudaErrors( - cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0)); - - CUT_THREADEND; - // CPU thread end of 
life, GPU continues to process data... -} - -CUT_THREADPROC postprocess(void *void_arg) { - heterogeneous_workload *workload = (heterogeneous_workload *)void_arg; - // ... GPU is done with processing, continue on new CPU thread... - - // Select GPU for this CPU thread - checkCudaErrors(cudaSetDevice(workload->cudaDeviceID)); - - // CPU thread consumes results from GPU - workload->success = true; - - for (int i = 0; i < N_workloads; ++i) { - workload->success &= workload->h_data[i] == i + workload->id + 1; - } - - // Free Resources - checkCudaErrors(cudaFree(workload->d_data)); - checkCudaErrors(cudaFreeHost(workload->h_data)); - checkCudaErrors(cudaStreamDestroy(workload->stream)); - - // Signal the end of the heterogeneous workload to main thread - cutIncrementBarrier(&thread_barrier); - - CUT_THREADEND; -} - -void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, - void *data) { - // Check status of GPU after stream operations are done - checkCudaErrors(status); - - // Spawn new CPU worker thread and continue processing on the CPU - cutStartThread(postprocess, data); -} - -int main(int argc, char **argv) { - int N_gpus, max_gpus = 0; - int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration - - printf("Starting simpleCallback\n"); - - checkCudaErrors(cudaGetDeviceCount(&N_gpus)); - printf("Found %d CUDA capable GPUs\n", N_gpus); - - if (N_gpus > 32) { - printf("simpleCallback only supports 32 GPU(s)\n"); - } - - for (int devid = 0; devid < N_gpus; devid++) { - int SMversion; - cudaDeviceProp deviceProp; - cudaSetDevice(devid); - cudaGetDeviceProperties(&deviceProp, devid); - SMversion = deviceProp.major << 4 + deviceProp.minor; - printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, - deviceProp.major, deviceProp.minor); - printf(", %s GPU Callback Functions\n", - (SMversion >= 0x11) ? 
"capable" : "NOT capable"); - - if (SMversion >= 0x11) { - gpuInfo[max_gpus++] = devid; + // CPU thread generates data + for (int i = 0; i < N_elements_per_workload; ++i) { + workload->h_data[i] = workload->id + i; } - } - printf("%d GPUs available to run Callback Functions\n", max_gpus); + // Schedule work for GPU in CUDA stream without blocking the CPU thread + // Note: Dedicated streams enable concurrent execution of workloads on the GPU + dim3 block(512); + dim3 grid((N_elements_per_workload + block.x - 1) / block.x); - heterogeneous_workload *workloads; - workloads = (heterogeneous_workload *)malloc(N_workloads * - sizeof(heterogeneous_workload)); - ; - thread_barrier = cutCreateBarrier(N_workloads); + checkCudaErrors(cudaMemcpyAsync(workload->d_data, + workload->h_data, + N_elements_per_workload * sizeof(int), + cudaMemcpyHostToDevice, + workload->stream)); + incKernel<<stream>>>(workload->d_data, N_elements_per_workload); + checkCudaErrors(cudaMemcpyAsync(workload->h_data, + workload->d_data, + N_elements_per_workload * sizeof(int), + cudaMemcpyDeviceToHost, + workload->stream)); - // Main thread spawns a CPU worker thread for each heterogeneous workload - printf("Starting %d heterogeneous computing workloads\n", N_workloads); + // New in CUDA 5.0: Add a CPU callback which is called once all currently + // pending operations in the CUDA stream have finished + checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0)); - for (int i = 0; i < N_workloads; ++i) { - workloads[i].id = i; - workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus; - - cutStartThread(launch, &workloads[i]); - } - - // Sleep until all workloads have finished - cutWaitForBarrier(&thread_barrier); - printf("Total of %d workloads finished:\n", N_workloads); - - bool success = true; - - for (int i = 0; i < N_workloads; ++i) { - success &= workloads[i].success; - } - - printf("%s\n", success ? "Success" : "Failure"); - - free(workloads); - - exit(success ? EXIT_SUCCESS : EXIT_FAILURE); + CUT_THREADEND; + // CPU thread end of life, GPU continues to process data... +} + +CUT_THREADPROC postprocess(void *void_arg) +{ + heterogeneous_workload *workload = (heterogeneous_workload *)void_arg; + // ... GPU is done with processing, continue on new CPU thread... 
+ + // Select GPU for this CPU thread + checkCudaErrors(cudaSetDevice(workload->cudaDeviceID)); + + // CPU thread consumes results from GPU + workload->success = true; + + for (int i = 0; i < N_workloads; ++i) { + workload->success &= workload->h_data[i] == i + workload->id + 1; + } + + // Free Resources + checkCudaErrors(cudaFree(workload->d_data)); + checkCudaErrors(cudaFreeHost(workload->h_data)); + checkCudaErrors(cudaStreamDestroy(workload->stream)); + + // Signal the end of the heterogeneous workload to main thread + cutIncrementBarrier(&thread_barrier); + + CUT_THREADEND; +} + +void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data) +{ + // Check status of GPU after stream operations are done + checkCudaErrors(status); + + // Spawn new CPU worker thread and continue processing on the CPU + cutStartThread(postprocess, data); +} + +int main(int argc, char **argv) +{ + int N_gpus, max_gpus = 0; + int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration + + printf("Starting simpleCallback\n"); + + checkCudaErrors(cudaGetDeviceCount(&N_gpus)); + printf("Found %d CUDA capable GPUs\n", N_gpus); + + if (N_gpus > 32) { + printf("simpleCallback only supports 32 GPU(s)\n"); + } + + for (int devid = 0; devid < N_gpus; devid++) { + int SMversion; + cudaDeviceProp deviceProp; + cudaSetDevice(devid); + cudaGetDeviceProperties(&deviceProp, devid); + SMversion = (deviceProp.major << 4) + deviceProp.minor; + printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor); + printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable"); + + if (SMversion >= 0x11) { + gpuInfo[max_gpus++] = devid; + } + } + + printf("%d GPUs available to run Callback Functions\n", max_gpus); + + heterogeneous_workload *workloads; + workloads = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload)); + thread_barrier = cutCreateBarrier(N_workloads); + + // Main thread spawns a CPU worker thread for each heterogeneous workload + printf("Starting %d heterogeneous computing workloads\n", N_workloads); + + for (int i = 0; i < N_workloads; ++i) { + workloads[i].id = i; + workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus; + + cutStartThread(launch, &workloads[i]); + } + + // Sleep until all workloads have finished + cutWaitForBarrier(&thread_barrier); + printf("Total of %d workloads finished:\n", N_workloads); + + bool success = true; + + for (int i = 0; i < N_workloads; ++i) { + success &= workloads[i].success; + } + + printf("%s\n", success ? "Success" : "Failure"); + + free(workloads); + + exit(success ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu b/Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu index cbc7a123..b7ca20a0 100644 --- a/Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu +++ b/Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu @@ -38,8 +38,8 @@ * */ -#include <stdio.h> #include <cooperative_groups.h> +#include <stdio.h> using namespace cooperative_groups; @@ -49,35 +49,36 @@ using namespace cooperative_groups; * calculates the sum of val across the group g. The workspace array, x, * must be large enough to contain g.size() integers.
*/ -__device__ int sumReduction(thread_group g, int *x, int val) { - // rank of this thread in the group - int lane = g.thread_rank(); +__device__ int sumReduction(thread_group g, int *x, int val) +{ + // rank of this thread in the group + int lane = g.thread_rank(); - // for each iteration of this loop, the number of threads active in the - // reduction, i, is halved, and each active thread (with index [lane]) - // performs a single summation of it's own value with that - // of a "partner" (with index [lane+i]). - for (int i = g.size() / 2; i > 0; i /= 2) { - // store value for this thread in temporary array - x[lane] = val; + // for each iteration of this loop, the number of threads active in the + // reduction, i, is halved, and each active thread (with index [lane]) + // performs a single summation of its own value with that + // of a "partner" (with index [lane+i]). + for (int i = g.size() / 2; i > 0; i /= 2) { + // store value for this thread in temporary array + x[lane] = val; - // synchronize all threads in group - g.sync(); + // synchronize all threads in group + g.sync(); - if (lane < i) - // active threads perform summation of their value with - // their partner's value - val += x[lane + i]; + if (lane < i) + // active threads perform summation of their value with + // their partner's value + val += x[lane + i]; - // synchronize all threads in group - g.sync(); - } + // synchronize all threads in group + g.sync(); + } - // master thread in group returns result, and others return -1. - if (g.thread_rank() == 0) - return val; - else - return -1; + // master thread in group returns result, and others return -1. + if (g.thread_rank() == 0) + return val; + else + return -1; } /** @@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val) { * * Creates cooperative groups and performs reductions */ -__global__ void cgkernel() { - // threadBlockGroup includes all threads in the block - thread_block threadBlockGroup = this_thread_block(); - int threadBlockGroupSize = threadBlockGroup.size(); +__global__ void cgkernel() +{ + // threadBlockGroup includes all threads in the block + thread_block threadBlockGroup = this_thread_block(); + int threadBlockGroupSize = threadBlockGroup.size(); - // workspace array in shared memory required for reduction - extern __shared__ int workspace[]; + // workspace array in shared memory required for reduction + extern __shared__ int workspace[]; - int input, output, expectedOutput; + int input, output, expectedOutput; - // input to reduction, for each thread, is its' rank in the group - input = threadBlockGroup.thread_rank(); + // input to reduction, for each thread, is its rank in the group + input = threadBlockGroup.thread_rank(); - // expected output from analytical formula (n-1)(n)/2 - // (noting that indexing starts at 0 rather than 1) - expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2; + // expected output from analytical formula (n-1)(n)/2 + // (noting that indexing starts at 0 rather than 1) + expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2; - // perform reduction - output = sumReduction(threadBlockGroup, workspace, input); + // perform reduction + output = sumReduction(threadBlockGroup, workspace, input); - // master thread in group prints out result
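// An aside on sumReduction() above: when the group is a warp-sized tile, the
// shared-memory workspace can be dropped entirely in favor of register
// shuffles. A sketch under that assumption (not the sample's code):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__device__ int tileSum(cg::thread_block_tile<32> g, int val)
{
    // Each step halves the number of contributing lanes; shfl_down reads the
    // partner's value directly from its register, so no g.sync() or shared
    // workspace is needed within the tile.
    for (int i = g.size() / 2; i > 0; i /= 2)
        val += g.shfl_down(val, i);
    return val; // lane 0 ends up with the full sum; other lanes hold partials
}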
+ if (threadBlockGroup.thread_rank() == 0) { + printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n", + (int)threadBlockGroup.size() - 1, + output, + expectedOutput); - printf(" Now creating %d groups, each of size 16 threads:\n\n", - (int)threadBlockGroup.size() / 16); - } + printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16); + } - threadBlockGroup.sync(); + threadBlockGroup.sync(); - // each tiledPartition16 group includes 16 threads - thread_block_tile<16> tiledPartition16 = - tiled_partition<16>(threadBlockGroup); + // each tiledPartition16 group includes 16 threads + thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup); - // This offset allows each group to have its own unique area in the workspace - // array - int workspaceOffset = - threadBlockGroup.thread_rank() - tiledPartition16.thread_rank(); + // This offset allows each group to have its own unique area in the workspace + // array + int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank(); - // input to reduction, for each thread, is its' rank in the group - input = tiledPartition16.thread_rank(); + // input to reduction, for each thread, is its rank in the group + input = tiledPartition16.thread_rank(); - // expected output from analytical formula (n-1)(n)/2 - // (noting that indexing starts at 0 rather than 1) - expectedOutput = 15 * 16 / 2; + // expected output from analytical formula (n-1)(n)/2 + // (noting that indexing starts at 0 rather than 1) + expectedOutput = 15 * 16 / 2; - // Perform reduction - output = sumReduction(tiledPartition16, workspace + workspaceOffset, input); + // Perform reduction + output = sumReduction(tiledPartition16, workspace + workspaceOffset, input); - // each master thread prints out result - if (tiledPartition16.thread_rank() == 0) - printf( - " Sum of all ranks 0..15 in this tiledPartition16 group is %d " - "(expected %d)\n", - output, expectedOutput); + // each master thread prints out result + if (tiledPartition16.thread_rank() == 0) + printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d " + "(expected %d)\n", + output, + expectedOutput); - return; + return; } /** * Host main routine */ -int main() { - // Error code to check return values for CUDA calls - cudaError_t err; +int main() +{ + // Error code to check return values for CUDA calls + cudaError_t err; - // Launch the kernel + // Launch the kernel - int blocksPerGrid = 1; - int threadsPerBlock = 64; + int blocksPerGrid = 1; + int threadsPerBlock = 64; - printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock); + printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock); - // we use the optional third argument to specify the size - // of shared memory required in the kernel - cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(); - err = cudaDeviceSynchronize(); + // we use the optional third argument to specify the size + // of shared memory required in the kernel + cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(); + err = cudaDeviceSynchronize(); - if (err != cudaSuccess) { - fprintf(stderr, "Failed to launch kernel (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } + if (err != cudaSuccess) { + fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } - printf("\n...Done.\n\n"); + printf("\n...Done.\n\n"); - return 0; + return 0; } diff --git a/Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu
b/Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu index 35979a44..bce14bc8 100644 --- a/Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu +++ b/Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu @@ -26,27 +26,27 @@ */ /* -* This sample demonstrates how to use texture fetches from layered 2D textures -* in CUDA C -* -* This sample first generates a 3D input data array for the layered texture -* and the expected output. Then it starts CUDA C kernels, one for each layer, -* which fetch their layer's texture data (using normalized texture coordinates) -* transform it to the expected output, and write it to a 3D output data array. -*/ + * This sample demonstrates how to use texture fetches from layered 2D textures + * in CUDA C + * + * This sample first generates a 3D input data array for the layered texture + * and the expected output. Then it starts CUDA C kernels, one for each layer, + * which fetch their layer's texture data (using normalized texture coordinates) + * transform it to the expected output, and write it to a 3D output data array. + */ // includes, system -#include <stdio.h> -#include <stdlib.h> -#include <string.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // includes CUDA #include <cuda_runtime.h> // helper functions and utilities to work with CUDA -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> static const char *sSDKname = "simpleCubemapTexture"; @@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture"; //////////////////////////////////////////////////////////////////////////////// //! Transform a cubemap face of a linear buffer using cubemap texture lookups //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void transformKernel(float *g_odata, int width, - cudaTextureObject_t tex) { - // calculate this thread's data point - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex) +{ + // calculate this thread's data point + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - // 0.5f offset and division are necessary to access the original data points - // in the texture (such that bilinear interpolation will not be activated). - // For details, see also CUDA Programming Guide, Appendix D + // 0.5f offset and division are necessary to access the original data points + // in the texture (such that bilinear interpolation will not be activated).
+ // For details, see also CUDA Programming Guide, Appendix D - float u = ((x + 0.5f) / (float)width) * 2.f - 1.f; - float v = ((y + 0.5f) / (float)width) * 2.f - 1.f; + float u = ((x + 0.5f) / (float)width) * 2.f - 1.f; + float v = ((y + 0.5f) / (float)width) * 2.f - 1.f; - float cx, cy, cz; + float cx, cy, cz; - for (unsigned int face = 0; face < 6; face++) { - // Layer 0 is positive X face - if (face == 0) { - cx = 1; - cy = -v; - cz = -u; - } - // Layer 1 is negative X face - else if (face == 1) { - cx = -1; - cy = -v; - cz = u; - } - // Layer 2 is positive Y face - else if (face == 2) { - cx = u; - cy = 1; - cz = v; - } - // Layer 3 is negative Y face - else if (face == 3) { - cx = u; - cy = -1; - cz = -v; - } - // Layer 4 is positive Z face - else if (face == 4) { - cx = u; - cy = -v; - cz = 1; - } - // Layer 4 is negative Z face - else if (face == 5) { - cx = -u; - cy = -v; - cz = -1; - } + for (unsigned int face = 0; face < 6; face++) { + // Layer 0 is positive X face + if (face == 0) { + cx = 1; + cy = -v; + cz = -u; + } + // Layer 1 is negative X face + else if (face == 1) { + cx = -1; + cy = -v; + cz = u; + } + // Layer 2 is positive Y face + else if (face == 2) { + cx = u; + cy = 1; + cz = v; + } + // Layer 3 is negative Y face + else if (face == 3) { + cx = u; + cy = -1; + cz = -v; + } + // Layer 4 is positive Z face + else if (face == 4) { + cx = u; + cy = -v; + cz = 1; + } + // Layer 5 is negative Z face + else if (face == 5) { + cx = -u; + cy = -v; + cz = -1; + } - // read from texture, do expected transformation and write to global memory - g_odata[face * width * width + y * width + x] = - -texCubemap(tex, cx, cy, cz); - } + // read from texture, do expected transformation and write to global memory + g_odata[face * width * width + y * width + x] = -texCubemap(tex, cx, cy, cz); + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); +int main(int argc, char **argv) +{ + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); - bool bResult = true; + bool bResult = true; - // get number of SMs on this GPU - cudaDeviceProp deviceProps; + // get number of SMs on this GPU + cudaDeviceProp deviceProps; - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, - deviceProps.multiProcessorCount); - printf("SM %d.%d\n", deviceProps.major, deviceProps.minor); + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount); + printf("SM %d.%d\n", deviceProps.major, deviceProps.minor); - if (deviceProps.major < 2) { - printf( - "%s requires SM 2.0 or higher for support of Texture Arrays. Test " - "will exit... \n", - sSDKname); + if (deviceProps.major < 2) { + printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test " "will exit...
\n", + sSDKname); - exit(EXIT_WAIVED); - } - - // generate input data for layered texture - unsigned int width = 64, num_faces = 6, num_layers = 1; - unsigned int cubemap_size = width * width * num_faces; - unsigned int size = cubemap_size * num_layers * sizeof(float); - float *h_data = (float *)malloc(size); - - for (int i = 0; i < (int)(cubemap_size * num_layers); i++) { - h_data[i] = (float)i; - } - - // this is the expected transformation of the input data (the expected output) - float *h_data_ref = (float *)malloc(size); - - for (unsigned int layer = 0; layer < num_layers; layer++) { - for (int i = 0; i < (int)(cubemap_size); i++) { - h_data_ref[layer * cubemap_size + i] = - -h_data[layer * cubemap_size + i] + layer; + exit(EXIT_WAIVED); } - } - // allocate device memory for result - float *d_data = NULL; - checkCudaErrors(cudaMalloc((void **)&d_data, size)); + // generate input data for layered texture + unsigned int width = 64, num_faces = 6, num_layers = 1; + unsigned int cubemap_size = width * width * num_faces; + unsigned int size = cubemap_size * num_layers * sizeof(float); + float *h_data = (float *)malloc(size); - // allocate array and copy image data - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); - cudaArray *cu_3darray; - // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc, - // make_cudaExtent(width, height, num_layers), cudaArrayLayered )); - checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, - make_cudaExtent(width, width, num_faces), - cudaArrayCubemap)); - cudaMemcpy3DParms myparms = {0}; - myparms.srcPos = make_cudaPos(0, 0, 0); - myparms.dstPos = make_cudaPos(0, 0, 0); - myparms.srcPtr = - make_cudaPitchedPtr(h_data, width * sizeof(float), width, width); - myparms.dstArray = cu_3darray; - myparms.extent = make_cudaExtent(width, width, num_faces); - myparms.kind = cudaMemcpyHostToDevice; - checkCudaErrors(cudaMemcpy3D(&myparms)); + for (int i = 0; i < (int)(cubemap_size * num_layers); i++) { + h_data[i] = (float)i; + } - cudaTextureObject_t tex; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + // this is the expected transformation of the input data (the expected output) + float *h_data_ref = (float *)malloc(size); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = cu_3darray; + for (unsigned int layer = 0; layer < num_layers; layer++) { + for (int i = 0; i < (int)(cubemap_size); i++) { + h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer; + } + } - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + // allocate device memory for result + float *d_data = NULL; + checkCudaErrors(cudaMalloc((void **)&d_data, size)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.addressMode[2] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + // allocate array and copy image data + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaArray *cu_3darray; + // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc, + // make_cudaExtent(width, height, num_layers), cudaArrayLayered )); + checkCudaErrors( + cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap)); + cudaMemcpy3DParms myparms = {0}; + myparms.srcPos = make_cudaPos(0, 0, 0); + 
myparms.dstPos = make_cudaPos(0, 0, 0); + myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width); + myparms.dstArray = cu_3darray; + myparms.extent = make_cudaExtent(width, width, num_faces); + myparms.kind = cudaMemcpyHostToDevice; + checkCudaErrors(cudaMemcpy3D(&myparms)); - checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); + cudaTextureObject_t tex; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - dim3 dimBlock(8, 8, 1); - dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = cu_3darray; - printf( - "Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each " - "block has 8 x 8 threads\n", - width, num_layers, dimGrid.x, dimGrid.y); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - transformKernel<<<dimGrid, dimBlock>>>(d_data, width, - tex); // warmup (for better timing) + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.addressMode[2] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - // check if kernel execution generated an error - getLastCudaError("warmup Kernel execution failed"); + checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); - checkCudaErrors(cudaDeviceSynchronize()); + dim3 dimBlock(8, 8, 1); + dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1); - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each " "block has 8 x 8 threads\n", + width, + num_layers, + dimGrid.x, + dimGrid.y); - // execute the kernel - transformKernel<<<dimGrid, dimBlock>>>(d_data, width, tex); + transformKernel<<<dimGrid, dimBlock>>>(d_data, width, + tex); // warmup (for better timing) - // check if kernel execution generated an error - getLastCudaError("Kernel execution failed"); + // check if kernel execution generated an error + getLastCudaError("warmup Kernel execution failed"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer); - printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer)); - printf("%.2f Mtexlookups/sec\n", - (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6)); - sdkDeleteTimer(&timer); + checkCudaErrors(cudaDeviceSynchronize()); - // allocate mem for the result on host side - float *h_odata = (float *)malloc(size); - // copy result from device to host - checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost)); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - // write regression file if necessary - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // write file for regression test - sdkWriteFile("./data/regression.dat", h_odata, width * width, 0.0f, - false); - } else { - printf("Comparing kernel output to expected data\n"); + // execute the kernel + transformKernel<<<dimGrid, dimBlock>>>(d_data, width, tex); + + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer); + printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer)); + printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6)); + sdkDeleteTimer(&timer); + + // allocate mem for the result on host side + float *h_odata = (float *)malloc(size); + // copy result
from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost)); + + // write regression file if necessary + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // write file for regression test + sdkWriteFile("./data/regression.dat", h_odata, width * width, 0.0f, false); + } + else { + printf("Comparing kernel output to expected data\n"); #define MIN_EPSILON_ERROR 5e-3f - bResult = - compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f); - } + bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f); + } - // cleanup memory - free(h_data); - free(h_data_ref); - free(h_odata); + // cleanup memory + free(h_data); + free(h_data_ref); + free(h_odata); - checkCudaErrors(cudaDestroyTextureObject(tex)); - checkCudaErrors(cudaFree(d_data)); - checkCudaErrors(cudaFreeArray(cu_3darray)); + checkCudaErrors(cudaDestroyTextureObject(tex)); + checkCudaErrors(cudaFree(d_data)); + checkCudaErrors(cudaFreeArray(cu_3darray)); - exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp b/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp index d1c03ef6..77bcf10a 100644 --- a/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp +++ b/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp @@ -33,12 +33,12 @@ */ // Includes +#include #include #include +#include #include #include -#include -#include // includes, project #include @@ -62,165 +62,165 @@ float *d_B; float *d_C; // Functions -int CleanupNoFailure(CUcontext &cuContext); +int CleanupNoFailure(CUcontext &cuContext); void RandomInit(float *, int); bool findModulePath(const char *, string &, char **, ostringstream &); -static void check(CUresult result, char const *const func, - const char *const file, int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, - static_cast<unsigned int>(result), func); - exit(EXIT_FAILURE); - } +static void check(CUresult result, char const *const func, const char *const file, int const line) +{ + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func); + exit(EXIT_FAILURE); + } } #define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__) // Host code -int main(int argc, char **argv) { - printf("simpleDrvRuntime..\n"); - int N = 50000, devID = 0; - size_t size = N * sizeof(float); - CUdevice cuDevice; - CUfunction vecAdd_kernel; - CUmodule cuModule = 0; - CUcontext cuContext; +int main(int argc, char **argv) +{ + printf("simpleDrvRuntime..\n"); + int N = 50000, devID = 0; + size_t size = N * sizeof(float); + CUdevice cuDevice; + CUfunction vecAdd_kernel; + CUmodule cuModule = 0; + CUcontext cuContext; - // Initialize - checkCudaDrvErrors(cuInit(0)); + // Initialize + checkCudaDrvErrors(cuInit(0)); - cuDevice = findCudaDevice(argc, (const char **)argv); - // Create context - checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + cuDevice = findCudaDevice(argc, (const char **)argv); + // Create context + checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice)); - // first search for the module path before we load the results - string module_path; - ostringstream fatbin; + // first search for the module path before we load the results + string module_path; + ostringstream fatbin; - if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) { - exit(EXIT_FAILURE); - } else { - printf("> initCUDA
loading module: <%s>\n", module_path.c_str()); - } - - if (!fatbin.str().size()) { - printf("fatbin file empty. exiting..\n"); - exit(EXIT_FAILURE); - } - - // Create module from binary file (FATBIN) - checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); - - // Get function handle from module - checkCudaDrvErrors( - cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); - - // Allocate input vectors h_A and h_B in host memory - checkCudaErrors(cudaMallocHost(&h_A, size)); - checkCudaErrors(cudaMallocHost(&h_B, size)); - checkCudaErrors(cudaMallocHost(&h_C, size)); - - // Initialize input vectors - RandomInit(h_A, N); - RandomInit(h_B, N); - - // Allocate vectors in device memory - checkCudaErrors(cudaMalloc((void **)(&d_A), size)); - checkCudaErrors(cudaMalloc((void **)(&d_B), size)); - checkCudaErrors(cudaMalloc((void **)(&d_C), size)); - - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - // Copy vectors from host memory to device memory - checkCudaErrors( - cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream)); - checkCudaErrors( - cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream)); - - int threadsPerBlock = 256; - int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; - - void *args[] = {&d_A, &d_B, &d_C, &N}; - - // Launch the CUDA kernel - checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, 0, stream, args, - NULL)); - - // Copy result from device memory to host memory - // h_C contains the result in host memory - checkCudaErrors( - cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - // Verify result - int i; - - for (i = 0; i < N; ++i) { - float sum = h_A[i] + h_B[i]; - - if (fabs(h_C[i] - sum) > 1e-7f) { - break; + if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) { + exit(EXIT_FAILURE); + } + else { + printf("> initCUDA loading module: <%s>\n", module_path.c_str()); } - } - checkCudaDrvErrors(cuModuleUnload(cuModule)); - CleanupNoFailure(cuContext); - printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); + if (!fatbin.str().size()) { + printf("fatbin file empty. exiting..\n"); + exit(EXIT_FAILURE); + } - exit((i == N) ? 
EXIT_SUCCESS : EXIT_FAILURE); + // Create module from binary file (FATBIN) + checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); + + // Get function handle from module + checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); + + // Allocate input vectors h_A and h_B in host memory + checkCudaErrors(cudaMallocHost(&h_A, size)); + checkCudaErrors(cudaMallocHost(&h_B, size)); + checkCudaErrors(cudaMallocHost(&h_C, size)); + + // Initialize input vectors + RandomInit(h_A, N); + RandomInit(h_B, N); + + // Allocate vectors in device memory + checkCudaErrors(cudaMalloc((void **)(&d_A), size)); + checkCudaErrors(cudaMalloc((void **)(&d_B), size)); + checkCudaErrors(cudaMalloc((void **)(&d_C), size)); + + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + // Copy vectors from host memory to device memory + checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream)); + + int threadsPerBlock = 256; + int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + + void *args[] = {&d_A, &d_B, &d_C, &N}; + + // Launch the CUDA kernel + checkCudaDrvErrors( + cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL)); + + // Copy result from device memory to host memory + // h_C contains the result in host memory + checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + // Verify result + int i; + + for (i = 0; i < N; ++i) { + float sum = h_A[i] + h_B[i]; + + if (fabs(h_C[i] - sum) > 1e-7f) { + break; + } + } + + checkCudaDrvErrors(cuModuleUnload(cuModule)); + CleanupNoFailure(cuContext); + printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); + + exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); } -int CleanupNoFailure(CUcontext &cuContext) { - // Free device memory - checkCudaErrors(cudaFree(d_A)); - checkCudaErrors(cudaFree(d_B)); - checkCudaErrors(cudaFree(d_C)); +int CleanupNoFailure(CUcontext &cuContext) +{ + // Free device memory + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); - // Free host memory - if (h_A) { - checkCudaErrors(cudaFreeHost(h_A)); - } + // Free host memory + if (h_A) { + checkCudaErrors(cudaFreeHost(h_A)); + } - if (h_B) { - checkCudaErrors(cudaFreeHost(h_B)); - } + if (h_B) { + checkCudaErrors(cudaFreeHost(h_B)); + } - if (h_C) { - checkCudaErrors(cudaFreeHost(h_C)); - } + if (h_C) { + checkCudaErrors(cudaFreeHost(h_C)); + } - checkCudaDrvErrors(cuCtxDestroy(cuContext)); + checkCudaDrvErrors(cuCtxDestroy(cuContext)); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } // Allocates an array with random float entries. 
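// An aside on the cuLaunchKernel call above: kernelParams is an array of
// pointers to the arguments, in declaration order; the driver reads each value
// through its pointer at launch time. A fragment restating that launch,
// assuming vecAdd_kernel was loaded via cuModuleGetFunction as in the sample:
//
//     CUdeviceptr dA, dB, dC;                   // device buffers (assumed allocated)
//     int         n      = 50000;
//     void       *args[] = {&dA, &dB, &dC, &n}; // addresses of arguments, not values
//     // grid x/y/z, block x/y/z, shared-mem bytes, stream, params, extra
//     checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, (n + 255) / 256, 1, 1, 256, 1, 1, 0, NULL, args, NULL));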
-void RandomInit(float *data, int n) { - for (int i = 0; i < n; ++i) { - data[i] = rand() / (float)RAND_MAX; - } -} - -bool inline findModulePath(const char *module_file, string &module_path, - char **argv, ostringstream &ostrm) { - char *actual_path = sdkFindFilePath(module_file, argv[0]); - - if (actual_path) { - module_path = actual_path; - } else { - printf("> findModulePath file not found: <%s> \n", module_file); - return false; - } - - if (module_path.empty()) { - printf("> findModulePath could not find file: <%s> \n", module_file); - return false; - } else { - printf("> findModulePath found file at <%s>\n", module_path.c_str()); - if (module_path.rfind("fatbin") != string::npos) { - ifstream fileIn(module_path.c_str(), ios::binary); - ostrm << fileIn.rdbuf(); +void RandomInit(float *data, int n) +{ + for (int i = 0; i < n; ++i) { + data[i] = rand() / (float)RAND_MAX; + } +} + +bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm) +{ + char *actual_path = sdkFindFilePath(module_file, argv[0]); + + if (actual_path) { + module_path = actual_path; + } + else { + printf("> findModulePath file not found: <%s> \n", module_file); + return false; + } + + if (module_path.empty()) { + printf("> findModulePath could not find file: <%s> \n", module_file); + return false; + } + else { + printf("> findModulePath found file at <%s>\n", module_path.c_str()); + if (module_path.rfind("fatbin") != string::npos) { + ifstream fileIn(module_path.c_str(), ios::binary); + ostrm << fileIn.rdbuf(); + } + return true; } - return true; - } } diff --git a/Samples/0_Introduction/simpleDrvRuntime/vectorAdd_kernel.cu b/Samples/0_Introduction/simpleDrvRuntime/vectorAdd_kernel.cu index 8dba27ac..00662fa4 100644 --- a/Samples/0_Introduction/simpleDrvRuntime/vectorAdd_kernel.cu +++ b/Samples/0_Introduction/simpleDrvRuntime/vectorAdd_kernel.cu @@ -34,9 +34,10 @@ */ // Device code -extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, - float *C, int N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; +extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) C[i] = A[i] + B[i]; + if (i < N) + C[i] = A[i] + B[i]; } diff --git a/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu b/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu index 0447eb7e..2972d88b 100644 --- a/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu +++ b/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu @@ -44,188 +44,188 @@ const char *sSDKsample = "hyperQ"; // This subroutine does no real work but runs for at least the specified number // of clock ticks. -__device__ void clock_block(clock_t *d_o, clock_t clock_count) { - unsigned int start_clock = (unsigned int)clock(); +__device__ void clock_block(clock_t *d_o, clock_t clock_count) +{ + unsigned int start_clock = (unsigned int)clock(); - clock_t clock_offset = 0; + clock_t clock_offset = 0; - while (clock_offset < clock_count) { - unsigned int end_clock = (unsigned int)clock(); + while (clock_offset < clock_count) { + unsigned int end_clock = (unsigned int)clock(); - // The code below should work like - // this (thanks to modular arithmetics): - // - // clock_offset = (clock_t) (end_clock > start_clock ? - // end_clock - start_clock : - // end_clock + (0xffffffffu - start_clock)); - // - // Indeed, let m = 2^32 then - // end - start = end + m - start (mod m). 
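// An aside on the wraparound arithmetic discussed above: with 32-bit unsigned
// values, end - start stays correct modulo 2^32 even when the counter wraps.
// A tiny host-side check (not part of the patch):

#include <cstdio>

int main()
{
    unsigned int start   = 0xFFFFFFF0u; // counter just before wrapping
    unsigned int end     = 0x00000010u; // counter just after wrapping
    unsigned int elapsed = end - start; // 0x20 == end + 2^32 - start (mod 2^32)
    printf("elapsed = 0x%X\n", elapsed);
    return 0;
}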
+ // The code below should work like + // this (thanks to modular arithmetic): + // + // clock_offset = (clock_t) (end_clock > start_clock ? + // end_clock - start_clock : + // end_clock + (0xffffffffu - start_clock)); + // + // Indeed, let m = 2^32 then + // end - start = end + m - start (mod m). - clock_offset = (clock_t)(end_clock - start_clock); - } + clock_offset = (clock_t)(end_clock - start_clock); + } - d_o[0] = clock_offset; + d_o[0] = clock_offset; } // We create two identical kernels calling clock_block(), we create two so that // we can identify dependencies in the profile timeline ("kernel_B" is always // dependent on "kernel_A" in the same stream). -__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { - clock_block(d_o, clock_count); -} -__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { - clock_block(d_o, clock_count); -} +__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); } +__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); } // Single-warp reduction kernel (note: this is not optimized for simplicity) -__global__ void sum(clock_t *d_clocks, int N) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ clock_t s_clocks[32]; +__global__ void sum(clock_t *d_clocks, int N) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ clock_t s_clocks[32]; - clock_t my_sum = 0; + clock_t my_sum = 0; - for (int i = threadIdx.x; i < N; i += blockDim.x) { - my_sum += d_clocks[i]; - } - - s_clocks[threadIdx.x] = my_sum; - cg::sync(cta); - - for (int i = warpSize / 2; i > 0; i /= 2) { - if (threadIdx.x < i) { - s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i]; + for (int i = threadIdx.x; i < N; i += blockDim.x) { + my_sum += d_clocks[i]; } + s_clocks[threadIdx.x] = my_sum; cg::sync(cta); - } + for (int i = warpSize / 2; i > 0; i /= 2) { + if (threadIdx.x < i) { + s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i]; + } - if (threadIdx.x == 0) { - d_clocks[0] = s_clocks[0]; - } -} + cg::sync(cta); } -int main(int argc, char **argv) { - int nstreams = 32; // One stream for each pair of kernels - float kernel_time = 10; // Time each kernel should run in ms - float elapsed_time; - int cuda_device = 0; - - printf("starting %s...\n", sSDKsample); - - // Get number of streams (if overridden on the command line) - if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) { - nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams"); - } - - // Use command-line specified CUDA device, otherwise use device with - // highest Gflops/s - cuda_device = findCudaDevice(argc, (const char **)argv); - - // Get device properties - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDevice(&cuda_device)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); - - // HyperQ is available in devices of Compute Capability 3.5 and higher - if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { - if (deviceProp.concurrentKernels == 0) { - printf( - "> GPU does not support concurrent kernel execution (SM 3.5 or " - "higher required)\n"); - printf(" CUDA kernel runs will be serialized\n"); - } else { - printf("> GPU does not support HyperQ\n"); - printf(" CUDA kernel runs will have limited concurrency\n"); } } - printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n", - deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); - // Allocate
host memory for the output (reduced to a single value) - clock_t *a = 0; - checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t))); - - // Allocate device memory for the output (one value for each kernel) - clock_t *d_a = 0; - checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t))); - - // Allocate and initialize an array of stream handles - cudaStream_t *streams = - (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t)); - - for (int i = 0; i < nstreams; i++) { - checkCudaErrors(cudaStreamCreate(&(streams[i]))); - } - - // Create CUDA event handles - cudaEvent_t start_event, stop_event; - checkCudaErrors(cudaEventCreate(&start_event)); - checkCudaErrors(cudaEventCreate(&stop_event)); - - // Target time per kernel is kernel_time ms, clockRate is in KHz - // Target number of clocks = target time * clock frequency -#if defined(__arm__) || defined(__aarch64__) - // the kernel takes more time than the channel reset time on arm archs, so to - // prevent hangs reduce time_clocks. - clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100)); -#else - clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate); -#endif - clock_t total_clocks = 0; - - // Start the clock - checkCudaErrors(cudaEventRecord(start_event, 0)); - - // Queue pairs of {kernel_A, kernel_B} in separate streams - for (int i = 0; i < nstreams; ++i) { - kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks); - total_clocks += time_clocks; - kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks); - total_clocks += time_clocks; - } - - // Stop the clock in stream 0 (i.e. all previous kernels will be complete) - checkCudaErrors(cudaEventRecord(stop_event, 0)); - - // At this point the CPU has dispatched all work for the GPU and can - // continue processing other tasks in parallel. In this sample we just want - // to wait until all work is done so we use a blocking cudaMemcpy below. - - // Run the sum kernel and copy the result back to host - sum<<<1, 32>>>(d_a, 2 * nstreams); - checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost)); - - // stop_event will have been recorded but including the synchronize here to - // prevent copy/paste errors! - checkCudaErrors(cudaEventSynchronize(stop_event)); - checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); - - printf( - "Expected time for serial execution of %d sets of kernels is between " - "approx. %.3fs and %.3fs\n", - nstreams, (nstreams + 1) * kernel_time / 1000.0f, - 2 * nstreams * kernel_time / 1000.0f); - printf( - "Expected time for fully concurrent execution of %d sets of kernels is " - "approx. %.3fs\n", - nstreams, 2 * kernel_time / 1000.0f); - printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f); - - bool bTestResult = (a[0] >= total_clocks); - - // Release resources - for (int i = 0; i < nstreams; i++) { - cudaStreamDestroy(streams[i]); - } - - free(streams); - cudaEventDestroy(start_event); - cudaEventDestroy(stop_event); - cudaFreeHost(a); - cudaFree(d_a); - - exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); + if (threadIdx.x == 0) { + d_clocks[0] = s_clocks[0]; + } +} + +int main(int argc, char **argv) +{ + int nstreams = 32; // One stream for each pair of kernels + float kernel_time = 10; // Time each kernel should run in ms + float elapsed_time; + int cuda_device = 0; + + printf("starting %s...\n", sSDKsample); + + // Get number of streams (if overridden on the command line) + if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) { + nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams"); + } + + // Use command-line specified CUDA device, otherwise use device with + // highest Gflops/s + cuda_device = findCudaDevice(argc, (const char **)argv); + + // Get device properties + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDevice(&cuda_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); + + // HyperQ is available in devices of Compute Capability 3.5 and higher + if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { + if (deviceProp.concurrentKernels == 0) { + printf("> GPU does not support concurrent kernel execution (SM 3.5 or " + "higher required)\n"); + printf(" CUDA kernel runs will be serialized\n"); + } + else { + printf("> GPU does not support HyperQ\n"); + printf(" CUDA kernel runs will have limited concurrency\n"); + } + } + + printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n", + deviceProp.major, + deviceProp.minor, + deviceProp.multiProcessorCount); + + // Allocate host memory for the output (reduced to a single value) + clock_t *a = 0; + checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t))); + + // Allocate device memory for the output (one value for each kernel) + clock_t *d_a = 0; + checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t))); + + // Allocate and initialize an array of stream handles + cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t)); + + for (int i = 0; i < nstreams; i++) { + checkCudaErrors(cudaStreamCreate(&(streams[i]))); + } + + // Create CUDA event handles + cudaEvent_t start_event, stop_event; + checkCudaErrors(cudaEventCreate(&start_event)); + checkCudaErrors(cudaEventCreate(&stop_event)); + + // Target time per kernel is kernel_time ms, clockRate is in KHz + // Target number of clocks = target time * clock frequency +#if defined(__arm__) || defined(__aarch64__) + // the kernel takes more time than the channel reset time on arm archs, so to + // prevent hangs reduce time_clocks. + clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100)); +#else + clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate); +#endif + clock_t total_clocks = 0; + + // Start the clock + checkCudaErrors(cudaEventRecord(start_event, 0)); + + // Queue pairs of {kernel_A, kernel_B} in separate streams + for (int i = 0; i < nstreams; ++i) { + kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks); + total_clocks += time_clocks; + kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks); + total_clocks += time_clocks; + } + + // Stop the clock in stream 0 (i.e. all previous kernels will be complete) + checkCudaErrors(cudaEventRecord(stop_event, 0)); + + // At this point the CPU has dispatched all work for the GPU and can + // continue processing other tasks in parallel. In this sample we just want + // to wait until all work is done so we use a blocking cudaMemcpy below. 
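// Worked numbers for the bounds printed below, at the defaults
// (nstreams = 32, kernel_time = 10 ms):
//   serial lower bound: (32 + 1) * 10 ms = 0.330 s (adjacent kernels from
//                       different streams can still overlap pairwise)
//   serial upper bound:  2 * 32 * 10 ms  = 0.640 s
//   fully concurrent:         2 * 10 ms  = 0.020 s
// On a HyperQ-capable GPU the measured time should land near 0.020 s.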
+ + // Run the sum kernel and copy the result back to host + sum<<<1, 32>>>(d_a, 2 * nstreams); + checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost)); + + // stop_event will have been recorded but including the synchronize here to + // prevent copy/paste errors! + checkCudaErrors(cudaEventSynchronize(stop_event)); + checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); + + printf("Expected time for serial execution of %d sets of kernels is between " + "approx. %.3fs and %.3fs\n", + nstreams, + (nstreams + 1) * kernel_time / 1000.0f, + 2 * nstreams * kernel_time / 1000.0f); + printf("Expected time for fully concurrent execution of %d sets of kernels is " + "approx. %.3fs\n", + nstreams, + 2 * kernel_time / 1000.0f); + printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f); + + bool bTestResult = (a[0] >= total_clocks); + + // Release resources + for (int i = 0; i < nstreams; i++) { + cudaStreamDestroy(streams[i]); + } + + free(streams); + cudaEventDestroy(start_event); + cudaEventDestroy(stop_event); + cudaFreeHost(a); + cudaFree(d_a); + + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/simpleIPC/README.md b/Samples/0_Introduction/simpleIPC/README.md index 8c3640f4..ef5ec6cf 100644 --- a/Samples/0_Introduction/simpleIPC/README.md +++ b/Samples/0_Introduction/simpleIPC/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/0_Introduction/simpleIPC/simpleIPC.cu b/Samples/0_Introduction/simpleIPC/simpleIPC.cu index 5261e509..1e2a9a93 100644 --- a/Samples/0_Introduction/simpleIPC/simpleIPC.cu +++ b/Samples/0_Introduction/simpleIPC/simpleIPC.cu @@ -32,6 +32,7 @@ #include <stdio.h> #include <stdlib.h> #include <vector> + #include "helper_cuda.h" #include "helper_multiprocess.h" static const char shmName[] = "simpleIPCshm"; @@ -39,7 +40,7 @@ static const char shmName[] = "simpleIPCshm"; // For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited // in the same way.
#define MAX_DEVICES (32) -#define DATA_SIZE (64ULL << 20ULL) // 64MB +#define DATA_SIZE (64ULL << 20ULL) // 64MB #if defined(__linux__) #define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x) @@ -49,281 +50,280 @@ static const char shmName[] = "simpleIPCshm"; #error Unsupported system #endif -typedef struct shmStruct_st { - size_t nprocesses; - int barrier; - int sense; - int devices[MAX_DEVICES]; - cudaIpcMemHandle_t memHandle[MAX_DEVICES]; - cudaIpcEventHandle_t eventHandle[MAX_DEVICES]; +typedef struct shmStruct_st +{ + size_t nprocesses; + int barrier; + int sense; + int devices[MAX_DEVICES]; + cudaIpcMemHandle_t memHandle[MAX_DEVICES]; + cudaIpcEventHandle_t eventHandle[MAX_DEVICES]; } shmStruct; -__global__ void simpleKernel(char *ptr, int sz, char val) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - for (; idx < sz; idx += (gridDim.x * blockDim.x)) { - ptr[idx] = val; - } +__global__ void simpleKernel(char *ptr, int sz, char val) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (; idx < sz; idx += (gridDim.x * blockDim.x)) { + ptr[idx] = val; + } } -static void barrierWait(volatile int *barrier, volatile int *sense, - unsigned int n) { - int count; +static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n) +{ + int count; - // Check-in - count = cpu_atomic_add32(barrier, 1); - if (count == n) // Last one in - *sense = 1; - while (!*sense) - ; + // Check-in + count = cpu_atomic_add32(barrier, 1); + if (count == n) // Last one in + *sense = 1; + while (!*sense) + ; - // Check-out - count = cpu_atomic_add32(barrier, -1); - if (count == 0) // Last one out - *sense = 0; - while (*sense) - ; + // Check-out + count = cpu_atomic_add32(barrier, -1); + if (count == 0) // Last one out + *sense = 0; + while (*sense) + ; } -static void childProcess(int id) { - volatile shmStruct *shm = NULL; - cudaStream_t stream; - sharedMemoryInfo info; - size_t procCount, i; - int blocks = 0; - int threads = 128; - cudaDeviceProp prop; - std::vector<void *> ptrs; - std::vector<cudaEvent_t> events; - std::vector<char> verification_buffer(DATA_SIZE); +static void childProcess(int id) +{ + volatile shmStruct *shm = NULL; + cudaStream_t stream; + sharedMemoryInfo info; + size_t procCount, i; + int blocks = 0; + int threads = 128; + cudaDeviceProp prop; + std::vector<void *> ptrs; + std::vector<cudaEvent_t> events; + std::vector<char> verification_buffer(DATA_SIZE); - if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { - printf("Failed to create shared memory slab\n"); - exit(EXIT_FAILURE); - } - shm = (volatile shmStruct *)info.addr; - procCount = shm->nprocesses; - - printf("Process %d: Starting on device %d...\n", id, shm->devices[id]); - - checkCudaErrors(cudaSetDevice(shm->devices[id])); - checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id])); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks, simpleKernel, threads, 0)); - blocks *= prop.multiProcessorCount; - - // Open and track all the allocations and events created in the master - // process for use later - for (i = 0; i < procCount; i++) { - void *ptr = NULL; - cudaEvent_t event; - - // Notice, we don't need to explicitly enable peer access for - // allocations on other devices.
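// An aside on the IPC round trip opened just below: the exporting process turns
// a cudaMalloc'd pointer into an opaque handle, ships the handle bytes over any
// host IPC channel (POSIX shared memory here), and the importer maps it into its
// own address space. A minimal two-ended sketch, transport elided (not the
// sample's code):

// process A (exporter)
void *d_buf = NULL;
cudaIpcMemHandle_t handle;
checkCudaErrors(cudaMalloc(&d_buf, 1 << 20));
checkCudaErrors(cudaIpcGetMemHandle(&handle, d_buf)); // send 'handle' bytes to B

// process B (importer), after receiving 'handle'
void *mapped = NULL;
checkCudaErrors(cudaIpcOpenMemHandle(&mapped, handle, cudaIpcMemLazyEnablePeerAccess));
// ... launch kernels and copies on 'mapped' ...
checkCudaErrors(cudaIpcCloseMemHandle(mapped));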
- checkCudaErrors( - cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], - cudaIpcMemLazyEnablePeerAccess)); - checkCudaErrors(cudaIpcOpenEventHandle( - &event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i])); - - ptrs.push_back(ptr); - events.push_back(event); - } - - // At each iteration of the loop, each sibling process will push work on - // their respective devices accessing the next peer mapped buffer allocated - // by the master process (these can come from other sibling processes as - // well). To coordinate each process' access, we force the stream to wait for - // the work already accessing this buffer asynchronously through IPC events, - // allowing the CPU processes to continue to queue more work. - for (i = 0; i < procCount; i++) { - size_t bufferId = (i + id) % procCount; - // Wait for the buffer to be accessed to be ready - checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0)); - // Push a simple kernel on it - simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], - DATA_SIZE, id); - checkCudaErrors(cudaGetLastError()); - // Signal that this buffer is ready for the next consumer - checkCudaErrors(cudaEventRecord(events[bufferId], stream)); - // Wait for all my sibling processes to push this stage of their work - // before proceeding to the next. This prevents siblings from racing - // ahead and clobbering the recorded event or waiting on the wrong - // recorded event. - barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount); - if (id == 0) { - printf("Step %lld done\n", (unsigned long long)i); + if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); } - } + shm = (volatile shmStruct *)info.addr; + procCount = shm->nprocesses; - // Now wait for my buffer to be ready so I can copy it locally and verify it - checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0)); - checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, - cudaMemcpyDeviceToHost, stream)); - // And wait for all the queued up work to complete - checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Process %d: Starting on device %d...\n", id, shm->devices[id]); - printf("Process %d: verifying...\n", id); + checkCudaErrors(cudaSetDevice(shm->devices[id])); + checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id])); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0)); + blocks *= prop.multiProcessorCount; - // The contents should have the id of the sibling just after me - char compareId = (char)((id + 1) % procCount); - for (unsigned long long j = 0; j < DATA_SIZE; j++) { - if (verification_buffer[j] != compareId) { - printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j, - (int)verification_buffer[j], (int)compareId); + // Open and track all the allocations and events created in the master + // process for use later + for (i = 0; i < procCount; i++) { + void *ptr = NULL; + cudaEvent_t event; + + // Notice, we don't need to explicitly enable peer access for + // allocations on other devices. + checkCudaErrors( + cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess)); + checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i])); + + ptrs.push_back(ptr); + events.push_back(event); } - } - // Clean up!
- for (i = 0; i < procCount; i++) { - checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i])); - checkCudaErrors(cudaEventDestroy(events[i])); - } + // At each iteration of the loop, each sibling process will push work on + // their respective devices accessing the next peer mapped buffer allocated + // by the master process (these can come from other sibling processes as + // well). To coordinate each process' access, we force the stream to wait for + // the work already accessing this buffer asynchronously through IPC events, + // allowing the CPU processes to continue to queue more work. + for (i = 0; i < procCount; i++) { + size_t bufferId = (i + id) % procCount; + // Wait for the buffer to be accessed to be ready + checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0)); + // Push a simple kernel on it + simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id); + checkCudaErrors(cudaGetLastError()); + // Signal that this buffer is ready for the next consumer + checkCudaErrors(cudaEventRecord(events[bufferId], stream)); + // Wait for all my sibling processes to push this stage of their work + // before proceeding to the next. This prevents siblings from racing + // ahead and clobbering the recorded event or waiting on the wrong + // recorded event. + barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount); + if (id == 0) { + printf("Step %lld done\n", (unsigned long long)i); + } + } - checkCudaErrors(cudaStreamDestroy(stream)); + // Now wait for my buffer to be ready so I can copy it locally and verify it + checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0)); + checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream)); + // And wait for all the queued up work to complete + checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Process %d complete!\n", id); + printf("Process %d: verifying...\n", id); + + // The contents should have the id of the sibling just after me + char compareId = (char)((id + 1) % procCount); + for (unsigned long long j = 0; j < DATA_SIZE; j++) { + if (verification_buffer[j] != compareId) { + printf("Process %d: Verification mismatch at %lld: %d != %d\n", + id, + j, + (int)verification_buffer[j], + (int)compareId); + } + } + + // Clean up! + for (i = 0; i < procCount; i++) { + checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i])); + checkCudaErrors(cudaEventDestroy(events[i])); + } + + checkCudaErrors(cudaStreamDestroy(stream)); + + printf("Process %d complete!\n", id); } -static void parentProcess(char *app) { - sharedMemoryInfo info; - int devCount, i; - volatile shmStruct *shm = NULL; - std::vector<void *> ptrs; - std::vector<cudaEvent_t> events; - std::vector<Process> processes; +static void parentProcess(char *app) +{ + sharedMemoryInfo info; + int devCount, i; + volatile shmStruct *shm = NULL; + std::vector<void *> ptrs; + std::vector<cudaEvent_t> events; + std::vector<Process> processes; - checkCudaErrors(cudaGetDeviceCount(&devCount)); + checkCudaErrors(cudaGetDeviceCount(&devCount)); - if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { - printf("Failed to create shared memory slab\n"); - exit(EXIT_FAILURE); - } - shm = (volatile shmStruct *)info.addr; - memset((void *)shm, 0, sizeof(*shm)); - - // Pick all the devices that can access each other's memory for this test - // Keep in mind that CUDA has minimal support for fork() without a - // corresponding exec() in the child process, but in this case our - // spawnProcess will always exec, so no need to worry.
- for (i = 0; i < devCount; i++) { - bool allPeers = true; - cudaDeviceProp prop; - checkCudaErrors(cudaGetDeviceProperties(&prop, i)); - - // CUDA IPC is only supported on devices with unified addressing - if (!prop.unifiedAddressing) { - printf("Device %d does not support unified addressing, skipping...\n", i); - continue; + if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); } - // This sample requires two processes accessing each device, so we need - // to ensure exclusive or prohibited mode is not set - if (prop.computeMode != cudaComputeModeDefault) { - printf("Device %d is in an unsupported compute mode for this sample\n", - i); - continue; + shm = (volatile shmStruct *)info.addr; + memset((void *)shm, 0, sizeof(*shm)); + + // Pick all the devices that can access each other's memory for this test + // Keep in mind that CUDA has minimal support for fork() without a + // corresponding exec() in the child process, but in this case our + // spawnProcess will always exec, so no need to worry. + for (i = 0; i < devCount; i++) { + bool allPeers = true; + cudaDeviceProp prop; + checkCudaErrors(cudaGetDeviceProperties(&prop, i)); + + // CUDA IPC is only supported on devices with unified addressing + if (!prop.unifiedAddressing) { + printf("Device %d does not support unified addressing, skipping...\n", i); + continue; + } + // This sample requires two processes accessing each device, so we need + // to ensure exclusive or prohibited mode is not set + if (prop.computeMode != cudaComputeModeDefault) { + printf("Device %d is in an unsupported compute mode for this sample\n", i); + continue; + } + + for (int j = 0; j < shm->nprocesses; j++) { + int canAccessPeerIJ, canAccessPeerJI; + checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i)); + checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j])); + if (!canAccessPeerIJ || !canAccessPeerJI) { + allPeers = false; + break; + } + } + if (allPeers) { + // Enable peers here. This isn't necessary for IPC, but it will + // setup the peers for the device. For systems that only allow 8 + // peers per GPU at a time, this acts to remove devices from CanAccessPeer + for (int j = 0; j < shm->nprocesses; j++) { + checkCudaErrors(cudaSetDevice(i)); + checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0)); + checkCudaErrors(cudaSetDevice(shm->devices[j])); + checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0)); + } + shm->devices[shm->nprocesses++] = i; + if (shm->nprocesses >= MAX_DEVICES) + break; + } + else { + printf("Device %d is not peer capable with some other selected peers, " + "skipping\n", + i); + } } - for (int j = 0; j < shm->nprocesses; j++) { - int canAccessPeerIJ, canAccessPeerJI; - checkCudaErrors( - cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i)); - checkCudaErrors( - cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j])); - if (!canAccessPeerIJ || !canAccessPeerJI) { - allPeers = false; - break; - } - } - if (allPeers) { - // Enable peers here. This isn't necessary for IPC, but it will - // setup the peers for the device. 
For systems that only allow 8 - // peers per GPU at a time, this acts to remove devices from CanAccessPeer - for (int j = 0; j < shm->nprocesses; j++) { - checkCudaErrors(cudaSetDevice(i)); - checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0)); - checkCudaErrors(cudaSetDevice(shm->devices[j])); - checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0)); - } - shm->devices[shm->nprocesses++] = i; - if (shm->nprocesses >= MAX_DEVICES) break; - } else { - printf( - "Device %d is not peer capable with some other selected peers, " - "skipping\n", - i); - } - } - - if (shm->nprocesses == 0) { - printf("No CUDA devices support IPC\n"); - exit(EXIT_WAIVED); - } - - // Now allocate memory and an event for each process and fill the shared - // memory buffer with the IPC handles to communicate - for (i = 0; i < shm->nprocesses; i++) { - void *ptr = NULL; - cudaEvent_t event; - - checkCudaErrors(cudaSetDevice(shm->devices[i])); - checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE)); - checkCudaErrors( - cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr)); - checkCudaErrors(cudaEventCreate( - &event, cudaEventDisableTiming | cudaEventInterprocess)); - checkCudaErrors(cudaIpcGetEventHandle( - (cudaIpcEventHandle_t *)&shm->eventHandle[i], event)); - - ptrs.push_back(ptr); - events.push_back(event); - } - - // Launch the child processes! - for (i = 0; i < shm->nprocesses; i++) { - char devIdx[12]; // Increased size to ensure enough space for formatted integer - char *const args[] = {app, devIdx, NULL}; - Process process; - - snprintf(devIdx, sizeof(devIdx), "%d", i); - - if (spawnProcess(&process, app, args)) { - printf("Failed to create process\n"); - exit(EXIT_FAILURE); + if (shm->nprocesses == 0) { + printf("No CUDA devices support IPC\n"); + exit(EXIT_WAIVED); } - processes.push_back(process); - } + // Now allocate memory and an event for each process and fill the shared + // memory buffer with the IPC handles to communicate + for (i = 0; i < shm->nprocesses; i++) { + void *ptr = NULL; + cudaEvent_t event; - // And wait for them to finish - for (i = 0; i < processes.size(); i++) { - if (waitProcess(&processes[i]) != EXIT_SUCCESS) { - printf("Process %d failed!\n", i); - exit(EXIT_FAILURE); + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE)); + checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr)); + checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess)); + checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event)); + + ptrs.push_back(ptr); + events.push_back(event); } - } - // Clean up! - for (i = 0; i < shm->nprocesses; i++) { - checkCudaErrors(cudaSetDevice(shm->devices[i])); - checkCudaErrors(cudaEventSynchronize(events[i])); - checkCudaErrors(cudaEventDestroy(events[i])); - checkCudaErrors(cudaFree(ptrs[i])); - } + // Launch the child processes! 
+    for (i = 0; i < shm->nprocesses; i++) {
+        char devIdx[12]; // Increased size to ensure enough space for formatted integer
+        char *const args[] = {app, devIdx, NULL};
+        Process process;

-  sharedMemoryClose(&info);
+        snprintf(devIdx, sizeof(devIdx), "%d", i);
+
+        if (spawnProcess(&process, app, args)) {
+            printf("Failed to create process\n");
+            exit(EXIT_FAILURE);
+        }
+
+        processes.push_back(process);
+    }
+
+    // And wait for them to finish
+    for (i = 0; i < processes.size(); i++) {
+        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
+            printf("Process %d failed!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    // Clean up!
+    for (i = 0; i < shm->nprocesses; i++) {
+        checkCudaErrors(cudaSetDevice(shm->devices[i]));
+        checkCudaErrors(cudaEventSynchronize(events[i]));
+        checkCudaErrors(cudaEventDestroy(events[i]));
+        checkCudaErrors(cudaFree(ptrs[i]));
+    }
+
+    sharedMemoryClose(&info);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 #if defined(__arm__) || defined(__aarch64__)
-  printf("Not supported on ARM\n");
-  return EXIT_WAIVED;
+    printf("Not supported on ARM\n");
+    return EXIT_WAIVED;
 #else
-  if (argc == 1) {
-    parentProcess(argv[0]);
-  } else {
-    childProcess(atoi(argv[1]));
-  }
-  return EXIT_SUCCESS;
+    if (argc == 1) {
+        parentProcess(argv[0]);
+    }
+    else {
+        childProcess(atoi(argv[1]));
+    }
+    return EXIT_SUCCESS;
 #endif
 }
diff --git a/Samples/0_Introduction/simpleLayeredTexture/simpleLayeredTexture.cu b/Samples/0_Introduction/simpleLayeredTexture/simpleLayeredTexture.cu
index 797302e9..b7f17e05 100644
--- a/Samples/0_Introduction/simpleLayeredTexture/simpleLayeredTexture.cu
+++ b/Samples/0_Introduction/simpleLayeredTexture/simpleLayeredTexture.cu
@@ -26,27 +26,27 @@
  */

 /*
-* This sample demonstrates how to use texture fetches from layered 2D textures
-* in CUDA C
-*
-* This sample first generates a 3D input data array for the layered texture
-* and the expected output. Then it starts CUDA C kernels, one for each layer,
-* which fetch their layer's texture data (using normalized texture coordinates)
-* transform it to the expected output, and write it to a 3D output data array.
-*/
+ * This sample demonstrates how to use texture fetches from layered 2D textures
+ * in CUDA C
+ *
+ * This sample first generates a 3D input data array for the layered texture
+ * and the expected output. Then it starts CUDA C kernels, one for each layer,
+ * which fetch their layer's texture data (using normalized texture coordinates)
+ * transform it to the expected output, and write it to a 3D output data array.
+ */

 // includes, system
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes, kernels
 #include <cuda_runtime.h>

 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h>  // helper for shared that are common to CUDA Samples
+#include <helper_functions.h> // helper for shared that are common to CUDA Samples

 static const char *sSDKname = "simpleLayeredTexture";

@@ -54,163 +54,156 @@ static const char *sSDKname = "simpleLayeredTexture";
 ////////////////////////////////////////////////////////////////////////////////
 //! Transform a layer of a layered 2D texture using texture lookups
 //! @param g_odata  output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width, int height,
-                                int layer, cudaTextureObject_t tex) {
-  // calculate this thread's data point
-  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
+{
+    // calculate this thread's data point
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

-  // 0.5f offset and division are necessary to access the original data points
-  // in the texture (such that bilinear interpolation will not be activated).
-  // For details, see also CUDA Programming Guide, Appendix D
-  float u = (x + 0.5f) / (float)width;
-  float v = (y + 0.5f) / (float)height;
+    // 0.5f offset and division are necessary to access the original data points
+    // in the texture (such that bilinear interpolation will not be activated).
+    // For details, see also CUDA Programming Guide, Appendix D
+    float u = (x + 0.5f) / (float)width;
+    float v = (y + 0.5f) / (float)height;

-  // read from texture, do expected transformation and write to global memory
-  g_odata[layer * width * height + y * width + x] =
-      -tex2DLayered<float>(tex, u, v, layer) + layer;
+    // read from texture, do expected transformation and write to global memory
+    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  printf("[%s] - Starting...\n", sSDKname);
+int main(int argc, char **argv)
+{
+    printf("[%s] - Starting...\n", sSDKname);

-  // use command-line specified CUDA device, otherwise use device with highest
-  // Gflops/s
-  int devID = findCudaDevice(argc, (const char **)argv);
+    // use command-line specified CUDA device, otherwise use device with highest
+    // Gflops/s
+    int devID = findCudaDevice(argc, (const char **)argv);

-  bool bResult = true;
+    bool bResult = true;

-  // get number of SMs on this GPU
-  cudaDeviceProp deviceProps;
+    // get number of SMs on this GPU
+    cudaDeviceProp deviceProps;

-  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-  printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
-         deviceProps.multiProcessorCount);
-  printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
+    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

-  // generate input data for layered texture
-  unsigned int width = 512, height = 512, num_layers = 5;
-  unsigned int size = width * height * num_layers * sizeof(float);
-  float *h_data = (float *)malloc(size);
+    // generate input data for layered texture
+    unsigned int width = 512, height = 512, num_layers = 5;
+    unsigned int size = width * height * num_layers * sizeof(float);
+    float *h_data = (float *)malloc(size);

-  for (unsigned int layer = 0; layer < num_layers; layer++)
-    for (int i = 0; i < (int)(width * height); i++) {
-      h_data[layer * width * height + i] = (float)i;
-    }
+    for (unsigned int layer = 0; layer < num_layers; layer++)
+        for (int i = 0; i < (int)(width * height); i++) {
+            h_data[layer * width * height + i] = (float)i;
+        }

-  // this is the expected transformation of the input data (the expected output)
-  float *h_data_ref = (float *)malloc(size);
+    // this is the expected transformation of the input data (the expected output)
+    float *h_data_ref = (float *)malloc(size);

-  for (unsigned int layer = 0; layer < num_layers; layer++)
-    for (int i = 0; i < (int)(width * height); i++) {
-      h_data_ref[layer * width * height + i] =
-          -h_data[layer * width * height + i] + layer;
-    }
+    for (unsigned int layer = 0; layer < num_layers; layer++)
+        for (int i = 0; i < (int)(width * height); i++) {
+            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
+        }

-  // allocate device memory for result
-  float *d_data = NULL;
-  checkCudaErrors(cudaMalloc((void **)&d_data, size));
+    // allocate device memory for result
+    float *d_data = NULL;
+    checkCudaErrors(cudaMalloc((void **)&d_data, size));

-  // allocate array and copy image data
-  cudaChannelFormatDesc channelDesc =
-      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
-  cudaArray *cu_3darray;
-  checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
-                                    make_cudaExtent(width, height, num_layers),
-                                    cudaArrayLayered));
-  cudaMemcpy3DParms myparms = {0};
-  myparms.srcPos = make_cudaPos(0, 0, 0);
-  myparms.dstPos = make_cudaPos(0, 0, 0);
-  myparms.srcPtr =
-      make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
-  myparms.dstArray = cu_3darray;
-  myparms.extent = make_cudaExtent(width, height, num_layers);
-  myparms.kind = cudaMemcpyHostToDevice;
-  checkCudaErrors(cudaMemcpy3D(&myparms));
+    // allocate array and copy image data
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaArray *cu_3darray;
+    checkCudaErrors(
+        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
+    cudaMemcpy3DParms myparms = {0};
+    myparms.srcPos = make_cudaPos(0, 0, 0);
+    myparms.dstPos = make_cudaPos(0, 0, 0);
+    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
+    myparms.dstArray = cu_3darray;
+    myparms.extent = make_cudaExtent(width, height, num_layers);
+    myparms.kind = cudaMemcpyHostToDevice;
+    checkCudaErrors(cudaMemcpy3D(&myparms));

-  cudaTextureObject_t tex;
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    cudaTextureObject_t tex;
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));

-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = cu_3darray;
+    texRes.resType = cudaResourceTypeArray;
+    texRes.res.array.array = cu_3darray;

-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));

-  texDescr.normalizedCoords = true;
-  texDescr.filterMode = cudaFilterModeLinear;
-  texDescr.addressMode[0] = cudaAddressModeWrap;
-  texDescr.addressMode[1] = cudaAddressModeWrap;
-  texDescr.readMode = cudaReadModeElementType;
+    texDescr.normalizedCoords = true;
+    texDescr.filterMode = cudaFilterModeLinear;
+    texDescr.addressMode[0] = cudaAddressModeWrap;
+    texDescr.addressMode[1] = cudaAddressModeWrap;
+    texDescr.readMode = cudaReadModeElementType;

-  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

-  dim3 dimBlock(8, 8, 1);
-  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
+    dim3 dimBlock(8, 8, 1);
+    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

-  printf(
-      "Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
-      "8 x 8 threads\n",
-      width, height, dimGrid.x, dimGrid.y);
+    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
+           "8 x 8 threads\n",
+           width,
+           height,
+           dimGrid.x,
+           dimGrid.y);

-  transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
-                                         tex);  // warmup (for better timing)
+    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
+                                           tex); // warmup (for better timing)

-  // check if kernel execution generated an error
-  getLastCudaError("warmup Kernel execution failed");
+    // check if kernel execution generated an error
+    getLastCudaError("warmup Kernel execution failed");

-  checkCudaErrors(cudaDeviceSynchronize());
+    checkCudaErrors(cudaDeviceSynchronize());

-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
-  sdkStartTimer(&timer);
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);
+    sdkStartTimer(&timer);

-  // execute the kernel
-  for (unsigned int layer = 0; layer < num_layers; layer++)
-    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, layer,
-                                           tex);
+    // execute the kernel
+    for (unsigned int layer = 0; layer < num_layers; layer++)
+        transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, layer, tex);

-  // check if kernel execution generated an error
-  getLastCudaError("Kernel execution failed");
+    // check if kernel execution generated an error
+    getLastCudaError("Kernel execution failed");

-  checkCudaErrors(cudaDeviceSynchronize());
-  sdkStopTimer(&timer);
-  printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-  printf("%.2f Mtexlookups/sec\n",
-         (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) /
-          1e6));
-  sdkDeleteTimer(&timer);
+    checkCudaErrors(cudaDeviceSynchronize());
+    sdkStopTimer(&timer);
+    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
+    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
+    sdkDeleteTimer(&timer);

-  // allocate mem for the result on host side
-  float *h_odata = (float *)malloc(size);
-  // copy result from device to host
-  checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
+    // allocate mem for the result on host side
+    float *h_odata = (float *)malloc(size);
+    // copy result from device to host
+    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

-  // write regression file if necessary
-  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
-    // write file for regression test
-    sdkWriteFile("./data/regression.dat", h_odata, width * height, 0.0f,
-                 false);
-  } else {
-    printf("Comparing kernel output to expected data\n");
+    // write regression file if necessary
+    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
+        // write file for regression test
+        sdkWriteFile("./data/regression.dat", h_odata, width * height, 0.0f, false);
+    }
+    else {
+        printf("Comparing kernel output to expected data\n");

 #define MIN_EPSILON_ERROR 5e-3f
-    bResult = compareData(h_odata, h_data_ref, width * height * num_layers,
-                          MIN_EPSILON_ERROR, 0.0f);
-  }
+        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
+    }

-  // cleanup memory
-  free(h_data);
-  free(h_data_ref);
-  free(h_odata);
+    // cleanup memory
+    free(h_data);
+    free(h_data_ref);
+    free(h_odata);

-  checkCudaErrors(cudaDestroyTextureObject(tex));
-  checkCudaErrors(cudaFree(d_data));
-  checkCudaErrors(cudaFreeArray(cu_3darray));
+    checkCudaErrors(cudaDestroyTextureObject(tex));
+    checkCudaErrors(cudaFree(d_data));
+    checkCudaErrors(cudaFreeArray(cu_3darray));

-  exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
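// -----------------------------------------------------------------------------
// Aside (not part of the patch): the layered-texture setup above, condensed to
// its three essential steps. Names and sizes are illustrative; error checking
// is omitted for brevity.
#include <cuda_runtime.h>
#include <cstring>

cudaTextureObject_t makeLayeredTexture(const float *h_data, unsigned w, unsigned h, unsigned layers)
{
    // 1) A layered cudaArray: the depth of the extent is the layer count.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t arr;
    cudaMalloc3DArray(&arr, &desc, make_cudaExtent(w, h, layers), cudaArrayLayered);

    // 2) Copy host data into all layers with a single cudaMemcpy3D.
    cudaMemcpy3DParms p = {0};
    p.srcPtr = make_cudaPitchedPtr((void *)h_data, w * sizeof(float), w, h);
    p.dstArray = arr;
    p.extent = make_cudaExtent(w, h, layers);
    p.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&p);

    // 3) Wrap the array in a texture object; kernels then fetch with
    //    tex2DLayered<float>(tex, u, v, layer).
    cudaResourceDesc res = {};
    res.resType = cudaResourceTypeArray;
    res.res.array.array = arr;
    cudaTextureDesc td = {};
    td.normalizedCoords = true;
    td.filterMode = cudaFilterModeLinear;
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeWrap;
    td.readMode = cudaReadModeElementType;
    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, NULL);
    return tex;
}
// -----------------------------------------------------------------------------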
diff --git a/Samples/0_Introduction/simpleMPI/simpleMPI.cpp b/Samples/0_Introduction/simpleMPI/simpleMPI.cpp
index a72f65f9..fa33c4a0 100644
--- a/Samples/0_Introduction/simpleMPI/simpleMPI.cpp
+++ b/Samples/0_Introduction/simpleMPI/simpleMPI.cpp
@@ -26,15 +26,15 @@
  */

 /* Simple example demonstrating how to use MPI with CUDA
-*
-* Generate some random numbers on one node.
-* Dispatch them to all nodes.
-* Compute their square root on each node's GPU.
-* Compute the average of the results using MPI.
-*
-* simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
-* on Windows, please download the Microsoft HPC Pack SDK 2008
-*/
+ *
+ * Generate some random numbers on one node.
+ * Dispatch them to all nodes.
+ * Compute their square root on each node's GPU.
+ * Compute the average of the results using MPI.
+ *
+ * simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
+ * on Windows, please download the Microsoft HPC Pack SDK 2008
+ */

 // MPI include
 #include <mpi.h>

@@ -42,87 +42,88 @@
 // System includes
 #include <iostream>

-using std::cout;
 using std::cerr;
+using std::cout;
 using std::endl;

 // User include
 #include "simpleMPI.h"

 // Error handling macros
-#define MPI_CHECK(call)                          \
-  if ((call) != MPI_SUCCESS) {                   \
-    cerr << "MPI error calling \"" #call "\"\n"; \
-    my_abort(-1);                                \
-  }
+#define MPI_CHECK(call)                                                                                               \
+    if ((call) != MPI_SUCCESS) {                                                                                      \
+        cerr << "MPI error calling \"" #call "\"\n";                                                                  \
+        my_abort(-1);                                                                                                 \
+    }

 // Host code
 // No CUDA here, only MPI
-int main(int argc, char *argv[]) {
-  // Dimensions of the dataset
-  int blockSize = 256;
-  int gridSize = 10000;
-  int dataSizePerNode = gridSize * blockSize;
+int main(int argc, char *argv[])
+{
+    // Dimensions of the dataset
+    int blockSize = 256;
+    int gridSize = 10000;
+    int dataSizePerNode = gridSize * blockSize;

-  // Initialize MPI state
-  MPI_CHECK(MPI_Init(&argc, &argv));
+    // Initialize MPI state
+    MPI_CHECK(MPI_Init(&argc, &argv));

-  // Get our MPI node number and node count
-  int commSize, commRank;
-  MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
-  MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));
+    // Get our MPI node number and node count
+    int commSize, commRank;
+    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
+    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

-  // Generate some random numbers on the root node (node 0)
-  int dataSizeTotal = dataSizePerNode * commSize;
-  float *dataRoot = NULL;
+    // Generate some random numbers on the root node (node 0)
+    int dataSizeTotal = dataSizePerNode * commSize;
+    float *dataRoot = NULL;

-  // Are we the root node?
-  if (commRank == 0) {
-    cout << "Running on " << commSize << " nodes" << endl;
-    dataRoot = new float[dataSizeTotal];
-    initData(dataRoot, dataSizeTotal);
-  }
+    // Are we the root node?
+ if (commRank == 0) { + cout << "Running on " << commSize << " nodes" << endl; + dataRoot = new float[dataSizeTotal]; + initData(dataRoot, dataSizeTotal); + } - // Allocate a buffer on each node - float *dataNode = new float[dataSizePerNode]; + // Allocate a buffer on each node + float *dataNode = new float[dataSizePerNode]; - // Dispatch a portion of the input data to each node - MPI_CHECK(MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, - dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD)); + // Dispatch a portion of the input data to each node + MPI_CHECK( + MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD)); - if (commRank == 0) { - // No need for root data any more - delete[] dataRoot; - } + if (commRank == 0) { + // No need for root data any more + delete[] dataRoot; + } - // On each node, run computation on GPU - computeGPU(dataNode, blockSize, gridSize); + // On each node, run computation on GPU + computeGPU(dataNode, blockSize, gridSize); - // Reduction to the root node, computing the sum of output elements - float sumNode = sum(dataNode, dataSizePerNode); - float sumRoot; + // Reduction to the root node, computing the sum of output elements + float sumNode = sum(dataNode, dataSizePerNode); + float sumRoot; - MPI_CHECK( - MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD)); + MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD)); - if (commRank == 0) { - float average = sumRoot / dataSizeTotal; - cout << "Average of square roots is: " << average << endl; - } + if (commRank == 0) { + float average = sumRoot / dataSizeTotal; + cout << "Average of square roots is: " << average << endl; + } - // Cleanup - delete[] dataNode; - MPI_CHECK(MPI_Finalize()); + // Cleanup + delete[] dataNode; + MPI_CHECK(MPI_Finalize()); - if (commRank == 0) { - cout << "PASSED\n"; - } + if (commRank == 0) { + cout << "PASSED\n"; + } - return 0; + return 0; } // Shut down MPI cleanly if something goes wrong -void my_abort(int err) { - cout << "Test FAILED\n"; - MPI_Abort(MPI_COMM_WORLD, err); +void my_abort(int err) +{ + cout << "Test FAILED\n"; + MPI_Abort(MPI_COMM_WORLD, err); } diff --git a/Samples/0_Introduction/simpleMPI/simpleMPI.cu b/Samples/0_Introduction/simpleMPI/simpleMPI.cu index 53e656d3..d6ddc5b5 100644 --- a/Samples/0_Introduction/simpleMPI/simpleMPI.cu +++ b/Samples/0_Introduction/simpleMPI/simpleMPI.cu @@ -26,14 +26,14 @@ */ /* Simple example demonstrating how to use MPI with CUDA -* -* Generate some random numbers on one node. -* Dispatch them to all nodes. -* Compute their square root on each node's GPU. -* Compute the average of the results using MPI. -* -* simpleMPI.cu: GPU part, compiled with nvcc -*/ + * + * Generate some random numbers on one node. + * Dispatch them to all nodes. + * Compute their square root on each node's GPU. + * Compute the average of the results using MPI. 
+ *
+ * simpleMPI.cu: GPU part, compiled with nvcc
+ */

 #include <iostream>

 using std::cerr;
@@ -42,61 +42,63 @@ using std::endl;

 #include "simpleMPI.h"

 // Error handling macro
-#define CUDA_CHECK(call)                                                 \
-  if ((call) != cudaSuccess) {                                           \
-    cudaError_t err = cudaGetLastError();                                \
-    cerr << "CUDA error calling \"" #call "\", code is " << err << endl; \
-    my_abort(err);                                                       \
-  }
+#define CUDA_CHECK(call)                                                                                              \
+    if ((call) != cudaSuccess) {                                                                                      \
+        cudaError_t err = cudaGetLastError();                                                                         \
+        cerr << "CUDA error calling \"" #call "\", code is " << err << endl;                                          \
+        my_abort(err);                                                                                                \
+    }

 // Device code
 // Very simple GPU Kernel that computes square roots of input numbers
-__global__ void simpleMPIKernel(float *input, float *output) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  output[tid] = sqrt(input[tid]);
+__global__ void simpleMPIKernel(float *input, float *output)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    output[tid] = sqrt(input[tid]);
 }

 // Initialize an array with random data (between 0 and 1)
-void initData(float *data, int dataSize) {
-  for (int i = 0; i < dataSize; i++) {
-    data[i] = (float)rand() / RAND_MAX;
-  }
+void initData(float *data, int dataSize)
+{
+    for (int i = 0; i < dataSize; i++) {
+        data[i] = (float)rand() / RAND_MAX;
+    }
 }

 // CUDA computation on each node
 // No MPI here, only CUDA
-void computeGPU(float *hostData, int blockSize, int gridSize) {
-  int dataSize = blockSize * gridSize;
+void computeGPU(float *hostData, int blockSize, int gridSize)
+{
+    int dataSize = blockSize * gridSize;

-  // Allocate data on GPU memory
-  float *deviceInputData = NULL;
-  CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
+    // Allocate data on GPU memory
+    float *deviceInputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

-  float *deviceOutputData = NULL;
-  CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
+    float *deviceOutputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

-  // Copy to GPU memory
-  CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float),
-                        cudaMemcpyHostToDevice));
+    // Copy to GPU memory
+    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

-  // Run kernel
-  simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
+    // Run kernel
+    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

-  // Copy data back to CPU memory
-  CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float),
-                        cudaMemcpyDeviceToHost));
+    // Copy data back to CPU memory
+    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

-  // Free GPU memory
-  CUDA_CHECK(cudaFree(deviceInputData));
-  CUDA_CHECK(cudaFree(deviceOutputData));
+    // Free GPU memory
+    CUDA_CHECK(cudaFree(deviceInputData));
+    CUDA_CHECK(cudaFree(deviceOutputData));
 }

-float sum(float *data, int size) {
-  float accum = 0.f;
+float sum(float *data, int size)
+{
+    float accum = 0.f;

-  for (int i = 0; i < size; i++) {
-    accum += data[i];
-  }
+    for (int i = 0; i < size; i++) {
+        accum += data[i];
+    }

-  return accum;
+    return accum;
 }
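// -----------------------------------------------------------------------------
// Aside (not part of the patch): simpleMPI leaves device selection implicit,
// so every rank uses the default device. On nodes with several GPUs, a common
// convention is to bind each rank to its own device; a hedged sketch using a
// simple rank-modulo policy:
#include <mpi.h>
#include <cuda_runtime.h>

void bindRankToDevice()
{
    int rank = 0, deviceCount = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // call after MPI_Init()
    cudaGetDeviceCount(&deviceCount);
    cudaSetDevice(rank % deviceCount);    // round-robin rank -> GPU binding
}
// -----------------------------------------------------------------------------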
diff --git a/Samples/0_Introduction/simpleMPI/simpleMPI.h b/Samples/0_Introduction/simpleMPI/simpleMPI.h
index 589e977f..a8211745 100644
--- a/Samples/0_Introduction/simpleMPI/simpleMPI.h
+++ b/Samples/0_Introduction/simpleMPI/simpleMPI.h
@@ -26,19 +26,20 @@
  */

 /* Simple example demonstrating how to use MPI with CUDA
-*
-* Generate some random numbers on one node.
-* Dispatch them to all nodes.
-* Compute their square root on each node's GPU.
-* Compute the average of the results using MPI.
-*
-* simpleMPI.h: common header file
-*/
+ *
+ * Generate some random numbers on one node.
+ * Dispatch them to all nodes.
+ * Compute their square root on each node's GPU.
+ * Compute the average of the results using MPI.
+ *
+ * simpleMPI.h: common header file
+ */

 // Forward declarations
-extern "C" {
-void initData(float *data, int dataSize);
-void computeGPU(float *hostData, int blockSize, int gridSize);
-float sum(float *data, int size);
-void my_abort(int err);
+extern "C"
+{
+    void initData(float *data, int dataSize);
+    void computeGPU(float *hostData, int blockSize, int gridSize);
+    float sum(float *data, int size);
+    void my_abort(int err);
 }
diff --git a/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu b/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu
index 1796cd25..20e0d450 100644
--- a/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu
+++ b/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu
@@ -38,7 +38,7 @@
  *
  * Elapsed times are averaged over nreps repetitions (10 by default).
  *
-*/
+ */

 const char *sSDKname = "simpleMultiCopy";

@@ -50,25 +50,26 @@ const char *sSDKname = "simpleMultiCopy";

 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h>  // helper for shared that are common to CUDA Samples
+#include <helper_functions.h> // helper for shared that are common to CUDA Samples

 // includes, kernels
 // Declare the CUDA kernels here and main() code that is needed to launch
 // Compute workload on the system
-__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;

-  if (idx < N) {
-    for (int i = 0; i < inner_reps; ++i) {
-      g_out[idx] = g_in[idx] + 1;
+    if (idx < N) {
+        for (int i = 0; i < inner_reps; ++i) {
+            g_out[idx] = g_in[idx] + 1;
+        }
     }
-  }
 }

 #define STREAM_COUNT 4

 // Uncomment to simulate data source/sink IO times
-//#define SIMULATE_IO
+// #define SIMULATE_IO

 int *h_data_source;
 int *h_data_sink;

@@ -79,13 +80,13 @@ int *d_data_in[STREAM_COUNT];
 int *h_data_out[STREAM_COUNT];
 int *d_data_out[STREAM_COUNT];

-cudaEvent_t cycleDone[STREAM_COUNT];
+cudaEvent_t  cycleDone[STREAM_COUNT];
 cudaStream_t stream[STREAM_COUNT];

 cudaEvent_t start, stop;

-int N = 1 << 22;
-int nreps = 10;  // number of times each experiment is repeated
+int N     = 1 << 22;
+int nreps = 10; // number of times each experiment is repeated
 int inner_reps = 5;

 int memsize;

@@ -96,278 +97,268 @@ dim3 grid;
 int thread_blocks;

 float processWithStreams(int streams_used);
-void init();
-bool test();
+void  init();
+bool  test();

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  int cuda_device = 0;
-  float scale_factor;
-  cudaDeviceProp deviceProp;
+int main(int argc, char *argv[])
+{
+    int cuda_device = 0;
+    float scale_factor;
+    cudaDeviceProp deviceProp;

-  printf("[%s] - Starting...\n", sSDKname);
+    printf("[%s] - Starting...\n", sSDKname);

-  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
-    cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");
+    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
+        cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, 
"device="); - if (cuda_device < 0) { - printf("Invalid command line parameters\n"); - exit(EXIT_FAILURE); - } else { - printf("cuda_device = %d\n", cuda_device); - cuda_device = gpuDeviceInit(cuda_device); + if (cuda_device < 0) { + printf("Invalid command line parameters\n"); + exit(EXIT_FAILURE); + } + else { + printf("cuda_device = %d\n", cuda_device); + cuda_device = gpuDeviceInit(cuda_device); - if (cuda_device < 0) { - printf("No CUDA Capable devices found, exiting...\n"); - exit(EXIT_SUCCESS); - } + if (cuda_device < 0) { + printf("No CUDA Capable devices found, exiting...\n"); + exit(EXIT_SUCCESS); + } + } } - } else { - // Otherwise pick the device with the highest Gflops/s - cuda_device = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(cuda_device)); + else { + // Otherwise pick the device with the highest Gflops/s + cuda_device = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(cuda_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); + printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name); + } + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); - printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name); - } + printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", + deviceProp.name, + deviceProp.multiProcessorCount, + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); - printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name, - deviceProp.multiProcessorCount, - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - deviceProp.multiProcessorCount); + // Anything that is less than 32 Cores will have scaled down workload + scale_factor = + max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), + 1.0f); + N = (int)((float)N / scale_factor); - // Anything that is less than 32 Cores will have scaled down workload - scale_factor = - max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - (float)deviceProp.multiProcessorCount)), - 1.0f); - N = (int)((float)N / scale_factor); + printf("> Device name: %s\n", deviceProp.name); + printf("> CUDA Capability %d.%d hardware with %d multi-processors\n", + deviceProp.major, + deviceProp.minor, + deviceProp.multiProcessorCount); + printf("> scale_factor = %.2f\n", 1.0f / scale_factor); + printf("> array_size = %d\n\n", N); - printf("> Device name: %s\n", deviceProp.name); - printf("> CUDA Capability %d.%d hardware with %d multi-processors\n", - deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); - printf("> scale_factor = %.2f\n", 1.0f / scale_factor); - printf("> array_size = %d\n\n", N); + memsize = N * sizeof(int); - memsize = N * sizeof(int); + thread_blocks = N / block.x; - thread_blocks = N / block.x; + grid.x = thread_blocks % 65535; + grid.y = (thread_blocks / 65535 + 1); - grid.x = thread_blocks % 65535; - grid.y = (thread_blocks / 65535 + 1); + // Allocate resources - // Allocate resources + h_data_source = (int *)malloc(memsize); + h_data_sink = (int *)malloc(memsize); - h_data_source = (int *)malloc(memsize); - h_data_sink = (int *)malloc(memsize); + for (int i = 0; i < STREAM_COUNT; ++i) { + checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault)); + 
checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
+        checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    checkCudaErrors(
-        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
-    checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
-    checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
+        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

-    checkCudaErrors(
-        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
-    checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
+        checkCudaErrors(cudaStreamCreate(&stream[i]));
+        checkCudaErrors(cudaEventCreate(&cycleDone[i]));

-    checkCudaErrors(cudaStreamCreate(&stream[i]));
-    checkCudaErrors(cudaEventCreate(&cycleDone[i]));
+        cudaEventRecord(cycleDone[i], stream[i]);
+    }

-    cudaEventRecord(cycleDone[i], stream[i]);
-  }
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);

-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
+    init();

-  init();
+    // Kernel warmup
+    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

-  // Kernel warmup
-  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    // Time copies and kernel
+    cudaEventRecord(start, 0);
+    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  // Time copies and kernel
-  cudaEventRecord(start, 0);
-  checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
-                                  cudaMemcpyHostToDevice, 0));
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    float memcpy_h2d_time;
+    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

-  float memcpy_h2d_time;
-  cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
+    cudaEventRecord(start, 0);
+    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  cudaEventRecord(start, 0);
-  checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
-                                  cudaMemcpyDeviceToHost, 0));
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    float memcpy_d2h_time;
+    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

-  float memcpy_d2h_time;
-  cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
+    cudaEventRecord(start, 0);
+    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  cudaEventRecord(start, 0);
-  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    float kernel_time;
+    cudaEventElapsedTime(&kernel_time, start, stop);

-  float kernel_time;
-  cudaEventElapsedTime(&kernel_time, start, stop);
+    printf("\n");
+    printf("Relevant properties of this CUDA device\n");
+    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
+           "(device property \"deviceOverlap\")\n",
+           deviceProp.deviceOverlap ? "X" : " ");
+    // printf("(%s) Can execute several GPU kernels simultaneously (compute
+    // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
+    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
+           "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
+           "4000/5000/6000/K5000)\n",
+           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? 
"X" : " "); - printf("\n"); - printf("Relevant properties of this CUDA device\n"); - printf( - "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution " - "(device property \"deviceOverlap\")\n", - deviceProp.deviceOverlap ? "X" : " "); - // printf("(%s) Can execute several GPU kernels simultaneously (compute - // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " "); - printf( - "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n" - " (Compute Capability >= 2.0 AND (Tesla product OR Quadro " - "4000/5000/6000/K5000)\n", - (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " "); + printf("\n"); + printf("Measured timings (throughput):\n"); + printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time); + printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time); + printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time); - printf("\n"); - printf("Measured timings (throughput):\n"); - printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, - (memsize * 1e-6) / memcpy_h2d_time); - printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, - (memsize * 1e-6) / memcpy_d2h_time); - printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, - (inner_reps * memsize * 2e-6) / kernel_time); + printf("\n"); + printf("Theoretical limits for speedup gained from overlapped data " + "transfers:\n"); + printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time); + printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time)); + printf("Compute can overlap with both data transfers: %f ms\n", + max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time)); - printf("\n"); - printf( - "Theoretical limits for speedup gained from overlapped data " - "transfers:\n"); - printf("No overlap at all (transfer-kernel-transfer): %f ms \n", - memcpy_h2d_time + memcpy_d2h_time + kernel_time); - printf("Compute can overlap with one transfer: %f ms\n", - max((memcpy_h2d_time + memcpy_d2h_time), kernel_time)); - printf("Compute can overlap with both data transfers: %f ms\n", - max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time)); + // Process pipelined work + float serial_time = processWithStreams(1); + float overlap_time = processWithStreams(STREAM_COUNT); - // Process pipelined work - float serial_time = processWithStreams(1); - float overlap_time = processWithStreams(STREAM_COUNT); + printf("\nAverage measured timings over %d repetitions:\n", nreps); + printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps); + printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps); + printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps); - printf("\nAverage measured timings over %d repetitions:\n", nreps); - printf(" Avg. time when execution fully serialized\t: %f ms\n", - serial_time / nreps); - printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, - overlap_time / nreps); - printf(" Avg. 
speedup gained (serialized - overlapped)\t: %f ms\n", - (serial_time - overlap_time) / nreps); + printf("\nMeasured throughput:\n"); + printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time); + printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time); - printf("\nMeasured throughput:\n"); - printf(" Fully serialized execution\t\t: %f GB/s\n", - (nreps * (memsize * 2e-6)) / serial_time); - printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, - (nreps * (memsize * 2e-6)) / overlap_time); + // Verify the results, we will use the results for final output + bool bResults = test(); - // Verify the results, we will use the results for final output - bool bResults = test(); + // Free resources - // Free resources + free(h_data_source); + free(h_data_sink); - free(h_data_source); - free(h_data_sink); + for (int i = 0; i < STREAM_COUNT; ++i) { + cudaFreeHost(h_data_in[i]); + cudaFree(d_data_in[i]); - for (int i = 0; i < STREAM_COUNT; ++i) { - cudaFreeHost(h_data_in[i]); - cudaFree(d_data_in[i]); + cudaFreeHost(h_data_out[i]); + cudaFree(d_data_out[i]); - cudaFreeHost(h_data_out[i]); - cudaFree(d_data_out[i]); + cudaStreamDestroy(stream[i]); + cudaEventDestroy(cycleDone[i]); + } - cudaStreamDestroy(stream[i]); - cudaEventDestroy(cycleDone[i]); - } + cudaEventDestroy(start); + cudaEventDestroy(stop); - cudaEventDestroy(start); - cudaEventDestroy(stop); - - // Test result - exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE); + // Test result + exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE); } -float processWithStreams(int streams_used) { - int current_stream = 0; +float processWithStreams(int streams_used) +{ + int current_stream = 0; - float time; + float time; - // Do processing in a loop - // - // Note: All memory commands are processed in the order they are issued, - // independent of the stream they are enqueued in. Hence the pattern by - // which the copy and kernel commands are enqueued in the stream - // has an influence on the achieved overlap. + // Do processing in a loop + // + // Note: All memory commands are processed in the order they are issued, + // independent of the stream they are enqueued in. Hence the pattern by + // which the copy and kernel commands are enqueued in the stream + // has an influence on the achieved overlap. 
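// Aside (not part of the patch): the pacing idiom used in the loop below, in
// isolation. An event recorded at the end of each cycle gates reuse of that
// cycle's buffers; `enqueueOneCycle` is a hypothetical stand-in for the
// H2D-copy/kernel/D2H-copy sequence of one frame.
//
//     cudaEventSynchronize(cycleDone[s]);        // prior cycle using slot s is done
//     enqueueOneCycle(stream[s]);                // H2D copy, kernel, D2H copy
//     cudaEventRecord(cycleDone[s], stream[s]);  // mark slot s for its next reuse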
-  cudaEventRecord(start, 0);
+    cudaEventRecord(start, 0);

-  for (int i = 0; i < nreps; ++i) {
-    int next_stream = (current_stream + 1) % streams_used;
+    for (int i = 0; i < nreps; ++i) {
+        int next_stream = (current_stream + 1) % streams_used;

 #ifdef SIMULATE_IO
-    // Store the result
-    memcpy(h_data_sink, h_data_out[current_stream], memsize);
+        // Store the result
+        memcpy(h_data_sink, h_data_out[current_stream], memsize);

-    // Read new input
-    memcpy(h_data_in[next_stream], h_data_source, memsize);
+        // Read new input
+        memcpy(h_data_in[next_stream], h_data_source, memsize);
 #endif

-    // Ensure that processing and copying of the last cycle has finished
-    cudaEventSynchronize(cycleDone[next_stream]);
+        // Ensure that processing and copying of the last cycle has finished
+        cudaEventSynchronize(cycleDone[next_stream]);

-    // Process current frame
-    incKernel<<<grid, block, 0, stream[current_stream]>>>(
-        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);
+        // Process current frame
+        incKernel<<<grid, block, 0, stream[current_stream]>>>(
+            d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

-    // Upload next frame
-    checkCudaErrors(
-        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
-                        cudaMemcpyHostToDevice, stream[next_stream]));
+        // Upload next frame
+        checkCudaErrors(cudaMemcpyAsync(
+            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

-    // Download current frame
-    checkCudaErrors(cudaMemcpyAsync(
-        h_data_out[current_stream], d_data_out[current_stream], memsize,
-        cudaMemcpyDeviceToHost, stream[current_stream]));
+        // Download current frame
+        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
+                                        d_data_out[current_stream],
+                                        memsize,
+                                        cudaMemcpyDeviceToHost,
+                                        stream[current_stream]));

-    checkCudaErrors(
-        cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
+        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

-    current_stream = next_stream;
-  }
+        current_stream = next_stream;
    }

-  cudaEventRecord(stop, 0);
-
-  cudaDeviceSynchronize();
-
-  cudaEventElapsedTime(&time, start, stop);
-
-  return time;
-}
-
-void init() {
-  for (int i = 0; i < N; ++i) {
-    h_data_source[i] = 0;
-  }
-
-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    memcpy(h_data_in[i], h_data_source, memsize);
-  }
-}
-
-bool test() {
-  bool passed = true;
-
-  for (int j = 0; j < STREAM_COUNT; ++j) {
-    for (int i = 0; i < N; ++i) {
-      passed &= (h_data_out[j][i] == 1);
    }
-  }
-
-  return passed;
+    cudaEventRecord(stop, 0);
+
+    cudaDeviceSynchronize();
+
+    cudaEventElapsedTime(&time, start, stop);
+
+    return time;
+}
+
+void init()
+{
+    for (int i = 0; i < N; ++i) {
+        h_data_source[i] = 0;
+    }
+
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        memcpy(h_data_in[i], h_data_source, memsize);
+    }
+}
+
+bool test()
+{
+    bool passed = true;
+
+    for (int j = 0; j < STREAM_COUNT; ++j) {
+        for (int i = 0; i < N; ++i) {
+            passed &= (h_data_out[j][i] == 1);
+        }
+    }
+
+    return passed;
 }
diff --git a/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.cu b/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.cu
index 9fac0734..94324151 100644
--- a/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.cu
+++ b/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.cu
@@ -37,15 +37,15 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? 
a : b) @@ -57,180 +57,176 @@ // Data configuration //////////////////////////////////////////////////////////////////////////////// const int MAX_GPU_COUNT = 32; -const int DATA_N = 1048576 * 32; +const int DATA_N = 1048576 * 32; //////////////////////////////////////////////////////////////////////////////// // Simple reduction kernel. // Refer to the 'reduction' CUDA Sample describing // reduction optimization strategies //////////////////////////////////////////////////////////////////////////////// -__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int threadN = gridDim.x * blockDim.x; - float sum = 0; +__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int threadN = gridDim.x * blockDim.x; + float sum = 0; - for (int pos = tid; pos < N; pos += threadN) sum += d_Input[pos]; + for (int pos = tid; pos < N; pos += threadN) + sum += d_Input[pos]; - d_Result[tid] = sum; + d_Result[tid] = sum; } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // Solver config - TGPUplan plan[MAX_GPU_COUNT]; +int main(int argc, char **argv) +{ + // Solver config + TGPUplan plan[MAX_GPU_COUNT]; - // GPU reduction results - float h_SumGPU[MAX_GPU_COUNT]; + // GPU reduction results + float h_SumGPU[MAX_GPU_COUNT]; - float sumGPU; - double sumCPU, diff; + float sumGPU; + double sumCPU, diff; - int i, j, gpuBase, GPU_N; + int i, j, gpuBase, GPU_N; - const int BLOCK_N = 32; - const int THREAD_N = 256; - const int ACCUM_N = BLOCK_N * THREAD_N; + const int BLOCK_N = 32; + const int THREAD_N = 256; + const int ACCUM_N = BLOCK_N * THREAD_N; - printf("Starting simpleMultiGPU\n"); - checkCudaErrors(cudaGetDeviceCount(&GPU_N)); + printf("Starting simpleMultiGPU\n"); + checkCudaErrors(cudaGetDeviceCount(&GPU_N)); - if (GPU_N > MAX_GPU_COUNT) { - GPU_N = MAX_GPU_COUNT; - } - - printf("CUDA-capable device count: %i\n", GPU_N); - - printf("Generating input data...\n\n"); - - // Subdividing input data across GPUs - // Get data sizes for each GPU - for (i = 0; i < GPU_N; i++) { - plan[i].dataN = DATA_N / GPU_N; - } - - // Take into account "odd" data sizes - for (i = 0; i < DATA_N % GPU_N; i++) { - plan[i].dataN++; - } - - // Assign data ranges to GPUs - gpuBase = 0; - - for (i = 0; i < GPU_N; i++) { - plan[i].h_Sum = h_SumGPU + i; - gpuBase += plan[i].dataN; - } - - // Create streams for issuing GPU command asynchronously and allocate memory - // (GPU and System page-locked) - for (i = 0; i < GPU_N; i++) { - checkCudaErrors(cudaSetDevice(i)); - checkCudaErrors(cudaStreamCreate(&plan[i].stream)); - // Allocate memory - checkCudaErrors( - cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float))); - checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, - ACCUM_N * sizeof(float))); - checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, - plan[i].dataN * sizeof(float))); - - for (j = 0; j < plan[i].dataN; j++) { - plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX; - } - } - - // Start timing and compute on GPU(s) - printf("Computing with %d GPUs...\n", GPU_N); - // create and start timer - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); - - // start the timer - 
sdkStartTimer(&timer);
-
-  // Copy data to GPU, launch the kernel and copy data back. All asynchronously
-  for (i = 0; i < GPU_N; i++) {
-    // Set device
-    checkCudaErrors(cudaSetDevice(i));
-
-    // Copy input data from CPU
-    checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
-                                    plan[i].dataN * sizeof(float),
-                                    cudaMemcpyHostToDevice, plan[i].stream));
-
-    // Perform GPU computations
-    reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(
-        plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
-    getLastCudaError("reduceKernel() execution failed.\n");
-
-    // Read back GPU results
-    checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
-                                    ACCUM_N * sizeof(float),
-                                    cudaMemcpyDeviceToHost, plan[i].stream));
-  }
-
-  // Process GPU results
-  for (i = 0; i < GPU_N; i++) {
-    float sum;
-
-    // Set device
-    checkCudaErrors(cudaSetDevice(i));
-
-    // Wait for all operations to finish
-    cudaStreamSynchronize(plan[i].stream);
-
-    // Finalize GPU reduction for current subvector
-    sum = 0;
-
-    for (j = 0; j < ACCUM_N; j++) {
-      sum += plan[i].h_Sum_from_device[j];
+    if (GPU_N > MAX_GPU_COUNT) {
+        GPU_N = MAX_GPU_COUNT;
    }

-    *(plan[i].h_Sum) = (float)sum;
+    printf("CUDA-capable device count: %i\n", GPU_N);

-    // Shut down this GPU
-    checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
-    checkCudaErrors(cudaFree(plan[i].d_Sum));
-    checkCudaErrors(cudaFree(plan[i].d_Data));
-    checkCudaErrors(cudaStreamDestroy(plan[i].stream));
-  }
+    printf("Generating input data...\n\n");

-  sumGPU = 0;
-
-  for (i = 0; i < GPU_N; i++) {
-    sumGPU += h_SumGPU[i];
-  }
-
-  sdkStopTimer(&timer);
-  printf("  GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
-  sdkDeleteTimer(&timer);
-
-  // Compute on Host CPU
-  printf("Computing with Host CPU...\n\n");
-
-  sumCPU = 0;
-
-  for (i = 0; i < GPU_N; i++) {
-    for (j = 0; j < plan[i].dataN; j++) {
-      sumCPU += plan[i].h_Data[j];
+    // Subdividing input data across GPUs
+    // Get data sizes for each GPU
+    for (i = 0; i < GPU_N; i++) {
+        plan[i].dataN = DATA_N / GPU_N;
    }
-  }

-  // Compare GPU and CPU results
-  printf("Comparing GPU and Host CPU results...\n");
-  diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
-  printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
-  printf("  Relative difference: %E \n\n", diff);
+    // Take into account "odd" data sizes
+    for (i = 0; i < DATA_N % GPU_N; i++) {
+        plan[i].dataN++;
+    }

-  // Cleanup and shutdown
-  for (i = 0; i < GPU_N; i++) {
-    checkCudaErrors(cudaSetDevice(i));
-    checkCudaErrors(cudaFreeHost(plan[i].h_Data));
-  }
+    // Assign data ranges to GPUs
+    gpuBase = 0;

-  exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
+    for (i = 0; i < GPU_N; i++) {
+        plan[i].h_Sum = h_SumGPU + i;
+        gpuBase += plan[i].dataN;
+    }
+
+    // Create streams for issuing GPU command asynchronously and allocate memory
+    // (GPU and System page-locked)
+    for (i = 0; i < GPU_N; i++) {
+        checkCudaErrors(cudaSetDevice(i));
+        checkCudaErrors(cudaStreamCreate(&plan[i].stream));
+        // Allocate memory
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));
+
+        for (j = 0; j < plan[i].dataN; j++) {
+            plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
+        }
+    }
+
+    // Start timing and compute on GPU(s)
+    printf("Computing with %d GPUs...\n", GPU_N);
+    // create and start timer
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);
+
+    // start the timer
+    sdkStartTimer(&timer);
+
+    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
+    for (i = 0; i < GPU_N; i++) {
+        // Set device
+        checkCudaErrors(cudaSetDevice(i));
+
+        // Copy input data from CPU
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
+
+        // Perform GPU computations
+        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
+        getLastCudaError("reduceKernel() execution failed.\n");
+
+        // Read back GPU results
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
+    }
+
+    // Process GPU results
+    for (i = 0; i < GPU_N; i++) {
+        float sum;
+
+        // Set device
+        checkCudaErrors(cudaSetDevice(i));
+
+        // Wait for all operations to finish
+        cudaStreamSynchronize(plan[i].stream);
+
+        // Finalize GPU reduction for current subvector
+        sum = 0;
+
+        for (j = 0; j < ACCUM_N; j++) {
+            sum += plan[i].h_Sum_from_device[j];
+        }
+
+        *(plan[i].h_Sum) = (float)sum;
+
+        // Shut down this GPU
+        checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
+        checkCudaErrors(cudaFree(plan[i].d_Sum));
+        checkCudaErrors(cudaFree(plan[i].d_Data));
+        checkCudaErrors(cudaStreamDestroy(plan[i].stream));
+    }
+
+    sumGPU = 0;
+
+    for (i = 0; i < GPU_N; i++) {
+        sumGPU += h_SumGPU[i];
+    }
+
+    sdkStopTimer(&timer);
+    printf("  GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
+    sdkDeleteTimer(&timer);
+
+    // Compute on Host CPU
+    printf("Computing with Host CPU...\n\n");
+
+    sumCPU = 0;
+
+    for (i = 0; i < GPU_N; i++) {
+        for (j = 0; j < plan[i].dataN; j++) {
+            sumCPU += plan[i].h_Data[j];
+        }
+    }
+
+    // Compare GPU and CPU results
+    printf("Comparing GPU and Host CPU results...\n");
+    diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
+    printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
+    printf("  Relative difference: %E \n\n", diff);
+
+    // Cleanup and shutdown
+    for (i = 0; i < GPU_N; i++) {
+        checkCudaErrors(cudaSetDevice(i));
+        checkCudaErrors(cudaFreeHost(plan[i].h_Data));
+    }
+
+    exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
 }
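// -----------------------------------------------------------------------------
// Aside (not part of the patch): the multi-GPU issue order above, reduced to
// its skeleton. All copies and kernels are enqueued on every device first and
// only then synchronized, so the GPUs run concurrently. TGPUplan, reduceKernel,
// BLOCK_N, THREAD_N and ACCUM_N are as defined in the sample.
void runAllGPUs(TGPUplan *plan, int gpuCount)
{
    for (int i = 0; i < gpuCount; i++) {
        cudaSetDevice(i);
        cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float),
                        cudaMemcpyHostToDevice, plan[i].stream);
        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
        cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float),
                        cudaMemcpyDeviceToHost, plan[i].stream);
    }
    for (int i = 0; i < gpuCount; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(plan[i].stream); // block only after every GPU has work queued
    }
}
// -----------------------------------------------------------------------------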
diff --git a/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.h b/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.h index 90bd0d87..3bfeef21 100644 --- a/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.h +++ b/Samples/0_Introduction/simpleMultiGPU/simpleMultiGPU.h @@ -37,26 +37,26 @@ #ifndef SIMPLEMULTIGPU_H #define SIMPLEMULTIGPU_H -typedef struct { - // Host-side input data - int dataN; - float *h_Data; +typedef struct +{ + // Host-side input data + int dataN; + float *h_Data; - // Partial sum for this GPU - float *h_Sum; + // Partial sum for this GPU + float *h_Sum; - // Device buffers - float *d_Data, *d_Sum; + // Device buffers + float *d_Data, *d_Sum; - // Reduction copied back from GPU - float *h_Sum_from_device; + // Reduction copied back from GPU + float *h_Sum_from_device; - // Stream for asynchronous command execution - cudaStream_t stream; + // Stream for asynchronous command execution + cudaStream_t stream; } TGPUplan; -extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, - int BLOCK_N, int THREAD_N, cudaStream_t &s); +extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s); #endif diff --git a/Samples/0_Introduction/simpleOccupancy/simpleOccupancy.cu b/Samples/0_Introduction/simpleOccupancy/simpleOccupancy.cu index 15bfbcc0..81d0b08d 100644 --- a/Samples/0_Introduction/simpleOccupancy/simpleOccupancy.cu +++ b/Samples/0_Introduction/simpleOccupancy/simpleOccupancy.cu @@ -25,8 +25,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <helper_cuda.h> // helper functions for CUDA error check #include <iostream> -#include <helper_cuda.h> // helper functions for CUDA error check const int manualBlockSize = 32; @@ -38,13 +38,14 @@ const int manualBlockSize = 32; // execution configuration, including anything the launch configurator // API suggests. //////////////////////////////////////////////////////////////////////////////// -__global__ void square(int *array, int arrayCount) { - extern __shared__ int dynamicSmem[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void square(int *array, int arrayCount) +{ + extern __shared__ int dynamicSmem[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < arrayCount) { - array[idx] *= array[idx]; - } + if (idx < arrayCount) { + array[idx] *= array[idx]; + } } //////////////////////////////////////////////////////////////////////////////// @@ -58,29 +59,28 @@ __global__ void square(int *array, int arrayCount) { // This wrapper routine computes the occupancy of kernel, and reports // it in terms of active warps / maximum warps per SM.
//////////////////////////////////////////////////////////////////////////////// -static double reportPotentialOccupancy(void *kernel, int blockSize, - size_t dynamicSMem) { - int device; - cudaDeviceProp prop; +static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem) +{ + int device; + cudaDeviceProp prop; - int numBlocks; - int activeWarps; - int maxWarps; + int numBlocks; + int activeWarps; + int maxWarps; - double occupancy; + double occupancy; - checkCudaErrors(cudaGetDevice(&device)); - checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, kernel, blockSize, dynamicSMem)); + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem)); - activeWarps = numBlocks * blockSize / prop.warpSize; - maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize; + activeWarps = numBlocks * blockSize / prop.warpSize; + maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize; - occupancy = (double)activeWarps / maxWarps; + occupancy = (double)activeWarps / maxWarps; - return occupancy; + return occupancy; } //////////////////////////////////////////////////////////////////////////////// @@ -99,65 +99,63 @@ static double reportPotentialOccupancy(void *kernel, int blockSize, // This function configures the launch based on the "automatic" // argument, records the runtime, and reports occupancy and runtime. //////////////////////////////////////////////////////////////////////////////// -static int launchConfig(int *array, int arrayCount, bool automatic) { - int blockSize; - int minGridSize; - int gridSize; - size_t dynamicSMemUsage = 0; +static int launchConfig(int *array, int arrayCount, bool automatic) +{ + int blockSize; + int minGridSize; + int gridSize; + size_t dynamicSMemUsage = 0; - cudaEvent_t start; - cudaEvent_t end; + cudaEvent_t start; + cudaEvent_t end; - float elapsedTime; + float elapsedTime; - double potentialOccupancy; + double potentialOccupancy; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&end)); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&end)); - if (automatic) { - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( - &minGridSize, &blockSize, (void *)square, dynamicSMemUsage, - arrayCount)); + if (automatic) { + checkCudaErrors( + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount)); - std::cout << "Suggested block size: " << blockSize << std::endl - << "Minimum grid size for maximum occupancy: " << minGridSize - << std::endl; - } else { - // This block size is too small. Given limited number of - // active blocks per multiprocessor, the number of active - // threads will be limited, and thus unable to achieve maximum - // occupancy. + std::cout << "Suggested block size: " << blockSize << std::endl + << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl; + } + else { + // This block size is too small. Given limited number of + // active blocks per multiprocessor, the number of active + // threads will be limited, and thus unable to achieve maximum + // occupancy. 
+ // + blockSize = manualBlockSize; + } + + // Round up // - blockSize = manualBlockSize; - } + gridSize = (arrayCount + blockSize - 1) / blockSize; - // Round up - // - gridSize = (arrayCount + blockSize - 1) / blockSize; + // Launch and profile + // + checkCudaErrors(cudaEventRecord(start)); + square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount); + checkCudaErrors(cudaEventRecord(end)); - // Launch and profile - // - checkCudaErrors(cudaEventRecord(start)); - square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount); - checkCudaErrors(cudaEventRecord(end)); + checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaDeviceSynchronize()); + // Calculate occupancy + // + potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage); - // Calculate occupancy - // - potentialOccupancy = - reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage); + std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl; - std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" - << std::endl; + // Report elapsed time + // + checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end)); + std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl; - // Report elapsed time - // - checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end)); - std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl; - - return 0; + return 0; } //////////////////////////////////////////////////////////////////////////////// @@ -166,41 +164,41 @@ static int launchConfig(int *array, int arrayCount, bool automatic) { // The test generates an array and squares it with a CUDA kernel, then // verifies the result. //////////////////////////////////////////////////////////////////////////////// -static int test(bool automaticLaunchConfig, const int count = 1000000) { - int *array; - int *dArray; - int size = count * sizeof(int); +static int test(bool automaticLaunchConfig, const int count = 1000000) +{ + int *array; + int *dArray; + int size = count * sizeof(int); - array = new int[count]; + array = new int[count]; - for (int i = 0; i < count; i += 1) { - array[i] = i; - } - - checkCudaErrors(cudaMalloc(&dArray, size)); - checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice)); - - for (int i = 0; i < count; i += 1) { - array[i] = 0; - } - - launchConfig(dArray, count, automaticLaunchConfig); - - checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaFree(dArray)); - - // Verify the return data - // - for (int i = 0; i < count; i += 1) { - if (array[i] != i * i) { - std::cout << "element " << i << " expected " << i * i << " actual " - << array[i] << std::endl; - return 1; + for (int i = 0; i < count; i += 1) { + array[i] = i; } - } - delete[] array; - return 0; + checkCudaErrors(cudaMalloc(&dArray, size)); + checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice)); + + for (int i = 0; i < count; i += 1) { + array[i] = 0; + } + + launchConfig(dArray, count, automaticLaunchConfig); + + checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaFree(dArray)); + + // Verify the return data + // + for (int i = 0; i < count; i += 1) { + if (array[i] != i * i) { + std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl; + return 1; + } + } + delete[] array; + + return 0; }
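For reference, the two occupancy calls this sample leans on can be exercised in isolation. A minimal sketch under the same assumptions as the sample (a `square` kernel with no dynamic shared memory; error checking elided for brevity):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void square(int *a, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            a[i] *= a[i];
    }

    int main()
    {
        // Ask the runtime for a block size that maximizes occupancy.
        int minGridSize = 0, blockSize = 0;
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, square, 0, 0);

        // Predict how many blocks of that size fit per SM, then convert to warps.
        int numBlocks = 0;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, 0);

        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        int activeWarps = numBlocks * blockSize / prop.warpSize;
        int maxWarps    = prop.maxThreadsPerMultiProcessor / prop.warpSize;
        printf("block %d, grid >= %d, predicted occupancy %.1f%%\n",
               blockSize, minGridSize, 100.0 * activeWarps / maxWarps);
        return 0;
    }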
//////////////////////////////////////////////////////////////////////////////// @@ -210,31 +208,31 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) { // automatically configured launch, and reports the occupancy and // performance. //////////////////////////////////////////////////////////////////////////////// -int main() { - int status; +int main() +{ + int status; - std::cout << "starting Simple Occupancy" << std::endl << std::endl; + std::cout << "starting Simple Occupancy" << std::endl << std::endl; - std::cout << "[ Manual configuration with " << manualBlockSize - << " threads per block ]" << std::endl; + std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl; - status = test(false); - if (status) { - std::cerr << "Test failed\n" << std::endl; - return -1; - } + status = test(false); + if (status) { + std::cerr << "Test failed\n" << std::endl; + return -1; + } - std::cout << std::endl; + std::cout << std::endl; - std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl; - status = test(true); - if (status) { - std::cerr << "Test failed\n" << std::endl; - return -1; - } + std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl; + status = test(true); + if (status) { + std::cerr << "Test failed\n" << std::endl; + return -1; + } - std::cout << std::endl; - std::cout << "Test PASSED\n" << std::endl; + std::cout << std::endl; + std::cout << "Test PASSED\n" << std::endl; - return 0; + return 0; } diff --git a/Samples/0_Introduction/simpleP2P/README.md b/Samples/0_Introduction/simpleP2P/README.md index 8d9b3770..f2dc3c4c 100644 --- a/Samples/0_Introduction/simpleP2P/README.md +++ b/Samples/0_Introduction/simpleP2P/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/0_Introduction/simpleP2P/simpleP2P.cu b/Samples/0_Introduction/simpleP2P/simpleP2P.cu index 58b37e60..10192b7b 100644 --- a/Samples/0_Introduction/simpleP2P/simpleP2P.cu +++ b/Samples/0_Introduction/simpleP2P/simpleP2P.cu @@ -31,230 +31,233 @@ */ // includes, system -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> // CUDA includes #include <cuda_runtime.h> // includes, project #include <helper_cuda.h> -#include <helper_functions.h> // helper for shared that are common to CUDA Samples +#include <helper_functions.h> // helper for shared that are common to CUDA Samples -__global__ void SimpleKernel(float *src, float *dst) { - // Just a dummy kernel, doing enough for us to verify that everything - // worked - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - dst[idx] = src[idx] * 2.0f; +__global__ void SimpleKernel(float *src, float *dst) +{ + // Just a dummy kernel, doing enough for us to verify that everything - // worked + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + dst[idx] = src[idx] * 2.0f; } inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; } -int main(int argc, char **argv) { - printf("[%s] - Starting...\n", argv[0]); +int main(int argc, char **argv) +{ + printf("[%s] - Starting...\n", argv[0]); - if (!IsAppBuiltAs64()) { - printf( - "%s is only supported on 64-bit OSs and the application must be " - "built as a 64-bit target.
Test is being waived.\n", - argv[0]); - exit(EXIT_WAIVED); - } - - // Number of GPUs - printf("Checking for multiple GPUs...\n"); - int gpu_n; - checkCudaErrors(cudaGetDeviceCount(&gpu_n)); - printf("CUDA-capable device count: %i\n", gpu_n); - - if (gpu_n < 2) { - printf( - "Two or more GPUs with Peer-to-Peer access capability are required for " - "%s.\n", - argv[0]); - printf("Waiving test.\n"); - exit(EXIT_WAIVED); - } - - // Query device properties - cudaDeviceProp prop[64]; - int gpuid[2]; // we want to find the first two GPU's that can support P2P - - for (int i = 0; i < gpu_n; i++) { - checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); - } - // Check possibility for peer access - printf("\nChecking GPU(s) for support of peer to peer memory access...\n"); - - int can_access_peer; - int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs - p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1; - - // Show all the combinations of supported P2P GPUs - for (int i = 0; i < gpu_n; i++) { - for (int j = 0; j < gpu_n; j++) { - if (i == j) { - continue; - } - checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j)); - printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name, - i, prop[j].name, j, can_access_peer ? "Yes" : "No"); - if (can_access_peer && p2pCapableGPUs[0] == -1) { - p2pCapableGPUs[0] = i; - p2pCapableGPUs[1] = j; - } + if (!IsAppBuiltAs64()) { + printf("%s is only supported on 64-bit OSs and the application must be " + "built as a 64-bit target. Test is being waived.\n", + argv[0]); + exit(EXIT_WAIVED); } - } - if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) { - printf( - "Two or more GPUs with Peer-to-Peer access capability are required for " - "%s.\n", - argv[0]); - printf( - "Peer to Peer access is not available amongst GPUs in the system, " - "waiving test.\n"); + // Number of GPUs + printf("Checking for multiple GPUs...\n"); + int gpu_n; + checkCudaErrors(cudaGetDeviceCount(&gpu_n)); + printf("CUDA-capable device count: %i\n", gpu_n); - exit(EXIT_WAIVED); - } - - // Use first pair of p2p capable GPUs detected.
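The capability probe above is the load-bearing part of this sample. Condensed, and assuming two hypothetical device IDs 0 and 1 (peer access is per-direction, enabled from whichever device is current):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        int can01 = 0, can10 = 0;
        cudaDeviceCanAccessPeer(&can01, 0, 1); // can device 0 access device 1's memory?
        cudaDeviceCanAccessPeer(&can10, 1, 0); // and the reverse direction?

        if (can01 && can10) {
            cudaSetDevice(0);
            cudaDeviceEnablePeerAccess(1, 0); // flags argument must be 0
            cudaSetDevice(1);
            cudaDeviceEnablePeerAccess(0, 0);
            printf("peer access enabled in both directions\n");
        }
        else {
            printf("no P2P path between devices 0 and 1\n");
        }
        return 0;
    }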
- gpuid[0] = p2pCapableGPUs[0]; - gpuid[1] = p2pCapableGPUs[1]; - - // Enable peer access - printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], - gpuid[1]); - checkCudaErrors(cudaSetDevice(gpuid[0])); - checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0)); - checkCudaErrors(cudaSetDevice(gpuid[1])); - checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0)); - - // Allocate buffers - const size_t buf_size = 1024 * 1024 * 16 * sizeof(float); - printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", - int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]); - checkCudaErrors(cudaSetDevice(gpuid[0])); - float *g0; - checkCudaErrors(cudaMalloc(&g0, buf_size)); - checkCudaErrors(cudaSetDevice(gpuid[1])); - float *g1; - checkCudaErrors(cudaMalloc(&g1, buf_size)); - float *h0; - checkCudaErrors( - cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA - - // Create CUDA event handles - printf("Creating event handles...\n"); - cudaEvent_t start_event, stop_event; - float time_memcpy; - int eventflags = cudaEventBlockingSync; - checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags)); - checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags)); - - // P2P memcopy() benchmark - checkCudaErrors(cudaEventRecord(start_event, 0)); - - for (int i = 0; i < 100; i++) { - // With UVA we don't need to specify source and target devices, the - // runtime figures this out by itself from the pointers - // Ping-pong copy between GPUs - if (i % 2 == 0) { - checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault)); - } else { - checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault)); + if (gpu_n < 2) { + printf("Two or more GPUs with Peer-to-Peer access capability are required for " + "%s.\n", + argv[0]); + printf("Waiving test.\n"); + exit(EXIT_WAIVED); } - } - checkCudaErrors(cudaEventRecord(stop_event, 0)); - checkCudaErrors(cudaEventSynchronize(stop_event)); - checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); - printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n", - gpuid[0], gpuid[1], - (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / - 1024.0f / 1024.0f); + // Query device properties + cudaDeviceProp prop[64]; + int gpuid[2]; // we want to find the first two GPU's that can support P2P - // Prepare host buffer and copy to GPU 0 - printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]); - - for (int i = 0; i < buf_size / sizeof(float); i++) { - h0[i] = float(i % 4096); - } - - checkCudaErrors(cudaSetDevice(gpuid[0])); - checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault)); - - // Kernel launch configuration - const dim3 threads(512, 1); - const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1); - - // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing - // output to the GPU 1 buffer - printf( - "Run kernel on GPU%d, taking source data from GPU%d and writing to " - "GPU%d...\n", - gpuid[1], gpuid[0], gpuid[1]); - checkCudaErrors(cudaSetDevice(gpuid[1])); - SimpleKernel<<<blocks, threads>>>(g0, g1); - - checkCudaErrors(cudaDeviceSynchronize()); - - // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing - // output to the GPU 0 buffer - printf( - "Run kernel on GPU%d, taking source data from GPU%d and writing to " - "GPU%d...\n", - gpuid[0], gpuid[1], gpuid[0]); - checkCudaErrors(cudaSetDevice(gpuid[0])); - SimpleKernel<<<blocks, threads>>>(g1, g0); - - checkCudaErrors(cudaDeviceSynchronize()); - - // Copy data back to host and verify - printf("Copy
data back to host from GPU%d and verify results...\n", gpuid[0]); - checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault)); - - int error_count = 0; - - for (int i = 0; i < buf_size / sizeof(float); i++) { - // Re-generate input data and apply 2x '* 2.0f' computation of both - // kernel runs - if (h0[i] != float(i % 4096) * 2.0f * 2.0f) { - printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], - (float(i % 4096) * 2.0f * 2.0f)); - - if (error_count++ > 10) { - break; - } + for (int i = 0; i < gpu_n; i++) { + checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); } - } + // Check possibility for peer access + printf("\nChecking GPU(s) for support of peer to peer memory access...\n"); - // Disable peer access (also unregisters memory for non-UVA cases) - printf("Disabling peer access...\n"); - checkCudaErrors(cudaSetDevice(gpuid[0])); - checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1])); - checkCudaErrors(cudaSetDevice(gpuid[1])); - checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0])); + int can_access_peer; + int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs + p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1; - // Cleanup and shutdown - printf("Shutting down...\n"); - checkCudaErrors(cudaEventDestroy(start_event)); - checkCudaErrors(cudaEventDestroy(stop_event)); - checkCudaErrors(cudaSetDevice(gpuid[0])); - checkCudaErrors(cudaFree(g0)); - checkCudaErrors(cudaSetDevice(gpuid[1])); - checkCudaErrors(cudaFree(g1)); - checkCudaErrors(cudaFreeHost(h0)); + // Show all the combinations of supported P2P GPUs + for (int i = 0; i < gpu_n; i++) { + for (int j = 0; j < gpu_n; j++) { + if (i == j) { + continue; + } + checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j)); + printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", + prop[i].name, + i, + prop[j].name, + j, + can_access_peer ? "Yes" : "No"); + if (can_access_peer && p2pCapableGPUs[0] == -1) { + p2pCapableGPUs[0] = i; + p2pCapableGPUs[1] = j; + } + } + } - for (int i = 0; i < gpu_n; i++) { - checkCudaErrors(cudaSetDevice(i)); - } + if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) { + printf("Two or more GPUs with Peer-to-Peer access capability are required for " + "%s.\n", + argv[0]); + printf("Peer to Peer access is not available amongst GPUs in the system, " + "waiving test.\n"); - if (error_count != 0) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } else { - printf("Test passed\n"); - exit(EXIT_SUCCESS); - } + exit(EXIT_WAIVED); + } + + // Use first pair of p2p capable GPUs detected. 
+ gpuid[0] = p2pCapableGPUs[0]; + gpuid[1] = p2pCapableGPUs[1]; + + // Enable peer access + printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]); + checkCudaErrors(cudaSetDevice(gpuid[0])); + checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0)); + checkCudaErrors(cudaSetDevice(gpuid[1])); + checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0)); + + // Allocate buffers + const size_t buf_size = 1024 * 1024 * 16 * sizeof(float); + printf( + "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]); + checkCudaErrors(cudaSetDevice(gpuid[0])); + float *g0; + checkCudaErrors(cudaMalloc(&g0, buf_size)); + checkCudaErrors(cudaSetDevice(gpuid[1])); + float *g1; + checkCudaErrors(cudaMalloc(&g1, buf_size)); + float *h0; + checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA + + // Create CUDA event handles + printf("Creating event handles...\n"); + cudaEvent_t start_event, stop_event; + float time_memcpy; + int eventflags = cudaEventBlockingSync; + checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags)); + checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags)); + + // P2P memcopy() benchmark + checkCudaErrors(cudaEventRecord(start_event, 0)); + + for (int i = 0; i < 100; i++) { + // With UVA we don't need to specify source and target devices, the + // runtime figures this out by itself from the pointers + // Ping-pong copy between GPUs + if (i % 2 == 0) { + checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault)); + } + else { + checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault)); + } + } + + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); + checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); + printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n", + gpuid[0], + gpuid[1], + (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f); + + // Prepare host buffer and copy to GPU 0 + printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]); + + for (int i = 0; i < buf_size / sizeof(float); i++) { + h0[i] = float(i % 4096); + } + + checkCudaErrors(cudaSetDevice(gpuid[0])); + checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault)); + + // Kernel launch configuration + const dim3 threads(512, 1); + const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1); + + // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing + // output to the GPU 1 buffer + printf("Run kernel on GPU%d, taking source data from GPU%d and writing to " + "GPU%d...\n", + gpuid[1], + gpuid[0], + gpuid[1]); + checkCudaErrors(cudaSetDevice(gpuid[1])); + SimpleKernel<<<blocks, threads>>>(g0, g1); + + checkCudaErrors(cudaDeviceSynchronize()); + + // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing + // output to the GPU 0 buffer + printf("Run kernel on GPU%d, taking source data from GPU%d and writing to " + "GPU%d...\n", + gpuid[0], + gpuid[1], + gpuid[0]); + checkCudaErrors(cudaSetDevice(gpuid[0])); + SimpleKernel<<<blocks, threads>>>(g1, g0); + + checkCudaErrors(cudaDeviceSynchronize()); + + // Copy data back to host and verify + printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]); + checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault)); + + int error_count = 0; + + for (int i = 0; i < buf_size / sizeof(float); i++) { + // Re-generate input data and apply 2x '* 2.0f' computation of both + // kernel runs + if (h0[i] != float(i % 4096) * 2.0f * 2.0f) { + printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f)); + + if (error_count++ > 10) { + break; + } + } + } + + // Disable peer access (also unregisters memory for non-UVA cases) + printf("Disabling peer access...\n"); + checkCudaErrors(cudaSetDevice(gpuid[0])); + checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1])); + checkCudaErrors(cudaSetDevice(gpuid[1])); + checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0])); + + // Cleanup and shutdown + printf("Shutting down...\n"); + checkCudaErrors(cudaEventDestroy(start_event)); + checkCudaErrors(cudaEventDestroy(stop_event)); + checkCudaErrors(cudaSetDevice(gpuid[0])); + checkCudaErrors(cudaFree(g0)); + checkCudaErrors(cudaSetDevice(gpuid[1])); + checkCudaErrors(cudaFree(g1)); + checkCudaErrors(cudaFreeHost(h0)); + + for (int i = 0; i < gpu_n; i++) { + checkCudaErrors(cudaSetDevice(i)); + } + + if (error_count != 0) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + else { + printf("Test passed\n"); + exit(EXIT_SUCCESS); + } }
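One detail worth calling out from the file above: under UVA, `cudaMemcpyDefault` lets the runtime infer source and destination devices from the pointers, which is why the ping-pong loop never names a device. A sketch showing the explicit equivalent and the bandwidth arithmetic behind the printf (100 copies of `buf_size` bytes in the measured milliseconds); the function name and parameters are illustrative:

    #include <cuda_runtime.h>

    // g0 lives on device 0, g1 on device 1; both copy forms move the same bytes.
    float pingPongBandwidthGBs(float *g0, float *g1, size_t buf_size)
    {
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++) {
            if (i % 2 == 0)
                cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault); // UVA: devices inferred
            else
                cudaMemcpyPeer(g0, 0, g1, 1, buf_size);          // explicit dst/src devices
        }
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        // 100 copies of buf_size bytes in ms milliseconds, reported in GiB/s,
        // matching the sample's (100 * buf_size) / seconds / 1024^3 expression.
        return (100.0f * buf_size) / (ms / 1000.0f) / (1024.0f * 1024.0f * 1024.0f);
    }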
diff --git a/Samples/0_Introduction/simplePitchLinearTexture/simplePitchLinearTexture.cu b/Samples/0_Introduction/simplePitchLinearTexture/simplePitchLinearTexture.cu index 93d9b295..061b700d 100644 --- a/Samples/0_Introduction/simplePitchLinearTexture/simplePitchLinearTexture.cu +++ b/Samples/0_Introduction/simplePitchLinearTexture/simplePitchLinearTexture.cu @@ -26,16 +26,16 @@ */ /* pitchLinearTexture -* -* This example demonstrates how to use textures bound to pitch linear memory. -* It performs a shift of matrix elements using wrap addressing mode (aka -* periodic boundary conditions) on two arrays, a pitch linear and a CUDA array, -* in order to highlight the differences in using each. -* -* Texture binding to pitch linear memory is a new feature in CUDA 2.2, -* and allows use of texture features such as wrap addressing mode and -* filtering which are not possible with textures bound to regular linear memory -*/ + * + * This example demonstrates how to use textures bound to pitch linear memory. + * It performs a shift of matrix elements using wrap addressing mode (aka + * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array, + * in order to highlight the differences in using each. + * + * Texture binding to pitch linear memory is a new feature in CUDA 2.2, + * and allows use of texture features such as wrap addressing mode and + * filtering which are not possible with textures bound to regular linear memory + */ // includes, system #include <stdio.h> @@ -50,13 +50,13 @@ #include <cuda_runtime.h> // Utilities and timing functions -#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h +#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include <helper_cuda.h> // helper functions for CUDA error check +#include <helper_cuda.h> // helper functions for CUDA error check -#define NUM_REPS 100 // number of repetitions performed -#define TILE_DIM 16 // tile/block size +#define NUM_REPS 100 // number of repetitions performed +#define TILE_DIM 16 // tile/block size const char *sSDKsample = "simplePitchLinearTexture"; @@ -70,29 +70,26 @@ bool bTestResult = true; //! Shifts matrix elements using pitch linear array //!
@param odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height, - int shiftX, int shiftY, - cudaTextureObject_t texRefPL) { - int xid = blockIdx.x * blockDim.x + threadIdx.x; - int yid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void +shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL) +{ + int xid = blockIdx.x * blockDim.x + threadIdx.x; + int yid = blockIdx.y * blockDim.y + threadIdx.y; - odata[yid * pitch + xid] = tex2D<float>( - texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height); + odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height); } //////////////////////////////////////////////////////////////////////////////// //! Shifts matrix elements using regular array //! @param odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void shiftArray(float *odata, int pitch, int width, int height, - int shiftX, int shiftY, - cudaTextureObject_t texRefArray) { - int xid = blockIdx.x * blockDim.x + threadIdx.x; - int yid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void +shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray) +{ + int xid = blockIdx.x * blockDim.x + threadIdx.x; + int yid = blockIdx.y * blockDim.y + threadIdx.y; - odata[yid * pitch + xid] = - tex2D<float>(texRefArray, (xid + shiftX) / (float)width, - (yid + shiftY) / (float)height); + odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height); } //////////////////////////////////////////////////////////////////////////////// @@ -102,210 +99,199 @@ void runTest(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n\n", sSDKsample); +int main(int argc, char **argv) +{ + printf("%s starting...\n\n", sSDKsample); - runTest(argc, argv); + runTest(argc, argv); - printf("%s completed, returned %s\n", sSDKsample, - bTestResult ? "OK" : "ERROR!"); - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!"); + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// //!
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - // Set array size - const int nx = 2048; - const int ny = 2048; +void runTest(int argc, char **argv) +{ + // Set array size + const int nx = 2048; + const int ny = 2048; - // Setup shifts applied to x and y data - const int x_shift = 5; - const int y_shift = 7; + // Setup shifts applied to x and y data + const int x_shift = 5; + const int y_shift = 7; - if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) { - printf("nx and ny must be multiples of TILE_DIM\n"); - exit(EXIT_FAILURE); - } - - // Setup execution configuration parameters - dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM); - - // This will pick the best possible CUDA capable device - int devID = findCudaDevice(argc, (const char **)argv); - - // CUDA events for timing - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // Host allocation and initialization - float *h_idata = (float *)malloc(sizeof(float) * nx * ny); - float *h_odata = (float *)malloc(sizeof(float) * nx * ny); - float *gold = (float *)malloc(sizeof(float) * nx * ny); - - for (int i = 0; i < nx * ny; ++i) { - h_idata[i] = (float)i; - } - - // Device memory allocation - // Pitch linear input data - float *d_idataPL; - size_t d_pitchBytes; - - checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, - nx * sizeof(float), ny)); - - // Array input data - cudaArray *d_idataArray; - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - - checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny)); - - // Pitch linear output data - float *d_odata; - checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, - nx * sizeof(float), ny)); - - // Copy host data to device - // Pitch linear - size_t h_pitchBytes = nx * sizeof(float); - - checkCudaErrors(cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, - nx * sizeof(float), ny, cudaMemcpyHostToDevice)); - - // Array - checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, - nx * ny * sizeof(float), - cudaMemcpyHostToDevice)); - - cudaTextureObject_t texRefPL; - cudaTextureObject_t texRefArray; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypePitch2D; - texRes.res.pitch2D.devPtr = d_idataPL; - texRes.res.pitch2D.desc = channelDesc; - texRes.res.pitch2D.width = nx; - texRes.res.pitch2D.height = ny; - texRes.res.pitch2D.pitchInBytes = h_pitchBytes; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; - - checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL)); - memset(&texRes, 0, sizeof(cudaResourceDesc)); - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_idataArray; - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL)); - - // Reference calculation - for (int j = 0; j < ny; ++j) { - int jshift = (j + y_shift) % ny; - - for
(int i = 0; i < nx; ++i) { - int ishift = (i + x_shift) % nx; - gold[j * nx + i] = h_idata[jshift * nx + ishift]; + if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) { + printf("nx and ny must be multiples of TILE_DIM\n"); + exit(EXIT_FAILURE); } - } - // Run ShiftPitchLinear kernel - checkCudaErrors( - cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny)); + // Setup execution configuration parameters + dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM); - checkCudaErrors(cudaEventRecord(start, 0)); + // This will pick the best possible CUDA capable device + int devID = findCudaDevice(argc, (const char **)argv); - for (int i = 0; i < NUM_REPS; ++i) { - shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata, - (int)(d_pitchBytes / sizeof(float)), - nx, ny, x_shift, y_shift, texRefPL); - } + // CUDA events for timing + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(stop)); - float timePL; - checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop)); + // Host allocation and initialization + float *h_idata = (float *)malloc(sizeof(float) * nx * ny); + float *h_odata = (float *)malloc(sizeof(float) * nx * ny); + float *gold = (float *)malloc(sizeof(float) * nx * ny); - // Check results - checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, - nx * sizeof(float), ny, cudaMemcpyDeviceToHost)); + for (int i = 0; i < nx * ny; ++i) { + h_idata[i] = (float)i; + } - bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f); + // Device memory allocation + // Pitch linear input data + float *d_idataPL; + size_t d_pitchBytes; - bTestResult = true; + checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny)); - if (res == false) { - printf("*** shiftPitchLinear failed ***\n"); - bTestResult = false; - } + // Array input data + cudaArray *d_idataArray; + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); - // Run ShiftArray kernel - checkCudaErrors( - cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny)); - checkCudaErrors(cudaEventRecord(start, 0)); + checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny)); - for (int i = 0; i < NUM_REPS; ++i) { - shiftArray<<<dimGrid, dimBlock>>>(d_odata, - (int)(d_pitchBytes / sizeof(float)), nx, - ny, x_shift, y_shift, texRefArray); - } + // Pitch linear output data + float *d_odata; + checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(stop)); - float timeArray; - checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop)); + // Copy host data to device + // Pitch linear + size_t h_pitchBytes = nx * sizeof(float); - // Check results - checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, - nx * sizeof(float), ny, cudaMemcpyDeviceToHost)); - res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f); + checkCudaErrors( + cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice)); - if (res == false) { - printf("*** shiftArray failed ***\n"); - bTestResult = false; - } + // Array + checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice)); - float bandwidthPL = - 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS); - float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / - (timeArray /
NUM_REPS); + cudaTextureObject_t texRefPL; + cudaTextureObject_t texRefArray; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", - bandwidthPL, bandwidthArray); + texRes.resType = cudaResourceTypePitch2D; + texRes.res.pitch2D.devPtr = d_idataPL; + texRes.res.pitch2D.desc = channelDesc; + texRes.res.pitch2D.width = nx; + texRes.res.pitch2D.height = ny; + texRes.res.pitch2D.pitchInBytes = h_pitchBytes; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS)); - float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS)); + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - printf( - "\nTexture fetch rate (Mpix/s) for pitch linear: " - "%.2e; for array: %.2e\n\n", - fetchRatePL, fetchRateArray); + checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_idataArray; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; + checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL)); - // Cleanup - free(h_idata); - free(h_odata); - free(gold); + // Reference calculation + for (int j = 0; j < ny; ++j) { + int jshift = (j + y_shift) % ny; - checkCudaErrors(cudaDestroyTextureObject(texRefPL)); - checkCudaErrors(cudaDestroyTextureObject(texRefArray)); - checkCudaErrors(cudaFree(d_idataPL)); - checkCudaErrors(cudaFreeArray(d_idataArray)); - checkCudaErrors(cudaFree(d_odata)); + for (int i = 0; i < nx; ++i) { + int ishift = (i + x_shift) % nx; + gold[j * nx + i] = h_idata[jshift * nx + ishift]; + } + } - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); + // Run ShiftPitchLinear kernel + checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny)); + + checkCudaErrors(cudaEventRecord(start, 0)); + + for (int i = 0; i < NUM_REPS; ++i) { + shiftPitchLinear<<<dimGrid, dimBlock>>>( + d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL); + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + float timePL; + checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop)); + + // Check results + checkCudaErrors( + cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost)); + + bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f); + + bTestResult = true; + + if (res == false) { + printf("*** shiftPitchLinear failed ***\n"); + bTestResult = false; + } + + // Run ShiftArray kernel + checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny)); + checkCudaErrors(cudaEventRecord(start, 0)); + + for (int i = 0; i < NUM_REPS; ++i) { + shiftArray<<<dimGrid, dimBlock>>>( + d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray); + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + float timeArray; + checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop)); + + // Check results + checkCudaErrors( + cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost)); + res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f); + + if (res == false) { + printf("*** shiftArray failed ***\n"); + bTestResult = false; + } + + float bandwidthPL = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS); + float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS); + + printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray); + + float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS)); + float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS)); + + printf("\nTexture fetch rate (Mpix/s) for pitch linear: " + "%.2e; for array: %.2e\n\n", + fetchRatePL, + fetchRateArray); + + // Cleanup + free(h_idata); + free(h_odata); + free(gold); + + checkCudaErrors(cudaDestroyTextureObject(texRefPL)); + checkCudaErrors(cudaDestroyTextureObject(texRefArray)); + checkCudaErrors(cudaFree(d_idataPL)); + checkCudaErrors(cudaFreeArray(d_idataArray)); + checkCudaErrors(cudaFree(d_odata)); + + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); }
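Most of the churn in this file is the texture-object setup. Stripped of the timing scaffolding, binding a texture object to pitch-linear memory with wrap addressing reduces to the sketch below (`makeWrapTexture` and its parameters are illustrative; `d_ptr`, `pitch`, `nx`, `ny` are assumed to come from a prior `cudaMallocPitch`):

    #include <cstring>
    #include <cuda_runtime.h>

    cudaTextureObject_t makeWrapTexture(float *d_ptr, size_t pitch, int nx, int ny)
    {
        cudaResourceDesc res;
        memset(&res, 0, sizeof(res));
        res.resType                  = cudaResourceTypePitch2D;
        res.res.pitch2D.devPtr       = d_ptr;
        res.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
        res.res.pitch2D.width        = nx;
        res.res.pitch2D.height       = ny;
        res.res.pitch2D.pitchInBytes = pitch;

        cudaTextureDesc td;
        memset(&td, 0, sizeof(td));
        td.normalizedCoords = true;                // coordinates in [0, 1)
        td.addressMode[0]   = cudaAddressModeWrap; // periodic boundary in x
        td.addressMode[1]   = cudaAddressModeWrap; // and in y
        td.filterMode       = cudaFilterModePoint;
        td.readMode         = cudaReadModeElementType;

        cudaTextureObject_t tex = 0;
        cudaCreateTextureObject(&tex, &res, &td, NULL);
        return tex; // fetch in a kernel with tex2D<float>(tex, x, y)
    }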
diff --git a/Samples/0_Introduction/simplePrintf/simplePrintf.cu b/Samples/0_Introduction/simplePrintf/simplePrintf.cu index aae9e18e..694fa21d 100644 --- a/Samples/0_Introduction/simplePrintf/simplePrintf.cu +++ b/Samples/0_Introduction/simplePrintf/simplePrintf.cu @@ -26,48 +26,49 @@ */ // System includes -#include <stdio.h> #include <assert.h> +#include <stdio.h> // CUDA runtime #include <cuda_runtime.h> // helper functions and utilities to work with CUDA -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> #ifndef MAX #define MAX(a, b) (a > b ? a : b) #endif -__global__ void testKernel(int val) { - printf("[%d, %d]:\t\tValue is:%d\n", blockIdx.y * gridDim.x + blockIdx.x, - threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + - threadIdx.x, - val); +__global__ void testKernel(int val) +{ + printf("[%d, %d]:\t\tValue is:%d\n", + blockIdx.y * gridDim.x + blockIdx.x, + threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x, + val); } -int main(int argc, char **argv) { - int devID; - cudaDeviceProp props; +int main(int argc, char **argv) +{ + int devID; + cudaDeviceProp props; - // This will pick the best possible CUDA capable device - devID = findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device + devID = findCudaDevice(argc, (const char **)argv); - // Get GPU information - checkCudaErrors(cudaGetDevice(&devID)); - checkCudaErrors(cudaGetDeviceProperties(&props, devID)); - printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, - props.major, props.minor); + // Get GPU information + checkCudaErrors(cudaGetDevice(&devID)); + checkCudaErrors(cudaGetDeviceProperties(&props, devID)); + printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor); - printf("printf() is called. Output:\n\n"); + printf("printf() is called. Output:\n\n"); - // Kernel configuration, where a two-dimensional grid and - // three-dimensional blocks are configured. - dim3 dimGrid(2, 2); - dim3 dimBlock(2, 2, 2); - testKernel<<<dimGrid, dimBlock>>>(10); - cudaDeviceSynchronize(); + // Kernel configuration, where a two-dimensional grid and + // three-dimensional blocks are configured. + dim3 dimGrid(2, 2); + dim3 dimBlock(2, 2, 2); + testKernel<<<dimGrid, dimBlock>>>(10); + cudaDeviceSynchronize(); - return EXIT_SUCCESS; + return EXIT_SUCCESS; }
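The two index expressions in testKernel flatten the launch shape: block (bx, by) in the 2x2 grid maps to by * gridDim.x + bx in [0, 4), and thread (tx, ty, tz) in the 2x2x2 block maps to tz * blockDim.x * blockDim.y + ty * blockDim.x + tx in [0, 8), so the launch prints 4 x 8 = 32 lines (in unspecified order). A host-side sketch of the same flattening, for checking the arithmetic (illustrative, not from the sample):

    #include <cstdio>

    int main()
    {
        const int gridX = 2, gridY = 2;               // dimGrid(2, 2)
        const int blockX = 2, blockY = 2, blockZ = 2; // dimBlock(2, 2, 2)

        for (int by = 0; by < gridY; by++)
            for (int bx = 0; bx < gridX; bx++)
                for (int tz = 0; tz < blockZ; tz++)
                    for (int ty = 0; ty < blockY; ty++)
                        for (int tx = 0; tx < blockX; tx++) {
                            int blockId  = by * gridX + bx;                         // 0..3
                            int threadId = tz * blockX * blockY + ty * blockX + tx; // 0..7
                            printf("[%d, %d]\n", blockId, threadId);
                        }
        return 0; // 32 lines total, matching the kernel's output count
    }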
diff --git a/Samples/0_Introduction/simpleStreams/simpleStreams.cu b/Samples/0_Introduction/simpleStreams/simpleStreams.cu index fb15916c..eb92b55c 100644 --- a/Samples/0_Introduction/simpleStreams/simpleStreams.cu +++ b/Samples/0_Introduction/simpleStreams/simpleStreams.cu @@ -44,141 +44,137 @@ * * Elapsed times are averaged over nreps repetitions (10 by default). * -*/ + */ const char *sSDKsample = "simpleStreams"; -const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", - "cudaEventDisableTiming", NULL}; +const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL}; -const char *sDeviceSyncMethod[] = { - "cudaDeviceScheduleAuto", "cudaDeviceScheduleSpin", - "cudaDeviceScheduleYield", "INVALID", - "cudaDeviceScheduleBlockingSync", NULL}; +const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto", + "cudaDeviceScheduleSpin", + "cudaDeviceScheduleYield", + "INVALID", + "cudaDeviceScheduleBlockingSync", + NULL}; // System includes -#include <stdio.h> #include <assert.h> +#include <stdio.h> // CUDA runtime #include <cuda_runtime.h> // helper functions and utilities to work with CUDA -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> #ifndef WIN32 -#include <sys/mman.h> // for mmap() / munmap() +#include <sys/mman.h> // for mmap() / munmap() #endif // Macro to align up to the memory size in question -#define MEMORY_ALIGNMENT 4096 +#define MEMORY_ALIGNMENT 4096 #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) -__global__ void init_array(int *g_data, int *factor, int num_iterations) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void init_array(int *g_data, int *factor, int num_iterations) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = 0; i < num_iterations; i++) { - g_data[idx] += *factor; // non-coalesced on purpose, to burn time - } -} - -bool correct_data(int *a, const int n, const int c) { - for (int i = 0; i < n; i++) { - if (a[i] != c) { - printf("%d: %d %d\n", i, a[i], c); - return false; + for (int i = 0; i < num_iterations; i++) { + g_data[idx] += *factor; // non-coalesced on purpose, to burn time } - } - - return true; } -inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, - int **ppAligned_a, int nbytes) { +bool correct_data(int *a, const int n, const int c) +{ + for (int i = 0; i < n; i++) { + if (a[i] != c) { + printf("%d: %d %d\n", i, a[i], c); + return false; + } + } + + return true; +} + +inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes) +{ #if CUDART_VERSION >= 4000 #if !defined(__arm__) && !defined(__aarch64__) - if (bPinGenericMemory) { + if (bPinGenericMemory) { // allocate a generic page-aligned chunk of system memory #ifdef WIN32 - printf( - "> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned " - "system memory)\n", - (float)nbytes / 1048576.0f); - *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), - MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned " + "system memory)\n", + (float)nbytes / 1048576.0f); + *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); #else - printf( - "> mmap() allocating %4.2f Mbytes (generic page-aligned system " - "memory)\n", - (float)nbytes / 1048576.0f); - *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), - PROT_READ | PROT_WRITE, MAP_PRIVATE |
MAP_ANON, -1, 0); + printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system " + "memory)\n", + (float)nbytes / 1048576.0f); + *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); #endif - *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT); + *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT); - printf( - "> cudaHostRegister() registering %4.2f Mbytes of generic allocated " - "system memory\n", - (float)nbytes / 1048576.0f); - // pin allocate memory - checkCudaErrors( - cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped)); - } else + printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated " + "system memory\n", + (float)nbytes / 1048576.0f); + // pin allocate memory + checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped)); + } + else #endif #endif - { - printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", - (float)nbytes / 1048576.0f); - // allocate host memory (pinned memory is required to achieve asynchronicity) - checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes)); - *ppAligned_a = *pp_a; - } + { + printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f); + // allocate host memory (pinned memory is required to achieve asynchronicity) + checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes)); + *ppAligned_a = *pp_a; + } } -inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, - int **ppAligned_a, int nbytes) { +inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes) +{ #if CUDART_VERSION >= 4000 #if !defined(__arm__) && !defined(__aarch64__) - // CUDA 4.0 supports pinning of generic host memory - if (bPinGenericMemory) { - // unpin and delete host memory - checkCudaErrors(cudaHostUnregister(*ppAligned_a)); + // CUDA 4.0 supports pinning of generic host memory + if (bPinGenericMemory) { + // unpin and delete host memory + checkCudaErrors(cudaHostUnregister(*ppAligned_a)); #ifdef WIN32 - VirtualFree(*pp_a, 0, MEM_RELEASE); + VirtualFree(*pp_a, 0, MEM_RELEASE); #else - munmap(*pp_a, nbytes); + munmap(*pp_a, nbytes); #endif - } else + } + else #endif #endif - { - cudaFreeHost(*pp_a); - } + { + cudaFreeHost(*pp_a); + } }
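AllocateHostMemory/FreeHostMemory above switch between `cudaMallocHost` and pinning an existing page-aligned allocation after the fact. The register-after-the-fact path, reduced to its essentials (a sketch using `posix_memalign` in place of the sample's mmap/VirtualAlloc; the function names here are illustrative):

    #include <cstdlib>
    #include <cuda_runtime.h>

    int *allocPinnedGeneric(size_t nbytes)
    {
        void *p = NULL;
        if (posix_memalign(&p, 4096, nbytes) != 0) // page-aligned system allocation
            return NULL;
        // Pin the range so the GPU can DMA directly from/to it;
        // cudaHostRegisterMapped additionally maps it into the device
        // address space, as the sample requests.
        cudaHostRegister(p, nbytes, cudaHostRegisterMapped);
        return (int *)p;
    }

    void freePinnedGeneric(void *p)
    {
        cudaHostUnregister(p); // unpin first,
        free(p);               // then release the system allocation
    }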
-static const char *sSyncMethod[] = { - "0 (Automatic Blocking)", - "1 (Spin Blocking)", - "2 (Yield Blocking)", - "3 (Undefined Blocking Method)", - "4 (Blocking Sync Event) = low CPU utilization", - NULL}; +static const char *sSyncMethod[] = {"0 (Automatic Blocking)", + "1 (Spin Blocking)", + "2 (Yield Blocking)", + "3 (Undefined Blocking Method)", + "4 (Blocking Sync Event) = low CPU utilization", + NULL}; -void printHelp() { - printf("Usage: %s [options below]\n", sSDKsample); - printf("\t--sync_method=n for CPU/GPU synchronization\n"); - printf("\t n=%s\n", sSyncMethod[0]); - printf("\t n=%s\n", sSyncMethod[1]); - printf("\t n=%s\n", sSyncMethod[2]); - printf("\t n=%s\n", sSyncMethod[4]); - printf( - "\t--use_generic_memory (default) use generic page-aligned for system " - "memory\n"); - printf( - "\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate " - "system memory\n"); +void printHelp() +{ + printf("Usage: %s [options below]\n", sSDKsample); + printf("\t--sync_method=n for CPU/GPU synchronization\n"); + printf("\t n=%s\n", sSyncMethod[0]); + printf("\t n=%s\n", sSyncMethod[1]); + printf("\t n=%s\n", sSyncMethod[2]); + printf("\t n=%s\n", sSyncMethod[4]); + printf("\t--use_generic_memory (default) use generic page-aligned for system " + "memory\n"); + printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate " + "system memory\n"); } #if defined(__APPLE__) || defined(MACOSX) @@ -187,259 +183,240 @@ void printHelp() { #define DEFAULT_PINNED_GENERIC_MEMORY true #endif -int main(int argc, char **argv) { - int cuda_device = 0; - int nstreams = 4; // number of streams for CUDA calls - int nreps = 10; // number of times each experiment is repeated - int n = 16 * 1024 * 1024; // number of ints in the data set - int nbytes = n * sizeof(int); // number of data bytes - dim3 threads, blocks; // kernel launch configuration - float elapsed_time, time_memcpy, time_kernel; // timing variables - float scale_factor = 1.0f; +int main(int argc, char **argv) +{ + int cuda_device = 0; + int nstreams = 4; // number of streams for CUDA calls + int nreps = 10; // number of times each experiment is repeated + int n = 16 * 1024 * 1024; // number of ints in the data set + int nbytes = n * sizeof(int); // number of data bytes + dim3 threads, blocks; // kernel launch configuration + float elapsed_time, time_memcpy, time_kernel; // timing variables + float scale_factor = 1.0f; - // allocate generic memory and pin it later instead of using cudaHostAlloc() + // allocate generic memory and pin it later instead of using cudaHostAlloc() - bool bPinGenericMemory = - DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior - int device_sync_method = - cudaDeviceBlockingSync; // by default we use BlockingSync + bool bPinGenericMemory = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior + int device_sync_method = cudaDeviceBlockingSync; // by default we use BlockingSync - int niterations; // number of iterations for the loop inside the kernel + int niterations; // number of iterations for the loop inside the kernel - printf("[ %s ]\n\n", sSDKsample); + printf("[ %s ]\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printHelp(); - return EXIT_SUCCESS; - } - - if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, - "sync_method")) >= 0) { - if (device_sync_method == 0 || device_sync_method == 1 || - device_sync_method == 2 || device_sync_method == 4) { - printf("Device synchronization method set to = %s\n", - sSyncMethod[device_sync_method]); - printf("Setting reps to 100 to demonstrate steady state\n"); - nreps = 100; - } else { - printf("Invalid command line option sync_method=\"%d\"\n", - device_sync_method); - return EXIT_FAILURE; + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(); + return EXIT_SUCCESS; } - } else { - printHelp(); - return EXIT_SUCCESS; - } - if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { + if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) { + if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) { + printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]); + printf("Setting reps to 100 to demonstrate steady state\n"); + nreps = 100; + } + else { + printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method); + return EXIT_FAILURE; + } + } + else { + printHelp(); + return EXIT_SUCCESS; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { #if defined(__APPLE__) || defined(MACOSX) - bPinGenericMemory = false; // Generic Pinning of System Paged memory not - // currently supported on Mac OSX +
bPinGenericMemory = false; // Generic Pinning of System Paged memory not + // currently supported on Mac OSX #else - bPinGenericMemory = true; + bPinGenericMemory = true; #endif - } - - if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) { - bPinGenericMemory = false; - } - - printf("\n> "); - cuda_device = findCudaDevice(argc, (const char **)argv); - - // check the compute capability of the device - int num_devices = 0; - checkCudaErrors(cudaGetDeviceCount(&num_devices)); - - if (0 == num_devices) { - printf( - "your system does not have a CUDA capable device, waiving test...\n"); - return EXIT_WAIVED; - } - - // check if the command-line chosen device ID is within range, exit if not - if (cuda_device >= num_devices) { - printf( - "cuda_device=%d is invalid, must choose device ID between 0 and %d\n", - cuda_device, num_devices - 1); - return EXIT_FAILURE; - } - - checkCudaErrors(cudaSetDevice(cuda_device)); - - // Checking for compute capabilities - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); - - niterations = 5; - - // Check if GPU can map host memory (Generic Method), if not then we override - // bPinGenericMemory to be false - if (bPinGenericMemory) { - printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, - deviceProp.canMapHostMemory ? "Yes" : "No"); - - if (deviceProp.canMapHostMemory == 0) { - printf( - "Using cudaMallocHost, CUDA device does not support mapping of " - "generic host memory\n"); - bPinGenericMemory = false; } - } - // Anything that is less than 32 Cores will have scaled down workload - scale_factor = - max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - (float)deviceProp.multiProcessorCount)), - 1.0f); - n = (int)rint((float)n / scale_factor); + if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) { + bPinGenericMemory = false; + } - printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, - deviceProp.minor); - printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n", - deviceProp.multiProcessorCount, - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - deviceProp.multiProcessorCount); + printf("\n> "); + cuda_device = findCudaDevice(argc, (const char **)argv); - printf("> scale_factor = %1.4f\n", 1.0f / scale_factor); - printf("> array_size = %d\n\n", n); + // check the compute capability of the device + int num_devices = 0; + checkCudaErrors(cudaGetDeviceCount(&num_devices)); - // enable use of blocking sync, to reduce CPU usage - printf("> Using CPU/GPU Device Synchronization method (%s)\n", - sDeviceSyncMethod[device_sync_method]); - checkCudaErrors(cudaSetDeviceFlags( - device_sync_method | (bPinGenericMemory ? 
cudaDeviceMapHost : 0))); + if (0 == num_devices) { + printf("your system does not have a CUDA capable device, waiving test...\n"); + return EXIT_WAIVED; + } - // allocate host memory - int c = 5; // value to which the array will be initialized - int *h_a = 0; // pointer to the array data in host memory - int *hAligned_a = 0; // pointer to the array data in host memory (aligned to - // MEMORY_ALIGNMENT) + // check if the command-line chosen device ID is within range, exit if not + if (cuda_device >= num_devices) { + printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1); + return EXIT_FAILURE; + } - // Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if - // using the new CUDA 4.0 features - AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes); + checkCudaErrors(cudaSetDevice(cuda_device)); - // allocate device memory - int *d_a = 0, - *d_c = 0; // pointers to data and init value in the device memory - checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); - checkCudaErrors(cudaMemset(d_a, 0x0, nbytes)); - checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int))); - checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice)); + // Checking for compute capabilities + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); - printf("\nStarting Test\n"); + niterations = 5; - // allocate and initialize an array of stream handles - cudaStream_t *streams = - (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t)); + // Check if GPU can map host memory (Generic Method), if not then we override + // bPinGenericMemory to be false + if (bPinGenericMemory) { + printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No"); - for (int i = 0; i < nstreams; i++) { - checkCudaErrors(cudaStreamCreate(&(streams[i]))); - } + if (deviceProp.canMapHostMemory == 0) { + printf("Using cudaMallocHost, CUDA device does not support mapping of " + "generic host memory\n"); + bPinGenericMemory = false; + } + } - // create CUDA event handles - // use blocking sync - cudaEvent_t start_event, stop_event; - int eventflags = - ((device_sync_method == cudaDeviceBlockingSync) ? 
cudaEventBlockingSync - : cudaEventDefault); + // Anything that is less than 32 Cores will have scaled down workload + scale_factor = + max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), + 1.0f); + n = (int)rint((float)n / scale_factor); - checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags)); - checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags)); + printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n", + deviceProp.multiProcessorCount, + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); - // time memcopy from device - checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to - // ensure that all previous - // CUDA calls have - // completed - checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, - cudaMemcpyDeviceToHost, streams[0])); - checkCudaErrors(cudaEventRecord(stop_event, 0)); - checkCudaErrors(cudaEventSynchronize( - stop_event)); // block until the event is actually recorded - checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); - printf("memcopy:\t%.2f\n", time_memcpy); + printf("> scale_factor = %1.4f\n", 1.0f / scale_factor); + printf("> array_size = %d\n\n", n); - // time kernel - threads = dim3(512, 1); - blocks = dim3(n / threads.x, 1); - checkCudaErrors(cudaEventRecord(start_event, 0)); - init_array<<>>(d_a, d_c, niterations); - checkCudaErrors(cudaEventRecord(stop_event, 0)); - checkCudaErrors(cudaEventSynchronize(stop_event)); - checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event)); - printf("kernel:\t\t%.2f\n", time_kernel); + // enable use of blocking sync, to reduce CPU usage + printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]); + checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? 
cudaDeviceMapHost : 0))); - ////////////////////////////////////////////////////////////////////// - // time non-streamed execution for reference - threads = dim3(512, 1); - blocks = dim3(n / threads.x, 1); - checkCudaErrors(cudaEventRecord(start_event, 0)); + // allocate host memory + int c = 5; // value to which the array will be initialized + int *h_a = 0; // pointer to the array data in host memory + int *hAligned_a = 0; // pointer to the array data in host memory (aligned to + // MEMORY_ALIGNMENT) - for (int k = 0; k < nreps; k++) { - init_array<<>>(d_a, d_c, niterations); - checkCudaErrors( - cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost)); - } + // Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if + // using the new CUDA 4.0 features + AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes); - checkCudaErrors(cudaEventRecord(stop_event, 0)); - checkCudaErrors(cudaEventSynchronize(stop_event)); - checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); - printf("non-streamed:\t%.2f\n", elapsed_time / nreps); + // allocate device memory + int *d_a = 0, + *d_c = 0; // pointers to data and init value in the device memory + checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); + checkCudaErrors(cudaMemset(d_a, 0x0, nbytes)); + checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int))); + checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice)); - ////////////////////////////////////////////////////////////////////// - // time execution with nstreams streams - threads = dim3(512, 1); - blocks = dim3(n / (nstreams * threads.x), 1); - memset(hAligned_a, 255, - nbytes); // set host memory bits to all 1s, for testing correctness - checkCudaErrors(cudaMemset( - d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness - checkCudaErrors(cudaEventRecord(start_event, 0)); + printf("\nStarting Test\n"); + + // allocate and initialize an array of stream handles + cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t)); - for (int k = 0; k < nreps; k++) { - // asynchronously launch nstreams kernels, each operating on its own portion - // of data for (int i = 0; i < nstreams; i++) { - init_array<<>>(d_a + i * n / nstreams, - d_c, niterations); + checkCudaErrors(cudaStreamCreate(&(streams[i]))); } - // asynchronously launch nstreams memcopies. Note that memcopy in stream x - // will only - // commence executing when all previous CUDA calls in stream x have - // completed + // create CUDA event handles + // use blocking sync + cudaEvent_t start_event, stop_event; + int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? 
cudaEventBlockingSync : cudaEventDefault); + + checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags)); + checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags)); + + // time memcopy from device + checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to + // ensure that all previous + // CUDA calls have + // completed + checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0])); + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded + checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); + printf("memcopy:\t%.2f\n", time_memcpy); + + // time kernel + threads = dim3(512, 1); + blocks = dim3(n / threads.x, 1); + checkCudaErrors(cudaEventRecord(start_event, 0)); + init_array<<<blocks, threads>>>(d_a, d_c, niterations); + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); + checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event)); + printf("kernel:\t\t%.2f\n", time_kernel); + + ////////////////////////////////////////////////////////////////////// + // time non-streamed execution for reference + threads = dim3(512, 1); + blocks = dim3(n / threads.x, 1); + checkCudaErrors(cudaEventRecord(start_event, 0)); + + for (int k = 0; k < nreps; k++) { + init_array<<<blocks, threads>>>(d_a, d_c, niterations); + checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost)); + } + + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); + checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); + printf("non-streamed:\t%.2f\n", elapsed_time / nreps); + + ////////////////////////////////////////////////////////////////////// + // time execution with nstreams streams + threads = dim3(512, 1); + blocks = dim3(n / (nstreams * threads.x), 1); + memset(hAligned_a, 255, + nbytes); // set host memory bits to all 1s, for testing correctness + checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness + checkCudaErrors(cudaEventRecord(start_event, 0)); + + for (int k = 0; k < nreps; k++) { + // asynchronously launch nstreams kernels, each operating on its own portion + // of data + for (int i = 0; i < nstreams; i++) { + init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations); + } + + // asynchronously launch nstreams memcopies. 
Note that memcopy in stream x + // will only + // commence executing when all previous CUDA calls in stream x have + // completed + for (int i = 0; i < nstreams; i++) { + checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams, + d_a + i * n / nstreams, + nbytes / nstreams, + cudaMemcpyDeviceToHost, + streams[i])); + } + } + + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); + checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); + printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps); + + // check whether the output is correct + printf("-------------------------------\n"); + bool bResults = correct_data(hAligned_a, n, c * nreps * niterations); + + // release resources for (int i = 0; i < nstreams; i++) { - checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams, - d_a + i * n / nstreams, nbytes / nstreams, - cudaMemcpyDeviceToHost, streams[i])); + checkCudaErrors(cudaStreamDestroy(streams[i])); } - } - checkCudaErrors(cudaEventRecord(stop_event, 0)); - checkCudaErrors(cudaEventSynchronize(stop_event)); - checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event)); - printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps); + checkCudaErrors(cudaEventDestroy(start_event)); + checkCudaErrors(cudaEventDestroy(stop_event)); - // check whether the output is correct - printf("-------------------------------\n"); - bool bResults = correct_data(hAligned_a, n, c * nreps * niterations); + // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0) + FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes); - // release resources - for (int i = 0; i < nstreams; i++) { - checkCudaErrors(cudaStreamDestroy(streams[i])); - } + checkCudaErrors(cudaFree(d_a)); + checkCudaErrors(cudaFree(d_c)); - checkCudaErrors(cudaEventDestroy(start_event)); - checkCudaErrors(cudaEventDestroy(stop_event)); - - // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0) - FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes); - - checkCudaErrors(cudaFree(d_a)); - checkCudaErrors(cudaFree(d_c)); - - return bResults ? EXIT_SUCCESS : EXIT_FAILURE; + return bResults ? 
EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/Samples/0_Introduction/simpleSurfaceWrite/simpleSurfaceWrite.cu b/Samples/0_Introduction/simpleSurfaceWrite/simpleSurfaceWrite.cu index 094476f1..5066f13e 100644 --- a/Samples/0_Introduction/simpleSurfaceWrite/simpleSurfaceWrite.cu +++ b/Samples/0_Introduction/simpleSurfaceWrite/simpleSurfaceWrite.cu @@ -34,10 +34,10 @@ */ // Includes, system -#include <stdio.h> -#include <stdlib.h> -#include <string.h> #include <math.h> #ifdef _WIN32 #define WINDOWS_LEAN_AND_MEAN @@ -49,18 +49,18 @@ #include <cuda_runtime.h> // Utilities and timing functions -#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h +#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include <helper_cuda.h> // helper functions for CUDA error check +#include <helper_cuda.h> // helper functions for CUDA error check #define MIN_EPSILON_ERROR 5e-3f //////////////////////////////////////////////////////////////////////////////// // Define the files that are to be save and the reference images for validation const char *imageFilename = "teapot512.pgm"; -const char *refFilename = "ref_rotated.pgm"; -float angle = 0.5f; // angle to rotate image by (in radians) +const char *refFilename = "ref_rotated.pgm"; +float angle = 0.5f; // angle to rotate image by (in radians) // Auto-Verification Code bool testResult = true; @@ -73,223 +73,218 @@ static const char *sampleName = "simpleSurfaceWrite"; //////////////////////////////////////////////////////////////////////////////// //! Write to a cuArray (texture data source) using surface writes //! @param gIData input data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void surfaceWriteKernel(float *gIData, int width, int height, - cudaSurfaceObject_t outputSurface) { - // calculate surface coordinates - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface) +{ + // calculate surface coordinates + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - // read from global memory and write to cuarray (via surface reference) - surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, - cudaBoundaryModeTrap); + // read from global memory and write to cuarray (via surface reference) + surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap); } //////////////////////////////////////////////////////////////////////////////// //! Transform an image using texture lookups //! 
@param gOData output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void transformKernel(float *gOData, int width, int height, - float theta, cudaTextureObject_t tex) { - // calculate normalized texture coordinates - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex) +{ + // calculate normalized texture coordinates + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - float u = x / (float)width; - float v = y / (float)height; + float u = x / (float)width; + float v = y / (float)height; - // transform coordinates - u -= 0.5f; - v -= 0.5f; - float tu = u * cosf(theta) - v * sinf(theta) + 0.5f; - float tv = v * cosf(theta) + u * sinf(theta) + 0.5f; + // transform coordinates + u -= 0.5f; + v -= 0.5f; + float tu = u * cosf(theta) - v * sinf(theta) + 0.5f; + float tv = v * cosf(theta) + u * sinf(theta) + 0.5f; - // read from texture and write to global memory - gOData[y * width + x] = tex2D(tex, tu, tv); + // read from texture and write to global memory + gOData[y * width + x] = tex2D(tex, tu, tv); } //////////////////////////////////////////////////////////////////////////////// // Declaration, forward void runTest(int argc, char **argv); -extern "C" void computeGold(float *reference, float *idata, - const unsigned int len); +extern "C" void computeGold(float *reference, float *idata, const unsigned int len); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", sampleName); +int main(int argc, char **argv) +{ + printf("%s starting...\n", sampleName); - // Process command-line arguments - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", - (char **)&imageFilename); + // Process command-line arguments + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename); - if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { - getCmdLineArgumentString(argc, (const char **)argv, "reference", - (char **)&refFilename); - } else { - printf("-input flag should be used with -reference flag"); - exit(EXIT_FAILURE); - } - } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { - printf("-reference flag should be used with -input flag"); - exit(EXIT_FAILURE); + if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { + getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename); + } + else { + printf("-input flag should be used with -reference flag"); + exit(EXIT_FAILURE); + } + } + else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { + printf("-reference flag should be used with -input flag"); + exit(EXIT_FAILURE); + } } - } - runTest(argc, argv); + runTest(argc, argv); - printf("%s completed, returned %s\n", sampleName, - testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); + exit(testResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - // Use command-line specified CUDA device, - // otherwise use device with highest Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); +void runTest(int argc, char **argv) +{ + // Use command-line specified CUDA device, + // otherwise use device with highest Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); - // Get number of SMs on this GPU - cudaDeviceProp deviceProps; + // Get number of SMs on this GPU + cudaDeviceProp deviceProps; - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n", - deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major, - deviceProps.minor); + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n", + deviceProps.name, + deviceProps.multiProcessorCount, + deviceProps.major, + deviceProps.minor); - // Load image from disk - float *hData = NULL; - unsigned int width, height; - char *imagePath = sdkFindFilePath(imageFilename, argv[0]); + // Load image from disk + float *hData = NULL; + unsigned int width, height; + char *imagePath = sdkFindFilePath(imageFilename, argv[0]); - if (imagePath == NULL) { - printf("Unable to source image input file: %s\n", imageFilename); - exit(EXIT_FAILURE); - } + if (imagePath == NULL) { + printf("Unable to source image input file: %s\n", imageFilename); + exit(EXIT_FAILURE); + } - sdkLoadPGM(imagePath, &hData, &width, &height); + sdkLoadPGM(imagePath, &hData, &width, &height); - unsigned int size = width * height * sizeof(float); - printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); + unsigned int size = width * height * sizeof(float); + printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); - // Load reference image from image (output) - float *hDataRef = (float *)malloc(size); - char *refPath = sdkFindFilePath(refFilename, argv[0]); + // Load reference image from image (output) + float *hDataRef = (float *)malloc(size); + char *refPath = sdkFindFilePath(refFilename, argv[0]); - if (refPath == NULL) { - printf("Unable to find reference image file: %s\n", refFilename); - exit(EXIT_FAILURE); - } + if (refPath == NULL) { + printf("Unable to find reference image file: %s\n", refFilename); + exit(EXIT_FAILURE); + } - sdkLoadPGM(refPath, &hDataRef, &width, &height); + sdkLoadPGM(refPath, &hDataRef, &width, &height); - // Allocate device memory for result - float *dData = NULL; - checkCudaErrors(cudaMalloc((void **)&dData, size)); + // Allocate device memory for result + float *dData = NULL; + checkCudaErrors(cudaMalloc((void **)&dData, size)); - // Allocate array and copy image data - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); - cudaArray *cuArray; - checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, - cudaArraySurfaceLoadStore)); + // Allocate array and copy image data + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaArray *cuArray; + checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore)); - dim3 dimBlock(8, 8, 1); - dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); + dim3 
dimBlock(8, 8, 1); + dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); - cudaSurfaceObject_t outputSurface; - cudaResourceDesc surfRes; - memset(&surfRes, 0, sizeof(cudaResourceDesc)); - surfRes.resType = cudaResourceTypeArray; - surfRes.res.array.array = cuArray; + cudaSurfaceObject_t outputSurface; + cudaResourceDesc surfRes; + memset(&surfRes, 0, sizeof(cudaResourceDesc)); + surfRes.resType = cudaResourceTypeArray; + surfRes.res.array.array = cuArray; - checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes)); + checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes)); #if 1 - checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice)); - surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, - outputSurface); -#else // This is what differs from the example simpleTexture - checkCudaErrors( - cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice)); + surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface); +#else // This is what differs from the example simpleTexture + checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice)); #endif - cudaTextureObject_t tex; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t tex; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = cuArray; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = cuArray; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); - // Warmup - transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); + // Warmup + transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); - checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaDeviceSynchronize()); - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - // Execute the kernel - transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); + // Execute the kernel + transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); - // Check if kernel execution generated an error - getLastCudaError("Kernel execution failed"); + // Check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - printf("%.2f Mpixels/sec\n", - (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); - sdkDeleteTimer(&timer); + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + 
sdkDeleteTimer(&timer); - // Allocate mem for the result on host side - float *hOData = (float *)malloc(size); - // copy result from device to host - checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost)); + // Allocate mem for the result on host side + float *hOData = (float *)malloc(size); + // copy result from device to host + checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost)); - // Write result to file - char outputFilename[1024]; - strcpy(outputFilename, "output.pgm"); - sdkSavePGM("output.pgm", hOData, width, height); - printf("Wrote '%s'\n", outputFilename); + // Write result to file + char outputFilename[1024]; + strcpy(outputFilename, "output.pgm"); + sdkSavePGM("output.pgm", hOData, width, height); + printf("Wrote '%s'\n", outputFilename); - // Write regression file if necessary - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // Write file for regression test - sdkWriteFile("./data/regression.dat", hOData, width * height, 0.0f, - false); - } else { - // We need to reload the data from disk, - // because it is inverted upon output - sdkLoadPGM(outputFilename, &hOData, &width, &height); + // Write regression file if necessary + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // Write file for regression test + sdkWriteFile("./data/regression.dat", hOData, width * height, 0.0f, false); + } + else { + // We need to reload the data from disk, + // because it is inverted upon output + sdkLoadPGM(outputFilename, &hOData, &width, &height); - printf("Comparing files\n"); - printf("\toutput: <%s>\n", outputFilename); - printf("\treference: <%s>\n", refPath); - testResult = - compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f); - } + printf("Comparing files\n"); + printf("\toutput: <%s>\n", outputFilename); + printf("\treference: <%s>\n", refPath); + testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f); + } - checkCudaErrors(cudaDestroySurfaceObject(outputSurface)); - checkCudaErrors(cudaDestroyTextureObject(tex)); - checkCudaErrors(cudaFree(dData)); - checkCudaErrors(cudaFreeArray(cuArray)); - free(imagePath); - free(refPath); + checkCudaErrors(cudaDestroySurfaceObject(outputSurface)); + checkCudaErrors(cudaDestroyTextureObject(tex)); + checkCudaErrors(cudaFree(dData)); + checkCudaErrors(cudaFreeArray(cuArray)); + free(imagePath); + free(refPath); } diff --git a/Samples/0_Introduction/simpleTemplates/sharedmem.cuh b/Samples/0_Introduction/simpleTemplates/sharedmem.cuh index c817c86a..1293c788 100644 --- a/Samples/0_Introduction/simpleTemplates/sharedmem.cuh +++ b/Samples/0_Introduction/simpleTemplates/sharedmem.cuh @@ -68,106 +68,118 @@ // this // struct by putting an undefined symbol in the function body so it won't // compile. -template <typename T> -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - __device__ T *getPointer() { - extern __device__ void error(void); - error(); - return NULL; - } +template <typename T> struct SharedMemory +{ + // Ensure that we won't compile any un-specialized types + __device__ T *getPointer() + { + extern __device__ void error(void); + error(); + return NULL; + } }; // Following are the specializations for the following types. // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double // One could also specialize it for user-defined types. 
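A minimal sketch of that user-defined case (illustrative only, not part of this patch; MyVec and scaleVec are hypothetical names): the specialization binds a distinctly named extern __shared__ array to the type, so different instantiations never redeclare the same symbol with conflicting types, and a kernel obtains a typed pointer through getPointer(), with the actual size supplied as the third launch-configuration parameter.

    struct MyVec { float x, y, z; };

    template <> struct SharedMemory<MyVec>
    {
        __device__ MyVec *getPointer()
        {
            extern __shared__ MyVec s_myvec[]; // unique name per specialization
            return s_myvec;
        }
    };

    // launched as scaleVec<<<grid, block, block.x * sizeof(MyVec)>>>(d_data, 2.0f)
    __global__ void scaleVec(MyVec *g_data, float s)
    {
        SharedMemory<MyVec> smem;
        MyVec *sdata = smem.getPointer(); // typed view of the dynamic shared allocation
        const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        sdata[threadIdx.x] = g_data[i];
        sdata[threadIdx.x].x *= s;
        sdata[threadIdx.x].y *= s;
        sdata[threadIdx.x].z *= s;
        g_data[i] = sdata[threadIdx.x];
    }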
-template <> -struct SharedMemory<int> { - __device__ int *getPointer() { - extern __shared__ int s_int[]; - return s_int; - } +template <> struct SharedMemory<int> +{ + __device__ int *getPointer() + { + extern __shared__ int s_int[]; + return s_int; + } }; -template <> -struct SharedMemory<unsigned int> { - __device__ unsigned int *getPointer() { - extern __shared__ unsigned int s_uint[]; - return s_uint; - } +template <> struct SharedMemory<unsigned int> +{ + __device__ unsigned int *getPointer() + { + extern __shared__ unsigned int s_uint[]; + return s_uint; + } }; -template <> -struct SharedMemory<char> { - __device__ char *getPointer() { - extern __shared__ char s_char[]; - return s_char; - } +template <> struct SharedMemory<char> +{ + __device__ char *getPointer() + { + extern __shared__ char s_char[]; + return s_char; + } }; -template <> -struct SharedMemory<unsigned char> { - __device__ unsigned char *getPointer() { - extern __shared__ unsigned char s_uchar[]; - return s_uchar; - } +template <> struct SharedMemory<unsigned char> +{ + __device__ unsigned char *getPointer() + { + extern __shared__ unsigned char s_uchar[]; + return s_uchar; + } }; -template <> -struct SharedMemory<short> { - __device__ short *getPointer() { - extern __shared__ short s_short[]; - return s_short; - } +template <> struct SharedMemory<short> +{ + __device__ short *getPointer() + { + extern __shared__ short s_short[]; + return s_short; + } }; -template <> -struct SharedMemory<unsigned short> { - __device__ unsigned short *getPointer() { - extern __shared__ unsigned short s_ushort[]; - return s_ushort; - } +template <> struct SharedMemory<unsigned short> +{ + __device__ unsigned short *getPointer() + { + extern __shared__ unsigned short s_ushort[]; + return s_ushort; + } }; -template <> -struct SharedMemory<long> { - __device__ long *getPointer() { - extern __shared__ long s_long[]; - return s_long; - } +template <> struct SharedMemory<long> +{ + __device__ long *getPointer() + { + extern __shared__ long s_long[]; + return s_long; + } }; -template <> -struct SharedMemory<unsigned long> { - __device__ unsigned long *getPointer() { - extern __shared__ unsigned long s_ulong[]; - return s_ulong; - } +template <> struct SharedMemory<unsigned long> +{ + __device__ unsigned long *getPointer() + { + extern __shared__ unsigned long s_ulong[]; + return s_ulong; + } }; -template <> -struct SharedMemory<bool> { - __device__ bool *getPointer() { - extern __shared__ bool s_bool[]; - return s_bool; - } +template <> struct SharedMemory<bool> +{ + __device__ bool *getPointer() + { + extern __shared__ bool s_bool[]; + return s_bool; + } }; -template <> -struct SharedMemory<float> { - __device__ float *getPointer() { - extern __shared__ float s_float[]; - return s_float; - } +template <> struct SharedMemory<float> +{ + __device__ float *getPointer() + { + extern __shared__ float s_float[]; + return s_float; + } }; -template <> -struct SharedMemory<double> { - __device__ double *getPointer() { - extern __shared__ double s_double[]; - return s_double; - } +template <> struct SharedMemory<double> +{ + __device__ double *getPointer() + { + extern __shared__ double s_double[]; + return s_double; + } }; -#endif //_SHAREDMEM_H_ +#endif //_SHAREDMEM_H_ diff --git a/Samples/0_Introduction/simpleTemplates/simpleTemplates.cu b/Samples/0_Introduction/simpleTemplates/simpleTemplates.cu index 5710c312..8a7dae30 100644 --- a/Samples/0_Introduction/simpleTemplates/simpleTemplates.cu +++ b/Samples/0_Introduction/simpleTemplates/simpleTemplates.cu @@ -26,23 +26,23 @@ */ /* This sample is a templatized version of the template project. -* It also shows how to correctly templatize dynamically allocated shared -* memory arrays. -* Host code. 
-*/ + * It also shows how to correctly templatize dynamically allocated shared + * memory arrays. + * Host code. + */ // System includes -#include #include -#include #include +#include +#include // CUDA runtime #include <cuda_runtime.h> // helper functions and utilities to work with CUDA -#include <helper_functions.h> #include <helper_cuda.h> #ifndef MAX #define MAX(a, b) (a > b ? a : b) #endif @@ -58,55 +58,55 @@ int g_TotalFailures = 0; //! @param g_idata input data in global memory //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -template <class T> -__global__ void testKernel(T *g_idata, T *g_odata) { - // Shared mem size is determined by the host app at run time - SharedMemory<T> smem; - T *sdata = smem.getPointer(); +template <class T> __global__ void testKernel(T *g_idata, T *g_odata) +{ + // Shared mem size is determined by the host app at run time + SharedMemory<T> smem; + T *sdata = smem.getPointer(); - // access thread id - const unsigned int tid = threadIdx.x; - // access number of threads in this block - const unsigned int num_threads = blockDim.x; + // access thread id + const unsigned int tid = threadIdx.x; + // access number of threads in this block + const unsigned int num_threads = blockDim.x; - // read in input data from global memory - sdata[tid] = g_idata[tid]; - __syncthreads(); + // read in input data from global memory + sdata[tid] = g_idata[tid]; + __syncthreads(); - // perform some computations - sdata[tid] = (T)num_threads * sdata[tid]; - __syncthreads(); + // perform some computations + sdata[tid] = (T)num_threads * sdata[tid]; + __syncthreads(); - // write data to global memory - g_odata[tid] = sdata[tid]; + // write data to global memory + g_odata[tid] = sdata[tid]; } //////////////////////////////////////////////////////////////////////////////// // declaration, forward -template <class T> -void runTest(int argc, char **argv, int len); +template <class T> void runTest(int argc, char **argv, int len); -template <class T> -void computeGold(T *reference, T *idata, const unsigned int len) { - const T T_len = static_cast<T>(len); +template <class T> void computeGold(T *reference, T *idata, const unsigned int len) +{ + const T T_len = static_cast<T>(len); - for (unsigned int i = 0; i < len; ++i) { - reference[i] = idata[i] * T_len; - } + for (unsigned int i = 0; i < len; ++i) { + reference[i] = idata[i] * T_len; + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("> runTest<float,32>\n"); - runTest<float>(argc, argv, 32); - printf("> runTest<int,64>\n"); - runTest<int>(argc, argv, 64); +int main(int argc, char **argv) +{ + printf("> runTest<float,32>\n"); + runTest<float>(argc, argv, 32); + printf("> runTest<int,64>\n"); + runTest<int>(argc, argv, 64); - printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures); + printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures); - exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } // To completely templatize runTest (below) with cutil, we need to use @@ -114,151 +114,152 @@ int main(int argc, char **argv) { // functions for different types. 
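The wrappers that follow make that dispatch transparent to a fully templated caller: instantiating a wrapper with T selects the matching specialization at compile time. A short sketch of the idea (checkResult is an illustrative caller, not part of this sample; ArrayComparator and compareData are the ones defined below):

    template <class T> bool checkResult(const T *reference, T *data, unsigned int len)
    {
        ArrayComparator<T> comparator; // resolves to the <int> or <float> specialization
        return comparator.compare(reference, data, len);
    }

    // checkResult<int>(...)   uses compareData(..., 0.15f, 0.0f);
    // checkResult<float>(...) uses compareData(..., 0.15f, 0.15f);
    // any other T falls back to the generic template, which reports an error at run time.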
// Here's the generic wrapper for cutCompare* -template <class T> -class ArrayComparator { - public: - bool compare(const T *reference, T *data, unsigned int len) { - fprintf(stderr, - "Error: no comparison function implemented for this type\n"); - return false; - } +template <class T> class ArrayComparator +{ +public: + bool compare(const T *reference, T *data, unsigned int len) + { + fprintf(stderr, "Error: no comparison function implemented for this type\n"); + return false; + } }; // Here's the specialization for ints: -template <> -class ArrayComparator<int> { - public: - bool compare(const int *reference, int *data, unsigned int len) { - return compareData(reference, data, len, 0.15f, 0.0f); - } +template <> class ArrayComparator<int> +{ +public: + bool compare(const int *reference, int *data, unsigned int len) + { + return compareData(reference, data, len, 0.15f, 0.0f); + } }; // Here's the specialization for floats: -template <> -class ArrayComparator<float> { - public: - bool compare(const float *reference, float *data, unsigned int len) { - return compareData(reference, data, len, 0.15f, 0.15f); - } +template <> class ArrayComparator<float> +{ +public: + bool compare(const float *reference, float *data, unsigned int len) + { + return compareData(reference, data, len, 0.15f, 0.15f); + } }; // Here's the generic wrapper for cutWriteFile* -template <class T> -class ArrayFileWriter { - public: - bool write(const char *filename, T *data, unsigned int len, float epsilon) { - fprintf(stderr, - "Error: no file write function implemented for this type\n"); - return false; - } +template <class T> class ArrayFileWriter +{ +public: + bool write(const char *filename, T *data, unsigned int len, float epsilon) + { + fprintf(stderr, "Error: no file write function implemented for this type\n"); + return false; + } }; // Here's the specialization for ints: -template <> -class ArrayFileWriter<int> { - public: - bool write(const char *filename, int *data, unsigned int len, float epsilon) { - return sdkWriteFile(filename, data, len, epsilon, false); - } +template <> class ArrayFileWriter<int> +{ +public: + bool write(const char *filename, int *data, unsigned int len, float epsilon) + { + return sdkWriteFile(filename, data, len, epsilon, false); + } }; // Here's the specialization for floats: -template <> -class ArrayFileWriter<float> { - public: - bool write(const char *filename, float *data, unsigned int len, - float epsilon) { - return sdkWriteFile(filename, data, len, epsilon, false); - } +template <> class ArrayFileWriter<float> +{ +public: + bool write(const char *filename, float *data, unsigned int len, float epsilon) + { + return sdkWriteFile(filename, data, len, epsilon, false); + } }; //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -template -void runTest(int argc, char **argv, int len) { - int devID; - cudaDeviceProp deviceProps; +template void runTest(int argc, char **argv, int len) +{ + int devID; + cudaDeviceProp deviceProps; - devID = findCudaDevice(argc, (const char **)argv); + devID = findCudaDevice(argc, (const char **)argv); - // get number of SMs on this GPU - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, - deviceProps.multiProcessorCount); + // get number of SMs on this GPU + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); - // create and start timer - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + // create and start timer + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // start the timer - sdkStartTimer(&timer); + // start the timer + sdkStartTimer(&timer); - unsigned int num_threads = len; - unsigned int mem_size = sizeof(float) * num_threads; + unsigned int num_threads = len; + unsigned int mem_size = sizeof(float) * num_threads; - // allocate host memory - T *h_idata = (T *)malloc(mem_size); + // allocate host memory + T *h_idata = (T *)malloc(mem_size); - // initialize the memory - for (unsigned int i = 0; i < num_threads; ++i) { - h_idata[i] = (T)i; - } + // initialize the memory + for (unsigned int i = 0; i < num_threads; ++i) { + h_idata[i] = (T)i; + } - // allocate device memory - T *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); - // copy host memory to device - checkCudaErrors( - cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); + // allocate device memory + T *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); + // copy host memory to device + checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); - // allocate device memory for result - T *d_odata; - checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); + // allocate device memory for result + T *d_odata; + checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); - // setup execution parameters - dim3 grid(1, 1, 1); - dim3 threads(num_threads, 1, 1); + // setup execution parameters + dim3 grid(1, 1, 1); + dim3 threads(num_threads, 1, 1); - // execute the kernel - testKernel<<>>(d_idata, d_odata); + // execute the kernel + testKernel<<>>(d_idata, d_odata); - // check if kernel execution generated and error - getLastCudaError("Kernel execution failed"); + // check if kernel execution generated and error + getLastCudaError("Kernel execution failed"); - // allocate mem for the result on host side - T *h_odata = (T *)malloc(mem_size); - // copy result from device to host - checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, - cudaMemcpyDeviceToHost)); + // allocate mem for the result on host side + T *h_odata = (T *)malloc(mem_size); + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost)); - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); - // compute reference solution - T *reference = (T *)malloc(mem_size); - computeGold(reference, h_idata, 
num_threads); + // compute reference solution + T *reference = (T *)malloc(mem_size); + computeGold(reference, h_idata, num_threads); - ArrayComparator comparator; - ArrayFileWriter writer; + ArrayComparator comparator; + ArrayFileWriter writer; - // check result - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // write file for regression test - writer.write("./data/regression.dat", h_odata, num_threads, 0.0f); - } else { - // custom output handling when no regression test running - // in this case check if the result is equivalent to the expected solution - bool res = comparator.compare(reference, h_odata, num_threads); - printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH"); - g_TotalFailures += (1 != res); - } + // check result + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // write file for regression test + writer.write("./data/regression.dat", h_odata, num_threads, 0.0f); + } + else { + // custom output handling when no regression test running + // in this case check if the result is equivalent to the expected solution + bool res = comparator.compare(reference, h_odata, num_threads); + printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH"); + g_TotalFailures += (1 != res); + } - // cleanup memory - free(h_idata); - free(h_odata); - free(reference); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); + // cleanup memory + free(h_idata); + free(h_odata); + free(reference); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); } diff --git a/Samples/0_Introduction/simpleTexture/simpleTexture.cu b/Samples/0_Introduction/simpleTexture/simpleTexture.cu index 7f755f45..6b418cef 100644 --- a/Samples/0_Introduction/simpleTexture/simpleTexture.cu +++ b/Samples/0_Introduction/simpleTexture/simpleTexture.cu @@ -34,10 +34,10 @@ */ // Includes, system -#include -#include -#include #include +#include +#include +#include #ifdef _WIN32 #define WINDOWS_LEAN_AND_MEAN @@ -49,22 +49,22 @@ #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check +#include // helper functions for CUDA error check #define MAX_EPSILON_ERROR 5e-3f // Define the files that are to be save and the reference images for validation const char *imageFilename = "teapot512.pgm"; -const char *refFilename = "ref_rotated.pgm"; +const char *refFilename = "ref_rotated.pgm"; const char *sampleName = "simpleTexture"; //////////////////////////////////////////////////////////////////////////////// // Constants -const float angle = 0.5f; // angle to rotate image by (in radians) +const float angle = 0.5f; // angle to rotate image by (in radians) // Auto-Verification Code bool testResult = true; @@ -73,22 +73,22 @@ bool testResult = true; //! Transform an image using texture lookups //! 
@param outputData output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void transformKernel(float *outputData, int width, int height, - float theta, cudaTextureObject_t tex) { - // calculate normalized texture coordinates - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex) +{ + // calculate normalized texture coordinates + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - float u = (float)x - (float)width / 2; - float v = (float)y - (float)height / 2; - float tu = u * cosf(theta) - v * sinf(theta); - float tv = v * cosf(theta) + u * sinf(theta); + float u = (float)x - (float)width / 2; + float v = (float)y - (float)height / 2; + float tu = u * cosf(theta) - v * sinf(theta); + float tv = v * cosf(theta) + u * sinf(theta); - tu /= (float)width; - tv /= (float)height; + tu /= (float)width; + tv /= (float)height; - // read from texture and write to global memory - outputData[y * width + x] = tex2D(tex, tu + 0.5f, tv + 0.5f); + // read from texture and write to global memory + outputData[y * width + x] = tex2D(tex, tu + 0.5f, tv + 0.5f); } //////////////////////////////////////////////////////////////////////////////// @@ -98,154 +98,151 @@ void runTest(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s starting...\n", sampleName); +int main(int argc, char **argv) +{ + printf("%s starting...\n", sampleName); - // Process command-line arguments - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", - (char **)&imageFilename); + // Process command-line arguments + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename); - if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { - getCmdLineArgumentString(argc, (const char **)argv, "reference", - (char **)&refFilename); - } else { - printf("-input flag should be used with -reference flag"); - exit(EXIT_FAILURE); - } - } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { - printf("-reference flag should be used with -input flag"); - exit(EXIT_FAILURE); + if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { + getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename); + } + else { + printf("-input flag should be used with -reference flag"); + exit(EXIT_FAILURE); + } + } + else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { + printf("-reference flag should be used with -input flag"); + exit(EXIT_FAILURE); + } } - } - runTest(argc, argv); + runTest(argc, argv); - printf("%s completed, returned %s\n", sampleName, - testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - int devID = findCudaDevice(argc, (const char **)argv); +void runTest(int argc, char **argv) +{ + int devID = findCudaDevice(argc, (const char **)argv); - // load image from disk - float *hData = NULL; - unsigned int width, height; - char *imagePath = sdkFindFilePath(imageFilename, argv[0]); + // load image from disk + float *hData = NULL; + unsigned int width, height; + char *imagePath = sdkFindFilePath(imageFilename, argv[0]); - if (imagePath == NULL) { - printf("Unable to source image file: %s\n", imageFilename); - exit(EXIT_FAILURE); - } + if (imagePath == NULL) { + printf("Unable to source image file: %s\n", imageFilename); + exit(EXIT_FAILURE); + } - sdkLoadPGM(imagePath, &hData, &width, &height); + sdkLoadPGM(imagePath, &hData, &width, &height); - unsigned int size = width * height * sizeof(float); - printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); + unsigned int size = width * height * sizeof(float); + printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); - // Load reference image from image (output) - float *hDataRef = (float *)malloc(size); - char *refPath = sdkFindFilePath(refFilename, argv[0]); + // Load reference image from image (output) + float *hDataRef = (float *)malloc(size); + char *refPath = sdkFindFilePath(refFilename, argv[0]); - if (refPath == NULL) { - printf("Unable to find reference image file: %s\n", refFilename); - exit(EXIT_FAILURE); - } + if (refPath == NULL) { + printf("Unable to find reference image file: %s\n", refFilename); + exit(EXIT_FAILURE); + } - sdkLoadPGM(refPath, &hDataRef, &width, &height); + sdkLoadPGM(refPath, &hDataRef, &width, &height); - // Allocate device memory for result - float *dData = NULL; - checkCudaErrors(cudaMalloc((void **)&dData, size)); + // Allocate device memory for result + float *dData = NULL; + checkCudaErrors(cudaMalloc((void **)&dData, size)); - // Allocate array and copy image data - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); - cudaArray *cuArray; - checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height)); - checkCudaErrors( - cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice)); + // Allocate array and copy image data + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaArray *cuArray; + checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height)); + checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice)); - cudaTextureObject_t tex; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t tex; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = cuArray; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = cuArray; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = 
cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); - dim3 dimBlock(8, 8, 1); - dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); + dim3 dimBlock(8, 8, 1); + dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); - // Warmup - transformKernel<<>>(dData, width, height, angle, tex); + // Warmup + transformKernel<<>>(dData, width, height, angle, tex); - checkCudaErrors(cudaDeviceSynchronize()); - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + checkCudaErrors(cudaDeviceSynchronize()); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - // Execute the kernel - transformKernel<<>>(dData, width, height, angle, tex); + // Execute the kernel + transformKernel<<>>(dData, width, height, angle, tex); - // Check if kernel execution generated an error - getLastCudaError("Kernel execution failed"); + // Check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - printf("%.2f Mpixels/sec\n", - (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); - sdkDeleteTimer(&timer); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + sdkDeleteTimer(&timer); - // Allocate mem for the result on host side - float *hOutputData = (float *)malloc(size); - // copy result from device to host - checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost)); + // Allocate mem for the result on host side + float *hOutputData = (float *)malloc(size); + // copy result from device to host + checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost)); - // Write result to file - char outputFilename[1024]; - strcpy(outputFilename, imagePath); - strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm"); - sdkSavePGM(outputFilename, hOutputData, width, height); - printf("Wrote '%s'\n", outputFilename); + // Write result to file + char outputFilename[1024]; + strcpy(outputFilename, imagePath); + strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm"); + sdkSavePGM(outputFilename, hOutputData, width, height); + printf("Wrote '%s'\n", outputFilename); - // Write regression file if necessary - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // Write file for regression test - sdkWriteFile("./data/regression.dat", hOutputData, width * height, - 0.0f, false); - } else { - // We need to reload the data from disk, - // because it is inverted upon output - sdkLoadPGM(outputFilename, &hOutputData, &width, &height); + // Write regression file if necessary + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // Write file for regression test + sdkWriteFile("./data/regression.dat", hOutputData, width * height, 0.0f, false); + } + else { + // We need to reload the data from disk, + // because it is inverted upon output + sdkLoadPGM(outputFilename, &hOutputData, &width, &height); - printf("Comparing files\n"); - printf("\toutput: <%s>\n", outputFilename); - printf("\treference: <%s>\n", refPath); + printf("Comparing files\n"); + 
printf("\toutput: <%s>\n", outputFilename); + printf("\treference: <%s>\n", refPath); - testResult = compareData(hOutputData, hDataRef, width * height, - MAX_EPSILON_ERROR, 0.15f); - } + testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f); + } - checkCudaErrors(cudaDestroyTextureObject(tex)); - checkCudaErrors(cudaFree(dData)); - checkCudaErrors(cudaFreeArray(cuArray)); - free(imagePath); - free(refPath); + checkCudaErrors(cudaDestroyTextureObject(tex)); + checkCudaErrors(cudaFree(dData)); + checkCudaErrors(cudaFreeArray(cuArray)); + free(imagePath); + free(refPath); } diff --git a/Samples/0_Introduction/simpleTexture3D/README.md b/Samples/0_Introduction/simpleTexture3D/README.md index 235f1276..141b5143 100644 --- a/Samples/0_Introduction/simpleTexture3D/README.md +++ b/Samples/0_Introduction/simpleTexture3D/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/0_Introduction/simpleTexture3D/simpleTexture3D.cpp b/Samples/0_Introduction/simpleTexture3D/simpleTexture3D.cpp index 24120ba8..6ee4a6a3 100644 --- a/Samples/0_Introduction/simpleTexture3D/simpleTexture3D.cpp +++ b/Samples/0_Introduction/simpleTexture3D/simpleTexture3D.cpp @@ -32,11 +32,11 @@ using 3D texture lookups. */ -#include -#include -#include -#include #include +#include +#include +#include +#include #if defined(__APPLE__) || defined(MACOSX) #pragma clang diagnostic ignored "-Wdeprecated-declarations" @@ -49,53 +49,52 @@ #endif // includes, cuda -#include -#include #include +#include +#include // CUDA utilities and system includes #include #include #include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; #define MAX_EPSILON_ERROR 5.0f -#define THRESHOLD 0.15f +#define THRESHOLD 0.15f const char *sSDKsample = "simpleTexture3D"; -const char *volumeFilename = "Bucky.raw"; -const cudaExtent volumeSize = make_cudaExtent(32, 32, 32); +const char *volumeFilename = "Bucky.raw"; +const cudaExtent volumeSize = make_cudaExtent(32, 32, 32); const uint width = 512, height = 512; const dim3 blockSize(16, 16, 1); const dim3 gridSize(width / blockSize.x, height / blockSize.y); -float w = 0.5; // texture coordinate in z +float w = 0.5; // texture coordinate in z -GLuint pbo; // OpenGL pixel buffer object -struct cudaGraphicsResource - *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) +GLuint pbo; // OpenGL pixel buffer object +struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) bool linearFiltering = true; -bool animate = true; +bool animate = true; StopWatchInterface *timer = NULL; uint *d_output = NULL; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; volatile int g_GraphicsMapFlag = 0; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #ifndef MAX @@ -105,288 +104,294 @@ char **pArgv = NULL; extern "C" void cleanup(); extern "C" void setTextureFilterMode(bool bLinearFilter); extern "C" void initCuda(const uchar *h_volume, 
cudaExtent volumeSize); -extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, - uint imageW, uint imageH, float w); -extern void cleanupCuda(); +extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w); +extern void cleanupCuda(); void loadVolumeData(char *exec_path); -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = ftoi(MAX(1.0f, ifps)); - sdkResetTimer(&timer); - } + fpsLimit = ftoi(MAX(1.0f, ifps)); + sdkResetTimer(&timer); + } } // render image using CUDA -void render() { - // map PBO to get CUDA device pointer - g_GraphicsMapFlag++; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_output, &num_bytes, cuda_pbo_resource)); - // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); +void render() +{ + // map PBO to get CUDA device pointer + g_GraphicsMapFlag++; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); + // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); - // call CUDA kernel, writing results to PBO - render_kernel(gridSize, blockSize, d_output, width, height, w); + // call CUDA kernel, writing results to PBO + render_kernel(gridSize, blockSize, d_output, width, height, w); - getLastCudaError("render_kernel failed"); + getLastCudaError("render_kernel failed"); - if (g_GraphicsMapFlag) { - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - g_GraphicsMapFlag--; - } + if (g_GraphicsMapFlag) { + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + g_GraphicsMapFlag--; + } } // display results using OpenGL (called by GLUT) -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - render(); + render(); - // display results - glClear(GL_COLOR_BUFFER_BIT); + // display results + glClear(GL_COLOR_BUFFER_BIT); - // draw image from PBO - glDisable(GL_DEPTH_TEST); - glRasterPos2i(0, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // draw image from PBO + glDisable(GL_DEPTH_TEST); + glRasterPos2i(0, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - glutSwapBuffers(); - glutReportErrors(); + glutSwapBuffers(); + glutReportErrors(); - sdkStopTimer(&timer); - computeFPS(); + sdkStopTimer(&timer); + computeFPS(); } -void idle() { - if (animate) { - w += 0.01f; - glutPostRedisplay(); - } +void idle() +{ + if (animate) { + w += 0.01f; + glutPostRedisplay(); + } } -void keyboard(unsigned char key, int x, int y) { - switch (key) { +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); - 
glutDestroyWindow(glutGetWindow()); - return; + exit(EXIT_SUCCESS); + glutDestroyWindow(glutGetWindow()); + return; #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif case '=': case '+': - w += 0.01f; - break; + w += 0.01f; + break; case '-': - w -= 0.01f; - break; + w -= 0.01f; + break; case 'f': - linearFiltering = !linearFiltering; - setTextureFilterMode(linearFiltering); - break; + linearFiltering = !linearFiltering; + setTextureFilterMode(linearFiltering); + break; case ' ': - animate = !animate; - break; + animate = !animate; + break; default: - break; - } + break; + } - glutPostRedisplay(); + glutPostRedisplay(); } -void reshape(int x, int y) { - glViewport(0, 0, x, y); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } -void cleanup() { - sdkDeleteTimer(&timer); +void cleanup() +{ + sdkDeleteTimer(&timer); - // add extra check to unmap the resource before unregistering it - if (g_GraphicsMapFlag) { - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - g_GraphicsMapFlag--; - } + // add extra check to unmap the resource before unregistering it + if (g_GraphicsMapFlag) { + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + g_GraphicsMapFlag--; + } - // unregister this buffer object from CUDA C - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); - glDeleteBuffers(1, &pbo); - cleanupCuda(); + // unregister this buffer object from CUDA C + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); + glDeleteBuffers(1, &pbo); + cleanupCuda(); } -void initGLBuffers() { - // create pixel buffer object - glGenBuffers(1, &pbo); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, - 0, GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); +void initGLBuffers() +{ + // create pixel buffer object + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); } // Load raw data from disk -uchar *loadRawFile(const char *filename, size_t size) { - FILE *fp = fopen(filename, "rb"); +uchar *loadRawFile(const char *filename, size_t size) +{ + FILE *fp = fopen(filename, "rb"); - if (!fp) { - fprintf(stderr, "Error opening file '%s'\n", filename); - return 0; - } + if (!fp) { + fprintf(stderr, "Error opening file '%s'\n", filename); + return 0; + } - uchar *data = (uchar *)malloc(size); - size_t read = fread(data, 1, size, fp); - fclose(fp); + uchar *data = (uchar *)malloc(size); + size_t read = fread(data, 1, size, fp); + fclose(fp); - printf("Read '%s', %zu bytes\n", filename, read); + printf("Read '%s', %zu bytes\n", filename, read); - return data; + return data; } -void initGL(int *argc, char **argv) { - // initialize GLUT callback 
functions - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); - glutInitWindowSize(width, height); - glutCreateWindow("CUDA 3D texture"); - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - glutIdleFunc(idle); +void initGL(int *argc, char **argv) +{ + // initialize GLUT callback functions + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow("CUDA 3D texture"); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutIdleFunc(idle); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Required OpenGL extensions are missing."); - exit(EXIT_FAILURE); - } + if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions are missing."); + exit(EXIT_FAILURE); + } } -void runAutoTest(const char *ref_file, char *exec_path) { - checkCudaErrors( - cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4)); +void runAutoTest(const char *ref_file, char *exec_path) +{ + checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4)); - // render the volumeData - render_kernel(gridSize, blockSize, d_output, width, height, w); + // render the volumeData + render_kernel(gridSize, blockSize, d_output, width, height, w); - checkCudaErrors(cudaDeviceSynchronize()); - getLastCudaError("render_kernel failed"); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("render_kernel failed"); - void *h_output = malloc(width * height * sizeof(GLubyte) * 4); - checkCudaErrors(cudaMemcpy(h_output, d_output, - width * height * sizeof(GLubyte) * 4, - cudaMemcpyDeviceToHost)); - sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, - "simpleTexture3D.bin"); + void *h_output = malloc(width * height * sizeof(GLubyte) * 4); + checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost)); + sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin"); - bool bTestResult = sdkCompareBin2BinFloat( - "simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), - width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); + bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin", + sdkFindFilePath(ref_file, exec_path), + width * height, + MAX_EPSILON_ERROR, + THRESHOLD, + exec_path); - checkCudaErrors(cudaFree(d_output)); - free(h_output); + checkCudaErrors(cudaFree(d_output)); + free(h_output); - sdkStopTimer(&timer); - sdkDeleteTimer(&timer); + sdkStopTimer(&timer); + sdkDeleteTimer(&timer); - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } -void loadVolumeData(char *exec_path) { - // load volume data - const char *path = sdkFindFilePath(volumeFilename, exec_path); +void loadVolumeData(char *exec_path) +{ + // load volume data + const char *path = sdkFindFilePath(volumeFilename, exec_path); - if (path == NULL) { - fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", - volumeFilename); - exit(EXIT_FAILURE); - } + if (path == NULL) { + fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename); + exit(EXIT_FAILURE); + } - size_t size = volumeSize.width * volumeSize.height * volumeSize.depth; - uchar *h_volume = loadRawFile(path, size); + size_t size = volumeSize.width * volumeSize.height * volumeSize.depth; + uchar *h_volume = loadRawFile(path, size); - initCuda(h_volume, volumeSize); - sdkCreateTimer(&timer); + initCuda(h_volume, volumeSize); + sdkCreateTimer(&timer); - free(h_volume); + free(h_volume); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; - char *ref_file = NULL; + char *ref_file = NULL; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - fpsLimit = frameCheckNumber; - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + fpsLimit = frameCheckNumber; + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + } - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - if (ref_file) { - loadVolumeData(argv[0]); - runAutoTest(ref_file, argv[0]); - } else { - initGL(&argc, argv); + if (ref_file) { + loadVolumeData(argv[0]); + runAutoTest(ref_file, argv[0]); + } + else { + initGL(&argc, argv); - // OpenGL buffers - initGLBuffers(); + // OpenGL buffers + initGLBuffers(); - loadVolumeData(argv[0]); - } + loadVolumeData(argv[0]); + } - printf( - "Press space to toggle animation\n" - "Press '+' and '-' to change displayed slice\n"); + printf("Press space to toggle animation\n" + "Press '+' and '-' to change displayed slice\n"); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - glutMainLoop(); + glutMainLoop(); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } diff --git a/Samples/0_Introduction/simpleTexture3D/simpleTexture3D_kernel.cu b/Samples/0_Introduction/simpleTexture3D/simpleTexture3D_kernel.cu index 06894be9..a290772f 100644 --- a/Samples/0_Introduction/simpleTexture3D/simpleTexture3D_kernel.cu +++ b/Samples/0_Introduction/simpleTexture3D/simpleTexture3D_kernel.cu @@ -28,111 +28,111 @@ #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ #define _SIMPLETEXTURE3D_KERNEL_CU_ -#include -#include -#include -#include - #include #include +#include +#include +#include +#include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; -cudaArray *d_volumeArray = 0; -cudaTextureObject_t tex; // 3D 
texture
+cudaArray *d_volumeArray = 0;
+cudaTextureObject_t tex; // 3D texture
-__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
-                         cudaTextureObject_t texObj) {
-  uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-  uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
+__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
+{
+    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
-  float u = x / (float)imageW;
-  float v = y / (float)imageH;
-  // read from 3D texture
-  float voxel = tex3D<float>(texObj, u, v, w);
+    float u = x / (float)imageW;
+    float v = y / (float)imageH;
+    // read from 3D texture
+    float voxel = tex3D<float>(texObj, u, v, w);
-  if ((x < imageW) && (y < imageH)) {
-    // write output color
-    uint i = __umul24(y, imageW) + x;
-    d_output[i] = voxel * 255;
-  }
+    if ((x < imageW) && (y < imageH)) {
+        // write output color
+        uint i = __umul24(y, imageW) + x;
+        d_output[i] = voxel * 255;
+    }
 }
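// ---------------------------------------------------------------------------
// [Editor's example] d_render above multiplies the fetched voxel by 255 because
// the texture is created (below) with cudaReadModeNormalizedFloat: each 8-bit
// voxel comes back as a float in [0, 1]. A minimal sketch of that mapping; the
// function name is hypothetical, for illustration only:
__device__ float normalizedReadEquivalent(uchar rawVoxel)
{
    // tex3D<float> on a uchar array with cudaReadModeNormalizedFloat returns
    // approximately rawVoxel / 255.0f (plus any filtering between voxels).
    return rawVoxel / 255.0f;
}
// ---------------------------------------------------------------------------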
-extern "C" void setTextureFilterMode(bool bLinearFilter) {
-  if (tex) {
-    checkCudaErrors(cudaDestroyTextureObject(tex));
-  }
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+extern "C" void setTextureFilterMode(bool bLinearFilter)
+{
+    if (tex) {
+        checkCudaErrors(cudaDestroyTextureObject(tex));
+    }
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));
-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = d_volumeArray;
+    texRes.resType = cudaResourceTypeArray;
+    texRes.res.array.array = d_volumeArray;
-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));
-  texDescr.normalizedCoords = true;
-  texDescr.filterMode =
-      bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
-  ;
-  texDescr.addressMode[0] = cudaAddressModeWrap;
-  texDescr.addressMode[1] = cudaAddressModeWrap;
-  texDescr.addressMode[2] = cudaAddressModeWrap;
-  texDescr.readMode = cudaReadModeNormalizedFloat;
+    texDescr.normalizedCoords = true;
+    texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
+    ;
+    texDescr.addressMode[0] = cudaAddressModeWrap;
+    texDescr.addressMode[1] = cudaAddressModeWrap;
+    texDescr.addressMode[2] = cudaAddressModeWrap;
+    texDescr.readMode = cudaReadModeNormalizedFloat;
-  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
 }
-extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
-  // create 3D array
-  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
-  checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
+extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
+{
+    // create 3D array
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
+    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
-  // copy data to 3D array
-  cudaMemcpy3DParms copyParams = {0};
-  copyParams.srcPtr =
-      make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar),
-                          volumeSize.width, volumeSize.height);
-  copyParams.dstArray = d_volumeArray;
-  copyParams.extent = volumeSize;
-  copyParams.kind = cudaMemcpyHostToDevice;
-  checkCudaErrors(cudaMemcpy3D(&copyParams));
+    // copy data to 3D array
+    cudaMemcpy3DParms copyParams = {0};
+    copyParams.srcPtr =
+        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
+    copyParams.dstArray = d_volumeArray;
+    copyParams.extent = volumeSize;
+    copyParams.kind = cudaMemcpyHostToDevice;
+    checkCudaErrors(cudaMemcpy3D(&copyParams));
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));
-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = d_volumeArray;
+    texRes.resType = cudaResourceTypeArray;
+    texRes.res.array.array = d_volumeArray;
-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));
-  // access with normalized texture coordinates
-  texDescr.normalizedCoords = true;
-  // linear interpolation
-  texDescr.filterMode = cudaFilterModeLinear;
-  // wrap texture coordinates
-  texDescr.addressMode[0] = cudaAddressModeWrap;
-  texDescr.addressMode[1] = cudaAddressModeWrap;
-  texDescr.addressMode[2] = cudaAddressModeWrap;
-  texDescr.readMode = cudaReadModeNormalizedFloat;
+    // access with normalized texture coordinates
+    texDescr.normalizedCoords = true;
+    // linear interpolation
+    texDescr.filterMode = cudaFilterModeLinear;
+    // wrap texture coordinates
+    texDescr.addressMode[0] = cudaAddressModeWrap;
+    texDescr.addressMode[1] = cudaAddressModeWrap;
+    texDescr.addressMode[2] = cudaAddressModeWrap;
+    texDescr.readMode = cudaReadModeNormalizedFloat;
-  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
 }
-extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
-                              uint imageW, uint imageH, float w) {
-  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
+extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
+{
+    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
 }
-void cleanupCuda() {
-  if (tex) {
-
checkCudaErrors(cudaDestroyTextureObject(tex)); + } + if (d_volumeArray) { + checkCudaErrors(cudaFreeArray(d_volumeArray)); + } } -#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ +#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ diff --git a/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp b/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp index 70e092e8..322b7eb5 100644 --- a/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp +++ b/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp @@ -26,29 +26,29 @@ */ /* -* This sample demonstrates how use texture fetches in CUDA -* -* This sample takes an input PGM image (image_filename) and generates -* an output PGM image (image_filename_out). This CUDA kernel performs -* a simple 2D transform (rotation) on the texture coordinates (u,v). -* The results between simpleTexture and simpleTextureDrv are identical. -* The main difference is the implementation. simpleTextureDrv makes calls -* to the CUDA driver API and demonstrates how to use cuModuleLoad to load -* the CUDA ptx (*.ptx) kernel just prior to kernel launch. -* -*/ + * This sample demonstrates how use texture fetches in CUDA + * + * This sample takes an input PGM image (image_filename) and generates + * an output PGM image (image_filename_out). This CUDA kernel performs + * a simple 2D transform (rotation) on the texture coordinates (u,v). + * The results between simpleTexture and simpleTextureDrv are identical. + * The main difference is the implementation. simpleTextureDrv makes calls + * to the CUDA driver API and demonstrates how to use cuModuleLoad to load + * the CUDA ptx (*.ptx) kernel just prior to kernel launch. + * + */ // includes, system -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include // includes, CUDA -#include #include +#include // includes, project #include #include @@ -56,8 +56,8 @@ using namespace std; const char *image_filename = "teapot512.pgm"; -const char *ref_filename = "ref_rotated.pgm"; -float angle = 0.5f; // angle to rotate image by (in radians) +const char *ref_filename = "ref_rotated.pgm"; +float angle = 0.5f; // angle to rotate image by (in radians) #define MIN_EPSILON_ERROR 5e-3f @@ -65,8 +65,7 @@ float angle = 0.5f; // angle to rotate image by (in radians) // declaration, forward void runTest(int argc, char **argv); -extern "C" void computeGold(float *reference, float *idata, - const unsigned int len); +extern "C" void computeGold(float *reference, float *idata, const unsigned int len); static CUresult initCUDA(int argc, char **argv, CUfunction *); @@ -80,212 +79,227 @@ const char *sSDKsample = "simpleTextureDrv (Driver API)"; //////////////////////////////////////////////////////////////////////////////// // Globals //////////////////////////////////////////////////////////////////////////////// -CUdevice cuDevice; +CUdevice cuDevice; CUcontext cuContext; -CUmodule cuModule; +CUmodule cuModule; -void showHelp() { - printf("\n> [%s] Command line options\n", sSDKsample); - printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n"); +void showHelp() +{ + printf("\n> [%s] Command line options\n", sSDKsample); + printf("\t-device=n (where n=0,1,2.... 
for the GPU device)\n\n");
 }

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
-    showHelp();
-    return 0;
-  }
+int main(int argc, char **argv)
+{
+    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+        showHelp();
+        return 0;
+    }
-  runTest(argc, argv);
+    runTest(argc, argv);
 }

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
-  bool bTestResults = true;
+void runTest(int argc, char **argv)
+{
+    bool bTestResults = true;
-  // initialize CUDA
-  CUfunction transform = NULL;
+    // initialize CUDA
+    CUfunction transform = NULL;
-  if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
-    exit(EXIT_FAILURE);
-  }
+    if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
+        exit(EXIT_FAILURE);
+    }
-  // load image from disk
-  float *h_data = NULL;
-  unsigned int width, height;
-  char *image_path = sdkFindFilePath(image_filename, argv[0]);
+    // load image from disk
+    float *h_data = NULL;
+    unsigned int width, height;
+    char *image_path = sdkFindFilePath(image_filename, argv[0]);
-  if (image_path == NULL) {
-    printf("Unable to find image file: '%s'\n", image_filename);
-    exit(EXIT_FAILURE);
-  }
+    if (image_path == NULL) {
+        printf("Unable to find image file: '%s'\n", image_filename);
+        exit(EXIT_FAILURE);
+    }
-  sdkLoadPGM(image_path, &h_data, &width, &height);
+    sdkLoadPGM(image_path, &h_data, &width, &height);
-  size_t size = width * height * sizeof(float);
-  printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);
+    size_t size = width * height * sizeof(float);
+    printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);
-  // load reference image from image (output)
-  float *h_data_ref = (float *)malloc(size);
-  char *ref_path = sdkFindFilePath(ref_filename, argv[0]);
+    // load reference image from image (output)
+    float *h_data_ref = (float *)malloc(size);
+    char *ref_path = sdkFindFilePath(ref_filename, argv[0]);
-  if (ref_path == NULL) {
-    printf("Unable to find reference file %s\n", ref_filename);
-    exit(EXIT_FAILURE);
-  }
+    if (ref_path == NULL) {
+        printf("Unable to find reference file %s\n", ref_filename);
+        exit(EXIT_FAILURE);
+    }
-  sdkLoadPGM(ref_path, &h_data_ref, &width, &height);
+    sdkLoadPGM(ref_path, &h_data_ref, &width, &height);
-  // allocate device memory for result
-  CUdeviceptr d_data = (CUdeviceptr)NULL;
-  checkCudaErrors(cuMemAlloc(&d_data, size));
+    // allocate device memory for result
+    CUdeviceptr d_data = (CUdeviceptr)NULL;
+    checkCudaErrors(cuMemAlloc(&d_data, size));
-  // allocate array and copy image data
-  CUarray cu_array;
-  CUDA_ARRAY_DESCRIPTOR desc;
-  desc.Format = CU_AD_FORMAT_FLOAT;
-  desc.NumChannels = 1;
-  desc.Width = width;
-  desc.Height = height;
-  checkCudaErrors(cuArrayCreate(&cu_array, &desc));
-  CUDA_MEMCPY2D copyParam;
-  memset(&copyParam, 0, sizeof(copyParam));
-  copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-  copyParam.dstArray = cu_array;
-  copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
-  copyParam.srcHost = h_data;
-  copyParam.srcPitch = width * sizeof(float);
-  copyParam.WidthInBytes = copyParam.srcPitch;
-  copyParam.Height = height;
-  checkCudaErrors(cuMemcpy2D(&copyParam));
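// ---------------------------------------------------------------------------
// [Editor's example] The driver-API staging above (cuArrayCreate + cuMemcpy2D)
// has a runtime-API counterpart. A minimal sketch assuming h_data, width and
// height as in runTest; the helper name is hypothetical:
void stageImageRuntimeAPI(const float *h_data, unsigned int width, unsigned int height, cudaArray_t *arrOut)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    checkCudaErrors(cudaMallocArray(arrOut, &desc, width, height));
    // source pitch and copied width are both width * sizeof(float), matching
    // copyParam.srcPitch / copyParam.WidthInBytes in the driver-API version
    checkCudaErrors(cudaMemcpy2DToArray(
        *arrOut, 0, 0, h_data, width * sizeof(float), width * sizeof(float), height, cudaMemcpyHostToDevice));
}
// ---------------------------------------------------------------------------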
+    // allocate array and copy image data
+    CUarray cu_array;
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = width;
+    desc.Height = height;
+    checkCudaErrors(cuArrayCreate(&cu_array, &desc));
+    CUDA_MEMCPY2D copyParam;
+    memset(&copyParam, 0, sizeof(copyParam));
+    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+    copyParam.dstArray = cu_array;
+    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+    copyParam.srcHost = h_data;
+    copyParam.srcPitch = width * sizeof(float);
+    copyParam.WidthInBytes = copyParam.srcPitch;
+    copyParam.Height = height;
+    checkCudaErrors(cuMemcpy2D(&copyParam));
-  // set texture parameters
-  CUtexObject TexObject;
-  CUDA_RESOURCE_DESC ResDesc;
-  memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
-  ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-  ResDesc.res.array.hArray = cu_array;
+    // set texture parameters
+    CUtexObject TexObject;
+    CUDA_RESOURCE_DESC ResDesc;
+    memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
+    ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+    ResDesc.res.array.hArray = cu_array;
-  CUDA_TEXTURE_DESC TexDesc;
-  memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
-  TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
-  TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
-  TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
-  TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
-  TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+    CUDA_TEXTURE_DESC TexDesc;
+    memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
+    TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
+    TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
+    TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
+    TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
+    TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-  checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
+    checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
-  // There are two ways to launch CUDA kernels via the Driver API.
-  // In this CUDA Sample, we illustrate both ways to pass parameters
-  // and specify parameters. By default we use the simpler method.
-  int block_size = 8;
-  StopWatchInterface *timer = NULL;
+    // There are two ways to launch CUDA kernels via the Driver API.
+    // In this CUDA Sample, we illustrate both ways to pass parameters
+    // and specify parameters. By default we use the simpler method.
+    int block_size = 8;
+    StopWatchInterface *timer = NULL;
-  if (1) {
-    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
-    // Launching (simpler method)
-    void *args[5] = {&d_data, &width, &height, &angle, &TexObject};
+    if (1) {
+        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
+        // Launching (simpler method)
+        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};
+
+        checkCudaErrors(cuLaunchKernel(
+            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
+        checkCudaErrors(cuCtxSynchronize());
+        sdkCreateTimer(&timer);
+        sdkStartTimer(&timer);
+
+        // launch kernel again for performance measurement
+        checkCudaErrors(cuLaunchKernel(
+            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
+    }
+    else {
+        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
+        // Launching (advanced method)
+        int offset = 0;
+        char argBuffer[256];
+
+        // pass in launch parameters (not actually de-referencing CUdeviceptr).
+ // CUdeviceptr is + // storing the value of the parameters + *((CUdeviceptr *)&argBuffer[offset]) = d_data; + offset += sizeof(d_data); + *((unsigned int *)&argBuffer[offset]) = width; + offset += sizeof(width); + *((unsigned int *)&argBuffer[offset]) = height; + offset += sizeof(height); + *((float *)&argBuffer[offset]) = angle; + offset += sizeof(angle); + *((CUtexObject *)&argBuffer[offset]) = TexObject; + offset += sizeof(TexObject); + + void *kernel_launch_config[5] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END}; + + // new CUDA 4.0 Driver API Kernel launch call (warmup) + checkCudaErrors(cuLaunchKernel(transform, + (width / block_size), + (height / block_size), + 1, + block_size, + block_size, + 1, + 0, + NULL, + NULL, + (void **)&kernel_launch_config)); + checkCudaErrors(cuCtxSynchronize()); + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + + // launch kernel again for performance measurement + checkCudaErrors(cuLaunchKernel(transform, + (width / block_size), + (height / block_size), + 1, + block_size, + block_size, + 1, + 0, + 0, + NULL, + (void **)&kernel_launch_config)); + } - checkCudaErrors(cuLaunchKernel(transform, (width / block_size), - (height / block_size), 1, block_size, - block_size, 1, 0, NULL, args, NULL)); checkCudaErrors(cuCtxSynchronize()); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + sdkDeleteTimer(&timer); - // launch kernel again for performance measurement - checkCudaErrors(cuLaunchKernel(transform, (width / block_size), - (height / block_size), 1, block_size, - block_size, 1, 0, NULL, args, NULL)); - } else { - // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel - // Launching (advanced method) - int offset = 0; - char argBuffer[256]; + // allocate mem for the result on host side + float *h_odata = (float *)malloc(size); + // copy result from device to host + checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size)); - // pass in launch parameters (not actually de-referencing CUdeviceptr). 
- // CUdeviceptr is - // storing the value of the parameters - *((CUdeviceptr *)&argBuffer[offset]) = d_data; - offset += sizeof(d_data); - *((unsigned int *)&argBuffer[offset]) = width; - offset += sizeof(width); - *((unsigned int *)&argBuffer[offset]) = height; - offset += sizeof(height); - *((float *)&argBuffer[offset]) = angle; - offset += sizeof(angle); - *((CUtexObject *)&argBuffer[offset]) = TexObject; - offset += sizeof(TexObject); + // write result to file + char output_filename[1024]; + strcpy(output_filename, image_path); + strcpy(output_filename + strlen(image_path) - 4, "_out.pgm"); + sdkSavePGM(output_filename, h_odata, width, height); + printf("Wrote '%s'\n", output_filename); - void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, - CU_LAUNCH_PARAM_END}; + // write regression file if necessary + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // write file for regression test + sdkWriteFile("./data/regression.dat", h_odata, width * height, 0.0f, false); + } + else { + // We need to reload the data from disk, because it is inverted upon output + sdkLoadPGM(output_filename, &h_odata, &width, &height); - // new CUDA 4.0 Driver API Kernel launch call (warmup) - checkCudaErrors(cuLaunchKernel( - transform, (width / block_size), (height / block_size), 1, block_size, - block_size, 1, 0, NULL, NULL, (void **)&kernel_launch_config)); - checkCudaErrors(cuCtxSynchronize()); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + printf("Comparing files\n"); + printf("\toutput: <%s>\n", output_filename); + printf("\treference: <%s>\n", ref_path); + bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f); + } - // launch kernel again for performance measurement - checkCudaErrors(cuLaunchKernel( - transform, (width / block_size), (height / block_size), 1, block_size, - block_size, 1, 0, 0, NULL, (void **)&kernel_launch_config)); - } + // cleanup memory + checkCudaErrors(cuTexObjectDestroy(TexObject)); + checkCudaErrors(cuMemFree(d_data)); + checkCudaErrors(cuArrayDestroy(cu_array)); - checkCudaErrors(cuCtxSynchronize()); - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - printf("%.2f Mpixels/sec\n", - (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); - sdkDeleteTimer(&timer); + free(image_path); + free(ref_path); - // allocate mem for the result on host side - float *h_odata = (float *)malloc(size); - // copy result from device to host - checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size)); + checkCudaErrors(cuCtxDestroy(cuContext)); - // write result to file - char output_filename[1024]; - strcpy(output_filename, image_path); - strcpy(output_filename + strlen(image_path) - 4, "_out.pgm"); - sdkSavePGM(output_filename, h_odata, width, height); - printf("Wrote '%s'\n", output_filename); - - // write regression file if necessary - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // write file for regression test - sdkWriteFile("./data/regression.dat", h_odata, width * height, 0.0f, - false); - } else { - // We need to reload the data from disk, because it is inverted upon output - sdkLoadPGM(output_filename, &h_odata, &width, &height); - - printf("Comparing files\n"); - printf("\toutput: <%s>\n", output_filename); - printf("\treference: <%s>\n", ref_path); - bTestResults = compareData(h_odata, h_data_ref, width * height, - MIN_EPSILON_ERROR, 0.15f); - } - - // cleanup memory - 
checkCudaErrors(cuTexObjectDestroy(TexObject)); - checkCudaErrors(cuMemFree(d_data)); - checkCudaErrors(cuArrayDestroy(cu_array)); - - free(image_path); - free(ref_path); - - checkCudaErrors(cuCtxDestroy(cuContext)); - - exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// @@ -293,45 +307,44 @@ void runTest(int argc, char **argv) { //! kernel function. After the module is loaded, cuModuleGetFunction //! retrieves the CUDA function pointer "cuFunction" //////////////////////////////////////////////////////////////////////////////// -static CUresult initCUDA(int argc, char **argv, CUfunction *transform) { - CUfunction cuFunction = 0; - int major = 0, minor = 0, devID = 0; - char deviceName[100]; - string module_path; +static CUresult initCUDA(int argc, char **argv, CUfunction *transform) +{ + CUfunction cuFunction = 0; + int major = 0, minor = 0, devID = 0; + char deviceName[100]; + string module_path; - cuDevice = findCudaDeviceDRV(argc, (const char **)argv); + cuDevice = findCudaDeviceDRV(argc, (const char **)argv); - // get compute capabilities and the devicename - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice)); - printf("> GPU Device has SM %d.%d compute capability\n", major, minor); + // get compute capabilities and the devicename + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice)); + printf("> GPU Device has SM %d.%d compute capability\n", major, minor); - checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); - // first search for the module_path before we try to load the results - std::ostringstream fatbin; + // first search for the module_path before we try to load the results + std::ostringstream fatbin; - if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { - exit(EXIT_FAILURE); - } else { - printf("> initCUDA loading module: <%s>\n", module_path.c_str()); - } + if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { + exit(EXIT_FAILURE); + } + else { + printf("> initCUDA loading module: <%s>\n", module_path.c_str()); + } - if (!fatbin.str().size()) { - printf("fatbin file empty. exiting..\n"); - exit(EXIT_FAILURE); - } + if (!fatbin.str().size()) { + printf("fatbin file empty. 
exiting..\n"); + exit(EXIT_FAILURE); + } - // Create module from binary file (FATBIN) - checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); + // Create module from binary file (FATBIN) + checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); - checkCudaErrors( - cuModuleGetFunction(&cuFunction, cuModule, "transformKernel")); + checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel")); - *transform = cuFunction; + *transform = cuFunction; - return CUDA_SUCCESS; + return CUDA_SUCCESS; } diff --git a/Samples/0_Introduction/simpleTextureDrv/simpleTexture_kernel.cu b/Samples/0_Introduction/simpleTextureDrv/simpleTexture_kernel.cu index 0b48f11f..05bdac12 100644 --- a/Samples/0_Introduction/simpleTextureDrv/simpleTexture_kernel.cu +++ b/Samples/0_Introduction/simpleTextureDrv/simpleTexture_kernel.cu @@ -33,23 +33,22 @@ //! Transform an image using texture lookups //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ void transformKernel(float *g_odata, int width, - int height, float theta, - CUtexObject tex) { - // calculate normalized texture coordinates - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex) +{ + // calculate normalized texture coordinates + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - float u = (float)x - (float)width / 2; - float v = (float)y - (float)height / 2; - float tu = u * cosf(theta) - v * sinf(theta); - float tv = v * cosf(theta) + u * sinf(theta); + float u = (float)x - (float)width / 2; + float v = (float)y - (float)height / 2; + float tu = u * cosf(theta) - v * sinf(theta); + float tv = v * cosf(theta) + u * sinf(theta); - tu /= (float)width; - tv /= (float)height; + tu /= (float)width; + tv /= (float)height; - // read from texture and write to global memory - g_odata[y * width + x] = tex2D(tex, tu + 0.5f, tv + 0.5f); + // read from texture and write to global memory + g_odata[y * width + x] = tex2D(tex, tu + 0.5f, tv + 0.5f); } -#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_ +#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_ diff --git a/Samples/0_Introduction/simpleVoteIntrinsics/simpleVoteIntrinsics.cu b/Samples/0_Introduction/simpleVoteIntrinsics/simpleVoteIntrinsics.cu index 1b781c46..ffd0d009 100644 --- a/Samples/0_Introduction/simpleVoteIntrinsics/simpleVoteIntrinsics.cu +++ b/Samples/0_Introduction/simpleVoteIntrinsics/simpleVoteIntrinsics.cu @@ -53,257 +53,237 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0"; #include "simpleVote_kernel.cuh" // Generate the test pattern for Tests 1 and 2 -void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) { - // For testing VOTE.Any (all of these threads will return 0) - for (int i = 0; i < size / 4; i++) { - VOTE_PATTERN[i] = 0x00000000; - } - - // For testing VOTE.Any (1/2 these threads will return 1) - for (int i = 2 * size / 8; i < 4 * size / 8; i++) { - VOTE_PATTERN[i] = (i & 0x01) ? i : 0; - } - - // For testing VOTE.all (1/2 of these threads will return 0) - for (int i = 2 * size / 4; i < 3 * size / 4; i++) { - VOTE_PATTERN[i] = (i & 0x01) ? 
0 : i; - } - - // For testing VOTE.all (all of these threads will return 1) - for (int i = 3 * size / 4; i < 4 * size / 4; i++) { - VOTE_PATTERN[i] = 0xffffffff; - } -} - -int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, - const char *voteType) { - int i, sum = 0; - - for (sum = 0, i = start; i < end; i++) { - sum += h_result[i]; - } - - if (sum > 0) { - printf("\t<%s>[%d - %d] = ", voteType, start, end - 1); - - for (i = start; i < end; i++) { - printf("%d", h_result[i]); +void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) +{ + // For testing VOTE.Any (all of these threads will return 0) + for (int i = 0; i < size / 4; i++) { + VOTE_PATTERN[i] = 0x00000000; } - printf("%d values FAILED\n", sum); - } - - return (sum > 0); -} - -int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, - const char *voteType) { - int i, sum = 0; - - for (sum = 0, i = start; i < end; i++) { - sum += h_result[i]; - } - - if (sum != warp_size) { - printf("\t<%s>[%d - %d] = ", voteType, start, end - 1); - - for (i = start; i < end; i++) { - printf("%d", h_result[i]); + // For testing VOTE.Any (1/2 these threads will return 1) + for (int i = 2 * size / 8; i < 4 * size / 8; i++) { + VOTE_PATTERN[i] = (i & 0x01) ? i : 0; } - printf(" - FAILED\n"); - } + // For testing VOTE.all (1/2 of these threads will return 0) + for (int i = 2 * size / 4; i < 3 * size / 4; i++) { + VOTE_PATTERN[i] = (i & 0x01) ? 0 : i; + } - return (sum != warp_size); + // For testing VOTE.all (all of these threads will return 1) + for (int i = 3 * size / 4; i < 4 * size / 4; i++) { + VOTE_PATTERN[i] = 0xffffffff; + } +} + +int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType) +{ + int i, sum = 0; + + for (sum = 0, i = start; i < end; i++) { + sum += h_result[i]; + } + + if (sum > 0) { + printf("\t<%s>[%d - %d] = ", voteType, start, end - 1); + + for (i = start; i < end; i++) { + printf("%d", h_result[i]); + } + + printf("%d values FAILED\n", sum); + } + + return (sum > 0); +} + +int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType) +{ + int i, sum = 0; + + for (sum = 0, i = start; i < end; i++) { + sum += h_result[i]; + } + + if (sum != warp_size) { + printf("\t<%s>[%d - %d] = ", voteType, start, end - 1); + + for (i = start; i < end; i++) { + printf("%d", h_result[i]); + } + + printf(" - FAILED\n"); + } + + return (sum != warp_size); } // Verification code for Kernel #1 -int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, - int warp_size) { - int error_count = 0; +int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size) +{ + int error_count = 0; - error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, - warp_size, "Vote.Any"); - error_count += - checkErrors2(h_result, VOTE_DATA_GROUP * warp_size / 4, - 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); - error_count += - checkErrors2(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, - 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); - error_count += - checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, - 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); + error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); + error_count += checkErrors2( + h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); + error_count += checkErrors2( + h_result, 2 * VOTE_DATA_GROUP * warp_size 
/ 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); + error_count += checkErrors2( + h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); - printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); - return error_count; + printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); + return error_count; } // Verification code for Kernel #2 -int checkResultsVoteAllKernel2(unsigned int *h_result, int size, - int warp_size) { - int error_count = 0; +int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size) +{ + int error_count = 0; - error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, - warp_size, "Vote.All"); - error_count += - checkErrors1(h_result, VOTE_DATA_GROUP * warp_size / 4, - 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); - error_count += - checkErrors1(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, - 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); - error_count += - checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, - 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); + error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); + error_count += checkErrors1( + h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); + error_count += checkErrors1( + h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); + error_count += checkErrors2( + h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); - printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); - return error_count; + printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); + return error_count; } // Verification code for Kernel #3 -int checkResultsVoteAnyKernel3(bool *hinfo, int size) { - int i, error_count = 0; +int checkResultsVoteAnyKernel3(bool *hinfo, int size) +{ + int i, error_count = 0; - for (i = 0; i < size * 3; i++) { - switch (i % 3) { - case 0: + for (i = 0; i < size * 3; i++) { + switch (i % 3) { + case 0: - // First warp should be all zeros. - if (hinfo[i] != (i >= size * 1)) { - error_count++; + // First warp should be all zeros. + if (hinfo[i] != (i >= size * 1)) { + error_count++; + } + + break; + + case 1: + + // First warp and half of second should be all zeros. + if (hinfo[i] != (i >= size * 3 / 2)) { + error_count++; + } + + break; + + case 2: + + // First two warps should be all zeros. + if (hinfo[i] != (i >= size * 2)) { + error_count++; + } + + break; } - - break; - - case 1: - - // First warp and half of second should be all zeros. - if (hinfo[i] != (i >= size * 3 / 2)) { - error_count++; - } - - break; - - case 2: - - // First two warps should be all zeros. - if (hinfo[i] != (i >= size * 2)) { - error_count++; - } - - break; } - } - printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); - return error_count; + printf((error_count == 0) ? 
"\tOK\n" : "\tERROR\n"); + return error_count; } -int main(int argc, char **argv) { - unsigned int *h_input, *h_result; - unsigned int *d_input, *d_result; +int main(int argc, char **argv) +{ + unsigned int *h_input, *h_result; + unsigned int *d_input, *d_result; - bool *dinfo = NULL, *hinfo = NULL; - int error_count[3] = {0, 0, 0}; + bool *dinfo = NULL, *hinfo = NULL; + int error_count[3] = {0, 0, 0}; - cudaDeviceProp deviceProp; - int devID, warp_size = 32; + cudaDeviceProp deviceProp; + int devID, warp_size = 32; - printf("%s\n", sSDKsample); + printf("%s\n", sSDKsample); - // This will pick the best possible CUDA capable device - devID = findCudaDevice(argc, (const char **)argv); + // This will pick the best possible CUDA capable device + devID = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - // Statistics about the GPU device - printf( - "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", - deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); + // Statistics about the GPU device + printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", + deviceProp.multiProcessorCount, + deviceProp.major, + deviceProp.minor); - h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * - sizeof(unsigned int)); - h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * - sizeof(unsigned int)); - checkCudaErrors( - cudaMalloc(reinterpret_cast(&d_input), - VOTE_DATA_GROUP * warp_size * sizeof(unsigned int))); - checkCudaErrors( - cudaMalloc(reinterpret_cast(&d_result), - VOTE_DATA_GROUP * warp_size * sizeof(unsigned int))); - genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size); - checkCudaErrors(cudaMemcpy(d_input, h_input, - VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), - cudaMemcpyHostToDevice)); + h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)); + h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)); + checkCudaErrors( + cudaMalloc(reinterpret_cast(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int))); + checkCudaErrors( + cudaMalloc(reinterpret_cast(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int))); + genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size); + checkCudaErrors( + cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice)); - // Start of Vote Any Test Kernel #1 - printf("[VOTE Kernel Test 1/3]\n"); - printf("\tRunning <> kernel1 ...\n"); - { - checkCudaErrors(cudaDeviceSynchronize()); - dim3 gridBlock(1, 1); - dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); - VoteAnyKernel1<<>>(d_input, d_result, - VOTE_DATA_GROUP * warp_size); - getLastCudaError("VoteAnyKernel() execution failed\n"); - checkCudaErrors(cudaDeviceSynchronize()); - } - checkCudaErrors(cudaMemcpy(h_result, d_result, - VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), - cudaMemcpyDeviceToHost)); - error_count[0] += checkResultsVoteAnyKernel1( - h_result, VOTE_DATA_GROUP * warp_size, warp_size); + // Start of Vote Any Test Kernel #1 + printf("[VOTE Kernel Test 1/3]\n"); + printf("\tRunning <> kernel1 ...\n"); + { + checkCudaErrors(cudaDeviceSynchronize()); + dim3 gridBlock(1, 1); + dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); + VoteAnyKernel1<<>>(d_input, d_result, VOTE_DATA_GROUP * warp_size); + getLastCudaError("VoteAnyKernel() execution failed\n"); + 
+    checkCudaErrors(
+        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
+    error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
-  // Start of Vote All Test Kernel #2
-  printf("\n[VOTE Kernel Test 2/3]\n");
-  printf("\tRunning <<Vote.All>> kernel2 ...\n");
-  {
-    checkCudaErrors(cudaDeviceSynchronize());
-    dim3 gridBlock(1, 1);
-    dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
-    VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result,
-                                               VOTE_DATA_GROUP * warp_size);
-    getLastCudaError("VoteAllKernel() execution failed\n");
-    checkCudaErrors(cudaDeviceSynchronize());
-  }
-  checkCudaErrors(cudaMemcpy(h_result, d_result,
-                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
-                             cudaMemcpyDeviceToHost));
-  error_count[1] += checkResultsVoteAllKernel2(
-      h_result, VOTE_DATA_GROUP * warp_size, warp_size);
+    // Start of Vote All Test Kernel #2
+    printf("\n[VOTE Kernel Test 2/3]\n");
+    printf("\tRunning <<Vote.All>> kernel2 ...\n");
+    {
+        checkCudaErrors(cudaDeviceSynchronize());
+        dim3 gridBlock(1, 1);
+        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
+        VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
+        getLastCudaError("VoteAllKernel() execution failed\n");
+        checkCudaErrors(cudaDeviceSynchronize());
+    }
+    checkCudaErrors(
+        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
+    error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
-  // Second Vote Kernel Test #3 (both Any/All)
-  hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
-  cudaMalloc(reinterpret_cast<void **>(&dinfo),
-             warp_size * 3 * 3 * sizeof(bool));
-  cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool),
-             cudaMemcpyHostToDevice);
+    // Second Vote Kernel Test #3 (both Any/All)
+    hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
+    cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
+    cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);
-  printf("\n[VOTE Kernel Test 3/3]\n");
-  printf("\tRunning <<Vote.Any>> kernel3 ...\n");
-  {
-    checkCudaErrors(cudaDeviceSynchronize());
-    VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
-    checkCudaErrors(cudaDeviceSynchronize());
-  }
+    printf("\n[VOTE Kernel Test 3/3]\n");
+    printf("\tRunning <<Vote.Any>> kernel3 ...\n");
+    {
+        checkCudaErrors(cudaDeviceSynchronize());
+        VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
+        checkCudaErrors(cudaDeviceSynchronize());
+    }
-  cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool),
-             cudaMemcpyDeviceToHost);
+    cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);
-  error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
+    error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
-  // Now free these resources for Test #1,2
-  checkCudaErrors(cudaFree(d_input));
-  checkCudaErrors(cudaFree(d_result));
-  free(h_input);
-  free(h_result);
+    // Now free these resources for Test #1,2
+    checkCudaErrors(cudaFree(d_input));
+    checkCudaErrors(cudaFree(d_result));
+    free(h_input);
+    free(h_result);
-  // Free resources from Test #3
-  free(hinfo);
-  cudaFree(dinfo);
+    // Free resources from Test #3
+    free(hinfo);
+    cudaFree(dinfo);
-  printf("\tShutting down...\n");
+    printf("\tShutting down...\n");
-  return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0)
-             ?
EXIT_SUCCESS - : EXIT_FAILURE; + return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/Samples/0_Introduction/simpleVoteIntrinsics/simpleVote_kernel.cuh b/Samples/0_Introduction/simpleVoteIntrinsics/simpleVote_kernel.cuh index 952188ad..efc88e05 100644 --- a/Samples/0_Introduction/simpleVoteIntrinsics/simpleVote_kernel.cuh +++ b/Samples/0_Introduction/simpleVoteIntrinsics/simpleVote_kernel.cuh @@ -38,43 +38,44 @@ // If ANY one of the threads (within the warp) of the predicated condition // returns a non-zero value, then all threads within this warp will return a // non-zero value -__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, - int size) { - int tx = threadIdx.x; +__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size) +{ + int tx = threadIdx.x; - int mask = 0xffffffff; - result[tx] = __any_sync(mask, input[tx]); + int mask = 0xffffffff; + result[tx] = __any_sync(mask, input[tx]); } // Kernel #2 tests the across-the-warp vote(all) intrinsic. // If ALL of the threads (within the warp) of the predicated condition returns // a non-zero value, then all threads within this warp will return a non-zero // value -__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, - int size) { - int tx = threadIdx.x; +__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size) +{ + int tx = threadIdx.x; - int mask = 0xffffffff; - result[tx] = __all_sync(mask, input[tx]); + int mask = 0xffffffff; + result[tx] = __all_sync(mask, input[tx]); } // Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic. // This kernel will test for conditions across warps, and within half warps -__global__ void VoteAnyKernel3(bool *info, int warp_size) { - int tx = threadIdx.x; - unsigned int mask = 0xffffffff; - bool *offs = info + (tx * 3); +__global__ void VoteAnyKernel3(bool *info, int warp_size) +{ + int tx = threadIdx.x; + unsigned int mask = 0xffffffff; + bool *offs = info + (tx * 3); - // The following should hold true for the second and third warp - *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2)); - // The following should hold true for the "upper half" of the second warp, - // and all of the third warp - *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false); + // The following should hold true for the second and third warp + *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2)); + // The following should hold true for the "upper half" of the second warp, + // and all of the third warp + *(offs + 1) = (tx >= (warp_size * 3) / 2 ? 
true : false); - // The following should hold true for the third warp only - if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) { - *(offs + 2) = true; - } + // The following should hold true for the third warp only + if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) { + *(offs + 2) = true; + } } #endif diff --git a/Samples/0_Introduction/simpleZeroCopy/simpleZeroCopy.cu b/Samples/0_Introduction/simpleZeroCopy/simpleZeroCopy.cu index bb23d0a4..1117e1e8 100644 --- a/Samples/0_Introduction/simpleZeroCopy/simpleZeroCopy.cu +++ b/Samples/0_Introduction/simpleZeroCopy/simpleZeroCopy.cu @@ -41,12 +41,13 @@ #endif /* Add two vectors on the GPU */ -__global__ void vectorAddGPU(float *a, float *b, float *c, int N) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void vectorAddGPU(float *a, float *b, float *c, int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < N) { - c[idx] = a[idx] + b[idx]; - } + if (idx < N) { + c[idx] = a[idx] + b[idx]; + } } // Allocate generic memory with malloc() and pin it later instead of using @@ -54,194 +55,196 @@ __global__ void vectorAddGPU(float *a, float *b, float *c, int N) { bool bPinGenericMemory = false; // Macro to align up to the memory size in question -#define MEMORY_ALIGNMENT 4096 +#define MEMORY_ALIGNMENT 4096 #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) -int main(int argc, char **argv) { - int n, nelem, deviceCount; - int idev = 0; // use default device 0 - char *device = NULL; - unsigned int flags; - size_t bytes; - float *a, *b, *c; // Pinned memory allocated on the CPU - float *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU - float *d_a, *d_b, *d_c; // Device pointers for mapped memory - float errorNorm, refNorm, ref, diff; - cudaDeviceProp deviceProp; +int main(int argc, char **argv) +{ + int n, nelem, deviceCount; + int idev = 0; // use default device 0 + char *device = NULL; + unsigned int flags; + size_t bytes; + float *a, *b, *c; // Pinned memory allocated on the CPU + float *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU + float *d_a, *d_b, *d_c; // Device pointers for mapped memory + float errorNorm, refNorm, ref, diff; + cudaDeviceProp deviceProp; - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Usage: simpleZeroCopy [OPTION]\n\n"); - printf("Options:\n"); - printf(" --device=[device #] Specify the device to be used\n"); - printf( - " --use_generic_memory (optional) use generic page-aligned for system " - "memory\n"); - return EXIT_SUCCESS; - } - - /* Get the device selected by the user or default to 0, and then set it. */ - if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) { - cudaGetDeviceCount(&deviceCount); - idev = atoi(device); - - if (idev >= deviceCount || idev < 0) { - fprintf(stderr, - "Device number %d is invalid, will use default CUDA device 0.\n", - idev); - idev = 0; + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Usage: simpleZeroCopy [OPTION]\n\n"); + printf("Options:\n"); + printf(" --device=[device #] Specify the device to be used\n"); + printf(" --use_generic_memory (optional) use generic page-aligned for system " + "memory\n"); + return EXIT_SUCCESS; } - } - // if GPU found supports SM 1.2, then continue, otherwise we exit - if (!checkCudaCapabilities(1, 2)) { - exit(EXIT_SUCCESS); - } + /* Get the device selected by the user or default to 0, and then set it.
*/ + if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) { + cudaGetDeviceCount(&deviceCount); + idev = atoi(device); - if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { + if (idev >= deviceCount || idev < 0) { + fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev); + idev = 0; + } + } + + // if the found GPU supports SM 1.2, then continue, otherwise we exit + if (!checkCudaCapabilities(1, 2)) { + exit(EXIT_SUCCESS); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { #if defined(__APPLE__) || defined(MACOSX) - bPinGenericMemory = false; // Generic Pinning of System Paged memory is not - // currently supported on Mac OSX + bPinGenericMemory = false; // Generic Pinning of System Paged memory is not + // currently supported on Mac OSX #else - bPinGenericMemory = true; + bPinGenericMemory = true; #endif - } + } - if (bPinGenericMemory) { - printf("> Using Generic System Paged Memory (malloc)\n"); - } else { - printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); - } + if (bPinGenericMemory) { + printf("> Using Generic System Paged Memory (malloc)\n"); + } + else { + printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); + } - checkCudaErrors(cudaSetDevice(idev)); + checkCudaErrors(cudaSetDevice(idev)); - /* Verify the selected device supports mapped memory and set the device - flags for mapping host memory. */ + /* Verify the selected device supports mapped memory and set the device + flags for mapping host memory. */ - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev)); #if CUDART_VERSION >= 2020 - if (!deviceProp.canMapHostMemory) { - fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", - idev); + if (!deviceProp.canMapHostMemory) { + fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev); + + exit(EXIT_SUCCESS); + } + + checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); +#else + fprintf(stderr, + "CUDART version %d.%d does not support " + "<cudaDeviceProp.canMapHostMemory> field\n", + CUDART_VERSION / 1000, + (CUDART_VERSION % 100) / 10); exit(EXIT_SUCCESS); - } - - checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); -#else - fprintf(stderr, - "CUDART version %d.%d does not support " - "<cudaDeviceProp.canMapHostMemory> field\n", - CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); - - exit(EXIT_SUCCESS); #endif #if CUDART_VERSION < 4000 - if (bPinGenericMemory) { - fprintf( - stderr, - "CUDART version %d.%d does not support <cudaHostRegister> function\n", - CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); + if (bPinGenericMemory) { + fprintf(stderr, + "CUDART version %d.%d does not support <cudaHostRegister> function\n", + CUDART_VERSION / 1000, + (CUDART_VERSION % 100) / 10); - exit(EXIT_SUCCESS); - } + exit(EXIT_SUCCESS); + } #endif - /* Allocate mapped CPU memory. */ + /* Allocate mapped CPU memory.
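+ Two paths follow: cudaHostAlloc(cudaHostAllocMapped) returns pinned, mapped
+ memory directly, while the generic path malloc()s an oversized buffer, rounds
+ the pointer up to the 4K MEMORY_ALIGNMENT boundary with ALIGN_UP, and then
+ pins the aligned region with cudaHostRegister(cudaHostRegisterMapped).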
*/ - nelem = 1048576; - bytes = nelem * sizeof(float); + nelem = 1048576; + bytes = nelem * sizeof(float); - if (bPinGenericMemory) { + if (bPinGenericMemory) { #if CUDART_VERSION >= 4000 - a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); - b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); - c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); + a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); + b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); + c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); - // We need to ensure memory is aligned to 4K (so we will need to padd memory - // accordingly) - a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); - b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT); - c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT); + // We need to ensure memory is aligned to 4K (so we will need to pad memory + // accordingly) + a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); + b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT); + c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT); - checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped)); - checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped)); - checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped)); + checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped)); + checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped)); + checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped)); #endif - } else { + } + else { #if CUDART_VERSION >= 2020 - flags = cudaHostAllocMapped; - checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags)); - checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags)); - checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags)); + flags = cudaHostAllocMapped; + checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags)); + checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags)); + checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags)); #endif - } + } - /* Initialize the vectors. */ + /* Initialize the vectors. */ - for (n = 0; n < nelem; n++) { - a[n] = rand() / (float)RAND_MAX; - b[n] = rand() / (float)RAND_MAX; - } + for (n = 0; n < nelem; n++) { + a[n] = rand() / (float)RAND_MAX; + b[n] = rand() / (float)RAND_MAX; + } /* Get the device pointers for the pinned CPU memory mapped into the GPU memory space. */ #if CUDART_VERSION >= 2020 - checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); - checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0)); - checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0)); + checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); + checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0)); + checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0)); #endif - /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory. - */ - printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n"); - dim3 block(256); - dim3 grid((unsigned int)ceil(nelem / (float)block.x)); - vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem); - checkCudaErrors(cudaDeviceSynchronize()); - getLastCudaError("vectorAddGPU() execution failed"); + /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
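+ Because d_a, d_b and d_c were obtained through cudaHostGetDevicePointer(),
+ the kernel reads and writes the pinned host allocations in place; no
+ explicit cudaMemcpy of the inputs or results is required.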
+ */ + printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n"); + dim3 block(256); + dim3 grid((unsigned int)ceil(nelem / (float)block.x)); + vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("vectorAddGPU() execution failed"); - /* Compare the results */ + /* Compare the results */ - printf("> Checking the results from vectorAddGPU() ...\n"); - errorNorm = 0.f; - refNorm = 0.f; + printf("> Checking the results from vectorAddGPU() ...\n"); + errorNorm = 0.f; + refNorm = 0.f; - for (n = 0; n < nelem; n++) { - ref = a[n] + b[n]; - diff = c[n] - ref; - errorNorm += diff * diff; - refNorm += ref * ref; - } + for (n = 0; n < nelem; n++) { + ref = a[n] + b[n]; + diff = c[n] - ref; + errorNorm += diff * diff; + refNorm += ref * ref; + } - errorNorm = (float)sqrt((double)errorNorm); - refNorm = (float)sqrt((double)refNorm); + errorNorm = (float)sqrt((double)errorNorm); + refNorm = (float)sqrt((double)refNorm); - /* Memory clean up */ + /* Memory clean up */ - printf("> Releasing CPU memory...\n"); + printf("> Releasing CPU memory...\n"); - if (bPinGenericMemory) { + if (bPinGenericMemory) { #if CUDART_VERSION >= 4000 - checkCudaErrors(cudaHostUnregister(a)); - checkCudaErrors(cudaHostUnregister(b)); - checkCudaErrors(cudaHostUnregister(c)); - free(a_UA); - free(b_UA); - free(c_UA); + checkCudaErrors(cudaHostUnregister(a)); + checkCudaErrors(cudaHostUnregister(b)); + checkCudaErrors(cudaHostUnregister(c)); + free(a_UA); + free(b_UA); + free(c_UA); #endif - } else { + } + else { #if CUDART_VERSION >= 2020 - checkCudaErrors(cudaFreeHost(a)); - checkCudaErrors(cudaFreeHost(b)); - checkCudaErrors(cudaFreeHost(c)); + checkCudaErrors(cudaFreeHost(a)); + checkCudaErrors(cudaFreeHost(b)); + checkCudaErrors(cudaFreeHost(c)); #endif - } + } - exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE); + exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/systemWideAtomics/README.md b/Samples/0_Introduction/systemWideAtomics/README.md index 4a539cd6..b6d52932 100644 --- a/Samples/0_Introduction/systemWideAtomics/README.md +++ b/Samples/0_Introduction/systemWideAtomics/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu b/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu index f5d89986..28e40ebc 100644 --- a/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu +++ b/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu @@ -29,113 +29,111 @@ * memory. */ +#include +#include #include #include #include #include -#include -#include #define min(a, b) (a) < (b) ? (a) : (b) #define max(a, b) (a) > (b) ?
(a) : (b) #define LOOP_NUM 50 -__global__ void atomicKernel(int *atom_arr) { - unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void atomicKernel(int *atom_arr) +{ + unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = 0; i < LOOP_NUM; i++) { - // Atomic addition - atomicAdd_system(&atom_arr[0], 10); + for (int i = 0; i < LOOP_NUM; i++) { + // Atomic addition + atomicAdd_system(&atom_arr[0], 10); - // Atomic exchange - atomicExch_system(&atom_arr[1], tid); + // Atomic exchange + atomicExch_system(&atom_arr[1], tid); - // Atomic maximum - atomicMax_system(&atom_arr[2], tid); + // Atomic maximum + atomicMax_system(&atom_arr[2], tid); - // Atomic minimum - atomicMin_system(&atom_arr[3], tid); + // Atomic minimum + atomicMin_system(&atom_arr[3], tid); - // Atomic increment (modulo 17+1) - atomicInc_system((unsigned int *)&atom_arr[4], 17); + // Atomic increment (modulo 17+1) + atomicInc_system((unsigned int *)&atom_arr[4], 17); - // Atomic decrement - atomicDec_system((unsigned int *)&atom_arr[5], 137); + // Atomic decrement + atomicDec_system((unsigned int *)&atom_arr[5], 137); - // Atomic compare-and-swap - atomicCAS_system(&atom_arr[6], tid - 1, tid); + // Atomic compare-and-swap + atomicCAS_system(&atom_arr[6], tid - 1, tid); - // Bitwise atomic instructions + // Bitwise atomic instructions - // Atomic AND - atomicAnd_system(&atom_arr[7], 2 * tid + 7); + // Atomic AND + atomicAnd_system(&atom_arr[7], 2 * tid + 7); - // Atomic OR - atomicOr_system(&atom_arr[8], 1 << tid); + // Atomic OR + atomicOr_system(&atom_arr[8], 1 << tid); - // Atomic XOR - atomicXor_system(&atom_arr[9], tid); - } + // Atomic XOR + atomicXor_system(&atom_arr[9], tid); + } } -void atomicKernel_CPU(int *atom_arr, int no_of_threads) { - for (int i = no_of_threads; i < 2 * no_of_threads; i++) { - for (int j = 0; j < LOOP_NUM; j++) { - // Atomic addition - __sync_fetch_and_add(&atom_arr[0], 10); +void atomicKernel_CPU(int *atom_arr, int no_of_threads) +{ + for (int i = no_of_threads; i < 2 * no_of_threads; i++) { + for (int j = 0; j < LOOP_NUM; j++) { + // Atomic addition + __sync_fetch_and_add(&atom_arr[0], 10); - // Atomic exchange - __sync_lock_test_and_set(&atom_arr[1], i); + // Atomic exchange + __sync_lock_test_and_set(&atom_arr[1], i); - // Atomic maximum - int old, expected; - do { - expected = atom_arr[2]; - old = __sync_val_compare_and_swap(&atom_arr[2], expected, - max(expected, i)); - } while (old != expected); + // Atomic maximum + int old, expected; + do { + expected = atom_arr[2]; + old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i)); + } while (old != expected); - // Atomic minimum - do { - expected = atom_arr[3]; - old = __sync_val_compare_and_swap(&atom_arr[3], expected, - min(expected, i)); - } while (old != expected); + // Atomic minimum + do { + expected = atom_arr[3]; + old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i)); + } while (old != expected); - // Atomic increment (modulo 17+1) - int limit = 17; - do { - expected = atom_arr[4]; - old = __sync_val_compare_and_swap( - &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1); - } while (old != expected); + // Atomic increment (modulo 17+1) + int limit = 17; + do { + expected = atom_arr[4]; + old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 
0 : expected + 1); + } while (old != expected); - // Atomic decrement - limit = 137; - do { - expected = atom_arr[5]; - old = __sync_val_compare_and_swap( - &atom_arr[5], expected, - ((expected == 0) || (expected > limit)) ? limit : expected - 1); - } while (old != expected); + // Atomic decrement + limit = 137; + do { + expected = atom_arr[5]; + old = __sync_val_compare_and_swap( + &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1); + } while (old != expected); - // Atomic compare-and-swap - __sync_val_compare_and_swap(&atom_arr[6], i - 1, i); + // Atomic compare-and-swap + __sync_val_compare_and_swap(&atom_arr[6], i - 1, i); - // Bitwise atomic instructions + // Bitwise atomic instructions - // Atomic AND - __sync_fetch_and_and(&atom_arr[7], 2 * i + 7); + // Atomic AND + __sync_fetch_and_and(&atom_arr[7], 2 * i + 7); - // Atomic OR - __sync_fetch_and_or(&atom_arr[8], 1 << i); + // Atomic OR + __sync_fetch_and_or(&atom_arr[8], 1 << i); - // Atomic XOR - // 11th element should be 0xff - __sync_fetch_and_xor(&atom_arr[9], i); + // Atomic XOR + // 11th element should be 0xff + __sync_fetch_and_xor(&atom_arr[9], i); + } } - } } //////////////////////////////////////////////////////////////////////////////// @@ -145,198 +143,201 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) { //! @param idata input data as provided to device //! @param len number of elements in reference / idata //////////////////////////////////////////////////////////////////////////////// -int verify(int *testData, const int len) { - int val = 0; +int verify(int *testData, const int len) +{ + int val = 0; - for (int i = 0; i < len * LOOP_NUM; ++i) { - val += 10; - } - - if (val != testData[0]) { - printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]); - return false; - } - - val = 0; - - bool found = false; - - for (int i = 0; i < len; ++i) { - // second element should be a member of [0, len) - if (i == testData[1]) { - found = true; - break; + for (int i = 0; i < len * LOOP_NUM; ++i) { + val += 10; } - } - if (!found) { - printf("atomicExch failed\n"); - return false; - } - - val = -(1 << 8); - - for (int i = 0; i < len; ++i) { - // third element should be len-1 - val = max(val, i); - } - - if (val != testData[2]) { - printf("atomicMax failed\n"); - return false; - } - - val = 1 << 8; - - for (int i = 0; i < len; ++i) { - val = min(val, i); - } - - if (val != testData[3]) { - printf("atomicMin failed\n"); - return false; - } - - int limit = 17; - val = 0; - - for (int i = 0; i < len * LOOP_NUM; ++i) { - val = (val >= limit) ? 0 : val + 1; - } - - if (val != testData[4]) { - printf("atomicInc failed\n"); - return false; - } - - limit = 137; - val = 0; - - for (int i = 0; i < len * LOOP_NUM; ++i) { - val = ((val == 0) || (val > limit)) ? 
limit : val - 1; - } - - if (val != testData[5]) { - printf("atomicDec failed\n"); - return false; - } - - found = false; - - for (int i = 0; i < len; ++i) { - // seventh element should be a member of [0, len) - if (i == testData[6]) { - found = true; - break; + if (val != testData[0]) { + printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]); + return false; } - } - if (!found) { - printf("atomicCAS failed\n"); - return false; - } + val = 0; - val = 0xff; + bool found = false; - for (int i = 0; i < len; ++i) { - // 8th element should be 1 - val &= (2 * i + 7); - } + for (int i = 0; i < len; ++i) { + // second element should be a member of [0, len) + if (i == testData[1]) { + found = true; + break; + } + } - if (val != testData[7]) { - printf("atomicAnd failed\n"); - return false; - } + if (!found) { + printf("atomicExch failed\n"); + return false; + } - val = 0; + val = -(1 << 8); - for (int i = 0; i < len; ++i) { - // 9th element should be 0xff - val |= (1 << i); - } + for (int i = 0; i < len; ++i) { + // third element should be len-1 + val = max(val, i); + } - if (val != testData[8]) { - printf("atomicOr failed\n"); - return false; - } + if (val != testData[2]) { + printf("atomicMax failed\n"); + return false; + } - val = 0xff; + val = 1 << 8; - for (int i = 0; i < len; ++i) { - // 11th element should be 0xff - val ^= i; - } + for (int i = 0; i < len; ++i) { + val = min(val, i); + } - if (val != testData[9]) { - printf("atomicXor failed\n"); - return false; - } + if (val != testData[3]) { + printf("atomicMin failed\n"); + return false; + } - return true; + int limit = 17; + val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val = (val >= limit) ? 0 : val + 1; + } + + if (val != testData[4]) { + printf("atomicInc failed\n"); + return false; + } + + limit = 137; + val = 0; + + for (int i = 0; i < len * LOOP_NUM; ++i) { + val = ((val == 0) || (val > limit)) ? 
limit : val - 1; + } + + if (val != testData[5]) { + printf("atomicDec failed\n"); + return false; + } + + found = false; + + for (int i = 0; i < len; ++i) { + // seventh element should be a member of [0, len) + if (i == testData[6]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicCAS failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 8th element should be 1 + val &= (2 * i + 7); + } + + if (val != testData[7]) { + printf("atomicAnd failed\n"); + return false; + } + + val = 0; + + for (int i = 0; i < len; ++i) { + // 9th element should be 0xff + val |= (1 << i); + } + + if (val != testData[8]) { + printf("atomicOr failed\n"); + return false; + } + + val = 0xff; + + for (int i = 0; i < len; ++i) { + // 10th element should be 0xff + val ^= i; + } + + if (val != testData[9]) { + printf("atomicXor failed\n"); + return false; + } + + return true; } -int main(int argc, char **argv) { - // set device - cudaDeviceProp device_prop; - int dev_id = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); +int main(int argc, char **argv) +{ + // set device + cudaDeviceProp device_prop; + int dev_id = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); - if (!device_prop.managedMemory) { - // This samples requires being run on a device that supports Unified Memory - fprintf(stderr, "Unified Memory not supported on this device\n"); - exit(EXIT_WAIVED); - } + if (!device_prop.managedMemory) { + // This sample requires being run on a device that supports Unified Memory + fprintf(stderr, "Unified Memory not supported on this device\n"); + exit(EXIT_WAIVED); + } - if (device_prop.computeMode == cudaComputeModeProhibited) { - // This sample requires being run with a default or process exclusive mode - fprintf(stderr, - "This sample requires a device in either default or process " - "exclusive mode\n"); - exit(EXIT_WAIVED); - } + if (device_prop.computeMode == cudaComputeModeProhibited) { + // This sample requires being run with a default or process exclusive mode + fprintf(stderr, + "This sample requires a device in either default or process " + "exclusive mode\n"); + exit(EXIT_WAIVED); + } - if (device_prop.major < 6) { - printf( - "%s: requires a minimum CUDA compute 6.0 capability, waiving " - "testing.\n", - argv[0]); - exit(EXIT_WAIVED); - } + if (device_prop.major < 6) { + printf("%s: requires a minimum CUDA compute 6.0 capability, waiving " + "testing.\n", + argv[0]); + exit(EXIT_WAIVED); + } - unsigned int numThreads = 256; - unsigned int numBlocks = 64; - unsigned int numData = 10; + unsigned int numThreads = 256; + unsigned int numBlocks = 64; + unsigned int numData = 10; - int *atom_arr; + int *atom_arr; - if (device_prop.pageableMemoryAccess) { - printf("CAN access pageable memory\n"); - atom_arr = (int *)malloc(sizeof(int) * numData); - } else { - printf("CANNOT access pageable memory\n"); - checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); - } + if (device_prop.pageableMemoryAccess) { + printf("CAN access pageable memory\n"); + atom_arr = (int *)malloc(sizeof(int) * numData); + } + else { + printf("CANNOT access pageable memory\n"); + checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); + } - for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0; + for (unsigned int i = 0; i < numData; i++) + atom_arr[i] = 0; - // To make the AND and XOR tests generate something other than 0...
- atom_arr[7] = atom_arr[9] = 0xff; + // To make the AND and XOR tests generate something other than 0... + atom_arr[7] = atom_arr[9] = 0xff; - atomicKernel<<<numBlocks, numThreads>>>(atom_arr); - atomicKernel_CPU(atom_arr, numBlocks * numThreads); + atomicKernel<<<numBlocks, numThreads>>>(atom_arr); + atomicKernel_CPU(atom_arr, numBlocks * numThreads); - checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaDeviceSynchronize()); - // Compute & verify reference solution - int testResult = verify(atom_arr, 2 * numThreads * numBlocks); + // Compute & verify reference solution + int testResult = verify(atom_arr, 2 * numThreads * numBlocks); - if (device_prop.pageableMemoryAccess) { - free(atom_arr); - } else { - cudaFree(atom_arr); - } + if (device_prop.pageableMemoryAccess) { + free(atom_arr); + } + else { + cudaFree(atom_arr); + } - printf("systemWideAtomics completed, returned %s \n", - testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/template/template.cu b/Samples/0_Introduction/template/template.cu index 104df9fa..ba0164f5 100644 --- a/Samples/0_Introduction/template/template.cu +++ b/Samples/0_Introduction/template/template.cu @@ -31,10 +31,10 @@ */ // includes, system -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // includes CUDA #include <cuda_runtime.h> @@ -47,34 +47,34 @@ // declaration, forward void runTest(int argc, char **argv); -extern "C" void computeGold(float *reference, float *idata, - const unsigned int len); +extern "C" void computeGold(float *reference, float *idata, const unsigned int len); //////////////////////////////////////////////////////////////////////////////// //! Simple test kernel for device functionality //! @param g_idata input data in global memory //! @param g_odata output data in global memory //////////////////////////////////////////////////////////////////////////////// -__global__ void testKernel(float *g_idata, float *g_odata) { - // shared memory - // the size is determined by the host application - extern __shared__ float sdata[]; +__global__ void testKernel(float *g_idata, float *g_odata) +{ + // shared memory + // the size is determined by the host application + extern __shared__ float sdata[]; - // access thread id - const unsigned int tid = threadIdx.x; - // access number of threads in this block - const unsigned int num_threads = blockDim.x; + // access thread id + const unsigned int tid = threadIdx.x; + // access number of threads in this block + const unsigned int num_threads = blockDim.x; - // read in input data from global memory - sdata[tid] = g_idata[tid]; - __syncthreads(); + // read in input data from global memory + sdata[tid] = g_idata[tid]; + __syncthreads(); - // perform some computations - sdata[tid] = (float)num_threads * sdata[tid]; - __syncthreads(); + // perform some computations + sdata[tid] = (float)num_threads * sdata[tid]; + __syncthreads(); - // write data to global memory - g_odata[tid] = sdata[tid]; + // write data to global memory + g_odata[tid] = sdata[tid]; } //////////////////////////////////////////////////////////////////////////////// @@ -85,81 +85,81 @@ int main(int argc, char **argv) { runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //!
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - bool bTestResult = true; +void runTest(int argc, char **argv) +{ + bool bTestResult = true; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); - StopWatchInterface *timer = 0; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - unsigned int num_threads = 32; - unsigned int mem_size = sizeof(float) * num_threads; + unsigned int num_threads = 32; + unsigned int mem_size = sizeof(float) * num_threads; - // allocate host memory - float *h_idata = (float *)malloc(mem_size); + // allocate host memory + float *h_idata = (float *)malloc(mem_size); - // initalize the memory - for (unsigned int i = 0; i < num_threads; ++i) { - h_idata[i] = (float)i; - } + // initialize the memory + for (unsigned int i = 0; i < num_threads; ++i) { + h_idata[i] = (float)i; + } - // allocate device memory - float *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); - // copy host memory to device - checkCudaErrors( - cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); + // allocate device memory + float *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); + // copy host memory to device + checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); - // allocate device memory for result - float *d_odata; - checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); + // allocate device memory for result + float *d_odata; + checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); - // setup execution parameters - dim3 grid(1, 1, 1); - dim3 threads(num_threads, 1, 1); + // setup execution parameters + dim3 grid(1, 1, 1); + dim3 threads(num_threads, 1, 1); - // execute the kernel - testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata); + // execute the kernel + testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata); - // check if kernel execution generated and error - getLastCudaError("Kernel execution failed"); + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); - // allocate mem for the result on host side - float *h_odata = (float *)malloc(mem_size); - // copy result from device to host - checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, - cudaMemcpyDeviceToHost)); + // allocate mem for the result on host side + float *h_odata = (float *)malloc(mem_size); + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost)); - sdkStopTimer(&timer); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - sdkDeleteTimer(&timer); + sdkStopTimer(&timer); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + sdkDeleteTimer(&timer); - // compute reference solution - float *reference = (float *)malloc(mem_size); - computeGold(reference, h_idata, num_threads); + // compute reference solution + float *reference = (float *)malloc(mem_size); + computeGold(reference, h_idata, num_threads); - // check result - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // write file for
regression test - sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false); - } else { - // custom output handling when no regression test running - // in this case check if the result is equivalent to the expected solution - bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f); - } + // check result + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // write file for regression test + sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false); + } + else { + // custom output handling when no regression test running + // in this case check if the result is equivalent to the expected solution + bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f); + } - // cleanup memory - free(h_idata); - free(h_odata); - free(reference); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); + // cleanup memory + free(h_idata); + free(h_odata); + free(reference); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/0_Introduction/template/template_cpu.cpp b/Samples/0_Introduction/template/template_cpu.cpp index 1e7c9f18..2546d02d 100644 --- a/Samples/0_Introduction/template/template_cpu.cpp +++ b/Samples/0_Introduction/template/template_cpu.cpp @@ -26,8 +26,7 @@ */ // export C interface -extern "C" void computeGold(float *reference, float *idata, - const unsigned int len); +extern "C" void computeGold(float *reference, float *idata, const unsigned int len); //////////////////////////////////////////////////////////////////////////////// //! Compute reference data set @@ -36,10 +35,11 @@ extern "C" void computeGold(float *reference, float *idata, //! @param idata input data as provided to device //! @param len number of elements in reference / idata //////////////////////////////////////////////////////////////////////////////// -void computeGold(float *reference, float *idata, const unsigned int len) { - const float f_len = static_cast<float>(len); +void computeGold(float *reference, float *idata, const unsigned int len) +{ + const float f_len = static_cast<float>(len); - for (unsigned int i = 0; i < len; ++i) { - reference[i] = idata[i] * f_len; - } + for (unsigned int i = 0; i < len; ++i) { + reference[i] = idata[i] * f_len; + } } diff --git a/Samples/0_Introduction/vectorAdd/vectorAdd.cu b/Samples/0_Introduction/vectorAdd/vectorAdd.cu index 284b0f0e..38b043ef 100644 --- a/Samples/0_Introduction/vectorAdd/vectorAdd.cu +++ b/Samples/0_Introduction/vectorAdd/vectorAdd.cu @@ -37,7 +37,6 @@ // For the CUDA runtime routines (prefixed with "cuda_") #include <cuda_runtime.h> - #include <helper_cuda.h> /** * CUDA Kernel Device code * * Computes the vector addition of A and B into C. The 3 vectors have the same * number of elements numElements.
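 * A thread computes its global index as blockDim.x * blockIdx.x + threadIdx.x
 * and adds a single element pair; the i < numElements guard masks off the
 * surplus threads of the final, partially filled block.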
*/ -__global__ void vectorAdd(const float *A, const float *B, float *C, - int numElements) { - int i = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < numElements) { - C[i] = A[i] + B[i] + 0.0f; - } + if (i < numElements) { + C[i] = A[i] + B[i] + 0.0f; + } } /** * Host main routine */ -int main(void) { - // Error code to check return values for CUDA calls - cudaError_t err = cudaSuccess; +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; - // Print the vector length to be used, and compute its size - int numElements = 50000; - size_t size = numElements * sizeof(float); - printf("[Vector addition of %d elements]\n", numElements); + // Print the vector length to be used, and compute its size + int numElements = 50000; + size_t size = numElements * sizeof(float); + printf("[Vector addition of %d elements]\n", numElements); - // Allocate the host input vector A - float *h_A = (float *)malloc(size); + // Allocate the host input vector A + float *h_A = (float *)malloc(size); - // Allocate the host input vector B - float *h_B = (float *)malloc(size); + // Allocate the host input vector B + float *h_B = (float *)malloc(size); - // Allocate the host output vector C - float *h_C = (float *)malloc(size); + // Allocate the host output vector C + float *h_C = (float *)malloc(size); - // Verify that allocations succeeded - if (h_A == NULL || h_B == NULL || h_C == NULL) { - fprintf(stderr, "Failed to allocate host vectors!\n"); - exit(EXIT_FAILURE); - } - - // Initialize the host input vectors - for (int i = 0; i < numElements; ++i) { - h_A[i] = rand() / (float)RAND_MAX; - h_B[i] = rand() / (float)RAND_MAX; - } - - // Allocate the device input vector A - float *d_A = NULL; - err = cudaMalloc((void **)&d_A, size); - - if (err != cudaSuccess) { - fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // Allocate the device input vector B - float *d_B = NULL; - err = cudaMalloc((void **)&d_B, size); - - if (err != cudaSuccess) { - fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // Allocate the device output vector C - float *d_C = NULL; - err = cudaMalloc((void **)&d_C, size); - - if (err != cudaSuccess) { - fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // Copy the host input vectors A and B in host memory to the device input - // vectors in - // device memory - printf("Copy input data from the host memory to the CUDA device\n"); - err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); - - if (err != cudaSuccess) { - fprintf(stderr, - "Failed to copy vector A from host to device (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); - - if (err != cudaSuccess) { - fprintf(stderr, - "Failed to copy vector B from host to device (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 256; - int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; - printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, - threadsPerBlock); - vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements); - err =
cudaGetLastError(); - - if (err != cudaSuccess) { - fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // Copy the device result vector in device memory to the host result vector - // in host memory. - printf("Copy output data from the CUDA device to the host memory\n"); - err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); - - if (err != cudaSuccess) { - fprintf(stderr, - "Failed to copy vector C from device to host (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } - - // Verify that the result vector is correct - for (int i = 0; i < numElements; ++i) { - if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { - fprintf(stderr, "Result verification failed at element %d!\n", i); - exit(EXIT_FAILURE); + // Verify that allocations succeeded + if (h_A == NULL || h_B == NULL || h_C == NULL) { + fprintf(stderr, "Failed to allocate host vectors!\n"); + exit(EXIT_FAILURE); } - } - printf("Test PASSED\n"); + // Initialize the host input vectors + for (int i = 0; i < numElements; ++i) { + h_A[i] = rand() / (float)RAND_MAX; + h_B[i] = rand() / (float)RAND_MAX; + } - // Free device global memory - err = cudaFree(d_A); + // Allocate the device input vector A + float *d_A = NULL; + err = cudaMalloc((void **)&d_A, size); - if (err != cudaSuccess) { - fprintf(stderr, "Failed to free device vector A (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } + if (err != cudaSuccess) { + fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } - err = cudaFree(d_B); + // Allocate the device input vector B + float *d_B = NULL; + err = cudaMalloc((void **)&d_B, size); - if (err != cudaSuccess) { - fprintf(stderr, "Failed to free device vector B (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } + if (err != cudaSuccess) { + fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } - err = cudaFree(d_C); + // Allocate the device output vector C + float *d_C = NULL; + err = cudaMalloc((void **)&d_C, size); - if (err != cudaSuccess) { - fprintf(stderr, "Failed to free device vector C (error code %s)!\n", - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } + if (err != cudaSuccess) { + fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } - // Free host memory - free(h_A); - free(h_B); - free(h_C); + // Copy the host input vectors A and B in host memory to the device input + // vectors in + // device memory + printf("Copy input data from the host memory to the CUDA device\n"); + err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); - printf("Done\n"); - return 0; + if (err != cudaSuccess) { + fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); + + if (err != cudaSuccess) { + fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); + vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements); + err = cudaGetLastError(); + + if (err
!= cudaSuccess) { + fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // Copy the device result vector in device memory to the host result vector + // in host memory. + printf("Copy output data from the CUDA device to the host memory\n"); + err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); + + if (err != cudaSuccess) { + fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) { + if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + + printf("Test PASSED\n"); + + // Free device global memory + err = cudaFree(d_A); + + if (err != cudaSuccess) { + fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + err = cudaFree(d_B); + + if (err != cudaSuccess) { + fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + err = cudaFree(d_C); + + if (err != cudaSuccess) { + fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // Free host memory + free(h_A); + free(h_B); + free(h_C); + + printf("Done\n"); + return 0; } diff --git a/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp b/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp index c025f5cd..16f6cbfd 100644 --- a/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp +++ b/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp @@ -34,11 +34,11 @@ */ // Includes -#include -#include -#include #include #include +#include +#include +#include // includes, project #include @@ -50,19 +50,19 @@ using namespace std; // Variables -CUdevice cuDevice; -CUcontext cuContext; -CUmodule cuModule; -CUfunction vecAdd_kernel; -float *h_A; -float *h_B; -float *h_C; +CUdevice cuDevice; +CUcontext cuContext; +CUmodule cuModule; +CUfunction vecAdd_kernel; +float *h_A; +float *h_B; +float *h_C; CUdeviceptr d_A; CUdeviceptr d_B; CUdeviceptr d_C; // Functions -int CleanupNoFailure(); +int CleanupNoFailure(); void RandomInit(float *, int); bool findModulePath(const char *, string &, char **, string &); @@ -72,150 +72,152 @@ bool findModulePath(const char *, string &, char **, string &); #endif // Host code -int main(int argc, char **argv) { - printf("Vector Addition (Driver API)\n"); - int N = 50000, devID = 0; - size_t size = N * sizeof(float); +int main(int argc, char **argv) +{ + printf("Vector Addition (Driver API)\n"); + int N = 50000, devID = 0; + size_t size = N * sizeof(float); - // Initialize - checkCudaErrors(cuInit(0)); + // Initialize + checkCudaErrors(cuInit(0)); - cuDevice = findCudaDeviceDRV(argc, (const char **)argv); - // Create context - checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + cuDevice = findCudaDeviceDRV(argc, (const char **)argv); + // Create context + checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); - // first search for the module path before we load the results - string module_path; + // first search for the module path before we load the results + string module_path; - std::ostringstream fatbin; + std::ostringstream fatbin; - if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { - exit(EXIT_FAILURE); - } else { - printf("> initCUDA loading module: <%s>\n", 
module_path.c_str()); - } + if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { + exit(EXIT_FAILURE); + } + else { + printf("> initCUDA loading module: <%s>\n", module_path.c_str()); + } - if (!fatbin.str().size()) { - printf("fatbin file empty. exiting..\n"); - exit(EXIT_FAILURE); - } + if (!fatbin.str().size()) { + printf("fatbin file empty. exiting..\n"); + exit(EXIT_FAILURE); + } - // Create module from binary file (FATBIN) - checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); + // Create module from binary file (FATBIN) + checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); - // Get function handle from module - checkCudaErrors( - cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); + // Get function handle from module + checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel")); - // Allocate input vectors h_A and h_B in host memory - h_A = (float *)malloc(size); - h_B = (float *)malloc(size); - h_C = (float *)malloc(size); + // Allocate input vectors h_A and h_B in host memory + h_A = (float *)malloc(size); + h_B = (float *)malloc(size); + h_C = (float *)malloc(size); - // Initialize input vectors - RandomInit(h_A, N); - RandomInit(h_B, N); + // Initialize input vectors + RandomInit(h_A, N); + RandomInit(h_B, N); - // Allocate vectors in device memory - checkCudaErrors(cuMemAlloc(&d_A, size)); + // Allocate vectors in device memory + checkCudaErrors(cuMemAlloc(&d_A, size)); - checkCudaErrors(cuMemAlloc(&d_B, size)); + checkCudaErrors(cuMemAlloc(&d_B, size)); - checkCudaErrors(cuMemAlloc(&d_C, size)); + checkCudaErrors(cuMemAlloc(&d_C, size)); - // Copy vectors from host memory to device memory - checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); + // Copy vectors from host memory to device memory + checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); - checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); + checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); - if (1) { - // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel - // Launch (simpler method) + if (1) { + // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel + // Launch (simpler method) - // Grid/Block configuration - int threadsPerBlock = 256; - int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + // Grid/Block configuration + int threadsPerBlock = 256; + int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; - void *args[] = {&d_A, &d_B, &d_C, &N}; + void *args[] = {&d_A, &d_B, &d_C, &N}; - // Launch the CUDA kernel - checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, 0, NULL, args, NULL)); - } else { - // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel - // Launch (advanced method) - int offset = 0; - void *argBuffer[16]; - *((CUdeviceptr *)&argBuffer[offset]) = d_A; - offset += sizeof(d_A); - *((CUdeviceptr *)&argBuffer[offset]) = d_B; - offset += sizeof(d_B); - *((CUdeviceptr *)&argBuffer[offset]) = d_C; - offset += sizeof(d_C); - *((int *)&argBuffer[offset]) = N; - offset += sizeof(N); + // Launch the CUDA kernel + checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL)); + } + else { + // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel + // Launch (advanced method) + int offset = 0; + void *argBuffer[16]; + *((CUdeviceptr *)&argBuffer[offset]) = d_A; + offset += sizeof(d_A); + *((CUdeviceptr *)&argBuffer[offset]) = d_B; + offset += sizeof(d_B); + *((CUdeviceptr 
*)&argBuffer[offset]) = d_C; + offset += sizeof(d_C); + *((int *)&argBuffer[offset]) = N; + offset += sizeof(N); - // Grid/Block configuration - int threadsPerBlock = 256; - int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + // Grid/Block configuration + int threadsPerBlock = 256; + int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; - // Launch the CUDA kernel - checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, 0, NULL, NULL, - argBuffer)); - } + // Launch the CUDA kernel + checkCudaErrors( + cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer)); + } #ifdef _DEBUG - checkCudaErrors(cuCtxSynchronize()); + checkCudaErrors(cuCtxSynchronize()); #endif - // Copy result from device memory to host memory - // h_C contains the result in host memory - checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); + // Copy result from device memory to host memory + // h_C contains the result in host memory + checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); - // Verify result - int i; + // Verify result + int i; - for (i = 0; i < N; ++i) { - float sum = h_A[i] + h_B[i]; + for (i = 0; i < N; ++i) { + float sum = h_A[i] + h_B[i]; - if (fabs(h_C[i] - sum) > 1e-7f) { - break; + if (fabs(h_C[i] - sum) > 1e-7f) { + break; + } } - } - CleanupNoFailure(); - printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); + CleanupNoFailure(); + printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); - exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); + exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); } -int CleanupNoFailure() { - // Free device memory - checkCudaErrors(cuMemFree(d_A)); - checkCudaErrors(cuMemFree(d_B)); - checkCudaErrors(cuMemFree(d_C)); +int CleanupNoFailure() +{ + // Free device memory + checkCudaErrors(cuMemFree(d_A)); + checkCudaErrors(cuMemFree(d_B)); + checkCudaErrors(cuMemFree(d_C)); - // Free host memory - if (h_A) { - free(h_A); - } + // Free host memory + if (h_A) { + free(h_A); + } - if (h_B) { - free(h_B); - } + if (h_B) { + free(h_B); + } - if (h_C) { - free(h_C); - } + if (h_C) { + free(h_C); + } - checkCudaErrors(cuCtxDestroy(cuContext)); + checkCudaErrors(cuCtxDestroy(cuContext)); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } // Allocates an array with random float entries. 
-void RandomInit(float *data, int n) { - for (int i = 0; i < n; ++i) { - data[i] = rand() / (float)RAND_MAX; - } +void RandomInit(float *data, int n) +{ + for (int i = 0; i < n; ++i) { + data[i] = rand() / (float)RAND_MAX; + } } diff --git a/Samples/0_Introduction/vectorAddDrv/vectorAdd_kernel.cu b/Samples/0_Introduction/vectorAddDrv/vectorAdd_kernel.cu index 74cd1876..c963ed40 100644 --- a/Samples/0_Introduction/vectorAddDrv/vectorAdd_kernel.cu +++ b/Samples/0_Introduction/vectorAddDrv/vectorAdd_kernel.cu @@ -33,9 +33,10 @@ */ // Device code -extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, - float *C, int N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; +extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) C[i] = A[i] + B[i]; + if (i < N) + C[i] = A[i] + B[i]; } diff --git a/Samples/0_Introduction/vectorAddMMAP/README.md b/Samples/0_Introduction/vectorAddMMAP/README.md index 512ae5f2..d194a9ac 100644 --- a/Samples/0_Introduction/vectorAddMMAP/README.md +++ b/Samples/0_Introduction/vectorAddMMAP/README.md @@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## References (for more details) - diff --git a/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.cpp b/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.cpp index 5abd596b..7d5358d1 100644 --- a/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.cpp +++ b/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.cpp @@ -29,172 +29,172 @@ static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; } -CUresult simpleMallocMultiDeviceMmap( - CUdeviceptr *dptr, size_t *allocationSize, size_t size, - const std::vector<CUdevice> &residentDevices, - const std::vector<CUdevice> &mappingDevices, size_t align) { - CUresult status = CUDA_SUCCESS; - size_t min_granularity = 0; - size_t stripeSize; +CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr, + size_t *allocationSize, + size_t size, + const std::vector<CUdevice> &residentDevices, + const std::vector<CUdevice> &mappingDevices, + size_t align) +{ + CUresult status = CUDA_SUCCESS; + size_t min_granularity = 0; + size_t stripeSize; - // Setup the properties common for all the chunks - // The allocations will be device pinned memory. - // This property structure describes the physical location where the memory - // will be allocated via cuMemCreate allong with additional properties In this - // case, the allocation will be pinnded device memory local to a given device. - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + // Setup the properties common for all the chunks + // The allocations will be device pinned memory. + // This property structure describes the physical location where the memory + // will be allocated via cuMemCreate along with additional properties. In this + // case, the allocation will be pinned device memory local to a given device.
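+ // Only prop.location.id varies in the loops below: each granularity query
+ // and each cuMemCreate call re-targets this one structure at a different
+ // device, while the allocation type stays pinned device memory throughout.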
+ CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - // Get the minimum granularity needed for the resident devices - // (the max of the minimum granularity of each participating device) - for (int idx = 0; idx < residentDevices.size(); idx++) { - size_t granularity = 0; + // Get the minimum granularity needed for the resident devices + // (the max of the minimum granularity of each participating device) + for (int idx = 0; idx < residentDevices.size(); idx++) { + size_t granularity = 0; - // get the minnimum granularity for residentDevices[idx] - prop.location.id = residentDevices[idx]; - status = cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM); - if (status != CUDA_SUCCESS) { - goto done; - } - if (min_granularity < granularity) { - min_granularity = granularity; - } - } - - // Get the minimum granularity needed for the accessing devices - // (the max of the minimum granularity of each participating device) - for (size_t idx = 0; idx < mappingDevices.size(); idx++) { - size_t granularity = 0; - - // get the minnimum granularity for mappingDevices[idx] - prop.location.id = mappingDevices[idx]; - status = cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM); - if (status != CUDA_SUCCESS) { - goto done; - } - if (min_granularity < granularity) { - min_granularity = granularity; - } - } - - // Round up the size such that we can evenly split it into a stripe size tha - // meets the granularity requirements Essentially size = N * - // residentDevices.size() * min_granularity is the requirement, since each - // piece of the allocation will be stripeSize = N * min_granularity and the - // min_granularity requirement applies to each stripeSize piece of the - // allocation. - size = round_up(size, residentDevices.size() * min_granularity); - stripeSize = size / residentDevices.size(); - - // Return the rounded up size to the caller for use in the free - if (allocationSize) { - *allocationSize = size; - } - - // Reserve the required contiguous VA space for the allocations - status = cuMemAddressReserve(dptr, size, align, 0, 0); - if (status != CUDA_SUCCESS) { - goto done; - } - - // Create and map the backings on each gpu - // note: reusing CUmemAllocationProp prop from earlier with prop.type & - // prop.location.type already specified. - for (size_t idx = 0; idx < residentDevices.size(); idx++) { - CUresult status2 = CUDA_SUCCESS; - - // Set the location for this chunk to this device - prop.location.id = residentDevices[idx]; - - // Create the allocation as a pinned allocation on this device - CUmemGenericAllocationHandle allocationHandle; - status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0); - if (status != CUDA_SUCCESS) { - goto done; + // get the minimum granularity for residentDevices[idx] + prop.location.id = residentDevices[idx]; + status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + if (status != CUDA_SUCCESS) { + goto done; + } + if (min_granularity < granularity) { + min_granularity = granularity; + } }
- status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, - allocationHandle, 0); - - // the handle needs to be released even if the mapping failed. - status2 = cuMemRelease(allocationHandle); - if (status == CUDA_SUCCESS) { - // cuMemRelease should not have failed here - // as the handle was just allocated successfully - // however return an error if it does. - status = status2; - } - - // Cleanup in case of any mapping failures. - if (status != CUDA_SUCCESS) { - goto done; - } - } - - { - // Each accessDescriptor will describe the mapping requirement for a single - // device - std::vector<CUmemAccessDesc> accessDescriptors; - accessDescriptors.resize(mappingDevices.size()); - - // Prepare the access descriptor array indicating where and how the backings - // should be visible. + // Get the minimum granularity needed for the accessing devices + // (the max of the minimum granularity of each participating device) for (size_t idx = 0; idx < mappingDevices.size(); idx++) { - // Specify which device we are adding mappings for. - accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDescriptors[idx].location.id = mappingDevices[idx]; + size_t granularity = 0; - // Specify both read and write access. - accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + // get the minimum granularity for mappingDevices[idx] + prop.location.id = mappingDevices[idx]; + status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + if (status != CUDA_SUCCESS) { + goto done; + } + if (min_granularity < granularity) { + min_granularity = granularity; + } } - // Apply the access descriptors to the whole VA range. - status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], - accessDescriptors.size()); + // Round up the size such that we can evenly split it into a stripe size that + // meets the granularity requirements. Essentially size = N * + // residentDevices.size() * min_granularity is the requirement, since each + // piece of the allocation will be stripeSize = N * min_granularity and the + // min_granularity requirement applies to each stripeSize piece of the + // allocation. + size = round_up(size, residentDevices.size() * min_granularity); + stripeSize = size / residentDevices.size(); + + // Return the rounded up size to the caller for use in the free + if (allocationSize) { + *allocationSize = size; + } + + // Reserve the required contiguous VA space for the allocations + status = cuMemAddressReserve(dptr, size, align, 0, 0); if (status != CUDA_SUCCESS) { - goto done; + goto done; + } + + // Create and map the backings on each GPU + // note: reusing CUmemAllocationProp prop from earlier with prop.type & + // prop.location.type already specified. + for (size_t idx = 0; idx < residentDevices.size(); idx++) { + CUresult status2 = CUDA_SUCCESS; + + // Set the location for this chunk to this device + prop.location.id = residentDevices[idx]; + + // Create the allocation as a pinned allocation on this device + CUmemGenericAllocationHandle allocationHandle; + status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0); + if (status != CUDA_SUCCESS) { + goto done; + } + + // Assign the chunk to the appropriate VA range and release the handle. + // After mapping the memory, it can be referenced by virtual address. + // Since we do not need to make any other mappings of this memory or export + // it, we no longer need and can release the allocationHandle. The + // allocation will be kept live until it is unmapped.
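The round_up call above is the sizing step that makes striping work: the total size must be a multiple of residentDevices.size() * min_granularity so that each per-device stripe is itself granularity-aligned. A worked illustration with assumed numbers (granularity and device count vary by system):

```cpp
// Hypothetical numbers, for illustration only: stripe 1 MiB across 3 devices
// whose max-of-min allocation granularity is 64 KiB.
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

size_t min_granularity = 64 * 1024; // assumed
size_t nDevices        = 3;         // assumed
size_t size            = round_up(1 << 20, nDevices * min_granularity);
// size == 1179648 (6 * 196608), the smallest eligible multiple >= 1 MiB
size_t stripeSize      = size / nDevices; // 393216 == 6 * 65536, 64 KiB aligned
```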
+ status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0); + + // the handle needs to be released even if the mapping failed. + status2 = cuMemRelease(allocationHandle); + if (status == CUDA_SUCCESS) { + // cuMemRelease should not have failed here + // as the handle was just allocated successfully + // however return an error if it does. + status = status2; + } + + // Cleanup in case of any mapping failures. + if (status != CUDA_SUCCESS) { + goto done; + } + } + + { + // Each accessDescriptor will describe the mapping requirement for a single + // device + std::vector<CUmemAccessDesc> accessDescriptors; + accessDescriptors.resize(mappingDevices.size()); + + // Prepare the access descriptor array indicating where and how the backings + // should be visible. + for (size_t idx = 0; idx < mappingDevices.size(); idx++) { + // Specify which device we are adding mappings for. + accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDescriptors[idx].location.id = mappingDevices[idx]; + + // Specify both read and write access. + accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + } + + // Apply the access descriptors to the whole VA range. + status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size()); + if (status != CUDA_SUCCESS) { + goto done; + } } done: - if (status != CUDA_SUCCESS) { - if (*dptr) { - simpleFreeMultiDeviceMmap(*dptr, size); + if (status != CUDA_SUCCESS) { + if (*dptr) { + simpleFreeMultiDeviceMmap(*dptr, size); + } } - } - return status; + return status; } -CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) { - CUresult status = CUDA_SUCCESS; +CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) +{ + CUresult status = CUDA_SUCCESS; - // Unmap the mapped virtual memory region - // Since the handles to the mapped backing stores have already been released - // by cuMemRelease, and these are the only/last mappings referencing them, - // The backing stores will be freed. - // Since the memory has been unmapped after this call, accessing the specified - // va range will result in a fault (unitll it is remapped). - status = cuMemUnmap(dptr, size); - if (status != CUDA_SUCCESS) { - return status; - } - // Free the virtual address region. This allows the virtual address region - // to be reused by future cuMemAddressReserve calls. This also allows the - // virtual address region to be used by other allocation made through - // opperating system calls like malloc & mmap. - status = cuMemAddressFree(dptr, size); - if (status != CUDA_SUCCESS) { - return status; - } + // Unmap the mapped virtual memory region + // Since the handles to the mapped backing stores have already been released + // by cuMemRelease, and these are the only/last mappings referencing them, + // the backing stores will be freed. + // Since the memory has been unmapped after this call, accessing the specified + // VA range will result in a fault (until it is remapped). + status = cuMemUnmap(dptr, size); + if (status != CUDA_SUCCESS) { + return status; + } + // Free the virtual address region. This allows the virtual address region + // to be reused by future cuMemAddressReserve calls. This also allows the + // virtual address region to be used by other allocations made through + // operating system calls like malloc & mmap.
+ status = cuMemAddressFree(dptr, size); + if (status != CUDA_SUCCESS) { + return status; + } - return status; + return status; } diff --git a/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.hpp b/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.hpp index 8f297310..d0cd7ae9 100644 --- a/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.hpp +++ b/Samples/0_Introduction/vectorAddMMAP/multidevicealloc_memmap.hpp @@ -63,10 +63,12 @@ //! handle //! is not needed after its mappings are set up. //////////////////////////////////////////////////////////////////////////// -CUresult simpleMallocMultiDeviceMmap( - CUdeviceptr *dptr, size_t *allocationSize, size_t size, - const std::vector<CUdevice> &residentDevices, - const std::vector<CUdevice> &mappingDevices, size_t align = 0); +CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr, + size_t *allocationSize, + size_t size, + const std::vector<CUdevice> &residentDevices, + const std::vector<CUdevice> &mappingDevices, + size_t align = 0); //////////////////////////////////////////////////////////////////////////// //! Frees resources allocated by simpleMallocMultiDeviceMmap diff --git a/Samples/0_Introduction/vectorAddMMAP/vectorAddMMAP.cpp b/Samples/0_Introduction/vectorAddMMAP/vectorAddMMAP.cpp index e0349974..f0249e78 100644 --- a/Samples/0_Introduction/vectorAddMMAP/vectorAddMMAP.cpp +++ b/Samples/0_Introduction/vectorAddMMAP/vectorAddMMAP.cpp @@ -36,11 +36,11 @@ */ // Includes +#include #include +#include #include #include -#include -#include // includes, project #include @@ -54,115 +54,111 @@ using namespace std; // Variables -CUdevice cuDevice; -CUcontext cuContext; -CUmodule cuModule; -CUfunction vecAdd_kernel; -float *h_A; -float *h_B; -float *h_C; +CUdevice cuDevice; +CUcontext cuContext; +CUmodule cuModule; +CUfunction vecAdd_kernel; +float *h_A; +float *h_B; +float *h_C; CUdeviceptr d_A; CUdeviceptr d_B; CUdeviceptr d_C; -size_t allocationSize = 0; +size_t allocationSize = 0; // Functions -int CleanupNoFailure(); +int CleanupNoFailure(); void RandomInit(float *, int); -//define input fatbin file +// define input fatbin file #ifndef FATBIN_FILE #define FATBIN_FILE "vectorAdd_kernel64.fatbin" #endif // collect all of the devices whose memory can be mapped from cuDevice.
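Together, the two helpers above form a malloc/free-style pair for memory that is physically striped across several GPUs but reachable through one contiguous VA range. A hypothetical call site ("bytes", "residentDevices" and "mappingDevices" are placeholders assumed to be set up by the caller, as vectorAddMMAP.cpp does below):

```cpp
// Hypothetical usage sketch of the helpers declared above.
CUdeviceptr d_buf      = 0;
size_t      actualSize = 0; // receives the rounded-up allocation size
checkCudaErrors(simpleMallocMultiDeviceMmap(&d_buf, &actualSize, bytes, residentDevices, mappingDevices));
// ... launch kernels that read or write d_buf on any device in mappingDevices ...
checkCudaErrors(simpleFreeMultiDeviceMmap(d_buf, actualSize)); // free with the rounded size, not the requested one
```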
-vector<CUdevice> getBackingDevices(CUdevice cuDevice) { - int num_devices; +vector<CUdevice> getBackingDevices(CUdevice cuDevice) +{ + int num_devices; - checkCudaErrors(cuDeviceGetCount(&num_devices)); + checkCudaErrors(cuDeviceGetCount(&num_devices)); - vector<CUdevice> backingDevices; - backingDevices.push_back(cuDevice); - for (int dev = 0; dev < num_devices; dev++) { - int capable = 0; - int attributeVal = 0; + vector<CUdevice> backingDevices; + backingDevices.push_back(cuDevice); + for (int dev = 0; dev < num_devices; dev++) { + int capable = 0; + int attributeVal = 0; - // The mapping device is already in the backingDevices vector - if (dev == cuDevice) { - continue; + // The mapping device is already in the backingDevices vector + if (dev == cuDevice) { + continue; + } + + // Only peer capable devices can map each other's memory + checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev)); + if (!capable) { + continue; + } + + // The device needs to support virtual address management for the required + // APIs to work + checkCudaErrors( + cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice)); + if (attributeVal == 0) { + continue; + } + + backingDevices.push_back(dev); } - - // Only peer capable devices can map each others memory - checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev)); - if (!capable) { - continue; - } - - // The device needs to support virtual address management for the required - // apis to work - checkCudaErrors(cuDeviceGetAttribute( - &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice)); - if (attributeVal == 0) { - continue; - } - - backingDevices.push_back(dev); - } - return backingDevices; + return backingDevices; } // Host code -int main(int argc, char **argv) { - printf("Vector Addition (Driver API)\n"); - int N = 50000; - size_t size = N * sizeof(float); - int attributeVal = 0; +int main(int argc, char **argv) +{ + printf("Vector Addition (Driver API)\n"); + int N = 50000; + size_t size = N * sizeof(float); + int attributeVal = 0; - // Initialize - checkCudaErrors(cuInit(0)); + // Initialize + checkCudaErrors(cuInit(0)); - cuDevice = findCudaDeviceDRV(argc, (const char **)argv); + cuDevice = findCudaDeviceDRV(argc, (const char **)argv); - // Check that the selected device supports virtual address management - checkCudaErrors(cuDeviceGetAttribute( - &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice)); - printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, - attributeVal); - if (attributeVal == 0) { - printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice); - exit(EXIT_WAIVED); - } + // Check that the selected device supports virtual address management + checkCudaErrors( + cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice)); + printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal); + if (attributeVal == 0) { + printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice); + exit(EXIT_WAIVED); + } - // The vector addition happens on cuDevice, so the allocations need to be - // mapped there. - vector<CUdevice> mappingDevices; - mappingDevices.push_back(cuDevice); + // The vector addition happens on cuDevice, so the allocations need to be + // mapped there. + vector<CUdevice> mappingDevices; + mappingDevices.push_back(cuDevice); - // Collect devices accessible by the mapping device (cuDevice) into the - // backingDevices vector.
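main() above fixes N = 50000; the kernel launch further down sizes its grid by ceiling division so the last, possibly partial, block is still covered. Worked out with the sample's own numbers:

```cpp
// Worked example of the grid sizing used by the launch below: ceiling
// division over-provisions at most one block; the i < N guard in
// VecAdd_kernel masks the excess threads.
int N               = 50000;
int threadsPerBlock = 256;
int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock; // 196
// 196 * 256 = 50176 threads launched; threads 50000..50175 do no work.
```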
- vector<CUdevice> backingDevices = getBackingDevices(cuDevice); + // Collect devices accessible by the mapping device (cuDevice) into the + // backingDevices vector. + vector<CUdevice> backingDevices = getBackingDevices(cuDevice); - // Create context - checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + // Create context + checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); // first search for the module path before we load the results string module_path; std::ostringstream fatbin; - if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) - { + if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { exit(EXIT_FAILURE); } - else - { + else { printf("> initCUDA loading module: <%s>\n", module_path.c_str()); } - if (!fatbin.str().size()) - { + if (!fatbin.str().size()) { printf("fatbin file empty. exiting..\n"); exit(EXIT_FAILURE); } @@ -204,13 +200,10 @@ int main(int argc, char **argv) { int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; - void *args[] = { &d_A, &d_B, &d_C, &N }; + void *args[] = {&d_A, &d_B, &d_C, &N}; // Launch the CUDA kernel - checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, - 0, - NULL, args, NULL)); + checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL)); // Copy result from device memory to host memory // h_C contains the result in host memory @@ -219,20 +212,18 @@ int main(int argc, char **argv) { // Verify result int i; - for (i = 0; i < N; ++i) - { + for (i = 0; i < N; ++i) { float sum = h_A[i] + h_B[i]; - if (fabs(h_C[i] - sum) > 1e-7f) - { + if (fabs(h_C[i] - sum) > 1e-7f) { break; } } CleanupNoFailure(); - printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL"); + printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); - exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE); + exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); } int CleanupNoFailure() @@ -243,18 +234,15 @@ int CleanupNoFailure() checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)); // Free host memory - if (h_A) - { + if (h_A) { free(h_A); } - if (h_B) - { + if (h_B) { free(h_B); } - if (h_C) - { + if (h_C) { free(h_C); } @@ -265,8 +253,7 @@ int CleanupNoFailure() // Allocates an array with random float entries. void RandomInit(float *data, int n) { - for (int i = 0; i < n; ++i) - { + for (int i = 0; i < n; ++i) { data[i] = rand() / (float)RAND_MAX; } } diff --git a/Samples/0_Introduction/vectorAddMMAP/vectorAdd_kernel.cu b/Samples/0_Introduction/vectorAddMMAP/vectorAdd_kernel.cu index 8dba27ac..00662fa4 100644 --- a/Samples/0_Introduction/vectorAddMMAP/vectorAdd_kernel.cu +++ b/Samples/0_Introduction/vectorAddMMAP/vectorAdd_kernel.cu @@ -34,9 +34,10 @@ */ // Device code -extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, - float *C, int N) { - int i = blockDim.x * blockIdx.x + threadIdx.x; +extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) C[i] = A[i] + B[i]; + if (i < N) + C[i] = A[i] + B[i]; } diff --git a/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd.cpp b/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd.cpp index d79e4475..1b3200f8 100644 --- a/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd.cpp +++ b/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd.cpp @@ -33,8 +33,8 @@ * of the programming guide with some additions like error checking.
*/ -#include #include +#include // For the CUDA runtime routines (prefixed with "cuda_") #include @@ -42,112 +42,116 @@ // helper functions and utilities to work with CUDA #include - #include /** * Host main routine */ -int main(int argc, char **argv) { - char *cubin, *kernel_file; - size_t cubinSize; - kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - CUmodule module = loadCUBIN(cubin, argc, argv); +int main(int argc, char **argv) +{ + char *cubin, *kernel_file; + size_t cubinSize; + kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + CUmodule module = loadCUBIN(cubin, argc, argv); - CUfunction kernel_addr; - checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd")); + CUfunction kernel_addr; + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd")); - // Print the vector length to be used, and compute its size - int numElements = 50000; - size_t size = numElements * sizeof(float); - printf("[Vector addition of %d elements]\n", numElements); + // Print the vector length to be used, and compute its size + int numElements = 50000; + size_t size = numElements * sizeof(float); + printf("[Vector addition of %d elements]\n", numElements); - // Allocate the host input vector A - float *h_A = reinterpret_cast<float *>(malloc(size)); + // Allocate the host input vector A + float *h_A = reinterpret_cast<float *>(malloc(size)); - // Allocate the host input vector B - float *h_B = reinterpret_cast<float *>(malloc(size)); + // Allocate the host input vector B + float *h_B = reinterpret_cast<float *>(malloc(size)); - // Allocate the host output vector C - float *h_C = reinterpret_cast<float *>(malloc(size)); + // Allocate the host output vector C + float *h_C = reinterpret_cast<float *>(malloc(size)); - // Verify that allocations succeeded - if (h_A == NULL || h_B == NULL || h_C == NULL) { - fprintf(stderr, "Failed to allocate host vectors!\n"); - exit(EXIT_FAILURE); - } - - // Initialize the host input vectors - for (int i = 0; i < numElements; ++i) { - h_A[i] = rand() / static_cast<float>(RAND_MAX); - h_B[i] = rand() / static_cast<float>(RAND_MAX); - } - - // Allocate the device input vector A - CUdeviceptr d_A; - checkCudaErrors(cuMemAlloc(&d_A, size)); - - // Allocate the device input vector B - CUdeviceptr d_B; - checkCudaErrors(cuMemAlloc(&d_B, size)); - - // Allocate the device output vector C - CUdeviceptr d_C; - checkCudaErrors(cuMemAlloc(&d_C, size)); - - // Copy the host input vectors A and B in host memory to the device input - // vectors in device memory - printf("Copy input data from the host memory to the CUDA device\n"); - checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); - checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); - - // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 256; - int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; - printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, - threadsPerBlock); - dim3 cudaBlockSize(threadsPerBlock, 1, 1); - dim3 cudaGridSize(blocksPerGrid, 1, 1); - - void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B), - reinterpret_cast<void *>(&d_C), - reinterpret_cast<void *>(&numElements)}; - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - cudaBlockSize.x, cudaBlockSize.y, - cudaBlockSize.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &arr[0], /* arguments */ - 0)); - checkCudaErrors(cuCtxSynchronize()); - - //
Copy the device result vector in device memory to the host result vector - // in host memory. - printf("Copy output data from the CUDA device to the host memory\n"); - checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); - - // Verify that the result vector is correct - for (int i = 0; i < numElements; ++i) { - if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { - fprintf(stderr, "Result verification failed at element %d!\n", i); - exit(EXIT_FAILURE); + // Verify that allocations succeeded + if (h_A == NULL || h_B == NULL || h_C == NULL) { + fprintf(stderr, "Failed to allocate host vectors!\n"); + exit(EXIT_FAILURE); } - } - printf("Test PASSED\n"); + // Initialize the host input vectors + for (int i = 0; i < numElements; ++i) { + h_A[i] = rand() / static_cast<float>(RAND_MAX); + h_B[i] = rand() / static_cast<float>(RAND_MAX); + } - // Free device global memory - checkCudaErrors(cuMemFree(d_A)); - checkCudaErrors(cuMemFree(d_B)); - checkCudaErrors(cuMemFree(d_C)); + // Allocate the device input vector A + CUdeviceptr d_A; + checkCudaErrors(cuMemAlloc(&d_A, size)); - // Free host memory - free(h_A); - free(h_B); - free(h_C); + // Allocate the device input vector B + CUdeviceptr d_B; + checkCudaErrors(cuMemAlloc(&d_B, size)); - printf("Done\n"); + // Allocate the device output vector C + CUdeviceptr d_C; + checkCudaErrors(cuMemAlloc(&d_C, size)); - return 0; + // Copy the host input vectors A and B in host memory to the device input + // vectors in device memory + printf("Copy input data from the host memory to the CUDA device\n"); + checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); + checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); + dim3 cudaBlockSize(threadsPerBlock, 1, 1); + dim3 cudaGridSize(blocksPerGrid, 1, 1); + + void *arr[] = {reinterpret_cast<void *>(&d_A), + reinterpret_cast<void *>(&d_B), + reinterpret_cast<void *>(&d_C), + reinterpret_cast<void *>(&numElements)}; + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, + cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); + checkCudaErrors(cuCtxSynchronize()); + + // Copy the device result vector in device memory to the host result vector + // in host memory. + printf("Copy output data from the CUDA device to the host memory\n"); + checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); + + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) { + if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { + fprintf(stderr, "Result verification failed at element %d!\n", i); + exit(EXIT_FAILURE); + } + } + + printf("Test PASSED\n"); + + // Free device global memory + checkCudaErrors(cuMemFree(d_A)); + checkCudaErrors(cuMemFree(d_B)); + checkCudaErrors(cuMemFree(d_C)); + + // Free host memory + free(h_A); + free(h_B); + free(h_C); + + printf("Done\n"); + + return 0; } diff --git a/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd_kernel.cu b/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd_kernel.cu index 44c85a47..32b187cc 100644 --- a/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd_kernel.cu +++ b/Samples/0_Introduction/vectorAdd_nvrtc/vectorAdd_kernel.cu @@ -32,11 +32,11 @@ * number of elements numElements.
*/ -extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, - int numElements) { - int i = blockDim.x * blockIdx.x + threadIdx.x; +extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < numElements) { - C[i] = A[i] + B[i]; - } + if (i < numElements) { + C[i] = A[i] + B[i]; + } } diff --git a/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu b/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu index 01cb00b0..56764d02 100644 --- a/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu +++ b/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu @@ -39,12 +39,10 @@ #include // includes -#include // helper functions for CUDA error checking and initialization -#include // helper for shared functions common to CUDA Samples - -#include - #include +#include +#include // helper functions for CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples #include #include @@ -52,26 +50,26 @@ static const char *sSDKsample = "CUDA Bandwidth Test"; // defines, project #define MEMCOPY_ITERATIONS 100 -#define DEFAULT_SIZE (32 * (1e6)) // 32 M -#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M -#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M +#define DEFAULT_SIZE (32 * (1e6)) // 32 M +#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M +#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M // shmoo mode defines -#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M -#define SHMOO_MEMSIZE_START (1e3) // 1 KB -#define SHMOO_INCREMENT_1KB (1e3) // 1 KB -#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB -#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB -#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB -#define SHMOO_INCREMENT_1MB (1e6) // 1 MB -#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB -#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB -#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB -#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB -#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB -#define SHMOO_LIMIT_1MB (1e6) // 1 MB -#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB -#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB +#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M +#define SHMOO_MEMSIZE_START (1e3) // 1 KB +#define SHMOO_INCREMENT_1KB (1e3) // 1 KB +#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB +#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB +#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB +#define SHMOO_INCREMENT_1MB (1e6) // 1 MB +#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB +#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB +#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB +#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB +#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB +#define SHMOO_LIMIT_1MB (1e6) // 1 MB +#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB +#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB // CPU cache flush #define FLUSH_SIZE (256 * 1024 * 1024) @@ -83,887 +81,969 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; enum printMode { USER_READABLE, CSV }; enum memoryMode { PINNED, PAGEABLE }; -const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", - "Device to Device", NULL}; +const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL}; const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; // if true, use CPU based timing for everything static bool bDontUseGPUTiming; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; 
//////////////////////////////////////////////////////////////////////////////// // declaration, forward -int runTest(const int argc, const char **argv); -void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, - testMode mode, memcpyKind kind, printMode printmode, - memoryMode memMode, int startDevice, int endDevice, bool wc); -void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, - memoryMode memMode, int startDevice, int endDevice, - bool wc); -void testBandwidthRange(unsigned int start, unsigned int end, - unsigned int increment, memcpyKind kind, - printMode printmode, memoryMode memMode, - int startDevice, int endDevice, bool wc); -void testBandwidthShmoo(memcpyKind kind, printMode printmode, - memoryMode memMode, int startDevice, int endDevice, - bool wc); -float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, - bool wc); -float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, - bool wc); +int runTest(const int argc, const char **argv); +void testBandwidth(unsigned int start, + unsigned int end, + unsigned int increment, + testMode mode, + memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc); +void testBandwidthQuick(unsigned int size, + memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc); +void testBandwidthRange(unsigned int start, + unsigned int end, + unsigned int increment, + memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc); +void testBandwidthShmoo(memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc); +float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc); +float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc); float testDeviceToDeviceTransfer(unsigned int memSize); -void printResultsReadable(unsigned int *memSizes, double *bandwidths, - unsigned int count, memcpyKind kind, - memoryMode memMode, int iNumDevs, bool wc); -void printResultsCSV(unsigned int *memSizes, double *bandwidths, - unsigned int count, memcpyKind kind, memoryMode memMode, - int iNumDevs, bool wc); -void printHelp(void); +void printResultsReadable(unsigned int *memSizes, + double *bandwidths, + unsigned int count, + memcpyKind kind, + memoryMode memMode, + int iNumDevs, + bool wc); +void printResultsCSV(unsigned int *memSizes, + double *bandwidths, + unsigned int count, + memcpyKind kind, + memoryMode memMode, + int iNumDevs, + bool wc); +void printHelp(void); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; - flush_buf = (char *)malloc(FLUSH_SIZE); + flush_buf = (char *)malloc(FLUSH_SIZE); - // set logfile name and start logs - printf("[%s] - Starting...\n", sSDKsample); + // set logfile name and start logs + printf("[%s] - Starting...\n", sSDKsample); - int iRetVal = runTest(argc, (const char **)argv); + int iRetVal = runTest(argc, (const char **)argv); - if (iRetVal < 0) { - checkCudaErrors(cudaSetDevice(0)); - } + if (iRetVal < 0) { + checkCudaErrors(cudaSetDevice(0)); + } - // finish - printf("%s\n", (iRetVal == 0) ? 
"Result = PASS" : "Result = FAIL"); + // finish + printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n"); - free(flush_buf); + free(flush_buf); - exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE); + exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE); } /////////////////////////////////////////////////////////////////////////////// // Parse args, run the appropriate tests /////////////////////////////////////////////////////////////////////////////// -int runTest(const int argc, const char **argv) { - int start = DEFAULT_SIZE; - int end = DEFAULT_SIZE; - int startDevice = 0; - int endDevice = 0; - int increment = DEFAULT_INCREMENT; - testMode mode = QUICK_MODE; - bool htod = false; - bool dtoh = false; - bool dtod = false; - bool wc = false; - char *modeStr; - char *device = NULL; - printMode printmode = USER_READABLE; - char *memModeStr = NULL; - memoryMode memMode = PINNED; +int runTest(const int argc, const char **argv) +{ + int start = DEFAULT_SIZE; + int end = DEFAULT_SIZE; + int startDevice = 0; + int endDevice = 0; + int increment = DEFAULT_INCREMENT; + testMode mode = QUICK_MODE; + bool htod = false; + bool dtoh = false; + bool dtod = false; + bool wc = false; + char *modeStr; + char *device = NULL; + printMode printmode = USER_READABLE; + char *memModeStr = NULL; + memoryMode memMode = PINNED; - // process command line args - if (checkCmdLineFlag(argc, argv, "help")) { - printHelp(); - return 0; - } - - if (checkCmdLineFlag(argc, argv, "csv")) { - printmode = CSV; - } - - if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { - if (strcmp(memModeStr, "pageable") == 0) { - memMode = PAGEABLE; - } else if (strcmp(memModeStr, "pinned") == 0) { - memMode = PINNED; - } else { - printf("Invalid memory mode - valid modes are pageable or pinned\n"); - printf("See --help for more information\n"); - return -1000; - } - } else { - // default - pinned memory - memMode = PINNED; - } - - if (getCmdLineArgumentString(argc, argv, "device", &device)) { - int deviceCount; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, - cudaGetErrorString(error_id)); - exit(EXIT_FAILURE); + // process command line args + if (checkCmdLineFlag(argc, argv, "help")) { + printHelp(); + return 0; } - if (deviceCount == 0) { - printf("!!!!!No devices found!!!!!\n"); - return -2000; + if (checkCmdLineFlag(argc, argv, "csv")) { + printmode = CSV; } - if (strcmp(device, "all") == 0) { - printf( - "\n!!!!!Cumulative Bandwidth to be computed from all the devices " - "!!!!!!\n\n"); - startDevice = 0; - endDevice = deviceCount - 1; - } else { - startDevice = endDevice = atoi(device); - - if (startDevice >= deviceCount || startDevice < 0) { - printf( - "\n!!!!!Invalid GPU number %d given hence default gpu %d will be " - "used !!!!!\n", - startDevice, 0); - startDevice = endDevice = 0; - } + if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { + if (strcmp(memModeStr, "pageable") == 0) { + memMode = PAGEABLE; + } + else if (strcmp(memModeStr, "pinned") == 0) { + memMode = PINNED; + } + else { + printf("Invalid memory mode - valid modes are pageable or pinned\n"); + printf("See --help for more information\n"); + return 
-1000; + } } - } - - printf("Running on...\n\n"); - - for (int currentDevice = startDevice; currentDevice <= endDevice; - currentDevice++) { - cudaDeviceProp deviceProp; - cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); - - if (error_id == cudaSuccess) { - printf(" Device %d: %s\n", currentDevice, deviceProp.name); - - if (deviceProp.computeMode == cudaComputeModeProhibited) { - fprintf(stderr, - "Error: device is running in , no " - "threads can use ::cudaSetDevice().\n"); - checkCudaErrors(cudaSetDevice(currentDevice)); - - exit(EXIT_FAILURE); - } - } else { - printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, - cudaGetErrorString(error_id)); - checkCudaErrors(cudaSetDevice(currentDevice)); - - exit(EXIT_FAILURE); + else { + // default - pinned memory + memMode = PINNED; } - } - if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { - // figure out the mode - if (strcmp(modeStr, "quick") == 0) { - printf(" Quick Mode\n\n"); - mode = QUICK_MODE; - } else if (strcmp(modeStr, "shmoo") == 0) { - printf(" Shmoo Mode\n\n"); - mode = SHMOO_MODE; - } else if (strcmp(modeStr, "range") == 0) { - printf(" Range Mode\n\n"); - mode = RANGE_MODE; - } else { - printf("Invalid mode - valid modes are quick, range, or shmoo\n"); - printf("See --help for more information\n"); - return -3000; + if (getCmdLineArgumentString(argc, argv, "device", &device)) { + int deviceCount; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + + if (deviceCount == 0) { + printf("!!!!!No devices found!!!!!\n"); + return -2000; + } + + if (strcmp(device, "all") == 0) { + printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices " + "!!!!!!\n\n"); + startDevice = 0; + endDevice = deviceCount - 1; + } + else { + startDevice = endDevice = atoi(device); + + if (startDevice >= deviceCount || startDevice < 0) { + printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be " + "used !!!!!\n", + startDevice, + 0); + startDevice = endDevice = 0; + } + } } - } else { - // default mode - quick - printf(" Quick Mode\n\n"); - mode = QUICK_MODE; - } - if (checkCmdLineFlag(argc, argv, "htod")) { - htod = true; - } + printf("Running on...\n\n"); - if (checkCmdLineFlag(argc, argv, "dtoh")) { - dtoh = true; - } + for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { + cudaDeviceProp deviceProp; + cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); - if (checkCmdLineFlag(argc, argv, "dtod")) { - dtod = true; - } + if (error_id == cudaSuccess) { + printf(" Device %d: %s\n", currentDevice, deviceProp.name); + + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no " + "threads can use ::cudaSetDevice().\n"); + checkCudaErrors(cudaSetDevice(currentDevice)); + + exit(EXIT_FAILURE); + } + } + else { + printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); + checkCudaErrors(cudaSetDevice(currentDevice)); + + exit(EXIT_FAILURE); + } + } + + if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { + // figure out the mode + if (strcmp(modeStr, "quick") == 0) { + printf(" Quick Mode\n\n"); + mode = QUICK_MODE; + } + else if (strcmp(modeStr, "shmoo") == 0) { + printf(" Shmoo Mode\n\n"); + mode = SHMOO_MODE; + } + else if (strcmp(modeStr, "range") == 
0) { + printf(" Range Mode\n\n"); + mode = RANGE_MODE; + } + else { + printf("Invalid mode - valid modes are quick, range, or shmoo\n"); + printf("See --help for more information\n"); + return -3000; + } + } + else { + // default mode - quick + printf(" Quick Mode\n\n"); + mode = QUICK_MODE; + } + + if (checkCmdLineFlag(argc, argv, "htod")) { + htod = true; + } + + if (checkCmdLineFlag(argc, argv, "dtoh")) { + dtoh = true; + } + + if (checkCmdLineFlag(argc, argv, "dtod")) { + dtod = true; + } #if CUDART_VERSION >= 2020 - if (checkCmdLineFlag(argc, argv, "wc")) { - wc = true; - } + if (checkCmdLineFlag(argc, argv, "wc")) { + wc = true; + } #endif - if (checkCmdLineFlag(argc, argv, "cputiming")) { - bDontUseGPUTiming = true; - } - - if (!htod && !dtoh && !dtod) { - // default: All - htod = true; - dtoh = true; - dtod = true; - } - - if (RANGE_MODE == mode) { - if (checkCmdLineFlag(argc, (const char **)argv, "start")) { - start = getCmdLineArgumentInt(argc, argv, "start"); - - if (start <= 0) { - printf("Illegal argument - start must be greater than zero\n"); - return -4000; - } - } else { - printf("Must specify a starting size in range mode\n"); - printf("See --help for more information\n"); - return -5000; + if (checkCmdLineFlag(argc, argv, "cputiming")) { + bDontUseGPUTiming = true; } - if (checkCmdLineFlag(argc, (const char **)argv, "end")) { - end = getCmdLineArgumentInt(argc, argv, "end"); - - if (end <= 0) { - printf("Illegal argument - end must be greater than zero\n"); - return -6000; - } - - if (start > end) { - printf("Illegal argument - start is greater than end\n"); - return -7000; - } - } else { - printf("Must specify an end size in range mode.\n"); - printf("See --help for more information\n"); - return -8000; + if (!htod && !dtoh && !dtod) { + // default: All + htod = true; + dtoh = true; + dtod = true; } - if (checkCmdLineFlag(argc, argv, "increment")) { - increment = getCmdLineArgumentInt(argc, argv, "increment"); + if (RANGE_MODE == mode) { + if (checkCmdLineFlag(argc, (const char **)argv, "start")) { + start = getCmdLineArgumentInt(argc, argv, "start"); - if (increment <= 0) { - printf("Illegal argument - increment must be greater than zero\n"); - return -9000; - } - } else { - printf("Must specify an increment in user mode\n"); - printf("See --help for more information\n"); - return -10000; + if (start <= 0) { + printf("Illegal argument - start must be greater than zero\n"); + return -4000; + } + } + else { + printf("Must specify a starting size in range mode\n"); + printf("See --help for more information\n"); + return -5000; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "end")) { + end = getCmdLineArgumentInt(argc, argv, "end"); + + if (end <= 0) { + printf("Illegal argument - end must be greater than zero\n"); + return -6000; + } + + if (start > end) { + printf("Illegal argument - start is greater than end\n"); + return -7000; + } + } + else { + printf("Must specify an end size in range mode.\n"); + printf("See --help for more information\n"); + return -8000; + } + + if (checkCmdLineFlag(argc, argv, "increment")) { + increment = getCmdLineArgumentInt(argc, argv, "increment"); + + if (increment <= 0) { + printf("Illegal argument - increment must be greater than zero\n"); + return -9000; + } + } + else { + printf("Must specify an increment in user mode\n"); + printf("See --help for more information\n"); + return -10000; + } } - } - if (htod) { - testBandwidth((unsigned int)start, (unsigned int)end, - (unsigned int)increment, mode, HOST_TO_DEVICE, printmode, - 
memMode, startDevice, endDevice, wc); - } + if (htod) { + testBandwidth((unsigned int)start, + (unsigned int)end, + (unsigned int)increment, + mode, + HOST_TO_DEVICE, + printmode, + memMode, + startDevice, + endDevice, + wc); + } - if (dtoh) { - testBandwidth((unsigned int)start, (unsigned int)end, - (unsigned int)increment, mode, DEVICE_TO_HOST, printmode, - memMode, startDevice, endDevice, wc); - } + if (dtoh) { + testBandwidth((unsigned int)start, + (unsigned int)end, + (unsigned int)increment, + mode, + DEVICE_TO_HOST, + printmode, + memMode, + startDevice, + endDevice, + wc); + } - if (dtod) { - testBandwidth((unsigned int)start, (unsigned int)end, - (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, - memMode, startDevice, endDevice, wc); - } + if (dtod) { + testBandwidth((unsigned int)start, + (unsigned int)end, + (unsigned int)increment, + mode, + DEVICE_TO_DEVICE, + printmode, + memMode, + startDevice, + endDevice, + wc); + } - // Ensure that we reset all CUDA Devices in question - for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) { - cudaSetDevice(nDevice); - } + // Ensure that we reset all CUDA Devices in question + for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) { + cudaSetDevice(nDevice); + } - return 0; + return 0; } /////////////////////////////////////////////////////////////////////////////// // Run a bandwidth test /////////////////////////////////////////////////////////////////////////////// -void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, - testMode mode, memcpyKind kind, printMode printmode, - memoryMode memMode, int startDevice, int endDevice, - bool wc) { - switch (mode) { +void testBandwidth(unsigned int start, + unsigned int end, + unsigned int increment, + testMode mode, + memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc) +{ + switch (mode) { case QUICK_MODE: - testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, - endDevice, wc); - break; + testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc); + break; case RANGE_MODE: - testBandwidthRange(start, end, increment, kind, printmode, memMode, - startDevice, endDevice, wc); - break; + testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc); + break; case SHMOO_MODE: - testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); - break; + testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); + break; default: - break; - } + break; + } } ////////////////////////////////////////////////////////////////////// // Run a quick mode bandwidth test ////////////////////////////////////////////////////////////////////// -void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, - memoryMode memMode, int startDevice, int endDevice, - bool wc) { - testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, - startDevice, endDevice, wc); +void testBandwidthQuick(unsigned int size, + memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc) +{ + testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc); } /////////////////////////////////////////////////////////////////////// // Run a range mode bandwidth test ////////////////////////////////////////////////////////////////////// -void testBandwidthRange(unsigned int start, unsigned int end, - 
unsigned int increment, memcpyKind kind, - printMode printmode, memoryMode memMode, - int startDevice, int endDevice, bool wc) { - // count the number of copies we're going to run - unsigned int count = 1 + ((end - start) / increment); +void testBandwidthRange(unsigned int start, + unsigned int end, + unsigned int increment, + memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc) +{ + // count the number of copies we're going to run + unsigned int count = 1 + ((end - start) / increment); - unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); - double *bandwidths = (double *)malloc(count * sizeof(double)); + unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); + double *bandwidths = (double *)malloc(count * sizeof(double)); - // Before calculating the cumulative bandwidth, initialize bandwidths array to - // NULL - for (unsigned int i = 0; i < count; i++) { - bandwidths[i] = 0.0; - } - - // Use the device asked by the user - for (int currentDevice = startDevice; currentDevice <= endDevice; - currentDevice++) { - cudaSetDevice(currentDevice); - - // run each of the copies + // Before calculating the cumulative bandwidth, initialize bandwidths array to + // NULL for (unsigned int i = 0; i < count; i++) { - memSizes[i] = start + i * increment; - - switch (kind) { - case DEVICE_TO_HOST: - bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); - break; - - case HOST_TO_DEVICE: - bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); - break; - - case DEVICE_TO_DEVICE: - bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); - break; - } + bandwidths[i] = 0.0; } - } // Complete the bandwidth computation on all the devices - // print results - if (printmode == CSV) { - printResultsCSV(memSizes, bandwidths, count, kind, memMode, - (1 + endDevice - startDevice), wc); - } else { - printResultsReadable(memSizes, bandwidths, count, kind, memMode, - (1 + endDevice - startDevice), wc); - } + // Use the device asked by the user + for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { + cudaSetDevice(currentDevice); - // clean up - free(memSizes); - free(bandwidths); + // run each of the copies + for (unsigned int i = 0; i < count; i++) { + memSizes[i] = start + i * increment; + + switch (kind) { + case DEVICE_TO_HOST: + bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); + break; + + case HOST_TO_DEVICE: + bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); + break; + + case DEVICE_TO_DEVICE: + bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); + break; + } + } + } // Complete the bandwidth computation on all the devices + + // print results + if (printmode == CSV) { + printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } + else { + printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } + + // clean up + free(memSizes); + free(bandwidths); } ////////////////////////////////////////////////////////////////////////////// // Intense shmoo mode - covers a large range of values with varying increments ////////////////////////////////////////////////////////////////////////////// -void testBandwidthShmoo(memcpyKind kind, printMode printmode, - memoryMode memMode, int startDevice, int endDevice, - bool wc) { - // count the number of copies to make - unsigned int count = - 1 + (SHMOO_LIMIT_20KB / 
SHMOO_INCREMENT_1KB) + - ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + - ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + - ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + - ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + - ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + - ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); +void testBandwidthShmoo(memcpyKind kind, + printMode printmode, + memoryMode memMode, + int startDevice, + int endDevice, + bool wc) +{ + // count the number of copies to make + unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) + + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); - unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); - double *bandwidths = (double *)malloc(count * sizeof(double)); + unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); + double *bandwidths = (double *)malloc(count * sizeof(double)); - // Before calculating the cumulative bandwidth, initialize bandwidths array to - // NULL - for (unsigned int i = 0; i < count; i++) { - bandwidths[i] = 0.0; - } - - // Use the device asked by the user - for (int currentDevice = startDevice; currentDevice <= endDevice; - currentDevice++) { - cudaSetDevice(currentDevice); - // Run the shmoo - int iteration = 0; - unsigned int memSize = 0; - - while (memSize <= SHMOO_MEMSIZE_MAX) { - if (memSize < SHMOO_LIMIT_20KB) { - memSize += SHMOO_INCREMENT_1KB; - } else if (memSize < SHMOO_LIMIT_50KB) { - memSize += SHMOO_INCREMENT_2KB; - } else if (memSize < SHMOO_LIMIT_100KB) { - memSize += SHMOO_INCREMENT_10KB; - } else if (memSize < SHMOO_LIMIT_1MB) { - memSize += SHMOO_INCREMENT_100KB; - } else if (memSize < SHMOO_LIMIT_16MB) { - memSize += SHMOO_INCREMENT_1MB; - } else if (memSize < SHMOO_LIMIT_32MB) { - memSize += SHMOO_INCREMENT_2MB; - } else { - memSize += SHMOO_INCREMENT_4MB; - } - - memSizes[iteration] = memSize; - - switch (kind) { - case DEVICE_TO_HOST: - bandwidths[iteration] += - testDeviceToHostTransfer(memSizes[iteration], memMode, wc); - break; - - case HOST_TO_DEVICE: - bandwidths[iteration] += - testHostToDeviceTransfer(memSizes[iteration], memMode, wc); - break; - - case DEVICE_TO_DEVICE: - bandwidths[iteration] += - testDeviceToDeviceTransfer(memSizes[iteration]); - break; - } - - iteration++; - printf("."); - fflush(0); + // Before calculating the cumulative bandwidth, initialize bandwidths array to + // NULL + for (unsigned int i = 0; i < count; i++) { + bandwidths[i] = 0.0; } - } // Complete the bandwidth computation on all the devices - // print results - printf("\n"); + // Use the device asked by the user + for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { + cudaSetDevice(currentDevice); + // Run the shmoo + int iteration = 0; + unsigned int memSize = 0; - if (CSV == printmode) { - printResultsCSV(memSizes, bandwidths, count, kind, memMode, - (1 + endDevice - startDevice), wc); - } else { - printResultsReadable(memSizes, bandwidths, count, kind, memMode, - (1 + endDevice - startDevice), wc); - } + while (memSize <= SHMOO_MEMSIZE_MAX) { + if (memSize < 
SHMOO_LIMIT_20KB) { + memSize += SHMOO_INCREMENT_1KB; + } + else if (memSize < SHMOO_LIMIT_50KB) { + memSize += SHMOO_INCREMENT_2KB; + } + else if (memSize < SHMOO_LIMIT_100KB) { + memSize += SHMOO_INCREMENT_10KB; + } + else if (memSize < SHMOO_LIMIT_1MB) { + memSize += SHMOO_INCREMENT_100KB; + } + else if (memSize < SHMOO_LIMIT_16MB) { + memSize += SHMOO_INCREMENT_1MB; + } + else if (memSize < SHMOO_LIMIT_32MB) { + memSize += SHMOO_INCREMENT_2MB; + } + else { + memSize += SHMOO_INCREMENT_4MB; + } - // clean up - free(memSizes); - free(bandwidths); + memSizes[iteration] = memSize; + + switch (kind) { + case DEVICE_TO_HOST: + bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc); + break; + + case HOST_TO_DEVICE: + bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc); + break; + + case DEVICE_TO_DEVICE: + bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]); + break; + } + + iteration++; + printf("."); + fflush(0); + } + } // Complete the bandwidth computation on all the devices + + // print results + printf("\n"); + + if (CSV == printmode) { + printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } + else { + printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } + + // clean up + free(memSizes); + free(bandwidths); } /////////////////////////////////////////////////////////////////////////////// // test the bandwidth of a device to host memcopy of a specific size /////////////////////////////////////////////////////////////////////////////// -float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, - bool wc) { - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - unsigned char *h_idata = NULL; - unsigned char *h_odata = NULL; - cudaEvent_t start, stop; +float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) +{ + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + unsigned char *h_idata = NULL; + unsigned char *h_odata = NULL; + cudaEvent_t start, stop; - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); - // allocate host memory - if (PINNED == memMode) { - // pinned memory mode - use special function to get OS-pinned memory + // allocate host memory + if (PINNED == memMode) { + // pinned memory mode - use special function to get OS-pinned memory #if CUDART_VERSION >= 2020 - checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, - (wc) ? cudaHostAllocWriteCombined : 0)); - checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, - (wc) ? cudaHostAllocWriteCombined : 0)); + checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); + checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? 
cudaHostAllocWriteCombined : 0)); #else - checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); - checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); + checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); + checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); #endif - } else { - // pageable memory mode - use malloc - h_idata = (unsigned char *)malloc(memSize); - h_odata = (unsigned char *)malloc(memSize); - - if (h_idata == 0 || h_odata == 0) { - fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); - exit(EXIT_FAILURE); } - } + else { + // pageable memory mode - use malloc + h_idata = (unsigned char *)malloc(memSize); + h_odata = (unsigned char *)malloc(memSize); - // initialize the memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_idata[i] = (unsigned char)(i & 0xff); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - - // initialize the device memory - checkCudaErrors( - cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); - - // copy data from GPU to Host - if (PINNED == memMode) { - if (bDontUseGPUTiming) sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, - cudaMemcpyDeviceToHost, 0)); + if (h_idata == 0 || h_odata == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - sdkStopTimer(&timer); - elapsedTimeInMs = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); + + // initialize the memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_idata[i] = (unsigned char)(i & 0xff); } - } else { - elapsedTimeInMs = 0; - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - sdkStartTimer(&timer); - checkCudaErrors( - cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); - sdkStopTimer(&timer); - elapsedTimeInMs += sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - memset(flush_buf, i, FLUSH_SIZE); + + // allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); + + // initialize the device memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); + + // copy data from GPU to Host + if (PINNED == memMode) { + if (bDontUseGPUTiming) + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0)); + } + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + if (bDontUseGPUTiming) { + sdkStopTimer(&timer); + elapsedTimeInMs = sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + } + } + else { + elapsedTimeInMs = 0; + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + sdkStartTimer(&timer); + checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); + sdkStopTimer(&timer); + elapsedTimeInMs += sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + memset(flush_buf, i, FLUSH_SIZE); + } } - } - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3;
- bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - sdkDeleteTimer(&timer); + // calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + // clean up memory + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + sdkDeleteTimer(&timer); - if (PINNED == memMode) { - checkCudaErrors(cudaFreeHost(h_idata)); - checkCudaErrors(cudaFreeHost(h_odata)); - } else { - free(h_idata); - free(h_odata); - } + if (PINNED == memMode) { + checkCudaErrors(cudaFreeHost(h_idata)); + checkCudaErrors(cudaFreeHost(h_odata)); + } + else { + free(h_idata); + free(h_odata); + } - checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_idata)); - return bandwidthInGBs; + return bandwidthInGBs; } /////////////////////////////////////////////////////////////////////////////// //! test the bandwidth of a host to device memcopy of a specific size /////////////////////////////////////////////////////////////////////////////// -float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, - bool wc) { - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - cudaEvent_t start, stop; - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); +float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) +{ + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + cudaEvent_t start, stop; + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); - // allocate host memory - unsigned char *h_odata = NULL; + // allocate host memory + unsigned char *h_odata = NULL; - if (PINNED == memMode) { + if (PINNED == memMode) { #if CUDART_VERSION >= 2020 - // pinned memory mode - use special function to get OS-pinned memory - checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, - (wc) ? cudaHostAllocWriteCombined : 0)); + // pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? 
cudaHostAllocWriteCombined : 0)); #else - // pinned memory mode - use special function to get OS-pinned memory - checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); + // pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); #endif - } else { - // pageable memory mode - use malloc - h_odata = (unsigned char *)malloc(memSize); - - if (h_odata == 0) { - fprintf(stderr, "Not enough memory available on host to run test!\n"); - exit(EXIT_FAILURE); } - } + else { + // pageable memory mode - use malloc + h_odata = (unsigned char *)malloc(memSize); - unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); - unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); - - if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { - fprintf(stderr, "Not enough memory available on host to run test!\n"); - exit(EXIT_FAILURE); - } - - // initialize the memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_odata[i] = (unsigned char)(i & 0xff); - } - - for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { - h_cacheClear1[i] = (unsigned char)(i & 0xff); - h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); - } - - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - - // copy host memory to device memory - if (PINNED == memMode) { - if (bDontUseGPUTiming) sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, - cudaMemcpyHostToDevice, 0)); + if (h_odata == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - sdkStopTimer(&timer); - elapsedTimeInMs = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); + + unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); + unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); + + if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); } - } else { - elapsedTimeInMs = 0; - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - sdkStartTimer(&timer); - checkCudaErrors( - cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); - sdkStopTimer(&timer); - elapsedTimeInMs += sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - memset(flush_buf, i, FLUSH_SIZE); + + // initialize the memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_odata[i] = (unsigned char)(i & 0xff); } - } - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - sdkDeleteTimer(&timer); + for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { + h_cacheClear1[i] = (unsigned char)(i & 0xff); + h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); + } - if (PINNED == memMode) { - checkCudaErrors(cudaFreeHost(h_odata)); - } else { - free(h_odata); - } + // allocate device memory + unsigned char 
*d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - free(h_cacheClear1); - free(h_cacheClear2); - checkCudaErrors(cudaFree(d_idata)); + // copy host memory to device memory + if (PINNED == memMode) { + if (bDontUseGPUTiming) + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0)); + } + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + if (bDontUseGPUTiming) { + sdkStopTimer(&timer); + elapsedTimeInMs = sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + } + } + else { + elapsedTimeInMs = 0; + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + sdkStartTimer(&timer); + checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); + sdkStopTimer(&timer); + elapsedTimeInMs += sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + memset(flush_buf, i, FLUSH_SIZE); + } + } - return bandwidthInGBs; + // calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + // clean up memory + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + sdkDeleteTimer(&timer); + + if (PINNED == memMode) { + checkCudaErrors(cudaFreeHost(h_odata)); + } + else { + free(h_odata); + } + + free(h_cacheClear1); + free(h_cacheClear2); + checkCudaErrors(cudaFree(d_idata)); + + return bandwidthInGBs; } /////////////////////////////////////////////////////////////////////////////// //! test the bandwidth of a device to device memcopy of a specific size /////////////////////////////////////////////////////////////////////////////// -float testDeviceToDeviceTransfer(unsigned int memSize) { - StopWatchInterface *timer = NULL; - float elapsedTimeInMs = 0.0f; - float bandwidthInGBs = 0.0f; - cudaEvent_t start, stop; +float testDeviceToDeviceTransfer(unsigned int memSize) +{ + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + cudaEvent_t start, stop; - sdkCreateTimer(&timer); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); - // allocate host memory - unsigned char *h_idata = (unsigned char *)malloc(memSize); + // allocate host memory + unsigned char *h_idata = (unsigned char *)malloc(memSize); - if (h_idata == 0) { - fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); - exit(EXIT_FAILURE); - } + if (h_idata == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } - // initialize the host memory - for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { - h_idata[i] = (unsigned char)(i & 0xff); - } + // initialize the host memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_idata[i] = (unsigned char)(i & 0xff); + } - // allocate device memory - unsigned char *d_idata; - checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); - unsigned char *d_odata; - checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); + // allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); + unsigned char *d_odata; +
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); - // initialize memory - checkCudaErrors( - cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); + // initialize memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); - // run the memcopy - sdkStartTimer(&timer); - checkCudaErrors(cudaEventRecord(start, 0)); + // run the memcopy + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); - for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { - checkCudaErrors( - cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); - } + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); + } - checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventRecord(stop, 0)); - // Since device to device memory copies are non-blocking, - // cudaDeviceSynchronize() is required in order to get - // proper timing. - checkCudaErrors(cudaDeviceSynchronize()); + // Since device to device memory copies are non-blocking, + // cudaDeviceSynchronize() is required in order to get + // proper timing. + checkCudaErrors(cudaDeviceSynchronize()); - // get the total elapsed time in ms - sdkStopTimer(&timer); - checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + // get the total elapsed time in ms + sdkStopTimer(&timer); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); - if (bDontUseGPUTiming) { - elapsedTimeInMs = sdkGetTimerValue(&timer); - } + if (bDontUseGPUTiming) { + elapsedTimeInMs = sdkGetTimerValue(&timer); + } - // calculate bandwidth in GB/s - double time_s = elapsedTimeInMs / 1e3; - bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; - bandwidthInGBs = bandwidthInGBs / time_s; + // calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; + bandwidthInGBs = bandwidthInGBs / time_s; - // clean up memory - sdkDeleteTimer(&timer); - free(h_idata); - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); + // clean up memory + sdkDeleteTimer(&timer); + free(h_idata); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); - return bandwidthInGBs; + return bandwidthInGBs; } ///////////////////////////////////////////////////////// // print results in an easily read format //////////////////////////////////////////////////////// -void printResultsReadable(unsigned int *memSizes, double *bandwidths, - unsigned int count, memcpyKind kind, - memoryMode memMode, int iNumDevs, bool wc) { - printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); - printf(" %s Memory Transfers\n", sMemoryMode[memMode]); +void printResultsReadable(unsigned int *memSizes, + double *bandwidths, + unsigned int count, + memcpyKind kind, + memoryMode memMode, + int iNumDevs, + bool wc) +{ + printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); + printf(" %s Memory Transfers\n", sMemoryMode[memMode]); - if (wc) { - printf(" Write-Combined Memory Writes are Enabled"); - } + if (wc) { + printf(" Write-Combined Memory Writes are Enabled"); + } - printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); - unsigned int i; + printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); + unsigned 
int i; - for (i = 0; i < (count - 1); i++) { - printf(" %u\t\t\t%s%.1f\n", memSizes[i], - (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); - } + for (i = 0; i < (count - 1); i++) { + printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); + } - printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], - (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); + printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); } /////////////////////////////////////////////////////////////////////////// // print results in a database format /////////////////////////////////////////////////////////////////////////// -void printResultsCSV(unsigned int *memSizes, double *bandwidths, - unsigned int count, memcpyKind kind, memoryMode memMode, - int iNumDevs, bool wc) { - std::string sConfig; +void printResultsCSV(unsigned int *memSizes, + double *bandwidths, + unsigned int count, + memcpyKind kind, + memoryMode memMode, + int iNumDevs, + bool wc) +{ + std::string sConfig; - // log config information - if (kind == DEVICE_TO_DEVICE) { - sConfig += "D2D"; - } else { - if (kind == DEVICE_TO_HOST) { - sConfig += "D2H"; - } else if (kind == HOST_TO_DEVICE) { - sConfig += "H2D"; + // log config information + if (kind == DEVICE_TO_DEVICE) { + sConfig += "D2D"; + } + else { + if (kind == DEVICE_TO_HOST) { + sConfig += "D2H"; + } + else if (kind == HOST_TO_DEVICE) { + sConfig += "H2D"; + } + + if (memMode == PAGEABLE) { + sConfig += "-Paged"; + } + else if (memMode == PINNED) { + sConfig += "-Pinned"; + + if (wc) { + sConfig += "-WriteCombined"; + } + } } - if (memMode == PAGEABLE) { - sConfig += "-Paged"; - } else if (memMode == PINNED) { - sConfig += "-Pinned"; + unsigned int i; + double dSeconds = 0.0; - if (wc) { - sConfig += "-WriteCombined"; - } + for (i = 0; i < count; i++) { + dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); + printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " + "bytes, NumDevsUsed = %d\n", + sConfig.c_str(), + bandwidths[i], + dSeconds, + memSizes[i], + iNumDevs); } - } - - unsigned int i; - double dSeconds = 0.0; - - for (i = 0; i < count; i++) { - dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9)); - printf( - "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " - "bytes, NumDevsUsed = %d\n", - sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs); - } } /////////////////////////////////////////////////////////////////////////// // Print help screen /////////////////////////////////////////////////////////////////////////// -void printHelp(void) { - printf("Usage: bandwidthTest [OPTION]...\n"); - printf( - "Test the bandwidth for device to host, host to device, and device to " - "device transfers\n"); - printf("\n"); - printf( - "Example: measure the bandwidth of device to host pinned memory copies " - "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); - printf( - "./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " - "--increment=1024 --dtoh\n"); +void printHelp(void) +{ + printf("Usage: bandwidthTest [OPTION]...\n"); + printf("Test the bandwidth for device to host, host to device, and device to " + "device transfers\n"); + printf("\n"); + printf("Example: measure the bandwidth of device to host pinned memory copies " + "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); + printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " + "--increment=1024 --dtoh\n"); - 
printf("\n"); - printf("Options:\n"); - printf("--help\tDisplay this help menu\n"); - printf("--csv\tPrint results as a CSV\n"); - printf("--device=[deviceno]\tSpecify the device device to be used\n"); - printf(" all - compute cumulative bandwidth on all the devices\n"); - printf(" 0,1,2,...,n - Specify any particular device to be used\n"); - printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); - printf(" pageable - pageable memory\n"); - printf(" pinned - non-pageable system memory\n"); - printf("--mode=[MODE]\tSpecify the mode to use\n"); - printf(" quick - performs a quick measurement\n"); - printf(" range - measures a user-specified range of values\n"); - printf(" shmoo - performs an intense shmoo of a large range of values\n"); + printf("\n"); + printf("Options:\n"); + printf("--help\tDisplay this help menu\n"); + printf("--csv\tPrint results as a CSV\n"); + printf("--device=[deviceno]\tSpecify the device device to be used\n"); + printf(" all - compute cumulative bandwidth on all the devices\n"); + printf(" 0,1,2,...,n - Specify any particular device to be used\n"); + printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); + printf(" pageable - pageable memory\n"); + printf(" pinned - non-pageable system memory\n"); + printf("--mode=[MODE]\tSpecify the mode to use\n"); + printf(" quick - performs a quick measurement\n"); + printf(" range - measures a user-specified range of values\n"); + printf(" shmoo - performs an intense shmoo of a large range of values\n"); - printf("--htod\tMeasure host to device transfers\n"); - printf("--dtoh\tMeasure device to host transfers\n"); - printf("--dtod\tMeasure device to device transfers\n"); + printf("--htod\tMeasure host to device transfers\n"); + printf("--dtoh\tMeasure device to host transfers\n"); + printf("--dtod\tMeasure device to device transfers\n"); #if CUDART_VERSION >= 2020 - printf("--wc\tAllocate pinned memory as write-combined\n"); + printf("--wc\tAllocate pinned memory as write-combined\n"); #endif - printf("--cputiming\tForce CPU-based timing always\n"); + printf("--cputiming\tForce CPU-based timing always\n"); - printf("Range mode options\n"); - printf("--start=[SIZE]\tStarting transfer size in bytes\n"); - printf("--end=[SIZE]\tEnding transfer size in bytes\n"); - printf("--increment=[SIZE]\tIncrement size in bytes\n"); + printf("Range mode options\n"); + printf("--start=[SIZE]\tStarting transfer size in bytes\n"); + printf("--end=[SIZE]\tEnding transfer size in bytes\n"); + printf("--increment=[SIZE]\tIncrement size in bytes\n"); } diff --git a/Samples/1_Utilities/deviceQuery/deviceQuery.cpp b/Samples/1_Utilities/deviceQuery/deviceQuery.cpp index a6c46d54..4deffb87 100644 --- a/Samples/1_Utilities/deviceQuery/deviceQuery.cpp +++ b/Samples/1_Utilities/deviceQuery/deviceQuery.cpp @@ -32,12 +32,11 @@ #include #include - #include #include #include -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #if CUDART_VERSION < 5000 @@ -46,19 +45,16 @@ char **pArgv = NULL; #include // This function wraps the CUDA Driver API into a template function -template -inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, - int device) { - CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device); +template inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) +{ + CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device); - if (CUDA_SUCCESS != error) { - fprintf( - stderr, - "cuSafeCallNoSync() Driver API error = %04d from 
file <%s>, line %i.\n", - error, __FILE__, __LINE__); + if (CUDA_SUCCESS != error) { + fprintf( + stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__); - exit(EXIT_FAILURE); - } + exit(EXIT_FAILURE); + } } #endif /* CUDART_VERSION < 5000 */ @@ -66,278 +62,259 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; - printf("%s Starting...\n\n", argv[0]); - printf( - " CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); + printf("%s Starting...\n\n", argv[0]); + printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); - int deviceCount = 0; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + int deviceCount = 0; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", - static_cast<int>(error_id), cudaGetErrorString(error_id)); - printf("Result = FAIL\n"); - exit(EXIT_FAILURE); - } + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast<int>(error_id), cudaGetErrorString(error_id)); + printf("Result = FAIL\n"); + exit(EXIT_FAILURE); + } - // This function call returns 0 if there are no CUDA capable devices.
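
For context, the reformatted error check above follows the usual enumerate-then-fail-fast shape of the runtime API. A minimal sketch of that pattern, illustrative only and not part of this patch:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess) {
        // fail fast: nothing below is meaningful without a usable driver/runtime
        fprintf(stderr, "cudaGetDeviceCount returned %d -> %s\n",
                static_cast<int>(error_id), cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }
    for (int dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        printf("Device %d: \"%s\" (compute capability %d.%d)\n", dev, prop.name, prop.major, prop.minor);
    }
    return 0;
}
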
+ if (deviceCount == 0) { + printf("There are no available device(s) that support CUDA\n"); + } + else { + printf("Detected %d CUDA Capable device(s)\n", deviceCount); + } - int dev, driverVersion = 0, runtimeVersion = 0; + int dev, driverVersion = 0, runtimeVersion = 0; - for (dev = 0; dev < deviceCount; ++dev) { - cudaSetDevice(dev); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); + for (dev = 0; dev < deviceCount; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); - printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); + printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); - // Console log - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", - driverVersion / 1000, (driverVersion % 100) / 10, - runtimeVersion / 1000, (runtimeVersion % 100) / 10); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", - deviceProp.major, deviceProp.minor); + // Console log + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", + driverVersion / 1000, + (driverVersion % 100) / 10, + runtimeVersion / 1000, + (runtimeVersion % 100) / 10); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); - char msg[256]; + char msg[256]; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(msg, sizeof(msg), - " Total amount of global memory: %.0f MBytes " - "(%llu bytes)\n", - static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), - (unsigned long long)deviceProp.totalGlobalMem); + sprintf_s(msg, + sizeof(msg), + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), + (unsigned long long)deviceProp.totalGlobalMem); #else - snprintf(msg, sizeof(msg), - " Total amount of global memory: %.0f MBytes " - "(%llu bytes)\n", - static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), - (unsigned long long)deviceProp.totalGlobalMem); + snprintf(msg, + sizeof(msg), + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), + (unsigned long long)deviceProp.totalGlobalMem); #endif - printf("%s", msg); + printf("%s", msg); - printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n", - deviceProp.multiProcessorCount, - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - deviceProp.multiProcessorCount); - printf( - " GPU Max Clock rate: %.0f MHz (%0.2f " - "GHz)\n", - deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); + printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n", + deviceProp.multiProcessorCount, + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); + printf(" GPU Max Clock rate: %.0f MHz (%0.2f " + "GHz)\n", + deviceProp.clockRate * 1e-3f, + deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 5000 - // This is supported in CUDA 5.0 (runtime API device properties) - printf(" Memory Clock rate: %.0f Mhz\n", - deviceProp.memoryClockRate * 1e-3f); - printf(" Memory Bus Width: %d-bit\n", - deviceProp.memoryBusWidth); + // This is supported in CUDA 5.0 (runtime API device properties) + printf(" Memory Clock rate: %.0f MHz\n",
deviceProp.memoryClockRate * 1e-3f); + printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth); - if (deviceProp.l2CacheSize) { - printf(" L2 Cache Size: %d bytes\n", - deviceProp.l2CacheSize); - } - -#else - // This only available in CUDA 4.0-4.2 (but these were only exposed in the - // CUDA Driver API) - int memoryClock; - getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - dev); - printf(" Memory Clock rate: %.0f Mhz\n", - memoryClock * 1e-3f); - int memBusWidth; - getCudaAttribute(&memBusWidth, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); - printf(" Memory Bus Width: %d-bit\n", - memBusWidth); - int L2CacheSize; - getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); - - if (L2CacheSize) { - printf(" L2 Cache Size: %d bytes\n", - L2CacheSize); - } - -#endif - - printf( - " Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, " - "%d), 3D=(%d, %d, %d)\n", - deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], - deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], - deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); - printf( - " Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", - deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); - printf( - " Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d " - "layers\n", - deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], - deviceProp.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %zu bytes\n", - deviceProp.totalConstMem); - printf(" Total amount of shared memory per block: %zu bytes\n", - deviceProp.sharedMemPerBlock); - printf(" Total shared memory per multiprocessor: %zu bytes\n", - deviceProp.sharedMemPerMultiprocessor); - printf(" Total number of registers available per block: %d\n", - deviceProp.regsPerBlock); - printf(" Warp size: %d\n", - deviceProp.warpSize); - printf(" Maximum number of threads per multiprocessor: %d\n", - deviceProp.maxThreadsPerMultiProcessor); - printf(" Maximum number of threads per block: %d\n", - deviceProp.maxThreadsPerBlock); - printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", - deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], - deviceProp.maxThreadsDim[2]); - printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", - deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], - deviceProp.maxGridSize[2]); - printf(" Maximum memory pitch: %zu bytes\n", - deviceProp.memPitch); - printf(" Texture alignment: %zu bytes\n", - deviceProp.textureAlignment); - printf( - " Concurrent copy and kernel execution: %s with %d copy " - "engine(s)\n", - (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", - deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", - deviceProp.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", - deviceProp.canMapHostMemory ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", - deviceProp.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support: %s\n", - deviceProp.ECCEnabled ? "Enabled" : "Disabled"); -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", - deviceProp.tccDriver ? 
"TCC (Tesla Compute Cluster Driver)" - : "WDDM (Windows Display Driver Model)"); -#endif - printf(" Device supports Unified Addressing (UVA): %s\n", - deviceProp.unifiedAddressing ? "Yes" : "No"); - printf(" Device supports Managed Memory: %s\n", - deviceProp.managedMemory ? "Yes" : "No"); - printf(" Device supports Compute Preemption: %s\n", - deviceProp.computePreemptionSupported ? "Yes" : "No"); - printf(" Supports Cooperative Kernel Launch: %s\n", - deviceProp.cooperativeLaunch ? "Yes" : "No"); - printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", - deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No"); - printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", - deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID); - - const char *sComputeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device " - "simultaneously)", - "Exclusive (only one host thread in one process is able to use " - "::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this " - "device)", - "Exclusive Process (many threads in one process is able to use " - "::cudaSetDevice() with this device)", - "Unknown", NULL}; - printf(" Compute Mode:\n"); - printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); - } - - // If there are 2 or more GPUs, query to determine whether RDMA is supported - if (deviceCount >= 2) { - cudaDeviceProp prop[64]; - int gpuid[64]; // we want to find the first two GPUs that can support P2P - int gpu_p2p_count = 0; - - for (int i = 0; i < deviceCount; i++) { - checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); - - // Only boards based on Fermi or later can support P2P - if ((prop[i].major >= 2) -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - // on Windows (64-bit), the Tesla Compute Cluster driver for windows - // must be enabled to support this - && prop[i].tccDriver -#endif - ) { - // This is an array of P2P capable GPUs - gpuid[gpu_p2p_count++] = i; - } - } - - // Show all the combinations of support P2P GPUs - int can_access_peer; - - if (gpu_p2p_count >= 2) { - for (int i = 0; i < gpu_p2p_count; i++) { - for (int j = 0; j < gpu_p2p_count; j++) { - if (gpuid[i] == gpuid[j]) { - continue; - } - checkCudaErrors( - cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j])); - printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", - prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], - can_access_peer ? 
"Yes" : "No"); + if (deviceProp.l2CacheSize) { + printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize); } - } + +#else + // This only available in CUDA 4.0-4.2 (but these were only exposed in the + // CUDA Driver API) + int memoryClock; + getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev); + printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); + int memBusWidth; + getCudaAttribute(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); + printf(" Memory Bus Width: %d-bit\n", memBusWidth); + int L2CacheSize; + getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); + + if (L2CacheSize) { + printf(" L2 Cache Size: %d bytes\n", L2CacheSize); + } + +#endif + + printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, " + "%d), 3D=(%d, %d, %d)\n", + deviceProp.maxTexture1D, + deviceProp.maxTexture2D[0], + deviceProp.maxTexture2D[1], + deviceProp.maxTexture3D[0], + deviceProp.maxTexture3D[1], + deviceProp.maxTexture3D[2]); + printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", + deviceProp.maxTexture1DLayered[0], + deviceProp.maxTexture1DLayered[1]); + printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d " + "layers\n", + deviceProp.maxTexture2DLayered[0], + deviceProp.maxTexture2DLayered[1], + deviceProp.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem); + printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock); + printf(" Total shared memory per multiprocessor: %zu bytes\n", deviceProp.sharedMemPerMultiprocessor); + printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); + printf(" Warp size: %d\n", deviceProp.warpSize); + printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); + printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); + printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", + deviceProp.maxThreadsDim[0], + deviceProp.maxThreadsDim[1], + deviceProp.maxThreadsDim[2]); + printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", + deviceProp.maxGridSize[0], + deviceProp.maxGridSize[1], + deviceProp.maxGridSize[2]); + printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch); + printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment); + printf(" Concurrent copy and kernel execution: %s with %d copy " + "engine(s)\n", + (deviceProp.deviceOverlap ? "Yes" : "No"), + deviceProp.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", + deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled"); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", + deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)"); +#endif + printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); + printf(" Device supports Managed Memory: %s\n", deviceProp.managedMemory ? 
"Yes" : "No"); + printf(" Device supports Compute Preemption: %s\n", + deviceProp.computePreemptionSupported ? "Yes" : "No"); + printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No"); + printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", + deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No"); + printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", + deviceProp.pciDomainID, + deviceProp.pciBusID, + deviceProp.pciDeviceID); + + const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device " + "simultaneously)", + "Exclusive (only one host thread in one process is able to use " + "::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this " + "device)", + "Exclusive Process (many threads in one process is able to use " + "::cudaSetDevice() with this device)", + "Unknown", + NULL}; + printf(" Compute Mode:\n"); + printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); } - } - // csv masterlog info - // ***************************** - // exe and CUDA driver name - printf("\n"); - std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; - char cTemp[16]; + // If there are 2 or more GPUs, query to determine whether RDMA is supported + if (deviceCount >= 2) { + cudaDeviceProp prop[64]; + int gpuid[64]; // we want to find the first two GPUs that can support P2P + int gpu_p2p_count = 0; - // driver version - sProfileString += ", CUDA Driver Version = "; + for (int i = 0; i < deviceCount; i++) { + checkCudaErrors(cudaGetDeviceProperties(&prop[i], i)); + + // Only boards based on Fermi or later can support P2P + if ((prop[i].major >= 2) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, - (driverVersion % 100) / 10); -#else - snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, - (driverVersion % 100) / 10); + // on Windows (64-bit), the Tesla Compute Cluster driver for windows + // must be enabled to support this + && prop[i].tccDriver #endif - sProfileString += cTemp; + ) { + // This is an array of P2P capable GPUs + gpuid[gpu_p2p_count++] = i; + } + } - // Runtime version - sProfileString += ", CUDA Runtime Version = "; + // Show all the combinations of support P2P GPUs + int can_access_peer; + + if (gpu_p2p_count >= 2) { + for (int i = 0; i < gpu_p2p_count; i++) { + for (int j = 0; j < gpu_p2p_count; j++) { + if (gpuid[i] == gpuid[j]) { + continue; + } + checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j])); + printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", + prop[gpuid[i]].name, + gpuid[i], + prop[gpuid[j]].name, + gpuid[j], + can_access_peer ? 
"Yes" : "No"); + } + } + } + } + + // csv masterlog info + // ***************************** + // exe and CUDA driver name + printf("\n"); + std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; + char cTemp[16]; + + // driver version + sProfileString += ", CUDA Driver Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, - (runtimeVersion % 100) / 10); + sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10); #else - snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, - (runtimeVersion % 100) / 10); + snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10); #endif - sProfileString += cTemp; + sProfileString += cTemp; - // Device count - sProfileString += ", NumDevs = "; + // Runtime version + sProfileString += ", CUDA Runtime Version = "; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - sprintf_s(cTemp, 10, "%d", deviceCount); + sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10); #else - snprintf(cTemp, sizeof(cTemp), "%d", deviceCount); + snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10); #endif - sProfileString += cTemp; - sProfileString += "\n"; - printf("%s", sProfileString.c_str()); + sProfileString += cTemp; - printf("Result = PASS\n"); + // Device count + sProfileString += ", NumDevs = "; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + sprintf_s(cTemp, 10, "%d", deviceCount); +#else + snprintf(cTemp, sizeof(cTemp), "%d", deviceCount); +#endif + sProfileString += cTemp; + sProfileString += "\n"; + printf("%s", sProfileString.c_str()); - // finish - exit(EXIT_SUCCESS); + printf("Result = PASS\n"); + + // finish + exit(EXIT_SUCCESS); } diff --git a/Samples/1_Utilities/deviceQueryDrv/deviceQueryDrv.cpp b/Samples/1_Utilities/deviceQueryDrv/deviceQueryDrv.cpp index 7d538b56..610844d9 100644 --- a/Samples/1_Utilities/deviceQueryDrv/deviceQueryDrv.cpp +++ b/Samples/1_Utilities/deviceQueryDrv/deviceQueryDrv.cpp @@ -30,358 +30,295 @@ */ // includes, system -#include -#include -#include - #include #include +#include +#include +#include //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - CUdevice dev; - int major = 0, minor = 0; - int deviceCount = 0; - char deviceName[256]; +int main(int argc, char **argv) +{ + CUdevice dev; + int major = 0, minor = 0; + int deviceCount = 0; + char deviceName[256]; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // note your project will need to link with cuda.lib files on windows - printf("CUDA Device Query (Driver API) statically linked version \n"); + // note your project will need to link with cuda.lib files on windows + printf("CUDA Device Query (Driver API) statically linked version \n"); - checkCudaErrors(cuInit(0)); + checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGetCount(&deviceCount)); + checkCudaErrors(cuDeviceGetCount(&deviceCount)); - // This function call returns 0 if there are no CUDA capable devices. 
- if (deviceCount == 0) { - printf("There are no available device(s) that support CUDA\n"); - } else { - printf("Detected %d CUDA Capable device(s)\n", deviceCount); - } - - for (dev = 0; dev < deviceCount; ++dev) { - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev)); - - checkCudaErrors(cuDeviceGetName(deviceName, 256, dev)); - - printf("\nDevice %d: \"%s\"\n", dev, deviceName); - - int driverVersion = 0; - checkCudaErrors(cuDriverGetVersion(&driverVersion)); - printf(" CUDA Driver Version: %d.%d\n", - driverVersion / 1000, (driverVersion % 100) / 10); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, - minor); - - size_t totalGlobalMem; - checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev)); - - char msg[256]; - SPRINTF(msg, - " Total amount of global memory: %.0f MBytes " - "(%llu bytes)\n", - (float)totalGlobalMem / 1048576.0f, - (unsigned long long)totalGlobalMem); - printf("%s", msg); - - int multiProcessorCount; - getCudaAttribute(&multiProcessorCount, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); - - printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", - multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor), - _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount); - - int clockRate; - getCudaAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); - printf( - " GPU Max Clock rate: %.0f MHz (%0.2f " - "GHz)\n", - clockRate * 1e-3f, clockRate * 1e-6f); - int memoryClock; - getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - dev); - printf(" Memory Clock rate: %.0f Mhz\n", - memoryClock * 1e-3f); - int memBusWidth; - getCudaAttribute(&memBusWidth, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); - printf(" Memory Bus Width: %d-bit\n", - memBusWidth); - int L2CacheSize; - getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); - - if (L2CacheSize) { - printf(" L2 Cache Size: %d bytes\n", - L2CacheSize); + // This function call returns 0 if there are no CUDA capable devices. 
+ if (deviceCount == 0) { + printf("There are no available device(s) that support CUDA\n"); + } + else { + printf("Detected %d CUDA Capable device(s)\n", deviceCount); } - int maxTex1D, maxTex2D[2], maxTex3D[3]; - getCudaAttribute(&maxTex1D, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev); - getCudaAttribute(&maxTex2D[0], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev); - getCudaAttribute(&maxTex2D[1], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev); - getCudaAttribute(&maxTex3D[0], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev); - getCudaAttribute(&maxTex3D[1], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev); - getCudaAttribute(&maxTex3D[2], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev); - printf( - " Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) " - "3D=(%d, %d, %d)\n", - maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], - maxTex3D[2]); + for (dev = 0; dev < deviceCount; ++dev) { + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev)); - int maxTex1DLayered[2]; - getCudaAttribute(&maxTex1DLayered[0], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, - dev); - getCudaAttribute(&maxTex1DLayered[1], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, - dev); - printf( - " Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", - maxTex1DLayered[0], maxTex1DLayered[1]); + checkCudaErrors(cuDeviceGetName(deviceName, 256, dev)); - int maxTex2DLayered[3]; - getCudaAttribute(&maxTex2DLayered[0], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, - dev); - getCudaAttribute(&maxTex2DLayered[1], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, - dev); - getCudaAttribute(&maxTex2DLayered[2], - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, - dev); - printf( - " Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d " - "layers\n", - maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]); + printf("\nDevice %d: \"%s\"\n", dev, deviceName); - int totalConstantMemory; - getCudaAttribute(&totalConstantMemory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev); - printf(" Total amount of constant memory: %u bytes\n", - totalConstantMemory); - int sharedMemPerBlock; - getCudaAttribute(&sharedMemPerBlock, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev); - printf(" Total amount of shared memory per block: %u bytes\n", - sharedMemPerBlock); - int regsPerBlock; - getCudaAttribute(®sPerBlock, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); - printf(" Total number of registers available per block: %d\n", - regsPerBlock); - int warpSize; - getCudaAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); - printf(" Warp size: %d\n", warpSize); - int maxThreadsPerMultiProcessor; - getCudaAttribute(&maxThreadsPerMultiProcessor, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, - dev); - printf(" Maximum number of threads per multiprocessor: %d\n", - maxThreadsPerMultiProcessor); - int maxThreadsPerBlock; - getCudaAttribute(&maxThreadsPerBlock, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev); - printf(" Maximum number of threads per block: %d\n", - maxThreadsPerBlock); + int driverVersion = 0; + checkCudaErrors(cuDriverGetVersion(&driverVersion)); + printf(" CUDA Driver Version: %d.%d\n", + driverVersion / 1000, + (driverVersion % 100) / 10); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); - int blockDim[3]; - 
getCudaAttribute(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - dev); - getCudaAttribute(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - dev); - getCudaAttribute(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - dev); - printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", - blockDim[0], blockDim[1], blockDim[2]); - int gridDim[3]; - getCudaAttribute(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev); - getCudaAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev); - getCudaAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev); - printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", - gridDim[0], gridDim[1], gridDim[2]); + size_t totalGlobalMem; + checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev)); - int textureAlign; - getCudaAttribute(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - dev); - printf(" Texture alignment: %u bytes\n", - textureAlign); + char msg[256]; + SPRINTF(msg, + " Total amount of global memory: %.0f MBytes " + "(%llu bytes)\n", + (float)totalGlobalMem / 1048576.0f, + (unsigned long long)totalGlobalMem); + printf("%s", msg); - int memPitch; - getCudaAttribute(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev); - printf(" Maximum memory pitch: %u bytes\n", - memPitch); + int multiProcessorCount; + getCudaAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); - int gpuOverlap; - getCudaAttribute(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); + printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", + multiProcessorCount, + _ConvertSMVer2CoresDRV(major, minor), + _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount); - int asyncEngineCount; - getCudaAttribute(&asyncEngineCount, - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); - printf( - " Concurrent copy and kernel execution: %s with %d copy " - "engine(s)\n", - (gpuOverlap ? "Yes" : "No"), asyncEngineCount); + int clockRate; + getCudaAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); + printf(" GPU Max Clock rate: %.0f MHz (%0.2f " + "GHz)\n", + clockRate * 1e-3f, + clockRate * 1e-6f); + int memoryClock; + getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev); + printf(" Memory Clock rate: %.0f MHz\n", memoryClock * 1e-3f); + int memBusWidth; + getCudaAttribute(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); + printf(" Memory Bus Width: %d-bit\n", memBusWidth); + int L2CacheSize; + getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); - int kernelExecTimeoutEnabled; - getCudaAttribute(&kernelExecTimeoutEnabled, - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev); - printf(" Run time limit on kernels: %s\n", - kernelExecTimeoutEnabled ? "Yes" : "No"); - int integrated; - getCudaAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); - printf(" Integrated GPU sharing Host Memory: %s\n", - integrated ? "Yes" : "No"); - int canMapHostMemory; - getCudaAttribute(&canMapHostMemory, - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); - printf(" Support host page-locked memory mapping: %s\n", - canMapHostMemory ? "Yes" : "No"); - - int concurrentKernels; - getCudaAttribute(&concurrentKernels, - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); - printf(" Concurrent kernel execution: %s\n", - concurrentKernels ? "Yes" : "No"); - - int surfaceAlignment; - getCudaAttribute(&surfaceAlignment, - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev); - printf(" Alignment requirement for Surfaces: %s\n", - surfaceAlignment ?
"Yes" : "No"); - - int eccEnabled; - getCudaAttribute(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev); - printf(" Device has ECC support: %s\n", - eccEnabled ? "Enabled" : "Disabled"); - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - int tccDriver; - getCudaAttribute(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev); - printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", - tccDriver ? "TCC (Tesla Compute Cluster Driver)" - : "WDDM (Windows Display Driver Model)"); -#endif - - int unifiedAddressing; - getCudaAttribute(&unifiedAddressing, - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); - printf(" Device supports Unified Addressing (UVA): %s\n", - unifiedAddressing ? "Yes" : "No"); - - int managedMemory; - getCudaAttribute(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, - dev); - printf(" Device supports Managed Memory: %s\n", - managedMemory ? "Yes" : "No"); - - int computePreemption; - getCudaAttribute(&computePreemption, - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, - dev); - printf(" Device supports Compute Preemption: %s\n", - computePreemption ? "Yes" : "No"); - - int cooperativeLaunch; - getCudaAttribute(&cooperativeLaunch, - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev); - printf(" Supports Cooperative Kernel Launch: %s\n", - cooperativeLaunch ? "Yes" : "No"); - - int cooperativeMultiDevLaunch; - getCudaAttribute(&cooperativeMultiDevLaunch, - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, - dev); - printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", - cooperativeMultiDevLaunch ? "Yes" : "No"); - - int pciDomainID, pciBusID, pciDeviceID; - getCudaAttribute(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev); - getCudaAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev); - getCudaAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev); - printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", - pciDomainID, pciBusID, pciDeviceID); - - const char *sComputeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device " - "simultaneously)", - "Exclusive (only one host thread in one process is able to use " - "::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this " - "device)", - "Exclusive Process (many threads in one process is able to use " - "::cudaSetDevice() with this device)", - "Unknown", NULL}; - - int computeMode; - getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); - printf(" Compute Mode:\n"); - printf(" < %s >\n", sComputeMode[computeMode]); - } - - // If there are 2 or more GPUs, query to determine whether RDMA is supported - if (deviceCount >= 2) { - int gpuid[64]; // we want to find the first two GPUs that can support P2P - int gpu_p2p_count = 0; - int tccDriver = 0; - - for (int i = 0; i < deviceCount; i++) { - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i)); - getCudaAttribute(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i); - - // Only boards based on Fermi or later can support P2P - if ((major >= 2) -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - // on Windows (64-bit), the Tesla Compute Cluster driver for windows - // must be enabled to support this - && tccDriver -#endif - ) { - // This is an array of P2P capable GPUs - gpuid[gpu_p2p_count++] = i; - } - } - - // Show all the combinations of support P2P GPUs - int 
can_access_peer; - char deviceName0[256], deviceName1[256]; - - if (gpu_p2p_count >= 2) { - for (int i = 0; i < gpu_p2p_count; i++) { - for (int j = 0; j < gpu_p2p_count; j++) { - if (gpuid[i] == gpuid[j]) { - continue; - } - checkCudaErrors( - cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j])); - checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i])); - checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j])); - printf( - "> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : " - "%s\n", - deviceName0, gpuid[i], deviceName1, gpuid[j], - can_access_peer ? "Yes" : "No"); + if (L2CacheSize) { + printf(" L2 Cache Size: %d bytes\n", L2CacheSize); } - } + + int maxTex1D, maxTex2D[2], maxTex3D[3]; + getCudaAttribute(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev); + getCudaAttribute(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev); + getCudaAttribute(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev); + getCudaAttribute(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev); + getCudaAttribute(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev); + getCudaAttribute(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev); + printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) " + "3D=(%d, %d, %d)\n", + maxTex1D, + maxTex2D[0], + maxTex2D[1], + maxTex3D[0], + maxTex3D[1], + maxTex3D[2]); + + int maxTex1DLayered[2]; + getCudaAttribute(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev); + getCudaAttribute(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev); + printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", + maxTex1DLayered[0], + maxTex1DLayered[1]); + + int maxTex2DLayered[3]; + getCudaAttribute(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev); + getCudaAttribute(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev); + getCudaAttribute(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev); + printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d " + "layers\n", + maxTex2DLayered[0], + maxTex2DLayered[1], + maxTex2DLayered[2]); + + int totalConstantMemory; + getCudaAttribute(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev); + printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); + int sharedMemPerBlock; + getCudaAttribute(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev); + printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); + int regsPerBlock; + getCudaAttribute(®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); + printf(" Total number of registers available per block: %d\n", regsPerBlock); + int warpSize; + getCudaAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); + printf(" Warp size: %d\n", warpSize); + int maxThreadsPerMultiProcessor; + getCudaAttribute(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); + printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor); + int maxThreadsPerBlock; + getCudaAttribute(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev); + printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); + + int blockDim[3]; + getCudaAttribute(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev); + getCudaAttribute(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev); + 
getCudaAttribute(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev); + printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]); + int gridDim[3]; + getCudaAttribute(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev); + getCudaAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev); + getCudaAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev); + printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]); + + int textureAlign; + getCudaAttribute(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev); + printf(" Texture alignment: %u bytes\n", textureAlign); + + int memPitch; + getCudaAttribute(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev); + printf(" Maximum memory pitch: %u bytes\n", memPitch); + + int gpuOverlap; + getCudaAttribute(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); + + int asyncEngineCount; + getCudaAttribute(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); + printf(" Concurrent copy and kernel execution: %s with %d copy " + "engine(s)\n", + (gpuOverlap ? "Yes" : "No"), + asyncEngineCount); + + int kernelExecTimeoutEnabled; + getCudaAttribute(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev); + printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); + int integrated; + getCudaAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); + printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No"); + int canMapHostMemory; + getCudaAttribute(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); + printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); + + int concurrentKernels; + getCudaAttribute(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); + printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); + + int surfaceAlignment; + getCudaAttribute(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev); + printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No"); + + int eccEnabled; + getCudaAttribute(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev); + printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled"); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + int tccDriver; + getCudaAttribute(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev); + printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", + tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)"); +#endif + + int unifiedAddressing; + getCudaAttribute(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); + printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No"); + + int managedMemory; + getCudaAttribute(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev); + printf(" Device supports Managed Memory: %s\n", managedMemory ? "Yes" : "No"); + + int computePreemption; + getCudaAttribute(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev); + printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No"); + + int cooperativeLaunch; + getCudaAttribute(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev); + printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? 
"Yes" : "No"); + + int cooperativeMultiDevLaunch; + getCudaAttribute(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev); + printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No"); + + int pciDomainID, pciBusID, pciDeviceID; + getCudaAttribute(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev); + getCudaAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev); + getCudaAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev); + printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID); + + const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device " + "simultaneously)", + "Exclusive (only one host thread in one process is able to use " + "::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this " + "device)", + "Exclusive Process (many threads in one process is able to use " + "::cudaSetDevice() with this device)", + "Unknown", + NULL}; + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); + printf(" Compute Mode:\n"); + printf(" < %s >\n", sComputeMode[computeMode]); } - } - printf("Result = PASS\n"); + // If there are 2 or more GPUs, query to determine whether RDMA is supported + if (deviceCount >= 2) { + int gpuid[64]; // we want to find the first two GPUs that can support P2P + int gpu_p2p_count = 0; + int tccDriver = 0; - exit(EXIT_SUCCESS); + for (int i = 0; i < deviceCount; i++) { + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i)); + getCudaAttribute(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i); + + // Only boards based on Fermi or later can support P2P + if ((major >= 2) +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // on Windows (64-bit), the Tesla Compute Cluster driver for windows + // must be enabled to support this + && tccDriver +#endif + ) { + // This is an array of P2P capable GPUs + gpuid[gpu_p2p_count++] = i; + } + } + + // Show all the combinations of support P2P GPUs + int can_access_peer; + char deviceName0[256], deviceName1[256]; + + if (gpu_p2p_count >= 2) { + for (int i = 0; i < gpu_p2p_count; i++) { + for (int j = 0; j < gpu_p2p_count; j++) { + if (gpuid[i] == gpuid[j]) { + continue; + } + checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j])); + checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i])); + checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j])); + printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : " + "%s\n", + deviceName0, + gpuid[i], + deviceName1, + gpuid[j], + can_access_peer ? "Yes" : "No"); + } + } + } + } + + printf("Result = PASS\n"); + + exit(EXIT_SUCCESS); } diff --git a/Samples/1_Utilities/topologyQuery/README.md b/Samples/1_Utilities/topologyQuery/README.md index 0ee74818..db8bb9b0 100644 --- a/Samples/1_Utilities/topologyQuery/README.md +++ b/Samples/1_Utilities/topologyQuery/README.md @@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## References (for more details) - diff --git a/Samples/1_Utilities/topologyQuery/topologyQuery.cu b/Samples/1_Utilities/topologyQuery/topologyQuery.cu index f09c10e1..ec183e4e 100644 --- a/Samples/1_Utilities/topologyQuery/topologyQuery.cu +++ b/Samples/1_Utilities/topologyQuery/topologyQuery.cu @@ -35,48 +35,44 @@ // includes, project #include -#include // helper for shared that are common to CUDA Samples +#include // helper for shared that are common to CUDA Samples -int main(int argc, char **argv) { - int deviceCount = 0; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); +int main(int argc, char **argv) +{ + int deviceCount = 0; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - // Enumerates Device <-> Device links - for (int device1 = 0; device1 < deviceCount; device1++) { - for (int device2 = 0; device2 < deviceCount; device2++) { - if (device1 == device2) continue; + // Enumerates Device <-> Device links + for (int device1 = 0; device1 < deviceCount; device1++) { + for (int device2 = 0; device2 < deviceCount; device2++) { + if (device1 == device2) + continue; - int perfRank = 0; - int atomicSupported = 0; - int accessSupported = 0; + int perfRank = 0; + int atomicSupported = 0; + int accessSupported = 0; - checkCudaErrors(cudaDeviceGetP2PAttribute( - &accessSupported, cudaDevP2PAttrAccessSupported, device1, device2)); - checkCudaErrors(cudaDeviceGetP2PAttribute( - &perfRank, cudaDevP2PAttrPerformanceRank, device1, device2)); - checkCudaErrors(cudaDeviceGetP2PAttribute( - &atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, - device2)); + checkCudaErrors( + cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2)); + checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2)); + checkCudaErrors( + cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2)); - if (accessSupported) { - std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" - << std::endl; - std::cout << " * Atomic Supported: " - << (atomicSupported ? "yes" : "no") << std::endl; - std::cout << " * Perf Rank: " << perfRank << std::endl; - } + if (accessSupported) { + std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl; + std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl; + std::cout << " * Perf Rank: " << perfRank << std::endl; + } + } } - } - // Enumerates Device <-> Host links - for (int device = 0; device < deviceCount; device++) { - int atomicSupported = 0; - checkCudaErrors(cudaDeviceGetAttribute( - &atomicSupported, cudaDevAttrHostNativeAtomicSupported, device)); - std::cout << "GPU" << device << " <-> CPU:" << std::endl; - std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") - << std::endl; - } + // Enumerates Device <-> Host links + for (int device = 0; device < deviceCount; device++) { + int atomicSupported = 0; + checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device)); + std::cout << "GPU" << device << " <-> CPU:" << std::endl; + std::cout << " * Atomic Supported: " << (atomicSupported ? 
"yes" : "no") << std::endl; + } - return 0; + return 0; } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md index 93c703e8..600163df 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md @@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp index e8e1f414..a1bbc02a 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp @@ -29,230 +29,236 @@ // DESCRIPTION: Simple CUDA consumer rendering sample app // -#include #include "cuda_consumer.h" -#include "eglstrm_common.h" + +#include #include #include +#include "eglstrm_common.h" + #if defined(EXTENSION_LIST) EXTENSION_LIST(EXTLST_EXTERN) #endif CUgraphicsResource cudaResource; -static int count_acq = 0; +static int count_acq = 0; static double acquire_time[25000] = {0}, total_time_acq = 0; -static int count_rel = 0; +static int count_rel = 0; static double rel_time[25000] = {0}, total_time_rel = 0; void acquireApiStat(void); -void acquireApiStat(void) { - int i = 0; - double min = 10000000, max = 0; - double average_launch_time = 0, standard_deviation = 0; - if (count_acq == 0) return; - // lets compute the standard deviation - min = max = acquire_time[1]; - average_launch_time = (total_time_acq - acquire_time[0]) / count_acq; - for (i = 1; i < count_acq; i++) { - standard_deviation += (acquire_time[i] - average_launch_time) * - (acquire_time[i] - average_launch_time); - if (acquire_time[i] < min) min = acquire_time[i]; - if (acquire_time[i] > max) max = acquire_time[i]; - } - standard_deviation = sqrt(standard_deviation / count_acq); - printf("acquire Avg: %lf\n", average_launch_time); - printf("acquire SD: %lf\n", standard_deviation); - printf("acquire min: %lf\n", min); - printf("acquire max: %lf\n", max); +void acquireApiStat(void) +{ + int i = 0; + double min = 10000000, max = 0; + double average_launch_time = 0, standard_deviation = 0; + if (count_acq == 0) + return; + // lets compute the standard deviation + min = max = acquire_time[1]; + average_launch_time = (total_time_acq - acquire_time[0]) / count_acq; + for (i = 1; i < count_acq; i++) { + standard_deviation += (acquire_time[i] - average_launch_time) * (acquire_time[i] - average_launch_time); + if (acquire_time[i] < min) + min = acquire_time[i]; + if (acquire_time[i] > max) + max = acquire_time[i]; + } + standard_deviation = sqrt(standard_deviation / count_acq); + printf("acquire Avg: %lf\n", average_launch_time); + printf("acquire SD: %lf\n", standard_deviation); + printf("acquire min: %lf\n", min); + printf("acquire max: %lf\n", max); - min = max = rel_time[1]; - average_launch_time = (total_time_rel - rel_time[0]) / count_rel; - for (i = 1; i < count_rel; i++) { - standard_deviation += (rel_time[i] - average_launch_time) * - (rel_time[i] - average_launch_time); - if (rel_time[i] < min) min = rel_time[i]; - if (rel_time[i] > max) max = rel_time[i]; - } - standard_deviation = sqrt(standard_deviation / count_rel); - printf("release Avg: %lf\n", 
average_launch_time); - printf("release SD: %lf\n", standard_deviation); - printf("release min: %lf\n", min); - printf("release max: %lf\n", max); + min = max = rel_time[1]; + average_launch_time = (total_time_rel - rel_time[0]) / count_rel; + for (i = 1; i < count_rel; i++) { + standard_deviation += (rel_time[i] - average_launch_time) * (rel_time[i] - average_launch_time); + if (rel_time[i] < min) + min = rel_time[i]; + if (rel_time[i] > max) + max = rel_time[i]; + } + standard_deviation = sqrt(standard_deviation / count_rel); + printf("release Avg: %lf\n", average_launch_time); + printf("release SD: %lf\n", standard_deviation); + printf("release min: %lf\n", min); + printf("release max: %lf\n", max); } -CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer, - int frameNumber) { - CUresult cuStatus = CUDA_SUCCESS; - CUeglFrame cudaEgl; - struct timespec start, end; - EGLint streamState = 0; - double curTime; +CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber) +{ + CUresult cuStatus = CUDA_SUCCESS; + CUeglFrame cudaEgl; + struct timespec start, end; + EGLint streamState = 0; + double curTime; - if (!cudaConsumer) { - printf("%s: Bad parameter\n", __func__); - goto done; - } - - while (1) { - if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream, - EGL_STREAM_STATE_KHR, &streamState)) { - printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); - cuStatus = CUDA_ERROR_UNKNOWN; - goto done; - } - if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) { - printf("Cuda Consumer: EGL_STREAM_STATE_DISCONNECTED_KHR received\n"); - cuStatus = CUDA_ERROR_UNKNOWN; - goto done; + if (!cudaConsumer) { + printf("%s: Bad parameter\n", __func__); + goto done; } - if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) { - break; - } - } - if (cudaConsumer->profileAPI) { - getTime(&start); - } - cuStatus = - cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource, - &cudaConsumer->consCudaStream, 16000); - if (cudaConsumer->profileAPI) { - getTime(&end); - curTime = TIME_DIFF(end, start); - acquire_time[count_acq++] = curTime; - if (count_acq == 25000) count_acq = 0; - total_time_acq += curTime; - } - if (cuStatus == CUDA_SUCCESS) { - CUdeviceptr pDevPtr = 0; - cudaError_t err; + while (1) { + if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + cuStatus = CUDA_ERROR_UNKNOWN; + goto done; + } + if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) { + printf("Cuda Consumer: EGL_STREAM_STATE_DISCONNECTED_KHR received\n"); + cuStatus = CUDA_ERROR_UNKNOWN; + goto done; + } + if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) { + break; + } + } + if (cudaConsumer->profileAPI) { + getTime(&start); + } cuStatus = - cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0); + cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource, &cudaConsumer->consCudaStream, 16000); + if (cudaConsumer->profileAPI) { + getTime(&end); + curTime = TIME_DIFF(end, start); + acquire_time[count_acq++] = curTime; + if (count_acq == 25000) + count_acq = 0; + total_time_acq += curTime; + } + if (cuStatus == CUDA_SUCCESS) { + CUdeviceptr pDevPtr = 0; + cudaError_t err; + + cuStatus = cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0); + if (cuStatus != CUDA_SUCCESS) { + printf("Cuda get resource failed with %d\n", cuStatus); + goto done; + } + 
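/* [Editorial aside, not part of the upstream patch] getTime() and TIME_DIFF()
 * are profiling helpers from this sample's helper.h. A sketch consistent with
 * their use here, assuming a monotonic clock and millisecond results:
 *
 *     static void getTime(struct timespec *t) { clock_gettime(CLOCK_MONOTONIC, t); }
 *     #define TIME_DIFF(end, start)                                \
 *         (((end).tv_sec - (start).tv_sec) * 1000.0                \
 *          + ((end).tv_nsec - (start).tv_nsec) / 1000000.0)
 */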
pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; + + err = cudaConsumer_filter(cudaConsumer->consCudaStream, + (char *)pDevPtr, + WIDTH * 4, + HEIGHT, + PROD_DATA + frameNumber, + CONS_DATA + frameNumber, + frameNumber); + if (err != cudaSuccess) { + printf("Cuda Consumer: kernel failed with: %s\n", cudaGetErrorString(err)); + goto done; + } + } + +done: + return cuStatus; +} + +CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber) +{ + CUresult cuStatus = CUDA_SUCCESS; + struct timespec start, end; + double curTime; + + if (!cudaConsumer) { + printf("%s: Bad parameter\n", __func__); + goto done; + } + if (cudaConsumer->profileAPI) { + getTime(&start); + } + cuStatus = cuEGLStreamConsumerReleaseFrame(&cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream); + if (cudaConsumer->profileAPI) { + getTime(&end); + curTime = TIME_DIFF(end, start); + rel_time[count_rel++] = curTime; + if (count_rel == 25000) + count_rel = 0; + total_time_rel += curTime; + } if (cuStatus != CUDA_SUCCESS) { - printf("Cuda get resource failed with %d\n", cuStatus); - goto done; + printf("cuEGLStreamConsumerReleaseFrame failed, status:%d\n", cuStatus); + goto done; } - pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; - err = cudaConsumer_filter(cudaConsumer->consCudaStream, (char *)pDevPtr, - WIDTH * 4, HEIGHT, PROD_DATA + frameNumber, - CONS_DATA + frameNumber, frameNumber); - if (err != cudaSuccess) { - printf("Cuda Consumer: kernel failed with: %s\n", - cudaGetErrorString(err)); - goto done; +done: + return cuStatus; +} + +CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) +{ + CUdevice device; + CUresult status = CUDA_SUCCESS; + + if (CUDA_SUCCESS != (status = cuInit(0))) { + printf("Failed to initialize CUDA\n"); + return status; } - } -done: - return cuStatus; -} + if (CUDA_SUCCESS != (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) { + printf("failed to get CUDA device\n"); + return status; + } -CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer, - int frameNumber) { - CUresult cuStatus = CUDA_SUCCESS; - struct timespec start, end; - double curTime; + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) { + printf("failed to create CUDA context\n"); + return status; + } - if (!cudaConsumer) { - printf("%s: Bad parameter\n", __func__); - goto done; - } - if (cudaConsumer->profileAPI) { - getTime(&start); - } - cuStatus = cuEGLStreamConsumerReleaseFrame( - &cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream); - if (cudaConsumer->profileAPI) { - getTime(&end); - curTime = TIME_DIFF(end, start); - rel_time[count_rel++] = curTime; - if (count_rel == 25000) count_rel = 0; - total_time_rel += curTime; - } - if (cuStatus != CUDA_SUCCESS) { - printf("cuEGLStreamConsumerReleaseFrame failed, status:%d\n", cuStatus); - goto done; - } + int major = 0, minor = 0; + char deviceName[256]; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); + cuDeviceGetName(deviceName, 256, device); + printf("CUDA Consumer on GPU Device %d: \"%s\" with compute capability " + "%d.%d\n\n", + device, + deviceName, + major, + minor); -done: - return cuStatus; -} + cuCtxPopCurrent(&cudaConsumer->context); + if (major < 6) { + printf("EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. 
" + "Exiting...\n"); + exit(2); // EXIT_WAIVED + } -CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) { - CUdevice device; - CUresult status = CUDA_SUCCESS; - - if (CUDA_SUCCESS != (status = cuInit(0))) { - printf("Failed to initialize CUDA\n"); return status; - } - - if (CUDA_SUCCESS != - (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) { - printf("failed to get CUDA device\n"); - return status; - } - - if (CUDA_SUCCESS != - (status = cuCtxCreate(&cudaConsumer->context, 0, device))) { - printf("failed to create CUDA context\n"); - return status; - } - - int major = 0, minor = 0; - char deviceName[256]; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device); - cuDeviceGetName(deviceName, 256, device); - printf( - "CUDA Consumer on GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - device, deviceName, major, minor); - - cuCtxPopCurrent(&cudaConsumer->context); - if (major < 6) { - printf( - "EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. " - "Exiting...\n"); - exit(2); // EXIT_WAIVED - } - - return status; } -CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, - TestArgs *args) { - CUresult status = CUDA_SUCCESS; - int bufferSize; +CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args) +{ + CUresult status = CUDA_SUCCESS; + int bufferSize; - cudaConsumer->charCnt = args->charCnt; - bufferSize = args->charCnt; + cudaConsumer->charCnt = args->charCnt; + bufferSize = args->charCnt; - cudaConsumer->pCudaCopyMem = (unsigned char *)malloc(bufferSize); - if (cudaConsumer->pCudaCopyMem == NULL) { - printf("Cuda Consumer: malloc failed\n"); - goto done; - } + cudaConsumer->pCudaCopyMem = (unsigned char *)malloc(bufferSize); + if (cudaConsumer->pCudaCopyMem == NULL) { + printf("Cuda Consumer: malloc failed\n"); + goto done; + } - status = cuStreamCreate(&cudaConsumer->consCudaStream, 0); - if (status != CUDA_SUCCESS) { - printf("Cuda Consumer: cuStreamCreate failed, status:%d\n", status); - goto done; - } + status = cuStreamCreate(&cudaConsumer->consCudaStream, 0); + if (status != CUDA_SUCCESS) { + printf("Cuda Consumer: cuStreamCreate failed, status:%d\n", status); + goto done; + } - atexit(acquireApiStat); + atexit(acquireApiStat); done: - return status; + return status; } -CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer) { - if (cudaConsumer->pCudaCopyMem) { - free(cudaConsumer->pCudaCopyMem); - } - return cuEGLStreamConsumerDisconnect(&cudaConsumer->cudaConn); +CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer) +{ + if (cudaConsumer->pCudaCopyMem) { + free(cudaConsumer->pCudaCopyMem); + } + return cuEGLStreamConsumerDisconnect(&cudaConsumer->cudaConn); } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.h index 7f64cf1f..126a381b 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.h @@ -32,35 +32,41 @@ #ifndef _CUDA_CONSUMER_H_ #define _CUDA_CONSUMER_H_ +#include +#include #include #include #include + #include "cudaEGL.h" #include "eglstrm_common.h" -#include -#include -typedef struct _test_cuda_consumer_s { - CUcontext context; - CUeglStreamConnection cudaConn; - int cudaDevId; - EGLDisplay eglDisplay; - EGLStreamKHR eglStream; - 
unsigned int charCnt; - char *cudaBuf; - bool profileAPI; - unsigned char *pCudaCopyMem; - CUstream consCudaStream; +typedef struct _test_cuda_consumer_s +{ + CUcontext context; + CUeglStreamConnection cudaConn; + int cudaDevId; + EGLDisplay eglDisplay; + EGLStreamKHR eglStream; + unsigned int charCnt; + char *cudaBuf; + bool profileAPI; + unsigned char *pCudaCopyMem; + CUstream consCudaStream; } test_cuda_consumer_s; -CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args); -CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer); -CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *data, int frameNumber); -CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *data, int frameNumber); -CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer); -cudaError_t cudaConsumer_filter(CUstream cStream, char *pSrc, int width, - int height, char expectedVal, char newVal, - int frameNumber); +CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args); +CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer); +CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *data, int frameNumber); +CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *data, int frameNumber); +CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer); +cudaError_t cudaConsumer_filter(CUstream cStream, + char *pSrc, + int width, + int height, + char expectedVal, + char newVal, + int frameNumber); cudaError_t cudaGetValueMismatch(void); #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp index 88f1722c..e862e541 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp @@ -29,260 +29,265 @@ // DESCRIPTION: Simple cuda EGL stream producer app // -#include "cudaEGL.h" #include "cuda_producer.h" -#include "eglstrm_common.h" + #include + +#include "cudaEGL.h" +#include "eglstrm_common.h" #if defined(EXTENSION_LIST) EXTENSION_LIST(EXTLST_EXTERN) #endif #include #include #include -#include + #include "cuda_runtime.h" #include "math.h" -int cudaPresentReturnData = INIT_DATA; -int fakePresent = 0; -CUeglFrame fakeFrame; +int cudaPresentReturnData = INIT_DATA; +int fakePresent = 0; +CUeglFrame fakeFrame; CUdeviceptr cudaPtrFake; extern bool isCrossDevice; -void cudaProducerPrepareFrame(CUeglFrame *cudaEgl, CUdeviceptr cudaPtr, - int bufferSize) { - cudaEgl->frame.pPitch[0] = (void *)cudaPtr; - cudaEgl->width = WIDTH; - cudaEgl->depth = 0; - cudaEgl->height = HEIGHT; - cudaEgl->pitch = WIDTH * 4; - cudaEgl->frameType = CU_EGL_FRAME_TYPE_PITCH; - cudaEgl->planeCount = 1; - cudaEgl->numChannels = 4; - cudaEgl->eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB; - cudaEgl->cuFormat = CU_AD_FORMAT_UNSIGNED_INT8; +void cudaProducerPrepareFrame(CUeglFrame *cudaEgl, CUdeviceptr cudaPtr, int bufferSize) +{ + cudaEgl->frame.pPitch[0] = (void *)cudaPtr; + cudaEgl->width = WIDTH; + cudaEgl->depth = 0; + cudaEgl->height = HEIGHT; + cudaEgl->pitch = WIDTH * 4; + cudaEgl->frameType = CU_EGL_FRAME_TYPE_PITCH; + cudaEgl->planeCount = 1; + cudaEgl->numChannels = 4; + cudaEgl->eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB; + cudaEgl->cuFormat = CU_AD_FORMAT_UNSIGNED_INT8; } -static int count_present = 0, count_return = 0; +static int count_present = 0, count_return = 0; static double present_time[25000] = {0}, total_time_present = 0; static 
double return_time[25000] = {0}, total_time_return = 0; void presentApiStat(void); -void presentApiStat(void) { - int i = 0; - double min = 10000000, max = 0; - double average_launch_time = 0, standard_deviation = 0; - if (count_present == 0) return; - // lets compute the standard deviation - min = max = present_time[1]; - average_launch_time = (total_time_present) / count_present; - for (i = 1; i < count_present; i++) { - standard_deviation += (present_time[i] - average_launch_time) * - (present_time[i] - average_launch_time); - if (present_time[i] < min) min = present_time[i]; - if (present_time[i] > max) max = present_time[i]; - } - standard_deviation = sqrt(standard_deviation / count_present); - printf("present Avg: %lf\n", average_launch_time); - printf("present SD: %lf\n", standard_deviation); - printf("present min: %lf\n", min); - printf("present max: %lf\n", max); - - min = max = return_time[1]; - average_launch_time = (total_time_return - return_time[0]) / count_return; - for (i = 1; i < count_return; i++) { - standard_deviation += (return_time[i] - average_launch_time) * - (return_time[i] - average_launch_time); - if (return_time[i] < min) min = return_time[i]; - if (return_time[i] > max) max = return_time[i]; - } - standard_deviation = sqrt(standard_deviation / count_return); - printf("return Avg: %lf\n", average_launch_time); - printf("return SD: %lf\n", standard_deviation); - printf("return min: %lf\n", min); - printf("return max: %lf\n", max); -} -CUresult cudaProducerPresentFrame(test_cuda_producer_s *cudaProducer, - CUeglFrame cudaEgl, int t) { - static int flag = 0; - CUresult status = CUDA_SUCCESS; - struct timespec start, end; - double curTime; - CUdeviceptr pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; - cudaProducer_filter(cudaProducer->prodCudaStream, (char *)pDevPtr, WIDTH * 4, - HEIGHT, cudaPresentReturnData, PROD_DATA + t, t); - if (cudaProducer->profileAPI) { - getTime(&start); - } - status = cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, - &cudaProducer->prodCudaStream); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: Present frame failed, status:%d\n", status); - goto done; - } - flag++; - if (cudaProducer->profileAPI && flag > 10) { - getTime(&end); - curTime = TIME_DIFF(end, start); - present_time[count_present++] = curTime; - if (count_present == 25000) count_present = 0; - total_time_present += curTime; - } -done: - return status; -} - -int flag = 0; -CUresult cudaProducerReturnFrame(test_cuda_producer_s *cudaProducer, - CUeglFrame cudaEgl, int t) { - CUresult status = CUDA_SUCCESS; - struct timespec start, end; - double curTime; - CUdeviceptr pDevPtr = 0; - - pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; - if (cudaProducer->profileAPI) { - getTime(&start); - } - - while (1) { - status = cuEGLStreamProducerReturnFrame(&cudaProducer->cudaConn, &cudaEgl, - &cudaProducer->prodCudaStream); - if (status == CUDA_ERROR_LAUNCH_TIMEOUT) { - continue; - } else if (status != CUDA_SUCCESS) { - printf("Cuda Producer: Return frame failed, status:%d\n", status); - goto done; +void presentApiStat(void) +{ + int i = 0; + double min = 10000000, max = 0; + double average_launch_time = 0, standard_deviation = 0; + if (count_present == 0) + return; + // lets compute the standard deviation + min = max = present_time[1]; + average_launch_time = (total_time_present) / count_present; + for (i = 1; i < count_present; i++) { + standard_deviation += (present_time[i] - average_launch_time) * (present_time[i] - average_launch_time); + if (present_time[i] < 
min) + min = present_time[i]; + if (present_time[i] > max) + max = present_time[i]; + } + standard_deviation = sqrt(standard_deviation / count_present); + printf("present Avg: %lf\n", average_launch_time); + printf("present SD: %lf\n", standard_deviation); + printf("present min: %lf\n", min); + printf("present max: %lf\n", max); + + min = max = return_time[1]; + average_launch_time = (total_time_return - return_time[0]) / count_return; + for (i = 1; i < count_return; i++) { + standard_deviation += (return_time[i] - average_launch_time) * (return_time[i] - average_launch_time); + if (return_time[i] < min) + min = return_time[i]; + if (return_time[i] > max) + max = return_time[i]; + } + standard_deviation = sqrt(standard_deviation / count_return); + printf("return Avg: %lf\n", average_launch_time); + printf("return SD: %lf\n", standard_deviation); + printf("return min: %lf\n", min); + printf("return max: %lf\n", max); +} +CUresult cudaProducerPresentFrame(test_cuda_producer_s *cudaProducer, CUeglFrame cudaEgl, int t) +{ + static int flag = 0; + CUresult status = CUDA_SUCCESS; + struct timespec start, end; + double curTime; + CUdeviceptr pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; + cudaProducer_filter( + cudaProducer->prodCudaStream, (char *)pDevPtr, WIDTH * 4, HEIGHT, cudaPresentReturnData, PROD_DATA + t, t); + if (cudaProducer->profileAPI) { + getTime(&start); + } + status = cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, &cudaProducer->prodCudaStream); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: Present frame failed, status:%d\n", status); + goto done; + } + flag++; + if (cudaProducer->profileAPI && flag > 10) { + getTime(&end); + curTime = TIME_DIFF(end, start); + present_time[count_present++] = curTime; + if (count_present == 25000) + count_present = 0; + total_time_present += curTime; } - break; - } - if (cudaProducer->profileAPI) { - getTime(&end); - curTime = TIME_DIFF(end, start); - return_time[count_return++] = curTime; - if (count_return == 25000) count_return = 0; - total_time_return += curTime; - } - if (flag % 2 == 0) { - cudaPresentReturnData++; - } - cudaProducer_filter(cudaProducer->prodCudaStream, (char *)pDevPtr, WIDTH * 4, - HEIGHT, CONS_DATA + t, cudaPresentReturnData, t); - flag++; done: - return status; + return status; } -CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer) { - CUdevice device; - CUresult status = CUDA_SUCCESS; +int flag = 0; +CUresult cudaProducerReturnFrame(test_cuda_producer_s *cudaProducer, CUeglFrame cudaEgl, int t) +{ + CUresult status = CUDA_SUCCESS; + struct timespec start, end; + double curTime; + CUdeviceptr pDevPtr = 0; - if (CUDA_SUCCESS != (status = cuInit(0))) { - printf("Failed to initialize CUDA\n"); - return status; - } + pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0]; + if (cudaProducer->profileAPI) { + getTime(&start); + } - if (CUDA_SUCCESS != - (status = cuDeviceGet(&device, cudaProducer->cudaDevId))) { - printf("failed to get CUDA device\n"); - return status; - } - - if (CUDA_SUCCESS != - (status = cuCtxCreate(&cudaProducer->context, 0, device))) { - printf("failed to create CUDA context\n"); - return status; - } - - int major = 0, minor = 0; - char deviceName[256]; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device); - cuDeviceGetName(deviceName, 256, device); - printf( - "CUDA Producer on GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - 
device, deviceName, major, minor); - - cuCtxPopCurrent(&cudaProducer->context); - - if (major < 6) { - printf( - "EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. " - "Exiting...\n"); - exit(2); // EXIT_WAIVED - } - - return status; -} - -CUresult cudaProducerInit(test_cuda_producer_s *cudaProducer, TestArgs *args) { - CUresult status = CUDA_SUCCESS; - int bufferSize; - - cudaProducer->charCnt = args->charCnt; - bufferSize = cudaProducer->charCnt; - - cudaProducer->tempBuff = (char *)malloc(bufferSize); - if (!cudaProducer->tempBuff) { - printf("Cuda Producer: Failed to allocate image buffer\n"); - status = CUDA_ERROR_UNKNOWN; - goto done; - } - memset((void *)cudaProducer->tempBuff, INIT_DATA, cudaProducer->charCnt); - - // Fill this init data - status = cuMemAlloc(&cudaProducer->cudaPtr, bufferSize); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: cuda Malloc failed, status:%d\n", status); - goto done; - } - status = cuMemcpyHtoD(cudaProducer->cudaPtr, (void *)(cudaProducer->tempBuff), - bufferSize); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: cuMemCpy failed, status:%d\n", status); - goto done; - } - - // Fill this init data - status = cuMemAlloc(&cudaProducer->cudaPtr1, bufferSize); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: cuda Malloc failed, status:%d\n", status); - goto done; - } - status = cuMemcpyHtoD(cudaProducer->cudaPtr1, - (void *)(cudaProducer->tempBuff), bufferSize); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: cuMemCpy failed, status:%d\n", status); - goto done; - } - - status = cuStreamCreate(&cudaProducer->prodCudaStream, 0); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: cuStreamCreate failed, status:%d\n", status); - goto done; - } - - // Fill this init data - status = cuMemAlloc(&cudaPtrFake, 100); - if (status != CUDA_SUCCESS) { - printf("Cuda Producer: cuda Malloc failed, status:%d\n", status); - goto done; - } - - atexit(presentApiStat); + while (1) { + status = cuEGLStreamProducerReturnFrame(&cudaProducer->cudaConn, &cudaEgl, &cudaProducer->prodCudaStream); + if (status == CUDA_ERROR_LAUNCH_TIMEOUT) { + continue; + } + else if (status != CUDA_SUCCESS) { + printf("Cuda Producer: Return frame failed, status:%d\n", status); + goto done; + } + break; + } + if (cudaProducer->profileAPI) { + getTime(&end); + curTime = TIME_DIFF(end, start); + return_time[count_return++] = curTime; + if (count_return == 25000) + count_return = 0; + total_time_return += curTime; + } + if (flag % 2 == 0) { + cudaPresentReturnData++; + } + cudaProducer_filter( + cudaProducer->prodCudaStream, (char *)pDevPtr, WIDTH * 4, HEIGHT, CONS_DATA + t, cudaPresentReturnData, t); + flag++; done: - return status; + return status; } -CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) { - if (cudaProducer->tempBuff) { - free(cudaProducer->tempBuff); - } - if (cudaProducer->cudaPtr) { - cuMemFree(cudaProducer->cudaPtr); - } - return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn); +CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer) +{ + CUdevice device; + CUresult status = CUDA_SUCCESS; + + if (CUDA_SUCCESS != (status = cuInit(0))) { + printf("Failed to initialize CUDA\n"); + return status; + } + + if (CUDA_SUCCESS != (status = cuDeviceGet(&device, cudaProducer->cudaDevId))) { + printf("failed to get CUDA device\n"); + return status; + } + + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaProducer->context, 0, device))) { + printf("failed to create CUDA context\n"); + return status; + } + + int major 
= 0, minor = 0; + char deviceName[256]; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); + cuDeviceGetName(deviceName, 256, device); + printf("CUDA Producer on GPU Device %d: \"%s\" with compute capability " + "%d.%d\n\n", + device, + deviceName, + major, + minor); + + cuCtxPopCurrent(&cudaProducer->context); + + if (major < 6) { + printf("EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. " + "Exiting...\n"); + exit(2); // EXIT_WAIVED + } + + return status; +} + +CUresult cudaProducerInit(test_cuda_producer_s *cudaProducer, TestArgs *args) +{ + CUresult status = CUDA_SUCCESS; + int bufferSize; + + cudaProducer->charCnt = args->charCnt; + bufferSize = cudaProducer->charCnt; + + cudaProducer->tempBuff = (char *)malloc(bufferSize); + if (!cudaProducer->tempBuff) { + printf("Cuda Producer: Failed to allocate image buffer\n"); + status = CUDA_ERROR_UNKNOWN; + goto done; + } + memset((void *)cudaProducer->tempBuff, INIT_DATA, cudaProducer->charCnt); + + // Fill this init data + status = cuMemAlloc(&cudaProducer->cudaPtr, bufferSize); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: cuda Malloc failed, status:%d\n", status); + goto done; + } + status = cuMemcpyHtoD(cudaProducer->cudaPtr, (void *)(cudaProducer->tempBuff), bufferSize); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: cuMemCpy failed, status:%d\n", status); + goto done; + } + + // Fill this init data + status = cuMemAlloc(&cudaProducer->cudaPtr1, bufferSize); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: cuda Malloc failed, status:%d\n", status); + goto done; + } + status = cuMemcpyHtoD(cudaProducer->cudaPtr1, (void *)(cudaProducer->tempBuff), bufferSize); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: cuMemCpy failed, status:%d\n", status); + goto done; + } + + status = cuStreamCreate(&cudaProducer->prodCudaStream, 0); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: cuStreamCreate failed, status:%d\n", status); + goto done; + } + + // Fill this init data + status = cuMemAlloc(&cudaPtrFake, 100); + if (status != CUDA_SUCCESS) { + printf("Cuda Producer: cuda Malloc failed, status:%d\n", status); + goto done; + } + + atexit(presentApiStat); +done: + return status; +} + +CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) +{ + if (cudaProducer->tempBuff) { + free(cudaProducer->tempBuff); + } + if (cudaProducer->cudaPtr) { + cuMemFree(cudaProducer->cudaPtr); + } + return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn); } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.h index ce12bb5a..0c632b30 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.h @@ -33,36 +33,39 @@ #define _CUDA_PRODUCER_H_ #include #include +#include +#include + #include "cudaEGL.h" #include "eglstrm_common.h" -#include -#include -typedef struct _test_cuda_producer_s { - // Stream params - CUcontext context; - CUeglStreamConnection cudaConn; - int cudaDevId; - EGLStreamKHR eglStream; - EGLDisplay eglDisplay; - unsigned int charCnt; - bool profileAPI; - char *tempBuff; - CUdeviceptr cudaPtr; - CUdeviceptr cudaPtr1; - CUstream prodCudaStream; +typedef struct _test_cuda_producer_s +{ + // Stream params + CUcontext context; + CUeglStreamConnection 
cudaConn; + int cudaDevId; + EGLStreamKHR eglStream; + EGLDisplay eglDisplay; + unsigned int charCnt; + bool profileAPI; + char *tempBuff; + CUdeviceptr cudaPtr; + CUdeviceptr cudaPtr1; + CUstream prodCudaStream; } test_cuda_producer_s; -CUresult cudaProducerInit(test_cuda_producer_s *cudaProducer, TestArgs *args); -CUresult cudaProducerPresentFrame(test_cuda_producer_s *parserArg, - CUeglFrame cudaEgl, int t); -CUresult cudaProducerReturnFrame(test_cuda_producer_s *parserArg, - CUeglFrame cudaEgl, int t); -CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer); -CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer); -cudaError_t cudaProducer_filter(CUstream cStream, char *pSrc, int width, - int height, char expectedVal, char newVal, - int frameNumber); -void cudaProducerPrepareFrame(CUeglFrame *cudaEgl, CUdeviceptr cudaPtr, - int bufferSize); +CUresult cudaProducerInit(test_cuda_producer_s *cudaProducer, TestArgs *args); +CUresult cudaProducerPresentFrame(test_cuda_producer_s *parserArg, CUeglFrame cudaEgl, int t); +CUresult cudaProducerReturnFrame(test_cuda_producer_s *parserArg, CUeglFrame cudaEgl, int t); +CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer); +CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer); +cudaError_t cudaProducer_filter(CUstream cStream, + char *pSrc, + int width, + int height, + char expectedVal, + char newVal, + int frameNumber); +void cudaProducerPrepareFrame(CUeglFrame *cudaEgl, CUdeviceptr cudaPtr, int bufferSize); #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.cpp index 383ee73b..329d41cb 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.cpp @@ -31,343 +31,343 @@ #include "eglstrm_common.h" -EGLStreamKHR g_producerEglStream = EGL_NO_STREAM_KHR; -EGLStreamKHR g_consumerEglStream = EGL_NO_STREAM_KHR; -EGLDisplay g_producerEglDisplay = EGL_NO_DISPLAY; -EGLDisplay g_consumerEglDisplay = EGL_NO_DISPLAY; -int cudaDevIndexProd = -1; -int cudaDevIndexCons = -1; +EGLStreamKHR g_producerEglStream = EGL_NO_STREAM_KHR; +EGLStreamKHR g_consumerEglStream = EGL_NO_STREAM_KHR; +EGLDisplay g_producerEglDisplay = EGL_NO_DISPLAY; +EGLDisplay g_consumerEglDisplay = EGL_NO_DISPLAY; +int cudaDevIndexProd = -1; +int cudaDevIndexCons = -1; #if defined(EXTENSION_LIST) EXTENSION_LIST(EXTLST_DECL) typedef void (*extlst_fnptr_t)(void); -static struct { - extlst_fnptr_t *fnptr; - char const *name; - bool is_dgpu; // This function is need only for dgpu case +static struct +{ + extlst_fnptr_t *fnptr; + char const *name; + bool is_dgpu; // This function is need only for dgpu case } extensionList[] = {EXTENSION_LIST(EXTLST_ENTRY)}; -int eglSetupExtensions(bool isCrossDevice) { - unsigned int i; +int eglSetupExtensions(bool isCrossDevice) +{ + unsigned int i; - for (i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) { - // load the dgpu function only if we are running cross device test - if ((!extensionList[i].is_dgpu) || - (extensionList[i].is_dgpu == isCrossDevice)) { - *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name); - if (*extensionList[i].fnptr == NULL) { - printf("Couldn't get address of %s()\n", extensionList[i].name); - return 0; - } + for (i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) { + // load the dgpu function only if we are 
running cross device test + if ((!extensionList[i].is_dgpu) || (extensionList[i].is_dgpu == isCrossDevice)) { + *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name); + if (*extensionList[i].fnptr == NULL) { + printf("Couldn't get address of %s()\n", extensionList[i].name); + return 0; + } + } } - } - return 1; + return 1; } -int EGLStreamInit(bool isCrossDevice, int isConsumer, - EGLNativeFileDescriptorKHR fileDesc) { - static const EGLint streamAttrFIFOMode[] = { - EGL_STREAM_FIFO_LENGTH_KHR, 5, EGL_SUPPORT_REUSE_NV, EGL_FALSE, EGL_NONE}; - EGLDisplay eglDisplay[2] = {0}; - EGLStreamKHR eglStream[2] = {0}; - EGLBoolean eglStatus; +int EGLStreamInit(bool isCrossDevice, int isConsumer, EGLNativeFileDescriptorKHR fileDesc) +{ + static const EGLint streamAttrFIFOMode[] = { + EGL_STREAM_FIFO_LENGTH_KHR, 5, EGL_SUPPORT_REUSE_NV, EGL_FALSE, EGL_NONE}; + EGLDisplay eglDisplay[2] = {0}; + EGLStreamKHR eglStream[2] = {0}; + EGLBoolean eglStatus; #define MAX_EGL_DEVICES 4 - EGLDeviceEXT devices[MAX_EGL_DEVICES]; - EGLint numDevices = 0; + EGLDeviceEXT devices[MAX_EGL_DEVICES]; + EGLint numDevices = 0; - eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices); - if (eglStatus != EGL_TRUE) { - printf("Error querying EGL devices\n"); - goto Done; - } - - if (numDevices == 0) { - printf("No EGL devices found\n"); - eglStatus = EGL_FALSE; - goto Done; - } - - // If cross device, create discrete GPU stream first and then create the - // integrated GPU stream to connect to it via fd. The other way round fails - // in producer connect. - // - // TODO: Find out if this EGL behavior is by design. - if (isConsumer) { - int egl_device_id = 0; - for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { - EGLAttrib cuda_device; - eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], - EGL_CUDA_DEVICE_NV, &cuda_device); - if (eglStatus == EGL_TRUE) { - cudaDevIndexCons = cuda_device; // We select first EGL-CUDA Capable - // device for consumer. - printf( - "Found EGL-CUDA Capable device with CUDA Device id = %d out of " - "egl_device_id = %d\n", - (int)cudaDevIndexCons, egl_device_id); - break; - } - } - - if (egl_device_id >= numDevices) { - printf("No CUDA Capable EGL Device found.. Waiving execution\n"); - goto Done; - } - - g_consumerEglDisplay = eglGetPlatformDisplayEXT( - EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL); - if (g_consumerEglDisplay == EGL_NO_DISPLAY) { - printf("Could not get EGL display from device. \n"); - eglStatus = EGL_FALSE; - goto Done; - } - - eglStatus = eglInitialize(g_consumerEglDisplay, 0, 0); - if (!eglStatus) { - printf("EGL failed to initialize. 
\n"); - eglStatus = EGL_FALSE; - goto Done; - } - - g_consumerEglStream = - eglCreateStreamKHR(g_consumerEglDisplay, streamAttrFIFOMode); - if (g_consumerEglStream == EGL_NO_STREAM_KHR) { - printf("Could not create EGL stream.\n"); - eglStatus = EGL_FALSE; - goto Done; - } - - eglStatus = eglStreamAttribKHR(g_consumerEglDisplay, g_consumerEglStream, - EGL_CONSUMER_LATENCY_USEC_KHR, 16000); + eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices); if (eglStatus != EGL_TRUE) { - printf("eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n"); - goto Done; + printf("Error querying EGL devices\n"); + goto Done; } - eglStatus = - eglStreamAttribKHR(g_consumerEglDisplay, g_consumerEglStream, - EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000); - if (eglStatus != EGL_TRUE) { - printf( - "eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR failed\n"); - goto Done; - } - } - - if (!isConsumer) { // Producer - - if (fileDesc == EGL_NO_FILE_DESCRIPTOR_KHR) { - printf("Cuda Producer received bad file descriptor\n"); - eglStatus = EGL_FALSE; - goto Done; + if (numDevices == 0) { + printf("No EGL devices found\n"); + eglStatus = EGL_FALSE; + goto Done; } - int egl_device_id = 0; - int egl_cuda_devices = 0; - for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { - EGLAttrib cuda_device = -1; - eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], - EGL_CUDA_DEVICE_NV, &cuda_device); - if (eglStatus == EGL_TRUE) { - egl_cuda_devices++; - if (isCrossDevice && (egl_cuda_devices > 1)) { - // We select second EGL-CUDA Capable device for producer. - cudaDevIndexProd = (int)cuda_device; - printf( - "Found EGL-CUDA Capable device with CUDA Device id = %d " - "egl_device_id = %d \n", - (int)cudaDevIndexProd, egl_device_id); - break; + // If cross device, create discrete GPU stream first and then create the + // integrated GPU stream to connect to it via fd. The other way round fails + // in producer connect. + // + // TODO: Find out if this EGL behavior is by design. + if (isConsumer) { + int egl_device_id = 0; + for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { + EGLAttrib cuda_device; + eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], EGL_CUDA_DEVICE_NV, &cuda_device); + if (eglStatus == EGL_TRUE) { + cudaDevIndexCons = cuda_device; // We select first EGL-CUDA Capable + // device for consumer. + printf("Found EGL-CUDA Capable device with CUDA Device id = %d out of " + "egl_device_id = %d\n", + (int)cudaDevIndexCons, + egl_device_id); + break; + } } - if (!isCrossDevice) { - // We select first EGL-CUDA Capable device for producer same as - // consumer. - cudaDevIndexProd = (int)cuda_device; - printf( - "Found EGL-CUDA Capable device with CUDA Device id = %d " - "egl_device_id = %d \n", - (int)cudaDevIndexProd, egl_device_id); - break; + + if (egl_device_id >= numDevices) { + printf("No CUDA Capable EGL Device found.. Waiving execution\n"); + goto Done; + } + + g_consumerEglDisplay = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL); + if (g_consumerEglDisplay == EGL_NO_DISPLAY) { + printf("Could not get EGL display from device. \n"); + eglStatus = EGL_FALSE; + goto Done; + } + + eglStatus = eglInitialize(g_consumerEglDisplay, 0, 0); + if (!eglStatus) { + printf("EGL failed to initialize. 
\n"); + eglStatus = EGL_FALSE; + goto Done; + } + + g_consumerEglStream = eglCreateStreamKHR(g_consumerEglDisplay, streamAttrFIFOMode); + if (g_consumerEglStream == EGL_NO_STREAM_KHR) { + printf("Could not create EGL stream.\n"); + eglStatus = EGL_FALSE; + goto Done; + } + + eglStatus = eglStreamAttribKHR(g_consumerEglDisplay, g_consumerEglStream, EGL_CONSUMER_LATENCY_USEC_KHR, 16000); + if (eglStatus != EGL_TRUE) { + printf("eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n"); + goto Done; + } + + eglStatus = + eglStreamAttribKHR(g_consumerEglDisplay, g_consumerEglStream, EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000); + if (eglStatus != EGL_TRUE) { + printf("eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR failed\n"); + goto Done; } - } } - if (egl_device_id >= numDevices) { - printf("No CUDA Capable EGL Device found.. Waiving execution\n"); - goto Done; - } + if (!isConsumer) { // Producer - g_producerEglDisplay = eglGetPlatformDisplayEXT( - EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL); - if (g_producerEglDisplay == EGL_NO_DISPLAY) { - printf("Could not get EGL display from device. \n"); - eglStatus = EGL_FALSE; - goto Done; - } + if (fileDesc == EGL_NO_FILE_DESCRIPTOR_KHR) { + printf("Cuda Producer received bad file descriptor\n"); + eglStatus = EGL_FALSE; + goto Done; + } - eglStatus = eglInitialize(g_producerEglDisplay, 0, 0); - if (!eglStatus) { - printf("EGL failed to initialize. \n"); - eglStatus = EGL_FALSE; - goto Done; - } + int egl_device_id = 0; + int egl_cuda_devices = 0; + for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { + EGLAttrib cuda_device = -1; + eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], EGL_CUDA_DEVICE_NV, &cuda_device); + if (eglStatus == EGL_TRUE) { + egl_cuda_devices++; + if (isCrossDevice && (egl_cuda_devices > 1)) { + // We select second EGL-CUDA Capable device for producer. + cudaDevIndexProd = (int)cuda_device; + printf("Found EGL-CUDA Capable device with CUDA Device id = %d " + "egl_device_id = %d \n", + (int)cudaDevIndexProd, + egl_device_id); + break; + } + if (!isCrossDevice) { + // We select first EGL-CUDA Capable device for producer same as + // consumer. + cudaDevIndexProd = (int)cuda_device; + printf("Found EGL-CUDA Capable device with CUDA Device id = %d " + "egl_device_id = %d \n", + (int)cudaDevIndexProd, + egl_device_id); + break; + } + } + } - g_producerEglStream = - eglCreateStreamFromFileDescriptorKHR(g_producerEglDisplay, fileDesc); - close(fileDesc); + if (egl_device_id >= numDevices) { + printf("No CUDA Capable EGL Device found.. Waiving execution\n"); + goto Done; + } - if (g_producerEglStream == EGL_NO_STREAM_KHR) { - printf("CUDA Producer Could not create EGL stream.\n"); - eglStatus = EGL_FALSE; - goto Done; - } else { - printf("Producer created EGLStream for the GPU.\n"); + g_producerEglDisplay = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL); + if (g_producerEglDisplay == EGL_NO_DISPLAY) { + printf("Could not get EGL display from device. \n"); + eglStatus = EGL_FALSE; + goto Done; + } + + eglStatus = eglInitialize(g_producerEglDisplay, 0, 0); + if (!eglStatus) { + printf("EGL failed to initialize. 
\n"); + eglStatus = EGL_FALSE; + goto Done; + } + + g_producerEglStream = eglCreateStreamFromFileDescriptorKHR(g_producerEglDisplay, fileDesc); + close(fileDesc); + + if (g_producerEglStream == EGL_NO_STREAM_KHR) { + printf("CUDA Producer Could not create EGL stream.\n"); + eglStatus = EGL_FALSE; + goto Done; + } + else { + printf("Producer created EGLStream for the GPU.\n"); + } } - } Done: - return eglStatus == EGL_TRUE ? 1 : 0; + return eglStatus == EGL_TRUE ? 1 : 0; } -void EGLStreamFini(void) { - if (g_producerEglStream != EGL_NO_STREAM_KHR) { - eglDestroyStreamKHR(g_producerEglDisplay, g_producerEglStream); - } - if (g_consumerEglStream != g_producerEglStream) { - if (g_consumerEglStream != EGL_NO_STREAM_KHR) { - eglDestroyStreamKHR(g_consumerEglDisplay, g_consumerEglStream); +void EGLStreamFini(void) +{ + if (g_producerEglStream != EGL_NO_STREAM_KHR) { + eglDestroyStreamKHR(g_producerEglDisplay, g_producerEglStream); + } + if (g_consumerEglStream != g_producerEglStream) { + if (g_consumerEglStream != EGL_NO_STREAM_KHR) { + eglDestroyStreamKHR(g_consumerEglDisplay, g_consumerEglStream); + } } - } } -int UnixSocketConnect(const char *socket_name) { - int sock_fd = -1; - struct sockaddr_un sock_addr; - int wait_loop = 0; +int UnixSocketConnect(const char *socket_name) +{ + int sock_fd = -1; + struct sockaddr_un sock_addr; + int wait_loop = 0; - sock_fd = socket(PF_UNIX, SOCK_STREAM, 0); - if (sock_fd < 0) { - printf("%s: socket create failed.\n", __func__); - return -1; - } - - if (verbose) printf("%s: send_fd: sock_fd: %d\n", __func__, sock_fd); - - memset(&sock_addr, 0, sizeof(struct sockaddr_un)); - sock_addr.sun_family = AF_UNIX; - strncpy(sock_addr.sun_path, socket_name, sizeof(sock_addr.sun_path) - 1); - - while (connect(sock_fd, (const struct sockaddr *)&sock_addr, - sizeof(struct sockaddr_un))) { - if (wait_loop < 60) { - if (!wait_loop) - printf("Waiting for EGL stream producer "); - else - printf("."); - fflush(stdout); - sleep(1); - wait_loop++; - } else { - printf("\n%s: Waiting timed out\n", __func__); - return -1; + sock_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (sock_fd < 0) { + printf("%s: socket create failed.\n", __func__); + return -1; } - } - if (wait_loop) printf("\n"); - if (verbose) printf("%s: Wait is done\n", __func__); + if (verbose) + printf("%s: send_fd: sock_fd: %d\n", __func__, sock_fd); - return sock_fd; + memset(&sock_addr, 0, sizeof(struct sockaddr_un)); + sock_addr.sun_family = AF_UNIX; + strncpy(sock_addr.sun_path, socket_name, sizeof(sock_addr.sun_path) - 1); + + while (connect(sock_fd, (const struct sockaddr *)&sock_addr, sizeof(struct sockaddr_un))) { + if (wait_loop < 60) { + if (!wait_loop) + printf("Waiting for EGL stream producer "); + else + printf("."); + fflush(stdout); + sleep(1); + wait_loop++; + } + else { + printf("\n%s: Waiting timed out\n", __func__); + return -1; + } + } + if (wait_loop) + printf("\n"); + + if (verbose) + printf("%s: Wait is done\n", __func__); + + return sock_fd; } /* Send (a file descriptor) to another process */ /* over a unix domain socket named . */ /* can be any nonexistant filename. 
 */ -int EGLStreamSendfd(int send_fd, int fd_to_send) { - struct msghdr msg; - struct iovec iov[1]; - char ctrl_buf[CMSG_SPACE(sizeof(int))]; - struct cmsghdr *cmsg = NULL; - void *data; - int res; - memset(&msg, 0, sizeof(msg)); +int EGLStreamSendfd(int send_fd, int fd_to_send) +{ + struct msghdr msg; + struct iovec iov[1]; + char ctrl_buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr *cmsg = NULL; + void *data; + int res; + memset(&msg, 0, sizeof(msg)); - iov[0].iov_len = 1; // must send at least 1 byte - iov[0].iov_base = (void *)"x"; // any byte value (value ignored) - msg.msg_iov = iov; - msg.msg_iovlen = 1; + iov[0].iov_len = 1; // must send at least 1 byte + iov[0].iov_base = (void *)"x"; // any byte value (value ignored) + msg.msg_iov = iov; + msg.msg_iovlen = 1; - memset(ctrl_buf, 0, sizeof(ctrl_buf)); - msg.msg_control = ctrl_buf; - msg.msg_controllen = sizeof(ctrl_buf); + memset(ctrl_buf, 0, sizeof(ctrl_buf)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = sizeof(ctrl_buf); - cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - data = CMSG_DATA(cmsg); - *(int *)data = fd_to_send; + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + data = CMSG_DATA(cmsg); + *(int *)data = fd_to_send; - msg.msg_controllen = cmsg->cmsg_len; + msg.msg_controllen = cmsg->cmsg_len; - res = sendmsg(send_fd, &msg, 0); - if (res <= 0) { - printf("%s: sendmsg failed", __func__); - return -1; - } + res = sendmsg(send_fd, &msg, 0); + if (res <= 0) { + printf("%s: sendmsg failed", __func__); + return -1; + } - return 0; + return 0; }
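For reference, EGLStreamSendfd() above and EGLStreamReceivefd() below use the standard SCM_RIGHTS ancillary-data mechanism: the kernel duplicates the descriptor into the receiving process, so the two processes hold different integer values that refer to the same open file. A minimal usage sketch of these helpers, mirroring how main.cpp wires them up (illustrative variable names display/stream; error handling elided):

    /* Consumer process: connect to the producer's socket and hand the
     * EGLStream file descriptor across. */
    int sock = UnixSocketConnect(SOCK_PATH);
    EGLNativeFileDescriptorKHR fd = eglGetStreamFileDescriptorKHR(display, stream);
    EGLStreamSendfd(sock, fd);

    /* Producer process: listen on the socket, receive the descriptor,
     * and rebuild a stream handle from it. */
    int conn = UnixSocketCreate(SOCK_PATH);
    int recv_fd = EGLStreamReceivefd(conn);
    EGLStreamKHR prodStream = eglCreateStreamFromFileDescriptorKHR(display, recv_fd);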
/* Listen on a unix domain socket named <socket_name>. */ /* Connect to it and return connect_fd */ -int UnixSocketCreate(const char *socket_name) { - int listen_fd; - struct sockaddr_un sock_addr; - int connect_fd; - struct sockaddr_un connect_addr; - socklen_t connect_addr_len = 0; +int UnixSocketCreate(const char *socket_name) +{ + int listen_fd; + struct sockaddr_un sock_addr; + int connect_fd; + struct sockaddr_un connect_addr; + socklen_t connect_addr_len = 0; - listen_fd = socket(PF_UNIX, SOCK_STREAM, 0); - if (listen_fd < 0) { - printf("%s: socket create failed", __func__); - return -1; - } + listen_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (listen_fd < 0) { + printf("%s: socket create failed", __func__); + return -1; + } - if (verbose) printf("%s: listen_fd: %d\n", __func__, listen_fd); + if (verbose) + printf("%s: listen_fd: %d\n", __func__, listen_fd); - unlink(socket_name); + unlink(socket_name); - memset(&sock_addr, 0, sizeof(struct sockaddr_un)); - sock_addr.sun_family = AF_UNIX; - strncpy(sock_addr.sun_path, socket_name, sizeof(sock_addr.sun_path) - 1); + memset(&sock_addr, 0, sizeof(struct sockaddr_un)); + sock_addr.sun_family = AF_UNIX; + strncpy(sock_addr.sun_path, socket_name, sizeof(sock_addr.sun_path) - 1); - if (bind(listen_fd, (const struct sockaddr *)&sock_addr, - sizeof(struct sockaddr_un))) { - printf("i%s: bind error", __func__); - return -1; - } + if (bind(listen_fd, (const struct sockaddr *)&sock_addr, sizeof(struct sockaddr_un))) { + printf("%s: bind error", __func__); + return -1; + } - if (listen(listen_fd, 1)) { - printf("%s: listen error", __func__); - return -1; - } + if (listen(listen_fd, 1)) { + printf("%s: listen error", __func__); + return -1; + } - connect_fd = - accept(listen_fd, (struct sockaddr *)&connect_addr, &connect_addr_len); + connect_fd = accept(listen_fd, (struct sockaddr *)&connect_addr, &connect_addr_len); - if (verbose) printf("%s: connect_fd: %d\n", __func__, connect_fd); + if (verbose) + printf("%s: connect_fd: %d\n", __func__, connect_fd); - close(listen_fd); - unlink(socket_name); - if (connect_fd < 0) { - printf("%s: accept failed\n", __func__); - return -1; - } + close(listen_fd); + unlink(socket_name); + if (connect_fd < 0) { + printf("%s: accept failed\n", __func__); + return -1; + } - return connect_fd; + return connect_fd; } /* receive a file descriptor from another process. */ @@ -376,48 +376,49 @@ int UnixSocketCreate(const char *socket_name) { /* integer value in the other process, but the file */ /* descriptors in each process will refer to the same file */ /* object in the kernel. */ -int EGLStreamReceivefd(int connect_fd) { - struct msghdr msg; - struct iovec iov[1]; - char msg_buf[1]; - char ctrl_buf[CMSG_SPACE(sizeof(int))]; - struct cmsghdr *cmsg; - void *data; - int recvfd; +int EGLStreamReceivefd(int connect_fd) +{ + struct msghdr msg; + struct iovec iov[1]; + char msg_buf[1]; + char ctrl_buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr *cmsg; + void *data; + int recvfd; - memset(&msg, 0, sizeof(msg)); + memset(&msg, 0, sizeof(msg)); - iov[0].iov_base = msg_buf; - iov[0].iov_len = sizeof(msg_buf); - msg.msg_iov = iov; - msg.msg_iovlen = 1; + iov[0].iov_base = msg_buf; + iov[0].iov_len = sizeof(msg_buf); + msg.msg_iov = iov; + msg.msg_iovlen = 1; - msg.msg_control = ctrl_buf; - msg.msg_controllen = sizeof(ctrl_buf); + msg.msg_control = ctrl_buf; + msg.msg_controllen = sizeof(ctrl_buf); - if (recvmsg(connect_fd, &msg, 0) <= 0) { - printf("%s: recvmsg failed", __func__); - return -1; - } + if (recvmsg(connect_fd, &msg, 0) <= 0) { + printf("%s: recvmsg failed", __func__); + return -1; + } - cmsg = CMSG_FIRSTHDR(&msg); - if (!cmsg) { - printf("%s: NULL message header\n", __func__); - return -1; - } - if (cmsg->cmsg_level != SOL_SOCKET) { - printf("%s: Message level is not SOL_SOCKET\n", __func__); - return -1; - } - if (cmsg->cmsg_type != SCM_RIGHTS) { - printf("%s: Message type is not SCM_RIGHTS\n", __func__); - return -1; - } + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + printf("%s: NULL message header\n", __func__); + return -1; + } + if (cmsg->cmsg_level != SOL_SOCKET) { + printf("%s: Message level is not SOL_SOCKET\n", __func__); + return -1; + } + if (cmsg->cmsg_type != SCM_RIGHTS) { + printf("%s: Message type is not SCM_RIGHTS\n", __func__); + return -1; + } - data = CMSG_DATA(cmsg); - recvfd = *(int *)data; + data = CMSG_DATA(cmsg); + recvfd = *(int *)data; - return recvfd; + return recvfd; } #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.h index c1e4a5ec..46442d1b 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/eglstrm_common.h @@ -51,51 +51,49 @@ extern EGLStreamKHR g_producerEglStream; extern EGLStreamKHR g_consumerEglStream; -extern EGLDisplay g_producerEglDisplay; -extern EGLDisplay g_consumerEglDisplay; -extern int cudaDevIndexCons; -extern int cudaDevIndexProd; -extern bool verbose; +extern EGLDisplay g_producerEglDisplay; +extern EGLDisplay g_consumerEglDisplay; +extern int cudaDevIndexCons; +extern int cudaDevIndexProd; +extern bool verbose; -#define EXTENSION_LIST(T) \ - T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR) \ - T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR) \ - 
T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR) \ - T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR) \ - T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR) \ - T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR) \ - T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR) \ - T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR) \ - T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC, \ - eglStreamConsumerGLTextureExternalKHR) \ - T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT) \ - T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT) \ - T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \ - T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT) \ - T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC, \ - eglCreateStreamFromFileDescriptorKHR) +#define EXTENSION_LIST(T) \ + T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR) \ + T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR) \ + T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR) \ + T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR) \ + T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR) \ + T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR) \ + T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR) \ + T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR) \ + T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC, eglStreamConsumerGLTextureExternalKHR) \ + T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT) \ + T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT) \ + T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \ + T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT) \ + T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC, eglCreateStreamFromFileDescriptorKHR) -#define EXTLST_DECL(tx, x) tx x = NULL; +#define EXTLST_DECL(tx, x) tx x = NULL; #define EXTLST_EXTERN(tx, x) extern tx x; -#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&x, #x}, +#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&x, #x}, #define MAX_STRING_SIZE 256 -#define INIT_DATA 0x01 -#define PROD_DATA 0x07 -#define CONS_DATA 0x04 +#define INIT_DATA 0x01 +#define PROD_DATA 0x07 +#define CONS_DATA 0x04 #define SOCK_PATH "/tmp/tegra_sw_egl_socket" -typedef struct _TestArgs { - unsigned int charCnt; - bool isProducer; +typedef struct _TestArgs +{ + unsigned int charCnt; + bool isProducer; } TestArgs; extern int WIDTH, HEIGHT; -int eglSetupExtensions(bool is_dgpu); -int EGLStreamInit(bool isCrossDevice, int isConsumer, - EGLNativeFileDescriptorKHR fileDesc); +int eglSetupExtensions(bool is_dgpu); +int EGLStreamInit(bool isCrossDevice, int isConsumer, EGLNativeFileDescriptorKHR fileDesc); void EGLStreamFini(void); int EGLStreamSetAttr(EGLDisplay display, EGLStreamKHR eglStream); @@ -104,10 +102,8 @@ int EGLStreamSendfd(int send_fd, int fd_to_send); int UnixSocketCreate(const char *socket_name); int EGLStreamReceivefd(int connect_fd); -static clockid_t clock_id = CLOCK_MONOTONIC; // CLOCK_PROCESS_CPUTIME_ID; -static double getMicrosecond(struct timespec t) { - return ((t.tv_sec) * 1000000.0 + (t.tv_nsec) / 1.0e3); -} +static clockid_t clock_id = CLOCK_MONOTONIC; // CLOCK_PROCESS_CPUTIME_ID; +static double getMicrosecond(struct timespec t) { return ((t.tv_sec) * 1000000.0 + (t.tv_nsec) / 1.0e3); } static inline void getTime(struct timespec *t) { clock_gettime(clock_id, t); } #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/helper.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/helper.h index 373d82f3..0b056c60 100644 --- 
a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/helper.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/helper.h @@ -31,191 +31,198 @@ EXTENSION_LIST(EXTLST_EXTERN) #endif #include -int parseCmdLine(int argc, char *argv[], TestArgs *args); +int parseCmdLine(int argc, char *argv[], TestArgs *args); void printUsage(void); -int NUMTRIALS = 10; -int profileAPIs = 0; +int NUMTRIALS = 10; +int profileAPIs = 0; -bool verbose = 0; +bool verbose = 0; bool isCrossDevice = 0; // Parse the command line options. Returns FAILURE on a parse error, SUCCESS // otherwise. -int parseCmdLine(int argc, char *argv[], TestArgs *args) { - int i; +int parseCmdLine(int argc, char *argv[], TestArgs *args) +{ + int i; - for (i = 1; i < argc; i++) { - if (strcmp(argv[i], "-h") == 0) { - printUsage(); - exit(0); - } else if (strcmp(argv[i], "-n") == 0) { - ++i; - if (sscanf(argv[i], "%d", &NUMTRIALS) != 1 || NUMTRIALS <= 0) { - printf("Invalid trial count: %s should be > 0\n", argv[i]); - return -1; - } - } else if (strcmp(argv[i], "-profile") == 0) { - profileAPIs = 1; - } else if (strcmp(argv[i], "-crossdev") == 0) { - isCrossDevice = 1; - } else if (strcmp(argv[i], "-width") == 0) { - ++i; - if (sscanf(argv[i], "%d", &WIDTH) != 1 || (WIDTH <= 0)) { - printf("Width should be greater than 0\n"); - return -1; - } - } else if (strcmp(argv[i], "-height") == 0) { - ++i; - if (sscanf(argv[i], "%d", &HEIGHT) != 1 || (HEIGHT <= 0)) { - printf("Width should be greater than 0\n"); - return -1; - } - } else if (0 == strcmp(&argv[i][1], "proctype")) { - ++i; - if (!strcasecmp(argv[i], "prod")) { - args->isProducer = 1; - } else if (!strcasecmp(argv[i], "cons")) { - args->isProducer = 0; - } else { - printf("%s: Bad Process Type: %s\n", __func__, argv[i]); - return 1; - } - } else if (strcmp(argv[i], "-v") == 0) { - verbose = 1; - } else { - printf("Unknown option: %s\n", argv[i]); - return -1; - } - } - - if (isCrossDevice) { - int deviceCount = 0; - - CUresult error_id = cuInit(0); - if (error_id != CUDA_SUCCESS) { - printf("cuInit(0) returned %d\n", error_id); - printf("Result = FAIL\n"); - exit(EXIT_FAILURE); + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + printUsage(); + exit(0); + } + else if (strcmp(argv[i], "-n") == 0) { + ++i; + if (sscanf(argv[i], "%d", &NUMTRIALS) != 1 || NUMTRIALS <= 0) { + printf("Invalid trial count: %s should be > 0\n", argv[i]); + return -1; + } + } + else if (strcmp(argv[i], "-profile") == 0) { + profileAPIs = 1; + } + else if (strcmp(argv[i], "-crossdev") == 0) { + isCrossDevice = 1; + } + else if (strcmp(argv[i], "-width") == 0) { + ++i; + if (sscanf(argv[i], "%d", &WIDTH) != 1 || (WIDTH <= 0)) { + printf("Width should be greater than 0\n"); + return -1; + } + } + else if (strcmp(argv[i], "-height") == 0) { + ++i; + if (sscanf(argv[i], "%d", &HEIGHT) != 1 || (HEIGHT <= 0)) { + printf("Height should be greater than 0\n"); + return -1; + } + } + else if (0 == strcmp(&argv[i][1], "proctype")) { + ++i; + if (!strcasecmp(argv[i], "prod")) { + args->isProducer = 1; + } + else if (!strcasecmp(argv[i], "cons")) { + args->isProducer = 0; + } + else { + printf("%s: Bad Process Type: %s\n", __func__, argv[i]); + return 1; + } + } + else if (strcmp(argv[i], "-v") == 0) { + verbose = 1; + } + else { + printf("Unknown option: %s\n", argv[i]); + return -1; + } } - error_id = cuDeviceGetCount(&deviceCount); - if (error_id != CUDA_SUCCESS) { - printf("cuDeviceGetCount returned %d\n", (int)error_id); - printf("Result = FAIL\n"); - exit(EXIT_FAILURE);
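The -crossdev branch that follows keys off CU_DEVICE_ATTRIBUTE_INTEGRATED to decide whether the system has an iGPU at all. Condensed to its essentials, the probe amounts to the sketch below (hypothetical helper name; assumes cuInit(0) already succeeded, error handling elided):

    // Returns 1 if any CUDA device reports itself as integrated (iGPU).
    static int systemHasIntegratedGpu(void)
    {
        int count = 0;
        cuDeviceGetCount(&count);
        for (CUdevice dev = 0; dev < count; ++dev) {
            int integrated = 0;
            cuDeviceGetAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
            if (integrated)
                return 1;
        }
        return 0;
    }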
+ if (isCrossDevice) { + int deviceCount = 0; + + CUresult error_id = cuInit(0); + if (error_id != CUDA_SUCCESS) { + printf("cuInit(0) returned %d\n", error_id); + printf("Result = FAIL\n"); + exit(EXIT_FAILURE); + } + + error_id = cuDeviceGetCount(&deviceCount); + if (error_id != CUDA_SUCCESS) { + printf("cuDeviceGetCount returned %d\n", (int)error_id); + printf("Result = FAIL\n"); + exit(EXIT_FAILURE); + } + + int iGPUexists = 0; + CUdevice dev; + for (dev = 0; dev < deviceCount; ++dev) { + int integrated = 0; + CUresult error_result = cuDeviceGetAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); + + if (error_result != CUDA_SUCCESS) { + printf("cuDeviceGetAttribute returned error : %d\n", (int)error_result); + exit(EXIT_FAILURE); + } + + if (integrated) { + iGPUexists = 1; + } + } + + if (!iGPUexists) { + printf("No Integrated GPU found in the system.\n"); + printf("-crossdev option is only supported on systems with an Integrated " "GPU and a Discrete GPU\n"); + printf("Waiving the execution\n"); + exit(EXIT_SUCCESS); + } } - int iGPUexists = 0; - CUdevice dev; - for (dev = 0; dev < deviceCount; ++dev) { - int integrated = 0; - CUresult error_result = cuDeviceGetAttribute( - &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); - - if (error_result != CUDA_SUCCESS) { - printf("cuDeviceGetAttribute returned error : %d\n", (int)error_result); + if (!eglSetupExtensions(isCrossDevice)) { + printf("SetupExtensions failed \n"); exit(EXIT_FAILURE); - } - - if (integrated) { - iGPUexists = 1; - } } - - if (!iGPUexists) { - printf("No Integrated GPU found in the system.\n"); - printf( - "-crossdev option is only supported on systems with an Integrated " - "GPU and a Discrete GPU\n"); - printf("Waiving the execution\n"); - exit(EXIT_SUCCESS); - } - } - - if (!eglSetupExtensions(isCrossDevice)) { - printf("SetupExtentions failed \n"); - exit(EXIT_FAILURE); - } #define MAX_EGL_DEVICES 4 - EGLDeviceEXT devices[MAX_EGL_DEVICES]; - EGLint numDevices = 0; - EGLBoolean eglStatus = - eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices); - if (eglStatus != EGL_TRUE) { - printf("Error querying EGL devices\n"); - exit(EXIT_FAILURE); - } - - if (numDevices == 0) { - printf("No EGL devices found\n"); - eglStatus = EGL_FALSE; - exit(2); // EXIT_WAIVED - } - - int egl_device_id = 0; - for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { - EGLAttrib cuda_device; - eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], - EGL_CUDA_DEVICE_NV, &cuda_device); - if (eglStatus == EGL_TRUE) { - break; + EGLDeviceEXT devices[MAX_EGL_DEVICES]; + EGLint numDevices = 0; + EGLBoolean eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices); + if (eglStatus != EGL_TRUE) { + printf("Error querying EGL devices\n"); + exit(EXIT_FAILURE); } - } - if (egl_device_id >= numDevices) { - printf("No CUDA Capable EGL Device found.. Waiving execution\n"); - exit(2); // EXIT_WAIVED - } - - if (isCrossDevice) { - if (numDevices == 1) { - printf( - "Found only one EGL device, cannot setup cross GPU streams. 
" - "Waiving\n"); - eglStatus = EGL_FALSE; - exit(2); // EXIT_WAIVED + if (numDevices == 0) { + printf("No EGL devices found\n"); + eglStatus = EGL_FALSE; + exit(2); // EXIT_WAIVED } - } - return 0; + int egl_device_id = 0; + for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { + EGLAttrib cuda_device; + eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], EGL_CUDA_DEVICE_NV, &cuda_device); + if (eglStatus == EGL_TRUE) { + break; + } + } + + if (egl_device_id >= numDevices) { + printf("No CUDA Capable EGL Device found.. Waiving execution\n"); + exit(2); // EXIT_WAIVED + } + + if (isCrossDevice) { + if (numDevices == 1) { + printf("Found only one EGL device, cannot setup cross GPU streams. " + "Waiving\n"); + eglStatus = EGL_FALSE; + exit(2); // EXIT_WAIVED + } + } + + return 0; } -void launchProducer(TestArgs *args) { - /* Cross-process creation of producer */ - char argsProducer[1024]; - char str[256]; +void launchProducer(TestArgs *args) +{ + /* Cross-process creation of producer */ + char argsProducer[1024]; + char str[256]; - strcpy(argsProducer, "./EGLStream_CUDA_CrossGPU -proctype prod "); + strcpy(argsProducer, "./EGLStream_CUDA_CrossGPU -proctype prod "); - if (isCrossDevice) { - sprintf(str, "-crossdev "); - strcat(argsProducer, str); - } + if (isCrossDevice) { + sprintf(str, "-crossdev "); + strcat(argsProducer, str); + } - if (verbose) { - sprintf(str, "-v "); - strcat(argsProducer, str); - } + if (verbose) { + sprintf(str, "-v "); + strcat(argsProducer, str); + } - /*Make the process run in bg*/ - strcat(argsProducer, "& "); + /*Make the process run in bg*/ + strcat(argsProducer, "& "); - printf("\n%s: Crossproc Producer command: %s \n", __func__, argsProducer); + printf("\n%s: Crossproc Producer command: %s \n", __func__, argsProducer); - /*Create crossproc Producer*/ - system(argsProducer); + /*Create crossproc Producer*/ + system(argsProducer); - /*Enable crossproc Consumer in the same process */ - args->isProducer = 0; + /*Enable crossproc Consumer in the same process */ + args->isProducer = 0; } -void printUsage(void) { - printf("Usage:\n"); - printf(" -h Print this help message\n"); - printf(" -n n Exit after running n trials. Set to 10 by default\n"); - printf( - " -profile Profile time taken by ReleaseAPI. Not set by default\n"); - printf(" -crossdev Run with producer on idgpu and consumer on dgpu\n"); - printf(" -dgpu (same as -crossdev, deprecated)\n"); - printf(" -v verbose output\n"); +void printUsage(void) +{ + printf("Usage:\n"); + printf(" -h Print this help message\n"); + printf(" -n n Exit after running n trials. Set to 10 by default\n"); + printf(" -profile Profile time taken by ReleaseAPI. 
Not set by default\n"); + printf(" -crossdev Run with producer on igpu and consumer on dgpu\n"); + printf(" -dgpu (same as -crossdev, deprecated)\n"); + printf(" -v verbose output\n"); } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/kernel.cu b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/kernel.cu index cc44e123..e13ab264 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/kernel.cu +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/kernel.cu @@ -41,100 +41,110 @@ extern bool isCrossDevice; __device__ static unsigned int numErrors = 0, errorFound = 0; -__device__ void checkProducerDataGPU(char *data, int size, char expectedVal, - int frameNumber) { - if ((data[blockDim.x * blockIdx.x + threadIdx.x] != expectedVal) && - (!errorFound)) { - printf("Producer FOUND:%d expected: %d at %d for trial %d %d\n", - data[blockDim.x * blockIdx.x + threadIdx.x], expectedVal, - (blockDim.x * blockIdx.x + threadIdx.x), frameNumber, numErrors); - numErrors++; - errorFound = 1; - return; - } +__device__ void checkProducerDataGPU(char *data, int size, char expectedVal, int frameNumber) +{ + if ((data[blockDim.x * blockIdx.x + threadIdx.x] != expectedVal) && (!errorFound)) { + printf("Producer FOUND:%d expected: %d at %d for trial %d %d\n", + data[blockDim.x * blockIdx.x + threadIdx.x], + expectedVal, + (blockDim.x * blockIdx.x + threadIdx.x), + frameNumber, + numErrors); + numErrors++; + errorFound = 1; + return; + } } -__device__ void checkConsumerDataGPU(char *data, int size, char expectedVal, - int frameNumber) { - if ((data[blockDim.x * blockIdx.x + threadIdx.x] != expectedVal) && - (!errorFound)) { - printf("Consumer FOUND:%d expected: %d at %d for trial %d %d\n", - data[blockDim.x * blockIdx.x + threadIdx.x], expectedVal, - (blockDim.x * blockIdx.x + threadIdx.x), frameNumber, numErrors); - numErrors++; - errorFound = 1; - return; - } +__device__ void checkConsumerDataGPU(char *data, int size, char expectedVal, int frameNumber) +{ + if ((data[blockDim.x * blockIdx.x + threadIdx.x] != expectedVal) && (!errorFound)) { + printf("Consumer FOUND:%d expected: %d at %d for trial %d %d\n", + data[blockDim.x * blockIdx.x + threadIdx.x], + expectedVal, + (blockDim.x * blockIdx.x + threadIdx.x), + frameNumber, + numErrors); + numErrors++; + errorFound = 1; + return; + } } -__global__ void writeDataToBuffer(char *pSrc, char newVal) { - pSrc[blockDim.x * blockIdx.x + threadIdx.x] = newVal; +__global__ void writeDataToBuffer(char *pSrc, char newVal) { pSrc[blockDim.x * blockIdx.x + threadIdx.x] = newVal; } + +__global__ void testKernelConsumer(char *pSrc, char size, char expectedVal, char newVal, int frameNumber) +{ + checkConsumerDataGPU(pSrc, size, expectedVal, frameNumber); } -__global__ void testKernelConsumer(char *pSrc, char size, char expectedVal, - char newVal, int frameNumber) { - checkConsumerDataGPU(pSrc, size, expectedVal, frameNumber); -} - -__global__ void testKernelProducer(char *pSrc, char size, char expectedVal, - char newVal, int frameNumber) { - checkProducerDataGPU(pSrc, size, expectedVal, frameNumber); +__global__ void testKernelProducer(char *pSrc, char size, char expectedVal, char newVal, int frameNumber) +{ + checkProducerDataGPU(pSrc, size, expectedVal, frameNumber); } __global__ void getNumErrors(int *numErr) { *numErr = numErrors; }
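Note that the checkers above increment the device globals numErrors and errorFound from many threads with no synchronization; errorFound merely throttles the printf, so the count read back through getNumErrors() is best-effort. Where an exact tally matters, the usual pattern is an atomic update, as in this sketch (hypothetical kernel, not part of the sample):

    __device__ unsigned int d_numErrors = 0;

    __global__ void checkBufferAtomic(const char *data, char expectedVal)
    {
        unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
        if (data[idx] != expectedVal)
            atomicAdd(&d_numErrors, 1u); // serialized update, exact count
    }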
-cudaError_t cudaProducer_filter(cudaStream_t pStream, char *pSrc, int width, - int height, char expectedVal, char newVal, - int frameNumber) { - // in case where consumer is on dgpu and producer is on igpu when return is - called the frame is not copied back to igpu. So the consumer changes is not - visible to producer - if (isCrossDevice == 0) { - testKernelProducer<<<(width * height) / 1024, 1024, 1, pStream>>>( +cudaError_t cudaProducer_filter(cudaStream_t pStream, + char *pSrc, + int width, + int height, + char expectedVal, + char newVal, + int frameNumber) +{ + // In the case where the consumer is on the dGPU and the producer on the iGPU, + // the frame is not copied back to the iGPU when return is called, so the + // consumer's changes are not visible to the producer. + if (isCrossDevice == 0) { + testKernelProducer<<<(width * height) / 1024, 1024, 1, pStream>>>( + pSrc, width * height, expectedVal, newVal, frameNumber); + } + writeDataToBuffer<<<(width * height) / 1024, 1024, 1, pStream>>>(pSrc, newVal); + return cudaSuccess; +}; + +cudaError_t cudaConsumer_filter(cudaStream_t cStream, + char *pSrc, + int width, + int height, + char expectedVal, + char newVal, + int frameNumber) +{ + testKernelConsumer<<<(width * height) / 1024, 1024, 1, cStream>>>( pSrc, width * height, expectedVal, newVal, frameNumber); - } - writeDataToBuffer<<<(width * height) / 1024, 1024, 1, pStream>>>(pSrc, - newVal); - return cudaSuccess; + writeDataToBuffer<<<(width * height) / 1024, 1024, 1, cStream>>>(pSrc, newVal); + return cudaSuccess; }; -cudaError_t cudaConsumer_filter(cudaStream_t cStream, char *pSrc, int width, - int height, char expectedVal, char newVal, - int frameNumber) { - testKernelConsumer<<<(width * height) / 1024, 1024, 1, cStream>>>( - pSrc, width * height, expectedVal, newVal, frameNumber); - writeDataToBuffer<<<(width * height) / 1024, 1024, 1, cStream>>>(pSrc, - newVal); - return cudaSuccess; -}; - -cudaError_t cudaGetValueMismatch() { - int numErr_h; - int *numErr_d = NULL; - cudaError_t err = cudaSuccess; - err = cudaMalloc(&numErr_d, sizeof(int)); - if (err != cudaSuccess) { - printf("Cuda Main: cudaMalloc failed with %s\n", cudaGetErrorString(err)); - return err; - } - getNumErrors<<<1, 1>>>(numErr_d); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) { - printf("Cuda Main: cudaDeviceSynchronize failed with %s\n", - cudaGetErrorString(err)); - } - err = cudaMemcpy(&numErr_h, numErr_d, sizeof(int), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err)); - cudaFree(numErr_d); - return err; - } - err = cudaFree(numErr_d); - if (err != cudaSuccess) { - printf("Cuda Main: cudaFree failed with %s\n", cudaGetErrorString(err)); - return err; - } - if (numErr_h > 0) { - return cudaErrorUnknown; - } - return cudaSuccess +cudaError_t cudaGetValueMismatch() +{ + int numErr_h; + int *numErr_d = NULL; + cudaError_t err = cudaSuccess; + err = cudaMalloc(&numErr_d, sizeof(int)); + if (err != cudaSuccess) { + printf("Cuda Main: cudaMalloc failed with %s\n", cudaGetErrorString(err)); + return err; + } + getNumErrors<<<1, 1>>>(numErr_d); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("Cuda Main: cudaDeviceSynchronize failed with %s\n", cudaGetErrorString(err)); + } + err = cudaMemcpy(&numErr_h, numErr_d, sizeof(int), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err)); + cudaFree(numErr_d); + return err; + } + err = cudaFree(numErr_d); + if (err != cudaSuccess) { + printf("Cuda Main: cudaFree failed with %s\n", cudaGetErrorString(err)); + return err; + } + if (numErr_h > 0) { + return cudaErrorUnknown; + } + return 
cudaSuccess; } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/main.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/main.cpp index e29bca5a..15350594 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/main.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/main.cpp @@ -34,359 +34,345 @@ EXTENSION_LIST(EXTLST_EXTERN) #endif -bool signal_stop = 0; +bool signal_stop = 0; extern bool verbose; -static void sig_handler(int sig) { - signal_stop = 1; - printf("Signal: %d\n", sig); +static void sig_handler(int sig) +{ + signal_stop = 1; + printf("Signal: %d\n", sig); } -void DoneCons(int consumerStatus, int send_fd) { - EGLStreamFini(); - // get the final status from producer, combine and print - int producerStatus = -1; - if (-1 == recv(send_fd, (void *)&producerStatus, sizeof(int), 0)) { - printf("%s: Cuda Consumer could not receive status from producer.\n", - __func__); - } - close(send_fd); +void DoneCons(int consumerStatus, int send_fd) +{ + EGLStreamFini(); + // get the final status from producer, combine and print + int producerStatus = -1; + if (-1 == recv(send_fd, (void *)&producerStatus, sizeof(int), 0)) { + printf("%s: Cuda Consumer could not receive status from producer.\n", __func__); + } + close(send_fd); - if (producerStatus == 0 && consumerStatus == 0) { - printf("&&&& EGLStream_CUDA_CrossGPU PASSED\n"); - exit(EXIT_SUCCESS); - } else { - printf("&&&& EGLStream_CUDA_CrossGPU FAILED\n"); - exit(EXIT_FAILURE); - } + if (producerStatus == 0 && consumerStatus == 0) { + printf("&&&& EGLStream_CUDA_CrossGPU PASSED\n"); + exit(EXIT_SUCCESS); + } + else { + printf("&&&& EGLStream_CUDA_CrossGPU FAILED\n"); + exit(EXIT_FAILURE); + } } -void DoneProd(int producerStatus, int connect_fd) { - EGLStreamFini(); - if (-1 == send(connect_fd, (void *)&producerStatus, sizeof(int), 0)) { - printf("%s: Cuda Producer could not send status to consumer.\n", __func__); - } - close(connect_fd); - if (producerStatus == 0) { - exit(EXIT_SUCCESS); - } else { - exit(EXIT_FAILURE); - } +void DoneProd(int producerStatus, int connect_fd) +{ + EGLStreamFini(); + if (-1 == send(connect_fd, (void *)&producerStatus, sizeof(int), 0)) { + printf("%s: Cuda Producer could not send status to consumer.\n", __func__); + } + close(connect_fd); + if (producerStatus == 0) { + exit(EXIT_SUCCESS); + } + else { + exit(EXIT_FAILURE); + } } int WIDTH = 8192, HEIGHT = 8192; -int main(int argc, char **argv) { - TestArgs args = {0, false}; - CUresult curesult = CUDA_SUCCESS; - unsigned int j = 0; - cudaError_t err = cudaSuccess; - EGLNativeFileDescriptorKHR fileDescriptor = EGL_NO_FILE_DESCRIPTOR_KHR; - struct timespec start, end; - CUeglFrame cudaEgl1, cudaEgl2; - int consumerStatus = 0; - int send_fd = -1; +int main(int argc, char **argv) +{ + TestArgs args = {0, false}; + CUresult curesult = CUDA_SUCCESS; + unsigned int j = 0; + cudaError_t err = cudaSuccess; + EGLNativeFileDescriptorKHR fileDescriptor = EGL_NO_FILE_DESCRIPTOR_KHR; + struct timespec start, end; + CUeglFrame cudaEgl1, cudaEgl2; + int consumerStatus = 0; + int send_fd = -1; - if (parseCmdLine(argc, argv, &args) < 0) { - printUsage(); - curesult = CUDA_ERROR_UNKNOWN; - DoneCons(consumerStatus, send_fd); - } - - printf("Width : %u, height: %u and iterations: %u\n", WIDTH, HEIGHT, - NUMTRIALS); - - if (!args.isProducer) // Consumer code - { - test_cuda_consumer_s cudaConsumer; - memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s)); - cudaConsumer.profileAPI = profileAPIs; - - // Hook up Ctrl-C 
handler - signal(SIGINT, sig_handler); - - if (!EGLStreamInit(isCrossDevice, !args.isProducer, - EGL_NO_FILE_DESCRIPTOR_KHR)) { - printf("EGLStream Init failed.\n"); - curesult = CUDA_ERROR_UNKNOWN; - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - cudaConsumer.cudaDevId = cudaDevIndexCons; - curesult = cudaDeviceCreateConsumer(&cudaConsumer); - if (curesult != CUDA_SUCCESS) { - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - cuCtxPushCurrent(cudaConsumer.context); - - launchProducer(&args); - - args.charCnt = WIDTH * HEIGHT * 4; - - curesult = cuda_consumer_init(&cudaConsumer, &args); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer: Init failed, status: %d\n", curesult); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - cuCtxPopCurrent(&cudaConsumer.context); - - send_fd = UnixSocketConnect(SOCK_PATH); - if (-1 == send_fd) { - printf("%s: Cuda Consumer cannot create socket %s\n", __func__, - SOCK_PATH); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - cuCtxPushCurrent(cudaConsumer.context); - cudaConsumer.eglStream = g_consumerEglStream; - cudaConsumer.eglDisplay = g_consumerEglDisplay; - - // Send the EGL stream FD to producer - fileDescriptor = eglGetStreamFileDescriptorKHR(cudaConsumer.eglDisplay, - cudaConsumer.eglStream); - if (EGL_NO_FILE_DESCRIPTOR_KHR == fileDescriptor) { - printf("%s: Cuda Consumer could not get EGL file descriptor.\n", - __func__); - eglDestroyStreamKHR(cudaConsumer.eglDisplay, cudaConsumer.eglStream); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - if (verbose) - printf("%s: Cuda Consumer EGL stream FD obtained : %d.\n", __func__, - fileDescriptor); - - int res = -1; - res = EGLStreamSendfd(send_fd, fileDescriptor); - if (-1 == res) { - printf("%s: Cuda Consumer could not send EGL file descriptor.\n", - __func__); - consumerStatus = -1; - close(fileDescriptor); - } - - if (CUDA_SUCCESS != - (curesult = cuEGLStreamConsumerConnect(&(cudaConsumer.cudaConn), - cudaConsumer.eglStream))) { - printf("FAILED Connect CUDA consumer with error %d\n", curesult); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - j = 0; - for (j = 0; j < NUMTRIALS; j++) { - curesult = cudaConsumerAcquireFrame(&cudaConsumer, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer Test failed for frame = %d\n", j + 1); - consumerStatus = -1; + if (parseCmdLine(argc, argv, &args) < 0) { + printUsage(); + curesult = CUDA_ERROR_UNKNOWN; DoneCons(consumerStatus, send_fd); - } - curesult = cudaConsumerReleaseFrame(&cudaConsumer, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer Test failed for frame = %d\n", j + 1); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - - curesult = cudaConsumerAcquireFrame(&cudaConsumer, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer Test failed for frame = %d\n", j + 1); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - curesult = cudaConsumerReleaseFrame(&cudaConsumer, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer Test failed for frame = %d\n", j + 1); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - } - cuCtxSynchronize(); - close(fileDescriptor); - err = cudaGetValueMismatch(); - if (err != cudaSuccess) { - printf("Consumer: App failed with value mismatch\n"); - curesult = CUDA_ERROR_UNKNOWN; - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); } - EGLint streamState = 0; - if (!eglQueryStreamKHR(cudaConsumer.eglDisplay, 
cudaConsumer.eglStream, - EGL_STREAM_STATE_KHR, &streamState)) { - printf("Main, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); - curesult = CUDA_ERROR_UNKNOWN; - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); + printf("Width : %u, height: %u and iterations: %u\n", WIDTH, HEIGHT, NUMTRIALS); + + if (!args.isProducer) // Consumer code + { + test_cuda_consumer_s cudaConsumer; + memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s)); + cudaConsumer.profileAPI = profileAPIs; + + // Hook up Ctrl-C handler + signal(SIGINT, sig_handler); + + if (!EGLStreamInit(isCrossDevice, !args.isProducer, EGL_NO_FILE_DESCRIPTOR_KHR)) { + printf("EGLStream Init failed.\n"); + curesult = CUDA_ERROR_UNKNOWN; + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + cudaConsumer.cudaDevId = cudaDevIndexCons; + curesult = cudaDeviceCreateConsumer(&cudaConsumer); + if (curesult != CUDA_SUCCESS) { + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + cuCtxPushCurrent(cudaConsumer.context); + + launchProducer(&args); + + args.charCnt = WIDTH * HEIGHT * 4; + + curesult = cuda_consumer_init(&cudaConsumer, &args); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer: Init failed, status: %d\n", curesult); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + cuCtxPopCurrent(&cudaConsumer.context); + + send_fd = UnixSocketConnect(SOCK_PATH); + if (-1 == send_fd) { + printf("%s: Cuda Consumer cannot create socket %s\n", __func__, SOCK_PATH); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + cuCtxPushCurrent(cudaConsumer.context); + cudaConsumer.eglStream = g_consumerEglStream; + cudaConsumer.eglDisplay = g_consumerEglDisplay; + + // Send the EGL stream FD to producer + fileDescriptor = eglGetStreamFileDescriptorKHR(cudaConsumer.eglDisplay, cudaConsumer.eglStream); + if (EGL_NO_FILE_DESCRIPTOR_KHR == fileDescriptor) { + printf("%s: Cuda Consumer could not get EGL file descriptor.\n", __func__); + eglDestroyStreamKHR(cudaConsumer.eglDisplay, cudaConsumer.eglStream); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + if (verbose) + printf("%s: Cuda Consumer EGL stream FD obtained : %d.\n", __func__, fileDescriptor); + + int res = -1; + res = EGLStreamSendfd(send_fd, fileDescriptor); + if (-1 == res) { + printf("%s: Cuda Consumer could not send EGL file descriptor.\n", __func__); + consumerStatus = -1; + close(fileDescriptor); + } + + if (CUDA_SUCCESS != (curesult = cuEGLStreamConsumerConnect(&(cudaConsumer.cudaConn), cudaConsumer.eglStream))) { + printf("FAILED Connect CUDA consumer with error %d\n", curesult); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + j = 0; + for (j = 0; j < NUMTRIALS; j++) { + curesult = cudaConsumerAcquireFrame(&cudaConsumer, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer Test failed for frame = %d\n", j + 1); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + curesult = cudaConsumerReleaseFrame(&cudaConsumer, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer Test failed for frame = %d\n", j + 1); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + curesult = cudaConsumerAcquireFrame(&cudaConsumer, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer Test failed for frame = %d\n", j + 1); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + curesult = cudaConsumerReleaseFrame(&cudaConsumer, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer Test failed for frame = %d\n", j + 1); + consumerStatus = 
-1; + DoneCons(consumerStatus, send_fd); + } + } + cuCtxSynchronize(); + close(fileDescriptor); + err = cudaGetValueMismatch(); + if (err != cudaSuccess) { + printf("Consumer: App failed with value mismatch\n"); + curesult = CUDA_ERROR_UNKNOWN; + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + EGLint streamState = 0; + if (!eglQueryStreamKHR(cudaConsumer.eglDisplay, cudaConsumer.eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("Main, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + curesult = CUDA_ERROR_UNKNOWN; + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + + if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { + if (CUDA_SUCCESS != (curesult = cuda_consumer_Deinit(&cudaConsumer))) { + printf("Consumer Disconnect FAILED.\n"); + consumerStatus = -1; + DoneCons(consumerStatus, send_fd); + } + } + } + else // Producer + { + test_cuda_producer_s cudaProducer; + memset(&cudaProducer, 0, sizeof(test_cuda_producer_s)); + cudaProducer.profileAPI = profileAPIs; + int producerStatus = 0; + + setenv("CUDA_EGL_PRODUCER_RETURN_WAIT_TIMEOUT", "1600", 0); + + int connect_fd = -1; + // Hook up Ctrl-C handler + signal(SIGINT, sig_handler); + + // Create connection to Consumer + connect_fd = UnixSocketCreate(SOCK_PATH); + if (-1 == connect_fd) { + printf("%s: Cuda Producer could not create socket: %s.\n", __func__, SOCK_PATH); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + // Get the file descriptor of the stream from the consumer process + // and re-create the EGL stream from it + fileDescriptor = EGLStreamReceivefd(connect_fd); + if (-1 == fileDescriptor) { + printf("%s: Cuda Producer could not receive EGL file descriptor \n", __func__); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + if (!EGLStreamInit(isCrossDevice, 0, fileDescriptor)) { + printf("EGLStream Init failed.\n"); + producerStatus = -1; + curesult = CUDA_ERROR_UNKNOWN; + DoneProd(producerStatus, connect_fd); + } + + cudaProducer.eglDisplay = g_producerEglDisplay; + cudaProducer.eglStream = g_producerEglStream; + cudaProducer.cudaDevId = cudaDevIndexProd; + + curesult = cudaDeviceCreateProducer(&cudaProducer); + if (curesult != CUDA_SUCCESS) { + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + args.charCnt = WIDTH * HEIGHT * 4; + cuCtxPushCurrent(cudaProducer.context); + curesult = cudaProducerInit(&cudaProducer, &args); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer: Init failed, status: %d\n", curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + // wait for consumer to connect first + int err = 0; + int wait_loop = 0; + EGLint streamState = 0; + do { + err = + eglQueryStreamKHR(cudaProducer.eglDisplay, cudaProducer.eglStream, EGL_STREAM_STATE_KHR, &streamState); + if ((0 != err) && (EGL_STREAM_STATE_CONNECTING_KHR != streamState)) { + sleep(1); + wait_loop++; + } + } while ((wait_loop < 10) && (0 != err) && (streamState != EGL_STREAM_STATE_CONNECTING_KHR)); + + if ((0 == err) || (wait_loop >= 10)) { + printf("%s: Cuda Producer eglQueryStreamKHR EGL_STREAM_STATE_KHR failed.\n", __func__); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + if (CUDA_SUCCESS + != (curesult = + cuEGLStreamProducerConnect(&(cudaProducer.cudaConn), cudaProducer.eglStream, WIDTH, HEIGHT))) { + printf("Connect CUDA producer FAILED with error %d\n", curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + printf("main - Cuda Producer and Consumer Initialized.\n"); 
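The trial loop that follows drives the EGLStream producer protocol with two frames in flight: each CUeglFrame is handed to the consumer by a present call and reclaimed for reuse by a return call. Stripped down to the underlying driver API (the sample wraps these in cudaProducerPresentFrame()/cudaProducerReturnFrame()), one iteration looks roughly like this sketch, with conn standing in for the CUeglStreamConnection and error handling elided:

    CUeglFrame frame; /* filled with the producer's buffer */
    cuEGLStreamProducerPresentFrame(&conn, frame, NULL); /* hand frame to consumer */
    cuEGLStreamProducerReturnFrame(&conn, &frame, NULL); /* block until it is released */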
+ + cudaProducerPrepareFrame(&cudaEgl1, cudaProducer.cudaPtr, args.charCnt); + cudaProducerPrepareFrame(&cudaEgl2, cudaProducer.cudaPtr1, args.charCnt); + + j = 0; + for (j = 0; j < NUMTRIALS; j++) { + curesult = cudaProducerPresentFrame(&cudaProducer, cudaEgl1, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", j + 1, curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + curesult = cudaProducerPresentFrame(&cudaProducer, cudaEgl2, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", j + 1, curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + curesult = cudaProducerReturnFrame(&cudaProducer, cudaEgl1, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", j + 1, curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + curesult = cudaProducerReturnFrame(&cudaProducer, cudaEgl2, j); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", j + 1, curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + } + + cuCtxSynchronize(); + err = cudaGetValueMismatch(); + if (err != cudaSuccess) { + printf("Prod: App failed with value mismatch\n"); + curesult = CUDA_ERROR_UNKNOWN; + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + printf("Tear Down Start.....\n"); + if (!eglQueryStreamKHR(cudaProducer.eglDisplay, cudaProducer.eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("Main, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + curesult = CUDA_ERROR_UNKNOWN; + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + + if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { + if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) { + printf("Producer Disconnect FAILED with %d\n", curesult); + producerStatus = -1; + DoneProd(producerStatus, connect_fd); + } + } + unsetenv("CUDA_EGL_PRODUCER_RETURN_WAIT_TIMEOUT"); } - if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { - if (CUDA_SUCCESS != (curesult = cuda_consumer_Deinit(&cudaConsumer))) { - printf("Consumer Disconnect FAILED.\n"); - consumerStatus = -1; - DoneCons(consumerStatus, send_fd); - } - } - } else // Producer - { - test_cuda_producer_s cudaProducer; - memset(&cudaProducer, 0, sizeof(test_cuda_producer_s)); - cudaProducer.profileAPI = profileAPIs; - int producerStatus = 0; - - setenv("CUDA_EGL_PRODUCER_RETURN_WAIT_TIMEOUT", "1600", 0); - - int connect_fd = -1; - // Hook up Ctrl-C handler - signal(SIGINT, sig_handler); - - // Create connection to Consumer - connect_fd = UnixSocketCreate(SOCK_PATH); - if (-1 == connect_fd) { - printf("%s: Cuda Producer could not create socket: %s.\n", __func__, - SOCK_PATH); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - // Get the file descriptor of the stream from the consumer process - // and re-create the EGL stream from it - fileDescriptor = EGLStreamReceivefd(connect_fd); - if (-1 == fileDescriptor) { - printf("%s: Cuda Producer could not receive EGL file descriptor \n", - __func__); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - if (!EGLStreamInit(isCrossDevice, 0, fileDescriptor)) { - printf("EGLStream Init failed.\n"); - producerStatus = -1; - curesult = CUDA_ERROR_UNKNOWN; - DoneProd(producerStatus, connect_fd); - } - - cudaProducer.eglDisplay = g_producerEglDisplay; - 
cudaProducer.eglStream = g_producerEglStream; - cudaProducer.cudaDevId = cudaDevIndexProd; - - curesult = cudaDeviceCreateProducer(&cudaProducer); - if (curesult != CUDA_SUCCESS) { - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - args.charCnt = WIDTH * HEIGHT * 4; - cuCtxPushCurrent(cudaProducer.context); - curesult = cudaProducerInit(&cudaProducer, &args); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer: Init failed, status: %d\n", curesult); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - // wait for consumer to connect first - int err = 0; - int wait_loop = 0; - EGLint streamState = 0; - do { - err = eglQueryStreamKHR(cudaProducer.eglDisplay, cudaProducer.eglStream, - EGL_STREAM_STATE_KHR, &streamState); - if ((0 != err) && (EGL_STREAM_STATE_CONNECTING_KHR != streamState)) { - sleep(1); - wait_loop++; - } - } while ((wait_loop < 10) && (0 != err) && - (streamState != EGL_STREAM_STATE_CONNECTING_KHR)); - - if ((0 == err) || (wait_loop >= 10)) { - printf( - "%s: Cuda Producer eglQueryStreamKHR EGL_STREAM_STATE_KHR failed.\n", - __func__); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - if (CUDA_SUCCESS != (curesult = cuEGLStreamProducerConnect( - &(cudaProducer.cudaConn), cudaProducer.eglStream, - WIDTH, HEIGHT))) { - printf("Connect CUDA producer FAILED with error %d\n", curesult); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - printf("main - Cuda Producer and Consumer Initialized.\n"); - - cudaProducerPrepareFrame(&cudaEgl1, cudaProducer.cudaPtr, args.charCnt); - cudaProducerPrepareFrame(&cudaEgl2, cudaProducer.cudaPtr1, args.charCnt); - - j = 0; - for (j = 0; j < NUMTRIALS; j++) { - curesult = cudaProducerPresentFrame(&cudaProducer, cudaEgl1, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", - j + 1, curesult); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - curesult = cudaProducerPresentFrame(&cudaProducer, cudaEgl2, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", - j + 1, curesult); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - curesult = cudaProducerReturnFrame(&cudaProducer, cudaEgl1, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", - j + 1, curesult); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - curesult = cudaProducerReturnFrame(&cudaProducer, cudaEgl2, j); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n", - j + 1, curesult); - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - } - - cuCtxSynchronize(); - err = cudaGetValueMismatch(); - if (err != cudaSuccess) { - printf("Prod: App failed with value mismatch\n"); - curesult = CUDA_ERROR_UNKNOWN; - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - printf("Tear Down Start.....\n"); - if (!eglQueryStreamKHR(cudaProducer.eglDisplay, cudaProducer.eglStream, - EGL_STREAM_STATE_KHR, &streamState)) { - printf("Main, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); - curesult = CUDA_ERROR_UNKNOWN; - producerStatus = -1; - DoneProd(producerStatus, connect_fd); - } - - if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { - if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) { - printf("Producer Disconnect FAILED with %d\n", curesult); - producerStatus = -1; - 
DoneProd(producerStatus, connect_fd); - } - } - unsetenv("CUDA_EGL_PRODUCER_RETURN_WAIT_TIMEOUT"); - } - - return 0; + return 0; } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md index 2cc244e5..099f3d53 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md @@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp index c3bda418..358b8a16 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp @@ -30,7 +30,9 @@ // #include "cuda_consumer.h" + #include + #include "eglstrm_common.h" #if defined(EXTENSION_LIST) @@ -39,280 +41,288 @@ EXTENSION_LIST(EXTLST_EXTERN) int checkbuf(FILE *fp1, FILE *fp2); -CUresult cudaConsumerTest(test_cuda_consumer_s *data, const char *fileName) { - CUresult cuStatus = CUDA_SUCCESS; - CUarray cudaArr = NULL; - CUeglFrame cudaEgl; - CUgraphicsResource cudaResource; - unsigned int i; - int check_result; - FILE *pInFile1 = NULL, *pInFile2 = NULL, *file_p = NULL; - EGLint streamState = 0; +CUresult cudaConsumerTest(test_cuda_consumer_s *data, const char *fileName) +{ + CUresult cuStatus = CUDA_SUCCESS; + CUarray cudaArr = NULL; + CUeglFrame cudaEgl; + CUgraphicsResource cudaResource; + unsigned int i; + int check_result; + FILE *pInFile1 = NULL, *pInFile2 = NULL, *file_p = NULL; + EGLint streamState = 0; - if (!data) { - printf("%s: Bad parameter\n", __func__); - goto done; - } - - if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, - &streamState)) { - printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); - } - if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) { - printf("CUDA Consumer: - EGL_STREAM_STATE_DISCONNECTED_KHR received\n"); - } - - if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) { - cuStatus = cuEGLStreamConsumerAcquireFrame(&(data->cudaConn), &cudaResource, - NULL, 16000); - - if (cuStatus == CUDA_SUCCESS) { - CUdeviceptr pDevPtr = 0; - int bufferSize; - unsigned char *pCudaCopyMem = NULL; - unsigned int copyWidthInBytes = 0, copyHeight = 0; - - file_p = fopen(fileName, "wb+"); - if (!file_p) { - printf("WriteFrame: file open failed %s\n", fileName); - cuStatus = CUDA_ERROR_UNKNOWN; + if (!data) { + printf("%s: Bad parameter\n", __func__); goto done; - } - cuStatus = - cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0); - if (cuStatus != CUDA_SUCCESS) { - printf("Cuda get resource failed with %d\n", cuStatus); - goto done; - } - cuStatus = cuCtxSynchronize(); - if (cuStatus != CUDA_SUCCESS) { - printf("cuCtxSynchronize failed \n"); - goto done; - } - if (!(cudaEgl.planeCount >= 1 && cudaEgl.planeCount <= 3)) { - printf("Plane count is invalid\nExiting\n"); - goto done; - } - - for (i = 0; i < cudaEgl.planeCount; i++) { - if (cudaEgl.frameType == CU_EGL_FRAME_TYPE_PITCH) { - pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[i]; - if (cudaEgl.planeCount == 1) { - bufferSize = cudaEgl.pitch * cudaEgl.height; - copyWidthInBytes = cudaEgl.pitch; - copyHeight = 
data->height; - } else if (i == 1 && cudaEgl.planeCount == 2) { // YUV 420 - // semi-planar - bufferSize = cudaEgl.pitch * cudaEgl.height / 2; - copyWidthInBytes = cudaEgl.pitch; - copyHeight = data->height / 2; - } else { - bufferSize = data->width * data->height; - copyWidthInBytes = data->width; - copyHeight = data->height; - if (i > 0) { - bufferSize >>= 2; - copyWidthInBytes >>= 1; - copyHeight >>= 1; - } - } - } else { - cudaArr = cudaEgl.frame.pArray[i]; - if (cudaEgl.planeCount == 1) { - bufferSize = data->width * data->height * 4; - copyWidthInBytes = data->width * 4; - copyHeight = data->height; - } else if (i == 1 && cudaEgl.planeCount == 2) { // YUV 420 - // semi-planar - bufferSize = data->width * data->height / 2; - copyWidthInBytes = data->width; - copyHeight = data->height / 2; - } else { - bufferSize = data->width * data->height; - copyWidthInBytes = data->width; - copyHeight = data->height; - if (i > 0) { - bufferSize >>= 2; - copyWidthInBytes >>= 1; - copyHeight >>= 1; - } - } - } - if (i == 0) { - pCudaCopyMem = (unsigned char *)malloc(bufferSize); - if (pCudaCopyMem == NULL) { - printf("pCudaCopyMem malloc failed\n"); - goto done; - } - } - memset(pCudaCopyMem, 0, bufferSize); - if (data->pitchLinearOutput) { - cuStatus = cuMemcpyDtoH(pCudaCopyMem, pDevPtr, bufferSize); - if (cuStatus != CUDA_SUCCESS) { - printf( - "cuda_consumer: pitch linear Memcpy failed, bufferSize =%d\n", - bufferSize); - goto done; - } - cuStatus = cuCtxSynchronize(); - if (cuStatus != CUDA_SUCCESS) { - printf("cuda_consumer: cuCtxSynchronize failed after memcpy \n"); - goto done; - } - } else { - CUDA_MEMCPY3D cpdesc; - memset(&cpdesc, 0, sizeof(cpdesc)); - cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0; - cpdesc.srcMemoryType = CU_MEMORYTYPE_ARRAY; - cpdesc.srcArray = cudaArr; - cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0; - cpdesc.dstMemoryType = CU_MEMORYTYPE_HOST; - cpdesc.dstHost = (void *)pCudaCopyMem; - cpdesc.WidthInBytes = copyWidthInBytes; // data->width * 4; - cpdesc.Height = copyHeight; // data->height; - cpdesc.Depth = 1; - - cuStatus = cuMemcpy3D(&cpdesc); - if (cuStatus != CUDA_SUCCESS) { - printf( - "Cuda consumer: cuMemCpy3D failed, copyWidthInBytes=%d, " - "copyHight=%d\n", - copyWidthInBytes, copyHeight); - } - cuStatus = cuCtxSynchronize(); - if (cuStatus != CUDA_SUCCESS) { - printf("cuCtxSynchronize failed after memcpy \n"); - } - } - if (cuStatus == CUDA_SUCCESS) { - if (fwrite(pCudaCopyMem, bufferSize, 1, file_p) != 1) { - printf("Cuda consumer: output file write failed\n"); - cuStatus = CUDA_ERROR_UNKNOWN; - goto done; - } - } - } - pInFile1 = fopen(data->fileName1, "rb"); - if (!pInFile1) { - printf("Failed to open file :%s\n", data->fileName1); - goto done; - } - pInFile2 = fopen(data->fileName2, "rb"); - if (!pInFile2) { - printf("Failed to open file :%s\n", data->fileName2); - goto done; - } - rewind(file_p); - check_result = checkbuf(file_p, pInFile1); - if (check_result == -1) { - rewind(file_p); - check_result = checkbuf(file_p, pInFile2); - if (check_result == -1) { - printf("Frame received does not match any valid image: FAILED\n"); - } else { - printf("Frame check Passed\n"); - } - } else { - printf("Frame check Passed\n"); - } - if (pCudaCopyMem) { - free(pCudaCopyMem); - pCudaCopyMem = NULL; - } - cuStatus = - cuEGLStreamConsumerReleaseFrame(&data->cudaConn, cudaResource, NULL); - if (cuStatus != CUDA_SUCCESS) { - printf("cuEGLStreamConsumerReleaseFrame failed with cuStatus = %d\n", - cuStatus); - goto done; - } - } 
else { - printf("cuda AcquireFrame FAILED with cuStatus=%d\n", cuStatus); - goto done; } - } + + if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + } + if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) { + printf("CUDA Consumer: - EGL_STREAM_STATE_DISCONNECTED_KHR received\n"); + } + + if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) { + cuStatus = cuEGLStreamConsumerAcquireFrame(&(data->cudaConn), &cudaResource, NULL, 16000); + + if (cuStatus == CUDA_SUCCESS) { + CUdeviceptr pDevPtr = 0; + int bufferSize; + unsigned char *pCudaCopyMem = NULL; + unsigned int copyWidthInBytes = 0, copyHeight = 0; + + file_p = fopen(fileName, "wb+"); + if (!file_p) { + printf("WriteFrame: file open failed %s\n", fileName); + cuStatus = CUDA_ERROR_UNKNOWN; + goto done; + } + cuStatus = cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0); + if (cuStatus != CUDA_SUCCESS) { + printf("Cuda get resource failed with %d\n", cuStatus); + goto done; + } + cuStatus = cuCtxSynchronize(); + if (cuStatus != CUDA_SUCCESS) { + printf("cuCtxSynchronize failed \n"); + goto done; + } + if (!(cudaEgl.planeCount >= 1 && cudaEgl.planeCount <= 3)) { + printf("Plane count is invalid\nExiting\n"); + goto done; + } + + for (i = 0; i < cudaEgl.planeCount; i++) { + if (cudaEgl.frameType == CU_EGL_FRAME_TYPE_PITCH) { + pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[i]; + if (cudaEgl.planeCount == 1) { + bufferSize = cudaEgl.pitch * cudaEgl.height; + copyWidthInBytes = cudaEgl.pitch; + copyHeight = data->height; + } + else if (i == 1 && cudaEgl.planeCount == 2) { // YUV 420 + // semi-planar + bufferSize = cudaEgl.pitch * cudaEgl.height / 2; + copyWidthInBytes = cudaEgl.pitch; + copyHeight = data->height / 2; + } + else { + bufferSize = data->width * data->height; + copyWidthInBytes = data->width; + copyHeight = data->height; + if (i > 0) { + bufferSize >>= 2; + copyWidthInBytes >>= 1; + copyHeight >>= 1; + } + } + } + else { + cudaArr = cudaEgl.frame.pArray[i]; + if (cudaEgl.planeCount == 1) { + bufferSize = data->width * data->height * 4; + copyWidthInBytes = data->width * 4; + copyHeight = data->height; + } + else if (i == 1 && cudaEgl.planeCount == 2) { // YUV 420 + // semi-planar + bufferSize = data->width * data->height / 2; + copyWidthInBytes = data->width; + copyHeight = data->height / 2; + } + else { + bufferSize = data->width * data->height; + copyWidthInBytes = data->width; + copyHeight = data->height; + if (i > 0) { + bufferSize >>= 2; + copyWidthInBytes >>= 1; + copyHeight >>= 1; + } + } + } + if (i == 0) { + pCudaCopyMem = (unsigned char *)malloc(bufferSize); + if (pCudaCopyMem == NULL) { + printf("pCudaCopyMem malloc failed\n"); + goto done; + } + } + memset(pCudaCopyMem, 0, bufferSize); + if (data->pitchLinearOutput) { + cuStatus = cuMemcpyDtoH(pCudaCopyMem, pDevPtr, bufferSize); + if (cuStatus != CUDA_SUCCESS) { + printf("cuda_consumer: pitch linear Memcpy failed, bufferSize =%d\n", bufferSize); + goto done; + } + cuStatus = cuCtxSynchronize(); + if (cuStatus != CUDA_SUCCESS) { + printf("cuda_consumer: cuCtxSynchronize failed after memcpy \n"); + goto done; + } + } + else { + CUDA_MEMCPY3D cpdesc; + memset(&cpdesc, 0, sizeof(cpdesc)); + cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0; + cpdesc.srcMemoryType = CU_MEMORYTYPE_ARRAY; + cpdesc.srcArray = cudaArr; + cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0; + cpdesc.dstMemoryType = 
CU_MEMORYTYPE_HOST; + cpdesc.dstHost = (void *)pCudaCopyMem; + cpdesc.WidthInBytes = copyWidthInBytes; // data->width * 4; + cpdesc.Height = copyHeight; // data->height; + cpdesc.Depth = 1; + + cuStatus = cuMemcpy3D(&cpdesc); + if (cuStatus != CUDA_SUCCESS) { + printf("Cuda consumer: cuMemcpy3D failed, copyWidthInBytes=%d, " + "copyHeight=%d\n", + copyWidthInBytes, + copyHeight); + } + cuStatus = cuCtxSynchronize(); + if (cuStatus != CUDA_SUCCESS) { + printf("cuCtxSynchronize failed after memcpy \n"); + } + } + if (cuStatus == CUDA_SUCCESS) { + if (fwrite(pCudaCopyMem, bufferSize, 1, file_p) != 1) { + printf("Cuda consumer: output file write failed\n"); + cuStatus = CUDA_ERROR_UNKNOWN; + goto done; + } + } + } + pInFile1 = fopen(data->fileName1, "rb"); + if (!pInFile1) { + printf("Failed to open file :%s\n", data->fileName1); + goto done; + } + pInFile2 = fopen(data->fileName2, "rb"); + if (!pInFile2) { + printf("Failed to open file :%s\n", data->fileName2); + goto done; + } + rewind(file_p); + check_result = checkbuf(file_p, pInFile1); + if (check_result == -1) { + rewind(file_p); + check_result = checkbuf(file_p, pInFile2); + if (check_result == -1) { + printf("Frame received does not match any valid image: FAILED\n"); + } + else { + printf("Frame check Passed\n"); + } + } + else { + printf("Frame check Passed\n"); + } + if (pCudaCopyMem) { + free(pCudaCopyMem); + pCudaCopyMem = NULL; + } + cuStatus = cuEGLStreamConsumerReleaseFrame(&data->cudaConn, cudaResource, NULL); + if (cuStatus != CUDA_SUCCESS) { + printf("cuEGLStreamConsumerReleaseFrame failed with cuStatus = %d\n", cuStatus); + goto done; + } + } + else { + printf("cuda AcquireFrame FAILED with cuStatus=%d\n", cuStatus); + goto done; + } + } done: - if (file_p) { - fclose(file_p); - file_p = NULL; - } - if (pInFile1) { - fclose(pInFile1); - pInFile1 = NULL; - } - if (pInFile1) { - fclose(pInFile2); - pInFile2 = NULL; - } - return cuStatus; -} - -int checkbuf(FILE *fp1, FILE *fp2) { - int match = 0; - int ch1, ch2; - if (fp1 == NULL) { - printf("Invalid file pointer for first file\n"); - return -1; - } else if (fp2 == NULL) { - printf("Invalid file pointer for second file\n"); - return -1; - } else { - ch1 = getc(fp1); - ch2 = getc(fp2); - while ((ch1 != EOF) && (ch2 != EOF) && (ch1 == ch2)) { - ch1 = getc(fp1); - ch2 = getc(fp2); + if (file_p) { + fclose(file_p); + file_p = NULL; } - if (ch1 == ch2) { - match = 1; - } else if (ch1 != ch2) { - match = -1; + if (pInFile1) { + fclose(pInFile1); + pInFile1 = NULL; } - } - return match; + if (pInFile2) { + fclose(pInFile2); + pInFile2 = NULL; + } + return cuStatus; } -CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer, - CUdevice device) { - CUresult status = CUDA_SUCCESS; - if (CUDA_SUCCESS != (status = cuInit(0))) { - printf("Failed to initialize CUDA\n"); +int checkbuf(FILE *fp1, FILE *fp2) +{ + int match = 0; + int ch1, ch2; + if (fp1 == NULL) { + printf("Invalid file pointer for first file\n"); + return -1; + } + else if (fp2 == NULL) { + printf("Invalid file pointer for second file\n"); + return -1; + } + else { + ch1 = getc(fp1); + ch2 = getc(fp2); + while ((ch1 != EOF) && (ch2 != EOF) && (ch1 == ch2)) { + ch1 = getc(fp1); + ch2 = getc(fp2); + } + if (ch1 == ch2) { + match = 1; + } + else if (ch1 != ch2) { + match = -1; + } + } + return match; +} + +CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer, CUdevice device) +{ + CUresult status = CUDA_SUCCESS; + if (CUDA_SUCCESS != (status = cuInit(0))) { + printf("Failed to initialize CUDA\n"); + 
return status; + } + + int major = 0, minor = 0; + char deviceName[256]; + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); + checkCudaErrors(cuDeviceGetName(deviceName, 256, device)); + printf("CUDA Consumer on GPU Device %d: \"%s\" with compute capability " "%d.%d\n\n", + device, + deviceName, + major, + minor); + + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) { + printf("failed to create CUDA context\n"); + return status; + } + checkCudaErrors(cuCtxPopCurrent(&cudaConsumer->context)); return status; - } - - int major = 0, minor = 0; - char deviceName[256]; - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); - checkCudaErrors(cuDeviceGetName(deviceName, 256, device)); - printf( - "CUDA Consumer on GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - device, deviceName, major, minor); - - if (CUDA_SUCCESS != - (status = cuCtxCreate(&cudaConsumer->context, 0, device))) { - printf("failed to create CUDA context\n"); - return status; - } - checkCudaErrors(cuCtxPopCurrent(&cudaConsumer->context)); - return status; } -void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args) { - cudaConsumer->pitchLinearOutput = args->pitchLinearOutput; - cudaConsumer->width = args->inputWidth; - cudaConsumer->height = args->inputHeight; - cudaConsumer->fileName1 = args->infile1; - cudaConsumer->fileName2 = args->infile2; +void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args) +{ + cudaConsumer->pitchLinearOutput = args->pitchLinearOutput; + cudaConsumer->width = args->inputWidth; + cudaConsumer->height = args->inputHeight; + cudaConsumer->fileName1 = args->infile1; + cudaConsumer->fileName2 = args->infile2; - cudaConsumer->outFile1 = const_cast<char *>("cuda_out1.yuv"); - cudaConsumer->outFile2 = const_cast<char *>("cuda_out2.yuv"); + cudaConsumer->outFile1 = const_cast<char *>("cuda_out1.yuv"); + cudaConsumer->outFile2 = const_cast<char *>("cuda_out2.yuv"); } -CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer) { - return cuEGLStreamConsumerDisconnect(&cudaConsumer->cudaConn); +CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer) +{ + return cuEGLStreamConsumerDisconnect(&cudaConsumer->cudaConn); } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.h index 2e89a5c0..dfc4fcf1 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.h @@ -35,28 +35,29 @@ #include #include #include + #include "cudaEGL.h" #include "eglstrm_common.h" extern EGLStreamKHR eglStream; -extern EGLDisplay g_display; +extern EGLDisplay g_display; -typedef struct _test_cuda_consumer_s { - CUcontext context; - CUeglStreamConnection cudaConn; - bool pitchLinearOutput; - unsigned int width; - unsigned int height; - const char *fileName1; - const char *fileName2; - const char *outFile1; - const char *outFile2; - unsigned int frameCount; +typedef struct _test_cuda_consumer_s +{ + CUcontext context; + CUeglStreamConnection cudaConn; + bool pitchLinearOutput; + unsigned int width; + unsigned int height; + const char *fileName1; + const char 
*fileName2; + const char *outFile1; + const char *outFile2; + unsigned int frameCount; } test_cuda_consumer_s; -void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args); +void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args); CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer); CUresult cudaConsumerTest(test_cuda_consumer_s *data, const char *outFile); -CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer, - CUdevice device); +CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer, CUdevice device); #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp index ef3adab2..6d356841 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp @@ -30,7 +30,9 @@ // #include "cuda_producer.h" + #include + #include "cudaEGL.h" #include "eglstrm_common.h" @@ -38,369 +40,379 @@ EXTENSION_LIST(EXTLST_EXTERN) #endif -static CUresult cudaProducerReadYUVFrame(FILE *file, unsigned int frameNum, - unsigned int width, - unsigned int height, - unsigned char *pBuff) { - int bOrderUV = 0; - unsigned char *pYBuff, *pUBuff, *pVBuff, *pChroma; - unsigned int frameSize = (width * height * 3) / 2; - CUresult ret = CUDA_SUCCESS; - unsigned int i; +static CUresult cudaProducerReadYUVFrame(FILE *file, + unsigned int frameNum, + unsigned int width, + unsigned int height, + unsigned char *pBuff) +{ + int bOrderUV = 0; + unsigned char *pYBuff, *pUBuff, *pVBuff, *pChroma; + unsigned int frameSize = (width * height * 3) / 2; + CUresult ret = CUDA_SUCCESS; + unsigned int i; - if (!pBuff || !file) return CUDA_ERROR_FILE_NOT_FOUND; + if (!pBuff || !file) + return CUDA_ERROR_FILE_NOT_FOUND; - pYBuff = pBuff; + pYBuff = pBuff; - // YVU order in the buffer - pVBuff = pYBuff + width * height; - pUBuff = pVBuff + width * height / 4; + // YVU order in the buffer + pVBuff = pYBuff + width * height; + pUBuff = pVBuff + width * height / 4; - if (fseek(file, frameNum * frameSize, SEEK_SET)) { - printf("ReadYUVFrame: Error seeking file: %p\n", file); - ret = CUDA_ERROR_NOT_PERMITTED; - goto done; - } - // read Y U V separately - for (i = 0; i < height; i++) { - if (fread(pYBuff, width, 1, file) != 1) { - printf("ReadYUVFrame: Error reading file: %p\n", file); - ret = CUDA_ERROR_NOT_PERMITTED; - goto done; + if (fseek(file, frameNum * frameSize, SEEK_SET)) { + printf("ReadYUVFrame: Error seeking file: %p\n", file); + ret = CUDA_ERROR_NOT_PERMITTED; + goto done; } - pYBuff += width; - } - - pChroma = bOrderUV ? pUBuff : pVBuff; - for (i = 0; i < height / 2; i++) { - if (fread(pChroma, width / 2, 1, file) != 1) { - printf("ReadYUVFrame: Error reading file: %p\n", file); - ret = CUDA_ERROR_NOT_PERMITTED; - goto done; - } - pChroma += width / 2; - } - - pChroma = bOrderUV ? 
pVBuff : pUBuff; - for (i = 0; i < height / 2; i++) { - if (fread(pChroma, width / 2, 1, file) != 1) { - printf("ReadYUVFrame: Error reading file: %p\n", file); - ret = CUDA_ERROR_NOT_PERMITTED; - goto done; - } - pChroma += width / 2; - } -done: - return ret; -} - -static CUresult cudaProducerReadARGBFrame(FILE *file, unsigned int frameNum, - unsigned int width, - unsigned int height, - unsigned char *pBuff) { - unsigned int frameSize = width * height * 4; - CUresult ret = CUDA_SUCCESS; - - if (!pBuff || !file) return CUDA_ERROR_FILE_NOT_FOUND; - - if (fseek(file, frameNum * frameSize, SEEK_SET)) { - printf("ReadYUVFrame: Error seeking file: %p\n", file); - ret = CUDA_ERROR_NOT_PERMITTED; - goto done; - } - - // read ARGB data - if (fread(pBuff, frameSize, 1, file) != 1) { - if (feof(file)) - printf("ReadARGBFrame: file read to the end\n"); - else - printf("ReadARGBFrame: Error reading file: %p\n", file); - ret = CUDA_ERROR_NOT_PERMITTED; - goto done; - } -done: - return ret; -} - -CUresult cudaProducerTest(test_cuda_producer_s *cudaProducer, char *file) { - int framenum = 0; - CUarray cudaArr[3] = {0}; - CUdeviceptr cudaPtr[3] = {0, 0, 0}; - unsigned int bufferSize; - CUresult cuStatus = CUDA_SUCCESS; - unsigned int i, surfNum, uvOffset[3] = {0}; - unsigned int copyWidthInBytes[3] = {0, 0, 0}, copyHeight[3] = {0, 0, 0}; - CUeglColorFormat eglColorFormat; - FILE *file_p; - CUeglFrame cudaEgl; - CUcontext oldContext; - - file_p = fopen(file, "rb"); - if (!file_p) { - printf("CudaProducer: Error opening file: %s\n", file); - goto done; - } - - if (cudaProducer->pitchLinearOutput) { - if (cudaProducer->isARGB) { - cudaPtr[0] = cudaProducer->cudaPtrARGB[0]; - } else { // YUV case - for (i = 0; i < 3; i++) { - if (i == 0) { - bufferSize = cudaProducer->width * cudaProducer->height; - } else { - bufferSize = cudaProducer->width * cudaProducer->height / 4; + // read Y U V separately + for (i = 0; i < height; i++) { + if (fread(pYBuff, width, 1, file) != 1) { + printf("ReadYUVFrame: Error reading file: %p\n", file); + ret = CUDA_ERROR_NOT_PERMITTED; + goto done; } - - cudaPtr[i] = cudaProducer->cudaPtrYUV[i]; - } + pYBuff += width; } - } else { + + pChroma = bOrderUV ? pUBuff : pVBuff; + for (i = 0; i < height / 2; i++) { + if (fread(pChroma, width / 2, 1, file) != 1) { + printf("ReadYUVFrame: Error reading file: %p\n", file); + ret = CUDA_ERROR_NOT_PERMITTED; + goto done; + } + pChroma += width / 2; + } + + pChroma = bOrderUV ? 
pVBuff : pUBuff; + for (i = 0; i < height / 2; i++) { + if (fread(pChroma, width / 2, 1, file) != 1) { + printf("ReadYUVFrame: Error reading file: %p\n", file); + ret = CUDA_ERROR_NOT_PERMITTED; + goto done; + } + pChroma += width / 2; + } +done: + return ret; +} + +static CUresult cudaProducerReadARGBFrame(FILE *file, + unsigned int frameNum, + unsigned int width, + unsigned int height, + unsigned char *pBuff) +{ + unsigned int frameSize = width * height * 4; + CUresult ret = CUDA_SUCCESS; + + if (!pBuff || !file) + return CUDA_ERROR_FILE_NOT_FOUND; + + if (fseek(file, frameNum * frameSize, SEEK_SET)) { + printf("ReadARGBFrame: Error seeking file: %p\n", file); + ret = CUDA_ERROR_NOT_PERMITTED; + goto done; + } + + // read ARGB data + if (fread(pBuff, frameSize, 1, file) != 1) { + if (feof(file)) + printf("ReadARGBFrame: file read to the end\n"); + else + printf("ReadARGBFrame: Error reading file: %p\n", file); + ret = CUDA_ERROR_NOT_PERMITTED; + goto done; + } +done: + return ret; +} + +CUresult cudaProducerTest(test_cuda_producer_s *cudaProducer, char *file) +{ + int framenum = 0; + CUarray cudaArr[3] = {0}; + CUdeviceptr cudaPtr[3] = {0, 0, 0}; + unsigned int bufferSize; + CUresult cuStatus = CUDA_SUCCESS; + unsigned int i, surfNum, uvOffset[3] = {0}; + unsigned int copyWidthInBytes[3] = {0, 0, 0}, copyHeight[3] = {0, 0, 0}; + CUeglColorFormat eglColorFormat; + FILE *file_p; + CUeglFrame cudaEgl; + CUcontext oldContext; + + file_p = fopen(file, "rb"); + if (!file_p) { + printf("CudaProducer: Error opening file: %s\n", file); + goto done; + } + + if (cudaProducer->pitchLinearOutput) { + if (cudaProducer->isARGB) { + cudaPtr[0] = cudaProducer->cudaPtrARGB[0]; + } + else { // YUV case + for (i = 0; i < 3; i++) { + if (i == 0) { + bufferSize = cudaProducer->width * cudaProducer->height; + } + else { + bufferSize = cudaProducer->width * cudaProducer->height / 4; + } + + cudaPtr[i] = cudaProducer->cudaPtrYUV[i]; + } + } + } + else { + if (cudaProducer->isARGB) { + cudaArr[0] = cudaProducer->cudaArrARGB[0]; + } + else { + for (i = 0; i < 3; i++) { + cudaArr[i] = cudaProducer->cudaArrYUV[i]; + } + } + } + uvOffset[0] = 0; if (cudaProducer->isARGB) { + if (CUDA_SUCCESS + != cudaProducerReadARGBFrame( + file_p, framenum, cudaProducer->width, cudaProducer->height, cudaProducer->pBuff)) { + printf("cuda producer, read ARGB frame failed\n"); + goto done; + } + copyWidthInBytes[0] = cudaProducer->width * 4; + copyHeight[0] = cudaProducer->height; + surfNum = 1; + eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB; + } + else { + if (CUDA_SUCCESS + != cudaProducerReadYUVFrame( + file_p, framenum, cudaProducer->width, cudaProducer->height, cudaProducer->pBuff)) { + printf("cuda producer, reading YUV frame failed\n"); + goto done; + } + surfNum = 3; + eglColorFormat = CU_EGL_COLOR_FORMAT_YUV420_PLANAR; + copyWidthInBytes[0] = cudaProducer->width; + copyHeight[0] = cudaProducer->height; + copyWidthInBytes[1] = cudaProducer->width / 2; + copyHeight[1] = cudaProducer->height / 2; + copyWidthInBytes[2] = cudaProducer->width / 2; + copyHeight[2] = cudaProducer->height / 2; + uvOffset[1] = cudaProducer->width * cudaProducer->height; + 
uvOffset[2] = uvOffset[1] + cudaProducer->width / 2 * cudaProducer->height / 2; } - copyWidthInBytes[0] = cudaProducer->width * 4; - copyHeight[0] = cudaProducer->height; - surfNum = 1; - eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB; - } else { - if (CUDA_SUCCESS != - cudaProducerReadYUVFrame(file_p, framenum, cudaProducer->width, - cudaProducer->height, cudaProducer->pBuff)) { - printf("cuda producer, reading YUV frame failed\n"); - goto done; + if (cudaProducer->pitchLinearOutput) { + for (i = 0; i < surfNum; i++) { + cuStatus = cuMemcpy( + cudaPtr[i], (CUdeviceptr)(cudaProducer->pBuff + uvOffset[i]), copyWidthInBytes[i] * copyHeight[i]); + + if (cuStatus != CUDA_SUCCESS) { + printf("Cuda producer: cuMemcpy pitchlinear failed, cuStatus =%d\n", cuStatus); + goto done; + } + } + } + else { + // copy cudaProducer->pBuff to cudaArray + CUDA_MEMCPY3D cpdesc; + for (i = 0; i < surfNum; i++) { + memset(&cpdesc, 0, sizeof(cpdesc)); + cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0; + cpdesc.srcMemoryType = CU_MEMORYTYPE_HOST; + cpdesc.srcHost = (void *)(cudaProducer->pBuff + uvOffset[i]); + cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0; + cpdesc.dstMemoryType = CU_MEMORYTYPE_ARRAY; + cpdesc.dstArray = cudaArr[i]; + cpdesc.WidthInBytes = copyWidthInBytes[i]; + cpdesc.Height = copyHeight[i]; + cpdesc.Depth = 1; + cuStatus = cuMemcpy3D(&cpdesc); + if (cuStatus != CUDA_SUCCESS) { + printf("Cuda producer: cuMemcpy3D failed, cuStatus =%d\n", cuStatus); + goto done; + } + } + } - surfNum = 3; - eglColorFormat = CU_EGL_COLOR_FORMAT_YUV420_PLANAR; - copyWidthInBytes[0] = cudaProducer->width; - copyHeight[0] = cudaProducer->height; - copyWidthInBytes[1] = cudaProducer->width / 2; - copyHeight[1] = cudaProducer->height / 2; - copyWidthInBytes[2] = cudaProducer->width / 2; - copyHeight[2] = cudaProducer->height / 2; - uvOffset[1] = cudaProducer->width * cudaProducer->height; - uvOffset[2] = - uvOffset[1] + cudaProducer->width / 2 * cudaProducer->height / 2; - } - if (cudaProducer->pitchLinearOutput) { for (i = 0; i < surfNum; i++) { - cuStatus = - cuMemcpy(cudaPtr[i], (CUdeviceptr)(cudaProducer->pBuff + uvOffset[i]), - copyWidthInBytes[i] * copyHeight[i]); + for (i = 0; i < surfNum; i++) { + if (cudaProducer->pitchLinearOutput) + cudaEgl.frame.pPitch[i] = (void *)cudaPtr[i]; + else + cudaEgl.frame.pArray[i] = cudaArr[i]; + } + cudaEgl.width = copyWidthInBytes[0]; + cudaEgl.depth = 1; + cudaEgl.height = copyHeight[0]; + cudaEgl.pitch = cudaProducer->pitchLinearOutput ? cudaEgl.width : 0; + cudaEgl.frameType = cudaProducer->pitchLinearOutput ? CU_EGL_FRAME_TYPE_PITCH : CU_EGL_FRAME_TYPE_ARRAY; + cudaEgl.planeCount = surfNum; + cudaEgl.numChannels = (eglColorFormat == CU_EGL_COLOR_FORMAT_ARGB) ? 4 : 1; + cudaEgl.eglColorFormat = eglColorFormat; + cudaEgl.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8; - if (cuStatus != CUDA_SUCCESS) { - printf("Cuda producer: cuMemCpy pitchlinear failed, cuStatus =%d\n", - cuStatus); + static int numFramesPresented = 0; + // If a frame was presented earlier, check whether the consumer + // is done with it using cuEGLStreamProducerReturnFrame. 
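+    // cuEGLStreamProducerReturnFrame keeps returning CUDA_ERROR_LAUNCH_TIMEOUT while the
+    // consumer still holds the frame, so the loop below simply retries until the frame
+    // is handed back and the outstanding-frame count can be decremented.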
+ while (numFramesPresented) { + CUeglFrame returnedCudaEgl; + cuStatus = cuEGLStreamProducerReturnFrame(&cudaProducer->cudaConn, &returnedCudaEgl, NULL); + if (cuStatus == CUDA_ERROR_LAUNCH_TIMEOUT) { + continue; + } + else if (cuStatus != CUDA_SUCCESS) { + printf("cuda Producer return frame FAILED with custatus= %d\n", cuStatus); + return cuStatus; + } + else { + numFramesPresented--; + } + } + + cuStatus = cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, NULL); + if (cuStatus != CUDA_SUCCESS) { + printf("cuda Producer present frame FAILED with custatus= %d\n", cuStatus); goto done; - } } - } else { - // copy cudaProducer->pBuff to cudaArray - CUDA_MEMCPY3D cpdesc; - for (i = 0; i < surfNum; i++) { - memset(&cpdesc, 0, sizeof(cpdesc)); - cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0; - cpdesc.srcMemoryType = CU_MEMORYTYPE_HOST; - cpdesc.srcHost = (void *)(cudaProducer->pBuff + uvOffset[i]); - cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0; - cpdesc.dstMemoryType = CU_MEMORYTYPE_ARRAY; - cpdesc.dstArray = cudaArr[i]; - cpdesc.WidthInBytes = copyWidthInBytes[i]; - cpdesc.Height = copyHeight[i]; - cpdesc.Depth = 1; - cuStatus = cuMemcpy3D(&cpdesc); - if (cuStatus != CUDA_SUCCESS) { - printf("Cuda producer: cuMemCpy failed, cuStatus =%d\n", cuStatus); - goto done; - } - } - } - for (i = 0; i < surfNum; i++) { - if (cudaProducer->pitchLinearOutput) - cudaEgl.frame.pPitch[i] = (void *)cudaPtr[i]; - else - cudaEgl.frame.pArray[i] = cudaArr[i]; - } - cudaEgl.width = copyWidthInBytes[0]; - cudaEgl.depth = 1; - cudaEgl.height = copyHeight[0]; - cudaEgl.pitch = cudaProducer->pitchLinearOutput ? cudaEgl.width : 0; - cudaEgl.frameType = cudaProducer->pitchLinearOutput ? CU_EGL_FRAME_TYPE_PITCH - : CU_EGL_FRAME_TYPE_ARRAY; - cudaEgl.planeCount = surfNum; - cudaEgl.numChannels = (eglColorFormat == CU_EGL_COLOR_FORMAT_ARGB) ? 4 : 1; - cudaEgl.eglColorFormat = eglColorFormat; - cudaEgl.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8; - - static int numFramesPresented = 0; - // If there is a frame presented before we check if consumer - // is done with it using cuEGLStreamProducerReturnFrame. 
- while (numFramesPresented) { - CUeglFrame returnedCudaEgl; - cuStatus = cuEGLStreamProducerReturnFrame(&cudaProducer->cudaConn, - &returnedCudaEgl, NULL); - if (cuStatus == CUDA_ERROR_LAUNCH_TIMEOUT) { - continue; - } else if (cuStatus != CUDA_SUCCESS) { - printf("cuda Producer return frame FAILED with custatus= %d\n", cuStatus); - return cuStatus; - } else { - numFramesPresented--; - } - } - - cuStatus = - cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, NULL); - if (cuStatus != CUDA_SUCCESS) { - printf("cuda Producer present frame FAILED with custatus= %d\n", cuStatus); - goto done; - } - numFramesPresented++; + numFramesPresented++; done: - if (file_p) { - fclose(file_p); - file_p = NULL; - } + if (file_p) { + fclose(file_p); + file_p = NULL; + } - return cuStatus; + return cuStatus; } -CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer, - CUdevice device) { - CUresult status = CUDA_SUCCESS; - if (CUDA_SUCCESS != (status = cuInit(0))) { - printf("Failed to initialize CUDA\n"); - return status; - } - - int major = 0, minor = 0; - char deviceName[256]; - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); - checkCudaErrors(cuDeviceGetName(deviceName, 256, device)); - printf( - "CUDA Producer on GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - device, deviceName, major, minor); - - if (major < 6) { - printf( - "EGLStream_CUDA_Interop requires SM 6.0 or higher arch GPU. " - "Exiting...\n"); - exit(2); // EXIT_WAIVED - } - - if (CUDA_SUCCESS != - (status = cuCtxCreate(&cudaProducer->context, 0, device))) { - printf("failed to create CUDA context\n"); - return status; - } - - status = cuMemAlloc(&cudaProducer->cudaPtrARGB[0], (WIDTH * HEIGHT * 4)); - if (status != CUDA_SUCCESS) { - printf("Create CUDA pointer failed, cuStatus=%d\n", status); - return status; - } - - status = cuMemAlloc(&cudaProducer->cudaPtrYUV[0], (WIDTH * HEIGHT)); - if (status != CUDA_SUCCESS) { - printf("Create CUDA pointer failed, cuStatus=%d\n", status); - return status; - } - status = cuMemAlloc(&cudaProducer->cudaPtrYUV[1], (WIDTH * HEIGHT) / 4); - if (status != CUDA_SUCCESS) { - printf("Create CUDA pointer failed, cuStatus=%d\n", status); - return status; - } - status = cuMemAlloc(&cudaProducer->cudaPtrYUV[2], (WIDTH * HEIGHT) / 4); - if (status != CUDA_SUCCESS) { - printf("Create CUDA pointer failed, cuStatus=%d\n", status); - return status; - } - - CUDA_ARRAY3D_DESCRIPTOR desc = {0}; - - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - desc.Depth = 1; - desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; - desc.NumChannels = 4; - desc.Width = WIDTH * 4; - desc.Height = HEIGHT; - status = cuArray3DCreate(&cudaProducer->cudaArrARGB[0], &desc); - if (status != CUDA_SUCCESS) { - printf("Create CUDA array failed, cuStatus=%d\n", status); - return status; - } - - for (int i = 0; i < 3; i++) { - if (i == 0) { - desc.NumChannels = 1; - desc.Width = WIDTH; - desc.Height = HEIGHT; - } else { // U/V surface as planar - desc.NumChannels = 1; - desc.Width = WIDTH / 2; - desc.Height = HEIGHT / 2; +CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer, CUdevice device) +{ + CUresult status = CUDA_SUCCESS; + if (CUDA_SUCCESS != (status = cuInit(0))) { + printf("Failed to initialize CUDA\n"); + return status; } - status = cuArray3DCreate(&cudaProducer->cudaArrYUV[i], &desc); + + int major = 0, minor = 0; + char 
deviceName[256]; + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); + checkCudaErrors(cuDeviceGetName(deviceName, 256, device)); + printf("CUDA Producer on GPU Device %d: \"%s\" with compute capability " + "%d.%d\n\n", + device, + deviceName, + major, + minor); + + if (major < 6) { + printf("EGLStream_CUDA_Interop requires SM 6.0 or higher arch GPU. " + "Exiting...\n"); + exit(2); // EXIT_WAIVED + } + + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaProducer->context, 0, device))) { + printf("failed to create CUDA context\n"); + return status; + } + + status = cuMemAlloc(&cudaProducer->cudaPtrARGB[0], (WIDTH * HEIGHT * 4)); if (status != CUDA_SUCCESS) { - printf("Create CUDA array failed, cuStatus=%d\n", status); - return status; + printf("Create CUDA pointer failed, cuStatus=%d\n", status); + return status; } - } - cudaProducer->pBuff = (unsigned char *)malloc((WIDTH * HEIGHT * 4)); - if (!cudaProducer->pBuff) { - printf("CudaProducer: Failed to allocate image buffer\n"); - } + status = cuMemAlloc(&cudaProducer->cudaPtrYUV[0], (WIDTH * HEIGHT)); + if (status != CUDA_SUCCESS) { + printf("Create CUDA pointer failed, cuStatus=%d\n", status); + return status; + } + status = cuMemAlloc(&cudaProducer->cudaPtrYUV[1], (WIDTH * HEIGHT) / 4); + if (status != CUDA_SUCCESS) { + printf("Create CUDA pointer failed, cuStatus=%d\n", status); + return status; + } + status = cuMemAlloc(&cudaProducer->cudaPtrYUV[2], (WIDTH * HEIGHT) / 4); + if (status != CUDA_SUCCESS) { + printf("Create CUDA pointer failed, cuStatus=%d\n", status); + return status; + } - checkCudaErrors(cuCtxPopCurrent(&cudaProducer->context)); - return status; + CUDA_ARRAY3D_DESCRIPTOR desc = {0}; + + desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + desc.Depth = 1; + desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; + desc.NumChannels = 4; + desc.Width = WIDTH * 4; + desc.Height = HEIGHT; + status = cuArray3DCreate(&cudaProducer->cudaArrARGB[0], &desc); + if (status != CUDA_SUCCESS) { + printf("Create CUDA array failed, cuStatus=%d\n", status); + return status; + } + + for (int i = 0; i < 3; i++) { + if (i == 0) { + desc.NumChannels = 1; + desc.Width = WIDTH; + desc.Height = HEIGHT; + } + else { // U/V surface as planar + desc.NumChannels = 1; + desc.Width = WIDTH / 2; + desc.Height = HEIGHT / 2; + } + status = cuArray3DCreate(&cudaProducer->cudaArrYUV[i], &desc); + if (status != CUDA_SUCCESS) { + printf("Create CUDA array failed, cuStatus=%d\n", status); + return status; + } + } + + cudaProducer->pBuff = (unsigned char *)malloc((WIDTH * HEIGHT * 4)); + if (!cudaProducer->pBuff) { + printf("CudaProducer: Failed to allocate image buffer\n"); + } + + checkCudaErrors(cuCtxPopCurrent(&cudaProducer->context)); + return status; } -void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay, - EGLStreamKHR eglStream, TestArgs *args) { - cudaProducer->fileName1 = args->infile1; - cudaProducer->fileName2 = args->infile2; +void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay, EGLStreamKHR eglStream, TestArgs *args) +{ + cudaProducer->fileName1 = args->infile1; + cudaProducer->fileName2 = args->infile2; - cudaProducer->frameCount = 2; - cudaProducer->width = args->inputWidth; - cudaProducer->height = args->inputHeight; - cudaProducer->isARGB = args->isARGB; - cudaProducer->pitchLinearOutput = args->pitchLinearOutput; + cudaProducer->frameCount = 2; + 
cudaProducer->width = args->inputWidth; + cudaProducer->height = args->inputHeight; + cudaProducer->isARGB = args->isARGB; + cudaProducer->pitchLinearOutput = args->pitchLinearOutput; - // Set cudaProducer default parameters - cudaProducer->eglDisplay = eglDisplay; - cudaProducer->eglStream = eglStream; + // Set cudaProducer default parameters + cudaProducer->eglDisplay = eglDisplay; + cudaProducer->eglStream = eglStream; } -CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) { - if (cudaProducer->pBuff) free(cudaProducer->pBuff); +CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) +{ + if (cudaProducer->pBuff) + free(cudaProducer->pBuff); - checkCudaErrors(cuMemFree(cudaProducer->cudaPtrARGB[0])); - checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[0])); - checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[1])); - checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[2])); - checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrARGB[0])); - checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[0])); - checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[1])); - checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[2])); + checkCudaErrors(cuMemFree(cudaProducer->cudaPtrARGB[0])); + checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[0])); + checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[1])); + checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[2])); + checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrARGB[0])); + checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[0])); + checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[1])); + checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[2])); - return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn); + return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn); } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.h index f8e9c8f3..f8bc9efd 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.h @@ -33,36 +33,39 @@ #define _CUDA_PRODUCER_H_ #include #include + #include "cudaEGL.h" #include "eglstrm_common.h" extern EGLStreamKHR eglStream; -extern EGLDisplay g_display; +extern EGLDisplay g_display; -typedef struct _test_cuda_producer_s { - // Stream params - char *fileName1; - char *fileName2; - unsigned char *pBuff; - int frameCount; - bool isARGB; - bool pitchLinearOutput; - unsigned int width; - unsigned int height; - CUcontext context; - CUeglStreamConnection cudaConn; - CUdeviceptr cudaPtrARGB[1]; - CUdeviceptr cudaPtrYUV[3]; - CUarray cudaArrARGB[1]; - CUarray cudaArrYUV[3]; - EGLStreamKHR eglStream; - EGLDisplay eglDisplay; +typedef struct _test_cuda_producer_s +{ + // Stream params + char *fileName1; + char *fileName2; + unsigned char *pBuff; + int frameCount; + bool isARGB; + bool pitchLinearOutput; + unsigned int width; + unsigned int height; + CUcontext context; + CUeglStreamConnection cudaConn; + CUdeviceptr cudaPtrARGB[1]; + CUdeviceptr cudaPtrYUV[3]; + CUarray cudaArrARGB[1]; + CUarray cudaArrYUV[3]; + EGLStreamKHR eglStream; + EGLDisplay eglDisplay; } test_cuda_producer_s; -void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay, - EGLStreamKHR eglStream, TestArgs *args); +void cudaProducerInit(test_cuda_producer_s *cudaProducer, + EGLDisplay eglDisplay, + EGLStreamKHR eglStream, + TestArgs *args); CUresult 
cudaProducerTest(test_cuda_producer_s *parserArg, char *file); CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer); -CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer, - CUdevice device); +CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer, CUdevice device); #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.cpp index 605a0ea6..ac96dfa0 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.cpp @@ -32,107 +32,102 @@ #include "eglstrm_common.h" EGLStreamKHR eglStream; -EGLDisplay g_display; -EGLAttrib cudaIndex; +EGLDisplay g_display; +EGLAttrib cudaIndex; #if defined(EXTENSION_LIST) EXTENSION_LIST(EXTLST_DECL) typedef void (*extlst_fnptr_t)(void); -static struct { - extlst_fnptr_t *fnptr; - char const *name; +static struct +{ + extlst_fnptr_t *fnptr; + char const *name; } extensionList[] = {EXTENSION_LIST(EXTLST_ENTRY)}; -int eglSetupExtensions(void) { - unsigned int i; +int eglSetupExtensions(void) +{ + unsigned int i; - for (i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) { - *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name); - if (*extensionList[i].fnptr == NULL) { - printf("Couldn't get address of %s()\n", extensionList[i].name); - return 0; + for (i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) { + *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name); + if (*extensionList[i].fnptr == NULL) { + printf("Couldn't get address of %s()\n", extensionList[i].name); + return 0; + } } - } - return 1; + return 1; } -int EGLStreamInit(int *cuda_device) { - static const EGLint streamAttrMailboxMode[] = {EGL_SUPPORT_REUSE_NV, - EGL_FALSE, EGL_NONE}; - EGLBoolean eglStatus; +int EGLStreamInit(int *cuda_device) +{ + static const EGLint streamAttrMailboxMode[] = {EGL_SUPPORT_REUSE_NV, EGL_FALSE, EGL_NONE}; + EGLBoolean eglStatus; #define MAX_EGL_DEVICES 4 - EGLint numDevices = 0; - EGLDeviceEXT devices[MAX_EGL_DEVICES]; - eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices); - if (eglStatus != EGL_TRUE) { - printf("Error querying EGL devices\n"); - exit(EXIT_FAILURE); - } - - if (numDevices == 0) { - printf("No EGL devices found.. Waiving\n"); - eglStatus = EGL_FALSE; - exit(EXIT_WAIVED); - } - - int egl_device_id = 0; - for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { - eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], - EGL_CUDA_DEVICE_NV, &cudaIndex); - if (eglStatus == EGL_TRUE) { - *cuda_device = cudaIndex; // We select first EGL-CUDA Capable device. - printf("Found EGL-CUDA Capable device with CUDA Device id = %d\n", - (int)cudaIndex); - break; + EGLint numDevices = 0; + EGLDeviceEXT devices[MAX_EGL_DEVICES]; + eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices); + if (eglStatus != EGL_TRUE) { + printf("Error querying EGL devices\n"); + exit(EXIT_FAILURE); } - } - if (egl_device_id >= numDevices) { - printf("No CUDA Capable EGL Device found.. Waiving execution\n"); - exit(EXIT_WAIVED); - } + if (numDevices == 0) { + printf("No EGL devices found.. 
Waiving\n"); + eglStatus = EGL_FALSE; + exit(EXIT_WAIVED); + } - g_display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, - (void *)devices[egl_device_id], NULL); - if (g_display == EGL_NO_DISPLAY) { - printf("Could not get EGL display from device. \n"); - eglStatus = EGL_FALSE; - exit(EXIT_FAILURE); - } + int egl_device_id = 0; + for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) { + eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id], EGL_CUDA_DEVICE_NV, &cudaIndex); + if (eglStatus == EGL_TRUE) { + *cuda_device = cudaIndex; // We select first EGL-CUDA Capable device. + printf("Found EGL-CUDA Capable device with CUDA Device id = %d\n", (int)cudaIndex); + break; + } + } - eglStatus = eglInitialize(g_display, 0, 0); - if (!eglStatus) { - printf("EGL failed to initialize. \n"); - eglStatus = EGL_FALSE; - exit(EXIT_FAILURE); - } + if (egl_device_id >= numDevices) { + printf("No CUDA Capable EGL Device found.. Waiving execution\n"); + exit(EXIT_WAIVED); + } - eglStream = eglCreateStreamKHR(g_display, streamAttrMailboxMode); - if (eglStream == EGL_NO_STREAM_KHR) { - printf("Could not create EGL stream.\n"); - eglStatus = EGL_FALSE; - exit(EXIT_FAILURE); - } + g_display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL); + if (g_display == EGL_NO_DISPLAY) { + printf("Could not get EGL display from device. \n"); + eglStatus = EGL_FALSE; + exit(EXIT_FAILURE); + } - printf("Created EGLStream %p\n", eglStream); + eglStatus = eglInitialize(g_display, 0, 0); + if (!eglStatus) { + printf("EGL failed to initialize. \n"); + eglStatus = EGL_FALSE; + exit(EXIT_FAILURE); + } - // Set stream attribute - if (!eglStreamAttribKHR(g_display, eglStream, EGL_CONSUMER_LATENCY_USEC_KHR, - 16000)) { - printf( - "Consumer: eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n"); - return 0; - } - if (!eglStreamAttribKHR(g_display, eglStream, - EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000)) { - printf( - "Consumer: eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR " - "failed\n"); - return 0; - } - printf("EGLStream initialized\n"); - return 1; + eglStream = eglCreateStreamKHR(g_display, streamAttrMailboxMode); + if (eglStream == EGL_NO_STREAM_KHR) { + printf("Could not create EGL stream.\n"); + eglStatus = EGL_FALSE; + exit(EXIT_FAILURE); + } + + printf("Created EGLStream %p\n", eglStream); + + // Set stream attribute + if (!eglStreamAttribKHR(g_display, eglStream, EGL_CONSUMER_LATENCY_USEC_KHR, 16000)) { + printf("Consumer: eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n"); + return 0; + } + if (!eglStreamAttribKHR(g_display, eglStream, EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000)) { + printf("Consumer: eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR " + "failed\n"); + return 0; + } + printf("EGLStream initialized\n"); + return 1; } void EGLStreamFini(void) { eglDestroyStreamKHR(g_display, eglStream); } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.h b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.h index 8e0b1e9d..a86a7b63 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.h +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/eglstrm_common.h @@ -45,59 +45,56 @@ #include "cudaEGL.h" #include "helper_cuda_drvapi.h" -#define EXTENSION_LIST(T) \ - T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR) \ - T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR) \ - T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR) \ - 
T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR) \ - T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR) \ - T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR) \ - T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR) \ - T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR) \ - T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC, \ - eglStreamConsumerGLTextureExternalKHR) \ - T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \ - T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT) \ - T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT) \ - T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT) \ - T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC, \ - eglCreateStreamFromFileDescriptorKHR) +#define EXTENSION_LIST(T) \ + T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR) \ + T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR) \ + T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR) \ + T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR) \ + T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR) \ + T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR) \ + T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR) \ + T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR) \ + T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC, eglStreamConsumerGLTextureExternalKHR) \ + T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \ + T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT) \ + T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT) \ + T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT) \ + T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC, eglCreateStreamFromFileDescriptorKHR) -#define eglCreateStreamKHR my_eglCreateStreamKHR -#define eglDestroyStreamKHR my_eglDestroyStreamKHR -#define eglQueryStreamKHR my_eglQueryStreamKHR -#define eglQueryStreamu64KHR my_eglQueryStreamu64KHR -#define eglQueryStreamTimeKHR my_eglQueryStreamTimeKHR -#define eglStreamAttribKHR my_eglStreamAttribKHR -#define eglStreamConsumerAcquireKHR my_eglStreamConsumerAcquireKHR -#define eglStreamConsumerReleaseKHR my_eglStreamConsumerReleaseKHR -#define eglStreamConsumerGLTextureExternalKHR \ - my_eglStreamConsumerGLTextureExternalKHR -#define eglGetStreamFileDescriptorKHR my_eglGetStreamFileDescriptorKHR -#define eglCreateStreamFromFileDescriptorKHR \ - my_eglCreateStreamFromFileDescriptorKHR -#define eglQueryDevicesEXT my_eglQueryDevicesEXT -#define eglGetPlatformDisplayEXT my_eglGetPlatformDisplayEXT -#define eglQueryDeviceAttribEXT my_eglQueryDeviceAttribEXT +#define eglCreateStreamKHR my_eglCreateStreamKHR +#define eglDestroyStreamKHR my_eglDestroyStreamKHR +#define eglQueryStreamKHR my_eglQueryStreamKHR +#define eglQueryStreamu64KHR my_eglQueryStreamu64KHR +#define eglQueryStreamTimeKHR my_eglQueryStreamTimeKHR +#define eglStreamAttribKHR my_eglStreamAttribKHR +#define eglStreamConsumerAcquireKHR my_eglStreamConsumerAcquireKHR +#define eglStreamConsumerReleaseKHR my_eglStreamConsumerReleaseKHR +#define eglStreamConsumerGLTextureExternalKHR my_eglStreamConsumerGLTextureExternalKHR +#define eglGetStreamFileDescriptorKHR my_eglGetStreamFileDescriptorKHR +#define eglCreateStreamFromFileDescriptorKHR my_eglCreateStreamFromFileDescriptorKHR +#define eglQueryDevicesEXT my_eglQueryDevicesEXT +#define eglGetPlatformDisplayEXT my_eglGetPlatformDisplayEXT +#define eglQueryDeviceAttribEXT my_eglQueryDeviceAttribEXT -#define EXTLST_DECL(tx, x) tx my_##x = NULL; +#define EXTLST_DECL(tx, x) tx my_##x = NULL; #define EXTLST_EXTERN(tx, x) extern tx 
my_##x; -#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&my_##x, #x}, +#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&my_##x, #x}, #define MAX_STRING_SIZE 256 -#define WIDTH 720 -#define HEIGHT 480 +#define WIDTH 720 +#define HEIGHT 480 -typedef struct _TestArgs { - char *infile1; - char *infile2; - bool isARGB; - unsigned int inputWidth; - unsigned int inputHeight; - bool pitchLinearOutput; +typedef struct _TestArgs +{ + char *infile1; + char *infile2; + bool isARGB; + unsigned int inputWidth; + unsigned int inputHeight; + bool pitchLinearOutput; } TestArgs; -int eglSetupExtensions(void); -int EGLStreamInit(int *dev); +int eglSetupExtensions(void); +int EGLStreamInit(int *dev); void EGLStreamFini(void); #endif diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/main.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/main.cpp index 4b5879f2..f14c88c9 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/main.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/main.cpp @@ -29,7 +29,7 @@ // // -//#define EGL_EGLEXT_PROTOTYPES +// #define EGL_EGLEXT_PROTOTYPES #include "cudaEGL.h" #include "cuda_consumer.h" @@ -46,186 +46,187 @@ EXTENSION_LIST(EXTLST_EXTERN) bool signal_stop = 0; -static void sig_handler(int sig) { - signal_stop = 1; - printf("Signal: %d\n", sig); +static void sig_handler(int sig) +{ + signal_stop = 1; + printf("Signal: %d\n", sig); } -int main(int argc, char **argv) { - TestArgs args; - CUresult curesult = CUDA_SUCCESS; - unsigned int i, j; - EGLint streamState = 0; +int main(int argc, char **argv) +{ + TestArgs args; + CUresult curesult = CUDA_SUCCESS; + unsigned int i, j; + EGLint streamState = 0; - test_cuda_consumer_s cudaConsumer; - test_cuda_producer_s cudaProducer; + test_cuda_consumer_s cudaConsumer; + test_cuda_producer_s cudaProducer; - memset(&cudaProducer, 0, sizeof(test_cuda_producer_s)); - memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s)); + memset(&cudaProducer, 0, sizeof(test_cuda_producer_s)); + memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s)); - // Hook up Ctrl-C handler - signal(SIGINT, sig_handler); - if (!eglSetupExtensions()) { - printf("SetupExtentions failed \n"); - curesult = CUDA_ERROR_UNKNOWN; - goto done; - } - - checkCudaErrors(cuInit(0)); - - int count; - - checkCudaErrors(cuDeviceGetCount(&count)); - printf("Found %d cuda devices\n", count); - - CUdevice devId; - - if (!EGLStreamInit(&devId)) { - printf("EGLStream Init failed.\n"); - curesult = CUDA_ERROR_UNKNOWN; - goto done; - } - curesult = cudaDeviceCreateProducer(&cudaProducer, devId); - if (curesult != CUDA_SUCCESS) { - goto done; - } - curesult = cudaDeviceCreateConsumer(&cudaConsumer, devId); - if (curesult != CUDA_SUCCESS) { - goto done; - } - checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); - if (CUDA_SUCCESS != (curesult = cuEGLStreamConsumerConnect( - &(cudaConsumer.cudaConn), eglStream))) { - printf("FAILED Connect CUDA consumer with error %d\n", curesult); - goto done; - } else { - printf("Connected CUDA consumer, CudaConsumer %p\n", cudaConsumer.cudaConn); - } - checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); - - checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); - if (CUDA_SUCCESS == - (curesult = cuEGLStreamProducerConnect(&(cudaProducer.cudaConn), - eglStream, WIDTH, HEIGHT))) { - printf("Connect CUDA producer Done, CudaProducer %p\n", - cudaProducer.cudaConn); - } else { - printf("Connect CUDA producer FAILED with error %d\n", curesult); - goto done; - } - 
checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); - - // Initialize producer - for (i = 0; i < NUM_TRAILS; i++) { - if (streamState != EGL_STREAM_STATE_CONNECTING_KHR) { - if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, - &streamState)) { - printf("main: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + // Hook up Ctrl-C handler + signal(SIGINT, sig_handler); + if (!eglSetupExtensions()) { + printf("SetupExtensions failed \n"); curesult = CUDA_ERROR_UNKNOWN; goto done; - } } - args.inputWidth = WIDTH; - args.inputHeight = HEIGHT; - if (i % 2 != 0) { - args.isARGB = 1; - args.infile1 = sdkFindFilePath("cuda_f_1.yuv", argv[0]); - args.infile2 = sdkFindFilePath("cuda_f_2.yuv", argv[0]); - } else { - args.isARGB = 0; - args.infile1 = sdkFindFilePath("cuda_yuv_f_1.yuv", argv[0]); - args.infile2 = sdkFindFilePath("cuda_yuv_f_2.yuv", argv[0]); + + checkCudaErrors(cuInit(0)); + + int count; + + checkCudaErrors(cuDeviceGetCount(&count)); + printf("Found %d cuda devices\n", count); + + CUdevice devId; + + if (!EGLStreamInit(&devId)) { + printf("EGLStream Init failed.\n"); + curesult = CUDA_ERROR_UNKNOWN; + goto done; } - if ((i % 4) < 2) { - args.pitchLinearOutput = 1; - } else { - args.pitchLinearOutput = 0; + curesult = cudaDeviceCreateProducer(&cudaProducer, devId); + if (curesult != CUDA_SUCCESS) { + goto done; + } + curesult = cudaDeviceCreateConsumer(&cudaConsumer, devId); + if (curesult != CUDA_SUCCESS) { + goto done; + } + checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); + if (CUDA_SUCCESS != (curesult = cuEGLStreamConsumerConnect(&(cudaConsumer.cudaConn), eglStream))) { + printf("FAILED Connect CUDA consumer with error %d\n", curesult); + goto done; + } + else { + printf("Connected CUDA consumer, CudaConsumer %p\n", cudaConsumer.cudaConn); + } + checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); + + checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); + if (CUDA_SUCCESS == (curesult = cuEGLStreamProducerConnect(&(cudaProducer.cudaConn), eglStream, WIDTH, HEIGHT))) { + printf("Connect CUDA producer Done, CudaProducer %p\n", cudaProducer.cudaConn); + } + else { + printf("Connect CUDA producer FAILED with error %d\n", curesult); + goto done; + } + checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); + + // Initialize producer + for (i = 0; i < NUM_TRAILS; i++) { + if (streamState != EGL_STREAM_STATE_CONNECTING_KHR) { + if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("main: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + curesult = CUDA_ERROR_UNKNOWN; + goto done; + } + } + args.inputWidth = WIDTH; + args.inputHeight = HEIGHT; + if (i % 2 != 0) { + args.isARGB = 1; + args.infile1 = sdkFindFilePath("cuda_f_1.yuv", argv[0]); + args.infile2 = sdkFindFilePath("cuda_f_2.yuv", argv[0]); + } + else { + args.isARGB = 0; + args.infile1 = sdkFindFilePath("cuda_yuv_f_1.yuv", argv[0]); + args.infile2 = sdkFindFilePath("cuda_yuv_f_2.yuv", argv[0]); + } + if ((i % 4) < 2) { + args.pitchLinearOutput = 1; + } + else { + args.pitchLinearOutput = 0; + } + + checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); + cudaProducerInit(&cudaProducer, g_display, eglStream, &args); + checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); + + checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); + cuda_consumer_init(&cudaConsumer, &args); + checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); + + printf("main - Cuda Producer and Consumer Initialized.\n"); + + for (j = 0; j < 2; j++) { + printf("Running for %s
frame and %s input\n", + args.isARGB ? "ARGB" : "YUV", + args.pitchLinearOutput ? "Pitchlinear" : "BlockLinear"); + if (j == 0) { + checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); + curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName1); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer Test failed for frame = %d\n", j + 1); + goto done; + } + checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); + checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); + curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile1); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer Test failed for frame = %d\n", j + 1); + goto done; + } + checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); + } + else { + checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); + curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName2); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Producer Test failed for frame = %d\n", j + 1); + goto done; + } + + checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); + checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); + curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile2); + if (curesult != CUDA_SUCCESS) { + printf("Cuda Consumer Test failed for frame = %d\n", j + 1); + goto done; + } + checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); + } + } } checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); - cudaProducerInit(&cudaProducer, g_display, eglStream, &args); + if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) { + printf("Producer Disconnect FAILED. \n"); + goto done; + } checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); - checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); - cuda_consumer_init(&cudaConsumer, &args); - checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); - - printf("main - Cuda Producer and Consumer Initialized.\n"); - - for (j = 0; j < 2; j++) { - printf("Running for %s frame and %s input\n", - args.isARGB ? "ARGB" : "YUV", - args.pitchLinearOutput ? 
"Pitchlinear" : "BlockLinear"); - if (j == 0) { - checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); - curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName1); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer Test failed for frame = %d\n", j + 1); - goto done; - } - checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); - checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); - curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile1); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer Test failed for frame = %d\n", j + 1); - goto done; - } - checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); - } else { - checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); - curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName2); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Producer Test failed for frame = %d\n", j + 1); - goto done; - } - - checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); - checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context)); - curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile2); - if (curesult != CUDA_SUCCESS) { - printf("Cuda Consumer Test failed for frame = %d\n", j + 1); - goto done; - } - checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context)); - } + if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + curesult = CUDA_ERROR_UNKNOWN; + goto done; } - } - - checkCudaErrors(cuCtxPushCurrent(cudaProducer.context)); - if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) { - printf("Producer Disconnect FAILED. \n"); - goto done; - } - checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context)); - - if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, - &streamState)) { - printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); - curesult = CUDA_ERROR_UNKNOWN; - goto done; - } - if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { - if (CUDA_SUCCESS != (curesult = cuda_consumer_deinit(&cudaConsumer))) { - printf("Consumer Disconnect FAILED.\n"); - goto done; + if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { + if (CUDA_SUCCESS != (curesult = cuda_consumer_deinit(&cudaConsumer))) { + printf("Consumer Disconnect FAILED.\n"); + goto done; + } } - } - printf("Producer and Consumer Disconnected \n"); + printf("Producer and Consumer Disconnected \n"); done: - if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, - &streamState)) { - printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); - curesult = CUDA_ERROR_UNKNOWN; - } - if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { - EGLStreamFini(); - } + if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR, &streamState)) { + printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n"); + curesult = CUDA_ERROR_UNKNOWN; + } + if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) { + EGLStreamFini(); + } - if (curesult == CUDA_SUCCESS) { - printf("&&&& EGLStream interop test PASSED\n"); - } else { - printf("&&&& EGLStream interop test FAILED\n"); - } - return 0; + if (curesult == CUDA_SUCCESS) { + printf("&&&& EGLStream interop test PASSED\n"); + } + else { + printf("&&&& EGLStream interop test FAILED\n"); + } + return 0; } diff --git a/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers.cpp b/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers.cpp index 30d6ce0a..791a9d79 100644 --- 
a/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers.cpp +++ b/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers.cpp @@ -38,15 +38,13 @@ #endif // Includes -#include -#include -#include -#include <helper_functions.h> // helper functions for timing, string parsing - -#include <cuda_runtime.h> // CUDA Runtime #include <cuda_gl_interop.h> // CUDA OpenGL interop - -#include <helper_cuda.h> // includes for CUDA initialization and error checking +#include <cuda_runtime.h> // CUDA Runtime +#include <helper_cuda.h> // includes for CUDA initialization and error checking +#include <helper_functions.h> // helper functions for timing, string parsing +#include +#include +#include #include "FunctionPointers_kernels.h" @@ -70,441 +68,439 @@ void initializeData(char *file); static const char *sSDKsample = "CUDA Function Pointers (SobelFilter)"; -const char *filterMode[] = {"No Filtering", "Sobel Texture", - "Sobel SMEM+Texture", NULL}; +const char *filterMode[] = {"No Filtering", "Sobel Texture", "Sobel SMEM+Texture", NULL}; -static int wWidth = 512; // Window width -static int wHeight = 512; // Window height -static int imWidth = 0; // Image width +static int wWidth = 512; // Window width +static int wHeight = 512; // Window height +static int imWidth = 0; // Image width static int imHeight = 0; // Image height -static int blockOp = 0; -static int pointOp = 1; +static int blockOp = 0; +static int pointOp = 1; // Code to handle Auto verification -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 8; // FPS limit for sampling -unsigned int frameCount = 0; -StopWatchInterface *timer = NULL; -unsigned int g_Bpp; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 8; // FPS limit for sampling +unsigned int frameCount = 0; +StopWatchInterface *timer = NULL; +unsigned int g_Bpp; int g_TotalErrors = 0; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; bool g_bQAReadback = false; // Display Data -static GLuint pbo_buffer = 0; // Front and back CA buffers -struct cudaGraphicsResource - *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) +static GLuint pbo_buffer = 0; // Front and back CA buffers +struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) -static GLuint texid = 0; // Texture for display -unsigned char *pixels = NULL; // Image pixel data on the host -float imageScale = 1.f; // Image exposure +static GLuint texid = 0; // Texture for display +unsigned char *pixels = NULL; // Image pixel data on the host +float imageScale = 1.f; // Image exposure enum SobelDisplayMode g_SobelDisplayMode; -#define OFFSET(i) ((char *)NULL + (i)) -#define MAX(a, b) ((a > b) ? a : b) -#define REFRESH_DELAY 10 // ms +#define OFFSET(i) ((char *)NULL + (i)) +#define MAX(a, b) ((a > b) ?
a : b) +#define REFRESH_DELAY 10 // ms -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "FunctionPointers [CUDA Edge Detection] (%s): %3.1f fps", - filterMode[g_SobelDisplayMode], ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "FunctionPointers [CUDA Edge Detection] (%s): %3.1f fps", filterMode[g_SobelDisplayMode], ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = (int)MAX(ifps, 1.f); - sdkResetTimer(&timer); - } + fpsLimit = (int)MAX(ifps, 1.f); + sdkResetTimer(&timer); + } } // This is the normal display path -void display(void) { - sdkStartTimer(&timer); +void display(void) +{ + sdkStartTimer(&timer); - // Sobel operation - Pixel *data = NULL; + // Sobel operation + Pixel *data = NULL; - // map PBO to get CUDA device pointer - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&data, &num_bytes, cuda_pbo_resource)); - // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); + // map PBO to get CUDA device pointer + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes, cuda_pbo_resource)); + // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); - sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, - pointOp); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - glClear(GL_COLOR_BUFFER_BIT); + glClear(GL_COLOR_BUFFER_BIT); - glBindTexture(GL_TEXTURE_2D, texid); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, - GL_UNSIGNED_BYTE, OFFSET(0)); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + glBindTexture(GL_TEXTURE_2D, texid); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - glDisable(GL_DEPTH_TEST); - glEnable(GL_TEXTURE_2D); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); + glDisable(GL_DEPTH_TEST); + glEnable(GL_TEXTURE_2D); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glBegin(GL_QUADS); - glVertex2f(0, 0); - glTexCoord2f(0, 0); - glVertex2f(0, 1); - glTexCoord2f(1, 0); - glVertex2f(1, 1); - glTexCoord2f(1, 1); - glVertex2f(1, 0); - glTexCoord2f(0, 1); - glEnd(); - glBindTexture(GL_TEXTURE_2D, 0); + glBegin(GL_QUADS); + glVertex2f(0, 0); + glTexCoord2f(0, 0); + glVertex2f(0, 1); + glTexCoord2f(1, 0); + glVertex2f(1, 1); + glTexCoord2f(1, 1); + glVertex2f(1, 0); + 
glTexCoord2f(0, 1); + glEnd(); + glBindTexture(GL_TEXTURE_2D, 0); - glutSwapBuffers(); + glutSwapBuffers(); - sdkStopTimer(&timer); - computeFPS(); + sdkStopTimer(&timer); + computeFPS(); } -void timerEvent(int value) { - if (glutGetWindow()) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } } -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - char temp[256]; +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + char temp[256]; - switch (key) { + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case '-': - imageScale -= 0.1f; - printf("brightness = %4.2f\n", imageScale); - break; + imageScale -= 0.1f; + printf("brightness = %4.2f\n", imageScale); + break; case '=': - imageScale += 0.1f; - printf("brightness = %4.2f\n", imageScale); - break; + imageScale += 0.1f; + printf("brightness = %4.2f\n", imageScale); + break; case 'i': case 'I': - g_SobelDisplayMode = SOBELDISPLAY_IMAGE; - sprintf(temp, "Function Pointers [CUDA Edge Detection] (%s)", - filterMode[g_SobelDisplayMode]); - glutSetWindowTitle(temp); - break; + g_SobelDisplayMode = SOBELDISPLAY_IMAGE; + sprintf(temp, "Function Pointers [CUDA Edge Detection] (%s)", filterMode[g_SobelDisplayMode]); + glutSetWindowTitle(temp); + break; case 's': case 'S': - g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; - sprintf(temp, "Function Pointers [CUDA Edge Detection] (%s)", - filterMode[g_SobelDisplayMode]); - glutSetWindowTitle(temp); - break; + g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; + sprintf(temp, "Function Pointers [CUDA Edge Detection] (%s)", filterMode[g_SobelDisplayMode]); + glutSetWindowTitle(temp); + break; case 't': case 'T': - g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; - sprintf(temp, "Function Pointers [CUDA Edge Detection] (%s)", - filterMode[g_SobelDisplayMode]); - glutSetWindowTitle(temp); - break; + g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; + sprintf(temp, "Function Pointers [CUDA Edge Detection] (%s)", filterMode[g_SobelDisplayMode]); + glutSetWindowTitle(temp); + break; case 'b': case 'B': - blockOp = (blockOp + 1) % LAST_BLOCK_FILTER; - break; + blockOp = (blockOp + 1) % LAST_BLOCK_FILTER; + break; case 'p': case 'P': - pointOp = (pointOp + 1) % LAST_POINT_FILTER; - break; + pointOp = (pointOp + 1) % LAST_POINT_FILTER; + break; default: - break; - } + break; + } } -void reshape(int x, int y) { - glViewport(0, 0, x, y); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0, 1, 0, 1, 0, 1); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0, 1, 0, 1, 0, 1); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); } -void cleanup(void) { - cudaGraphicsUnregisterResource(cuda_pbo_resource); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - glDeleteBuffers(1, &pbo_buffer); - glDeleteTextures(1, &texid); - deleteTexture(); - - sdkDeleteTimer(&timer); -} - -void initializeData(char *file) { - GLint bsize; - unsigned int w, h; - size_t file_length = strlen(file); - - if (!strcmp(&file[file_length - 3], "pgm")) { - if (sdkLoadPGM(file, &pixels, &w, &h) != true) { - printf("Failed to load PGM image file: %s\n", file); - exit(EXIT_FAILURE); - } - - g_Bpp = 1; - 
} else if (!strcmp(&file[file_length - 3], "ppm")) { - if (sdkLoadPPM4(file, &pixels, &w, &h) != true) { - printf("Failed to load PPM image file: %s\n", file); - exit(EXIT_FAILURE); - } - - g_Bpp = 4; - } else { - exit(EXIT_FAILURE); - } - - imWidth = (int)w; - imHeight = (int)h; - setupTexture(imWidth, imHeight, pixels, g_Bpp); - - // copy function pointer tables to host side for later use - setupFunctionTables(); - - memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); - - if (!g_bQAReadback) { - // use OpenGL Path - glGenBuffers(1, &pbo_buffer); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); - glBufferData(GL_PIXEL_UNPACK_BUFFER, - g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, - GL_STREAM_DRAW); - - glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); - - if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { - printf("Buffer object (%d) has incorrect size (%d).\n", - (unsigned)pbo_buffer, (unsigned)bsize); - exit(EXIT_FAILURE); - } +void cleanup(void) +{ + cudaGraphicsUnregisterResource(cuda_pbo_resource); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + glDeleteBuffers(1, &pbo_buffer); + glDeleteTextures(1, &texid); + deleteTexture(); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); - - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp == 1) ? GL_LUMINANCE : GL_BGRA), - imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL); - glBindTexture(GL_TEXTURE_2D, 0); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glPixelStorei(GL_PACK_ALIGNMENT, 1); - } + sdkDeleteTimer(&timer); } -void loadDefaultImage(char *loc_exec) { - printf("Reading image: teapot512.pgm\n"); - const char *image_filename = "teapot512.pgm"; - char *image_path = sdkFindFilePath(image_filename, loc_exec); +void initializeData(char *file) +{ + GLint bsize; + unsigned int w, h; + size_t file_length = strlen(file); - if (image_path == NULL) { - printf("Failed to read image file: <%s>\n", image_filename); - exit(EXIT_FAILURE); - } + if (!strcmp(&file[file_length - 3], "pgm")) { + if (sdkLoadPGM(file, &pixels, &w, &h) != true) { + printf("Failed to load PGM image file: %s\n", file); + exit(EXIT_FAILURE); + } - initializeData(image_path); - free(image_path); + g_Bpp = 1; + } + else if (!strcmp(&file[file_length - 3], "ppm")) { + if (sdkLoadPPM4(file, &pixels, &w, &h) != true) { + printf("Failed to load PPM image file: %s\n", file); + exit(EXIT_FAILURE); + } + + g_Bpp = 4; + } + else { + exit(EXIT_FAILURE); + } + + imWidth = (int)w; + imHeight = (int)h; + setupTexture(imWidth, imHeight, pixels, g_Bpp); + + // copy function pointer tables to host side for later use + setupFunctionTables(); + + memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); + + if (!g_bQAReadback) { + // use OpenGL Path + glGenBuffers(1, &pbo_buffer); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); + glBufferData(GL_PIXEL_UNPACK_BUFFER, g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, GL_STREAM_DRAW); + + glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); + + if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { + printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, (unsigned)bsize); + exit(EXIT_FAILURE); + } + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, 
pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); + + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, + 0, + ((g_Bpp == 1) ? GL_LUMINANCE : GL_BGRA), + imWidth, + imHeight, + 0, + GL_LUMINANCE, + GL_UNSIGNED_BYTE, + NULL); + glBindTexture(GL_TEXTURE_2D, 0); + + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glPixelStorei(GL_PACK_ALIGNMENT, 1); + } } -void initGL(int *argc, char **argv) { - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(wWidth, wHeight); - glutCreateWindow("Function Pointers [CUDA Edge Detection]n"); +void loadDefaultImage(char *loc_exec) +{ + printf("Reading image: teapot512.pgm\n"); + const char *image_filename = "teapot512.pgm"; + char *image_path = sdkFindFilePath(image_filename, loc_exec); - if (!isGLVersionSupported(1, 5) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); - fprintf(stderr, "This sample requires:\n"); - fprintf(stderr, " OpenGL version 1.5\n"); - fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); - fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); + if (image_path == NULL) { + printf("Failed to read image file: <%s>\n", image_filename); + exit(EXIT_FAILURE); + } - exit(EXIT_WAIVED); - } + initializeData(image_path); + free(image_path); } -void runAutoTest(int argc, char *argv[]) { - printf("[%s] (automated testing w/ readback)\n", sSDKsample); - int devID = findCudaDevice(argc, (const char **)argv); +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(wWidth, wHeight); + glutCreateWindow("Function Pointers [CUDA Edge Detection]n"); - loadDefaultImage(argv[0]); + if (!isGLVersionSupported(1, 5) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); + fprintf(stderr, "This sample requires:\n"); + fprintf(stderr, " OpenGL version 1.5\n"); + fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); + fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); - Pixel *d_result; - checkCudaErrors( - cudaMalloc((void **)&d_result, imWidth * imHeight * sizeof(Pixel))); - - char *ref_file = NULL; - char dump_file[256]; - - int mode = 0; - mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - - switch (mode) { - case 0: - g_SobelDisplayMode = SOBELDISPLAY_IMAGE; - sprintf(dump_file, "teapot512_orig.pgm"); - break; - - case 1: - g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; - sprintf(dump_file, "teapot512_tex.pgm"); - break; - - case 2: - g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; - sprintf(dump_file, "teapot512_shared.pgm"); - break; - - default: - printf("Invalid Filter Mode File\n"); - exit(EXIT_FAILURE); - break; - } - - printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); - sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, - blockOp, pointOp); - checkCudaErrors(cudaDeviceSynchronize()); - - unsigned char *h_result = - (unsigned char *)malloc(imWidth * imHeight * sizeof(Pixel)); - checkCudaErrors(cudaMemcpy(h_result, d_result, - imWidth * imHeight * sizeof(Pixel), - cudaMemcpyDeviceToHost)); - sdkSavePGM(dump_file, h_result, imWidth, imHeight); - - if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), - MAX_EPSILON_ERROR, 0.15f, false)) { - 
g_TotalErrors++; - } - - checkCudaErrors(cudaFree(d_result)); - free(h_result); - - if (g_TotalErrors != 0) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } - - printf("Test passed!\n"); - exit(EXIT_SUCCESS); + exit(EXIT_WAIVED); + } } -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; - -#if defined(__linux__) - setenv("DISPLAY", ":0", 0); -#endif - - printf("%s Starting...\n\n", argv[0]); - - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("\nUsage: FunctionPointers (SobelFilter) \n"); - printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); - printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); - - exit(EXIT_WAIVED); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - g_bQAReadback = true; - runAutoTest(argc, argv); - } - - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf( - " This SDK does not explicitly support -device=n when running with " - "OpenGL.\n"); - printf( - " When specifying -device=n (n=0,1,2,....) the sample must not use " - "OpenGL.\n"); - printf(" See details below to run without OpenGL:\n\n"); - printf(" > %s -device=n\n\n", argv[0]); - printf("exiting...\n"); - - exit(EXIT_WAIVED); - } - - if (!g_bQAReadback) { - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with - // OpenGL/CUDA interop. - initGL(&argc, argv); - - int dev = findCudaDevice(argc, (const char **)argv); - - sdkCreateTimer(&timer); - sdkResetTimer(&timer); - - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); +void runAutoTest(int argc, char *argv[]) +{ + printf("[%s] (automated testing w/ readback)\n", sSDKsample); + int devID = findCudaDevice(argc, (const char **)argv); loadDefaultImage(argv[0]); - // If code is not printing the USage, then we execute this path. 
- printf("I: display Image (no filtering)\n"); - printf("T: display Sobel Edge Detection (Using Texture)\n"); - printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); - printf("Use the '-' and '=' keys to change the brightness.\n"); - printf("b: switch block filter operation (Mean/Sobel)\n"); - printf("p: switch point filter operation (Threshold ON/OFF)\n"); - fflush(stdout); + Pixel *d_result; + checkCudaErrors(cudaMalloc((void **)&d_result, imWidth * imHeight * sizeof(Pixel))); -#if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); -#else - glutCloseFunc(cleanup); + char *ref_file = NULL; + char dump_file[256]; + + int mode = 0; + mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + + switch (mode) { + case 0: + g_SobelDisplayMode = SOBELDISPLAY_IMAGE; + sprintf(dump_file, "teapot512_orig.pgm"); + break; + + case 1: + g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; + sprintf(dump_file, "teapot512_tex.pgm"); + break; + + case 2: + g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; + sprintf(dump_file, "teapot512_shared.pgm"); + break; + + default: + printf("Invalid Filter Mode File\n"); + exit(EXIT_FAILURE); + break; + } + + printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); + sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp); + checkCudaErrors(cudaDeviceSynchronize()); + + unsigned char *h_result = (unsigned char *)malloc(imWidth * imHeight * sizeof(Pixel)); + checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth * imHeight * sizeof(Pixel), cudaMemcpyDeviceToHost)); + sdkSavePGM(dump_file, h_result, imWidth, imHeight); + + if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { + g_TotalErrors++; + } + + checkCudaErrors(cudaFree(d_result)); + free(h_result); + + if (g_TotalErrors != 0) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed!\n"); + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; + +#if defined(__linux__) + setenv("DISPLAY", ":0", 0); #endif - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - glutMainLoop(); - } + printf("%s Starting...\n\n", argv[0]); + + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("\nUsage: FunctionPointers (SobelFilter) \n"); + printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); + printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); + + exit(EXIT_WAIVED); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + g_bQAReadback = true; + runAutoTest(argc, argv); + } + + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf(" This SDK does not explicitly support -device=n when running with " + "OpenGL.\n"); + printf(" When specifying -device=n (n=0,1,2,....) the sample must not use " + "OpenGL.\n"); + printf(" See details below to run without OpenGL:\n\n"); + printf(" > %s -device=n\n\n", argv[0]); + printf("exiting...\n"); + + exit(EXIT_WAIVED); + } + + if (!g_bQAReadback) { + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with + // OpenGL/CUDA interop. 
+ initGL(&argc, argv); + + int dev = findCudaDevice(argc, (const char **)argv); + + sdkCreateTimer(&timer); + sdkResetTimer(&timer); + + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + + loadDefaultImage(argv[0]); + + // If code is not printing the USage, then we execute this path. + printf("I: display Image (no filtering)\n"); + printf("T: display Sobel Edge Detection (Using Texture)\n"); + printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); + printf("Use the '-' and '=' keys to change the brightness.\n"); + printf("b: switch block filter operation (Mean/Sobel)\n"); + printf("p: switch point filter operation (Threshold ON/OFF)\n"); + fflush(stdout); + +#if defined(__APPLE__) || defined(MACOSX) + atexit(cleanup); +#else + glutCloseFunc(cleanup); +#endif + + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + glutMainLoop(); + } } diff --git a/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.cu b/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.cu index a2549658..9ed2a3d7 100644 --- a/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.cu +++ b/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.cu @@ -25,9 +25,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include #include -#include namespace cg = cooperative_groups; @@ -36,9 +36,9 @@ namespace cg = cooperative_groups; #include "FunctionPointers_kernels.h" // Texture object for reading image -cudaTextureObject_t tex; +cudaTextureObject_t tex; extern __shared__ unsigned char LocalBlock[]; -static cudaArray *array = NULL; +static cudaArray *array = NULL; #define RADIUS 1 @@ -47,7 +47,7 @@ static cudaArray *array = NULL; #define THRESHOLD 150.0f #ifdef FIXED_BLOCKWIDTH -#define BlockWidth 80 +#define BlockWidth 80 #define SharedPitch 384 #endif @@ -55,59 +55,73 @@ static cudaArray *array = NULL; //__device__ unsigned char (*pointFunction)(unsigned char, float ) = NULL; // or by using typedef's like below: -typedef unsigned char (*blockFunction_t)(unsigned char, unsigned char, - unsigned char, unsigned char, - unsigned char, unsigned char, - unsigned char, unsigned char, - unsigned char, float); +typedef unsigned char (*blockFunction_t)(unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + float); typedef unsigned char (*pointFunction_t)(unsigned char, float); __device__ blockFunction_t blockFunction; -__device__ unsigned char ComputeSobel(unsigned char ul, // upper left - unsigned char um, // upper middle - unsigned char ur, // upper right - unsigned char ml, // middle left - unsigned char mm, // middle (unused) - unsigned char mr, // middle right - unsigned char ll, // lower left - unsigned char lm, // lower middle - unsigned char lr, // lower right - float fScale) { - short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll; - short Vert = ul + 2 * um + ur - ll - 2 * lm - lr; - short Sum = (short)(fScale * (abs((int)Horz) + abs((int)Vert))); - return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 
255 : Sum)); +__device__ unsigned char ComputeSobel(unsigned char ul, // upper left + unsigned char um, // upper middle + unsigned char ur, // upper right + unsigned char ml, // middle left + unsigned char mm, // middle (unused) + unsigned char mr, // middle right + unsigned char ll, // lower left + unsigned char lm, // lower middle + unsigned char lr, // lower right + float fScale) +{ + short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll; + short Vert = ul + 2 * um + ur - ll - 2 * lm - lr; + short Sum = (short)(fScale * (abs((int)Horz) + abs((int)Vert))); + return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)); } // define a function pointer and initialize to NULL -__device__ unsigned char (*varFunction)(unsigned char, unsigned char, - unsigned char, unsigned char, - unsigned char, unsigned char, - unsigned char, unsigned char, - unsigned char, float x) = NULL; +__device__ unsigned char (*varFunction)(unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + unsigned char, + float x) = NULL; -__device__ unsigned char ComputeBox(unsigned char ul, // upper left - unsigned char um, // upper middle - unsigned char ur, // upper right - unsigned char ml, // middle left - unsigned char mm, // middle...middle - unsigned char mr, // middle right - unsigned char ll, // lower left - unsigned char lm, // lower middle - unsigned char lr, // lower right - float fscale) { - short Sum = (short)(ul + um + ur + ml + mm + mr + ll + lm + lr) / 9; - Sum *= fscale; - return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)); +__device__ unsigned char ComputeBox(unsigned char ul, // upper left + unsigned char um, // upper middle + unsigned char ur, // upper right + unsigned char ml, // middle left + unsigned char mm, // middle...middle + unsigned char mr, // middle right + unsigned char ll, // lower left + unsigned char lm, // lower middle + unsigned char lr, // lower right + float fscale) +{ + short Sum = (short)(ul + um + ur + ml + mm + mr + ll + lm + lr) / 9; + Sum *= fscale; + return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)); } -__device__ unsigned char Threshold(unsigned char in, float thresh) { - if (in > thresh) { - return 0xFF; - } else { - return 0; - } +__device__ unsigned char Threshold(unsigned char in, float thresh) +{ + if (in > thresh) { + return 0xFF; + } + else { + return 0; + } } // Declare function tables, one for the point function chosen, one for the @@ -119,8 +133,8 @@ __device__ pointFunction_t pointFunction_table[LAST_POINT_FILTER]; // Declare device side function pointers. We retrieve them later with // cudaMemcpyFromSymbol to set our function tables above in some // particular order specified at runtime. -__device__ blockFunction_t pComputeSobel = ComputeSobel; -__device__ blockFunction_t pComputeBox = ComputeBox; +__device__ blockFunction_t pComputeSobel = ComputeSobel; +__device__ blockFunction_t pComputeBox = ComputeBox; __device__ pointFunction_t pComputeThreshold = Threshold; // Allocate host side tables to mirror the device side, and later, we @@ -138,222 +152,210 @@ pointFunction_t h_pointFunction_table[2]; // Following the block operation, a per-pixel operation, // pointed to by pPointFunction is performed before the final // pixel is produced. 
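Note: the declarations above take the address of a __device__ function on the device side (host code cannot take it directly), then shuttle it through cudaMemcpyFromSymbol/cudaMemcpyToSymbol so the host can order the table the kernel indexes. A self-contained sketch of that round trip, under hypothetical names (doubleIt, opTable) that are not part of this patch:

    #include <cstdio>
    #include <cuda_runtime.h>

    typedef int (*op_t)(int);

    __device__ int doubleIt(int x) { return 2 * x; }

    // Device-side copy of the function's address, and the table the kernel
    // indexes at run time.
    __device__ op_t pDoubleIt = doubleIt;
    __device__ op_t opTable[1];

    __global__ void apply(int *v, int op) { *v = opTable[op](*v); }

    int main()
    {
        op_t h_table[1];
        cudaMemcpyFromSymbol(&h_table[0], pDoubleIt, sizeof(op_t)); // device -> host
        cudaMemcpyToSymbol(opTable, h_table, sizeof(op_t));         // host -> device

        int h_v = 21, *d_v;
        cudaMalloc(&d_v, sizeof(int));
        cudaMemcpy(d_v, &h_v, sizeof(int), cudaMemcpyHostToDevice);
        apply<<<1, 1>>>(d_v, 0);
        cudaMemcpy(&h_v, d_v, sizeof(int), cudaMemcpyDeviceToHost);
        printf("%d\n", h_v); // prints 42
        cudaFree(d_v);
        return 0;
    }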
-__global__ void SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch, +__global__ void SobelShared(uchar4 *pSobelOriginal, + unsigned short SobelPitch, #ifndef FIXED_BLOCKWIDTH - short BlockWidth, short SharedPitch, + short BlockWidth, + short SharedPitch, #endif - short w, short h, float fScale, int blockOperation, - pointFunction_t pPointFunction, - cudaTextureObject_t tex) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - short u = 4 * blockIdx.x * BlockWidth; - short v = blockIdx.y * blockDim.y + threadIdx.y; - short ib; + short w, + short h, + float fScale, + int blockOperation, + pointFunction_t pPointFunction, + cudaTextureObject_t tex) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + short u = 4 * blockIdx.x * BlockWidth; + short v = blockIdx.y * blockDim.y + threadIdx.y; + short ib; - int SharedIdx = threadIdx.y * SharedPitch; - - for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) { - LocalBlock[SharedIdx + 4 * ib + 0] = tex2D( - tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 1] = tex2D( - tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 2] = tex2D( - tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 3] = tex2D( - tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v - RADIUS)); - } - - if (threadIdx.y < RADIUS * 2) { - // - // copy trailing RADIUS*2 rows of pixels into shared - // - SharedIdx = (blockDim.y + threadIdx.y) * SharedPitch; + int SharedIdx = threadIdx.y * SharedPitch; for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) { - LocalBlock[SharedIdx + 4 * ib + 0] = - tex2D(tex, (float)(u + 4 * ib - RADIUS + 0), - (float)(v + blockDim.y - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 1] = - tex2D(tex, (float)(u + 4 * ib - RADIUS + 1), - (float)(v + blockDim.y - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 2] = - tex2D(tex, (float)(u + 4 * ib - RADIUS + 2), - (float)(v + blockDim.y - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 3] = - tex2D(tex, (float)(u + 4 * ib - RADIUS + 3), - (float)(v + blockDim.y - RADIUS)); - } - } - - cg::sync(cta); - - u >>= 2; // index as uchar4 from here - uchar4 *pSobel = (uchar4 *)(((char *)pSobelOriginal) + v * SobelPitch); - SharedIdx = threadIdx.y * SharedPitch; - - blockFunction = blockFunction_table[blockOperation]; - - for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) { - uchar4 out; - - unsigned char pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 0]; - unsigned char pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 1]; - unsigned char pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 2]; - unsigned char pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 0]; - unsigned char pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 1]; - unsigned char pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 2]; - unsigned char pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 0]; - unsigned char pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 1]; - unsigned char pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 2]; - - out.x = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20, - pix21, pix22, fScale); - - pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 3]; - pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 3]; - pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 3]; - out.y = 
(*blockFunction)(pix01, pix02, pix00, pix11, pix12, pix10, pix21, - pix22, pix20, fScale); - - pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 4]; - pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 4]; - pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 4]; - out.z = (*blockFunction)(pix02, pix00, pix01, pix12, pix10, pix11, pix22, - pix20, pix21, fScale); - - pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 5]; - pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 5]; - pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 5]; - out.w = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20, - pix21, pix22, fScale); - - if (pPointFunction != NULL) { - out.x = (*pPointFunction)(out.x, THRESHOLD); - out.y = (*pPointFunction)(out.y, THRESHOLD); - out.z = (*pPointFunction)(out.z, THRESHOLD); - out.w = (*pPointFunction)(out.w, THRESHOLD); + LocalBlock[SharedIdx + 4 * ib + 0] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 1] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 2] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 3] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v - RADIUS)); } - if (u + ib < w / 4 && v < h) { - pSobel[u + ib] = out; - } - } + if (threadIdx.y < RADIUS * 2) { + // + // copy trailing RADIUS*2 rows of pixels into shared + // + SharedIdx = (blockDim.y + threadIdx.y) * SharedPitch; - cg::sync(cta); + for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) { + LocalBlock[SharedIdx + 4 * ib + 0] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 1] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 2] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 3] = + tex2D(tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v + blockDim.y - RADIUS)); + } + } + + cg::sync(cta); + + u >>= 2; // index as uchar4 from here + uchar4 *pSobel = (uchar4 *)(((char *)pSobelOriginal) + v * SobelPitch); + SharedIdx = threadIdx.y * SharedPitch; + + blockFunction = blockFunction_table[blockOperation]; + + for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) { + uchar4 out; + + unsigned char pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 0]; + unsigned char pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 1]; + unsigned char pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 2]; + unsigned char pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 0]; + unsigned char pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 1]; + unsigned char pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 2]; + unsigned char pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 0]; + unsigned char pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 1]; + unsigned char pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 2]; + + out.x = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); + + pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 3]; + pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 3]; + pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 3]; + out.y = (*blockFunction)(pix01, pix02, pix00, pix11, pix12, pix10, pix21, pix22, pix20, fScale); + + 
pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 4]; + pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 4]; + pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 4]; + out.z = (*blockFunction)(pix02, pix00, pix01, pix12, pix10, pix11, pix22, pix20, pix21, fScale); + + pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 5]; + pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 5]; + pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 5]; + out.w = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); + + if (pPointFunction != NULL) { + out.x = (*pPointFunction)(out.x, THRESHOLD); + out.y = (*pPointFunction)(out.y, THRESHOLD); + out.z = (*pPointFunction)(out.z, THRESHOLD); + out.w = (*pPointFunction)(out.w, THRESHOLD); + } + + if (u + ib < w / 4 && v < h) { + pSobel[u + ib] = out; + } + } + + cg::sync(cta); } -__global__ void SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w, - int h, float fscale, cudaTextureObject_t tex) { - unsigned char *pSobel = - (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); +__global__ void +SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w, int h, float fscale, cudaTextureObject_t tex) +{ + unsigned char *pSobel = (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); - for (int i = threadIdx.x; i < w; i += blockDim.x) { - pSobel[i] = min( - max((tex2D(tex, (float)i, (float)blockIdx.x) * fscale), - 0.f), - 255.f); - } + for (int i = threadIdx.x; i < w; i += blockDim.x) { + pSobel[i] = min(max((tex2D(tex, (float)i, (float)blockIdx.x) * fscale), 0.f), 255.f); + } } // Perform block and pointer filtering using texture lookups. // The block and point operations are determined by the // input argument (see comment above for "SobelShared" function) -__global__ void SobelTex(Pixel *pSobelOriginal, unsigned int Pitch, int w, - int h, float fScale, int blockOperation, - pointFunction_t pPointOperation, - cudaTextureObject_t tex) { - unsigned char *pSobel = - (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); - unsigned char tmp = 0; +__global__ void SobelTex(Pixel *pSobelOriginal, + unsigned int Pitch, + int w, + int h, + float fScale, + int blockOperation, + pointFunction_t pPointOperation, + cudaTextureObject_t tex) +{ + unsigned char *pSobel = (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); + unsigned char tmp = 0; - for (int i = threadIdx.x; i < w; i += blockDim.x) { - unsigned char pix00 = - tex2D(tex, (float)i - 1, (float)blockIdx.x - 1); - unsigned char pix01 = - tex2D(tex, (float)i + 0, (float)blockIdx.x - 1); - unsigned char pix02 = - tex2D(tex, (float)i + 1, (float)blockIdx.x - 1); - unsigned char pix10 = - tex2D(tex, (float)i - 1, (float)blockIdx.x + 0); - unsigned char pix11 = - tex2D(tex, (float)i + 0, (float)blockIdx.x + 0); - unsigned char pix12 = - tex2D(tex, (float)i + 1, (float)blockIdx.x + 0); - unsigned char pix20 = - tex2D(tex, (float)i - 1, (float)blockIdx.x + 1); - unsigned char pix21 = - tex2D(tex, (float)i + 0, (float)blockIdx.x + 1); - unsigned char pix22 = - tex2D(tex, (float)i + 1, (float)blockIdx.x + 1); - tmp = (*(blockFunction_table[blockOperation]))( - pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); + for (int i = threadIdx.x; i < w; i += blockDim.x) { + unsigned char pix00 = tex2D(tex, (float)i - 1, (float)blockIdx.x - 1); + unsigned char pix01 = tex2D(tex, (float)i + 0, (float)blockIdx.x - 1); + unsigned char pix02 = tex2D(tex, (float)i + 1, 
(float)blockIdx.x - 1); + unsigned char pix10 = tex2D(tex, (float)i - 1, (float)blockIdx.x + 0); + unsigned char pix11 = tex2D(tex, (float)i + 0, (float)blockIdx.x + 0); + unsigned char pix12 = tex2D(tex, (float)i + 1, (float)blockIdx.x + 0); + unsigned char pix20 = tex2D(tex, (float)i - 1, (float)blockIdx.x + 1); + unsigned char pix21 = tex2D(tex, (float)i + 0, (float)blockIdx.x + 1); + unsigned char pix22 = tex2D(tex, (float)i + 1, (float)blockIdx.x + 1); + tmp = (*(blockFunction_table[blockOperation]))( + pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); - if (pPointOperation != NULL) { - tmp = (*pPointOperation)(tmp, 150.0); + if (pPointOperation != NULL) { + tmp = (*pPointOperation)(tmp, 150.0); + } + + pSobel[i] = tmp; + } +} + +extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) +{ + cudaChannelFormatDesc desc; + + if (Bpp == 1) { + desc = cudaCreateChannelDesc(); + } + else { + desc = cudaCreateChannelDesc(); } - pSobel[i] = tmp; - } + checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih)); + checkCudaErrors(cudaMemcpy2DToArray( + array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih, cudaMemcpyHostToDevice)); + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = array; + + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + + checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); } -extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) { - cudaChannelFormatDesc desc; - - if (Bpp == 1) { - desc = cudaCreateChannelDesc(); - } else { - desc = cudaCreateChannelDesc(); - } - - checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih)); - checkCudaErrors(cudaMemcpy2DToArray( - array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih, - cudaMemcpyHostToDevice)); - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = array; - - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); -} - -extern "C" void deleteTexture(void) { - checkCudaErrors(cudaFreeArray(array)); - checkCudaErrors(cudaDestroyTextureObject(tex)); +extern "C" void deleteTexture(void) +{ + checkCudaErrors(cudaFreeArray(array)); + checkCudaErrors(cudaDestroyTextureObject(tex)); } // Copy the pointers from the function tables to the host side -void setupFunctionTables() { - // Dynamically assign the function table. - // Copy the function pointers to their appropriate locations according to the - // enum - checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER], - pComputeSobel, sizeof(blockFunction_t))); - checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER], - pComputeBox, sizeof(blockFunction_t))); +void setupFunctionTables() +{ + // Dynamically assign the function table. 
+ // Copy the function pointers to their appropriate locations according to the + // enum + checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER], pComputeSobel, sizeof(blockFunction_t))); + checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER], pComputeBox, sizeof(blockFunction_t))); - // do the same for the point function, where the 2nd function is NULL ("no-op" - // filter, skipped in kernel code) - checkCudaErrors(cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER], - pComputeThreshold, - sizeof(pointFunction_t))); - h_pointFunction_table[NULL_FILTER] = NULL; + // do the same for the point function, where the 2nd function is NULL ("no-op" + // filter, skipped in kernel code) + checkCudaErrors( + cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER], pComputeThreshold, sizeof(pointFunction_t))); + h_pointFunction_table[NULL_FILTER] = NULL; - // now copy the function tables back to the device, so if we wish we can use - // an index into the table to choose them - // We have now set the order in the function table according to our enum. - checkCudaErrors( - cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table, - sizeof(blockFunction_t) * LAST_BLOCK_FILTER)); - checkCudaErrors( - cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table, - sizeof(pointFunction_t) * LAST_POINT_FILTER)); + // now copy the function tables back to the device, so if we wish we can use + // an index into the table to choose them + // We have now set the order in the function table according to our enum. + checkCudaErrors( + cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table, sizeof(blockFunction_t) * LAST_BLOCK_FILTER)); + checkCudaErrors( + cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table, sizeof(pointFunction_t) * LAST_POINT_FILTER)); } // Wrapper for the __global__ call that sets up the texture and threads @@ -363,40 +365,50 @@ void setupFunctionTables() { // blockFunction_table on the device side // pPointOp is itself a function pointer passed as a kernel argument, retrieved // from a host side copy of the function table -extern "C" void sobelFilter(Pixel *odata, int iw, int ih, - enum SobelDisplayMode mode, float fScale, - int blockOperation, int pointOperation) { - pointFunction_t pPointOp = h_pointFunction_table[pointOperation]; +extern "C" void sobelFilter(Pixel *odata, + int iw, + int ih, + enum SobelDisplayMode mode, + float fScale, + int blockOperation, + int pointOperation) +{ + pointFunction_t pPointOp = h_pointFunction_table[pointOperation]; - switch (mode) { + switch (mode) { case SOBELDISPLAY_IMAGE: - SobelCopyImage<<>>(odata, iw, iw, ih, fScale, tex); - break; + SobelCopyImage<<>>(odata, iw, iw, ih, fScale, tex); + break; case SOBELDISPLAY_SOBELTEX: - SobelTex<<>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp, - tex); - break; + SobelTex<<>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp, tex); + break; case SOBELDISPLAY_SOBELSHARED: { - dim3 threads(16, 4); + dim3 threads(16, 4); #ifndef FIXED_BLOCKWIDTH - int BlockWidth = 80; // must be divisible by 16 for coalescing + int BlockWidth = 80; // must be divisible by 16 for coalescing #endif - dim3 blocks = dim3(iw / (4 * BlockWidth) + (0 != iw % (4 * BlockWidth)), - ih / threads.y + (0 != ih % threads.y)); - int SharedPitch = ~0x3f & (4 * (BlockWidth + 2 * RADIUS) + 0x3f); - int sharedMem = SharedPitch * (threads.y + 2 * RADIUS); + dim3 blocks = + dim3(iw / (4 * BlockWidth) + (0 != iw % (4 * BlockWidth)), ih / threads.y + (0 != ih % 
threads.y)); + int SharedPitch = ~0x3f & (4 * (BlockWidth + 2 * RADIUS) + 0x3f); + int sharedMem = SharedPitch * (threads.y + 2 * RADIUS); - // for the shared kernel, width must be divisible by 4 - iw &= ~3; + // for the shared kernel, width must be divisible by 4 + iw &= ~3; - SobelShared<<>>( - (uchar4 *)odata, iw, + SobelShared<<>>((uchar4 *)odata, + iw, #ifndef FIXED_BLOCKWIDTH - BlockWidth, SharedPitch, + BlockWidth, + SharedPitch, #endif - iw, ih, fScale, blockOperation, pPointOp, tex); + iw, + ih, + fScale, + blockOperation, + pPointOp, + tex); } break; - } + } } diff --git a/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.h b/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.h index a5f4f824..ac4c8c37 100644 --- a/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.h +++ b/Samples/2_Concepts_and_Techniques/FunctionPointers/FunctionPointers_kernels.h @@ -31,11 +31,7 @@ typedef unsigned char Pixel; // global determines which filter to invoke -enum SobelDisplayMode { - SOBELDISPLAY_IMAGE = 0, - SOBELDISPLAY_SOBELTEX, - SOBELDISPLAY_SOBELSHARED -}; +enum SobelDisplayMode { SOBELDISPLAY_IMAGE = 0, SOBELDISPLAY_SOBELTEX, SOBELDISPLAY_SOBELSHARED }; // Enums to set up the function table // note: if you change these be sure to recompile those files @@ -47,12 +43,16 @@ enum BLOCK_ENUM { THRESHOLD_FILTER = 0, NULL_FILTER, LAST_BLOCK_FILTER }; extern enum SobelDisplayMode g_SobelDisplayMode; -extern "C" void sobelFilter(Pixel *odata, int iw, int ih, - enum SobelDisplayMode mode, float fScale, - int blockOperation, int pointOperation); +extern "C" void sobelFilter(Pixel *odata, + int iw, + int ih, + enum SobelDisplayMode mode, + float fScale, + int blockOperation, + int pointOperation); extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp); extern "C" void deleteTexture(void); extern "C" void initFilter(void); -void setupFunctionTables(); +void setupFunctionTables(); #endif diff --git a/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md b/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md index 76ecfefe..0dc3081c 100644 --- a/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md +++ b/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/cudasharedmem.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/cudasharedmem.h index 5c9bab98..4627d72c 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/cudasharedmem.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/cudasharedmem.h @@ -65,25 +65,27 @@ // This is the un-specialized struct. Note that we prevent instantiation of // this struct by making it abstract (i.e. with pure virtual methods). 
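Note: the cudasharedmem.h hunk below reworks the SharedMemory template. The pattern exists because two extern __shared__ arrays of different types would alias the same dynamic shared-memory allocation, so the generic template is left abstract and each supported type gets a specialization whose buffer has a unique name. A float-only sketch of the same idiom (illustrative, not part of this patch):

    // Generic form is abstract so only the specializations instantiate.
    template <typename T> struct SharedMemory
    {
        virtual __device__ T &operator*()       = 0;
        virtual __device__ T &operator[](int i) = 0;
    };

    // Per-type specialization: the dynamic shared buffer gets a unique name.
    template <> struct SharedMemory<float>
    {
        __device__ float &operator*()
        {
            extern __shared__ float s_float[];
            return *s_float;
        }
        __device__ float &operator[](int i)
        {
            extern __shared__ float s_float[];
            return s_float[i];
        }
    };

    // Usage inside a kernel launched with dynamic shared memory:
    __global__ void scaleKernel(float *data, float k)
    {
        SharedMemory<float> smem;
        smem[threadIdx.x] = data[threadIdx.x];
        __syncthreads();
        data[threadIdx.x] = k * smem[threadIdx.x];
    }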
-template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - virtual __device__ T &operator*() = 0; - virtual __device__ T &operator[](int i) = 0; +template struct SharedMemory +{ + // Ensure that we won't compile any un-specialized types + virtual __device__ T &operator*() = 0; + virtual __device__ T &operator[](int i) = 0; }; -#define BUILD_SHAREDMEMORY_TYPE(t, n) \ - template <> \ - struct SharedMemory { \ - __device__ t &operator*() { \ - extern __shared__ t n[]; \ - return *n; \ - } \ - __device__ t &operator[](int i) { \ - extern __shared__ t n[]; \ - return n[i]; \ - } \ - } +#define BUILD_SHAREDMEMORY_TYPE(t, n) \ + template <> struct SharedMemory \ + { \ + __device__ t &operator*() \ + { \ + extern __shared__ t n[]; \ + return *n; \ + } \ + __device__ t &operator[](int i) \ + { \ + extern __shared__ t n[]; \ + return n[i]; \ + } \ + } BUILD_SHAREDMEMORY_TYPE(int, s_int); BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint); diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/piestimator.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/piestimator.h index 6099c614..e683cb39 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/piestimator.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/piestimator.h @@ -28,18 +28,17 @@ #ifndef PIESTIMATOR_H #define PIESTIMATOR_H -template -class PiEstimator { - public: - PiEstimator(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize, unsigned int seed); - Real operator()(); +template class PiEstimator +{ +public: + PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed); + Real operator()(); - private: - unsigned int m_seed; - unsigned int m_numSims; - unsigned int m_device; - unsigned int m_threadBlockSize; +private: + unsigned int m_seed; + unsigned int m_numSims; + unsigned int m_device; + unsigned int m_threadBlockSize; }; #endif diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/test.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/test.h index f8309d95..4928b1d9 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/test.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/inc/test.h @@ -28,30 +28,31 @@ #ifndef TEST_H #define TEST_H -template -struct Test { - Test() : pass(false){}; +template struct Test +{ + Test() + : pass(false) {}; - int device; - unsigned int numSims; - unsigned int threadBlockSize; - unsigned int seed; + int device; + unsigned int numSims; + unsigned int threadBlockSize; + unsigned int seed; - bool pass; - double elapsedTime; + bool pass; + double elapsedTime; - bool operator()(); + bool operator()(); }; // Defaults are arbitrary to give sensible runtime -#define k_sims_min 100000 -#define k_sims_max 10000000 -#define k_sims_def 100000 -#define k_sims_qa 100000 +#define k_sims_min 100000 +#define k_sims_max 10000000 +#define k_sims_def 100000 +#define k_sims_qa 100000 #define k_bsize_min 32 #define k_bsize_def 128 -#define k_bsize_qa 128 -#define k_seed_def 1234 +#define k_bsize_qa 128 +#define k_seed_def 1234 // Target value #define PI 3.14159265359 diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/main.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/main.cpp index 3b579268..d22cda10 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/main.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/main.cpp @@ -40,7 +40,6 @@ 
#include #include - #include #include #include @@ -51,204 +50,208 @@ static const char *printfFile = "MonteCarloEstimatePiInlineP.txt"; // Forward declarations -void showHelp(const int argc, const char **argv); -template -void runTest(int argc, const char **argv); +void showHelp(const int argc, const char **argv); +template void runTest(int argc, const char **argv); -int main(int argc, char **argv) { - using std::invalid_argument; - using std::string; +int main(int argc, char **argv) +{ + using std::invalid_argument; + using std::string; - // Open the log file - printf("Monte Carlo Estimate Pi (with inline PRNG)\n"); - printf("==========================================\n\n"); + // Open the log file + printf("Monte Carlo Estimate Pi (with inline PRNG)\n"); + printf("==========================================\n\n"); - // If help flag is set, display help and exit immediately - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Displaying help on console\n"); - showHelp(argc, (const char **)argv); - exit(EXIT_SUCCESS); - } - - // Check the precision (checked against the device capability later) - try { - char *value; - - if (getCmdLineArgumentString(argc, (const char **)argv, "precision", - &value)) { - // Check requested precision is valid - string prec(value); - - if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { - runTest(argc, (const char **)argv); - } else if (prec.compare("double") == 0 || - prec.compare("\"double\"") == 0) { - runTest(argc, (const char **)argv); - } else { - printf( - "specified precision (%s) is invalid, must be \"single\" or " - "\"double\".\n", - value); - throw invalid_argument("precision"); - } - } else { - runTest(argc, (const char **)argv); + // If help flag is set, display help and exit immediately + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Displaying help on console\n"); + showHelp(argc, (const char **)argv); + exit(EXIT_SUCCESS); } - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } - // Finish - exit(EXIT_SUCCESS); + // Check the precision (checked against the device capability later) + try { + char *value; + + if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value)) { + // Check requested precision is valid + string prec(value); + + if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { + runTest(argc, (const char **)argv); + } + else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0) { + runTest(argc, (const char **)argv); + } + else { + printf("specified precision (%s) is invalid, must be \"single\" or " + "\"double\".\n", + value); + throw invalid_argument("precision"); + } + } + else { + runTest(argc, (const char **)argv); + } + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + + // Finish + exit(EXIT_SUCCESS); } -template -void runTest(int argc, const char **argv) { - using std::invalid_argument; - using std::runtime_error; +template void runTest(int argc, const char **argv) +{ + using std::invalid_argument; + using std::runtime_error; - try { - Test test; - int deviceCount = 0; - cudaError_t cudaResult = cudaSuccess; + try { + Test test; + int deviceCount = 0; + cudaError_t cudaResult = cudaSuccess; - // by default specify GPU Device == 0 - test.device = 0; + // by default specify GPU Device == 0 + test.device = 0; - // Get number of available devices - cudaResult = 
cudaGetDeviceCount(&deviceCount); - - if (cudaResult != cudaSuccess) { - printf("could not get device count (%s).\n", - cudaGetErrorString(cudaResult)); - throw runtime_error("cudaGetDeviceCount"); - } - - // (default parameters) - test.numSims = k_sims_qa; - test.threadBlockSize = k_bsize_qa; - test.seed = k_seed_def; - - { - char *value = 0; - - if (getCmdLineArgumentString(argc, argv, "device", &value)) { - test.device = (int)atoi(value); - - if (test.device >= deviceCount) { - printf( - "invalid target device specified on command line (device %d does " - "not exist).\n", - test.device); - throw invalid_argument("device"); - } - } else { - test.device = gpuGetMaxGflopsDeviceId(); - } - - if (getCmdLineArgumentString(argc, argv, "sims", &value)) { - test.numSims = (unsigned int)atoi(value); - printf("number of simulations = %d\n", test.numSims); - - if (test.numSims < k_sims_min || test.numSims > k_sims_max) { - printf( - "specified number of simulations (%d) is invalid, must be " - "between %d and %d.\n", - test.numSims, k_sims_min, k_sims_max); - throw invalid_argument("sims"); - } - } else { - test.numSims = k_sims_def; - } - - if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { - // Determine max threads per block - cudaDeviceProp deviceProperties; - cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + // Get number of available devices + cudaResult = cudaGetDeviceCount(&deviceCount); if (cudaResult != cudaSuccess) { - printf("unable to get device properties for device %d.\n", - test.device); - throw runtime_error("cudaGetDeviceProperties"); + printf("could not get device count (%s).\n", cudaGetErrorString(cudaResult)); + throw runtime_error("cudaGetDeviceCount"); } - // Check requested size is valid - test.threadBlockSize = (unsigned int)atoi(value); - printf("block size = %d\n", test.threadBlockSize); + // (default parameters) + test.numSims = k_sims_qa; + test.threadBlockSize = k_bsize_qa; + test.seed = k_seed_def; - if (test.threadBlockSize < k_bsize_min || - test.threadBlockSize > static_cast( - deviceProperties.maxThreadsPerBlock)) { - printf( - "specified block size (%d) is invalid, must be between %d and %d " - "for device %d.\n", - test.threadBlockSize, k_bsize_min, - deviceProperties.maxThreadsPerBlock, test.device); - throw invalid_argument("block-size"); + { + char *value = 0; + + if (getCmdLineArgumentString(argc, argv, "device", &value)) { + test.device = (int)atoi(value); + + if (test.device >= deviceCount) { + printf("invalid target device specified on command line (device %d does " + "not exist).\n", + test.device); + throw invalid_argument("device"); + } + } + else { + test.device = gpuGetMaxGflopsDeviceId(); + } + + if (getCmdLineArgumentString(argc, argv, "sims", &value)) { + test.numSims = (unsigned int)atoi(value); + printf("number of simulations = %d\n", test.numSims); + + if (test.numSims < k_sims_min || test.numSims > k_sims_max) { + printf("specified number of simulations (%d) is invalid, must be " + "between %d and %d.\n", + test.numSims, + k_sims_min, + k_sims_max); + throw invalid_argument("sims"); + } + } + else { + test.numSims = k_sims_def; + } + + if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { + // Determine max threads per block + cudaDeviceProp deviceProperties; + cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + + if (cudaResult != cudaSuccess) { + printf("unable to get device properties for device %d.\n", test.device); + throw runtime_error("cudaGetDeviceProperties"); + } + 
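Note: the block-size validation a few lines below rejects sizes where threadBlockSize & (threadBlockSize - 1) is nonzero. That expression clears the lowest set bit, so it is zero exactly when the value has a single set bit, i.e. is a power of two, which the reduction kernel requires. An illustrative helper, not part of this patch:

    // n & (n - 1) clears the lowest set bit of n, so the result is zero
    // exactly when n has one set bit, i.e. when n is a power of two.
    static inline bool isPowerOfTwo(unsigned int n)
    {
        return n != 0 && (n & (n - 1)) == 0;
    }
    // isPowerOfTwo(128) -> true; isPowerOfTwo(96) -> false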
+ // Check requested size is valid + test.threadBlockSize = (unsigned int)atoi(value); + printf("block size = %d\n", test.threadBlockSize); + + if (test.threadBlockSize < k_bsize_min + || test.threadBlockSize > static_cast(deviceProperties.maxThreadsPerBlock)) { + printf("specified block size (%d) is invalid, must be between %d and %d " + "for device %d.\n", + test.threadBlockSize, + k_bsize_min, + deviceProperties.maxThreadsPerBlock, + test.device); + throw invalid_argument("block-size"); + } + + if (test.threadBlockSize & test.threadBlockSize - 1) { + printf("specified block size (%d) is invalid, must be a power of two " + "(see reduction function).\n", + test.threadBlockSize); + throw invalid_argument("block-size"); + } + } + else { + test.threadBlockSize = k_bsize_def; + } + + if (getCmdLineArgumentString(argc, argv, "seed", &value)) { + // Check requested seed is valid + test.seed = (unsigned int)atoi(value); + printf("seed = %d\n", test.seed); + + if (test.seed == 0) { + printf("specified seed (%d) is invalid, must be non-zero.\n", test.seed); + throw invalid_argument("seed"); + } + } + else { + test.seed = k_seed_def; + } } - if (test.threadBlockSize & test.threadBlockSize - 1) { - printf( - "specified block size (%d) is invalid, must be a power of two " - "(see reduction function).\n", - test.threadBlockSize); - throw invalid_argument("block-size"); - } - } else { - test.threadBlockSize = k_bsize_def; - } + // Execute + test(); + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + catch (runtime_error &e) { + printf("runtime error (%s)\n", e.what()); + exit(EXIT_FAILURE); + } +} - if (getCmdLineArgumentString(argc, argv, "seed", &value)) { - // Check requested seed is valid - test.seed = (unsigned int)atoi(value); - printf("seed = %d\n", test.seed); +void showHelp(int argc, const char **argv) +{ + using std::cout; + using std::endl; + using std::left; + using std::setw; - if (test.seed == 0) { - printf("specified seed (%d) is invalid, must be non-zero.\n", - test.seed); - throw invalid_argument("seed"); - } - } else { - test.seed = k_seed_def; - } + if (argc > 0) { + cout << endl << argv[0] << endl; } - // Execute - test(); - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } catch (runtime_error &e) { - printf("runtime error (%s)\n", e.what()); - exit(EXIT_FAILURE); - } -} - -void showHelp(int argc, const char **argv) { - using std::cout; - using std::endl; - using std::left; - using std::setw; - - if (argc > 0) { - cout << endl << argv[0] << endl; - } - - cout << endl << "Syntax:" << endl; - cout << left; - cout << " " << setw(20) << "--device=" - << "Specify device to use for execution" << endl; - cout << " " << setw(20) << "--sims=" - << "Specify number of Monte Carlo simulations" << endl; - cout << " " << setw(20) << "--block-size=" - << "Specify number of threads per block" << endl; - cout << " " << setw(20) << "--seed=" - << "Specify the seed to use for the random number generator" << endl; - cout << " " << setw(20) << "--precision=
<precision>
" - << "Specify the precision (\"single\" or \"double\")" << endl; - cout << endl; - cout << " " << setw(20) << "--noprompt" - << "Skip prompt before exit" << endl; - cout << endl; + cout << endl << "Syntax:" << endl; + cout << left; + cout << " " << setw(20) << "--device=<device>" + << "Specify device to use for execution" << endl; + cout << " " << setw(20) << "--sims=<sims>" + << "Specify number of Monte Carlo simulations" << endl; + cout << " " << setw(20) << "--block-size=<blocksize>" + << "Specify number of threads per block" << endl; + cout << " " << setw(20) << "--seed=<seed>" + << "Specify the seed to use for the random number generator" << endl; + cout << " " << setw(20) << "--precision=<precision>
" + << "Specify the precision (\"single\" or \"double\")" << endl; + cout << endl; + cout << " " << setw(20) << "--noprompt" + << "Skip prompt before exit" << endl; + cout << endl; } diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/piestimator.cu b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/piestimator.cu index 93cadc48..ffdf1e65 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/piestimator.cu +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/piestimator.cu @@ -25,15 +25,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "../inc/piestimator.h" - -#include -#include +#include +#include #include #include +#include #include -#include -#include +#include + +#include "../inc/piestimator.h" namespace cg = cooperative_groups; #include @@ -42,240 +42,240 @@ using std::string; using std::vector; // RNG init kernel -__global__ void initRNG(curandState *const rngStates, const unsigned int seed) { - // Determine thread ID - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void initRNG(curandState *const rngStates, const unsigned int seed) +{ + // Determine thread ID + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - // Initialise the RNG - curand_init(seed, tid, 0, &rngStates[tid]); + // Initialise the RNG + curand_init(seed, tid, 0, &rngStates[tid]); } -__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) { - extern __shared__ unsigned int sdata[]; +__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) +{ + extern __shared__ unsigned int sdata[]; - // Perform first level of reduction: - // - Write to shared memory - unsigned int ltid = threadIdx.x; + // Perform first level of reduction: + // - Write to shared memory + unsigned int ltid = threadIdx.x; - sdata[ltid] = in; - cg::sync(cta); + sdata[ltid] = in; + cg::sync(cta); - // Do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (ltid < s) { - sdata[ltid] += sdata[ltid + s]; + // Do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (ltid < s) { + sdata[ltid] += sdata[ltid + s]; + } + + cg::sync(cta); } - cg::sync(cta); - } - - return sdata[0]; + return sdata[0]; } -__device__ inline void getPoint(float &x, float &y, curandState &state) { - x = curand_uniform(&state); - y = curand_uniform(&state); +__device__ inline void getPoint(float &x, float &y, curandState &state) +{ + x = curand_uniform(&state); + y = curand_uniform(&state); } -__device__ inline void getPoint(double &x, double &y, curandState &state) { - x = curand_uniform_double(&state); - y = curand_uniform_double(&state); +__device__ inline void getPoint(double &x, double &y, curandState &state) +{ + x = curand_uniform_double(&state); + y = curand_uniform_double(&state); } // Estimator kernel template -__global__ void computeValue(unsigned int *const results, - curandState *const rngStates, - const unsigned int numSims) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Determine thread ID - unsigned int bid = blockIdx.x; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; +__global__ void computeValue(unsigned int *const results, curandState *const rngStates, const unsigned int numSims) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Determine thread ID + unsigned int bid = blockIdx.x; + unsigned 
int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - // Initialise the RNG - curandState localState = rngStates[tid]; + // Initialise the RNG + curandState localState = rngStates[tid]; - // Count the number of points which lie inside the unit quarter-circle - unsigned int pointsInside = 0; + // Count the number of points which lie inside the unit quarter-circle + unsigned int pointsInside = 0; - for (unsigned int i = tid; i < numSims; i += step) { - Real x; - Real y; - getPoint(x, y, localState); - Real l2norm2 = x * x + y * y; + for (unsigned int i = tid; i < numSims; i += step) { + Real x; + Real y; + getPoint(x, y, localState); + Real l2norm2 = x * x + y * y; - if (l2norm2 < static_cast(1)) { - pointsInside++; + if (l2norm2 < static_cast(1)) { + pointsInside++; + } } - } - // Reduce within the block - pointsInside = reduce_sum(pointsInside, cta); + // Reduce within the block + pointsInside = reduce_sum(pointsInside, cta); - // Store the result - if (threadIdx.x == 0) { - results[bid] = pointsInside; - } + // Store the result + if (threadIdx.x == 0) { + results[bid] = pointsInside; + } } template -PiEstimator::PiEstimator(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize, unsigned int seed) - : m_numSims(numSims), - m_device(device), - m_threadBlockSize(threadBlockSize), - m_seed(seed) {} +PiEstimator::PiEstimator(unsigned int numSims, + unsigned int device, + unsigned int threadBlockSize, + unsigned int seed) + : m_numSims(numSims) + , m_device(device) + , m_threadBlockSize(threadBlockSize) + , m_seed(seed) +{ +} -template -Real PiEstimator::operator()() { - cudaError_t cudaResult = cudaSuccess; - struct cudaDeviceProp deviceProperties; - struct cudaFuncAttributes funcAttributes; +template Real PiEstimator::operator()() +{ + cudaError_t cudaResult = cudaSuccess; + struct cudaDeviceProp deviceProperties; + struct cudaFuncAttributes funcAttributes; - // Get device properties - cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); + // Get device properties + cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); - if (cudaResult != cudaSuccess) { - string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Check precision is valid - if (typeid(Real) == typeid(double) && - (deviceProperties.major < 1 || - (deviceProperties.major == 1 && deviceProperties.minor < 3))) { - throw std::runtime_error("Device does not have double precision support"); - } + // Check precision is valid + if (typeid(Real) == typeid(double) + && (deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3))) { + throw std::runtime_error("Device does not have double precision support"); + } - // Attach to GPU - cudaResult = cudaSetDevice(m_device); + // Attach to GPU + cudaResult = cudaSetDevice(m_device); - if (cudaResult != cudaSuccess) { - string msg("Could not set CUDA device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not set CUDA device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Determine how to divide the work between cores - dim3 block; - dim3 grid; - block.x = m_threadBlockSize; - grid.x = (m_numSims + m_threadBlockSize 
- 1) / m_threadBlockSize; + // Determine how to divide the work between cores + dim3 block; + dim3 grid; + block.x = m_threadBlockSize; + grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; - // Aim to launch around ten or more times as many blocks as there - // are multiprocessors on the target device. - unsigned int blocksPerSM = 10; - unsigned int numSMs = deviceProperties.multiProcessorCount; + // Aim to launch around ten or more times as many blocks as there + // are multiprocessors on the target device. + unsigned int blocksPerSM = 10; + unsigned int numSMs = deviceProperties.multiProcessorCount; - while (grid.x > 2 * blocksPerSM * numSMs) { - grid.x >>= 1; - } + while (grid.x > 2 * blocksPerSM * numSMs) { + grid.x >>= 1; + } - // Get initRNG function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG); + // Get initRNG function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for initRNG kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for initRNG kernel"); + } - // Get computeValue function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); + // Get computeValue function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for computeValue kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for computeValue kernel"); + } - // Check the dimensions are valid - if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { - throw std::runtime_error("Block X dimension is too large for device"); - } + // Check the dimensions are valid + if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { + throw std::runtime_error("Block X dimension is too large for device"); + } - if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { - throw std::runtime_error("Grid X dimension is too large for device"); - } + if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { + throw std::runtime_error("Grid X dimension is too large for device"); + } - // Allocate memory for RNG states - curandState *d_rngStates = 0; - cudaResult = - cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState)); + // Allocate memory for RNG states + curandState *d_rngStates = 0; + cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * 
sizeof(curandState)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for RNG states: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for RNG states: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Allocate memory for result - // Each thread block will produce one result - unsigned int *d_results = 0; - cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); + // Allocate memory for result + // Each thread block will produce one result + unsigned int *d_results = 0; + cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for partial results: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for partial results: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Initialise RNG - initRNG<<>>(d_rngStates, m_seed); + // Initialise RNG + initRNG<<>>(d_rngStates, m_seed); - // Count the points inside unit quarter-circle - computeValue<<>>( - d_results, d_rngStates, m_numSims); + // Count the points inside unit quarter-circle + computeValue<<>>(d_results, d_rngStates, m_numSims); - // Copy partial results back - vector results(grid.x); - cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), - cudaMemcpyDeviceToHost); + // Copy partial results back + vector results(grid.x); + cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost); - if (cudaResult != cudaSuccess) { - string msg("Could not copy partial results to host: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not copy partial results to host: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Complete sum-reduction on host - Real value = - static_cast(std::accumulate(results.begin(), results.end(), 0)); + // Complete sum-reduction on host + Real value = static_cast(std::accumulate(results.begin(), results.end(), 0)); - // Determine the proportion of points inside the quarter-circle, - // i.e. the area of the unit quarter-circle - value /= m_numSims; + // Determine the proportion of points inside the quarter-circle, + // i.e. the area of the unit quarter-circle + value /= m_numSims; - // Value is currently an estimate of the area of a unit quarter-circle, so we - // can scale to a full circle by multiplying by four. Now since the area of a - // circle is pi * r^2, and r is one, the value will be an estimate for the - // value of pi. - value *= 4; + // Value is currently an estimate of the area of a unit quarter-circle, so we + // can scale to a full circle by multiplying by four. Now since the area of a + // circle is pi * r^2, and r is one, the value will be an estimate for the + // value of pi. 
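// In symbols: with N = numSims points drawn uniformly on the unit square,
// the fraction landing inside the quarter circle estimates
// area(quarter circle) / area(square) = (pi/4) / 1, so the scaling below
// recovers pi as 4 * pointsInside / N. The division by m_numSims above has
// already produced that fraction.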
+ value *= 4; - // Cleanup - if (d_rngStates) { - cudaFree(d_rngStates); - d_rngStates = 0; - } + // Cleanup + if (d_rngStates) { + cudaFree(d_rngStates); + d_rngStates = 0; + } - if (d_results) { - cudaFree(d_results); - d_results = 0; - } + if (d_results) { + cudaFree(d_results); + d_results = 0; + } - return value; + return value; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/test.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/test.cpp index 5acc4357..5089a932 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/test.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/src/test.cpp @@ -28,83 +28,84 @@ #include "../inc/test.h" -#include -#include -#include -#include -#include #include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "../inc/piestimator.h" -template -bool Test::operator()() { - using std::endl; - using std::setw; - using std::stringstream; +template bool Test::operator()() +{ + using std::endl; + using std::setw; + using std::stringstream; - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // Get device properties - struct cudaDeviceProp deviceProperties; - cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); + // Get device properties + struct cudaDeviceProp deviceProperties; + cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); - if (cudaResult != cudaSuccess) { - std::string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + std::string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Evaluate on GPU - printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); - PiEstimator estimator(numSims, device, threadBlockSize, seed); - sdkStartTimer(&timer); - Real result = estimator(); - sdkStopTimer(&timer); - elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; + // Evaluate on GPU + printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); + PiEstimator estimator(numSims, device, threadBlockSize, seed); + sdkStartTimer(&timer); + Real result = estimator(); + sdkStopTimer(&timer); + elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; - // Tolerance to compare result with expected - // This is just to check that nothing has gone very wrong with the - // test, the actual accuracy of the result depends on the number of - // Monte Carlo trials - const Real tolerance = static_cast(0.01); + // Tolerance to compare result with expected + // This is just to check that nothing has gone very wrong with the + // test, the actual accuracy of the result depends on the number of + // Monte Carlo trials + const Real tolerance = static_cast(0.01); - // Display results - Real abserror = fabs(result - static_cast(PI)); - Real relerror = abserror / static_cast(PI); - printf("Precision: %s\n", - (typeid(Real) == typeid(double)) ? 
"double" : "single"); - printf("Number of sims: %d\n", numSims); - printf("Tolerance: %e\n", tolerance); - printf("GPU result: %e\n", result); - printf("Expected: %e\n", PI); - printf("Absolute error: %e\n", abserror); - printf("Relative error: %e\n\n", relerror); + // Display results + Real abserror = fabs(result - static_cast(PI)); + Real relerror = abserror / static_cast(PI); + printf("Precision: %s\n", (typeid(Real) == typeid(double)) ? "double" : "single"); + printf("Number of sims: %d\n", numSims); + printf("Tolerance: %e\n", tolerance); + printf("GPU result: %e\n", result); + printf("Expected: %e\n", PI); + printf("Absolute error: %e\n", abserror); + printf("Relative error: %e\n\n", relerror); - // Check result - if (relerror > tolerance) { - printf("computed result (%e) does not match expected result (%e).\n", - result, PI); - pass = false; - } else { - pass = true; - } + // Check result + if (relerror > tolerance) { + printf("computed result (%e) does not match expected result (%e).\n", result, PI); + pass = false; + } + else { + pass = true; + } - // Print results - printf( - "MonteCarloEstimatePiInlineP, Performance = %.2f sims/s, Time = " - "%.2f(ms), NumDevsUsed = %u, Blocksize = %u\n", - numSims / elapsedTime, elapsedTime * 1000.0f, 1, threadBlockSize); + // Print results + printf("MonteCarloEstimatePiInlineP, Performance = %.2f sims/s, Time = " + "%.2f(ms), NumDevsUsed = %u, Blocksize = %u\n", + numSims / elapsedTime, + elapsedTime * 1000.0f, + 1, + threadBlockSize); - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); - return pass; + return pass; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/cudasharedmem.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/cudasharedmem.h index 5c9bab98..4627d72c 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/cudasharedmem.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/cudasharedmem.h @@ -65,25 +65,27 @@ // This is the un-specialized struct. Note that we prevent instantiation of // this struct by making it abstract (i.e. with pure virtual methods). 
-template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - virtual __device__ T &operator*() = 0; - virtual __device__ T &operator[](int i) = 0; +template struct SharedMemory +{ + // Ensure that we won't compile any un-specialized types + virtual __device__ T &operator*() = 0; + virtual __device__ T &operator[](int i) = 0; }; -#define BUILD_SHAREDMEMORY_TYPE(t, n) \ - template <> \ - struct SharedMemory { \ - __device__ t &operator*() { \ - extern __shared__ t n[]; \ - return *n; \ - } \ - __device__ t &operator[](int i) { \ - extern __shared__ t n[]; \ - return n[i]; \ - } \ - } +#define BUILD_SHAREDMEMORY_TYPE(t, n) \ + template <> struct SharedMemory \ + { \ + __device__ t &operator*() \ + { \ + extern __shared__ t n[]; \ + return *n; \ + } \ + __device__ t &operator[](int i) \ + { \ + extern __shared__ t n[]; \ + return n[i]; \ + } \ + } BUILD_SHAREDMEMORY_TYPE(int, s_int); BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint); diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/piestimator.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/piestimator.h index b683d421..66515ab6 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/piestimator.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/piestimator.h @@ -28,17 +28,16 @@ #ifndef PIESTIMATOR_H #define PIESTIMATOR_H -template -class PiEstimator { - public: - PiEstimator(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize); - Real operator()(); +template class PiEstimator +{ +public: + PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize); + Real operator()(); - private: - unsigned int m_numSims; - unsigned int m_device; - unsigned int m_threadBlockSize; +private: + unsigned int m_numSims; + unsigned int m_device; + unsigned int m_threadBlockSize; }; #endif diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/test.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/test.h index 457f93bd..4c920fe5 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/test.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/inc/test.h @@ -28,28 +28,29 @@ #ifndef TEST_H #define TEST_H -template -struct Test { - Test() : pass(false){}; +template struct Test +{ + Test() + : pass(false) {}; - int device; - unsigned int numSims; - unsigned int threadBlockSize; + int device; + unsigned int numSims; + unsigned int threadBlockSize; - bool pass; - double elapsedTime; + bool pass; + double elapsedTime; - bool operator()(); + bool operator()(); }; // Defaults are arbitrary to give sensible runtime -#define k_sims_min 100000 -#define k_sims_max 10000000 -#define k_sims_def 100000 -#define k_sims_qa 100000 +#define k_sims_min 100000 +#define k_sims_max 10000000 +#define k_sims_def 100000 +#define k_sims_qa 100000 #define k_bsize_min 32 #define k_bsize_def 128 -#define k_bsize_qa 128 +#define k_bsize_qa 128 // Target value #define PI 3.14159265359 diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/main.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/main.cpp index 266a86c6..5b58a86d 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/main.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/main.cpp @@ -39,196 +39,201 @@ /////////////////////////////////////////////////////////////////////////////// -#include -#include -#include #include -#include #include - +#include +#include +#include 
#include +#include + #include "../inc/test.h" // Forward declarations -void showHelp(const int argc, const char **argv); -template -void runTest(int argc, const char **argv); +void showHelp(const int argc, const char **argv); +template void runTest(int argc, const char **argv); -int main(int argc, char **argv) { - using std::invalid_argument; - using std::string; +int main(int argc, char **argv) +{ + using std::invalid_argument; + using std::string; - // Open the log file - printf("Monte Carlo Estimate Pi (with inline QRNG)\n"); - printf("==========================================\n\n"); + // Open the log file + printf("Monte Carlo Estimate Pi (with inline QRNG)\n"); + printf("==========================================\n\n"); - // If help flag is set, display help and exit immediately - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Displaying help on console\n"); - showHelp(argc, (const char **)argv); - exit(EXIT_SUCCESS); - } - - // Check the precision (checked against the device capability later) - try { - char *value; - - if (getCmdLineArgumentString(argc, (const char **)argv, "precision", - &value)) { - // Check requested precision is valid - string prec(value); - - if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { - runTest(argc, (const char **)argv); - } else if (prec.compare("double") == 0 || - prec.compare("\"double\"") == 0) { - runTest(argc, (const char **)argv); - } else { - printf( - "specified precision (%s) is invalid, must be \"single\" or " - "\"double\".\n", - value); - throw invalid_argument("precision"); - } - } else { - runTest(argc, (const char **)argv); + // If help flag is set, display help and exit immediately + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Displaying help on console\n"); + showHelp(argc, (const char **)argv); + exit(EXIT_SUCCESS); } - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } - // Finish - exit(EXIT_SUCCESS); + // Check the precision (checked against the device capability later) + try { + char *value; + + if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value)) { + // Check requested precision is valid + string prec(value); + + if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { + runTest(argc, (const char **)argv); + } + else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0) { + runTest(argc, (const char **)argv); + } + else { + printf("specified precision (%s) is invalid, must be \"single\" or " + "\"double\".\n", + value); + throw invalid_argument("precision"); + } + } + else { + runTest(argc, (const char **)argv); + } + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + + // Finish + exit(EXIT_SUCCESS); } -template -void runTest(int argc, const char **argv) { - using std::invalid_argument; - using std::runtime_error; +template void runTest(int argc, const char **argv) +{ + using std::invalid_argument; + using std::runtime_error; - StopWatchInterface *timer = NULL; + StopWatchInterface *timer = NULL; - try { - Test test; - int deviceCount = 0; - cudaError_t cudaResult = cudaSuccess; + try { + Test test; + int deviceCount = 0; + cudaError_t cudaResult = cudaSuccess; - // by default specify GPU Device == 0 - test.device = 0; + // by default specify GPU Device == 0 + test.device = 0; - // Get number of available devices - cudaResult = cudaGetDeviceCount(&deviceCount); - - if 
(cudaResult != cudaSuccess) { - printf("could not get device count.\n"); - throw runtime_error("cudaGetDeviceCount"); - } - - // (default parameters) - test.numSims = k_sims_qa; - test.threadBlockSize = k_bsize_qa; - - { - char *value = 0; - - if (getCmdLineArgumentString(argc, argv, "device", &value)) { - test.device = (int)atoi(value); - - if (test.device >= deviceCount) { - printf( - "invalid target device specified on command line (device %d does " - "not exist).\n", - test.device); - throw invalid_argument("device"); - } - } else { - test.device = gpuGetMaxGflopsDeviceId(); - } - - if (getCmdLineArgumentString(argc, argv, "sims", &value)) { - test.numSims = (unsigned int)atoi(value); - - if (test.numSims < k_sims_min || test.numSims > k_sims_max) { - printf( - "specified number of simulations (%d) is invalid, must be " - "between %d and %d.\n", - test.numSims, k_sims_min, k_sims_max); - throw invalid_argument("sims"); - } - } else { - test.numSims = k_sims_def; - } - - if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { - // Determine max threads per block - cudaDeviceProp deviceProperties; - cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + // Get number of available devices + cudaResult = cudaGetDeviceCount(&deviceCount); if (cudaResult != cudaSuccess) { - printf("cound not get device properties for device %d.\n", - test.device); - throw runtime_error("cudaGetDeviceProperties"); + printf("could not get device count.\n"); + throw runtime_error("cudaGetDeviceCount"); } - // Check requested size is valid - test.threadBlockSize = (unsigned int)atoi(value); + // (default parameters) + test.numSims = k_sims_qa; + test.threadBlockSize = k_bsize_qa; - if (test.threadBlockSize < k_bsize_min || - test.threadBlockSize > static_cast( - deviceProperties.maxThreadsPerBlock)) { - printf( - "specified block size (%d) is invalid, must be between %d and %d " - "for device %d.\n", - test.threadBlockSize, k_bsize_min, - deviceProperties.maxThreadsPerBlock, test.device); - throw invalid_argument("block-size"); - } + { + char *value = 0; - if (test.threadBlockSize & test.threadBlockSize - 1) { - printf( - "specified block size (%d) is invalid, must be a power of two " - "(see reduction function).\n", - test.threadBlockSize); - throw invalid_argument("block-size"); + if (getCmdLineArgumentString(argc, argv, "device", &value)) { + test.device = (int)atoi(value); + + if (test.device >= deviceCount) { + printf("invalid target device specified on command line (device %d does " + "not exist).\n", + test.device); + throw invalid_argument("device"); + } + } + else { + test.device = gpuGetMaxGflopsDeviceId(); + } + + if (getCmdLineArgumentString(argc, argv, "sims", &value)) { + test.numSims = (unsigned int)atoi(value); + + if (test.numSims < k_sims_min || test.numSims > k_sims_max) { + printf("specified number of simulations (%d) is invalid, must be " + "between %d and %d.\n", + test.numSims, + k_sims_min, + k_sims_max); + throw invalid_argument("sims"); + } + } + else { + test.numSims = k_sims_def; + } + + if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { + // Determine max threads per block + cudaDeviceProp deviceProperties; + cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + + if (cudaResult != cudaSuccess) { + printf("cound not get device properties for device %d.\n", test.device); + throw runtime_error("cudaGetDeviceProperties"); + } + + // Check requested size is valid + test.threadBlockSize = (unsigned int)atoi(value); + + if 
(test.threadBlockSize < k_bsize_min + || test.threadBlockSize > static_cast(deviceProperties.maxThreadsPerBlock)) { + printf("specified block size (%d) is invalid, must be between %d and %d " + "for device %d.\n", + test.threadBlockSize, + k_bsize_min, + deviceProperties.maxThreadsPerBlock, + test.device); + throw invalid_argument("block-size"); + } + + if (test.threadBlockSize & test.threadBlockSize - 1) { + printf("specified block size (%d) is invalid, must be a power of two " + "(see reduction function).\n", + test.threadBlockSize); + throw invalid_argument("block-size"); + } + } + else { + test.threadBlockSize = k_bsize_def; + } } - } else { - test.threadBlockSize = k_bsize_def; - } + // Execute + test(); + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + catch (runtime_error &e) { + printf("runtime error (%s)\n", e.what()); + exit(EXIT_FAILURE); } - // Execute - test(); - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } catch (runtime_error &e) { - printf("runtime error (%s)\n", e.what()); - exit(EXIT_FAILURE); - } } -void showHelp(int argc, const char **argv) { - using std::cout; - using std::endl; - using std::left; - using std::setw; +void showHelp(int argc, const char **argv) +{ + using std::cout; + using std::endl; + using std::left; + using std::setw; - if (argc > 0) { - cout << endl << argv[0] << endl; - } + if (argc > 0) { + cout << endl << argv[0] << endl; + } - cout << endl << "Syntax:" << endl; - cout << left; - cout << " " << setw(20) << "--device=" - << "Specify device to use for execution" << endl; - cout << " " << setw(20) << "--sims=" - << "Specify number of Monte Carlo simulations" << endl; - cout << " " << setw(20) << "--block-size=" - << "Specify number of threads per block" << endl; - cout << " " << setw(20) << "--precision=
<precision>
" - << "Specify the precision (\"single\" or \"double\")" << endl; - cout << endl; - cout << " " << setw(20) << "--noprompt" - << "Skip prompt before exit" << endl; - cout << endl; + cout << endl << "Syntax:" << endl; + cout << left; + cout << " " << setw(20) << "--device=<device>" + << "Specify device to use for execution" << endl; + cout << " " << setw(20) << "--sims=<sims>" + << "Specify number of Monte Carlo simulations" << endl; + cout << " " << setw(20) << "--block-size=<blocksize>" + << "Specify number of threads per block" << endl; + cout << " " << setw(20) << "--precision=<precision>
" + << "Specify the precision (\"single\" or \"double\")" << endl; + cout << endl; + cout << " " << setw(20) << "--noprompt" + << "Skip prompt before exit" << endl; + cout << endl; } diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/piestimator.cu b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/piestimator.cu index ee51e39a..4d055602 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/piestimator.cu +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/piestimator.cu @@ -26,15 +26,15 @@ */ -#include "../inc/piestimator.h" - -#include -#include +#include +#include #include #include +#include #include -#include - #include +#include + +#include "../inc/piestimator.h" namespace cg = cooperative_groups; #include @@ -46,346 +46,329 @@ using std::string; using std::vector; // Helper templates to support float and double in same code -template -struct TYPE_IS { - static const bool test = false; +template struct TYPE_IS +{ + static const bool test = false; }; -template -struct TYPE_IS { - static const bool test = true; +template struct TYPE_IS +{ + static const bool test = true; }; -template -struct IF { - typedef R type; +template struct IF +{ + typedef R type; }; -template -struct IF { - typedef L type; +template struct IF +{ + typedef L type; }; // RNG init kernel template -__global__ void initRNG(rngState_t *const rngStates, - rngDirectionVectors_t *const rngDirections, - unsigned int numDrawsPerDirection) { - // Determine thread ID - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; +__global__ void +initRNG(rngState_t *const rngStates, rngDirectionVectors_t *const rngDirections, unsigned int numDrawsPerDirection) +{ + // Determine thread ID + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - // Determine offset to avoid overlapping sub-sequences - unsigned int offset = tid * ((numDrawsPerDirection + step - 1) / step); + // Determine offset to avoid overlapping sub-sequences + unsigned int offset = tid * ((numDrawsPerDirection + step - 1) / step); - // Initialise the RNG - curand_init(rngDirections[0], offset, &rngStates[tid]); - curand_init(rngDirections[1], offset, &rngStates[tid + step]); + // Initialise the RNG + curand_init(rngDirections[0], offset, &rngStates[tid]); + curand_init(rngDirections[1], offset, &rngStates[tid + step]); } -__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) { - extern __shared__ unsigned int sdata[]; +__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) +{ + extern __shared__ unsigned int sdata[]; - // Perform first level of reduction: - // - Write to shared memory - unsigned int ltid = threadIdx.x; + // Perform first level of reduction: + // - Write to shared memory + unsigned int ltid = threadIdx.x; - sdata[ltid] = in; - cg::sync(cta); + sdata[ltid] = in; + cg::sync(cta); - // Do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (ltid < s) { - sdata[ltid] += sdata[ltid + s]; + // Do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (ltid < s) { + sdata[ltid] += sdata[ltid + s]; + } + + cg::sync(cta); } - cg::sync(cta); - } - - return sdata[0]; + return sdata[0]; } -__device__ inline void getPoint(float &x, float &y, curandStateSobol32 &state1, - curandStateSobol32 &state2) { - x = curand_uniform(&state1); - y = curand_uniform(&state2); +__device__ inline void 
getPoint(float &x, float &y, curandStateSobol32 &state1, curandStateSobol32 &state2) +{ + x = curand_uniform(&state1); + y = curand_uniform(&state2); } -__device__ inline void getPoint(double &x, double &y, - curandStateSobol64 &state1, - curandStateSobol64 &state2) { - x = curand_uniform_double(&state1); - y = curand_uniform_double(&state2); +__device__ inline void getPoint(double &x, double &y, curandStateSobol64 &state1, curandStateSobol64 &state2) +{ + x = curand_uniform_double(&state1); + y = curand_uniform_double(&state2); } // Estimator kernel template -__global__ void computeValue(unsigned int *const results, - rngState_t *const rngStates, - const unsigned int numSims) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Determine thread ID - unsigned int bid = blockIdx.x; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; +__global__ void computeValue(unsigned int *const results, rngState_t *const rngStates, const unsigned int numSims) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Determine thread ID + unsigned int bid = blockIdx.x; + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - // Initialise the RNG - rngState_t localState1 = rngStates[tid]; - rngState_t localState2 = rngStates[tid + step]; + // Initialise the RNG + rngState_t localState1 = rngStates[tid]; + rngState_t localState2 = rngStates[tid + step]; - // Count the number of points which lie inside the unit quarter-circle - unsigned int pointsInside = 0; + // Count the number of points which lie inside the unit quarter-circle + unsigned int pointsInside = 0; - for (unsigned int i = tid; i < numSims; i += step) { - Real x; - Real y; - getPoint(x, y, localState1, localState2); - Real l2norm2 = x * x + y * y; + for (unsigned int i = tid; i < numSims; i += step) { + Real x; + Real y; + getPoint(x, y, localState1, localState2); + Real l2norm2 = x * x + y * y; - if (l2norm2 < static_cast(1)) { - pointsInside++; + if (l2norm2 < static_cast(1)) { + pointsInside++; + } } - } - // Reduce within the block - pointsInside = reduce_sum(pointsInside, cta); + // Reduce within the block + pointsInside = reduce_sum(pointsInside, cta); - // Store the result - if (threadIdx.x == 0) { - results[bid] = pointsInside; - } + // Store the result + if (threadIdx.x == 0) { + results[bid] = pointsInside; + } } template -PiEstimator::PiEstimator(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize) - : m_numSims(numSims), - m_device(device), - m_threadBlockSize(threadBlockSize) {} +PiEstimator::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize) + : m_numSims(numSims) + , m_device(device) + , m_threadBlockSize(threadBlockSize) +{ +} -template -Real PiEstimator::operator()() { - cudaError_t cudaResult = cudaSuccess; - struct cudaDeviceProp deviceProperties; - struct cudaFuncAttributes funcAttributes; +template Real PiEstimator::operator()() +{ + cudaError_t cudaResult = cudaSuccess; + struct cudaDeviceProp deviceProperties; + struct cudaFuncAttributes funcAttributes; - // Determine type of generator to use (32- or 64-bit) - typedef typename IF::test, curandStateSobol64_t, - curandStateSobol32_t>::type curandStateSobol_sz; - typedef - typename IF::test, curandDirectionVectors64_t, - curandDirectionVectors32_t>::type curandDirectionVectors_sz; + // Determine type of generator to use (32- or 64-bit) + typedef 
+ typename IF::test, curandStateSobol64_t, curandStateSobol32_t>::type curandStateSobol_sz; + typedef typename IF::test, curandDirectionVectors64_t, curandDirectionVectors32_t>::type + curandDirectionVectors_sz; - // Get device properties - cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); - - if (cudaResult != cudaSuccess) { - string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - // Check precision is valid - if (typeid(Real) == typeid(double) && - (deviceProperties.major < 1 || - (deviceProperties.major == 1 && deviceProperties.minor < 3))) { - throw std::runtime_error("Device does not have double precision support"); - } - - // Attach to GPU - cudaResult = cudaSetDevice(m_device); - - if (cudaResult != cudaSuccess) { - string msg("Could not set CUDA device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - // Determine how to divide the work between cores - dim3 block; - dim3 grid; - block.x = m_threadBlockSize; - grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; - - // Aim to launch around ten or more times as many blocks as there - // are multiprocessors on the target device. - unsigned int blocksPerSM = 10; - unsigned int numSMs = deviceProperties.multiProcessorCount; - - while (grid.x > 2 * blocksPerSM * numSMs) { - grid.x >>= 1; - } - - // Get initRNG function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes( - &funcAttributes, initRNG); - - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for initRNG kernel"); - } - - // Get computeValue function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, - computeValue); - - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for computeValue kernel"); - } - - // Check the dimensions are valid - if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { - throw std::runtime_error("Block X dimension is too large for device"); - } - - if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { - throw std::runtime_error("Grid X dimension is too large for device"); - } - - // Allocate memory for RNG states and direction vectors - curandStateSobol_sz *d_rngStates = 0; - curandDirectionVectors_sz *d_rngDirections = 0; - cudaResult = cudaMalloc((void **)&d_rngStates, - 2 * grid.x * block.x * sizeof(curandStateSobol_sz)); - - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for RNG states: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - cudaResult = cudaMalloc((void **)&d_rngDirections, - 2 * sizeof(curandDirectionVectors_sz)); - - if (cudaResult != cudaSuccess) { - string msg( - "Could not allocate memory on device for RNG direction vectors: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - // Allocate memory for result - // Each thread block will produce one result - unsigned int *d_results = 0; - cudaResult = cudaMalloc((void 
**)&d_results, grid.x * sizeof(unsigned int)); - - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for partial results: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - - // Generate direction vectors on the host and copy to the device - if (typeid(Real) == typeid(float)) { - curandDirectionVectors32_t *rngDirections; - curandStatus_t curandResult = curandGetDirectionVectors32( - &rngDirections, CURAND_DIRECTION_VECTORS_32_JOEKUO6); - - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg( - "Could not get direction vectors for quasi-random number " - "generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } - - cudaResult = cudaMemcpy(d_rngDirections, rngDirections, - 2 * sizeof(curandDirectionVectors32_t), - cudaMemcpyHostToDevice); + // Get device properties + cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); if (cudaResult != cudaSuccess) { - string msg("Could not copy direction vectors to device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } - } else if (typeid(Real) == typeid(double)) { - curandDirectionVectors64_t *rngDirections; - curandStatus_t curandResult = curandGetDirectionVectors64( - &rngDirections, CURAND_DIRECTION_VECTORS_64_JOEKUO6); - - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg( - "Could not get direction vectors for quasi-random number " - "generator: "); - msg += curandResult; - throw std::runtime_error(msg); + string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); } - cudaResult = cudaMemcpy(d_rngDirections, rngDirections, - 2 * sizeof(curandDirectionVectors64_t), - cudaMemcpyHostToDevice); + // Check precision is valid + if (typeid(Real) == typeid(double) + && (deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3))) { + throw std::runtime_error("Device does not have double precision support"); + } + + // Attach to GPU + cudaResult = cudaSetDevice(m_device); if (cudaResult != cudaSuccess) { - string msg("Could not copy direction vectors to device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); + string msg("Could not set CUDA device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); } - } else { - string msg( - "Could not get direction vectors for random number generator of " - "specified type"); - throw std::runtime_error(msg); - } - // Initialise RNG - initRNG<<>>(d_rngStates, d_rngDirections, m_numSims); + // Determine how to divide the work between cores + dim3 block; + dim3 grid; + block.x = m_threadBlockSize; + grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; - // Count the points inside unit quarter-circle - computeValue<<>>( - d_results, d_rngStates, m_numSims); + // Aim to launch around ten or more times as many blocks as there + // are multiprocessors on the target device. 
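// Concretely (illustrative numbers, not taken from this patch): numSims =
// 10,000,000 with block.x = 128 gives grid.x = 78,125; on a 20-SM device
// the halving loop below stops once grid.x <= 2 * 10 * 20 = 400, leaving
// grid.x = 305. Each thread then accumulates several points serially via
// the grid-stride loop in computeValue().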
+ unsigned int blocksPerSM = 10; + unsigned int numSMs = deviceProperties.multiProcessorCount; - // Copy partial results back - vector results(grid.x); - cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), - cudaMemcpyDeviceToHost); + while (grid.x > 2 * blocksPerSM * numSMs) { + grid.x >>= 1; + } - if (cudaResult != cudaSuccess) { - string msg("Could not copy partial results to host: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + // Get initRNG function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG); - // Complete sum-reduction on host - Real value = - static_cast(std::accumulate(results.begin(), results.end(), 0)); + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Determine the proportion of points inside the quarter-circle, - // i.e. the area of the unit quarter-circle - value /= m_numSims; + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for initRNG kernel"); + } - // Value is currently an estimate of the area of a unit quarter-circle, so we - // can scale to a full circle by multiplying by four. Now since the area of a - // circle is pi * r^2, and r is one, the value will be an estimate for the - // value of pi. - value *= 4; + // Get computeValue function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); - // Cleanup - if (d_rngStates) { - cudaFree(d_rngStates); - d_rngStates = 0; - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (d_rngDirections) { - cudaFree(d_rngDirections); - d_rngDirections = 0; - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for computeValue kernel"); + } - if (d_results) { - cudaFree(d_results); - d_results = 0; - } + // Check the dimensions are valid + if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { + throw std::runtime_error("Block X dimension is too large for device"); + } - return value; + if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { + throw std::runtime_error("Grid X dimension is too large for device"); + } + + // Allocate memory for RNG states and direction vectors + curandStateSobol_sz *d_rngStates = 0; + curandDirectionVectors_sz *d_rngDirections = 0; + cudaResult = cudaMalloc((void **)&d_rngStates, 2 * grid.x * block.x * sizeof(curandStateSobol_sz)); + + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for RNG states: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } + + cudaResult = cudaMalloc((void **)&d_rngDirections, 2 * sizeof(curandDirectionVectors_sz)); + + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for RNG direction vectors: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } + + // Allocate memory for result + // Each thread block will produce one result + unsigned int *d_results = 0; + cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); + + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for partial results: "); + msg += 
cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } + + // Generate direction vectors on the host and copy to the device + if (typeid(Real) == typeid(float)) { + curandDirectionVectors32_t *rngDirections; + curandStatus_t curandResult = curandGetDirectionVectors32(&rngDirections, CURAND_DIRECTION_VECTORS_32_JOEKUO6); + + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not get direction vectors for quasi-random number " + "generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } + + cudaResult = + cudaMemcpy(d_rngDirections, rngDirections, 2 * sizeof(curandDirectionVectors32_t), cudaMemcpyHostToDevice); + + if (cudaResult != cudaSuccess) { + string msg("Could not copy direction vectors to device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } + } + else if (typeid(Real) == typeid(double)) { + curandDirectionVectors64_t *rngDirections; + curandStatus_t curandResult = curandGetDirectionVectors64(&rngDirections, CURAND_DIRECTION_VECTORS_64_JOEKUO6); + + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not get direction vectors for quasi-random number " + "generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } + + cudaResult = + cudaMemcpy(d_rngDirections, rngDirections, 2 * sizeof(curandDirectionVectors64_t), cudaMemcpyHostToDevice); + + if (cudaResult != cudaSuccess) { + string msg("Could not copy direction vectors to device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } + } + else { + string msg("Could not get direction vectors for random number generator of " + "specified type"); + throw std::runtime_error(msg); + } + + // Initialise RNG + initRNG<<>>(d_rngStates, d_rngDirections, m_numSims); + + // Count the points inside unit quarter-circle + computeValue<<>>(d_results, d_rngStates, m_numSims); + + // Copy partial results back + vector results(grid.x); + cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost); + + if (cudaResult != cudaSuccess) { + string msg("Could not copy partial results to host: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } + + // Complete sum-reduction on host + Real value = static_cast(std::accumulate(results.begin(), results.end(), 0)); + + // Determine the proportion of points inside the quarter-circle, + // i.e. the area of the unit quarter-circle + value /= m_numSims; + + // Value is currently an estimate of the area of a unit quarter-circle, so we + // can scale to a full circle by multiplying by four. Now since the area of a + // circle is pi * r^2, and r is one, the value will be an estimate for the + // value of pi. 
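// Same scaling as in the pseudorandom (InlineP) estimator, but here the
// points come from a two-dimensional Sobol' sequence, so coverage of the
// unit square is low-discrepancy: the estimation error typically shrinks
// close to O(log(N)^2 / N) rather than the O(1/sqrt(N)) of plain
// pseudorandom Monte Carlo.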
+ value *= 4; + + // Cleanup + if (d_rngStates) { + cudaFree(d_rngStates); + d_rngStates = 0; + } + + if (d_rngDirections) { + cudaFree(d_rngDirections); + d_rngDirections = 0; + } + + if (d_results) { + cudaFree(d_results); + d_results = 0; + } + + return value; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/test.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/test.cpp index 5be6ac29..c098b8ea 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/test.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/src/test.cpp @@ -28,84 +28,84 @@ #include "../inc/test.h" -#include -#include -#include -#include -#include #include -#include -#include -#include #include - +#include +#include +#include #include +#include +#include +#include +#include +#include #include "../inc/piestimator.h" -template -bool Test::operator()() { - using std::endl; - using std::setw; - using std::stringstream; +template bool Test::operator()() +{ + using std::endl; + using std::setw; + using std::stringstream; - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // Get device properties - struct cudaDeviceProp deviceProperties; - cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); + // Get device properties + struct cudaDeviceProp deviceProperties; + cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); - if (cudaResult != cudaSuccess) { - std::string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + std::string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Evaluate on GPU - printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); - PiEstimator estimator(numSims, device, threadBlockSize); - sdkStartTimer(&timer); - Real result = estimator(); - sdkStopTimer(&timer); - elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; + // Evaluate on GPU + printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); + PiEstimator estimator(numSims, device, threadBlockSize); + sdkStartTimer(&timer); + Real result = estimator(); + sdkStopTimer(&timer); + elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; - // Tolerance to compare result with expected - // This is just to check that nothing has gone very wrong with the - // test, the actual accuracy of the result depends on the number of - // Monte Carlo trials - const Real tolerance = static_cast(0.01); + // Tolerance to compare result with expected + // This is just to check that nothing has gone very wrong with the + // test, the actual accuracy of the result depends on the number of + // Monte Carlo trials + const Real tolerance = static_cast(0.01); - // Display results - Real abserror = fabs(result - static_cast(PI)); - Real relerror = abserror / static_cast(PI); - printf("Precision: %s\n", - (typeid(Real) == typeid(double)) ? 
"double" : "single"); - printf("Number of sims: %d\n", numSims); - printf("Tolerance: %e\n", tolerance); - printf("GPU result: %e\n", result); - printf("Expected: %e\n", PI); - printf("Absolute error: %e\n", abserror); - printf("Relative error: %e\n\n", relerror); + // Display results + Real abserror = fabs(result - static_cast(PI)); + Real relerror = abserror / static_cast(PI); + printf("Precision: %s\n", (typeid(Real) == typeid(double)) ? "double" : "single"); + printf("Number of sims: %d\n", numSims); + printf("Tolerance: %e\n", tolerance); + printf("GPU result: %e\n", result); + printf("Expected: %e\n", PI); + printf("Absolute error: %e\n", abserror); + printf("Relative error: %e\n\n", relerror); - // Check result - if (relerror > tolerance) { - printf("computed result (%e) does not match expected result (%e).\n", - result, PI); - pass = false; - } else { - pass = true; - } + // Check result + if (relerror > tolerance) { + printf("computed result (%e) does not match expected result (%e).\n", result, PI); + pass = false; + } + else { + pass = true; + } - // Print results - printf( - "MonteCarloEstimatePiInlineQ, Performance = %.2f sims/s, Time = " - "%.2f(ms), NumDevsUsed = %u, Blocksize = %u\n", - numSims / elapsedTime, elapsedTime * 1000.0f, 1, threadBlockSize); + // Print results + printf("MonteCarloEstimatePiInlineQ, Performance = %.2f sims/s, Time = " + "%.2f(ms), NumDevsUsed = %u, Blocksize = %u\n", + numSims / elapsedTime, + elapsedTime * 1000.0f, + 1, + threadBlockSize); - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); - return pass; + return pass; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/cudasharedmem.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/cudasharedmem.h index 5c9bab98..4627d72c 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/cudasharedmem.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/cudasharedmem.h @@ -65,25 +65,27 @@ // This is the un-specialized struct. Note that we prevent instantiation of // this struct by making it abstract (i.e. with pure virtual methods). 
-template <class T>
-struct SharedMemory {
-  // Ensure that we won't compile any un-specialized types
-  virtual __device__ T &operator*() = 0;
-  virtual __device__ T &operator[](int i) = 0;
+template <class T> struct SharedMemory
+{
+    // Ensure that we won't compile any un-specialized types
+    virtual __device__ T &operator*() = 0;
+    virtual __device__ T &operator[](int i) = 0;
 };

-#define BUILD_SHAREDMEMORY_TYPE(t, n) \
-  template <> \
-  struct SharedMemory<t> { \
-    __device__ t &operator*() { \
-      extern __shared__ t n[]; \
-      return *n; \
-    } \
-    __device__ t &operator[](int i) { \
-      extern __shared__ t n[]; \
-      return n[i]; \
-    } \
-  }
+#define BUILD_SHAREDMEMORY_TYPE(t, n) \
+    template <> struct SharedMemory<t> \
+    { \
+        __device__ t &operator*() \
+        { \
+            extern __shared__ t n[]; \
+            return *n; \
+        } \
+        __device__ t &operator[](int i) \
+        { \
+            extern __shared__ t n[]; \
+            return n[i]; \
+        } \
+    }

 BUILD_SHAREDMEMORY_TYPE(int, s_int);
 BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint);

diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/piestimator.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/piestimator.h
index 6099c614..e683cb39 100644
--- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/piestimator.h
+++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/piestimator.h
@@ -28,18 +28,17 @@
 #ifndef PIESTIMATOR_H
 #define PIESTIMATOR_H

-template <typename Real>
-class PiEstimator {
- public:
-  PiEstimator(unsigned int numSims, unsigned int device,
-              unsigned int threadBlockSize, unsigned int seed);
-  Real operator()();
+template <typename Real> class PiEstimator
+{
+public:
+    PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed);
+    Real operator()();

- private:
-  unsigned int m_seed;
-  unsigned int m_numSims;
-  unsigned int m_device;
-  unsigned int m_threadBlockSize;
+private:
+    unsigned int m_seed;
+    unsigned int m_numSims;
+    unsigned int m_device;
+    unsigned int m_threadBlockSize;
 };

 #endif

diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/test.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/test.h
index f8309d95..4928b1d9 100644
--- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/test.h
+++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/inc/test.h
@@ -28,30 +28,31 @@
 #ifndef TEST_H
 #define TEST_H

-template <typename Real>
-struct Test {
-  Test() : pass(false){};
+template <typename Real> struct Test
+{
+    Test()
+        : pass(false) {};

-  int device;
-  unsigned int numSims;
-  unsigned int threadBlockSize;
-  unsigned int seed;
+    int device;
+    unsigned int numSims;
+    unsigned int threadBlockSize;
+    unsigned int seed;

-  bool pass;
-  double elapsedTime;
+    bool pass;
+    double elapsedTime;

-  bool operator()();
+    bool operator()();
 };

 // Defaults are arbitrary to give sensible runtime
-#define k_sims_min 100000
-#define k_sims_max 10000000
-#define k_sims_def 100000
-#define k_sims_qa 100000
+#define k_sims_min 100000
+#define k_sims_max 10000000
+#define k_sims_def 100000
+#define k_sims_qa  100000
 #define k_bsize_min 32
 #define k_bsize_def 128
-#define k_bsize_qa 128
-#define k_seed_def 1234
+#define k_bsize_qa  128
+#define k_seed_def  1234

 // Target value
 #define PI 3.14159265359

diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/main.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/main.cpp
index 6193efb0..a7cbe5a6 100644
--- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/main.cpp
+++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/main.cpp
@@ -39,13 +39,13 @@
/////////////////////////////////////////////////////////////////////////////// -#include -#include -#include #include -#include #include +#include +#include +#include #include +#include #include "../inc/test.h" @@ -53,198 +53,203 @@ static const char *printfFile = "MonteCarloEstimatePiP.txt"; // Forward declarations -void showHelp(const int argc, const char **argv); -template -void runTest(int argc, const char **argv); +void showHelp(const int argc, const char **argv); +template void runTest(int argc, const char **argv); -int main(int argc, char **argv) { - using std::invalid_argument; - using std::string; +int main(int argc, char **argv) +{ + using std::invalid_argument; + using std::string; - // Open the log file - printf("Monte Carlo Estimate Pi (with batch PRNG)\n"); - printf("=========================================\n\n"); + // Open the log file + printf("Monte Carlo Estimate Pi (with batch PRNG)\n"); + printf("=========================================\n\n"); - // If help flag is set, display help and exit immediately - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Displaying help on console\n"); - showHelp(argc, (const char **)argv); - exit(EXIT_SUCCESS); - } - - // Check the precision (checked against the device capability later) - try { - char *value; - - if (getCmdLineArgumentString(argc, (const char **)argv, "precision", - &value)) { - // Check requested precision is valid - string prec(value); - - if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { - runTest(argc, (const char **)argv); - } else if (prec.compare("double") == 0 || - prec.compare("\"double\"") == 0) { - runTest(argc, (const char **)argv); - } else { - printf("specified precision (%s) is invalid, must be \"single\".\n", - value); - throw invalid_argument("precision"); - } - } else { - runTest(argc, (const char **)argv); + // If help flag is set, display help and exit immediately + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Displaying help on console\n"); + showHelp(argc, (const char **)argv); + exit(EXIT_SUCCESS); } - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } - // Finish - exit(EXIT_SUCCESS); + // Check the precision (checked against the device capability later) + try { + char *value; + + if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value)) { + // Check requested precision is valid + string prec(value); + + if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { + runTest(argc, (const char **)argv); + } + else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0) { + runTest(argc, (const char **)argv); + } + else { + printf("specified precision (%s) is invalid, must be \"single\".\n", value); + throw invalid_argument("precision"); + } + } + else { + runTest(argc, (const char **)argv); + } + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + + // Finish + exit(EXIT_SUCCESS); } -template -void runTest(int argc, const char **argv) { - using std::invalid_argument; - using std::runtime_error; +template void runTest(int argc, const char **argv) +{ + using std::invalid_argument; + using std::runtime_error; - try { - Test test; - int deviceCount = 0; - cudaError_t cudaResult = cudaSuccess; + try { + Test test; + int deviceCount = 0; + cudaError_t cudaResult = cudaSuccess; - // by default specify GPU Device == 0 - test.device = 0; + // by default specify GPU 
Device == 0 + test.device = 0; - // Get number of available devices - cudaResult = cudaGetDeviceCount(&deviceCount); - - if (cudaResult != cudaSuccess) { - printf("could not get device count.\n"); - throw runtime_error("cudaGetDeviceCount"); - } - - // (default parameters) - test.numSims = k_sims_qa; - test.threadBlockSize = k_bsize_qa; - test.seed = k_seed_def; - - { - char *value = 0; - - if (getCmdLineArgumentString(argc, argv, "device", &value)) { - test.device = (int)atoi(value); - - if (test.device >= deviceCount) { - printf( - "invalid target device specified on command line (device %d does " - "not exist).\n", - test.device); - throw invalid_argument("device"); - } - } else { - test.device = gpuGetMaxGflopsDeviceId(); - } - - if (getCmdLineArgumentString(argc, argv, "sims", &value)) { - test.numSims = (unsigned int)atoi(value); - - if (test.numSims < k_sims_min || test.numSims > k_sims_max) { - printf( - "specified number of simulations (%d) is invalid, must be " - "between %d and %d.\n", - test.numSims, k_sims_min, k_sims_max); - throw invalid_argument("sims"); - } - } else { - test.numSims = k_sims_def; - } - - if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { - // Determine max threads per block - cudaDeviceProp deviceProperties; - cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + // Get number of available devices + cudaResult = cudaGetDeviceCount(&deviceCount); if (cudaResult != cudaSuccess) { - printf("cound not get device properties for device %d.\n", - test.device); - throw runtime_error("cudaGetDeviceProperties"); + printf("could not get device count.\n"); + throw runtime_error("cudaGetDeviceCount"); } - // Check requested size is valid - test.threadBlockSize = (unsigned int)atoi(value); + // (default parameters) + test.numSims = k_sims_qa; + test.threadBlockSize = k_bsize_qa; + test.seed = k_seed_def; - if (test.threadBlockSize < k_bsize_min || - test.threadBlockSize > static_cast( - deviceProperties.maxThreadsPerBlock)) { - printf( - "specified block size (%d) is invalid, must be between %d and %d " - "for device %d.\n", - test.threadBlockSize, k_bsize_min, - deviceProperties.maxThreadsPerBlock, test.device); - throw invalid_argument("block-size"); + { + char *value = 0; + + if (getCmdLineArgumentString(argc, argv, "device", &value)) { + test.device = (int)atoi(value); + + if (test.device >= deviceCount) { + printf("invalid target device specified on command line (device %d does " + "not exist).\n", + test.device); + throw invalid_argument("device"); + } + } + else { + test.device = gpuGetMaxGflopsDeviceId(); + } + + if (getCmdLineArgumentString(argc, argv, "sims", &value)) { + test.numSims = (unsigned int)atoi(value); + + if (test.numSims < k_sims_min || test.numSims > k_sims_max) { + printf("specified number of simulations (%d) is invalid, must be " + "between %d and %d.\n", + test.numSims, + k_sims_min, + k_sims_max); + throw invalid_argument("sims"); + } + } + else { + test.numSims = k_sims_def; + } + + if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { + // Determine max threads per block + cudaDeviceProp deviceProperties; + cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + + if (cudaResult != cudaSuccess) { + printf("cound not get device properties for device %d.\n", test.device); + throw runtime_error("cudaGetDeviceProperties"); + } + + // Check requested size is valid + test.threadBlockSize = (unsigned int)atoi(value); + + if (test.threadBlockSize < k_bsize_min + || test.threadBlockSize > 
static_cast(deviceProperties.maxThreadsPerBlock)) { + printf("specified block size (%d) is invalid, must be between %d and %d " + "for device %d.\n", + test.threadBlockSize, + k_bsize_min, + deviceProperties.maxThreadsPerBlock, + test.device); + throw invalid_argument("block-size"); + } + + if (test.threadBlockSize & test.threadBlockSize - 1) { + printf("specified block size (%d) is invalid, must be a power of two " + "(see reduction function).\n", + test.threadBlockSize); + throw invalid_argument("block-size"); + } + } + else { + test.threadBlockSize = k_bsize_def; + } + + if (getCmdLineArgumentString(argc, argv, "seed", &value)) { + // Check requested seed is valid + test.seed = (unsigned int)atoi(value); + + if (test.seed == 0) { + printf("specified seed (%d) is invalid, must be non-zero.\n", test.seed); + throw invalid_argument("seed"); + } + } + else { + test.seed = k_seed_def; + } } - if (test.threadBlockSize & test.threadBlockSize - 1) { - printf( - "specified block size (%d) is invalid, must be a power of two " - "(see reduction function).\n", - test.threadBlockSize); - throw invalid_argument("block-size"); - } - } else { - test.threadBlockSize = k_bsize_def; - } + // Execute + test(); + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + catch (runtime_error &e) { + printf("runtime error (%s)\n", e.what()); + exit(EXIT_FAILURE); + } +} - if (getCmdLineArgumentString(argc, argv, "seed", &value)) { - // Check requested seed is valid - test.seed = (unsigned int)atoi(value); +void showHelp(int argc, const char **argv) +{ + using std::cout; + using std::endl; + using std::left; + using std::setw; - if (test.seed == 0) { - printf("specified seed (%d) is invalid, must be non-zero.\n", - test.seed); - throw invalid_argument("seed"); - } - } else { - test.seed = k_seed_def; - } + if (argc > 0) { + cout << endl << argv[0] << endl; } - // Execute - test(); - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } catch (runtime_error &e) { - printf("runtime error (%s)\n", e.what()); - exit(EXIT_FAILURE); - } -} - -void showHelp(int argc, const char **argv) { - using std::cout; - using std::endl; - using std::left; - using std::setw; - - if (argc > 0) { - cout << endl << argv[0] << endl; - } - - cout << endl << "Syntax:" << endl; - cout << left; - cout << " " << setw(20) << "--device=" - << "Specify device to use for execution" << endl; - cout << " " << setw(20) << "--sims=" - << "Specify number of Monte Carlo simulations" << endl; - cout << " " << setw(20) << "--block-size=" - << "Specify number of threads per block" << endl; - cout << " " << setw(20) << "--seed=" - << "Specify the seed to use for the random number generator" << endl; - cout << " " << setw(20) << "--precision=
<precision>
" - << "Specify the precision (\"single\" or \"double\")" << endl; - cout << endl; - cout << " " << setw(20) << "--noprompt" - << "Skip prompt before exit" << endl; - cout << endl; + cout << endl << "Syntax:" << endl; + cout << left; + cout << " " << setw(20) << "--device=" + << "Specify device to use for execution" << endl; + cout << " " << setw(20) << "--sims=" + << "Specify number of Monte Carlo simulations" << endl; + cout << " " << setw(20) << "--block-size=" + << "Specify number of threads per block" << endl; + cout << " " << setw(20) << "--seed=" + << "Specify the seed to use for the random number generator" << endl; + cout << " " << setw(20) << "--precision=
<precision>
" + << "Specify the precision (\"single\" or \"double\")" << endl; + cout << endl; + cout << " " << setw(20) << "--noprompt" + << "Skip prompt before exit" << endl; + cout << endl; } diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/piestimator.cu b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/piestimator.cu index 83f7884e..82090f6d 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/piestimator.cu +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/piestimator.cu @@ -25,15 +25,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "../inc/piestimator.h" - -#include -#include +#include +#include #include #include +#include #include -#include - #include +#include + +#include "../inc/piestimator.h" namespace cg = cooperative_groups; #include @@ -41,251 +41,249 @@ namespace cg = cooperative_groups; using std::string; using std::vector; -__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) { - extern __shared__ unsigned int sdata[]; +__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) +{ + extern __shared__ unsigned int sdata[]; - // Perform first level of reduction: - // - Write to shared memory - unsigned int ltid = threadIdx.x; + // Perform first level of reduction: + // - Write to shared memory + unsigned int ltid = threadIdx.x; - sdata[ltid] = in; - cg::sync(cta); + sdata[ltid] = in; + cg::sync(cta); - // Do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (ltid < s) { - sdata[ltid] += sdata[ltid + s]; + // Do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (ltid < s) { + sdata[ltid] += sdata[ltid + s]; + } + + cg::sync(cta); } - cg::sync(cta); - } - - return sdata[0]; + return sdata[0]; } // Estimator kernel template -__global__ void computeValue(unsigned int *const results, - const Real *const points, - const unsigned int numSims) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Determine thread ID - unsigned int bid = blockIdx.x; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; +__global__ void computeValue(unsigned int *const results, const Real *const points, const unsigned int numSims) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Determine thread ID + unsigned int bid = blockIdx.x; + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - // Shift the input/output pointers - const Real *pointx = points + tid; - const Real *pointy = pointx + numSims; + // Shift the input/output pointers + const Real *pointx = points + tid; + const Real *pointy = pointx + numSims; - // Count the number of points which lie inside the unit quarter-circle - unsigned int pointsInside = 0; + // Count the number of points which lie inside the unit quarter-circle + unsigned int pointsInside = 0; - for (unsigned int i = tid; i < numSims; - i += step, pointx += step, pointy += step) { - Real x = *pointx; - Real y = *pointy; - Real l2norm2 = x * x + y * y; + for (unsigned int i = tid; i < numSims; i += step, pointx += step, pointy += step) { + Real x = *pointx; + Real y = *pointy; + Real l2norm2 = x * x + y * y; - if (l2norm2 < static_cast(1)) { - pointsInside++; + if (l2norm2 < static_cast(1)) { + pointsInside++; + } } - } - // Reduce within the block - pointsInside = reduce_sum(pointsInside, cta); + // Reduce within the 
block + pointsInside = reduce_sum(pointsInside, cta); - // Store the result - if (threadIdx.x == 0) { - results[bid] = pointsInside; - } + // Store the result + if (threadIdx.x == 0) { + results[bid] = pointsInside; + } } template -PiEstimator::PiEstimator(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize, unsigned int seed) - : m_numSims(numSims), - m_device(device), - m_threadBlockSize(threadBlockSize), - m_seed(seed) {} +PiEstimator::PiEstimator(unsigned int numSims, + unsigned int device, + unsigned int threadBlockSize, + unsigned int seed) + : m_numSims(numSims) + , m_device(device) + , m_threadBlockSize(threadBlockSize) + , m_seed(seed) +{ +} -template -Real PiEstimator::operator()() { - cudaError_t cudaResult = cudaSuccess; - struct cudaDeviceProp deviceProperties; - struct cudaFuncAttributes funcAttributes; +template Real PiEstimator::operator()() +{ + cudaError_t cudaResult = cudaSuccess; + struct cudaDeviceProp deviceProperties; + struct cudaFuncAttributes funcAttributes; - // Get device properties - cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); + // Get device properties + cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); - if (cudaResult != cudaSuccess) { - string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Check precision is valid - if (typeid(Real) == typeid(double) && - (deviceProperties.major < 1 || - (deviceProperties.major == 1 && deviceProperties.minor < 3))) { - throw std::runtime_error("Device does not have double precision support"); - } + // Check precision is valid + if (typeid(Real) == typeid(double) + && (deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3))) { + throw std::runtime_error("Device does not have double precision support"); + } - // Attach to GPU - cudaResult = cudaSetDevice(m_device); + // Attach to GPU + cudaResult = cudaSetDevice(m_device); - if (cudaResult != cudaSuccess) { - string msg("Could not set CUDA device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not set CUDA device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Determine how to divide the work between cores - dim3 block; - dim3 grid; - block.x = m_threadBlockSize; - grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; + // Determine how to divide the work between cores + dim3 block; + dim3 grid; + block.x = m_threadBlockSize; + grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; - // Aim to launch around ten or more times as many blocks as there - // are multiprocessors on the target device. - unsigned int blocksPerSM = 10; - unsigned int numSMs = deviceProperties.multiProcessorCount; + // Aim to launch around ten or more times as many blocks as there + // are multiprocessors on the target device. 
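Isolating the launch-sizing idea from the hunk above: the grid starts at one thread per simulation and is then halved until it is within about twice ten blocks per multiprocessor, after which each surviving thread loops over simulations with a gridDim.x * blockDim.x stride. A hedged sketch with free-standing names (numSims, threadBlockSize and numSMs stand in for the member fields and deviceProperties.multiProcessorCount used above):

dim3 block(threadBlockSize);
dim3 grid((numSims + threadBlockSize - 1) / threadBlockSize);  // one thread per simulation

const unsigned int blocksPerSM = 10;
while (grid.x > 2 * blocksPerSM * numSMs) {
    grid.x >>= 1;  // halve until close to ten blocks per multiprocessor
}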
+ unsigned int blocksPerSM = 10; + unsigned int numSMs = deviceProperties.multiProcessorCount; - while (grid.x > 2 * blocksPerSM * numSMs) { - grid.x >>= 1; - } + while (grid.x > 2 * blocksPerSM * numSMs) { + grid.x >>= 1; + } - // Get computeValue function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); + // Get computeValue function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for computeValue kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for computeValue kernel"); + } - // Check the dimensions are valid - if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { - throw std::runtime_error("Block X dimension is too large for device"); - } + // Check the dimensions are valid + if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { + throw std::runtime_error("Block X dimension is too large for device"); + } - if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { - throw std::runtime_error("Grid X dimension is too large for device"); - } + if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { + throw std::runtime_error("Grid X dimension is too large for device"); + } - // Allocate memory for points - // Each simulation has two random numbers to give X and Y coordinate - Real *d_points = 0; - cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real)); + // Allocate memory for points + // Each simulation has two random numbers to give X and Y coordinate + Real *d_points = 0; + cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for random numbers: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for random numbers: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Allocate memory for result - // Each thread block will produce one result - unsigned int *d_results = 0; - cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); + // Allocate memory for result + // Each thread block will produce one result + unsigned int *d_results = 0; + cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for partial results: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for partial results: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Generate random points in unit square - curandStatus_t curandResult; - curandGenerator_t prng; - curandResult = curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT); + // Generate random points in unit square + 
curandStatus_t curandResult; + curandGenerator_t prng; + curandResult = curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not create pseudo-random number generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not create pseudo-random number generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - curandResult = curandSetPseudoRandomGeneratorSeed(prng, m_seed); + curandResult = curandSetPseudoRandomGeneratorSeed(prng, m_seed); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not set seed for pseudo-random number generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not set seed for pseudo-random number generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - if (typeid(Real) == typeid(float)) { - curandResult = - curandGenerateUniform(prng, (float *)d_points, 2 * m_numSims); - } else if (typeid(Real) == typeid(double)) { - curandResult = - curandGenerateUniformDouble(prng, (double *)d_points, 2 * m_numSims); - } else { - string msg("Could not generate random numbers of specified type"); - throw std::runtime_error(msg); - } + if (typeid(Real) == typeid(float)) { + curandResult = curandGenerateUniform(prng, (float *)d_points, 2 * m_numSims); + } + else if (typeid(Real) == typeid(double)) { + curandResult = curandGenerateUniformDouble(prng, (double *)d_points, 2 * m_numSims); + } + else { + string msg("Could not generate random numbers of specified type"); + throw std::runtime_error(msg); + } - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not generate pseudo-random numbers: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not generate pseudo-random numbers: "); + msg += curandResult; + throw std::runtime_error(msg); + } - curandResult = curandDestroyGenerator(prng); + curandResult = curandDestroyGenerator(prng); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not destroy pseudo-random number generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not destroy pseudo-random number generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - // Count the points inside unit quarter-circle - computeValue<<>>( - d_results, d_points, m_numSims); + // Count the points inside unit quarter-circle + computeValue<<>>(d_results, d_points, m_numSims); - // Copy partial results back - vector results(grid.x); - cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), - cudaMemcpyDeviceToHost); + // Copy partial results back + vector results(grid.x); + cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost); - if (cudaResult != cudaSuccess) { - string msg("Could not copy partial results to host: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not copy partial results to host: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Complete sum-reduction on host - Real value = - static_cast(std::accumulate(results.begin(), results.end(), 0)); + // Complete sum-reduction on host + Real value = 
static_cast(std::accumulate(results.begin(), results.end(), 0)); - // Determine the proportion of points inside the quarter-circle, - // i.e. the area of the unit quarter-circle - value /= m_numSims; + // Determine the proportion of points inside the quarter-circle, + // i.e. the area of the unit quarter-circle + value /= m_numSims; - // Value is currently an estimate of the area of a unit quarter-circle, so we - // can scale to a full circle by multiplying by four. Now since the area of a - // circle is pi * r^2, and r is one, the value will be an estimate for the - // value of pi. - value *= 4; + // Value is currently an estimate of the area of a unit quarter-circle, so we + // can scale to a full circle by multiplying by four. Now since the area of a + // circle is pi * r^2, and r is one, the value will be an estimate for the + // value of pi. + value *= 4; - // Cleanup - if (d_points) { - cudaFree(d_points); - d_points = 0; - } + // Cleanup + if (d_points) { + cudaFree(d_points); + d_points = 0; + } - if (d_results) { - cudaFree(d_results); - d_results = 0; - } + if (d_results) { + cudaFree(d_results); + d_results = 0; + } - return value; + return value; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/test.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/test.cpp index db8d8cf4..b74f695e 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/test.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/src/test.cpp @@ -27,81 +27,82 @@ #include "../inc/test.h" -#include -#include -#include -#include -#include #include -#include -#include -#include #include +#include +#include +#include #include +#include +#include +#include +#include +#include #include "../inc/piestimator.h" -template -bool Test::operator()() { - using std::endl; - using std::setw; - using std::stringstream; +template bool Test::operator()() +{ + using std::endl; + using std::setw; + using std::stringstream; - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // Get device properties - struct cudaDeviceProp deviceProperties; - cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); + // Get device properties + struct cudaDeviceProp deviceProperties; + cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); - if (cudaResult != cudaSuccess) { - std::string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + std::string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Evaluate on GPU - printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); - PiEstimator estimator(numSims, device, threadBlockSize, seed); - sdkStartTimer(&timer); - Real result = estimator(); - sdkStopTimer(&timer); - elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; + // Evaluate on GPU + printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); + PiEstimator estimator(numSims, device, threadBlockSize, seed); + sdkStartTimer(&timer); + Real result = estimator(); + sdkStopTimer(&timer); + elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; - // Tolerance to compare result with expected - // This is just to check that nothing has gone very wrong with the - // test, the actual accuracy of the result depends on the number of - // Monte Carlo trials - const Real 
tolerance = static_cast(0.01); + // Tolerance to compare result with expected + // This is just to check that nothing has gone very wrong with the + // test, the actual accuracy of the result depends on the number of + // Monte Carlo trials + const Real tolerance = static_cast(0.01); - // Display results - Real abserror = fabs(result - static_cast(PI)); - Real relerror = abserror / static_cast(PI); - printf("Precision: %s\n", - (typeid(Real) == typeid(double)) ? "double" : "single"); - printf("Number of sims: %d\n", numSims); - printf("Tolerance: %e\n", tolerance); - printf("GPU result: %e\n", result); - printf("Expected: %e\n", PI); - printf("Absolute error: %e\n", abserror); - printf("Relative error: %e\n\n", relerror); + // Display results + Real abserror = fabs(result - static_cast(PI)); + Real relerror = abserror / static_cast(PI); + printf("Precision: %s\n", (typeid(Real) == typeid(double)) ? "double" : "single"); + printf("Number of sims: %d\n", numSims); + printf("Tolerance: %e\n", tolerance); + printf("GPU result: %e\n", result); + printf("Expected: %e\n", PI); + printf("Absolute error: %e\n", abserror); + printf("Relative error: %e\n\n", relerror); - // Check result - if (relerror > tolerance) { - printf("computed result (%e) does not match expected result (%e).\n", - result, PI); - pass = false; - } else { - pass = true; - } + // Check result + if (relerror > tolerance) { + printf("computed result (%e) does not match expected result (%e).\n", result, PI); + pass = false; + } + else { + pass = true; + } - // Print results - printf( - "MonteCarloEstimatePiP, Performance = %.2f sims/s, Time = %.2f(ms), " - "NumDevsUsed = %u, Blocksize = %u\n", - numSims / elapsedTime, elapsedTime * 1000.0f, 1, threadBlockSize); + // Print results + printf("MonteCarloEstimatePiP, Performance = %.2f sims/s, Time = %.2f(ms), " + "NumDevsUsed = %u, Blocksize = %u\n", + numSims / elapsedTime, + elapsedTime * 1000.0f, + 1, + threadBlockSize); - return pass; + return pass; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/cudasharedmem.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/cudasharedmem.h index 5c9bab98..4627d72c 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/cudasharedmem.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/cudasharedmem.h @@ -65,25 +65,27 @@ // This is the un-specialized struct. Note that we prevent instantiation of // this struct by making it abstract (i.e. with pure virtual methods). 
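One behavioural aside on the error paths running through the piestimator.cu and test.cpp hunks above: curandResult is a curandStatus_t enum, so a statement like msg += curandResult; appends it to the std::string as a single char via integral conversion, not as readable text. The patch deliberately only reformats this; a hedged sketch of a clearer alternative, not a change the patch makes:

// Illustrative only: append the numeric status code instead of a raw char.
msg += std::to_string(static_cast<int>(curandResult));  // requires <string>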
-template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - virtual __device__ T &operator*() = 0; - virtual __device__ T &operator[](int i) = 0; +template struct SharedMemory +{ + // Ensure that we won't compile any un-specialized types + virtual __device__ T &operator*() = 0; + virtual __device__ T &operator[](int i) = 0; }; -#define BUILD_SHAREDMEMORY_TYPE(t, n) \ - template <> \ - struct SharedMemory { \ - __device__ t &operator*() { \ - extern __shared__ t n[]; \ - return *n; \ - } \ - __device__ t &operator[](int i) { \ - extern __shared__ t n[]; \ - return n[i]; \ - } \ - } +#define BUILD_SHAREDMEMORY_TYPE(t, n) \ + template <> struct SharedMemory \ + { \ + __device__ t &operator*() \ + { \ + extern __shared__ t n[]; \ + return *n; \ + } \ + __device__ t &operator[](int i) \ + { \ + extern __shared__ t n[]; \ + return n[i]; \ + } \ + } BUILD_SHAREDMEMORY_TYPE(int, s_int); BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint); diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/piestimator.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/piestimator.h index 83fc5099..3100ff84 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/piestimator.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/piestimator.h @@ -27,17 +27,16 @@ #ifndef PIESTIMATOR_H #define PIESTIMATOR_H -template -class PiEstimator { - public: - PiEstimator(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize); - Real operator()(); +template class PiEstimator +{ +public: + PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize); + Real operator()(); - private: - unsigned int m_numSims; - unsigned int m_device; - unsigned int m_threadBlockSize; +private: + unsigned int m_numSims; + unsigned int m_device; + unsigned int m_threadBlockSize; }; #endif diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/test.h b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/test.h index 457f93bd..4c920fe5 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/test.h +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/inc/test.h @@ -28,28 +28,29 @@ #ifndef TEST_H #define TEST_H -template -struct Test { - Test() : pass(false){}; +template struct Test +{ + Test() + : pass(false) {}; - int device; - unsigned int numSims; - unsigned int threadBlockSize; + int device; + unsigned int numSims; + unsigned int threadBlockSize; - bool pass; - double elapsedTime; + bool pass; + double elapsedTime; - bool operator()(); + bool operator()(); }; // Defaults are arbitrary to give sensible runtime -#define k_sims_min 100000 -#define k_sims_max 10000000 -#define k_sims_def 100000 -#define k_sims_qa 100000 +#define k_sims_min 100000 +#define k_sims_max 10000000 +#define k_sims_def 100000 +#define k_sims_qa 100000 #define k_bsize_min 32 #define k_bsize_def 128 -#define k_bsize_qa 128 +#define k_bsize_qa 128 // Target value #define PI 3.14159265359 diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/main.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/main.cpp index 9344acb4..7448abd9 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/main.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/main.cpp @@ -39,12 +39,12 @@ /////////////////////////////////////////////////////////////////////////////// -#include -#include -#include #include -#include #include +#include +#include +#include +#include #include "../inc/test.h" @@ -52,181 +52,186 @@ static const char 
*printfFile = "MonteCarloEstimatePiQ.txt"; // Forward declarations -void showHelp(const int argc, const char **argv); -template -void runTest(int argc, const char **argv); +void showHelp(const int argc, const char **argv); +template void runTest(int argc, const char **argv); -int main(int argc, char **argv) { - using std::invalid_argument; - using std::string; +int main(int argc, char **argv) +{ + using std::invalid_argument; + using std::string; - // Open the log file - printf("Monte Carlo Estimate Pi (with batch QRNG)\n"); - printf("=========================================\n\n"); + // Open the log file + printf("Monte Carlo Estimate Pi (with batch QRNG)\n"); + printf("=========================================\n\n"); - // If help flag is set, display help and exit immediately - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Displaying help on console\n"); - showHelp(argc, (const char **)argv); - exit(EXIT_SUCCESS); - } - - // Check the precision (checked against the device capability later) - try { - char *value; - - if (getCmdLineArgumentString(argc, (const char **)argv, "precision", - &value)) { - // Check requested precision is valid - string prec(value); - - if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { - runTest(argc, (const char **)argv); - } else if (prec.compare("double") == 0 || - prec.compare("\"double\"") == 0) { - runTest(argc, (const char **)argv); - } else { - printf("specified precision (%s) is invalid, must be \"single\".\n", - value); - throw invalid_argument("precision"); - } - } else { - runTest(argc, (const char **)argv); + // If help flag is set, display help and exit immediately + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Displaying help on console\n"); + showHelp(argc, (const char **)argv); + exit(EXIT_SUCCESS); } - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } - // Finish - exit(EXIT_SUCCESS); + // Check the precision (checked against the device capability later) + try { + char *value; + + if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value)) { + // Check requested precision is valid + string prec(value); + + if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { + runTest(argc, (const char **)argv); + } + else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0) { + runTest(argc, (const char **)argv); + } + else { + printf("specified precision (%s) is invalid, must be \"single\".\n", value); + throw invalid_argument("precision"); + } + } + else { + runTest(argc, (const char **)argv); + } + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + + // Finish + exit(EXIT_SUCCESS); } -template -void runTest(int argc, const char **argv) { - using std::invalid_argument; - using std::runtime_error; +template void runTest(int argc, const char **argv) +{ + using std::invalid_argument; + using std::runtime_error; - try { - Test test; - int deviceCount = 0; - cudaError_t cudaResult = cudaSuccess; + try { + Test test; + int deviceCount = 0; + cudaError_t cudaResult = cudaSuccess; - // by default specify GPU Device == 0 - test.device = 0; + // by default specify GPU Device == 0 + test.device = 0; - // Get number of available devices - cudaResult = cudaGetDeviceCount(&deviceCount); - - if (cudaResult != cudaSuccess) { - printf("could not get device count.\n"); - throw runtime_error("cudaGetDeviceCount"); - } - - // 
(default parameters) - test.numSims = k_sims_qa; - test.threadBlockSize = k_bsize_qa; - - { - char *value = 0; - - if (getCmdLineArgumentString(argc, argv, "device", &value)) { - test.device = (int)atoi(value); - - if (test.device >= deviceCount) { - printf( - "invalid target device specified on command line (device %d does " - "not exist).\n", - test.device); - throw invalid_argument("device"); - } - } else { - test.device = gpuGetMaxGflopsDeviceId(); - } - - if (getCmdLineArgumentString(argc, argv, "sims", &value)) { - test.numSims = (unsigned int)atoi(value); - - if (test.numSims < k_sims_min || test.numSims > k_sims_max) { - printf( - "specified number of simulations (%d) is invalid, must be " - "between %d and %d.\n", - test.numSims, k_sims_min, k_sims_max); - throw invalid_argument("sims"); - } - } else { - test.numSims = k_sims_def; - } - - if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { - // Determine max threads per block - cudaDeviceProp deviceProperties; - cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + // Get number of available devices + cudaResult = cudaGetDeviceCount(&deviceCount); if (cudaResult != cudaSuccess) { - printf("cound not get device properties for device %d.\n", - test.device); - throw runtime_error("cudaGetDeviceProperties"); + printf("could not get device count.\n"); + throw runtime_error("cudaGetDeviceCount"); } - // Check requested size is valid - test.threadBlockSize = (unsigned int)atoi(value); + // (default parameters) + test.numSims = k_sims_qa; + test.threadBlockSize = k_bsize_qa; - if (test.threadBlockSize < k_bsize_min || - test.threadBlockSize > static_cast( - deviceProperties.maxThreadsPerBlock)) { - printf( - "specified block size (%d) is invalid, must be between %d and %d " - "for device %d.\n", - test.threadBlockSize, k_bsize_min, - deviceProperties.maxThreadsPerBlock, test.device); - throw invalid_argument("block-size"); - } + { + char *value = 0; - if (test.threadBlockSize & test.threadBlockSize - 1) { - printf( - "specified block size (%d) is invalid, must be a power of two " - "(see reduction function).\n", - test.threadBlockSize); - throw invalid_argument("block-size"); + if (getCmdLineArgumentString(argc, argv, "device", &value)) { + test.device = (int)atoi(value); + + if (test.device >= deviceCount) { + printf("invalid target device specified on command line (device %d does " + "not exist).\n", + test.device); + throw invalid_argument("device"); + } + } + else { + test.device = gpuGetMaxGflopsDeviceId(); + } + + if (getCmdLineArgumentString(argc, argv, "sims", &value)) { + test.numSims = (unsigned int)atoi(value); + + if (test.numSims < k_sims_min || test.numSims > k_sims_max) { + printf("specified number of simulations (%d) is invalid, must be " + "between %d and %d.\n", + test.numSims, + k_sims_min, + k_sims_max); + throw invalid_argument("sims"); + } + } + else { + test.numSims = k_sims_def; + } + + if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { + // Determine max threads per block + cudaDeviceProp deviceProperties; + cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + + if (cudaResult != cudaSuccess) { + printf("cound not get device properties for device %d.\n", test.device); + throw runtime_error("cudaGetDeviceProperties"); + } + + // Check requested size is valid + test.threadBlockSize = (unsigned int)atoi(value); + + if (test.threadBlockSize < k_bsize_min + || test.threadBlockSize > static_cast(deviceProperties.maxThreadsPerBlock)) { + printf("specified 
block size (%d) is invalid, must be between %d and %d " + "for device %d.\n", + test.threadBlockSize, + k_bsize_min, + deviceProperties.maxThreadsPerBlock, + test.device); + throw invalid_argument("block-size"); + } + + if (test.threadBlockSize & test.threadBlockSize - 1) { + printf("specified block size (%d) is invalid, must be a power of two " + "(see reduction function).\n", + test.threadBlockSize); + throw invalid_argument("block-size"); + } + } + else { + test.threadBlockSize = k_bsize_def; + } } - } else { - test.threadBlockSize = k_bsize_def; - } + // Execute + test(); + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + catch (runtime_error &e) { + printf("runtime error (%s)\n", e.what()); + exit(EXIT_FAILURE); } - // Execute - test(); - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } catch (runtime_error &e) { - printf("runtime error (%s)\n", e.what()); - exit(EXIT_FAILURE); - } } -void showHelp(int argc, const char **argv) { - using std::cout; - using std::endl; - using std::left; - using std::setw; +void showHelp(int argc, const char **argv) +{ + using std::cout; + using std::endl; + using std::left; + using std::setw; - if (argc > 0) { - cout << endl << argv[0] << endl; - } + if (argc > 0) { + cout << endl << argv[0] << endl; + } - cout << endl << "Syntax:" << endl; - cout << left; - cout << " " << setw(20) << "--device=" - << "Specify device to use for execution" << endl; - cout << " " << setw(20) << "--sims=" - << "Specify number of Monte Carlo simulations" << endl; - cout << " " << setw(20) << "--block-size=" - << "Specify number of threads per block" << endl; - cout << " " << setw(20) << "--precision=
<precision>
" - << "Specify the precision (\"single\" or \"double\")" << endl; - cout << endl; - cout << " " << setw(20) << "--noprompt" - << "Skip prompt before exit" << endl; - cout << endl; + cout << endl << "Syntax:" << endl; + cout << left; + cout << " " << setw(20) << "--device=" + << "Specify device to use for execution" << endl; + cout << " " << setw(20) << "--sims=" + << "Specify number of Monte Carlo simulations" << endl; + cout << " " << setw(20) << "--block-size=" + << "Specify number of threads per block" << endl; + cout << " " << setw(20) << "--precision=
<precision>
" + << "Specify the precision (\"single\" or \"double\")" << endl; + cout << endl; + cout << " " << setw(20) << "--noprompt" + << "Skip prompt before exit" << endl; + cout << endl; } diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/piestimator.cu b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/piestimator.cu index 87466745..c708a490 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/piestimator.cu +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/piestimator.cu @@ -25,15 +25,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "../inc/piestimator.h" - -#include -#include +#include +#include #include #include +#include #include -#include -#include +#include + +#include "../inc/piestimator.h" namespace cg = cooperative_groups; #include @@ -41,269 +41,264 @@ namespace cg = cooperative_groups; using std::string; using std::vector; -__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) { - extern __shared__ unsigned int sdata[]; +__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta) +{ + extern __shared__ unsigned int sdata[]; - // Perform first level of reduction: - // - Write to shared memory - unsigned int ltid = threadIdx.x; + // Perform first level of reduction: + // - Write to shared memory + unsigned int ltid = threadIdx.x; - sdata[ltid] = in; - cg::sync(cta); + sdata[ltid] = in; + cg::sync(cta); - // Do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (ltid < s) { - sdata[ltid] += sdata[ltid + s]; + // Do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (ltid < s) { + sdata[ltid] += sdata[ltid + s]; + } + + cg::sync(cta); } - cg::sync(cta); - } - - return sdata[0]; + return sdata[0]; } // Estimator kernel template -__global__ void computeValue(unsigned int *const results, - const Real *const points, - const unsigned int numSims) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Determine thread ID - unsigned int bid = blockIdx.x; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; +__global__ void computeValue(unsigned int *const results, const Real *const points, const unsigned int numSims) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Determine thread ID + unsigned int bid = blockIdx.x; + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - // Shift the input/output pointers - const Real *pointx = points + tid; - const Real *pointy = pointx + numSims; + // Shift the input/output pointers + const Real *pointx = points + tid; + const Real *pointy = pointx + numSims; - // Count the number of points which lie inside the unit quarter-circle - unsigned int pointsInside = 0; + // Count the number of points which lie inside the unit quarter-circle + unsigned int pointsInside = 0; - for (unsigned int i = tid; i < numSims; - i += step, pointx += step, pointy += step) { - Real x = *pointx; - Real y = *pointy; - Real l2norm2 = x * x + y * y; + for (unsigned int i = tid; i < numSims; i += step, pointx += step, pointy += step) { + Real x = *pointx; + Real y = *pointy; + Real l2norm2 = x * x + y * y; - if (l2norm2 < static_cast(1)) { - pointsInside++; + if (l2norm2 < static_cast(1)) { + pointsInside++; + } } - } - // Reduce within the block - pointsInside = reduce_sum(pointsInside, cta); + // Reduce within the 
block
+    pointsInside = reduce_sum(pointsInside, cta);

-  // Store the result
-  if (threadIdx.x == 0) {
-    results[bid] = pointsInside;
-  }
+    // Store the result
+    if (threadIdx.x == 0) {
+        results[bid] = pointsInside;
+    }
 }

 template <typename Real>
-PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device,
-                               unsigned int threadBlockSize)
-    : m_numSims(numSims),
-      m_device(device),
-      m_threadBlockSize(threadBlockSize) {}
+PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize)
+    : m_numSims(numSims)
+    , m_device(device)
+    , m_threadBlockSize(threadBlockSize)
+{
+}

-template <typename Real>
-Real PiEstimator<Real>::operator()() {
-  cudaError_t cudaResult = cudaSuccess;
-  struct cudaDeviceProp deviceProperties;
-  struct cudaFuncAttributes funcAttributes;
+template <typename Real> Real PiEstimator<Real>::operator()()
+{
+    cudaError_t cudaResult = cudaSuccess;
+    struct cudaDeviceProp deviceProperties;
+    struct cudaFuncAttributes funcAttributes;

-  // Get device properties
-  cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
+    // Get device properties
+    cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);

-  if (cudaResult != cudaSuccess) {
-    string msg("Could not get device properties: ");
-    msg += cudaGetErrorString(cudaResult);
-    throw std::runtime_error(msg);
-  }
+    if (cudaResult != cudaSuccess) {
+        string msg("Could not get device properties: ");
+        msg += cudaGetErrorString(cudaResult);
+        throw std::runtime_error(msg);
+    }

-  // Check precision is valid
-  if (typeid(Real) == typeid(double) &&
-      (deviceProperties.major < 1 ||
-       (deviceProperties.major == 1 && deviceProperties.minor < 3))) {
-    throw std::runtime_error("Device does not have double precision support");
-  }
+    // Check precision is valid
+    if (typeid(Real) == typeid(double)
+        && (deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3))) {
+        throw std::runtime_error("Device does not have double precision support");
+    }

-  // Attach to GPU
-  cudaResult = cudaSetDevice(m_device);
+    // Attach to GPU
+    cudaResult = cudaSetDevice(m_device);

-  if (cudaResult != cudaSuccess) {
-    string msg("Could not set CUDA device: ");
-    msg += cudaGetErrorString(cudaResult);
-    throw std::runtime_error(msg);
-  }
+    if (cudaResult != cudaSuccess) {
+        string msg("Could not set CUDA device: ");
+        msg += cudaGetErrorString(cudaResult);
+        throw std::runtime_error(msg);
+    }

-  // Determine how to divide the work between cores
-  dim3 block;
-  dim3 grid;
-  block.x = m_threadBlockSize;
-  grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
+    // Determine how to divide the work between cores
+    dim3 block;
+    dim3 grid;
+    block.x = m_threadBlockSize;
+    grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;

-  // Aim to launch around ten or more times as many blocks as there
-  // are multiprocessors on the target device.
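reduce_sum, shown again in the hunk above, is a classic power-of-two tree reduction in dynamic shared memory. A hedged standalone rendering, using plain __syncthreads() in place of the cooperative-groups cg::sync(cta) used by the sample (equivalent here, where the group is the whole thread block); it assumes blockDim.x is a power of two, which main.cpp's block-size check enforces, plus blockDim.x * sizeof(unsigned int) bytes of dynamic shared memory:

__device__ unsigned int blockSum(unsigned int in)
{
    extern __shared__ unsigned int sdata[];
    sdata[threadIdx.x] = in;
    __syncthreads();

    // Each step folds the upper half of the array onto the lower half.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        }
        __syncthreads();
    }
    return sdata[0];  // thread 0's slot holds the block total
}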
+ unsigned int blocksPerSM = 10; + unsigned int numSMs = deviceProperties.multiProcessorCount; - while (grid.x > 2 * blocksPerSM * numSMs) { - grid.x >>= 1; - } + while (grid.x > 2 * blocksPerSM * numSMs) { + grid.x >>= 1; + } - // Get computeValue function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); + // Get computeValue function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for computeValue kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for computeValue kernel"); + } - // Check the dimensions are valid - if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { - throw std::runtime_error("Block X dimension is too large for device"); - } + // Check the dimensions are valid + if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0]) { + throw std::runtime_error("Block X dimension is too large for device"); + } - if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { - throw std::runtime_error("Grid X dimension is too large for device"); - } + if (grid.x > (unsigned int)deviceProperties.maxGridSize[0]) { + throw std::runtime_error("Grid X dimension is too large for device"); + } - // Allocate memory for points - // Each simulation has two random numbers to give X and Y coordinate - Real *d_points = 0; - cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real)); + // Allocate memory for points + // Each simulation has two random numbers to give X and Y coordinate + Real *d_points = 0; + cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for random numbers: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for random numbers: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Allocate memory for result - // Each thread block will produce one result - unsigned int *d_results = 0; - cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); + // Allocate memory for result + // Each thread block will produce one result + unsigned int *d_results = 0; + cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for partial results: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for partial results: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Generate random points in unit square - curandStatus_t curandResult; - curandGenerator_t qrng; + // Generate random points in unit square + curandStatus_t curandResult; + curandGenerator_t qrng; - if (typeid(Real) 
== typeid(float)) { - curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32); - } else if (typeid(Real) == typeid(double)) { - curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL64); - } else { - string msg("Could not create random number generator of specified type"); - throw std::runtime_error(msg); - } + if (typeid(Real) == typeid(float)) { + curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32); + } + else if (typeid(Real) == typeid(double)) { + curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL64); + } + else { + string msg("Could not create random number generator of specified type"); + throw std::runtime_error(msg); + } - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not create quasi-random number generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not create quasi-random number generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2); + curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg( - "Could not set number of dimensions for quasi-random number " - "generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not set number of dimensions for quasi-random number " + "generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - curandResult = - curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT); + curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not set order for quasi-random number generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not set order for quasi-random number generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - if (typeid(Real) == typeid(float)) { - curandResult = - curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims); - } else if (typeid(Real) == typeid(double)) { - curandResult = - curandGenerateUniformDouble(qrng, (double *)d_points, 2 * m_numSims); - } else { - string msg("Could not generate random numbers of specified type"); - throw std::runtime_error(msg); - } + if (typeid(Real) == typeid(float)) { + curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims); + } + else if (typeid(Real) == typeid(double)) { + curandResult = curandGenerateUniformDouble(qrng, (double *)d_points, 2 * m_numSims); + } + else { + string msg("Could not generate random numbers of specified type"); + throw std::runtime_error(msg); + } - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not generate quasi-random numbers: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not generate quasi-random numbers: "); + msg += curandResult; + throw std::runtime_error(msg); + } - curandResult = curandDestroyGenerator(qrng); + curandResult = curandDestroyGenerator(qrng); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not destroy quasi-random number generator: "); - msg += curandResult; - throw std::runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not destroy quasi-random number 
generator: "); + msg += curandResult; + throw std::runtime_error(msg); + } - // Count the points inside unit quarter-circle - computeValue<<>>( - d_results, d_points, m_numSims); + // Count the points inside unit quarter-circle + computeValue<<>>(d_results, d_points, m_numSims); - // Copy partial results back - vector results(grid.x); - cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), - cudaMemcpyDeviceToHost); + // Copy partial results back + vector results(grid.x); + cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost); - if (cudaResult != cudaSuccess) { - string msg("Could not copy partial results to host: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not copy partial results to host: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Complete sum-reduction on host - Real value = - static_cast(std::accumulate(results.begin(), results.end(), 0)); + // Complete sum-reduction on host + Real value = static_cast(std::accumulate(results.begin(), results.end(), 0)); - // Determine the proportion of points inside the quarter-circle, - // i.e. the area of the unit quarter-circle - value /= m_numSims; + // Determine the proportion of points inside the quarter-circle, + // i.e. the area of the unit quarter-circle + value /= m_numSims; - // Value is currently an estimate of the area of a unit quarter-circle, so we - // can scale to a full circle by multiplying by four. Now since the area of a - // circle is pi * r^2, and r is one, the value will be an estimate for the - // value of pi. - value *= 4; + // Value is currently an estimate of the area of a unit quarter-circle, so we + // can scale to a full circle by multiplying by four. Now since the area of a + // circle is pi * r^2, and r is one, the value will be an estimate for the + // value of pi. 
+ value *= 4; - // Cleanup - if (d_points) { - cudaFree(d_points); - d_points = 0; - } + // Cleanup + if (d_points) { + cudaFree(d_points); + d_points = 0; + } - if (d_results) { - cudaFree(d_results); - d_results = 0; - } + if (d_results) { + cudaFree(d_results); + d_results = 0; + } - return value; + return value; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/test.cpp b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/test.cpp index e37f39ed..9d4f2252 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/test.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/src/test.cpp @@ -27,83 +27,84 @@ #include "../inc/test.h" -#include -#include -#include -#include -#include #include -#include -#include -#include #include +#include +#include +#include #include +#include +#include +#include +#include +#include #include "../inc/piestimator.h" -template <typename Real> -bool Test<Real>::operator()() { - using std::endl; - using std::setw; - using std::stringstream; +template <typename Real> bool Test<Real>::operator()() +{ + using std::endl; + using std::setw; + using std::stringstream; - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // Get device properties - struct cudaDeviceProp deviceProperties; - cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); + // Get device properties + struct cudaDeviceProp deviceProperties; + cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); - if (cudaResult != cudaSuccess) { - std::string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + std::string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Evaluate on GPU - printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); - PiEstimator<Real> estimator(numSims, device, threadBlockSize); - sdkStartTimer(&timer); - Real result = estimator(); - sdkStopTimer(&timer); - elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; + // Evaluate on GPU + printf("Estimating Pi on GPU (%s)\n\n", deviceProperties.name); + PiEstimator<Real> estimator(numSims, device, threadBlockSize); + sdkStartTimer(&timer); + Real result = estimator(); + sdkStopTimer(&timer); + elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; - // Tolerance to compare result with expected - // This is just to check that nothing has gone very wrong with the - // test, the actual accuracy of the result depends on the number of - // Monte Carlo trials - const Real tolerance = static_cast<Real>(0.01); + // Tolerance to compare result with expected + // This is just to check that nothing has gone very wrong with the + // test, the actual accuracy of the result depends on the number of + // Monte Carlo trials + const Real tolerance = static_cast<Real>(0.01); - // Display results - Real abserror = fabs(result - static_cast<Real>(PI)); - Real relerror = abserror / static_cast<Real>(PI); - printf("Precision: %s\n", - (typeid(Real) == typeid(double)) ?
"double" : "single"); - printf("Number of sims: %d\n", numSims); - printf("Tolerance: %e\n", tolerance); - printf("GPU result: %e\n", result); - printf("Expected: %e\n", PI); - printf("Absolute error: %e\n", abserror); - printf("Relative error: %e\n\n", relerror); + // Display results + Real abserror = fabs(result - static_cast(PI)); + Real relerror = abserror / static_cast(PI); + printf("Precision: %s\n", (typeid(Real) == typeid(double)) ? "double" : "single"); + printf("Number of sims: %d\n", numSims); + printf("Tolerance: %e\n", tolerance); + printf("GPU result: %e\n", result); + printf("Expected: %e\n", PI); + printf("Absolute error: %e\n", abserror); + printf("Relative error: %e\n\n", relerror); - // Check result - if (relerror > tolerance) { - printf("computed result (%e) does not match expected result (%e).\n", - result, PI); - pass = false; - } else { - pass = true; - } + // Check result + if (relerror > tolerance) { + printf("computed result (%e) does not match expected result (%e).\n", result, PI); + pass = false; + } + else { + pass = true; + } - // Print results - printf( - "MonteCarloEstimatePiQ, Performance = %.2f sims/s, Time = %.2f(ms), " - "NumDevsUsed = %u, Blocksize = %u\n", - numSims / elapsedTime, elapsedTime * 1000.0f, 1, threadBlockSize); + // Print results + printf("MonteCarloEstimatePiQ, Performance = %.2f sims/s, Time = %.2f(ms), " + "NumDevsUsed = %u, Blocksize = %u\n", + numSims / elapsedTime, + elapsedTime * 1000.0f, + 1, + threadBlockSize); - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); - return pass; + return pass; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/asianoption.h b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/asianoption.h index af91b675..304e59c7 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/asianoption.h +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/asianoption.h @@ -28,24 +28,24 @@ #ifndef ASIANOPTION_H #define ASIANOPTION_H -template -struct AsianOption { - enum CallPut { Call, Put }; +template struct AsianOption +{ + enum CallPut { Call, Put }; - // Parameters - Real spot; - Real strike; - Real r; - Real sigma; - Real tenor; - Real dt; + // Parameters + Real spot; + Real strike; + Real r; + Real sigma; + Real tenor; + Real dt; - // Value - Real golden; - Real value; + // Value + Real golden; + Real value; - // Option type - CallPut type; + // Option type + CallPut type; }; #endif diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/cudasharedmem.h b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/cudasharedmem.h index 5c9bab98..4627d72c 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/cudasharedmem.h +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/cudasharedmem.h @@ -65,25 +65,27 @@ // This is the un-specialized struct. Note that we prevent instantiation of // this struct by making it abstract (i.e. with pure virtual methods). 
-template <typename T> -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - virtual __device__ T &operator*() = 0; - virtual __device__ T &operator[](int i) = 0; +template <typename T> struct SharedMemory +{ + // Ensure that we won't compile any un-specialized types + virtual __device__ T &operator*() = 0; + virtual __device__ T &operator[](int i) = 0; }; -#define BUILD_SHAREDMEMORY_TYPE(t, n) \ - template <> \ - struct SharedMemory<t> { \ - __device__ t &operator*() { \ - extern __shared__ t n[]; \ - return *n; \ - } \ - __device__ t &operator[](int i) { \ - extern __shared__ t n[]; \ - return n[i]; \ - } \ - } +#define BUILD_SHAREDMEMORY_TYPE(t, n) \ + template <> struct SharedMemory<t> \ + { \ + __device__ t &operator*() \ + { \ + extern __shared__ t n[]; \ + return *n; \ + } \ + __device__ t &operator[](int i) \ + { \ + extern __shared__ t n[]; \ + return n[i]; \ + } \ + } BUILD_SHAREDMEMORY_TYPE(int, s_int); BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint); diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/pricingengine.h b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/pricingengine.h index b68d71a6..e8422fdf 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/pricingengine.h +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/pricingengine.h @@ -30,18 +30,17 @@ #include "asianoption.h" -template <typename Real> -class PricingEngine { - public: - PricingEngine(unsigned int numSims, unsigned int device, - unsigned int threadBlockSize, unsigned int seed); - void operator()(AsianOption<Real> &option); +template <typename Real> class PricingEngine +{ +public: + PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed); + void operator()(AsianOption<Real> &option); - private: - unsigned int m_seed; - unsigned int m_numSims; - unsigned int m_device; - unsigned int m_threadBlockSize; +private: + unsigned int m_seed; + unsigned int m_numSims; + unsigned int m_device; + unsigned int m_threadBlockSize; }; #endif diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/test.h b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/test.h index e89120cd..ff689b33 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/test.h +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/inc/test.h @@ -28,30 +28,31 @@ #ifndef TEST_H #define TEST_H -template <typename Real> -struct Test { - Test() : pass(false){}; +template <typename Real> struct Test +{ + Test() + : pass(false) {}; - int device; - unsigned int numSims; - unsigned int threadBlockSize; - unsigned int seed; + int device; + unsigned int numSims; + unsigned int threadBlockSize; + unsigned int seed; - bool pass; - double elapsedTime; + bool pass; + double elapsedTime; - bool operator()(); + bool operator()(); }; // Defaults are arbitrary to give sensible runtime -#define k_sims_min 100000 -#define k_sims_max 1000000 -#define k_sims_def 100000 -#define k_sims_qa 100000 +#define k_sims_min 100000 +#define k_sims_max 1000000 +#define k_sims_def 100000 +#define k_sims_qa 100000 #define k_bsize_min 32 #define k_bsize_def 128 -#define k_bsize_qa 128 -#define k_seed_def 1234 -#define k_seed_qa 1234 +#define k_bsize_qa 128 +#define k_seed_def 1234 +#define k_seed_qa 1234 #endif diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/main.cpp b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/main.cpp index f7003f44..bb26f7ba 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/main.cpp +++
b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/main.cpp @@ -41,209 +41,214 @@ /////////////////////////////////////////////////////////////////////////////// -#include -#include -#include #include -#include #include +#include +#include +#include +#include #include "../inc/test.h" // Forward declarations -void showHelp(const int argc, const char **argv); -template -void runTest(int argc, const char **argv); +void showHelp(const int argc, const char **argv); +template void runTest(int argc, const char **argv); -int main(int argc, char **argv) { - using std::invalid_argument; - using std::string; +int main(int argc, char **argv) +{ + using std::invalid_argument; + using std::string; - // Open the log file - printf("Monte Carlo Single Asian Option (with PRNG)\n"); - printf("===========================================\n\n"); + // Open the log file + printf("Monte Carlo Single Asian Option (with PRNG)\n"); + printf("===========================================\n\n"); - // If help flag is set, display help and exit immediately - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Displaying help on console\n"); - showHelp(argc, (const char **)argv); - exit(EXIT_SUCCESS); - } - - // Check the precision (checked against the device capability later) - try { - char *value; - - if (getCmdLineArgumentString(argc, (const char **)argv, "precision", - &value)) { - // Check requested precision is valid - string prec(value); - - if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { - runTest(argc, (const char **)argv); - } else if (prec.compare("double") == 0 || - prec.compare("\"double\"") == 0) { - runTest(argc, (const char **)argv); - } else { - printf( - "specified precision (%s) is invalid, must be \"single\" or " - "\"double\".\n", - value); - throw invalid_argument("precision"); - } - } else { - runTest(argc, (const char **)argv); + // If help flag is set, display help and exit immediately + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Displaying help on console\n"); + showHelp(argc, (const char **)argv); + exit(EXIT_SUCCESS); } - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } - // Finish - exit(EXIT_SUCCESS); + // Check the precision (checked against the device capability later) + try { + char *value; + + if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value)) { + // Check requested precision is valid + string prec(value); + + if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0) { + runTest(argc, (const char **)argv); + } + else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0) { + runTest(argc, (const char **)argv); + } + else { + printf("specified precision (%s) is invalid, must be \"single\" or " + "\"double\".\n", + value); + throw invalid_argument("precision"); + } + } + else { + runTest(argc, (const char **)argv); + } + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + + // Finish + exit(EXIT_SUCCESS); } -template -void runTest(int argc, const char **argv) { - using std::invalid_argument; - using std::runtime_error; +template void runTest(int argc, const char **argv) +{ + using std::invalid_argument; + using std::runtime_error; - try { - Test test; - int deviceCount = 0; - cudaError_t cudaResult = cudaSuccess; + try { + Test test; + int deviceCount = 0; + cudaError_t cudaResult = cudaSuccess; - // by default specify GPU Device == 
0 - test.device = 0; + // by default specify GPU Device == 0 + test.device = 0; - // Get number of available devices - cudaResult = cudaGetDeviceCount(&deviceCount); - - if (cudaResult != cudaSuccess) { - printf("could not get device count.\n"); - throw runtime_error("cudaGetDeviceCount"); - } - - // (default parameters) - test.numSims = k_sims_qa; - test.threadBlockSize = k_bsize_qa; - test.seed = k_seed_qa; - - { - char *value = 0; - - if (getCmdLineArgumentString(argc, argv, "device", &value)) { - test.device = (int)atoi(value); - - if (test.device >= deviceCount) { - printf( - "invalid target device specified on command line (device %d does " - "not exist).\n", - test.device); - throw invalid_argument("device"); - } - } else { - test.device = gpuGetMaxGflopsDeviceId(); - } - - if (getCmdLineArgumentString(argc, argv, "sims", &value)) { - test.numSims = (unsigned int)atoi(value); - - if (test.numSims < k_sims_min || test.numSims > k_sims_max) { - printf( - "specified number of simulations (%d) is invalid, must be " - "between %d and %d.\n", - test.numSims, k_sims_min, k_sims_max); - throw invalid_argument("sims"); - } - } else { - test.numSims = k_sims_def; - } - - if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { - // Determine max threads per block - cudaDeviceProp deviceProperties; - cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + // Get number of available devices + cudaResult = cudaGetDeviceCount(&deviceCount); if (cudaResult != cudaSuccess) { - printf("cound not get device properties for device %d.\n", - test.device); - throw runtime_error("cudaGetDeviceProperties"); + printf("could not get device count.\n"); + throw runtime_error("cudaGetDeviceCount"); } - // Check requested size is valid - test.threadBlockSize = (unsigned int)atoi(value); + // (default parameters) + test.numSims = k_sims_qa; + test.threadBlockSize = k_bsize_qa; + test.seed = k_seed_qa; - if (test.threadBlockSize < k_bsize_min || - test.threadBlockSize > static_cast( - deviceProperties.maxThreadsPerBlock)) { - printf( - "specified block size (%d) is invalid, must be between %d and %d " - "for device %d.\n", - test.threadBlockSize, k_bsize_min, - deviceProperties.maxThreadsPerBlock, test.device); - throw invalid_argument("block-size"); + { + char *value = 0; + + if (getCmdLineArgumentString(argc, argv, "device", &value)) { + test.device = (int)atoi(value); + + if (test.device >= deviceCount) { + printf("invalid target device specified on command line (device %d does " + "not exist).\n", + test.device); + throw invalid_argument("device"); + } + } + else { + test.device = gpuGetMaxGflopsDeviceId(); + } + + if (getCmdLineArgumentString(argc, argv, "sims", &value)) { + test.numSims = (unsigned int)atoi(value); + + if (test.numSims < k_sims_min || test.numSims > k_sims_max) { + printf("specified number of simulations (%d) is invalid, must be " + "between %d and %d.\n", + test.numSims, + k_sims_min, + k_sims_max); + throw invalid_argument("sims"); + } + } + else { + test.numSims = k_sims_def; + } + + if (getCmdLineArgumentString(argc, argv, "block-size", &value)) { + // Determine max threads per block + cudaDeviceProp deviceProperties; + cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device); + + if (cudaResult != cudaSuccess) { + printf("cound not get device properties for device %d.\n", test.device); + throw runtime_error("cudaGetDeviceProperties"); + } + + // Check requested size is valid + test.threadBlockSize = (unsigned int)atoi(value); + + if 
(test.threadBlockSize < k_bsize_min + || test.threadBlockSize > static_cast(deviceProperties.maxThreadsPerBlock)) { + printf("specified block size (%d) is invalid, must be between %d and %d " + "for device %d.\n", + test.threadBlockSize, + k_bsize_min, + deviceProperties.maxThreadsPerBlock, + test.device); + throw invalid_argument("block-size"); + } + + if (test.threadBlockSize & test.threadBlockSize - 1) { + printf("specified block size (%d) is invalid, must be a power of two " + "(see reduction function).\n", + test.threadBlockSize); + throw invalid_argument("block-size"); + } + } + else { + test.threadBlockSize = k_bsize_def; + } + + if (getCmdLineArgumentString(argc, argv, "seed", &value)) { + // Check requested seed is valid + test.seed = (unsigned int)atoi(value); + + if (test.seed == 0) { + printf("specified seed (%d) is invalid, must be non-zero.\n", test.seed); + throw invalid_argument("seed"); + } + } + else { + test.seed = k_seed_def; + } } - - if (test.threadBlockSize & test.threadBlockSize - 1) { - printf( - "specified block size (%d) is invalid, must be a power of two " - "(see reduction function).\n", - test.threadBlockSize); - throw invalid_argument("block-size"); - } - } else { - test.threadBlockSize = k_bsize_def; - } - - if (getCmdLineArgumentString(argc, argv, "seed", &value)) { - // Check requested seed is valid - test.seed = (unsigned int)atoi(value); - - if (test.seed == 0) { - printf("specified seed (%d) is invalid, must be non-zero.\n", - test.seed); - throw invalid_argument("seed"); - } - } else { - test.seed = k_seed_def; - } + // Execute + test(); + } + catch (invalid_argument &e) { + printf("invalid command line argument (%s)\n", e.what()); + exit(EXIT_FAILURE); + } + catch (runtime_error &e) { + printf("runtime error (%s)\n", e.what()); + exit(EXIT_FAILURE); } - // Execute - test(); - } catch (invalid_argument &e) { - printf("invalid command line argument (%s)\n", e.what()); - exit(EXIT_FAILURE); - } catch (runtime_error &e) { - printf("runtime error (%s)\n", e.what()); - exit(EXIT_FAILURE); - } } -void showHelp(int argc, const char **argv) { - using std::cout; - using std::endl; - using std::left; - using std::setw; +void showHelp(int argc, const char **argv) +{ + using std::cout; + using std::endl; + using std::left; + using std::setw; - if (argc > 0) { - cout << endl << argv[0] << endl; - } + if (argc > 0) { + cout << endl << argv[0] << endl; + } - cout << endl << "Syntax:" << endl; - cout << left; - cout << " " << setw(20) << "--device=" - << "Specify device to use for execution" << endl; - cout << " " << setw(20) << "--sims=" - << "Specify number of Monte Carlo simulations" << endl; - cout << " " << setw(20) << "--block-size=" - << "Specify number of threads per block" << endl; - cout << " " << setw(20) << "--seed=" - << "Specify the seed to use for the random number generator" << endl; - cout << " " << setw(20) << "--precision=
<single|double>" - << "Specify the precision (\"single\" or \"double\")" << endl; - cout << endl; - cout << " " << setw(20) << "--noprompt" - << "Skip prompt before exit" << endl; - cout << endl; + cout << endl << "Syntax:" << endl; + cout << left; + cout << " " << setw(20) << "--device=" + << "Specify device to use for execution" << endl; + cout << " " << setw(20) << "--sims=" + << "Specify number of Monte Carlo simulations" << endl; + cout << " " << setw(20) << "--block-size=" + << "Specify number of threads per block" << endl; + cout << " " << setw(20) << "--seed=" + << "Specify the seed to use for the random number generator" << endl; + cout << " " << setw(20) << "--precision=<single|double>
" + << "Specify the precision (\"single\" or \"double\")" << endl; + cout << endl; + cout << " " << setw(20) << "--noprompt" + << "Skip prompt before exit" << endl; + cout << endl; } diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/pricingengine.cu b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/pricingengine.cu index 17331d67..60069ca0 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/pricingengine.cu +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/pricingengine.cu @@ -25,15 +25,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "../inc/pricingengine.h" - -#include -#include +#include +#include #include #include +#include #include -#include -#include +#include + +#include "../inc/pricingengine.h" namespace cg = cooperative_groups; #include @@ -45,328 +45,323 @@ using std::string; using std::vector; // RNG init kernel -__global__ void initRNG(curandState *const rngStates, const unsigned int seed) { - // Determine thread ID - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void initRNG(curandState *const rngStates, const unsigned int seed) +{ + // Determine thread ID + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - // Initialise the RNG - curand_init(seed, tid, 0, &rngStates[tid]); + // Initialise the RNG + curand_init(seed, tid, 0, &rngStates[tid]); } -__device__ inline float getPathStep(float &drift, float &diffusion, - curandState &state) { - return expf(drift + diffusion * curand_normal(&state)); +__device__ inline float getPathStep(float &drift, float &diffusion, curandState &state) +{ + return expf(drift + diffusion * curand_normal(&state)); } -__device__ inline double getPathStep(double &drift, double &diffusion, - curandState &state) { - return exp(drift + diffusion * curand_normal_double(&state)); +__device__ inline double getPathStep(double &drift, double &diffusion, curandState &state) +{ + return exp(drift + diffusion * curand_normal_double(&state)); } // Path generation kernel template -__global__ void generatePaths(Real *const paths, curandState *const rngStates, +__global__ void generatePaths(Real *const paths, + curandState *const rngStates, const AsianOption *const option, - const unsigned int numSims, - const unsigned int numTimesteps) { - // Determine thread ID - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; + const unsigned int numSims, + const unsigned int numTimesteps) +{ + // Determine thread ID + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - // Compute parameters - Real drift = - (option->r - static_cast(0.5) * option->sigma * option->sigma) * - option->dt; - Real diffusion = option->sigma * sqrt(option->dt); + // Compute parameters + Real drift = (option->r - static_cast(0.5) * option->sigma * option->sigma) * option->dt; + Real diffusion = option->sigma * sqrt(option->dt); - // Initialise the RNG - curandState localState = rngStates[tid]; + // Initialise the RNG + curandState localState = rngStates[tid]; - for (unsigned int i = tid; i < numSims; i += step) { - // Shift the output pointer - Real *output = paths + i; + for (unsigned int i = tid; i < numSims; i += step) { + // Shift the output pointer + Real *output = paths + i; - // Simulate the path - Real s = static_cast(1); + // Simulate the path + Real s = static_cast(1); - for (unsigned int t = 0; t < numTimesteps; t++, output += numSims) { - s 
*= getPathStep(drift, diffusion, localState); - *output = s; + for (unsigned int t = 0; t < numTimesteps; t++, output += numSims) { + s *= getPathStep(drift, diffusion, localState); + *output = s; + } } - } } -template -__device__ Real reduce_sum(Real in, cg::thread_block cta) { - SharedMemory sdata; +template __device__ Real reduce_sum(Real in, cg::thread_block cta) +{ + SharedMemory sdata; - // Perform first level of reduction: - // - Write to shared memory - unsigned int ltid = threadIdx.x; + // Perform first level of reduction: + // - Write to shared memory + unsigned int ltid = threadIdx.x; - sdata[ltid] = in; - cg::sync(cta); + sdata[ltid] = in; + cg::sync(cta); - // Do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (ltid < s) { - sdata[ltid] += sdata[ltid + s]; + // Do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (ltid < s) { + sdata[ltid] += sdata[ltid + s]; + } + + cg::sync(cta); } - cg::sync(cta); - } - - return sdata[0]; + return sdata[0]; } // Valuation kernel template -__global__ void computeValue(Real *const values, const Real *const paths, +__global__ void computeValue(Real *const values, + const Real *const paths, const AsianOption *const option, - const unsigned int numSims, - const unsigned int numTimesteps) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Determine thread ID - unsigned int bid = blockIdx.x; - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int step = gridDim.x * blockDim.x; + const unsigned int numSims, + const unsigned int numTimesteps) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Determine thread ID + unsigned int bid = blockIdx.x; + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int step = gridDim.x * blockDim.x; - Real sumPayoffs = static_cast(0); + Real sumPayoffs = static_cast(0); - for (unsigned int i = tid; i < numSims; i += step) { - // Shift the input pointer - const Real *path = paths + i; - // Compute the arithmetic average - Real avg = static_cast(0); + for (unsigned int i = tid; i < numSims; i += step) { + // Shift the input pointer + const Real *path = paths + i; + // Compute the arithmetic average + Real avg = static_cast(0); - for (unsigned int t = 0; t < numTimesteps; t++, path += numSims) { - avg += *path; + for (unsigned int t = 0; t < numTimesteps; t++, path += numSims) { + avg += *path; + } + + avg = avg * option->spot / numTimesteps; + // Compute the payoff + Real payoff = avg - option->strike; + + if (option->type == AsianOption::Put) { + payoff = -payoff; + } + + payoff = max(static_cast(0), payoff); + // Accumulate payoff locally + sumPayoffs += payoff; } - avg = avg * option->spot / numTimesteps; - // Compute the payoff - Real payoff = avg - option->strike; + // Reduce within the block + sumPayoffs = reduce_sum(sumPayoffs, cta); - if (option->type == AsianOption::Put) { - payoff = -payoff; + // Store the result + if (threadIdx.x == 0) { + values[bid] = sumPayoffs; } - - payoff = max(static_cast(0), payoff); - // Accumulate payoff locally - sumPayoffs += payoff; - } - - // Reduce within the block - sumPayoffs = reduce_sum(sumPayoffs, cta); - - // Store the result - if (threadIdx.x == 0) { - values[bid] = sumPayoffs; - } } template -PricingEngine::PricingEngine(unsigned int numSims, unsigned int device, +PricingEngine::PricingEngine(unsigned int numSims, + unsigned int device, unsigned int threadBlockSize, unsigned int 
seed) - : m_numSims(numSims), - m_device(device), - m_threadBlockSize(threadBlockSize), - m_seed(seed) {} + : m_numSims(numSims) + , m_device(device) + , m_threadBlockSize(threadBlockSize) + , m_seed(seed) +{ +} -template -void PricingEngine::operator()(AsianOption &option) { - cudaError_t cudaResult = cudaSuccess; - struct cudaDeviceProp deviceProperties; - struct cudaFuncAttributes funcAttributes; +template void PricingEngine::operator()(AsianOption &option) +{ + cudaError_t cudaResult = cudaSuccess; + struct cudaDeviceProp deviceProperties; + struct cudaFuncAttributes funcAttributes; - // Get device properties - cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); + // Get device properties + cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device); - if (cudaResult != cudaSuccess) { - string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Check precision is valid - unsigned int deviceVersion = - deviceProperties.major * 10 + deviceProperties.minor; + // Check precision is valid + unsigned int deviceVersion = deviceProperties.major * 10 + deviceProperties.minor; - if (typeid(Real) == typeid(double) && deviceVersion < 13) { - throw std::runtime_error("Device does not have double precision support"); - } + if (typeid(Real) == typeid(double) && deviceVersion < 13) { + throw std::runtime_error("Device does not have double precision support"); + } - // Attach to GPU - cudaResult = cudaSetDevice(m_device); + // Attach to GPU + cudaResult = cudaSetDevice(m_device); - if (cudaResult != cudaSuccess) { - string msg("Could not set CUDA device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not set CUDA device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Determine how to divide the work between cores - dim3 block; - dim3 grid; - block.x = m_threadBlockSize; - grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; + // Determine how to divide the work between cores + dim3 block; + dim3 grid; + block.x = m_threadBlockSize; + grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize; - // Aim to launch around ten or more times as many blocks as there - // are multiprocessors on the target device. - unsigned int blocksPerSM = 10; - unsigned int numSMs = deviceProperties.multiProcessorCount; + // Aim to launch around ten or more times as many blocks as there + // are multiprocessors on the target device. 
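The block-count heuristic described above is self-contained enough to restate on its own. A standalone sketch of the same logic follows; computeGridSize is an illustrative wrapper, not a function in the sample.

    // Ceil-divide the simulations over blocks, then halve the grid until at
    // most about 2 * blocksPerSM blocks per multiprocessor remain; each thread
    // then strides over several simulations inside the kernels.
    unsigned int computeGridSize(unsigned int numSims, unsigned int blockSize,
                                 unsigned int numSMs, unsigned int blocksPerSM = 10)
    {
        unsigned int gridX = (numSims + blockSize - 1) / blockSize;
        while (gridX > 2 * blocksPerSM * numSMs) {
            gridX >>= 1;
        }
        return gridX;
    }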
+ unsigned int blocksPerSM = 10; + unsigned int numSMs = deviceProperties.multiProcessorCount; - while (grid.x > 2 * blocksPerSM * numSMs) { - grid.x >>= 1; - } + while (grid.x > 2 * blocksPerSM * numSMs) { + grid.x >>= 1; + } - // Get initRNG function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG); + // Get initRNG function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for initRNG kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for initRNG kernel"); + } - // Get generatePaths function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths); + // Get generatePaths function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for generatePaths kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for generatePaths kernel"); + } - // Get computeValue function properties and check the maximum block size - cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); + // Get computeValue function properties and check the maximum block size + cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue); - if (cudaResult != cudaSuccess) { - string msg("Could not get function attributes: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not get function attributes: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { - throw std::runtime_error( - "Block X dimension is too large for computeValue kernel"); - } + if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock) { + throw std::runtime_error("Block X dimension is too large for computeValue kernel"); + } - // Setup problem on GPU - AsianOption *d_option = 0; - cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption)); + // Setup problem on GPU + AsianOption *d_option = 0; + cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for option data: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for option data: "); + 
msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption), - cudaMemcpyHostToDevice); + cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption), cudaMemcpyHostToDevice); - if (cudaResult != cudaSuccess) { - string msg("Could not copy data to device: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not copy data to device: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Allocate memory for paths - Real *d_paths = 0; - int numTimesteps = static_cast(option.tenor / option.dt); - cudaResult = - cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real)); + // Allocate memory for paths + Real *d_paths = 0; + int numTimesteps = static_cast(option.tenor / option.dt); + cudaResult = cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for paths: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for paths: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Allocate memory for RNG states - curandState *d_rngStates = 0; - cudaResult = - cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState)); + // Allocate memory for RNG states + curandState *d_rngStates = 0; + cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for RNG state: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for RNG state: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Allocate memory for result - Real *d_values = 0; - cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real)); + // Allocate memory for result + Real *d_values = 0; + cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate memory on device for partial results: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate memory on device for partial results: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Initialise RNG - initRNG<<>>(d_rngStates, m_seed); + // Initialise RNG + initRNG<<>>(d_rngStates, m_seed); - // Generate paths - generatePaths<<>>(d_paths, d_rngStates, d_option, - m_numSims, numTimesteps); + // Generate paths + generatePaths<<>>(d_paths, d_rngStates, d_option, m_numSims, numTimesteps); - // Compute value - computeValue<<>>( - d_values, d_paths, d_option, m_numSims, numTimesteps); + // Compute value + computeValue<<>>(d_values, d_paths, d_option, m_numSims, numTimesteps); - // Copy partial results back - vector values(grid.x); - cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real), - cudaMemcpyDeviceToHost); + // Copy partial results back + vector values(grid.x); + cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real), cudaMemcpyDeviceToHost); - if (cudaResult != cudaSuccess) { - string msg("Could not copy partial results to host: "); - msg += 
cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not copy partial results to host: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // Complete sum-reduction on host - option.value = - std::accumulate(values.begin(), values.end(), static_cast(0)); + // Complete sum-reduction on host + option.value = std::accumulate(values.begin(), values.end(), static_cast(0)); - // Compute the mean - option.value /= m_numSims; + // Compute the mean + option.value /= m_numSims; - // Discount to present value - option.value *= exp(-option.r * option.tenor); + // Discount to present value + option.value *= exp(-option.r * option.tenor); - // Cleanup - if (d_option) { - cudaFree(d_option); - d_option = 0; - } + // Cleanup + if (d_option) { + cudaFree(d_option); + d_option = 0; + } - if (d_paths) { - cudaFree(d_paths); - d_paths = 0; - } + if (d_paths) { + cudaFree(d_paths); + d_paths = 0; + } - if (d_rngStates) { - cudaFree(d_rngStates); - d_rngStates = 0; - } + if (d_rngStates) { + cudaFree(d_rngStates); + d_rngStates = 0; + } - if (d_values) { - cudaFree(d_values); - d_values = 0; - } + if (d_values) { + cudaFree(d_values); + d_values = 0; + } } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/test.cpp b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/test.cpp index 5f328167..9f955fb8 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/test.cpp +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/src/test.cpp @@ -27,111 +27,110 @@ #include "../inc/test.h" -#include -#include -#include -#include -#include #include -#include -#include #include -#include - #include +#include +#include +#include +#include +#include +#include +#include +#include #include "../inc/asianoption.h" #include "../inc/pricingengine.h" -template -bool Test::operator()() { - using std::endl; - using std::setw; - using std::stringstream; +template bool Test::operator()() +{ + using std::endl; + using std::setw; + using std::stringstream; - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); - // Get device properties - struct cudaDeviceProp deviceProperties; - cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); + // Get device properties + struct cudaDeviceProp deviceProperties; + cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device); - if (cudaResult != cudaSuccess) { - std::string msg("Could not get device properties: "); - msg += cudaGetErrorString(cudaResult); - throw std::runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + std::string msg("Could not get device properties: "); + msg += cudaGetErrorString(cudaResult); + throw std::runtime_error(msg); + } - // This test prices a single Asian call option with European - // exercise, with the priced averaged arithmetically on discrete - // trading days (weekdays). 
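The golden value set below can be sanity-checked against what the engine computes: each path contributes the payoff max(average spot - strike, 0), and the price is the mean payoff discounted back over the tenor. A host-side sketch under that reading follows; discountedAsianCall and its inputs are illustrative, since the sample performs this work in the generatePaths and computeValue kernels.

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Mean of per-path payoffs max(avg_i - strike, 0), discounted by exp(-r * T).
    // Assumes a non-empty vector of per-path arithmetic averages.
    double discountedAsianCall(const std::vector<double> &pathAverages,
                               double strike, double r, double tenor)
    {
        double sum = 0.0;
        for (double avg : pathAverages)
            sum += std::max(avg - strike, 0.0);
        return (sum / pathAverages.size()) * std::exp(-r * tenor);
    }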
- AsianOption option; - option.spot = static_cast(40); - option.strike = static_cast(35); - option.r = static_cast(0.03); - option.sigma = static_cast(0.20); - option.tenor = static_cast(1.0 / 3.0); - option.dt = static_cast(1.0 / 261); - option.type = AsianOption::Call; - option.value = static_cast(0.0); - option.golden = static_cast(5.162534); + // This test prices a single Asian call option with European + // exercise, with the priced averaged arithmetically on discrete + // trading days (weekdays). + AsianOption option; + option.spot = static_cast(40); + option.strike = static_cast(35); + option.r = static_cast(0.03); + option.sigma = static_cast(0.20); + option.tenor = static_cast(1.0 / 3.0); + option.dt = static_cast(1.0 / 261); + option.type = AsianOption::Call; + option.value = static_cast(0.0); + option.golden = static_cast(5.162534); - // Evaluate on GPU - printf("Pricing option on GPU (%s)\n\n", deviceProperties.name); - PricingEngine pricer(numSims, device, threadBlockSize, seed); - sdkStartTimer(&timer); - pricer(option); - sdkStopTimer(&timer); - elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; + // Evaluate on GPU + printf("Pricing option on GPU (%s)\n\n", deviceProperties.name); + PricingEngine pricer(numSims, device, threadBlockSize, seed); + sdkStartTimer(&timer); + pricer(option); + sdkStopTimer(&timer); + elapsedTime = sdkGetAverageTimerValue(&timer) / 1000.0f; - // Tolerance to compare result with expected - // This is just to check that nothing has gone very wrong with the - // test, the actual accuracy of the result depends on the number of - // Monte Carlo trials - const Real tolerance = static_cast(0.1); + // Tolerance to compare result with expected + // This is just to check that nothing has gone very wrong with the + // test, the actual accuracy of the result depends on the number of + // Monte Carlo trials + const Real tolerance = static_cast(0.1); - // Display results - stringstream output; - output << "Precision: " - << ((typeid(Real) == typeid(double)) ? "double" : "single") << endl; - output << "Number of sims: " << numSims << endl; - output << endl; - output << " Spot | Strike | r | sigma | tenor | " - "Call/Put | Value | Expected |" - << endl; - output << "-----------|------------|------------|------------|------------|--" - "----------|------------|------------|" - << endl; - output << setw(10) << option.spot << " | "; - output << setw(10) << option.strike << " | "; - output << setw(10) << option.r << " | "; - output << setw(10) << option.sigma << " | "; - output << setw(10) << option.tenor << " | "; - output << setw(10) - << (option.type == AsianOption::Call ? "Call" : "Put") << " | "; - output << setw(10) << option.value << " | "; - output << setw(10) << option.golden << " |"; + // Display results + stringstream output; + output << "Precision: " << ((typeid(Real) == typeid(double)) ? "double" : "single") << endl; + output << "Number of sims: " << numSims << endl; + output << endl; + output << " Spot | Strike | r | sigma | tenor | " + "Call/Put | Value | Expected |" + << endl; + output << "-----------|------------|------------|------------|------------|--" + "----------|------------|------------|" + << endl; + output << setw(10) << option.spot << " | "; + output << setw(10) << option.strike << " | "; + output << setw(10) << option.r << " | "; + output << setw(10) << option.sigma << " | "; + output << setw(10) << option.tenor << " | "; + output << setw(10) << (option.type == AsianOption::Call ? 
"Call" : "Put") << " | "; + output << setw(10) << option.value << " | "; + output << setw(10) << option.golden << " |"; - printf("%s\n\n", output.str().c_str()); + printf("%s\n\n", output.str().c_str()); - // Check result - if (fabs(option.value - option.golden) > tolerance) { - printf("computed result (%e) does not match expected result (%e).\n", - option.value, option.golden); - pass = false; - } else { - pass = true; - } + // Check result + if (fabs(option.value - option.golden) > tolerance) { + printf("computed result (%e) does not match expected result (%e).\n", option.value, option.golden); + pass = false; + } + else { + pass = true; + } - // Print results - printf( - "MonteCarloSingleAsianOptionP, Performance = %.2f sims/s, Time = " - "%.2f(ms), NumDevsUsed = %u, Blocksize = %u\n", - numSims / elapsedTime, elapsedTime * 1000.0f, 1, threadBlockSize); + // Print results + printf("MonteCarloSingleAsianOptionP, Performance = %.2f sims/s, Time = " + "%.2f(ms), NumDevsUsed = %u, Blocksize = %u\n", + numSims / elapsedTime, + elapsedTime * 1000.0f, + 1, + threadBlockSize); - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); - return pass; + return pass; } // Explicit template instantiation diff --git a/Samples/2_Concepts_and_Techniques/README.md b/Samples/2_Concepts_and_Techniques/README.md index b407d80b..b333bfdb 100644 --- a/Samples/2_Concepts_and_Techniques/README.md +++ b/Samples/2_Concepts_and_Techniques/README.md @@ -83,7 +83,7 @@ This example demonstrates an efficient CUDA implementation of parallel prefix su This sample demonstrates an approach to the image segmentation trees construction. This method is based on Boruvka's MST algorithm. ### [shfl_scan](./shfl_scan) -This example demonstrates how to use the shuffle intrinsic __shfl_up_sync to perform a scan operation across a thread block. +This example demonstrates how to use the shuffle intrinsic __shfl_up_sync to perform a scan operation across a thread block. ### [sortingNetworks](./sortingNetworks) This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/indexen.htm @@ -102,4 +102,3 @@ This sample shows how to perform a reduction operation on an array of values usi ### [threadMigration](./threadMigration) Simple program illustrating how to the CUDA Context Management API and uses the new CUDA 4.0 parameter passing and CUDA launch API. CUDA contexts can be created separately and attached independently to different threads. - diff --git a/Samples/2_Concepts_and_Techniques/boxFilter/README.md b/Samples/2_Concepts_and_Techniques/boxFilter/README.md index 98ee4901..70822e25 100644 --- a/Samples/2_Concepts_and_Techniques/boxFilter/README.md +++ b/Samples/2_Concepts_and_Techniques/boxFilter/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter.cpp b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter.cpp index f86ac2e8..9c4be106 100644 --- a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter.cpp +++ b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter.cpp @@ -54,15 +54,15 @@ #endif // CUDA utilities and system includes -#include #include +#include // Helper functions -#include // CUDA SDK Helper functions -#include // CUDA device initialization helper functions +#include // CUDA device initialization helper functions +#include // CUDA SDK Helper functions #define MAX_EPSILON_ERROR 5.0f -#define REFRESH_DELAY 10 // ms +#define REFRESH_DELAY 10 // ms const static char *sSDKsample = "CUDA Iterative Box Filter"; @@ -71,544 +71,559 @@ const char *sOriginal[] = {"teapot1024_14.ppm", "teapot1024_22.ppm", NULL}; const char *sReference[] = {"ref_14.ppm", "ref_22.ppm", NULL}; -const char *image_filename = "teapot1024.ppm"; -int iterations = 1; -int filter_radius = 14; -int nthreads = 64; -unsigned int width, height; -unsigned int *h_img = NULL; +const char *image_filename = "teapot1024.ppm"; +int iterations = 1; +int filter_radius = 14; +int nthreads = 64; +unsigned int width, height; +unsigned int *h_img = NULL; unsigned int *d_temp = NULL; -GLuint pbo; // OpenGL pixel buffer object -struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange -GLuint texid; // Texture -GLuint shader; +GLuint pbo; // OpenGL pixel buffer object +struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange +GLuint texid; // Texture +GLuint shader; StopWatchInterface *timer = NULL, *kernel_timer = NULL; // Auto-Verification Code -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 8; // FPS limit for sampling -int g_Index = 0; -int g_nFilterSign = 1; -float avgFPS = 0.0f; -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; -bool g_bInteractive = false; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 8; // FPS limit for sampling +int g_Index = 0; +int g_nFilterSign = 1; +float avgFPS = 0.0f; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; +bool g_bInteractive = false; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; -extern "C" int runSingleTest(char *ref_file, char *exec_path); -extern "C" int runBenchmark(); +extern "C" int runSingleTest(char *ref_file, char *exec_path); +extern "C" int runBenchmark(); extern "C" void loadImageData(int argc, char **argv); extern "C" void computeGold(float *id, float *od, int w, int h, int n); // These are CUDA functions to handle allocation and launching the kernels -extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA); -extern "C" void freeTextures(); -extern "C" double boxFilter(float *d_temp, float *d_dest, - int width, int height, int radius, int iterations, - int nthreads, StopWatchInterface *timer); +extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA); +extern "C" void freeTextures(); +extern "C" double boxFilter(float *d_temp, + float *d_dest, + int width, + int height, + int radius, + int iterations, + int nthreads, + StopWatchInterface *timer); -extern "C" double boxFilterRGBA(unsigned int *d_temp, - unsigned int *d_dest, int width, int height, - int radius, int iterations, int nthreads, +extern "C" double boxFilterRGBA(unsigned int *d_temp, + unsigned int *d_dest, + int width, + int height, + int radius, + int iterations, + int nthreads, StopWatchInterface 
*timer); // This varies the filter radius, so we can see automatic animation -void varySigma() { - filter_radius += g_nFilterSign; +void varySigma() +{ + filter_radius += g_nFilterSign; - if (filter_radius > 64) { - filter_radius = 64; // clamp to 64 and then negate sign - g_nFilterSign = -1; - } else if (filter_radius < 0) { - filter_radius = 0; - g_nFilterSign = 1; - } + if (filter_radius > 64) { + filter_radius = 64; // clamp to 64 and then negate sign + g_nFilterSign = -1; + } + else if (filter_radius < 0) { + filter_radius = 0; + g_nFilterSign = 1; + } } // Calculate the Frames per second and print in the title bar -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - avgFPS = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f); - fpsCount = 0; - fpsLimit = (int)MAX(avgFPS, 1.0f); - sdkResetTimer(&timer); - } + if (fpsCount == fpsLimit) { + avgFPS = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f); + fpsCount = 0; + fpsLimit = (int)MAX(avgFPS, 1.0f); + sdkResetTimer(&timer); + } - char fps[256]; - sprintf(fps, - "CUDA Rolling Box Filter (radius=%d, passes=%d): " - "%3.1f fps", - (!g_bInteractive ? "ON" : "OFF"), filter_radius, iterations, avgFPS); - glutSetWindowTitle(fps); + char fps[256]; + sprintf(fps, + "CUDA Rolling Box Filter (radius=%d, passes=%d): " + "%3.1f fps", + (!g_bInteractive ? "ON" : "OFF"), + filter_radius, + iterations, + avgFPS); + glutSetWindowTitle(fps); - if (!g_bInteractive) { - varySigma(); - } + if (!g_bInteractive) { + varySigma(); + } } // display results using OpenGL -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // execute filter, writing results to pbo - unsigned int *d_result; + // execute filter, writing results to pbo + unsigned int *d_result; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_result, &num_bytes, cuda_pbo_resource)); - boxFilterRGBA(d_temp, d_result, width, height, filter_radius, - iterations, nthreads, kernel_timer); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_pbo_resource)); + boxFilterRGBA(d_temp, d_result, width, height, filter_radius, iterations, nthreads, kernel_timer); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - // OpenGL display code path - { - glClear(GL_COLOR_BUFFER_BIT); - - // load texture from pbo - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBindTexture(GL_TEXTURE_2D, texid); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, - GL_UNSIGNED_BYTE, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - - // fragment program is required to display floating point texture - glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); - glEnable(GL_FRAGMENT_PROGRAM_ARB); - glDisable(GL_DEPTH_TEST); - - glBegin(GL_QUADS); + // OpenGL display code path { - glTexCoord2f(0.0f, 0.0f); - glVertex2f(0.0f, 0.0f); - glTexCoord2f(1.0f, 0.0f); - glVertex2f(1.0f, 0.0f); - glTexCoord2f(1.0f, 1.0f); - glVertex2f(1.0f, 1.0f); - glTexCoord2f(0.0f, 1.0f); - glVertex2f(0.0f, 1.0f); + glClear(GL_COLOR_BUFFER_BIT); + + // load texture from pbo + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, texid); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 
width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + + // fragment program is required to display floating point texture + glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); + glEnable(GL_FRAGMENT_PROGRAM_ARB); + glDisable(GL_DEPTH_TEST); + + glBegin(GL_QUADS); + { + glTexCoord2f(0.0f, 0.0f); + glVertex2f(0.0f, 0.0f); + glTexCoord2f(1.0f, 0.0f); + glVertex2f(1.0f, 0.0f); + glTexCoord2f(1.0f, 1.0f); + glVertex2f(1.0f, 1.0f); + glTexCoord2f(0.0f, 1.0f); + glVertex2f(0.0f, 1.0f); + } + glEnd(); + glBindTexture(GL_TEXTURE_2D, 0); + glDisable(GL_FRAGMENT_PROGRAM_ARB); } - glEnd(); - glBindTexture(GL_TEXTURE_2D, 0); - glDisable(GL_FRAGMENT_PROGRAM_ARB); - } - glutSwapBuffers(); - glutReportErrors(); + glutSwapBuffers(); + glutReportErrors(); - sdkStopTimer(&timer); + sdkStopTimer(&timer); - computeFPS(); + computeFPS(); } // Keyboard callback function for OpenGL (GLUT) -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case 'a': case 'A': - g_bInteractive = !g_bInteractive; - printf("> Animation is %s\n", !g_bInteractive ? "ON" : "OFF"); - break; + g_bInteractive = !g_bInteractive; + printf("> Animation is %s\n", !g_bInteractive ? "ON" : "OFF"); + break; case '=': case '+': - if (filter_radius < (int)width - 1 && filter_radius < (int)height - 1) { - filter_radius++; - } + if (filter_radius < (int)width - 1 && filter_radius < (int)height - 1) { + filter_radius++; + } - break; + break; case '-': - if (filter_radius > 1) { - filter_radius--; - } + if (filter_radius > 1) { + filter_radius--; + } - break; + break; case ']': - iterations++; - break; + iterations++; + break; case '[': - if (iterations > 1) { - iterations--; - } + if (iterations > 1) { + iterations--; + } - break; + break; default: - break; - } + break; + } - printf("radius = %d, iterations = %d\n", filter_radius, iterations); + printf("radius = %d, iterations = %d\n", filter_radius, iterations); } // Timer Event so we can refresh the display -void timerEvent(int value) { - if (glutGetWindow()) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } } // Resizing the window -void reshape(int x, int y) { - glViewport(0, 0, x, y); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } -void initCuda(bool useRGBA) { - checkCudaErrors(cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int)))); +void initCuda(bool useRGBA) +{ + checkCudaErrors(cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int)))); - // Refer to boxFilter_kernel.cu for implementation - initTexture(width, height, h_img, useRGBA); + // Refer to boxFilter_kernel.cu for implementation + initTexture(width, height, h_img, useRGBA); - sdkCreateTimer(&timer); - sdkCreateTimer(&kernel_timer); + sdkCreateTimer(&timer); + sdkCreateTimer(&kernel_timer); } -void cleanup() { 
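The display() path above is the canonical CUDA/OpenGL interop round trip: map the registered pixel buffer object, fetch its device pointer, launch the CUDA work into it, then unmap so GL can source the PBO for the screen texture. The sketch below restates that bracket in isolation, assuming a resource registered with cudaGraphicsGLRegisterBuffer as in initGLResources(); the kernel fillKernel and the wrapper runKernelToPbo are illustrative names, not part of the sample.

    #include <cuda_runtime.h>
    #include <helper_cuda.h> // for checkCudaErrors()

    // Illustrative kernel: writes one RGBA pixel per thread (low byte = red).
    __global__ void fillKernel(unsigned int *out, int w, int h)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < w && y < h) {
            out[y * w + x] = 0xff000000u | (unsigned int)(x & 0xff); // opaque, red ramp
        }
    }

    // Map -> get pointer -> launch -> unmap; GL must not touch the PBO while mapped.
    void runKernelToPbo(cudaGraphicsResource_t cuda_pbo_resource, int w, int h)
    {
        unsigned int *d_out = NULL;
        size_t num_bytes = 0;

        checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
        checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_out, &num_bytes, cuda_pbo_resource));

        dim3 block(16, 16);
        dim3 grid((w + block.x - 1) / block.x, (h + block.y - 1) / block.y);
        fillKernel<<<grid, block>>>(d_out, w, h);

        // Unmap returns ownership to OpenGL so glTexSubImage2D() can read the PBO.
        checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    }

display() performs exactly this bracket around boxFilterRGBA(); keeping the mapped interval short avoids stalling the GL pipeline, and registering with cudaGraphicsMapFlagsWriteDiscard tells the driver the previous PBO contents need not be preserved.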
- sdkDeleteTimer(&timer); - sdkDeleteTimer(&kernel_timer); +void cleanup() +{ + sdkDeleteTimer(&timer); + sdkDeleteTimer(&kernel_timer); - if (h_img) { - free(h_img); - h_img = NULL; - } + if (h_img) { + free(h_img); + h_img = NULL; + } - if (d_temp) { - cudaFree(d_temp); - d_temp = NULL; - } + if (d_temp) { + cudaFree(d_temp); + d_temp = NULL; + } - // Refer to boxFilter_kernel.cu for implementation - freeTextures(); + // Refer to boxFilter_kernel.cu for implementation + freeTextures(); - cudaGraphicsUnregisterResource(cuda_pbo_resource); + cudaGraphicsUnregisterResource(cuda_pbo_resource); - glDeleteBuffers(1, &pbo); - glDeleteTextures(1, &texid); - glDeleteProgramsARB(1, &shader); + glDeleteBuffers(1, &pbo); + glDeleteTextures(1, &texid); + glDeleteProgramsARB(1, &shader); } // shader for displaying floating-point texture -static const char *shader_code = - "!!ARBfp1.0\n" - "TEX result.color, fragment.texcoord, texture[0], 2D; \n" - "END"; +static const char *shader_code = "!!ARBfp1.0\n" + "TEX result.color, fragment.texcoord, texture[0], 2D; \n" + "END"; -GLuint compileASMShader(GLenum program_type, const char *code) { - GLuint program_id; - glGenProgramsARB(1, &program_id); - glBindProgramARB(program_type, program_id); - glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, - (GLsizei)strlen(code), (GLubyte *)code); +GLuint compileASMShader(GLenum program_type, const char *code) +{ + GLuint program_id; + glGenProgramsARB(1, &program_id); + glBindProgramARB(program_type, program_id); + glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); - GLint error_pos; - glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); + GLint error_pos; + glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); - if (error_pos != -1) { - const GLubyte *error_string; - error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); - printf("Program error at position: %d\n%s\n", (int)error_pos, error_string); - return 0; - } + if (error_pos != -1) { + const GLubyte *error_string; + error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); + printf("Program error at position: %d\n%s\n", (int)error_pos, error_string); + return 0; + } - return program_id; + return program_id; } // This is where we create the OpenGL PBOs, FBOs, and texture resources -void initGLResources() { - // create pixel buffer object - glGenBuffers(1, &pbo); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, - h_img, GL_STREAM_DRAW_ARB); +void initGLResources() +{ + // create pixel buffer object + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, h_img, GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); - // create texture for display - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); + // create texture for display + glGenTextures(1, &texid); + 
glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); - // load shader program - shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); + // load shader program + shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); } -void initGL(int *argc, char **argv) { - // initialize GLUT - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(768, 768); - glutCreateWindow("CUDA Rolling Box Filter"); - glutDisplayFunc(display); +void initGL(int *argc, char **argv) +{ + // initialize GLUT + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(768, 768); + glutCreateWindow("CUDA Rolling Box Filter"); + glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - printf("Error: failed to get minimal extensions for demo\n"); - printf("This sample requires:\n"); - printf(" OpenGL version 2.0\n"); - printf(" GL_ARB_vertex_buffer_object\n"); - printf(" GL_ARB_pixel_buffer_object\n"); - exit(EXIT_FAILURE); - } + if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + printf("Error: failed to get minimal extensions for demo\n"); + printf("This sample requires:\n"); + printf(" OpenGL version 2.0\n"); + printf(" GL_ARB_vertex_buffer_object\n"); + printf(" GL_ARB_pixel_buffer_object\n"); + exit(EXIT_FAILURE); + } } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// -int runBenchmark() { - printf("[runBenchmark]: [%s]\n", sSDKsample); +int runBenchmark() +{ + printf("[runBenchmark]: [%s]\n", sSDKsample); - initCuda(true); + initCuda(true); - unsigned int *d_result; - checkCudaErrors( - cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int))); + unsigned int *d_result; + checkCudaErrors(cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int))); - // warm-up - boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations, - nthreads, kernel_timer); - checkCudaErrors(cudaDeviceSynchronize()); + // warm-up + boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations, nthreads, kernel_timer); + checkCudaErrors(cudaDeviceSynchronize()); - sdkStartTimer(&kernel_timer); - // Start round-trip timer and process iCycles loops on the GPU - iterations = 1; // standard 1-pass filtering - const int iCycles = 150; - double dProcessingTime = 0.0; - printf("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); + sdkStartTimer(&kernel_timer); + // Start round-trip timer and process iCycles loops on the GPU + iterations = 1; // standard 1-pass filtering + const int iCycles = 150; + double dProcessingTime = 0.0; + printf("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); - for (int i = 0; i < iCycles; i++) { - dProcessingTime += - boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, - iterations, nthreads, kernel_timer); - } + for (int i = 0; i < iCycles; i++) { + dProcessingTime += + boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations, nthreads, kernel_timer); + } - // check if kernel execution generated an error and sync host - getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&kernel_timer); + // check if kernel execution generated an error and sync host + getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&kernel_timer); - // Get average computation time - dProcessingTime /= (double)iCycles; + // Get average computation time + dProcessingTime /= (double)iCycles; - // log testname, throughput, timing and config info to sample and master logs - printf( - "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, " - "Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * width * height) / dProcessingTime, dProcessingTime, - (width * height), 1, nthreads); - printf("\n"); + // log testname, throughput, timing and config info to sample and master logs + printf("boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, " + "Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * width * height) / dProcessingTime, + dProcessingTime, + (width * height), + 1, + nthreads); + printf("\n"); - return 0; + return 0; } // This test specifies a single test (where you specify radius and/or // iterations) -int runSingleTest(char *ref_file, char *exec_path) { - int nTotalErrors = 0; - char dump_file[256]; +int runSingleTest(char *ref_file, char *exec_path) +{ + int nTotalErrors = 0; + char dump_file[256]; - printf("[runSingleTest]: [%s]\n", sSDKsample); + printf("[runSingleTest]: [%s]\n", sSDKsample); - initCuda(true); + initCuda(true); - unsigned int *d_result; - unsigned int *h_result = - (unsigned int *)malloc(width * height * sizeof(unsigned int)); - checkCudaErrors( - cudaMalloc((void 
**)&d_result, width * height * sizeof(unsigned int))); + unsigned int *d_result; + unsigned int *h_result = (unsigned int *)malloc(width * height * sizeof(unsigned int)); + checkCudaErrors(cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int))); - // run the sample radius - { - printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, - iterations); - boxFilterRGBA(d_temp, d_result, width, height, filter_radius, - iterations, nthreads, kernel_timer); + // run the sample radius + { + printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations); + boxFilterRGBA(d_temp, d_result, width, height, filter_radius, iterations, nthreads, kernel_timer); - // check if kernel execution generated an error - getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED"); - checkCudaErrors(cudaDeviceSynchronize()); + // check if kernel execution generated an error + getLastCudaError("Error: boxFilterRGBA Kernel execution FAILED"); + checkCudaErrors(cudaDeviceSynchronize()); - // readback the results to system memory - cudaMemcpy((unsigned char *)h_result, (unsigned char *)d_result, - width * height * sizeof(unsigned int), cudaMemcpyDeviceToHost); + // readback the results to system memory + cudaMemcpy((unsigned char *)h_result, + (unsigned char *)d_result, + width * height * sizeof(unsigned int), + cudaMemcpyDeviceToHost); - sprintf(dump_file, "teapot1024_%02d.ppm", filter_radius); + sprintf(dump_file, "teapot1024_%02d.ppm", filter_radius); - sdkSavePPM4ub((const char *)dump_file, (unsigned char *)h_result, width, - height); + sdkSavePPM4ub((const char *)dump_file, (unsigned char *)h_result, width, height); - if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), - MAX_EPSILON_ERROR, 0.15f, false)) { - printf("Image is Different "); - nTotalErrors++; - } else { - printf("Image is Matching "); + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false)) { + printf("Image is Different "); + nTotalErrors++; + } + else { + printf("Image is Matching "); + } + + printf(" <%s>\n", ref_file); + } + printf("\n"); + + free(h_result); + checkCudaErrors(cudaFree(d_result)); + + return nTotalErrors; +} + +void loadImageData(int argc, char **argv) +{ + // load image (needed so we can get the width and height before we create the + // window) + char *image_path = NULL; + + if (argc >= 1) { + image_path = sdkFindFilePath(image_filename, argv[0]); } - printf(" <%s>\n", ref_file); - } - printf("\n"); + if (image_path == 0) { + printf("Error finding image file '%s'\n", image_filename); + exit(EXIT_FAILURE); + } - free(h_result); - checkCudaErrors(cudaFree(d_result)); + sdkLoadPPM4(image_path, (unsigned char **)&h_img, &width, &height); - return nTotalErrors; + if (!h_img) { + printf("Error opening file '%s'\n", image_path); + exit(EXIT_FAILURE); + } + + printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); } -void loadImageData(int argc, char **argv) { - // load image (needed so we can get the width and height before we create the - // window - char *image_path = NULL; - - if (argc >= 1) { - image_path = sdkFindFilePath(image_filename, argv[0]); - } - - if (image_path == 0) { - printf("Error finding image file '%s'\n", image_filename); - exit(EXIT_FAILURE); - } - - sdkLoadPPM4(image_path, (unsigned char **)&h_img, &width, &height); - - if (!h_img) { - printf("Error opening file '%s'\n", image_path); - exit(EXIT_FAILURE); - } - - printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); -} - -void
printHelp() { - printf("boxFilter usage\n"); - printf(" -threads=n (specify the # of of threads to use)\n"); - printf(" -radius=n (specify the filter radius n to use)\n"); - printf(" -passes=n (specify the number of passes n to use)\n"); - printf(" -file=name (specify reference file for comparison)\n"); +void printHelp() +{ + printf("boxFilter usage\n"); + printf(" -threads=n (specify the # of threads to use)\n"); + printf(" -radius=n (specify the filter radius n to use)\n"); + printf(" -passes=n (specify the number of passes n to use)\n"); + printf(" -file=name (specify reference file for comparison)\n"); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - int devID = 0; - char *ref_file = NULL; +int main(int argc, char **argv) +{ + int devID = 0; + char *ref_file = NULL; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - pArgc = &argc; - pArgv = argv; + pArgc = &argc; + pArgv = argv; - // start logs - printf("%s Starting...\n\n", argv[0]); + // start logs + printf("%s Starting...\n\n", argv[0]); - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printHelp(); - exit(EXIT_SUCCESS); - } - - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { - nthreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(); + exit(EXIT_SUCCESS); } - if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { - filter_radius = - getCmdLineArgumentInt(argc, (const char **)argv, "radius"); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { + nthreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { + filter_radius = getCmdLineArgumentInt(argc, (const char **)argv, "radius"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { + iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); + } } - if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { - iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); + // load image to process + loadImageData(argc, argv); + devID = findCudaDevice(argc, (const char **)argv); + + if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { + // This is a separate mode of the sample, where we benchmark the kernels + // for performance + // Running CUDA kernels (boxfilter) in Benchmarking mode + g_TotalErrors += runBenchmark(); + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } - - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", - (char **)&ref_file); + else if (checkCmdLineFlag(argc, (const char **)argv, "radius") + || checkCmdLineFlag(argc, (const char **)argv, "passes")) { + // This overrides the default mode. Users can specify the radius used by + // the filter kernel + g_TotalErrors += runSingleTest(ref_file, argv[0]); + exit(g_TotalErrors == 0 ?
EXIT_SUCCESS : EXIT_FAILURE); } - } + else { + // Default mode: run the OpenGL visualization; in automatic mode the + // animation varies the filter radius on its own + printf("\n"); - // load image to process - loadImageData(argc, argv); - devID = findCudaDevice(argc, (const char **)argv); + initGL(&argc, argv); - if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { - // This is a separate mode of the sample, where we are benchmark the kernels - // for performance - // Running CUDA kernels (boxfilter) in Benchmarking mode - g_TotalErrors += runBenchmark(); - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || - checkCmdLineFlag(argc, (const char **)argv, "passes")) { - // This overrides the default mode. Users can specify the radius used by - // the filter kernel - g_TotalErrors += runSingleTest(ref_file, argv[0]); - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - } else { - // Default mode running with OpenGL visualization and in automatic mode - // the output automatically changes animation - printf("\n"); - - initGL(&argc, argv); - - initCuda(true); - initGLResources(); + initCuda(true); + initGLResources(); // sets the callback function so it will call cleanup upon exit #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - printf("Running Standard Demonstration with GLUT loop...\n\n"); - printf( - "Press '+' and '-' to change filter width\n" - "Press ']' and '[' to change number of iterations\n" - "Press 'a' or 'A' to change animation ON/OFF\n\n"); + printf("Running Standard Demonstration with GLUT loop...\n\n"); + printf("Press '+' and '-' to change filter width\n" + "Press ']' and '[' to change number of iterations\n" + "Press 'a' or 'A' to change animation ON/OFF\n\n"); - // Main OpenGL loop that will run visualization for every vsync - glutMainLoop(); - } } diff --git a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_cpu.cpp b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_cpu.cpp index 765cc849..783fc846 100644 --- a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_cpu.cpp +++ b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_cpu.cpp @@ -30,82 +30,84 @@ extern "C" void computeGold(float *id, float *od, int w, int h, int r); // CPU implementation -void hboxfilter_x(float *id, float *od, int w, int h, int r) { - float scale = 1.0f / (2 * r + 1); +void hboxfilter_x(float *id, float *od, int w, int h, int r) +{ + float scale = 1.0f / (2 * r + 1); - for (int y = 0; y < h; y++) { - float t; - // do left edge - t = id[y * w] * r; + for (int y = 0; y < h; y++) { + float t; + // do left edge + t = id[y * w] * r; - for (int x = 0; x < r + 1; x++) { - t += id[y * w + x]; + for (int x = 0; x < r + 1; x++) { + t += id[y * w + x]; + } + + od[y * w] = t * scale; + + for (int x = 1; x < r + 1; x++) { + int c = y * w + x; + t += id[c + r]; + t -= id[y * w]; + od[c] = t * scale; + } + + // main loop + for (int x = r + 1; x < w - r; x++) { + int c = y * w + x; + t += id[c + r]; + t -= id[c - r - 1]; + od[c] = t * scale; + } + + // do right edge + for (int x = w - r; x < w; x++) { + int c = y * w + x; + t += id[(y * w) + w - 1]; + t -= id[c - r - 1]; + od[c] = t * scale; + } } - - od[y * w] = t * scale; - - for (int x = 1; x < r + 1; x++) { - int c = y * w + x; - t += id[c + r]; - t -= id[y * w]; - od[c]
= t * scale; - } - - // main loop - for (int x = r + 1; x < w - r; x++) { - int c = y * w + x; - t += id[c + r]; - t -= id[c - r - 1]; - od[c] = t * scale; - } - - // do right edge - for (int x = w - r; x < w; x++) { - int c = y * w + x; - t += id[(y * w) + w - 1]; - t -= id[c - r - 1]; - od[c] = t * scale; - } - } } -void hboxfilter_y(float *id, float *od, int w, int h, int r) { - float scale = 1.0f / (2 * r + 1); +void hboxfilter_y(float *id, float *od, int w, int h, int r) +{ + float scale = 1.0f / (2 * r + 1); - for (int x = 0; x < w; x++) { - float t; - // do left edge - t = id[x] * r; + for (int x = 0; x < w; x++) { + float t; + // do left edge + t = id[x] * r; - for (int y = 0; y < r + 1; y++) { - t += id[y * w + x]; + for (int y = 0; y < r + 1; y++) { + t += id[y * w + x]; + } + + od[x] = t * scale; + + for (int y = 1; y < r + 1; y++) { + int c = y * w + x; + t += id[c + r * w]; + t -= id[x]; + od[c] = t * scale; + } + + // main loop + for (int y = r + 1; y < h - r; y++) { + int c = y * w + x; + t += id[c + r * w]; + t -= id[c - (r * w) - w]; + od[c] = t * scale; + } + + // do right edge + for (int y = h - r; y < h; y++) { + int c = y * w + x; + t += id[(h - 1) * w + x]; + t -= id[c - (r * w) - w]; + od[c] = t * scale; + } } - - od[x] = t * scale; - - for (int y = 1; y < r + 1; y++) { - int c = y * w + x; - t += id[c + r * w]; - t -= id[x]; - od[c] = t * scale; - } - - // main loop - for (int y = r + 1; y < h - r; y++) { - int c = y * w + x; - t += id[c + r * w]; - t -= id[c - (r * w) - w]; - od[c] = t * scale; - } - - // do right edge - for (int y = h - r; y < h; y++) { - int c = y * w + x; - t += id[(h - 1) * w + x]; - t -= id[c - (r * w) - w]; - od[c] = t * scale; - } - } } //////////////////////////////////////////////////////////////////////////////// @@ -117,7 +119,8 @@ void hboxfilter_y(float *id, float *od, int w, int h, int r) { //! 
@param r radius of filter //////////////////////////////////////////////////////////////////////////////// -void computeGold(float *image, float *temp, int w, int h, int r) { - hboxfilter_x(image, temp, w, h, r); - hboxfilter_y(temp, image, w, h, r); +void computeGold(float *image, float *temp, int w, int h, int r) +{ + hboxfilter_x(image, temp, w, h, r); + hboxfilter_y(temp, image, w, h, r); } diff --git a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_kernel.cu b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_kernel.cu index b368590f..5f021f15 100644 --- a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_kernel.cu +++ b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_kernel.cu @@ -28,14 +28,14 @@ #ifndef _BOXFILTER_KERNEL_CH_ #define _BOXFILTER_KERNEL_CH_ -#include <helper_math.h> #include <helper_functions.h> +#include <helper_math.h> cudaTextureObject_t tex; cudaTextureObject_t texTempArray; cudaTextureObject_t rgbaTex; cudaTextureObject_t rgbaTexTempArray; -cudaArray *d_array, *d_tempArray; +cudaArray *d_array, *d_tempArray; //////////////////////////////////////////////////////////////////////////////// // These are CUDA Helper functions @@ -44,12 +44,12 @@ cudaArray *d_array, *d_tempArray; // call returns an error #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) -inline void __checkCudaErrors(cudaError err, const char *file, const int line) { - if (cudaSuccess != err) { - fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, - (int)err, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } +inline void __checkCudaErrors(cudaError err, const char *file, const int line) +{ + if (cudaSuccess != err) { + fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } } /* @@ -90,309 +90,309 @@ inline void __checkCudaErrors(cudaError err, const char *file, const int line) { */ // process row -__device__ void d_boxfilter_x(float *id, float *od, int w, int h, int r) { - float scale = 1.0f / (float)((r << 1) + 1); +__device__ void d_boxfilter_x(float *id, float *od, int w, int h, int r) +{ + float scale = 1.0f / (float)((r << 1) + 1); - float t; - // do left edge - t = id[0] * r; + float t; + // do left edge + t = id[0] * r; - for (int x = 0; x < (r + 1); x++) { - t += id[x]; - } + for (int x = 0; x < (r + 1); x++) { + t += id[x]; + } - od[0] = t * scale; + od[0] = t * scale; - for (int x = 1; x < (r + 1); x++) { - t += id[x + r]; - t -= id[0]; - od[x] = t * scale; - } + for (int x = 1; x < (r + 1); x++) { + t += id[x + r]; + t -= id[0]; + od[x] = t * scale; + } - // main loop - for (int x = (r + 1); x < w - r; x++) { - t += id[x + r]; - t -= id[x - r - 1]; - od[x] = t * scale; - } + // main loop + for (int x = (r + 1); x < w - r; x++) { + t += id[x + r]; + t -= id[x - r - 1]; + od[x] = t * scale; + } - // do right edge - for (int x = w - r; x < w; x++) { - t += id[w - 1]; - t -= id[x - r - 1]; - od[x] = t * scale; - } + // do right edge + for (int x = w - r; x < w; x++) { + t += id[w - 1]; + t -= id[x - r - 1]; + od[x] = t * scale; + } } // process column -__device__ void d_boxfilter_y(float *id, float *od, int w, int h, int r) { - float scale = 1.0f / (float)((r << 1) + 1); +__device__ void d_boxfilter_y(float *id, float *od, int w, int h, int r) +{ + float scale = 1.0f / (float)((r << 1) + 1); - float t; - // do left edge - t = id[0] * r; + float t; + // do left edge + t = id[0] * r; - for (int y = 0; y < (r + 1); y++) { - t += id[y * w]; - } + for (int y = 0; y < (r + 1); y++) { + t += id[y * w]; + } -
od[0] = t * scale; + od[0] = t * scale; - for (int y = 1; y < (r + 1); y++) { - t += id[(y + r) * w]; - t -= id[0]; - od[y * w] = t * scale; - } + for (int y = 1; y < (r + 1); y++) { + t += id[(y + r) * w]; + t -= id[0]; + od[y * w] = t * scale; + } - // main loop - for (int y = (r + 1); y < (h - r); y++) { - t += id[(y + r) * w]; - t -= id[((y - r) * w) - w]; - od[y * w] = t * scale; - } + // main loop + for (int y = (r + 1); y < (h - r); y++) { + t += id[(y + r) * w]; + t -= id[((y - r) * w) - w]; + od[y * w] = t * scale; + } - // do right edge - for (int y = h - r; y < h; y++) { - t += id[(h - 1) * w]; - t -= id[((y - r) * w) - w]; - od[y * w] = t * scale; - } + // do right edge + for (int y = h - r; y < h; y++) { + t += id[(h - 1) * w]; + t -= id[((y - r) * w) - w]; + od[y * w] = t * scale; + } } -__global__ void d_boxfilter_x_global(float *id, float *od, int w, int h, - int r) { - unsigned int y = blockIdx.x * blockDim.x + threadIdx.x; - d_boxfilter_x(&id[y * w], &od[y * w], w, h, r); +__global__ void d_boxfilter_x_global(float *id, float *od, int w, int h, int r) +{ + unsigned int y = blockIdx.x * blockDim.x + threadIdx.x; + d_boxfilter_x(&id[y * w], &od[y * w], w, h, r); } -__global__ void d_boxfilter_y_global(float *id, float *od, int w, int h, - int r) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - d_boxfilter_y(&id[x], &od[x], w, h, r); +__global__ void d_boxfilter_y_global(float *id, float *od, int w, int h, int r) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + d_boxfilter_y(&id[x], &od[x], w, h, r); } // texture version // texture fetches automatically clamp to edge of image -__global__ void d_boxfilter_x_tex(float *od, int w, int h, int r, - cudaTextureObject_t tex) { - float scale = 1.0f / (float)((r << 1) + 1); - unsigned int y = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void d_boxfilter_x_tex(float *od, int w, int h, int r, cudaTextureObject_t tex) +{ + float scale = 1.0f / (float)((r << 1) + 1); + unsigned int y = blockIdx.x * blockDim.x + threadIdx.x; - float t = 0.0f; + float t = 0.0f; - for (int x = -r; x <= r; x++) { - t += tex2D(tex, x, y); - } + for (int x = -r; x <= r; x++) { + t += tex2D(tex, x, y); + } - od[y * w] = t * scale; + od[y * w] = t * scale; - for (int x = 1; x < w; x++) { - t += tex2D(tex, x + r, y); - t -= tex2D(tex, x - r - 1, y); - od[y * w + x] = t * scale; - } + for (int x = 1; x < w; x++) { + t += tex2D(tex, x + r, y); + t -= tex2D(tex, x - r - 1, y); + od[y * w + x] = t * scale; + } } -__global__ void d_boxfilter_y_tex(float *od, int w, int h, int r, - cudaTextureObject_t tex) { - float scale = 1.0f / (float)((r << 1) + 1); - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void d_boxfilter_y_tex(float *od, int w, int h, int r, cudaTextureObject_t tex) +{ + float scale = 1.0f / (float)((r << 1) + 1); + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - float t = 0.0f; + float t = 0.0f; - for (int y = -r; y <= r; y++) { - t += tex2D(tex, x, y); - } + for (int y = -r; y <= r; y++) { + t += tex2D(tex, x, y); + } - od[x] = t * scale; + od[x] = t * scale; - for (int y = 1; y < h; y++) { - t += tex2D(tex, x, y + r); - t -= tex2D(tex, x, y - r - 1); - od[y * w + x] = t * scale; - } + for (int y = 1; y < h; y++) { + t += tex2D(tex, x, y + r); + t -= tex2D(tex, x, y - r - 1); + od[y * w + x] = t * scale; + } } // RGBA version // reads from 32-bit unsigned int array holding 8-bit RGBA // convert floating point rgba color to 32-bit integer -__device__ unsigned int rgbaFloatToInt(float4 rgba) 
{ - rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] - rgba.y = __saturatef(rgba.y); - rgba.z = __saturatef(rgba.z); - rgba.w = __saturatef(rgba.w); - return ((unsigned int)(rgba.w * 255.0f) << 24) | - ((unsigned int)(rgba.z * 255.0f) << 16) | - ((unsigned int)(rgba.y * 255.0f) << 8) | - ((unsigned int)(rgba.x * 255.0f)); +__device__ unsigned int rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return ((unsigned int)(rgba.w * 255.0f) << 24) | ((unsigned int)(rgba.z * 255.0f) << 16) + | ((unsigned int)(rgba.y * 255.0f) << 8) | ((unsigned int)(rgba.x * 255.0f)); } -__device__ float4 rgbaIntToFloat(unsigned int c) { - float4 rgba; - rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; - rgba.y = ((c >> 8) & 0xff) * 0.003921568627f; // /255.0f; - rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f; - rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f; - return rgba; +__device__ float4 rgbaIntToFloat(unsigned int c) +{ + float4 rgba; + rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; + rgba.y = ((c >> 8) & 0xff) * 0.003921568627f; // /255.0f; + rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f; + rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f; + return rgba; } // row pass using texture lookups -__global__ void d_boxfilter_rgba_x(unsigned int *od, int w, int h, int r, - cudaTextureObject_t rgbaTex) { - float scale = 1.0f / (float)((r << 1) + 1); - unsigned int y = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void d_boxfilter_rgba_x(unsigned int *od, int w, int h, int r, cudaTextureObject_t rgbaTex) +{ + float scale = 1.0f / (float)((r << 1) + 1); + unsigned int y = blockIdx.x * blockDim.x + threadIdx.x; - // as long as address is always less than height, we do work - if (y < h) { - float4 t = make_float4(0.0f); + // as long as address is always less than height, we do work + if (y < h) { + float4 t = make_float4(0.0f); - for (int x = -r; x <= r; x++) { - t += tex2D(rgbaTex, x, y); + for (int x = -r; x <= r; x++) { + t += tex2D(rgbaTex, x, y); + } + + od[y * w] = rgbaFloatToInt(t * scale); + + for (int x = 1; x < w; x++) { + t += tex2D(rgbaTex, x + r, y); + t -= tex2D(rgbaTex, x - r - 1, y); + od[y * w + x] = rgbaFloatToInt(t * scale); + } } - - od[y * w] = rgbaFloatToInt(t * scale); - - for (int x = 1; x < w; x++) { - t += tex2D(rgbaTex, x + r, y); - t -= tex2D(rgbaTex, x - r - 1, y); - od[y * w + x] = rgbaFloatToInt(t * scale); - } - } } // column pass using coalesced global memory reads -__global__ void d_boxfilter_rgba_y(unsigned int *id, unsigned int *od, int w, - int h, int r) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - id = &id[x]; - od = &od[x]; +__global__ void d_boxfilter_rgba_y(unsigned int *id, unsigned int *od, int w, int h, int r) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + id = &id[x]; + od = &od[x]; - float scale = 1.0f / (float)((r << 1) + 1); + float scale = 1.0f / (float)((r << 1) + 1); - float4 t; - // do left edge - t = rgbaIntToFloat(id[0]) * r; + float4 t; + // do left edge + t = rgbaIntToFloat(id[0]) * r; - for (int y = 0; y < (r + 1); y++) { - t += rgbaIntToFloat(id[y * w]); - } + for (int y = 0; y < (r + 1); y++) { + t += rgbaIntToFloat(id[y * w]); + } - od[0] = rgbaFloatToInt(t * scale); + od[0] = rgbaFloatToInt(t * scale); - for (int y = 1; y < (r + 1); y++) { - t += rgbaIntToFloat(id[(y + r) * w]); - t -= rgbaIntToFloat(id[0]); - od[y * w] = rgbaFloatToInt(t 
* scale); - } + for (int y = 1; y < (r + 1); y++) { + t += rgbaIntToFloat(id[(y + r) * w]); + t -= rgbaIntToFloat(id[0]); + od[y * w] = rgbaFloatToInt(t * scale); + } - // main loop - for (int y = (r + 1); y < (h - r); y++) { - t += rgbaIntToFloat(id[(y + r) * w]); - t -= rgbaIntToFloat(id[((y - r) * w) - w]); - od[y * w] = rgbaFloatToInt(t * scale); - } + // main loop + for (int y = (r + 1); y < (h - r); y++) { + t += rgbaIntToFloat(id[(y + r) * w]); + t -= rgbaIntToFloat(id[((y - r) * w) - w]); + od[y * w] = rgbaFloatToInt(t * scale); + } - // do right edge - for (int y = h - r; y < h; y++) { - t += rgbaIntToFloat(id[(h - 1) * w]); - t -= rgbaIntToFloat(id[((y - r) * w) - w]); - od[y * w] = rgbaFloatToInt(t * scale); - } + // do right edge + for (int y = h - r; y < h; y++) { + t += rgbaIntToFloat(id[(h - 1) * w]); + t -= rgbaIntToFloat(id[((y - r) * w) - w]); + od[y * w] = rgbaFloatToInt(t * scale); + } } -extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA) { - // copy image data to array - cudaChannelFormatDesc channelDesc; - if (useRGBA) { - channelDesc = - cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); - } else { - channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); - } - checkCudaErrors(cudaMallocArray(&d_array, &channelDesc, width, height)); +extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA) +{ + // copy image data to array + cudaChannelFormatDesc channelDesc; + if (useRGBA) { + channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); + } + else { + channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + } + checkCudaErrors(cudaMallocArray(&d_array, &channelDesc, width, height)); - size_t bytesPerElem = (useRGBA ? sizeof(uchar4) : sizeof(float)); - checkCudaErrors(cudaMemcpy2DToArray( - d_array, 0, 0, pImage, width * bytesPerElem, width * bytesPerElem, height, - cudaMemcpyHostToDevice)); + size_t bytesPerElem = (useRGBA ? 
sizeof(uchar4) : sizeof(float)); + checkCudaErrors(cudaMemcpy2DToArray( + d_array, 0, 0, pImage, width * bytesPerElem, width * bytesPerElem, height, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMallocArray(&d_tempArray, &channelDesc, width, height)); + checkCudaErrors(cudaMallocArray(&d_tempArray, &channelDesc, width, height)); - // set texture parameters - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + // set texture parameters + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_array; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_array; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeNormalizedFloat; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeNormalizedFloat; - checkCudaErrors(cudaCreateTextureObject(&rgbaTex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&rgbaTex, &texRes, &texDescr, NULL)); - memset(&texRes, 0, sizeof(cudaResourceDesc)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_tempArray; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_tempArray; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeNormalizedFloat; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeNormalizedFloat; - checkCudaErrors( - cudaCreateTextureObject(&rgbaTexTempArray, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&rgbaTexTempArray, &texRes, &texDescr, NULL)); - memset(&texRes, 0, sizeof(cudaResourceDesc)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_array; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_array; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); - memset(&texRes, 0, sizeof(cudaResourceDesc)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = 
cudaResourceTypeArray; - texRes.res.array.array = d_tempArray; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_tempArray; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texTempArray, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texTempArray, &texRes, &texDescr, NULL)); } -extern "C" void freeTextures() { - checkCudaErrors(cudaDestroyTextureObject(tex)); - checkCudaErrors(cudaDestroyTextureObject(texTempArray)); - checkCudaErrors(cudaDestroyTextureObject(rgbaTex)); - checkCudaErrors(cudaDestroyTextureObject(rgbaTexTempArray)); - checkCudaErrors(cudaFreeArray(d_array)); - checkCudaErrors(cudaFreeArray(d_tempArray)); +extern "C" void freeTextures() +{ + checkCudaErrors(cudaDestroyTextureObject(tex)); + checkCudaErrors(cudaDestroyTextureObject(texTempArray)); + checkCudaErrors(cudaDestroyTextureObject(rgbaTex)); + checkCudaErrors(cudaDestroyTextureObject(rgbaTexTempArray)); + checkCudaErrors(cudaFreeArray(d_array)); + checkCudaErrors(cudaFreeArray(d_tempArray)); } /* @@ -407,83 +407,100 @@ extern "C" void freeTextures() { iterations - number of iterations */ -extern "C" double boxFilter(float *d_temp, float *d_dest, - int width, int height, int radius, int iterations, - int nthreads, StopWatchInterface *timer) { - // var for kernel timing - double dKernelTime = 0.0; +extern "C" double boxFilter(float *d_temp, + float *d_dest, + int width, + int height, + int radius, + int iterations, + int nthreads, + StopWatchInterface *timer) +{ + // var for kernel timing + double dKernelTime = 0.0; - // sync host and start computation timer_kernel - checkCudaErrors(cudaDeviceSynchronize()); - - for (int i = 0; i < iterations; i++) { - sdkResetTimer(&timer); - // use texture for horizontal pass - if (iterations > 1) { - d_boxfilter_x_tex<<>>( - d_temp, width, height, radius, texTempArray); - } else { - d_boxfilter_x_tex<<>>( - d_temp, width, height, radius, tex); - } - - d_boxfilter_y_global<<>>( - d_temp, d_dest, width, height, radius); - - // sync host and stop computation timer_kernel + // sync host and start computation timer_kernel checkCudaErrors(cudaDeviceSynchronize()); - dKernelTime += sdkGetTimerValue(&timer); - if (iterations > 1) { - // copy result back from global memory to array - checkCudaErrors(cudaMemcpy2DToArray( - d_tempArray, 0, 0, d_dest, width * sizeof(float), - width * sizeof(float), height, cudaMemcpyDeviceToDevice)); + for (int i = 0; i < iterations; i++) { + sdkResetTimer(&timer); + // use texture for horizontal pass + if (iterations > 1) { + d_boxfilter_x_tex<<>>(d_temp, width, height, radius, texTempArray); + } + else { + d_boxfilter_x_tex<<>>(d_temp, width, height, radius, tex); + } + + d_boxfilter_y_global<<>>(d_temp, d_dest, width, height, radius); + + // sync host and stop computation timer_kernel + checkCudaErrors(cudaDeviceSynchronize()); + dKernelTime += sdkGetTimerValue(&timer); + + if (iterations > 1) { + // copy result back from global memory to array + 
checkCudaErrors(cudaMemcpy2DToArray(d_tempArray, + 0, + 0, + d_dest, + width * sizeof(float), + width * sizeof(float), + height, + cudaMemcpyDeviceToDevice)); + } } - } - return ((dKernelTime / 1000.) / (double)iterations); + return ((dKernelTime / 1000.) / (double)iterations); } // RGBA version -extern "C" double boxFilterRGBA(unsigned int *d_temp, - unsigned int *d_dest, int width, int height, - int radius, int iterations, int nthreads, - StopWatchInterface *timer) { - // var for kernel computation timing - double dKernelTime; +extern "C" double boxFilterRGBA(unsigned int *d_temp, + unsigned int *d_dest, + int width, + int height, + int radius, + int iterations, + int nthreads, + StopWatchInterface *timer) +{ + // var for kernel computation timing + double dKernelTime; - for (int i = 0; i < iterations; i++) { - // sync host and start kernel computation timer_kernel - dKernelTime = 0.0; - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&timer); + for (int i = 0; i < iterations; i++) { + // sync host and start kernel computation timer_kernel + dKernelTime = 0.0; + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&timer); - // use texture for horizontal pass - if (iterations > 1) { - d_boxfilter_rgba_x<<>>( - d_temp, width, height, radius, rgbaTexTempArray); - } else { - d_boxfilter_rgba_x<<>>( - d_temp, width, height, radius, rgbaTex); + // use texture for horizontal pass + if (iterations > 1) { + d_boxfilter_rgba_x<<>>(d_temp, width, height, radius, rgbaTexTempArray); + } + else { + d_boxfilter_rgba_x<<>>(d_temp, width, height, radius, rgbaTex); + } + + d_boxfilter_rgba_y<<>>(d_temp, d_dest, width, height, radius); + + // sync host and stop computation timer_kernel + checkCudaErrors(cudaDeviceSynchronize()); + dKernelTime += sdkGetTimerValue(&timer); + + if (iterations > 1) { + // copy result back from global memory to array + checkCudaErrors(cudaMemcpy2DToArray(d_tempArray, + 0, + 0, + d_dest, + width * sizeof(unsigned int), + width * sizeof(unsigned int), + height, + cudaMemcpyDeviceToDevice)); + } } - d_boxfilter_rgba_y<<>>(d_temp, d_dest, width, - height, radius); - - // sync host and stop computation timer_kernel - checkCudaErrors(cudaDeviceSynchronize()); - dKernelTime += sdkGetTimerValue(&timer); - - if (iterations > 1) { - // copy result back from global memory to array - checkCudaErrors(cudaMemcpy2DToArray( - d_tempArray, 0, 0, d_dest, width * sizeof(unsigned int), - width * sizeof(unsigned int), height, cudaMemcpyDeviceToDevice)); - } - } - - return ((dKernelTime / 1000.) / (double)iterations); + return ((dKernelTime / 1000.) 
/ (double)iterations); } -#endif // #ifndef _BOXFILTER_KERNEL_H_ +#endif // #ifndef _BOXFILTER_KERNEL_CH_ diff --git a/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu b/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu index e840fa97..9a116e68 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu +++ b/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable.cu @@ -26,8 +26,8 @@ */ #include <assert.h> -#include <helper_cuda.h> #include <cooperative_groups.h> +#include <helper_cuda.h> namespace cg = cooperative_groups; #include "convolutionSeparable_common.h" @@ -37,178 +37,156 @@ namespace cg = cooperative_groups; //////////////////////////////////////////////////////////////////////////////// __constant__ float c_Kernel[KERNEL_LENGTH]; -extern "C" void setConvolutionKernel(float *h_Kernel) { - cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float)); +extern "C" void setConvolutionKernel(float *h_Kernel) +{ + cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float)); } //////////////////////////////////////////////////////////////////////////////// // Row convolution filter //////////////////////////////////////////////////////////////////////////////// -#define ROWS_BLOCKDIM_X 16 -#define ROWS_BLOCKDIM_Y 4 +#define ROWS_BLOCKDIM_X 16 +#define ROWS_BLOCKDIM_Y 4 #define ROWS_RESULT_STEPS 8 -#define ROWS_HALO_STEPS 1 +#define ROWS_HALO_STEPS 1 -__global__ void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW, - int imageH, int pitch) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float - s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * - ROWS_BLOCKDIM_X]; +__global__ void convolutionRowsKernel(float *d_Dst, float *d_Src, int imageW, int imageH, int pitch) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X]; - // Offset to the left halo edge - const int baseX = - (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + - threadIdx.x; - const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y; + // Offset to the left halo edge + const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x; + const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y; - d_Src += baseY * pitch + baseX; - d_Dst += baseY * pitch + baseX; + d_Src += baseY * pitch + baseX; + d_Dst += baseY * pitch + baseX; // Load main data #pragma unroll - for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { - s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = - d_Src[i * ROWS_BLOCKDIM_X]; - } + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[i * ROWS_BLOCKDIM_X]; + } // Load left halo #pragma unroll - for (int i = 0; i < ROWS_HALO_STEPS; i++) { - s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = - (baseX >= -i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; - } + for (int i = 0; i < ROWS_HALO_STEPS; i++) { + s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = + (baseX >= -i * ROWS_BLOCKDIM_X) ?
d_Src[i * ROWS_BLOCKDIM_X] : 0; + } // Load right halo #pragma unroll - for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; - i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) { - s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = - (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; - } - - // Compute and store results - cg::sync(cta); -#pragma unroll - - for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { - float sum = 0; - -#pragma unroll - - for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { - sum += c_Kernel[KERNEL_RADIUS - j] * - s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j]; + for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) { + s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = + (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; } - d_Dst[i * ROWS_BLOCKDIM_X] = sum; - } + // Compute and store results + cg::sync(cta); +#pragma unroll + + for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) { + float sum = 0; + +#pragma unroll + + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j]; + } + + d_Dst[i * ROWS_BLOCKDIM_X] = sum; + } } -extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, - int imageH) { - assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS); - assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0); - assert(imageH % ROWS_BLOCKDIM_Y == 0); +extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, int imageH) +{ + assert(ROWS_BLOCKDIM_X * ROWS_HALO_STEPS >= KERNEL_RADIUS); + assert(imageW % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) == 0); + assert(imageH % ROWS_BLOCKDIM_Y == 0); - dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X), - imageH / ROWS_BLOCKDIM_Y); - dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y); + dim3 blocks(imageW / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X), imageH / ROWS_BLOCKDIM_Y); + dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y); - convolutionRowsKernel<<>>(d_Dst, d_Src, imageW, imageH, - imageW); - getLastCudaError("convolutionRowsKernel() execution failed\n"); + convolutionRowsKernel<<>>(d_Dst, d_Src, imageW, imageH, imageW); + getLastCudaError("convolutionRowsKernel() execution failed\n"); } //////////////////////////////////////////////////////////////////////////////// // Column convolution filter //////////////////////////////////////////////////////////////////////////////// -#define COLUMNS_BLOCKDIM_X 16 -#define COLUMNS_BLOCKDIM_Y 8 +#define COLUMNS_BLOCKDIM_X 16 +#define COLUMNS_BLOCKDIM_Y 8 #define COLUMNS_RESULT_STEPS 8 -#define COLUMNS_HALO_STEPS 1 +#define COLUMNS_HALO_STEPS 1 -__global__ void convolutionColumnsKernel(float *d_Dst, float *d_Src, int imageW, - int imageH, int pitch) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float s_Data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS + - 2 * COLUMNS_HALO_STEPS) * - COLUMNS_BLOCKDIM_Y + - 1]; +__global__ void convolutionColumnsKernel(float *d_Dst, float *d_Src, int imageW, int imageH, int pitch) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float s_Data[COLUMNS_BLOCKDIM_X] + [(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1]; - // Offset to the upper halo edge - const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + 
threadIdx.x; - const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * - COLUMNS_BLOCKDIM_Y + - threadIdx.y; - d_Src += baseY * pitch + baseX; - d_Dst += baseY * pitch + baseX; + // Offset to the upper halo edge + const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x; + const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y; + d_Src += baseY * pitch + baseX; + d_Dst += baseY * pitch + baseX; // Main data #pragma unroll - for (int i = COLUMNS_HALO_STEPS; - i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { - s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = - d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]; - } + for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { + s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[i * COLUMNS_BLOCKDIM_Y * pitch]; + } // Upper halo #pragma unroll - for (int i = 0; i < COLUMNS_HALO_STEPS; i++) { - s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = - (baseY >= -i * COLUMNS_BLOCKDIM_Y) - ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] - : 0; - } + for (int i = 0; i < COLUMNS_HALO_STEPS; i++) { + s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = + (baseY >= -i * COLUMNS_BLOCKDIM_Y) ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] : 0; + } // Lower halo #pragma unroll - for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; - i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; - i++) { - s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = - (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) - ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] - : 0; - } - - // Compute and store results - cg::sync(cta); -#pragma unroll - - for (int i = COLUMNS_HALO_STEPS; - i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { - float sum = 0; -#pragma unroll - - for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { - sum += c_Kernel[KERNEL_RADIUS - j] * - s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j]; + for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; + i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; + i++) { + s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = + (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) ? 
d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] : 0; } - d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum; - } + // Compute and store results + cg::sync(cta); +#pragma unroll + + for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) { + float sum = 0; +#pragma unroll + + for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) { + sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j]; + } + + d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum; + } } -extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, - int imageH) { - assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS); - assert(imageW % COLUMNS_BLOCKDIM_X == 0); - assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0); +extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, int imageH) +{ + assert(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS); + assert(imageW % COLUMNS_BLOCKDIM_X == 0); + assert(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0); - dim3 blocks(imageW / COLUMNS_BLOCKDIM_X, - imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y)); - dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y); + dim3 blocks(imageW / COLUMNS_BLOCKDIM_X, imageH / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y)); + dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y); - convolutionColumnsKernel<<>>(d_Dst, d_Src, imageW, imageH, - imageW); - getLastCudaError("convolutionColumnsKernel() execution failed\n"); + convolutionColumnsKernel<<>>(d_Dst, d_Src, imageW, imageH, imageW); + getLastCudaError("convolutionColumnsKernel() execution failed\n"); } diff --git a/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h b/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h index 9dba4a5a..83743f70 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h +++ b/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_common.h @@ -34,22 +34,17 @@ //////////////////////////////////////////////////////////////////////////////// // Reference CPU convolution //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, - int imageW, int imageH, int kernelR); +extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR); -extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, - float *h_Kernel, int imageW, int imageH, - int kernelR); +extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR); //////////////////////////////////////////////////////////////////////////////// // GPU convolution //////////////////////////////////////////////////////////////////////////////// extern "C" void setConvolutionKernel(float *h_Kernel); -extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, - int imageH); +extern "C" void convolutionRowsGPU(float *d_Dst, float *d_Src, int imageW, int imageH); -extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, - int imageH); +extern "C" void convolutionColumnsGPU(float *d_Dst, float *d_Src, int imageW, int imageH); #endif diff --git a/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp b/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp index 
e8a40ca8..9d298ecd 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp +++ b/Samples/2_Concepts_and_Techniques/convolutionSeparable/convolutionSeparable_gold.cpp @@ -30,40 +30,39 @@ //////////////////////////////////////////////////////////////////////////////// // Reference row convolution filter //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, - int imageW, int imageH, int kernelR) { - for (int y = 0; y < imageH; y++) - for (int x = 0; x < imageW; x++) { - float sum = 0; +extern "C" void convolutionRowCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR) +{ + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; - for (int k = -kernelR; k <= kernelR; k++) { - int d = x + k; + for (int k = -kernelR; k <= kernelR; k++) { + int d = x + k; - if (d >= 0 && d < imageW) - sum += h_Src[y * imageW + d] * h_Kernel[kernelR - k]; - } + if (d >= 0 && d < imageW) + sum += h_Src[y * imageW + d] * h_Kernel[kernelR - k]; + } - h_Dst[y * imageW + x] = sum; - } + h_Dst[y * imageW + x] = sum; + } } //////////////////////////////////////////////////////////////////////////////// // Reference column convolution filter //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, - float *h_Kernel, int imageW, int imageH, - int kernelR) { - for (int y = 0; y < imageH; y++) - for (int x = 0; x < imageW; x++) { - float sum = 0; +extern "C" void convolutionColumnCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR) +{ + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; - for (int k = -kernelR; k <= kernelR; k++) { - int d = y + k; + for (int k = -kernelR; k <= kernelR; k++) { + int d = y + k; - if (d >= 0 && d < imageH) - sum += h_Src[d * imageW + x] * h_Kernel[kernelR - k]; - } + if (d >= 0 && d < imageH) + sum += h_Src[d * imageW + x] * h_Kernel[kernelR - k]; + } - h_Dst[y * imageW + x] = sum; - } + h_Dst[y * imageW + x] = sum; + } } diff --git a/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp b/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp index 2eb7dc01..f43e8517 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp +++ b/Samples/2_Concepts_and_Techniques/convolutionSeparable/main.cpp @@ -26,149 +26,140 @@ */ /* -* This sample implements a separable convolution filter -* of a 2D image with an arbitrary kernel. -*/ + * This sample implements a separable convolution filter + * of a 2D image with an arbitrary kernel. 
+ */ // CUDA runtime #include <cuda_runtime.h> // Utilities and system includes -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> #include "convolutionSeparable_common.h" //////////////////////////////////////////////////////////////////////////////// // Reference CPU convolution //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionRowCPU(float *h_Result, float *h_Data, - float *h_Kernel, int imageW, int imageH, - int kernelR); +extern "C" void convolutionRowCPU(float *h_Result, float *h_Data, float *h_Kernel, int imageW, int imageH, int kernelR); -extern "C" void convolutionColumnCPU(float *h_Result, float *h_Data, - float *h_Kernel, int imageW, int imageH, - int kernelR); +extern "C" void +convolutionColumnCPU(float *h_Result, float *h_Data, float *h_Kernel, int imageW, int imageH, int kernelR); //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // start logs - printf("[%s] - Starting...\n", argv[0]); +int main(int argc, char **argv) +{ + // start logs + printf("[%s] - Starting...\n", argv[0]); - float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; + float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; - float *d_Input, *d_Output, *d_Buffer; + float *d_Input, *d_Output, *d_Buffer; - const int imageW = 3072; - const int imageH = 3072; - const int iterations = 16; + const int imageW = 3072; + const int imageH = 3072; + const int iterations = 16; - StopWatchInterface *hTimer = NULL; + StopWatchInterface *hTimer = NULL; - // Use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // Use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Image Width x Height = %i x %i\n\n", imageW, imageH); - printf("Allocating and initializing host arrays...\n"); - h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); - h_Input = (float *)malloc(imageW * imageH * sizeof(float)); - h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); - h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); - h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); - srand(200); + printf("Image Width x Height = %i x %i\n\n", imageW, imageH); + printf("Allocating and initializing host arrays...\n"); + h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); + h_Input = (float *)malloc(imageW * imageH * sizeof(float)); + h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); + h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); + h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); + srand(200); - for (unsigned int i = 0; i < KERNEL_LENGTH; i++) { - h_Kernel[i] = (float)(rand() % 16); - } - - for (unsigned i = 0; i < imageW * imageH; i++) { - h_Input[i] = (float)(rand() % 16); - } - - printf("Allocating and initializing CUDA arrays...\n"); - checkCudaErrors( - cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_Buffer, imageW * imageH * sizeof(float))); - - setConvolutionKernel(h_Kernel); - checkCudaErrors(cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), - cudaMemcpyHostToDevice)); - - 
printf("Running GPU convolution (%u identical iterations)...\n\n", - iterations); - - for (int i = -1; i < iterations; i++) { - // i == -1 -- warmup iteration - if (i == 0) { - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + for (unsigned int i = 0; i < KERNEL_LENGTH; i++) { + h_Kernel[i] = (float)(rand() % 16); } - convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH); + for (unsigned i = 0; i < imageW * imageH; i++) { + h_Input[i] = (float)(rand() % 16); + } - convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH); - } + printf("Allocating and initializing CUDA arrays...\n"); + checkCudaErrors(cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Buffer, imageW * imageH * sizeof(float))); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double gpuTime = 0.001 * sdkGetTimerValue(&hTimer) / (double)iterations; - printf( - "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, " - "Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", - (1.0e-6 * (double)(imageW * imageH) / gpuTime), gpuTime, - (imageW * imageH), 1, 0); + setConvolutionKernel(h_Kernel); + checkCudaErrors(cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice)); - printf("\nReading back GPU results...\n\n"); - checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, - imageW * imageH * sizeof(float), - cudaMemcpyDeviceToHost)); + printf("Running GPU convolution (%u identical iterations)...\n\n", iterations); - printf("Checking the results...\n"); - printf(" ...running convolutionRowCPU()\n"); - convolutionRowCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS); + for (int i = -1; i < iterations; i++) { + // i == -1 -- warmup iteration + if (i == 0) { + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } - printf(" ...running convolutionColumnCPU()\n"); - convolutionColumnCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, - KERNEL_RADIUS); + convolutionRowsGPU(d_Buffer, d_Input, imageW, imageH); - printf(" ...comparing the results\n"); - double sum = 0, delta = 0; + convolutionColumnsGPU(d_Output, d_Buffer, imageW, imageH); + } - for (unsigned i = 0; i < imageW * imageH; i++) { - delta += - (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); - sum += h_OutputCPU[i] * h_OutputCPU[i]; - } + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double gpuTime = 0.001 * sdkGetTimerValue(&hTimer) / (double)iterations; + printf("convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, " + "Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", + (1.0e-6 * (double)(imageW * imageH) / gpuTime), + gpuTime, + (imageW * imageH), + 1, + 0); - double L2norm = sqrt(delta / sum); - printf(" ...Relative L2 norm: %E\n\n", L2norm); - printf("Shutting down...\n"); + printf("\nReading back GPU results...\n\n"); + checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaFree(d_Buffer)); - checkCudaErrors(cudaFree(d_Output)); - checkCudaErrors(cudaFree(d_Input)); - free(h_OutputGPU); - free(h_OutputCPU); - free(h_Buffer); - free(h_Input); - free(h_Kernel); + printf("Checking the results...\n"); + printf(" ...running convolutionRowCPU()\n"); + convolutionRowCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS); - 
sdkDeleteTimer(&hTimer); + - if (L2norm > 1e-6) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + printf(" ...comparing the results\n"); + double sum = 0, delta = 0; - printf("Test passed\n"); - exit(EXIT_SUCCESS); + for (unsigned i = 0; i < imageW * imageH; i++) { + delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); + sum += h_OutputCPU[i] * h_OutputCPU[i]; + } + + double L2norm = sqrt(delta / sum); + printf(" ...Relative L2 norm: %E\n\n", L2norm); + printf("Shutting down...\n"); + + checkCudaErrors(cudaFree(d_Buffer)); + checkCudaErrors(cudaFree(d_Output)); + checkCudaErrors(cudaFree(d_Input)); + free(h_OutputGPU); + free(h_OutputCPU); + free(h_Buffer); + free(h_Input); + free(h_Kernel); + + sdkDeleteTimer(&hTimer); + + if (L2norm > 1e-6) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture.cu b/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture.cu index 8b5a5c98..3bd5b22a 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture.cu +++ b/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture.cu @@ -25,10 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <cuda_runtime.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <cuda_runtime.h> #include "convolutionTexture_common.h" @@ -52,111 +52,102 @@ inline int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; } //////////////////////////////////////////////////////////////////////////////// __constant__ float c_Kernel[KERNEL_LENGTH]; -extern "C" void setConvolutionKernel(float *h_Kernel) { - cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float)); +extern "C" void setConvolutionKernel(float *h_Kernel) +{ + cudaMemcpyToSymbol(c_Kernel, h_Kernel, KERNEL_LENGTH * sizeof(float)); } //////////////////////////////////////////////////////////////////////////////// // Loop unrolling templates, needed for best performance //////////////////////////////////////////////////////////////////////////////// -template <int i> -__device__ float convolutionRow(float x, float y, cudaTextureObject_t texSrc) { - return tex2D<float>(texSrc, x + (float)(KERNEL_RADIUS - i), y) * c_Kernel[i] + - convolutionRow<i - 1>(x, y, texSrc); +template <int i> __device__ float convolutionRow(float x, float y, cudaTextureObject_t texSrc) +{ + return tex2D<float>(texSrc, x + (float)(KERNEL_RADIUS - i), y) * c_Kernel[i] + convolutionRow<i - 1>(x, y, texSrc); } -template <> -__device__ float convolutionRow<-1>(float x, float y, - cudaTextureObject_t texSrc) { - return 0; +template <> __device__ float convolutionRow<-1>(float x, float y, cudaTextureObject_t texSrc) { return 0; } + +template <int i> __device__ float convolutionColumn(float x, float y, cudaTextureObject_t texSrc) +{ + return tex2D<float>(texSrc, x, y + (float)(KERNEL_RADIUS - i)) * c_Kernel[i] + + convolutionColumn<i - 1>(x, y, texSrc); } -template <int i> -__device__ float convolutionColumn(float x, float y, - cudaTextureObject_t texSrc) { - return tex2D<float>(texSrc, x, y + (float)(KERNEL_RADIUS - i)) * c_Kernel[i] + - convolutionColumn<i - 1>(x, y, texSrc); -} - -template <> -__device__ float convolutionColumn<-1>(float x, float y, - cudaTextureObject_t texSrc) { - return 0; -} +template <> __device__ float convolutionColumn<-1>(float x, float y, cudaTextureObject_t texSrc) { return 0; }
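An aside on the loop-unrolling templates reformatted above: each instantiation `<i>` contributes one kernel tap and recurses on `<i - 1>`, and the `<-1>` specialization terminates the recursion, so a call such as `convolutionRow<2 * KERNEL_RADIUS>(x, y, texSrc)` expands at compile time into a straight-line sum whose `c_Kernel` indices are all constants. Below is a minimal self-contained sketch of the same idiom on a plain device array; the names (`RADIUS`, `c_Taps`, `unrolledSum`, `filterKernel`) are invented for illustration and are not part of this patch.

#include <cuda_runtime.h>

#define RADIUS 2

__constant__ float c_Taps[2 * RADIUS + 1];

// Recursive case: contribute tap i, then recurse on i - 1. The recursion is
// resolved during template instantiation, producing fully unrolled code.
template <int i> __device__ float unrolledSum(const float *src, int x)
{
    return src[x + (RADIUS - i)] * c_Taps[i] + unrolledSum<i - 1>(src, x);
}

// Base case: the <-1> specialization ends the recursion, as in the sample.
template <> __device__ float unrolledSum<-1>(const float *src, int x) { return 0.0f; }

__global__ void filterKernel(float *dst, const float *src, int n)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    // Interior points only; equivalent to the runtime loop
    //   for (k = -RADIUS; k <= RADIUS; k++) sum += src[x + k] * c_Taps[RADIUS - k];
    if (x >= RADIUS && x < n - RADIUS) {
        dst[x] = unrolledSum<2 * RADIUS>(src, x);
    }
}

As in the sample, the host would fill c_Taps with cudaMemcpyToSymbol() before launching (mirroring setConvolutionKernel()), and the runtime-loop form corresponds to the #else branch guarded by UNROLL_INNER in the kernels that follow.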
//////////////////////////////////////////////////////////////////////////////// // Row convolution filter //////////////////////////////////////////////////////////////////////////////// -__global__ void convolutionRowsKernel(float *d_Dst, int imageW, int imageH, - cudaTextureObject_t texSrc) { - const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x); - const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y); - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void convolutionRowsKernel(float *d_Dst, int imageW, int imageH, cudaTextureObject_t texSrc) +{ + const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x); + const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y); + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix >= imageW || iy >= imageH) { - return; - } + if (ix >= imageW || iy >= imageH) { + return; + } - float sum = 0; + float sum = 0; #if (UNROLL_INNER) - sum = convolutionRow<2 * KERNEL_RADIUS>(x, y, texSrc); + sum = convolutionRow<2 * KERNEL_RADIUS>(x, y, texSrc); #else - for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) { - sum += tex2D<float>(texSrc, x + (float)k, y) * c_Kernel[KERNEL_RADIUS - k]; - } + for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) { + sum += tex2D<float>(texSrc, x + (float)k, y) * c_Kernel[KERNEL_RADIUS - k]; + } #endif - d_Dst[IMAD(iy, imageW, ix)] = sum; + d_Dst[IMAD(iy, imageW, ix)] = sum; } -extern "C" void convolutionRowsGPU(float *d_Dst, cudaArray *a_Src, int imageW, - int imageH, cudaTextureObject_t texSrc) { - dim3 threads(16, 12); - dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y)); +extern "C" void convolutionRowsGPU(float *d_Dst, cudaArray *a_Src, int imageW, int imageH, cudaTextureObject_t texSrc) +{ + dim3 threads(16, 12); + dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y)); - convolutionRowsKernel<<<blocks, threads>>>(d_Dst, imageW, imageH, texSrc); - getLastCudaError("convolutionRowsKernel() execution failed\n"); + convolutionRowsKernel<<<blocks, threads>>>(d_Dst, imageW, imageH, texSrc); + getLastCudaError("convolutionRowsKernel() execution failed\n"); } //////////////////////////////////////////////////////////////////////////////// // Column convolution filter //////////////////////////////////////////////////////////////////////////////// -__global__ void convolutionColumnsKernel(float *d_Dst, int imageW, int imageH, - cudaTextureObject_t texSrc) { - const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x); - const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y); - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void convolutionColumnsKernel(float *d_Dst, int imageW, int imageH, cudaTextureObject_t texSrc) +{ + const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x); + const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y); + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix >= imageW || iy >= imageH) { - return; - } + if (ix >= imageW || iy >= imageH) { + return; + } - float sum = 0; + float sum = 0; #if (UNROLL_INNER) - sum = convolutionColumn<2 * KERNEL_RADIUS>(x, y, texSrc); + sum = convolutionColumn<2 * KERNEL_RADIUS>(x, y, texSrc); #else - for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) { - sum += tex2D<float>(texSrc, x, y + (float)k) * c_Kernel[KERNEL_RADIUS - k]; - } + for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) { + sum += tex2D<float>(texSrc, x, y + (float)k) * c_Kernel[KERNEL_RADIUS - k]; + } #endif - d_Dst[IMAD(iy, imageW, ix)] = sum; + d_Dst[IMAD(iy, imageW, ix)] = sum; } -extern "C" void 
convolutionColumnsGPU(float *d_Dst, cudaArray *a_Src, - int imageW, int imageH, - cudaTextureObject_t texSrc) { - dim3 threads(16, 12); - dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y)); +extern "C" void +convolutionColumnsGPU(float *d_Dst, cudaArray *a_Src, int imageW, int imageH, cudaTextureObject_t texSrc) +{ + dim3 threads(16, 12); + dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y)); - convolutionColumnsKernel<<<blocks, threads>>>(d_Dst, imageW, imageH, texSrc); - getLastCudaError("convolutionColumnsKernel() execution failed\n"); + convolutionColumnsKernel<<<blocks, threads>>>(d_Dst, imageW, imageH, texSrc); + getLastCudaError("convolutionColumnsKernel() execution failed\n"); } diff --git a/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_common.h b/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_common.h index b9c4c591..4435f0b5 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_common.h +++ b/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_common.h @@ -39,23 +39,18 @@ //////////////////////////////////////////////////////////////////////////////// // Reference CPU convolution //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionRowsCPU(float *h_Dst, float *h_Src, float *h_Kernel, - int imageW, int imageH, int kernelR); +extern "C" void convolutionRowsCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR); -extern "C" void convolutionColumnsCPU(float *h_Dst, float *h_Src, - float *h_Kernel, int imageW, int imageH, - int kernelR); +extern "C" void convolutionColumnsCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR); //////////////////////////////////////////////////////////////////////////////// // GPU texture-based convolution //////////////////////////////////////////////////////////////////////////////// extern "C" void setConvolutionKernel(float *h_Kernel); -extern "C" void convolutionRowsGPU(float *d_Dst, cudaArray *a_Src, int imageW, - int imageH, cudaTextureObject_t texSrc); +extern "C" void convolutionRowsGPU(float *d_Dst, cudaArray *a_Src, int imageW, int imageH, cudaTextureObject_t texSrc); -extern "C" void convolutionColumnsGPU(float *d_Dst, cudaArray *a_Src, - int imageW, int imageH, - cudaTextureObject_t texSrc); +extern "C" void +convolutionColumnsGPU(float *d_Dst, cudaArray *a_Src, int imageW, int imageH, cudaTextureObject_t texSrc); #endif diff --git a/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_gold.cpp b/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_gold.cpp index 56201c55..a7beaded 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_gold.cpp +++ b/Samples/2_Concepts_and_Techniques/convolutionTexture/convolutionTexture_gold.cpp @@ -30,46 +30,49 @@ //////////////////////////////////////////////////////////////////////////////// // Reference row convolution filter //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionRowsCPU(float *h_Dst, float *h_Src, float *h_Kernel, - int imageW, int imageH, int kernelR) { - for (int y = 0; y < imageH; y++) - for (int x = 0; x < imageW; x++) { - float sum = 0; +extern "C" void convolutionRowsCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR) +{ + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; - for 
(int k = -kernelR; k <= kernelR; k++) { - int d = x + k; + for (int k = -kernelR; k <= kernelR; k++) { + int d = x + k; - if (d < 0) d = 0; + if (d < 0) + d = 0; - if (d >= imageW) d = imageW - 1; + if (d >= imageW) + d = imageW - 1; - sum += h_Src[y * imageW + d] * h_Kernel[kernelR - k]; - } + sum += h_Src[y * imageW + d] * h_Kernel[kernelR - k]; + } - h_Dst[y * imageW + x] = sum; - } + h_Dst[y * imageW + x] = sum; + } } //////////////////////////////////////////////////////////////////////////////// // Reference column convolution filter //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionColumnsCPU(float *h_Dst, float *h_Src, - float *h_Kernel, int imageW, int imageH, - int kernelR) { - for (int y = 0; y < imageH; y++) - for (int x = 0; x < imageW; x++) { - float sum = 0; +extern "C" void convolutionColumnsCPU(float *h_Dst, float *h_Src, float *h_Kernel, int imageW, int imageH, int kernelR) +{ + for (int y = 0; y < imageH; y++) + for (int x = 0; x < imageW; x++) { + float sum = 0; - for (int k = -kernelR; k <= kernelR; k++) { - int d = y + k; + for (int k = -kernelR; k <= kernelR; k++) { + int d = y + k; - if (d < 0) d = 0; + if (d < 0) + d = 0; - if (d >= imageH) d = imageH - 1; + if (d >= imageH) + d = imageH - 1; - sum += h_Src[d * imageW + x] * h_Kernel[kernelR - k]; - } + sum += h_Src[d * imageW + x] * h_Kernel[kernelR - k]; + } - h_Dst[y * imageW + x] = sum; - } + h_Dst[y * imageW + x] = sum; + } } diff --git a/Samples/2_Concepts_and_Techniques/convolutionTexture/main.cpp b/Samples/2_Concepts_and_Techniques/convolutionTexture/main.cpp index 7d209900..b7ba2d86 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionTexture/main.cpp +++ b/Samples/2_Concepts_and_Techniques/convolutionTexture/main.cpp @@ -33,174 +33,165 @@ * Refer to the "Performance" section of convolutionSeparable whitepaper. 
*/ -#include <math.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <cuda_runtime.h> - -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "convolutionTexture_common.h" //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; +int main(int argc, char **argv) +{ + float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; - cudaArray *a_Src; - cudaTextureObject_t texSrc; - cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>(); + cudaArray *a_Src; + cudaTextureObject_t texSrc; + cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>(); - float *d_Output; + float *d_Output; - float gpuTime; + float gpuTime; - StopWatchInterface *hTimer = NULL; + StopWatchInterface *hTimer = NULL; - const int imageW = 3072; - const int imageH = 3072 / 2; - const unsigned int iterations = 10; + const int imageW = 3072; + const int imageH = 3072 / 2; + const unsigned int iterations = 10; - printf("[%s] - Starting...\n", argv[0]); + printf("[%s] - Starting...\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Initializing data...\n"); - h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); - h_Input = (float *)malloc(imageW * imageH * sizeof(float)); - h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); - h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); - h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); - checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH)); - checkCudaErrors( - cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float))); + printf("Initializing data...\n"); + h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); + h_Input = (float *)malloc(imageW * imageH * sizeof(float)); + h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); + h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); + h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); + checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH)); + checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float))); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = a_Src; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = a_Src; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&texSrc, &texRes, &texDescr, NULL)); + 
checkCudaErrors(cudaCreateTextureObject(&texSrc, &texRes, &texDescr, NULL)); - srand(2009); + srand(2009); - for (unsigned int i = 0; i < KERNEL_LENGTH; i++) { - h_Kernel[i] = (float)(rand() % 16); - } + for (unsigned int i = 0; i < KERNEL_LENGTH; i++) { + h_Kernel[i] = (float)(rand() % 16); + } - for (unsigned int i = 0; i < imageW * imageH; i++) { - h_Input[i] = (float)(rand() % 16); - } + for (unsigned int i = 0; i < imageW * imageH; i++) { + h_Input[i] = (float)(rand() % 16); + } - setConvolutionKernel(h_Kernel); - checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, - imageW * imageH * sizeof(float), - cudaMemcpyHostToDevice)); + setConvolutionKernel(h_Kernel); + checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice)); - printf("Running GPU rows convolution (%u identical iterations)...\n", - iterations); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + printf("Running GPU rows convolution (%u identical iterations)...\n", iterations); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - for (unsigned int i = 0; i < iterations; i++) { - convolutionRowsGPU(d_Output, a_Src, imageW, imageH, texSrc); - } + for (unsigned int i = 0; i < iterations; i++) { + convolutionRowsGPU(d_Output, a_Src, imageW, imageH, texSrc); + } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations; - printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, - imageW * imageH * 1e-6 / (0.001 * gpuTime)); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations; + printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n", + gpuTime, + imageW * imageH * 1e-6 / (0.001 * gpuTime)); - // While CUDA kernels can't write to textures directly, this copy is - // inevitable - printf("Copying convolutionRowGPU() output back to the texture...\n"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, d_Output, - imageW * imageH * sizeof(float), - cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer); - printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n", gpuTime, - imageW * imageH * 1e-6 / (0.001 * gpuTime)); + // While CUDA kernels can't write to textures directly, this copy is + // inevitable + printf("Copying convolutionRowGPU() output back to the texture...\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + checkCudaErrors( + cudaMemcpyToArray(a_Src, 0, 0, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer); + printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime)); - printf("Running GPU columns convolution (%i iterations)\n", iterations); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + printf("Running GPU columns convolution (%i iterations)\n", iterations); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - for (int i = 0; i < iterations; i++) { - convolutionColumnsGPU(d_Output, a_Src, imageW, imageH, 
texSrc); - } + for (int i = 0; i < iterations; i++) { + convolutionColumnsGPU(d_Output, a_Src, imageW, imageH, texSrc); + } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations; - printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n", - gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime)); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations; + printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n", + gpuTime, + imageW * imageH * 1e-6 / (0.001 * gpuTime)); - printf("Reading back GPU results...\n"); - checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, - imageW * imageH * sizeof(float), - cudaMemcpyDeviceToHost)); + printf("Reading back GPU results...\n"); + checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost)); - printf("Checking the results...\n"); - printf("...running convolutionRowsCPU()\n"); - convolutionRowsCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, - KERNEL_RADIUS); + printf("Checking the results...\n"); + printf("...running convolutionRowsCPU()\n"); + convolutionRowsCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS); - printf("...running convolutionColumnsCPU()\n"); - convolutionColumnsCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, - KERNEL_RADIUS); + printf("...running convolutionColumnsCPU()\n"); + convolutionColumnsCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS); - double delta = 0; - double sum = 0; + double delta = 0; + double sum = 0; - for (unsigned int i = 0; i < imageW * imageH; i++) { - sum += h_OutputCPU[i] * h_OutputCPU[i]; - delta += - (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); - } + for (unsigned int i = 0; i < imageW * imageH; i++) { + sum += h_OutputCPU[i] * h_OutputCPU[i]; + delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); + } - double L2norm = sqrt(delta / sum); - printf("Relative L2 norm: %E\n", L2norm); - printf("Shutting down...\n"); + double L2norm = sqrt(delta / sum); + printf("Relative L2 norm: %E\n", L2norm); + printf("Shutting down...\n"); - checkCudaErrors(cudaFree(d_Output)); - checkCudaErrors(cudaFreeArray(a_Src)); - free(h_OutputGPU); - free(h_Buffer); - free(h_Input); - free(h_Kernel); + checkCudaErrors(cudaFree(d_Output)); + checkCudaErrors(cudaFreeArray(a_Src)); + free(h_OutputGPU); + free(h_Buffer); + free(h_Input); + free(h_Kernel); - sdkDeleteTimer(&hTimer); + sdkDeleteTimer(&hTimer); - if (L2norm > 1e-6) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (L2norm > 1e-6) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.cpp b/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.cpp index eefb153c..b3524cac 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.cpp +++ b/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.cpp @@ -34,11 +34,12 @@ * conversions to different representations and memory management routines. 
*/ -#include "Common.h" #include "BmpUtil.h" +#include "Common.h" + #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#pragma warning(disable : 4996) // disable deprecated warning +#pragma warning(disable : 4996) // disable deprecated warning #endif /** @@ -59,11 +60,12 @@ int clamp_0_255(int x) { return (x < 0) ? 0 : ((x > 255) ? 255 : x); } * * \return The closest to the input float integer value */ -float round_f(float num) { - float NumAbs = fabs(num); - int NumAbsI = (int)(NumAbs + 0.5f); - float sign = num > 0 ? 1.0f : -1.0f; - return sign * NumAbsI; +float round_f(float num) +{ + float NumAbs = fabs(num); + int NumAbsI = (int)(NumAbs + 0.5f); + float sign = num > 0 ? 1.0f : -1.0f; + return sign * NumAbsI; } /** @@ -76,15 +78,16 @@ float round_f(float num) { * * \return Pointer to the created plane */ -byte *MallocPlaneByte(int width, int height, int *pStepBytes) { - byte *ptr; - *pStepBytes = ((int)ceil(width / 16.0f)) * 16; - //#ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT - // ptr = (byte *)_aligned_malloc(*pStepBytes * height, 16); - //#else - ptr = (byte *)malloc(*pStepBytes * height); - //#endif - return ptr; +byte *MallocPlaneByte(int width, int height, int *pStepBytes) +{ + byte *ptr; + *pStepBytes = ((int)ceil(width / 16.0f)) * 16; + // #ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT + // ptr = (byte *)_aligned_malloc(*pStepBytes * height, 16); + // #else + ptr = (byte *)malloc(*pStepBytes * height); + // #endif + return ptr; } /** @@ -97,16 +100,17 @@ byte *MallocPlaneByte(int width, int height, int *pStepBytes) { * * \return Pointer to the created plane */ -short *MallocPlaneShort(int width, int height, int *pStepBytes) { - short *ptr; - *pStepBytes = ((int)ceil((width * sizeof(short)) / 16.0f)) * 16; - //#ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT - // ptr = (float *)_aligned_malloc(*pStepBytes * height, 16); - //#else - ptr = (short *)malloc(*pStepBytes * height); - //#endif - *pStepBytes = *pStepBytes / sizeof(short); - return ptr; +short *MallocPlaneShort(int width, int height, int *pStepBytes) +{ + short *ptr; + *pStepBytes = ((int)ceil((width * sizeof(short)) / 16.0f)) * 16; + // #ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT + // ptr = (float *)_aligned_malloc(*pStepBytes * height, 16); + // #else + ptr = (short *)malloc(*pStepBytes * height); + // #endif + *pStepBytes = *pStepBytes / sizeof(short); + return ptr; } /** @@ -119,16 +123,17 @@ short *MallocPlaneShort(int width, int height, int *pStepBytes) { * * \return Pointer to the created plane */ -float *MallocPlaneFloat(int width, int height, int *pStepBytes) { - float *ptr; - *pStepBytes = ((int)ceil((width * sizeof(float)) / 16.0f)) * 16; - //#ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT - // ptr = (float *)_aligned_malloc(*pStepBytes * height, 16); - //#else - ptr = (float *)malloc(*pStepBytes * height); - //#endif - *pStepBytes = *pStepBytes / sizeof(float); - return ptr; +float *MallocPlaneFloat(int width, int height, int *pStepBytes) +{ + float *ptr; + *pStepBytes = ((int)ceil((width * sizeof(float)) / 16.0f)) * 16; + // #ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT + // ptr = (float *)_aligned_malloc(*pStepBytes * height, 16); + // #else + ptr = (float *)malloc(*pStepBytes * height); + // #endif + *pStepBytes = *pStepBytes / sizeof(float); + return ptr; } /** @@ -143,13 +148,13 @@ float *MallocPlaneFloat(int width, int height, int *pStepBytes) { * * \return None */ -void CopyByte2Float(byte *ImgSrc, int StrideB, float *ImgDst, int StrideF, - ROI Size) { - for (int i = 0; i < Size.height; i++) { - for (int j = 
0; j < Size.width; j++) { - ImgDst[i * StrideF + j] = (float)ImgSrc[i * StrideB + j]; +void CopyByte2Float(byte *ImgSrc, int StrideB, float *ImgDst, int StrideF, ROI Size) +{ + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + ImgDst[i * StrideF + j] = (float)ImgSrc[i * StrideB + j]; + } } - } } /** @@ -164,14 +169,13 @@ void CopyByte2Float(byte *ImgSrc, int StrideB, float *ImgDst, int StrideF, * * \return None */ -void CopyFloat2Byte(float *ImgSrc, int StrideF, byte *ImgDst, int StrideB, - ROI Size) { - for (int i = 0; i < Size.height; i++) { - for (int j = 0; j < Size.width; j++) { - ImgDst[i * StrideB + j] = - (byte)clamp_0_255((int)(round_f(ImgSrc[i * StrideF + j]))); +void CopyFloat2Byte(float *ImgSrc, int StrideF, byte *ImgDst, int StrideB, ROI Size) +{ + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + ImgDst[i * StrideB + j] = (byte)clamp_0_255((int)(round_f(ImgSrc[i * StrideF + j]))); + } } - } } /** @@ -182,18 +186,19 @@ void CopyFloat2Byte(float *ImgSrc, int StrideF, byte *ImgDst, int StrideB, * * \return None */ -void FreePlane(void *ptr) { - //#ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT - // if (ptr) - // { - // _aligned_free(ptr); - // } - //#else - if (ptr) { - free(ptr); - } +void FreePlane(void *ptr) +{ + // #ifdef __ALLOW_ALIGNED_MEMORY_MANAGEMENT + // if (ptr) + // { + // _aligned_free(ptr); + // } + // #else + if (ptr) { + free(ptr); + } - //#endif + // #endif } /** @@ -207,12 +212,13 @@ void FreePlane(void *ptr) { * * \return None */ -void AddFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size) { - for (int i = 0; i < Size.height; i++) { - for (int j = 0; j < Size.width; j++) { - ImgSrcDst[i * StrideF + j] += Value; +void AddFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size) +{ + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + ImgSrcDst[i * StrideF + j] += Value; + } } - } } /** @@ -226,12 +232,13 @@ void AddFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size) { * * \return None */ -void MulFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size) { - for (int i = 0; i < Size.height; i++) { - for (int j = 0; j < Size.width; j++) { - ImgSrcDst[i * StrideF + j] *= Value; +void MulFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size) +{ + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + ImgSrcDst[i * StrideF + j] *= Value; + } } - } } /** @@ -244,36 +251,37 @@ void MulFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size) { * * \return Status code */ -int PreLoadBmp(char *FileName, int *Width, int *Height) { - BMPFileHeader FileHeader; - BMPInfoHeader InfoHeader; - FILE *fh; +int PreLoadBmp(char *FileName, int *Width, int *Height) +{ + BMPFileHeader FileHeader; + BMPInfoHeader InfoHeader; + FILE *fh; - if (!(fh = fopen(FileName, "rb"))) { - return 1; // invalid filename - } + if (!(fh = fopen(FileName, "rb"))) { + return 1; // invalid filename + } - fread(&FileHeader, sizeof(BMPFileHeader), 1, fh); + fread(&FileHeader, sizeof(BMPFileHeader), 1, fh); - if (FileHeader._bm_signature != 0x4D42) { - return 2; // invalid file format - } + if (FileHeader._bm_signature != 0x4D42) { + return 2; // invalid file format + } - fread(&InfoHeader, sizeof(BMPInfoHeader), 1, fh); + fread(&InfoHeader, sizeof(BMPInfoHeader), 1, fh); - if (InfoHeader._bm_color_depth != 24) { - return 3; // invalid color depth - } + if (InfoHeader._bm_color_depth != 24) { + return 3; // invalid color depth + } - 
if (InfoHeader._bm_compressed) { - return 4; // invalid compression property - } + if (InfoHeader._bm_compressed) { + return 4; // invalid compression property + } - *Width = InfoHeader._bm_image_width; - *Height = InfoHeader._bm_image_height; + *Width = InfoHeader._bm_image_width; + *Height = InfoHeader._bm_image_height; - fclose(fh); - return 0; + fclose(fh); + return 0; } /** @@ -287,28 +295,29 @@ int PreLoadBmp(char *FileName, int *Width, int *Height) { * * \return None */ -void LoadBmpAsGray(char *FileName, int Stride, ROI ImSize, byte *Img) { - BMPFileHeader FileHeader; - BMPInfoHeader InfoHeader; - FILE *fh; - fh = fopen(FileName, "rb"); +void LoadBmpAsGray(char *FileName, int Stride, ROI ImSize, byte *Img) +{ + BMPFileHeader FileHeader; + BMPInfoHeader InfoHeader; + FILE *fh; + fh = fopen(FileName, "rb"); - fread(&FileHeader, sizeof(BMPFileHeader), 1, fh); - fread(&InfoHeader, sizeof(BMPInfoHeader), 1, fh); + fread(&FileHeader, sizeof(BMPFileHeader), 1, fh); + fread(&InfoHeader, sizeof(BMPInfoHeader), 1, fh); - for (int i = ImSize.height - 1; i >= 0; i--) { - for (int j = 0; j < ImSize.width; j++) { - int r = 0, g = 0, b = 0; - fread(&b, 1, 1, fh); - fread(&g, 1, 1, fh); - fread(&r, 1, 1, fh); - int val = (313524 * r + 615514 * g + 119537 * b + 524288) >> 20; - Img[i * Stride + j] = (byte)clamp_0_255(val); + for (int i = ImSize.height - 1; i >= 0; i--) { + for (int j = 0; j < ImSize.width; j++) { + int r = 0, g = 0, b = 0; + fread(&b, 1, 1, fh); + fread(&g, 1, 1, fh); + fread(&r, 1, 1, fh); + int val = (313524 * r + 615514 * g + 119537 * b + 524288) >> 20; + Img[i * Stride + j] = (byte)clamp_0_255(val); + } } - } - fclose(fh); - return; + fclose(fh); + return; } /** @@ -322,46 +331,47 @@ void LoadBmpAsGray(char *FileName, int Stride, ROI ImSize, byte *Img) { * * \return None */ -void DumpBmpAsGray(char *FileName, byte *Img, int Stride, ROI ImSize) { - FILE *fp = NULL; - fp = fopen(FileName, "wb"); +void DumpBmpAsGray(char *FileName, byte *Img, int Stride, ROI ImSize) +{ + FILE *fp = NULL; + fp = fopen(FileName, "wb"); - if (fp == NULL) { - return; - } - - BMPFileHeader FileHeader; - BMPInfoHeader InfoHeader; - - // init headers - FileHeader._bm_signature = 0x4D42; - FileHeader._bm_file_size = 54 + 3 * ImSize.width * ImSize.height; - FileHeader._bm_reserved = 0; - FileHeader._bm_bitmap_data = 0x36; - InfoHeader._bm_bitmap_size = 0; - InfoHeader._bm_color_depth = 24; - InfoHeader._bm_compressed = 0; - InfoHeader._bm_hor_resolution = 0; - InfoHeader._bm_image_height = ImSize.height; - InfoHeader._bm_image_width = ImSize.width; - InfoHeader._bm_info_header_size = 40; - InfoHeader._bm_num_colors_used = 0; - InfoHeader._bm_num_important_colors = 0; - InfoHeader._bm_num_of_planes = 1; - InfoHeader._bm_ver_resolution = 0; - - fwrite(&FileHeader, sizeof(BMPFileHeader), 1, fp); - fwrite(&InfoHeader, sizeof(BMPInfoHeader), 1, fp); - - for (int i = ImSize.height - 1; i >= 0; i--) { - for (int j = 0; j < ImSize.width; j++) { - fwrite(&(Img[i * Stride + j]), 1, 1, fp); - fwrite(&(Img[i * Stride + j]), 1, 1, fp); - fwrite(&(Img[i * Stride + j]), 1, 1, fp); + if (fp == NULL) { + return; } - } - fclose(fp); + BMPFileHeader FileHeader; + BMPInfoHeader InfoHeader; + + // init headers + FileHeader._bm_signature = 0x4D42; + FileHeader._bm_file_size = 54 + 3 * ImSize.width * ImSize.height; + FileHeader._bm_reserved = 0; + FileHeader._bm_bitmap_data = 0x36; + InfoHeader._bm_bitmap_size = 0; + InfoHeader._bm_color_depth = 24; + InfoHeader._bm_compressed = 0; + InfoHeader._bm_hor_resolution = 0; + 
InfoHeader._bm_image_height = ImSize.height; + InfoHeader._bm_image_width = ImSize.width; + InfoHeader._bm_info_header_size = 40; + InfoHeader._bm_num_colors_used = 0; + InfoHeader._bm_num_important_colors = 0; + InfoHeader._bm_num_of_planes = 1; + InfoHeader._bm_ver_resolution = 0; + + fwrite(&FileHeader, sizeof(BMPFileHeader), 1, fp); + fwrite(&InfoHeader, sizeof(BMPInfoHeader), 1, fp); + + for (int i = ImSize.height - 1; i >= 0; i--) { + for (int j = 0; j < ImSize.width; j++) { + fwrite(&(Img[i * Stride + j]), 1, 1, fp); + fwrite(&(Img[i * Stride + j]), 1, 1, fp); + fwrite(&(Img[i * Stride + j]), 1, 1, fp); + } + } + + fclose(fp); } /** @@ -374,18 +384,19 @@ void DumpBmpAsGray(char *FileName, byte *Img, int Stride, ROI ImSize) { * * \return None */ -void DumpBlockF(float *PlaneF, int StrideF, char *Fname) { - FILE *fp = fopen(Fname, "wb"); +void DumpBlockF(float *PlaneF, int StrideF, char *Fname) +{ + FILE *fp = fopen(Fname, "wb"); - for (int i = 0; i < 8; i++) { - for (int j = 0; j < 8; j++) { - fprintf(fp, "%.*f ", 14, PlaneF[i * StrideF + j]); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + fprintf(fp, "%.*f ", 14, PlaneF[i * StrideF + j]); + } + + fprintf(fp, "\n"); } - fprintf(fp, "\n"); - } - - fclose(fp); + fclose(fp); } /** @@ -398,18 +409,19 @@ void DumpBlockF(float *PlaneF, int StrideF, char *Fname) { * * \return None */ -void DumpBlock(byte *Plane, int Stride, char *Fname) { - FILE *fp = fopen(Fname, "wb"); +void DumpBlock(byte *Plane, int Stride, char *Fname) +{ + FILE *fp = fopen(Fname, "wb"); - for (int i = 0; i < 8; i++) { - for (int j = 0; j < 8; j++) { - fprintf(fp, "%.3d ", Plane[i * Stride + j]); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + fprintf(fp, "%.3d ", Plane[i * Stride + j]); + } + + fprintf(fp, "\n"); } - fprintf(fp, "\n"); - } - - fclose(fp); + fclose(fp); } /** @@ -423,18 +435,19 @@ void DumpBlock(byte *Plane, int Stride, char *Fname) { * * \return Mean Square Error between images */ -float CalculateMSE(byte *Img1, byte *Img2, int Stride, ROI Size) { - uint32 Acc = 0; +float CalculateMSE(byte *Img1, byte *Img2, int Stride, ROI Size) +{ + uint32 Acc = 0; - for (int i = 0; i < Size.height; i++) { - for (int j = 0; j < Size.width; j++) { - int TmpDiff = Img1[i * Stride + j] - Img2[i * Stride + j]; - TmpDiff *= TmpDiff; - Acc += TmpDiff; + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + int TmpDiff = Img1[i * Stride + j] - Img2[i * Stride + j]; + TmpDiff *= TmpDiff; + Acc += TmpDiff; + } } - } - return ((float)Acc) / (Size.height * Size.width); + return ((float)Acc) / (Size.height * Size.width); } /** @@ -449,7 +462,8 @@ float CalculateMSE(byte *Img1, byte *Img2, int Stride, ROI Size) { * * \return Peak Signal to Noise Ratio between images */ -float CalculatePSNR(byte *Img1, byte *Img2, int Stride, ROI Size) { - float MSE = CalculateMSE(Img1, Img2, Stride, Size); - return 10 * log10(255 * 255 / MSE); +float CalculatePSNR(byte *Img1, byte *Img2, int Stride, ROI Size) +{ + float MSE = CalculateMSE(Img1, Img2, Stride, Size); + return 10 * log10(255 * 255 / MSE); } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.h b/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.h index 54ccff78..4b6dfe73 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.h +++ b/Samples/2_Concepts_and_Techniques/dct8x8/BmpUtil.h @@ -42,43 +42,45 @@ #pragma pack(1) -typedef char int8; -typedef short int16; -typedef int int32; -typedef unsigned char uint8; +typedef char int8; +typedef short int16; 
+typedef int int32; +typedef unsigned char uint8; typedef unsigned short uint16; -typedef unsigned int uint32; +typedef unsigned int uint32; /** -* \brief Bitmap file header structure -* -* Bitmap file header structure -*/ -typedef struct { - uint16 _bm_signature; //!< File signature, must be "BM" - uint32 _bm_file_size; //!< File size - uint32 _bm_reserved; //!< Reserved, must be zero - uint32 _bm_bitmap_data; //!< Bitmap data + * \brief Bitmap file header structure + * + * Bitmap file header structure + */ +typedef struct +{ + uint16 _bm_signature; //!< File signature, must be "BM" + uint32 _bm_file_size; //!< File size + uint32 _bm_reserved; //!< Reserved, must be zero + uint32 _bm_bitmap_data; //!< Bitmap data } BMPFileHeader; /** -* \brief Bitmap info header structure -* -* Bitmap info header structure -*/ -typedef struct { - uint32 _bm_info_header_size; //!< Info header size, must be 40 - uint32 _bm_image_width; //!< Image width - uint32 _bm_image_height; //!< Image height - uint16 _bm_num_of_planes; //!< Amount of image planes, must be 1 - uint16 _bm_color_depth; //!< Color depth - uint32 _bm_compressed; //!< Image compression, must be none - uint32 _bm_bitmap_size; //!< Size of bitmap data - uint32 _bm_hor_resolution; //!< Horizontal resolution, assumed to be 0 - uint32 _bm_ver_resolution; //!< Vertical resolution, assumed to be 0 - uint32 _bm_num_colors_used; //!< Number of colors used, assumed to be 0 - uint32 _bm_num_important_colors; //!< Number of important colors, assumed to - //!be 0 + * \brief Bitmap info header structure + * + * Bitmap info header structure + */ +typedef struct +{ + uint32 _bm_info_header_size; //!< Info header size, must be 40 + uint32 _bm_image_width; //!< Image width + uint32 _bm_image_height; //!< Image height + uint16 _bm_num_of_planes; //!< Amount of image planes, must be 1 + uint16 _bm_color_depth; //!< Color depth + uint32 _bm_compressed; //!< Image compression, must be none + uint32 _bm_bitmap_size; //!< Size of bitmap data + uint32 _bm_hor_resolution; //!< Horizontal resolution, assumed to be 0 + uint32 _bm_ver_resolution; //!< Vertical resolution, assumed to be 0 + uint32 _bm_num_colors_used; //!< Number of colors used, assumed to be 0 + uint32 _bm_num_important_colors; //!< Number of important colors, assumed to + //! 
be 0 } BMPInfoHeader; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) @@ -88,38 +90,38 @@ typedef struct { #endif /** -* \brief Simple 2D size / region_of_interest structure -* -* Simple 2D size / region_of_interest structure -*/ -typedef struct { - int width; //!< ROI width - int height; //!< ROI height + * \brief Simple 2D size / region_of_interest structure + * + * Simple 2D size / region_of_interest structure + */ +typedef struct +{ + int width; //!< ROI width + int height; //!< ROI height } ROI; /** -* One-byte unsigned integer type -*/ + * One-byte unsigned integer type + */ typedef unsigned char byte; -extern "C" { -int clamp_0_255(int x); -float round_f(float num); -byte *MallocPlaneByte(int width, int height, int *pStepBytes); -short *MallocPlaneShort(int width, int height, int *pStepBytes); -float *MallocPlaneFloat(int width, int height, int *pStepBytes); -void CopyByte2Float(byte *ImgSrc, int StrideB, float *ImgDst, int StrideF, - ROI Size); -void CopyFloat2Byte(float *ImgSrc, int StrideF, byte *ImgDst, int StrideB, - ROI Size); -void FreePlane(void *ptr); -void AddFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size); -void MulFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size); -int PreLoadBmp(char *FileName, int *Width, int *Height); -void LoadBmpAsGray(char *FileName, int Stride, ROI ImSize, byte *Img); -void DumpBmpAsGray(char *FileName, byte *Img, int Stride, ROI ImSize); -void DumpBlockF(float *PlaneF, int StrideF, char *Fname); -void DumpBlock(byte *Plane, int Stride, char *Fname); -float CalculateMSE(byte *Img1, byte *Img2, int Stride, ROI Size); -float CalculatePSNR(byte *Img1, byte *Img2, int Stride, ROI Size); +extern "C" +{ + int clamp_0_255(int x); + float round_f(float num); + byte *MallocPlaneByte(int width, int height, int *pStepBytes); + short *MallocPlaneShort(int width, int height, int *pStepBytes); + float *MallocPlaneFloat(int width, int height, int *pStepBytes); + void CopyByte2Float(byte *ImgSrc, int StrideB, float *ImgDst, int StrideF, ROI Size); + void CopyFloat2Byte(float *ImgSrc, int StrideF, byte *ImgDst, int StrideB, ROI Size); + void FreePlane(void *ptr); + void AddFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size); + void MulFloatPlane(float Value, float *ImgSrcDst, int StrideF, ROI Size); + int PreLoadBmp(char *FileName, int *Width, int *Height); + void LoadBmpAsGray(char *FileName, int Stride, ROI ImSize, byte *Img); + void DumpBmpAsGray(char *FileName, byte *Img, int Stride, ROI ImSize); + void DumpBlockF(float *PlaneF, int StrideF, char *Fname); + void DumpBlock(byte *Plane, int Stride, char *Fname); + float CalculateMSE(byte *Img1, byte *Img2, int Stride, ROI Size); + float CalculatePSNR(byte *Img1, byte *Img2, int Stride, ROI Size); } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/Common.h b/Samples/2_Concepts_and_Techniques/dct8x8/Common.h index 5b432491..8a3129bd 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/Common.h +++ b/Samples/2_Concepts_and_Techniques/dct8x8/Common.h @@ -35,47 +35,46 @@ #pragma once +#include <cuda_runtime.h> +#include <helper_cuda.h> // helper functions for CUDA timing and initialization +#include <helper_functions.h> // helper functions for timing, string parsing +#include <math.h> #include <stdio.h> #include <stdlib.h> -#include <math.h> - -#include <cuda_runtime.h> -#include <helper_cuda.h> // helper functions for CUDA timing and initialization -#include <helper_functions.h> // helper functions for timing, string parsing /** -* The dimension of pixels block -*/ + * The dimension of pixels block + */ #define BLOCK_SIZE 8 /** -* Square of dimension of pixels block -*/ + * Square of 
dimension of pixels block + */ #define BLOCK_SIZE2 64 /** -* log_2{BLOCK_SIZE), used for quick multiplication or division by the -* pixels block dimension via shifting -*/ + * log_2{BLOCK_SIZE), used for quick multiplication or division by the + * pixels block dimension via shifting + */ #define BLOCK_SIZE_LOG2 3 /** -* log_2{BLOCK_SIZE*BLOCK_SIZE), used for quick multiplication or division by -* the -* square of pixels block via shifting -*/ + * log_2{BLOCK_SIZE*BLOCK_SIZE), used for quick multiplication or division by + * the + * square of pixels block via shifting + */ #define BLOCK_SIZE2_LOG2 6 /** -* This macro states that __mul24 operation is performed faster that traditional -* multiplication for two integers on CUDA. Please undefine if it appears to be -* wrong on your system -*/ + * This macro states that __mul24 operation is performed faster that traditional + * multiplication for two integers on CUDA. Please undefine if it appears to be + * wrong on your system + */ #define __MUL24_FASTER_THAN_ASTERIX /** -* Wrapper to the fastest integer multiplication function on CUDA -*/ + * Wrapper to the fastest integer multiplication function on CUDA + */ #ifdef __MUL24_FASTER_THAN_ASTERIX #define FMUL(x, y) (__mul24(x, y)) #else @@ -83,6 +82,6 @@ #endif /** -* This macro allows using aligned memory management -*/ -//#define __ALLOW_ALIGNED_MEMORY_MANAGEMENT + * This macro allows using aligned memory management + */ +// #define __ALLOW_ALIGNED_MEMORY_MANAGEMENT diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.cpp b/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.cpp index 9566eae3..0cb71e53 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.cpp +++ b/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.cpp @@ -38,47 +38,53 @@ * The routine that performs quantization of coefficients is also included. 
*/ -#include "Common.h" #include "BmpUtil.h" +#include "Common.h" /** * This unitary matrix performs DCT of rows of the matrix to the left */ const float DCTv8matrix[BLOCK_SIZE2] = { - 0.3535533905932738f, 0.4903926402016152f, 0.4619397662556434f, 0.4157348061512726f, 0.3535533905932738f, 0.2777851165098011f, 0.1913417161825449f, 0.0975451610080642f, - 0.3535533905932738f, 0.4157348061512726f, 0.1913417161825449f, -0.0975451610080641f, -0.3535533905932737f, -0.4903926402016152f, -0.4619397662556434f, -0.2777851165098011f, - 0.3535533905932738f, 0.2777851165098011f, -0.1913417161825449f, -0.4903926402016152f, -0.3535533905932738f, 0.0975451610080642f, 0.4619397662556433f, 0.4157348061512727f, - 0.3535533905932738f, 0.0975451610080642f, -0.4619397662556434f, -0.2777851165098011f, 0.3535533905932737f, 0.4157348061512727f, -0.1913417161825450f, -0.4903926402016153f, - 0.3535533905932738f, -0.0975451610080641f, -0.4619397662556434f, 0.2777851165098009f, 0.3535533905932738f, -0.4157348061512726f, -0.1913417161825453f, 0.4903926402016152f, - 0.3535533905932738f, -0.2777851165098010f, -0.1913417161825452f, 0.4903926402016153f, -0.3535533905932733f, -0.0975451610080649f, 0.4619397662556437f, -0.4157348061512720f, - 0.3535533905932738f, -0.4157348061512727f, 0.1913417161825450f, 0.0975451610080640f, -0.3535533905932736f, 0.4903926402016152f, -0.4619397662556435f, 0.2777851165098022f, - 0.3535533905932738f, -0.4903926402016152f, 0.4619397662556433f, -0.4157348061512721f, 0.3535533905932733f, -0.2777851165098008f, 0.1913417161825431f, -0.0975451610080625f}; + 0.3535533905932738f, 0.4903926402016152f, 0.4619397662556434f, 0.4157348061512726f, 0.3535533905932738f, + 0.2777851165098011f, 0.1913417161825449f, 0.0975451610080642f, 0.3535533905932738f, 0.4157348061512726f, + 0.1913417161825449f, -0.0975451610080641f, -0.3535533905932737f, -0.4903926402016152f, -0.4619397662556434f, + -0.2777851165098011f, 0.3535533905932738f, 0.2777851165098011f, -0.1913417161825449f, -0.4903926402016152f, + -0.3535533905932738f, 0.0975451610080642f, 0.4619397662556433f, 0.4157348061512727f, 0.3535533905932738f, + 0.0975451610080642f, -0.4619397662556434f, -0.2777851165098011f, 0.3535533905932737f, 0.4157348061512727f, + -0.1913417161825450f, -0.4903926402016153f, 0.3535533905932738f, -0.0975451610080641f, -0.4619397662556434f, + 0.2777851165098009f, 0.3535533905932738f, -0.4157348061512726f, -0.1913417161825453f, 0.4903926402016152f, + 0.3535533905932738f, -0.2777851165098010f, -0.1913417161825452f, 0.4903926402016153f, -0.3535533905932733f, + -0.0975451610080649f, 0.4619397662556437f, -0.4157348061512720f, 0.3535533905932738f, -0.4157348061512727f, + 0.1913417161825450f, 0.0975451610080640f, -0.3535533905932736f, 0.4903926402016152f, -0.4619397662556435f, + 0.2777851165098022f, 0.3535533905932738f, -0.4903926402016152f, 0.4619397662556433f, -0.4157348061512721f, + 0.3535533905932733f, -0.2777851165098008f, 0.1913417161825431f, -0.0975451610080625f}; /** * This unitary matrix performs DCT of columns of the matrix to the right */ const float DCTv8matrixT[BLOCK_SIZE2] = { - 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, - 0.4903926402016152f, 0.4157348061512726f, 0.2777851165098011f, 0.0975451610080642f, -0.0975451610080641f, -0.2777851165098010f, -0.4157348061512727f, -0.4903926402016152f, - 0.4619397662556434f, 0.1913417161825449f, -0.1913417161825449f, -0.4619397662556434f, -0.4619397662556434f, 
-0.1913417161825452f, 0.1913417161825450f, 0.4619397662556433f, - 0.4157348061512726f, -0.0975451610080641f, -0.4903926402016152f, -0.2777851165098011f, 0.2777851165098009f, 0.4903926402016153f, 0.0975451610080640f, -0.4157348061512721f, - 0.3535533905932738f, -0.3535533905932737f, -0.3535533905932738f, 0.3535533905932737f, 0.3535533905932738f, -0.3535533905932733f, -0.3535533905932736f, 0.3535533905932733f, - 0.2777851165098011f, -0.4903926402016152f, 0.0975451610080642f, 0.4157348061512727f, -0.4157348061512726f, -0.0975451610080649f, 0.4903926402016152f, -0.2777851165098008f, - 0.1913417161825449f, -0.4619397662556434f, 0.4619397662556433f, -0.1913417161825450f, -0.1913417161825453f, 0.4619397662556437f, -0.4619397662556435f, 0.1913417161825431f, - 0.0975451610080642f, -0.2777851165098011f, 0.4157348061512727f, -0.4903926402016153f, 0.4903926402016152f, -0.4157348061512720f, 0.2777851165098022f, -0.0975451610080625f}; + 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, + 0.3535533905932738f, 0.3535533905932738f, 0.3535533905932738f, 0.4903926402016152f, 0.4157348061512726f, + 0.2777851165098011f, 0.0975451610080642f, -0.0975451610080641f, -0.2777851165098010f, -0.4157348061512727f, + -0.4903926402016152f, 0.4619397662556434f, 0.1913417161825449f, -0.1913417161825449f, -0.4619397662556434f, + -0.4619397662556434f, -0.1913417161825452f, 0.1913417161825450f, 0.4619397662556433f, 0.4157348061512726f, + -0.0975451610080641f, -0.4903926402016152f, -0.2777851165098011f, 0.2777851165098009f, 0.4903926402016153f, + 0.0975451610080640f, -0.4157348061512721f, 0.3535533905932738f, -0.3535533905932737f, -0.3535533905932738f, + 0.3535533905932737f, 0.3535533905932738f, -0.3535533905932733f, -0.3535533905932736f, 0.3535533905932733f, + 0.2777851165098011f, -0.4903926402016152f, 0.0975451610080642f, 0.4157348061512727f, -0.4157348061512726f, + -0.0975451610080649f, 0.4903926402016152f, -0.2777851165098008f, 0.1913417161825449f, -0.4619397662556434f, + 0.4619397662556433f, -0.1913417161825450f, -0.1913417161825453f, 0.4619397662556437f, -0.4619397662556435f, + 0.1913417161825431f, 0.0975451610080642f, -0.2777851165098011f, 0.4157348061512727f, -0.4903926402016153f, + 0.4903926402016152f, -0.4157348061512720f, 0.2777851165098022f, -0.0975451610080625f}; /** * JPEG quality=0_of_12 quantization matrix */ -float Q_GOLD[BLOCK_SIZE2] = { - 32.f, 33.f, 51.f, 81.f, 66.f, 39.f, 34.f, 17.f, - 33.f, 36.f, 48.f, 47.f, 28.f, 23.f, 12.f, 12.f, - 51.f, 48.f, 47.f, 28.f, 23.f, 12.f, 12.f, 12.f, - 81.f, 47.f, 28.f, 23.f, 12.f, 12.f, 12.f, 12.f, - 66.f, 28.f, 23.f, 12.f, 12.f, 12.f, 12.f, 12.f, - 39.f, 23.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f, - 34.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f, - 17.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f}; +float Q_GOLD[BLOCK_SIZE2] = {32.f, 33.f, 51.f, 81.f, 66.f, 39.f, 34.f, 17.f, 33.f, 36.f, 48.f, 47.f, 28.f, + 23.f, 12.f, 12.f, 51.f, 48.f, 47.f, 28.f, 23.f, 12.f, 12.f, 12.f, 81.f, 47.f, + 28.f, 23.f, 12.f, 12.f, 12.f, 12.f, 66.f, 28.f, 23.f, 12.f, 12.f, 12.f, 12.f, + 12.f, 39.f, 23.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f, 34.f, 12.f, 12.f, 12.f, + 12.f, 12.f, 12.f, 12.f, 17.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f, 12.f}; /** ************************************************************************** @@ -93,23 +99,19 @@ float Q_GOLD[BLOCK_SIZE2] = { * * \return None */ -void mult8x8(const float *M1, int M1Stride, const float *M2, int M2Stride, - float *Mres, int MresStride) +void mult8x8(const float *M1, int M1Stride, const float 
*M2, int M2Stride, float *Mres, int MresStride) { - for (int i = 0; i < BLOCK_SIZE; i++) - { - for (int j = 0; j < BLOCK_SIZE; j++) - { - float accumul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + float accumul = 0; - for (int k = 0; k < BLOCK_SIZE; k++) - { - accumul += M1[i * M1Stride + k] * M2[k * M2Stride + j]; - } + for (int k = 0; k < BLOCK_SIZE; k++) { + accumul += M1[i * M1Stride + k] * M2[k * M2Stride + j]; + } - Mres[i * MresStride + j] = accumul; + Mres[i * MresStride + j] = accumul; + } } - } } /** @@ -125,25 +127,20 @@ void mult8x8(const float *M1, int M1Stride, const float *M2, int M2Stride, * * \return None */ -extern "C" void computeDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, - ROI Size) +extern "C" void computeDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, ROI Size) { - float tmpblock[BLOCK_SIZE2]; + float tmpblock[BLOCK_SIZE2]; - // perform block wise DCT - // DCT(A) = DCTv8matrixT * A * DCTv8matrix - for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) - { - for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) - { - // tmpblock = DCTv8matrixT * A - mult8x8(DCTv8matrixT, BLOCK_SIZE, fSrc + i * Stride + j, Stride, tmpblock, - BLOCK_SIZE); - // DCT(A) = tmpblock * DCTv8matrix - mult8x8(tmpblock, BLOCK_SIZE, DCTv8matrix, BLOCK_SIZE, - fDst + i * Stride + j, Stride); + // perform block wise DCT + // DCT(A) = DCTv8matrixT * A * DCTv8matrix + for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) { + for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) { + // tmpblock = DCTv8matrixT * A + mult8x8(DCTv8matrixT, BLOCK_SIZE, fSrc + i * Stride + j, Stride, tmpblock, BLOCK_SIZE); + // DCT(A) = tmpblock * DCTv8matrix + mult8x8(tmpblock, BLOCK_SIZE, DCTv8matrix, BLOCK_SIZE, fDst + i * Stride + j, Stride); + } } - } } /** @@ -159,25 +156,20 @@ extern "C" void computeDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, * * \return None */ -extern "C" void computeIDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, - ROI Size) +extern "C" void computeIDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, ROI Size) { - float tmpblock[BLOCK_SIZE2]; + float tmpblock[BLOCK_SIZE2]; - // perform block wise IDCT - // IDCT(A) = DCTv8matrix * A * DCTv8matrixT - for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) - { - for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) - { - // tmpblock = DCTv8matrix * A - mult8x8(DCTv8matrix, BLOCK_SIZE, fSrc + i * Stride + j, Stride, tmpblock, - BLOCK_SIZE); - // DCT(A) = tmpblock * DCTv8matrixT; - mult8x8(tmpblock, BLOCK_SIZE, DCTv8matrixT, BLOCK_SIZE, - fDst + i * Stride + j, Stride); + // perform block wise IDCT + // IDCT(A) = DCTv8matrix * A * DCTv8matrixT + for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) { + for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) { + // tmpblock = DCTv8matrix * A + mult8x8(DCTv8matrix, BLOCK_SIZE, fSrc + i * Stride + j, Stride, tmpblock, BLOCK_SIZE); + // DCT(A) = tmpblock * DCTv8matrixT; + mult8x8(tmpblock, BLOCK_SIZE, DCTv8matrixT, BLOCK_SIZE, fDst + i * Stride + j, Stride); + } } - } } /** @@ -193,19 +185,16 @@ extern "C" void computeIDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, */ extern "C" void quantizeGoldFloat(float *fSrcDst, int Stride, ROI Size) { - // perform block wise in-place quantization using Q_GOLD - // Q_GOLD(A) = round(A ./ Q_GOLD) .* Q_GOLD; - for (int i = 0; i < Size.height; i++) - { - for (int j = 0; j < 
Size.width; j++) - { - int qx = j % BLOCK_SIZE; - int qy = i % BLOCK_SIZE; - float quantized = - round_f(fSrcDst[i * Stride + j] / Q_GOLD[(qy << BLOCK_SIZE_LOG2) + qx]); - fSrcDst[i * Stride + j] = quantized * Q_GOLD[(qy << BLOCK_SIZE_LOG2) + qx]; + // perform block wise in-place quantization using Q_GOLD + // Q_GOLD(A) = round(A ./ Q_GOLD) .* Q_GOLD; + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + int qx = j % BLOCK_SIZE; + int qy = i % BLOCK_SIZE; + float quantized = round_f(fSrcDst[i * Stride + j] / Q_GOLD[(qy << BLOCK_SIZE_LOG2) + qx]); + fSrcDst[i * Stride + j] = quantized * Q_GOLD[(qy << BLOCK_SIZE_LOG2) + qx]; + } } - } } /** @@ -221,33 +210,29 @@ extern "C" void quantizeGoldFloat(float *fSrcDst, int Stride, ROI Size) */ void quantizeGoldShort(short *fSrcDst, int Stride, ROI Size) { - // perform block wise in-place quantization using Q_GOLD - // Q_GOLD(A) = round(A ./ Q_GOLD) .* Q_GOLD; - for (int i = 0; i < Size.height; i++) - { - for (int j = 0; j < Size.width; j++) - { - int qx = j % BLOCK_SIZE; - int qy = i % BLOCK_SIZE; - short temp = fSrcDst[i * Stride + j]; - short quant = (short)(Q_GOLD[(qy << BLOCK_SIZE_LOG2) + qx]); + // perform block wise in-place quantization using Q_GOLD + // Q_GOLD(A) = round(A ./ Q_GOLD) .* Q_GOLD; + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + int qx = j % BLOCK_SIZE; + int qy = i % BLOCK_SIZE; + short temp = fSrcDst[i * Stride + j]; + short quant = (short)(Q_GOLD[(qy << BLOCK_SIZE_LOG2) + qx]); - if (temp < 0) - { - temp = -temp; - temp += quant >> 1; - temp /= quant; - temp = -temp; - } - else - { - temp += quant >> 1; - temp /= quant; - } + if (temp < 0) { + temp = -temp; + temp += quant >> 1; + temp /= quant; + temp = -temp; + } + else { + temp += quant >> 1; + temp /= quant; + } - fSrcDst[i * Stride + j] = temp * quant; + fSrcDst[i * Stride + j] = temp * quant; + } } - } } // Used in forward and inverse DCT. 
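Aside (editorial, not part of the patch): the two quantization hunks above implement the same round-trip,
Q(a) = round(a / q) * q, once in floating point (quantizeGoldFloat) and once in pure integer arithmetic
(quantizeGoldShort), which rounds to nearest by adding half the quantizer to the magnitude before the
truncating division. A minimal standalone sketch of both paths, assuming the sample's round_f helper
rounds half away from zero like C's roundf; the coefficient -90 and the quantizer 32 (the top-left entry
of Q_GOLD) are arbitrary example values:

#include <math.h>
#include <stdio.h>

/* float path: divide, round to nearest, multiply back */
static float quantize_float(float a, float q) { return roundf(a / q) * q; }

/* integer path, as in quantizeGoldShort above: take the magnitude, add half
 * the quantizer so the truncating division rounds to nearest, restore the
 * sign, then multiply back */
static short quantize_short(short a, short q)
{
    short t = (short)((a < 0) ? -a : a);
    t = (short)((t + (q >> 1)) / q);
    if (a < 0) {
        t = (short)(-t);
    }
    return (short)(t * q);
}

int main(void)
{
    printf("float: %.1f\n", quantize_float(-90.0f, 32.0f)); /* prints -96.0 */
    printf("short: %d\n", quantize_short(-90, 32));         /* prints -96 */
    return 0;
}

Both paths print -96 here: -90 / 32 = -2.8125 rounds to -3, and -3 * 32 = -96. The integer variant is the
gold reference for the 16-bit pipeline (dct8x8_kernel_short.cuh), which quantizes without converting
coefficients to float.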
@@ -276,37 +261,32 @@ float C_norm = 0.3535533905932737f; // 1 / (8^0.5) * * \return None */ -void SubroutineDCTvector(float *FirstIn, int StepIn, float *FirstOut, - int StepOut) +void SubroutineDCTvector(float *FirstIn, int StepIn, float *FirstOut, int StepOut) { - float X07P = FirstIn[0 * StepIn] + FirstIn[7 * StepIn]; - float X16P = FirstIn[1 * StepIn] + FirstIn[6 * StepIn]; - float X25P = FirstIn[2 * StepIn] + FirstIn[5 * StepIn]; - float X34P = FirstIn[3 * StepIn] + FirstIn[4 * StepIn]; + float X07P = FirstIn[0 * StepIn] + FirstIn[7 * StepIn]; + float X16P = FirstIn[1 * StepIn] + FirstIn[6 * StepIn]; + float X25P = FirstIn[2 * StepIn] + FirstIn[5 * StepIn]; + float X34P = FirstIn[3 * StepIn] + FirstIn[4 * StepIn]; - float X07M = FirstIn[0 * StepIn] - FirstIn[7 * StepIn]; - float X61M = FirstIn[6 * StepIn] - FirstIn[1 * StepIn]; - float X25M = FirstIn[2 * StepIn] - FirstIn[5 * StepIn]; - float X43M = FirstIn[4 * StepIn] - FirstIn[3 * StepIn]; + float X07M = FirstIn[0 * StepIn] - FirstIn[7 * StepIn]; + float X61M = FirstIn[6 * StepIn] - FirstIn[1 * StepIn]; + float X25M = FirstIn[2 * StepIn] - FirstIn[5 * StepIn]; + float X43M = FirstIn[4 * StepIn] - FirstIn[3 * StepIn]; - float X07P34PP = X07P + X34P; - float X07P34PM = X07P - X34P; - float X16P25PP = X16P + X25P; - float X16P25PM = X16P - X25P; + float X07P34PP = X07P + X34P; + float X07P34PM = X07P - X34P; + float X16P25PP = X16P + X25P; + float X16P25PM = X16P - X25P; - FirstOut[0 * StepOut] = C_norm * (X07P34PP + X16P25PP); - FirstOut[2 * StepOut] = C_norm * (C_b * X07P34PM + C_e * X16P25PM); - FirstOut[4 * StepOut] = C_norm * (X07P34PP - X16P25PP); - FirstOut[6 * StepOut] = C_norm * (C_e * X07P34PM - C_b * X16P25PM); + FirstOut[0 * StepOut] = C_norm * (X07P34PP + X16P25PP); + FirstOut[2 * StepOut] = C_norm * (C_b * X07P34PM + C_e * X16P25PM); + FirstOut[4 * StepOut] = C_norm * (X07P34PP - X16P25PP); + FirstOut[6 * StepOut] = C_norm * (C_e * X07P34PM - C_b * X16P25PM); - FirstOut[1 * StepOut] = - C_norm * (C_a * X07M - C_c * X61M + C_d * X25M - C_f * X43M); - FirstOut[3 * StepOut] = - C_norm * (C_c * X07M + C_f * X61M - C_a * X25M + C_d * X43M); - FirstOut[5 * StepOut] = - C_norm * (C_d * X07M + C_a * X61M + C_f * X25M - C_c * X43M); - FirstOut[7 * StepOut] = - C_norm * (C_f * X07M + C_d * X61M + C_c * X25M + C_a * X43M); + FirstOut[1 * StepOut] = C_norm * (C_a * X07M - C_c * X61M + C_d * X25M - C_f * X43M); + FirstOut[3 * StepOut] = C_norm * (C_c * X07M + C_f * X61M - C_a * X25M + C_d * X43M); + FirstOut[5 * StepOut] = C_norm * (C_d * X07M + C_a * X61M + C_f * X25M - C_c * X43M); + FirstOut[7 * StepOut] = C_norm * (C_f * X07M + C_d * X61M + C_c * X25M + C_a * X43M); } /** @@ -322,38 +302,37 @@ void SubroutineDCTvector(float *FirstIn, int StepIn, float *FirstOut, * * \return None */ -void SubroutineIDCTvector(float *FirstIn, int StepIn, float *FirstOut, - int StepOut) +void SubroutineIDCTvector(float *FirstIn, int StepIn, float *FirstOut, int StepOut) { - float Y04P = FirstIn[0 * StepIn] + FirstIn[4 * StepIn]; - float Y2b6eP = C_b * FirstIn[2 * StepIn] + C_e * FirstIn[6 * StepIn]; + float Y04P = FirstIn[0 * StepIn] + FirstIn[4 * StepIn]; + float Y2b6eP = C_b * FirstIn[2 * StepIn] + C_e * FirstIn[6 * StepIn]; - float Y04P2b6ePP = Y04P + Y2b6eP; - float Y04P2b6ePM = Y04P - Y2b6eP; - float Y7f1aP3c5dPP = C_f * FirstIn[7 * StepIn] + C_a * FirstIn[1 * StepIn] + - C_c * FirstIn[3 * StepIn] + C_d * FirstIn[5 * StepIn]; - float Y7a1fM3d5cMP = C_a * FirstIn[7 * StepIn] - C_f * FirstIn[1 * StepIn] + - C_d * FirstIn[3 * StepIn] - C_c * 
FirstIn[5 * StepIn]; + float Y04P2b6ePP = Y04P + Y2b6eP; + float Y04P2b6ePM = Y04P - Y2b6eP; + float Y7f1aP3c5dPP = + C_f * FirstIn[7 * StepIn] + C_a * FirstIn[1 * StepIn] + C_c * FirstIn[3 * StepIn] + C_d * FirstIn[5 * StepIn]; + float Y7a1fM3d5cMP = + C_a * FirstIn[7 * StepIn] - C_f * FirstIn[1 * StepIn] + C_d * FirstIn[3 * StepIn] - C_c * FirstIn[5 * StepIn]; - float Y04M = FirstIn[0 * StepIn] - FirstIn[4 * StepIn]; - float Y2e6bM = C_e * FirstIn[2 * StepIn] - C_b * FirstIn[6 * StepIn]; + float Y04M = FirstIn[0 * StepIn] - FirstIn[4 * StepIn]; + float Y2e6bM = C_e * FirstIn[2 * StepIn] - C_b * FirstIn[6 * StepIn]; - float Y04M2e6bMP = Y04M + Y2e6bM; - float Y04M2e6bMM = Y04M - Y2e6bM; - float Y1c7dM3f5aPM = C_c * FirstIn[1 * StepIn] - C_d * FirstIn[7 * StepIn] - - C_f * FirstIn[3 * StepIn] - C_a * FirstIn[5 * StepIn]; - float Y1d7cP3a5fMM = C_d * FirstIn[1 * StepIn] + C_c * FirstIn[7 * StepIn] - - C_a * FirstIn[3 * StepIn] + C_f * FirstIn[5 * StepIn]; + float Y04M2e6bMP = Y04M + Y2e6bM; + float Y04M2e6bMM = Y04M - Y2e6bM; + float Y1c7dM3f5aPM = + C_c * FirstIn[1 * StepIn] - C_d * FirstIn[7 * StepIn] - C_f * FirstIn[3 * StepIn] - C_a * FirstIn[5 * StepIn]; + float Y1d7cP3a5fMM = + C_d * FirstIn[1 * StepIn] + C_c * FirstIn[7 * StepIn] - C_a * FirstIn[3 * StepIn] + C_f * FirstIn[5 * StepIn]; - FirstOut[0 * StepOut] = C_norm * (Y04P2b6ePP + Y7f1aP3c5dPP); - FirstOut[7 * StepOut] = C_norm * (Y04P2b6ePP - Y7f1aP3c5dPP); - FirstOut[4 * StepOut] = C_norm * (Y04P2b6ePM + Y7a1fM3d5cMP); - FirstOut[3 * StepOut] = C_norm * (Y04P2b6ePM - Y7a1fM3d5cMP); + FirstOut[0 * StepOut] = C_norm * (Y04P2b6ePP + Y7f1aP3c5dPP); + FirstOut[7 * StepOut] = C_norm * (Y04P2b6ePP - Y7f1aP3c5dPP); + FirstOut[4 * StepOut] = C_norm * (Y04P2b6ePM + Y7a1fM3d5cMP); + FirstOut[3 * StepOut] = C_norm * (Y04P2b6ePM - Y7a1fM3d5cMP); - FirstOut[1 * StepOut] = C_norm * (Y04M2e6bMP + Y1c7dM3f5aPM); - FirstOut[5 * StepOut] = C_norm * (Y04M2e6bMM - Y1d7cP3a5fMM); - FirstOut[2 * StepOut] = C_norm * (Y04M2e6bMM + Y1d7cP3a5fMM); - FirstOut[6 * StepOut] = C_norm * (Y04M2e6bMP - Y1c7dM3f5aPM); + FirstOut[1 * StepOut] = C_norm * (Y04M2e6bMP + Y1c7dM3f5aPM); + FirstOut[5 * StepOut] = C_norm * (Y04M2e6bMM - Y1d7cP3a5fMM); + FirstOut[2 * StepOut] = C_norm * (Y04M2e6bMM + Y1d7cP3a5fMM); + FirstOut[6 * StepOut] = C_norm * (Y04M2e6bMP - Y1c7dM3f5aPM); } /** @@ -369,28 +348,21 @@ void SubroutineIDCTvector(float *FirstIn, int StepIn, float *FirstOut, * * \return None */ -extern "C" void computeDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, - ROI Size) +extern "C" void computeDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, ROI Size) { - for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) - { - for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) - { - // process rows - for (int k = 0; k < BLOCK_SIZE; k++) - { - SubroutineDCTvector((float *)fSrc + (i + k) * Stride + j, 1, - fDst + (i + k) * Stride + j, 1); - } + for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) { + for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) { + // process rows + for (int k = 0; k < BLOCK_SIZE; k++) { + SubroutineDCTvector((float *)fSrc + (i + k) * Stride + j, 1, fDst + (i + k) * Stride + j, 1); + } - // process columns - for (int k = 0; k < BLOCK_SIZE; k++) - { - SubroutineDCTvector(fDst + i * Stride + (j + k), Stride, - fDst + i * Stride + (j + k), Stride); - } + // process columns + for (int k = 0; k < BLOCK_SIZE; k++) { + SubroutineDCTvector(fDst + i * Stride + (j + k), Stride, fDst + i * Stride 
+ (j + k), Stride); + } + } } - } } /** @@ -406,26 +378,19 @@ extern "C" void computeDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, * * \return None */ -extern "C" void computeIDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, - ROI Size) +extern "C" void computeIDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, ROI Size) { - for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) - { - for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) - { - // process rows - for (int k = 0; k < BLOCK_SIZE; k++) - { - SubroutineIDCTvector((float *)fSrc + (i + k) * Stride + j, 1, - fDst + (i + k) * Stride + j, 1); - } + for (int i = 0; i + BLOCK_SIZE - 1 < Size.height; i += BLOCK_SIZE) { + for (int j = 0; j + BLOCK_SIZE - 1 < Size.width; j += BLOCK_SIZE) { + // process rows + for (int k = 0; k < BLOCK_SIZE; k++) { + SubroutineIDCTvector((float *)fSrc + (i + k) * Stride + j, 1, fDst + (i + k) * Stride + j, 1); + } - // process columns - for (int k = 0; k < BLOCK_SIZE; k++) - { - SubroutineIDCTvector(fDst + i * Stride + (j + k), Stride, - fDst + i * Stride + (j + k), Stride); - } + // process columns + for (int k = 0; k < BLOCK_SIZE; k++) { + SubroutineIDCTvector(fDst + i * Stride + (j + k), Stride, fDst + i * Stride + (j + k), Stride); + } + } } - } } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.h b/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.h index fd1da6a0..b338c893 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.h +++ b/Samples/2_Concepts_and_Techniques/dct8x8/DCT8x8_Gold.h @@ -39,11 +39,12 @@ #include "BmpUtil.h" -extern "C" { -void computeDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, ROI Size); -void computeIDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, ROI Size); -void quantizeGoldFloat(float *fSrcDst, int Stride, ROI Size); -void quantizeGoldShort(short *fSrcDst, int Stride, ROI Size); -void computeDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, ROI Size); -void computeIDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, ROI Size); +extern "C" +{ + void computeDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, ROI Size); + void computeIDCT8x8Gold1(const float *fSrc, float *fDst, int Stride, ROI Size); + void quantizeGoldFloat(float *fSrcDst, int Stride, ROI Size); + void quantizeGoldShort(short *fSrcDst, int Stride, ROI Size); + void computeDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, ROI Size); + void computeIDCT8x8Gold2(const float *fSrc, float *fDst, int Stride, ROI Size); } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8.cu b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8.cu index d290787e..b53580c0 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8.cu +++ b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8.cu @@ -39,25 +39,25 @@ * 3. Output execution timings and calculate CUDA speedup. 
 */
 
+#include "BmpUtil.h"
 #include "Common.h"
 #include "DCT8x8_Gold.h"
-#include "BmpUtil.h"
 
 /**
-* The number of DCT kernel calls
-*/
+ * The number of DCT kernel calls
+ */
 #define BENCHMARK_SIZE 10
 
 /**
-* The PSNR values over this threshold indicate images equality
-*/
+ * The PSNR values over this threshold indicate image equality
+ */
 #define PSNR_THRESHOLD_EQUAL 40
 
 // includes kernels
 #include "dct8x8_kernel1.cuh"
 #include "dct8x8_kernel2.cuh"
-#include "dct8x8_kernel_short.cuh"
 #include "dct8x8_kernel_quantization.cuh"
+#include "dct8x8_kernel_short.cuh"
 
 /**
 **************************************************************************
@@ -71,48 +71,49 @@
 *
 * \return Execution time in milliseconds
 */
-float WrapperGold1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
-  // allocate float buffers for DCT and other data
-  int StrideF;
-  float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
-  float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
+float WrapperGold1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
+{
+    // allocate float buffers for DCT and other data
+    int StrideF;
+    float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
+    float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF);
 
-  // convert source image to float representation
-  CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size);
-  AddFloatPlane(-128.0f, ImgF1, StrideF, Size);
+    // convert source image to float representation
+    CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size);
+    AddFloatPlane(-128.0f, ImgF1, StrideF, Size);
 
-  // create and start CUDA timer
-  StopWatchInterface *timerGold = 0;
-  sdkCreateTimer(&timerGold);
-  sdkResetTimer(&timerGold);
+    // create and start CUDA timer
+    StopWatchInterface *timerGold = 0;
+    sdkCreateTimer(&timerGold);
+    sdkResetTimer(&timerGold);
 
-  // perform block-wise DCT processing and benchmarking
-  for (int i = 0; i < BENCHMARK_SIZE; i++) {
-    sdkStartTimer(&timerGold);
-    computeDCT8x8Gold1(ImgF1, ImgF2, StrideF, Size);
-    sdkStopTimer(&timerGold);
-  }
+    // perform block-wise DCT processing and benchmarking
+    for (int i = 0; i < BENCHMARK_SIZE; i++) {
+        sdkStartTimer(&timerGold);
+        computeDCT8x8Gold1(ImgF1, ImgF2, StrideF, Size);
+        sdkStopTimer(&timerGold);
+    }
 
-  // stop and destroy CUDA timer
-  float TimerGoldSpan = sdkGetAverageTimerValue(&timerGold);
-  sdkDeleteTimer(&timerGold);
+    // stop and destroy CUDA timer
+    float TimerGoldSpan = sdkGetAverageTimerValue(&timerGold);
+    sdkDeleteTimer(&timerGold);
 
-  // perform quantization
-  quantizeGoldFloat(ImgF2, StrideF, Size);
+    // perform quantization
+    quantizeGoldFloat(ImgF2, StrideF, Size);
 
-  // perform block-wise IDCT processing
-  computeIDCT8x8Gold1(ImgF2, ImgF1, StrideF, Size);
+    // perform block-wise IDCT processing
+    computeIDCT8x8Gold1(ImgF2, ImgF1, StrideF, Size);
 
-  // convert image back to byte representation
-  AddFloatPlane(128.0f, ImgF1, StrideF, Size);
-  CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size);
+    // convert image back to byte representation
+    AddFloatPlane(128.0f, ImgF1, StrideF, Size);
+    CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size);
 
-  // free float buffers
-  FreePlane(ImgF1);
-  FreePlane(ImgF2);
+    // free float buffers
+    FreePlane(ImgF1);
+    FreePlane(ImgF2);
 
-  // return time taken by the operation
-  return TimerGoldSpan;
+    // return time taken by the operation
+    return TimerGoldSpan;
 }
 
 /**
@@ -127,48 +128,49 @@ float WrapperGold1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) {
 *
 * \return Execution time in milliseconds
 */
-float
WrapperGold2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { - // allocate float buffers for DCT and other data - int StrideF; - float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF); - float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF); +float WrapperGold2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) +{ + // allocate float buffers for DCT and other data + int StrideF; + float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF); + float *ImgF2 = MallocPlaneFloat(Size.width, Size.height, &StrideF); - // convert source image to float representation - CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size); - AddFloatPlane(-128.0f, ImgF1, StrideF, Size); + // convert source image to float representation + CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size); + AddFloatPlane(-128.0f, ImgF1, StrideF, Size); - // create and start CUDA timer - StopWatchInterface *timerGold = 0; - sdkCreateTimer(&timerGold); - sdkResetTimer(&timerGold); + // create and start CUDA timer + StopWatchInterface *timerGold = 0; + sdkCreateTimer(&timerGold); + sdkResetTimer(&timerGold); - // perform block-wise DCT processing and benchmarking - for (int i = 0; i < BENCHMARK_SIZE; i++) { - sdkStartTimer(&timerGold); - computeDCT8x8Gold2(ImgF1, ImgF2, StrideF, Size); - sdkStopTimer(&timerGold); - } + // perform block-wise DCT processing and benchmarking + for (int i = 0; i < BENCHMARK_SIZE; i++) { + sdkStartTimer(&timerGold); + computeDCT8x8Gold2(ImgF1, ImgF2, StrideF, Size); + sdkStopTimer(&timerGold); + } - // stop and destroy CUDA timer - float TimerGoldSpan = sdkGetAverageTimerValue(&timerGold); - sdkDeleteTimer(&timerGold); + // stop and destroy CUDA timer + float TimerGoldSpan = sdkGetAverageTimerValue(&timerGold); + sdkDeleteTimer(&timerGold); - // perform quantization - quantizeGoldFloat(ImgF2, StrideF, Size); + // perform quantization + quantizeGoldFloat(ImgF2, StrideF, Size); - // perform block-wise IDCT processing - computeIDCT8x8Gold2(ImgF2, ImgF1, StrideF, Size); + // perform block-wise IDCT processing + computeIDCT8x8Gold2(ImgF2, ImgF1, StrideF, Size); - // convert image back to byte representation - AddFloatPlane(128.0f, ImgF1, StrideF, Size); - CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size); + // convert image back to byte representation + AddFloatPlane(128.0f, ImgF1, StrideF, Size); + CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size); - // free float buffers - FreePlane(ImgF1); - FreePlane(ImgF2); + // free float buffers + FreePlane(ImgF1); + FreePlane(ImgF2); - // return time taken by the operation - return TimerGoldSpan; + // return time taken by the operation + return TimerGoldSpan; } /** @@ -183,101 +185,109 @@ float WrapperGold2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { * * \return Execution time in milliseconds */ -float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { - // prepare channel format descriptor for passing texture into kernels - cudaChannelFormatDesc floattex = cudaCreateChannelDesc(); +float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) +{ + // prepare channel format descriptor for passing texture into kernels + cudaChannelFormatDesc floattex = cudaCreateChannelDesc(); - // allocate device memory - cudaArray *Src; - float *Dst; - size_t DstStride; - checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height)); - checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride, - Size.width * sizeof(float), Size.height)); - DstStride /= sizeof(float); + // allocate device memory + 
cudaArray *Src; + float *Dst; + size_t DstStride; + checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height)); + checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height)); + DstStride /= sizeof(float); - // convert source image to float representation - int ImgSrcFStride; - float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride); - CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size); - AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size); + // convert source image to float representation + int ImgSrcFStride; + float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride); + CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size); + AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size); - // copy from host memory to device - checkCudaErrors(cudaMemcpy2DToArray( - Src, 0, 0, ImgSrcF, ImgSrcFStride * sizeof(float), - Size.width * sizeof(float), Size.height, cudaMemcpyHostToDevice)); + // copy from host memory to device + checkCudaErrors(cudaMemcpy2DToArray(Src, + 0, + 0, + ImgSrcF, + ImgSrcFStride * sizeof(float), + Size.width * sizeof(float), + Size.height, + cudaMemcpyHostToDevice)); - // setup execution parameters - dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE); + // setup execution parameters + dim3 threads(BLOCK_SIZE, BLOCK_SIZE); + dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE); - // create and start CUDA timer - StopWatchInterface *timerCUDA = 0; - sdkCreateTimer(&timerCUDA); - sdkResetTimer(&timerCUDA); + // create and start CUDA timer + StopWatchInterface *timerCUDA = 0; + sdkCreateTimer(&timerCUDA); + sdkResetTimer(&timerCUDA); - // execute DCT kernel and benchmark - cudaTextureObject_t TexSrc; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + // execute DCT kernel and benchmark + cudaTextureObject_t TexSrc; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = Src; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = Src; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&TexSrc, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&TexSrc, &texRes, &texDescr, NULL)); - for (int i = 0; i < BENCHMARK_SIZE; i++) { - sdkStartTimer(&timerCUDA); - CUDAkernel1DCT<<>>(Dst, (int)DstStride, 0, 0, TexSrc); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timerCUDA); - } + for (int i = 0; i < BENCHMARK_SIZE; i++) { + sdkStartTimer(&timerCUDA); + CUDAkernel1DCT<<>>(Dst, (int)DstStride, 0, 0, TexSrc); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timerCUDA); + } - getLastCudaError("Kernel execution failed"); + getLastCudaError("Kernel execution failed"); - // finalize CUDA timer - float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA); - 
sdkDeleteTimer(&timerCUDA); + // finalize CUDA timer + float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA); + sdkDeleteTimer(&timerCUDA); - // execute Quantization kernel - CUDAkernelQuantizationFloat<<>>(Dst, (int)DstStride); - getLastCudaError("Kernel execution failed"); + // execute Quantization kernel + CUDAkernelQuantizationFloat<<>>(Dst, (int)DstStride); + getLastCudaError("Kernel execution failed"); - // copy quantized coefficients from host memory to device array - checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0, Dst, DstStride * sizeof(float), - Size.width * sizeof(float), Size.height, - cudaMemcpyDeviceToDevice)); + // copy quantized coefficients from host memory to device array + checkCudaErrors(cudaMemcpy2DToArray( + Src, 0, 0, Dst, DstStride * sizeof(float), Size.width * sizeof(float), Size.height, cudaMemcpyDeviceToDevice)); - // execute IDCT kernel - CUDAkernel1IDCT<<>>(Dst, (int)DstStride, 0, 0, TexSrc); - getLastCudaError("Kernel execution failed"); + // execute IDCT kernel + CUDAkernel1IDCT<<>>(Dst, (int)DstStride, 0, 0, TexSrc); + getLastCudaError("Kernel execution failed"); - // copy quantized image block to host - checkCudaErrors(cudaMemcpy2D( - ImgSrcF, ImgSrcFStride * sizeof(float), Dst, DstStride * sizeof(float), - Size.width * sizeof(float), Size.height, cudaMemcpyDeviceToHost)); + // copy quantized image block to host + checkCudaErrors(cudaMemcpy2D(ImgSrcF, + ImgSrcFStride * sizeof(float), + Dst, + DstStride * sizeof(float), + Size.width * sizeof(float), + Size.height, + cudaMemcpyDeviceToHost)); - // convert image back to byte representation - AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size); - CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size); + // convert image back to byte representation + AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size); + CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size); - // clean up memory - checkCudaErrors(cudaDestroyTextureObject(TexSrc)); - checkCudaErrors(cudaFreeArray(Src)); - checkCudaErrors(cudaFree(Dst)); - FreePlane(ImgSrcF); + // clean up memory + checkCudaErrors(cudaDestroyTextureObject(TexSrc)); + checkCudaErrors(cudaFreeArray(Src)); + checkCudaErrors(cudaFree(Dst)); + FreePlane(ImgSrcF); - // return time taken by the operation - return TimerCUDASpan; + // return time taken by the operation + return TimerCUDASpan; } /** @@ -293,94 +303,95 @@ float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { * \return Execution time in milliseconds */ -float WrapperCUDA2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { - // allocate host buffers for DCT and other data - int StrideF; - float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF); +float WrapperCUDA2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) +{ + // allocate host buffers for DCT and other data + int StrideF; + float *ImgF1 = MallocPlaneFloat(Size.width, Size.height, &StrideF); - // convert source image to float representation - CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size); - AddFloatPlane(-128.0f, ImgF1, StrideF, Size); + // convert source image to float representation + CopyByte2Float(ImgSrc, Stride, ImgF1, StrideF, Size); + AddFloatPlane(-128.0f, ImgF1, StrideF, Size); - // allocate device memory - float *src, *dst; - size_t DeviceStride; - checkCudaErrors(cudaMallocPitch((void **)&src, &DeviceStride, - Size.width * sizeof(float), Size.height)); - checkCudaErrors(cudaMallocPitch((void **)&dst, &DeviceStride, - Size.width * sizeof(float), Size.height)); - DeviceStride /= sizeof(float); + // allocate 
device memory + float *src, *dst; + size_t DeviceStride; + checkCudaErrors(cudaMallocPitch((void **)&src, &DeviceStride, Size.width * sizeof(float), Size.height)); + checkCudaErrors(cudaMallocPitch((void **)&dst, &DeviceStride, Size.width * sizeof(float), Size.height)); + DeviceStride /= sizeof(float); - // copy from host memory to device - checkCudaErrors(cudaMemcpy2D( - src, DeviceStride * sizeof(float), ImgF1, StrideF * sizeof(float), - Size.width * sizeof(float), Size.height, cudaMemcpyHostToDevice)); + // copy from host memory to device + checkCudaErrors(cudaMemcpy2D(src, + DeviceStride * sizeof(float), + ImgF1, + StrideF * sizeof(float), + Size.width * sizeof(float), + Size.height, + cudaMemcpyHostToDevice)); - // create and start CUDA timer - StopWatchInterface *timerCUDA = 0; - sdkCreateTimer(&timerCUDA); + // create and start CUDA timer + StopWatchInterface *timerCUDA = 0; + sdkCreateTimer(&timerCUDA); - // setup execution parameters - dim3 GridFullWarps(Size.width / KER2_BLOCK_WIDTH, - Size.height / KER2_BLOCK_HEIGHT, 1); - dim3 ThreadsFullWarps(8, KER2_BLOCK_WIDTH / 8, KER2_BLOCK_HEIGHT / 8); + // setup execution parameters + dim3 GridFullWarps(Size.width / KER2_BLOCK_WIDTH, Size.height / KER2_BLOCK_HEIGHT, 1); + dim3 ThreadsFullWarps(8, KER2_BLOCK_WIDTH / 8, KER2_BLOCK_HEIGHT / 8); - // perform block-wise DCT processing and benchmarking - const int numIterations = 100; + // perform block-wise DCT processing and benchmarking + const int numIterations = 100; - for (int i = -1; i < numIterations; i++) { - if (i == 0) { - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&timerCUDA); - sdkStartTimer(&timerCUDA); + for (int i = -1; i < numIterations; i++) { + if (i == 0) { + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&timerCUDA); + sdkStartTimer(&timerCUDA); + } + + CUDAkernel2DCT<<>>(dst, src, (int)DeviceStride); + getLastCudaError("Kernel execution failed"); } - CUDAkernel2DCT<<>>(dst, src, - (int)DeviceStride); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timerCUDA); + + // finalize timing of CUDA Kernels + float avgTime = (float)sdkGetTimerValue(&timerCUDA) / (float)numIterations; + sdkDeleteTimer(&timerCUDA); + printf("%f MPix/s //%f ms\n", (1E-6 * (float)Size.width * (float)Size.height) / (1E-3 * avgTime), avgTime); + + // setup execution parameters for quantization + dim3 ThreadsSmallBlocks(BLOCK_SIZE, BLOCK_SIZE); + dim3 GridSmallBlocks(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE); + + // execute Quantization kernel + CUDAkernelQuantizationFloat<<>>(dst, (int)DeviceStride); getLastCudaError("Kernel execution failed"); - } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timerCUDA); + // perform block-wise IDCT processing + CUDAkernel2IDCT<<>>(src, dst, (int)DeviceStride); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("Kernel execution failed"); - // finalize timing of CUDA Kernels - float avgTime = (float)sdkGetTimerValue(&timerCUDA) / (float)numIterations; - sdkDeleteTimer(&timerCUDA); - printf("%f MPix/s //%f ms\n", - (1E-6 * (float)Size.width * (float)Size.height) / (1E-3 * avgTime), - avgTime); + // copy quantized image block to host + checkCudaErrors(cudaMemcpy2D(ImgF1, + StrideF * sizeof(float), + src, + DeviceStride * sizeof(float), + Size.width * sizeof(float), + Size.height, + cudaMemcpyDeviceToHost)); - // setup execution parameters for quantization - dim3 ThreadsSmallBlocks(BLOCK_SIZE, BLOCK_SIZE); - dim3 GridSmallBlocks(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE); + // convert image 
back to byte representation + AddFloatPlane(128.0f, ImgF1, StrideF, Size); + CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size); - // execute Quantization kernel - CUDAkernelQuantizationFloat<<>>( - dst, (int)DeviceStride); - getLastCudaError("Kernel execution failed"); + // clean up memory + checkCudaErrors(cudaFree(dst)); + checkCudaErrors(cudaFree(src)); + FreePlane(ImgF1); - // perform block-wise IDCT processing - CUDAkernel2IDCT<<>>(src, dst, - (int)DeviceStride); - checkCudaErrors(cudaDeviceSynchronize()); - getLastCudaError("Kernel execution failed"); - - // copy quantized image block to host - checkCudaErrors(cudaMemcpy2D( - ImgF1, StrideF * sizeof(float), src, DeviceStride * sizeof(float), - Size.width * sizeof(float), Size.height, cudaMemcpyDeviceToHost)); - - // convert image back to byte representation - AddFloatPlane(128.0f, ImgF1, StrideF, Size); - CopyFloat2Byte(ImgF1, StrideF, ImgDst, Stride, Size); - - // clean up memory - checkCudaErrors(cudaFree(dst)); - checkCudaErrors(cudaFree(src)); - FreePlane(ImgF1); - - // return time taken by the operation - return avgTime; + // return time taken by the operation + return avgTime; } /** @@ -395,83 +406,89 @@ float WrapperCUDA2(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { * * \return Execution time in milliseconds */ -float WrapperCUDAshort(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { - // allocate host buffers for DCT and other data - int StrideS; - short *ImgS1 = MallocPlaneShort(Size.width, Size.height, &StrideS); +float WrapperCUDAshort(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) +{ + // allocate host buffers for DCT and other data + int StrideS; + short *ImgS1 = MallocPlaneShort(Size.width, Size.height, &StrideS); - // convert source image to short representation centered at 128 - for (int i = 0; i < Size.height; i++) { - for (int j = 0; j < Size.width; j++) { - ImgS1[i * StrideS + j] = (short)ImgSrc[i * Stride + j] - 128; + // convert source image to short representation centered at 128 + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + ImgS1[i * StrideS + j] = (short)ImgSrc[i * Stride + j] - 128; + } } - } - // allocate device memory - short *SrcDst; - size_t DeviceStride; - checkCudaErrors(cudaMallocPitch((void **)(&SrcDst), &DeviceStride, - Size.width * sizeof(short), Size.height)); - DeviceStride /= sizeof(short); + // allocate device memory + short *SrcDst; + size_t DeviceStride; + checkCudaErrors(cudaMallocPitch((void **)(&SrcDst), &DeviceStride, Size.width * sizeof(short), Size.height)); + DeviceStride /= sizeof(short); - // copy from host memory to device - checkCudaErrors(cudaMemcpy2D( - SrcDst, DeviceStride * sizeof(short), ImgS1, StrideS * sizeof(short), - Size.width * sizeof(short), Size.height, cudaMemcpyHostToDevice)); + // copy from host memory to device + checkCudaErrors(cudaMemcpy2D(SrcDst, + DeviceStride * sizeof(short), + ImgS1, + StrideS * sizeof(short), + Size.width * sizeof(short), + Size.height, + cudaMemcpyHostToDevice)); - // create and start CUDA timer - StopWatchInterface *timerLibJpeg = 0; - sdkCreateTimer(&timerLibJpeg); - sdkResetTimer(&timerLibJpeg); + // create and start CUDA timer + StopWatchInterface *timerLibJpeg = 0; + sdkCreateTimer(&timerLibJpeg); + sdkResetTimer(&timerLibJpeg); - // setup execution parameters - dim3 GridShort(Size.width / KERS_BLOCK_WIDTH, Size.height / KERS_BLOCK_HEIGHT, - 1); - dim3 ThreadsShort(8, KERS_BLOCK_WIDTH / 8, KERS_BLOCK_HEIGHT / 8); + // setup execution parameters + dim3 GridShort(Size.width / 
KERS_BLOCK_WIDTH, Size.height / KERS_BLOCK_HEIGHT, 1); + dim3 ThreadsShort(8, KERS_BLOCK_WIDTH / 8, KERS_BLOCK_HEIGHT / 8); - // perform block-wise DCT processing and benchmarking - sdkStartTimer(&timerLibJpeg); - CUDAkernelShortDCT<<>>(SrcDst, (int)DeviceStride); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timerLibJpeg); - getLastCudaError("Kernel execution failed"); + // perform block-wise DCT processing and benchmarking + sdkStartTimer(&timerLibJpeg); + CUDAkernelShortDCT<<>>(SrcDst, (int)DeviceStride); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timerLibJpeg); + getLastCudaError("Kernel execution failed"); - // stop and destroy CUDA timer - float TimerLibJpegSpan16b = sdkGetAverageTimerValue(&timerLibJpeg); - sdkDeleteTimer(&timerLibJpeg); + // stop and destroy CUDA timer + float TimerLibJpegSpan16b = sdkGetAverageTimerValue(&timerLibJpeg); + sdkDeleteTimer(&timerLibJpeg); - // setup execution parameters for quantization - dim3 ThreadsSmallBlocks(BLOCK_SIZE, BLOCK_SIZE); - dim3 GridSmallBlocks(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE); + // setup execution parameters for quantization + dim3 ThreadsSmallBlocks(BLOCK_SIZE, BLOCK_SIZE); + dim3 GridSmallBlocks(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE); - // execute Quantization kernel - CUDAkernelQuantizationShort<<>>( - SrcDst, (int)DeviceStride); - getLastCudaError("Kernel execution failed"); + // execute Quantization kernel + CUDAkernelQuantizationShort<<>>(SrcDst, (int)DeviceStride); + getLastCudaError("Kernel execution failed"); - // perform block-wise IDCT processing - CUDAkernelShortIDCT<<>>(SrcDst, (int)DeviceStride); - checkCudaErrors(cudaDeviceSynchronize()); - getLastCudaError("Kernel execution failed"); + // perform block-wise IDCT processing + CUDAkernelShortIDCT<<>>(SrcDst, (int)DeviceStride); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("Kernel execution failed"); - // copy quantized image block to host - checkCudaErrors(cudaMemcpy2D( - ImgS1, StrideS * sizeof(short), SrcDst, DeviceStride * sizeof(short), - Size.width * sizeof(short), Size.height, cudaMemcpyDeviceToHost)); + // copy quantized image block to host + checkCudaErrors(cudaMemcpy2D(ImgS1, + StrideS * sizeof(short), + SrcDst, + DeviceStride * sizeof(short), + Size.width * sizeof(short), + Size.height, + cudaMemcpyDeviceToHost)); - // convert image back to byte representation - for (int i = 0; i < Size.height; i++) { - for (int j = 0; j < Size.width; j++) { - ImgDst[i * Stride + j] = clamp_0_255(ImgS1[i * StrideS + j] + 128); + // convert image back to byte representation + for (int i = 0; i < Size.height; i++) { + for (int j = 0; j < Size.width; j++) { + ImgDst[i * Stride + j] = clamp_0_255(ImgS1[i * StrideS + j] + 128); + } } - } - // free float buffers - checkCudaErrors(cudaFree(SrcDst)); - FreePlane(ImgS1); + // free float buffers + checkCudaErrors(cudaFree(SrcDst)); + FreePlane(ImgS1); - // return time taken by the operation - return TimerLibJpegSpan16b; + // return time taken by the operation + return TimerLibJpegSpan16b; } /** @@ -484,179 +501,165 @@ float WrapperCUDAshort(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size) { * \return Status code */ -int main(int argc, char **argv) { - // - // Sample initialization - // - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + // + // Sample initialization + // + printf("%s Starting...\n\n", argv[0]); - // initialize CUDA - findCudaDevice(argc, (const char **)argv); + // initialize CUDA + findCudaDevice(argc, (const char 
**)argv);
 
-  // source and results image filenames
-  char SampleImageFname[] = "teapot512.bmp";
-  char SampleImageFnameResGold1[] = "teapot512_gold1.bmp";
-  char SampleImageFnameResGold2[] = "teapot512_gold2.bmp";
-  char SampleImageFnameResCUDA1[] = "teapot512_cuda1.bmp";
-  char SampleImageFnameResCUDA2[] = "teapot512_cuda2.bmp";
-  char SampleImageFnameResCUDAshort[] = "teapot512_cuda_short.bmp";
+    // source and results image filenames
+    char SampleImageFname[] = "teapot512.bmp";
+    char SampleImageFnameResGold1[] = "teapot512_gold1.bmp";
+    char SampleImageFnameResGold2[] = "teapot512_gold2.bmp";
+    char SampleImageFnameResCUDA1[] = "teapot512_cuda1.bmp";
+    char SampleImageFnameResCUDA2[] = "teapot512_cuda2.bmp";
+    char SampleImageFnameResCUDAshort[] = "teapot512_cuda_short.bmp";
 
-  char *pSampleImageFpath = sdkFindFilePath(SampleImageFname, argv[0]);
+    char *pSampleImageFpath = sdkFindFilePath(SampleImageFname, argv[0]);
 
-  if (pSampleImageFpath == NULL) {
-    printf("dct8x8 could not locate Sample Image <%s>\nExiting...\n",
-           pSampleImageFpath);
-    exit(EXIT_FAILURE);
-  }
+    if (pSampleImageFpath == NULL) {
+        printf("dct8x8 could not locate Sample Image <%s>\nExiting...\n", SampleImageFname);
+        exit(EXIT_FAILURE);
+    }
 
-  // preload image (acquire dimensions)
-  int ImgWidth, ImgHeight;
-  ROI ImgSize;
-  int res = PreLoadBmp(pSampleImageFpath, &ImgWidth, &ImgHeight);
-  ImgSize.width = ImgWidth;
-  ImgSize.height = ImgHeight;
+    // preload image (acquire dimensions)
+    int ImgWidth, ImgHeight;
+    ROI ImgSize;
+    int res = PreLoadBmp(pSampleImageFpath, &ImgWidth, &ImgHeight);
+    ImgSize.width = ImgWidth;
+    ImgSize.height = ImgHeight;
 
-  // CONSOLE INFORMATION: saying hello to user
-  printf("CUDA sample DCT/IDCT implementation\n");
-  printf("===================================\n");
-  printf("Loading test image: %s... ", SampleImageFname);
+    // CONSOLE INFORMATION: saying hello to user
+    printf("CUDA sample DCT/IDCT implementation\n");
+    printf("===================================\n");
+    printf("Loading test image: %s... ", SampleImageFname);
 
-  if (res) {
-    printf("\nError: Image file not found or invalid!\n");
-    exit(EXIT_FAILURE);
-    return 1;
-  }
+    if (res) {
+        printf("\nError: Image file not found or invalid!\n");
+        exit(EXIT_FAILURE);
+        return 1;
+    }
 
-  // check image dimensions are multiples of BLOCK_SIZE
-  if (ImgWidth % BLOCK_SIZE != 0 || ImgHeight % BLOCK_SIZE != 0) {
-    printf("\nError: Input image dimensions must be multiples of 8!\n");
-    exit(EXIT_FAILURE);
-    return 1;
-  }
+    // check image dimensions are multiples of BLOCK_SIZE
+    if (ImgWidth % BLOCK_SIZE != 0 || ImgHeight % BLOCK_SIZE != 0) {
+        printf("\nError: Input image dimensions must be multiples of 8!\n");
+        exit(EXIT_FAILURE);
+        return 1;
+    }
 
-  printf("[%d x %d]... 
", ImgWidth, ImgHeight); - // allocate image buffers - int ImgStride; - byte *ImgSrc = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); - byte *ImgDstGold1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); - byte *ImgDstGold2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); - byte *ImgDstCUDA1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); - byte *ImgDstCUDA2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); - byte *ImgDstCUDAshort = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); + // allocate image buffers + int ImgStride; + byte *ImgSrc = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); + byte *ImgDstGold1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); + byte *ImgDstGold2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); + byte *ImgDstCUDA1 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); + byte *ImgDstCUDA2 = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); + byte *ImgDstCUDAshort = MallocPlaneByte(ImgWidth, ImgHeight, &ImgStride); - // load sample image - LoadBmpAsGray(pSampleImageFpath, ImgStride, ImgSize, ImgSrc); + // load sample image + LoadBmpAsGray(pSampleImageFpath, ImgStride, ImgSize, ImgSrc); - // - // RUNNING WRAPPERS - // + // + // RUNNING WRAPPERS + // - // compute Gold 1 version of DCT/quantization/IDCT - printf("Success\nRunning Gold 1 (CPU) version... "); - float TimeGold1 = WrapperGold1(ImgSrc, ImgDstGold1, ImgStride, ImgSize); + // compute Gold 1 version of DCT/quantization/IDCT + printf("Success\nRunning Gold 1 (CPU) version... "); + float TimeGold1 = WrapperGold1(ImgSrc, ImgDstGold1, ImgStride, ImgSize); - // compute Gold 2 version of DCT/quantization/IDCT - printf("Success\nRunning Gold 2 (CPU) version... "); - float TimeGold2 = WrapperGold2(ImgSrc, ImgDstGold2, ImgStride, ImgSize); + // compute Gold 2 version of DCT/quantization/IDCT + printf("Success\nRunning Gold 2 (CPU) version... "); + float TimeGold2 = WrapperGold2(ImgSrc, ImgDstGold2, ImgStride, ImgSize); - // compute CUDA 1 version of DCT/quantization/IDCT - printf("Success\nRunning CUDA 1 (GPU) version... "); - float TimeCUDA1 = WrapperCUDA1(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize); + // compute CUDA 1 version of DCT/quantization/IDCT + printf("Success\nRunning CUDA 1 (GPU) version... "); + float TimeCUDA1 = WrapperCUDA1(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize); - // compute CUDA 2 version of DCT/quantization/IDCT - printf("Success\nRunning CUDA 2 (GPU) version... "); - float TimeCUDA2 = WrapperCUDA2(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize); + // compute CUDA 2 version of DCT/quantization/IDCT + printf("Success\nRunning CUDA 2 (GPU) version... "); + float TimeCUDA2 = WrapperCUDA2(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize); - // compute CUDA short version of DCT/quantization/IDCT - printf("Success\nRunning CUDA short (GPU) version... "); - float TimeCUDAshort = - WrapperCUDAshort(ImgSrc, ImgDstCUDAshort, ImgStride, ImgSize); - // - // Execution statistics, result saving and validation - // + // compute CUDA short version of DCT/quantization/IDCT + printf("Success\nRunning CUDA short (GPU) version... "); + float TimeCUDAshort = WrapperCUDAshort(ImgSrc, ImgDstCUDAshort, ImgStride, ImgSize); + // + // Execution statistics, result saving and validation + // - // dump result of Gold 1 processing - printf("Success\nDumping result to %s... ", SampleImageFnameResGold1); - DumpBmpAsGray(SampleImageFnameResGold1, ImgDstGold1, ImgStride, ImgSize); + // dump result of Gold 1 processing + printf("Success\nDumping result to %s... 
", SampleImageFnameResGold1); + DumpBmpAsGray(SampleImageFnameResGold1, ImgDstGold1, ImgStride, ImgSize); - // dump result of Gold 2 processing - printf("Success\nDumping result to %s... ", SampleImageFnameResGold2); - DumpBmpAsGray(SampleImageFnameResGold2, ImgDstGold2, ImgStride, ImgSize); + // dump result of Gold 2 processing + printf("Success\nDumping result to %s... ", SampleImageFnameResGold2); + DumpBmpAsGray(SampleImageFnameResGold2, ImgDstGold2, ImgStride, ImgSize); - // dump result of CUDA 1 processing - printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA1); - DumpBmpAsGray(SampleImageFnameResCUDA1, ImgDstCUDA1, ImgStride, ImgSize); + // dump result of CUDA 1 processing + printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA1); + DumpBmpAsGray(SampleImageFnameResCUDA1, ImgDstCUDA1, ImgStride, ImgSize); - // dump result of CUDA 2 processing - printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA2); - DumpBmpAsGray(SampleImageFnameResCUDA2, ImgDstCUDA2, ImgStride, ImgSize); + // dump result of CUDA 2 processing + printf("Success\nDumping result to %s... ", SampleImageFnameResCUDA2); + DumpBmpAsGray(SampleImageFnameResCUDA2, ImgDstCUDA2, ImgStride, ImgSize); - // dump result of CUDA short processing - printf("Success\nDumping result to %s... ", SampleImageFnameResCUDAshort); - DumpBmpAsGray(SampleImageFnameResCUDAshort, ImgDstCUDAshort, ImgStride, - ImgSize); - // print speed info - printf("Success\n"); + // dump result of CUDA short processing + printf("Success\nDumping result to %s... ", SampleImageFnameResCUDAshort); + DumpBmpAsGray(SampleImageFnameResCUDAshort, ImgDstCUDAshort, ImgStride, ImgSize); + // print speed info + printf("Success\n"); - printf("Processing time (CUDA 1) : %f ms \n", TimeCUDA1); - printf("Processing time (CUDA 2) : %f ms \n", TimeCUDA2); - printf("Processing time (CUDA short): %f ms \n", TimeCUDAshort); + printf("Processing time (CUDA 1) : %f ms \n", TimeCUDA1); + printf("Processing time (CUDA 2) : %f ms \n", TimeCUDA2); + printf("Processing time (CUDA short): %f ms \n", TimeCUDAshort); - // calculate PSNR between each pair of images - float PSNR_Src_DstGold1 = - CalculatePSNR(ImgSrc, ImgDstGold1, ImgStride, ImgSize); - float PSNR_Src_DstGold2 = - CalculatePSNR(ImgSrc, ImgDstGold2, ImgStride, ImgSize); - float PSNR_Src_DstCUDA1 = - CalculatePSNR(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize); - float PSNR_Src_DstCUDA2 = - CalculatePSNR(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize); - float PSNR_Src_DstCUDAshort = - CalculatePSNR(ImgSrc, ImgDstCUDAshort, ImgStride, ImgSize); - float PSNR_DstGold1_DstCUDA1 = - CalculatePSNR(ImgDstGold1, ImgDstCUDA1, ImgStride, ImgSize); - float PSNR_DstGold2_DstCUDA2 = - CalculatePSNR(ImgDstGold2, ImgDstCUDA2, ImgStride, ImgSize); - float PSNR_DstGold2_DstCUDA16b = - CalculatePSNR(ImgDstGold2, ImgDstCUDAshort, ImgStride, ImgSize); + // calculate PSNR between each pair of images + float PSNR_Src_DstGold1 = CalculatePSNR(ImgSrc, ImgDstGold1, ImgStride, ImgSize); + float PSNR_Src_DstGold2 = CalculatePSNR(ImgSrc, ImgDstGold2, ImgStride, ImgSize); + float PSNR_Src_DstCUDA1 = CalculatePSNR(ImgSrc, ImgDstCUDA1, ImgStride, ImgSize); + float PSNR_Src_DstCUDA2 = CalculatePSNR(ImgSrc, ImgDstCUDA2, ImgStride, ImgSize); + float PSNR_Src_DstCUDAshort = CalculatePSNR(ImgSrc, ImgDstCUDAshort, ImgStride, ImgSize); + float PSNR_DstGold1_DstCUDA1 = CalculatePSNR(ImgDstGold1, ImgDstCUDA1, ImgStride, ImgSize); + float PSNR_DstGold2_DstCUDA2 = CalculatePSNR(ImgDstGold2, ImgDstCUDA2, ImgStride, ImgSize); + float 
PSNR_DstGold2_DstCUDA16b = CalculatePSNR(ImgDstGold2, ImgDstCUDAshort, ImgStride, ImgSize); - printf("PSNR Original <---> CPU(Gold 1) : %f\n", PSNR_Src_DstGold1); - printf("PSNR Original <---> CPU(Gold 2) : %f\n", PSNR_Src_DstGold2); - printf("PSNR Original <---> GPU(CUDA 1) : %f\n", PSNR_Src_DstCUDA1); - printf("PSNR Original <---> GPU(CUDA 2) : %f\n", PSNR_Src_DstCUDA2); - printf("PSNR Original <---> GPU(CUDA short): %f\n", PSNR_Src_DstCUDAshort); - printf("PSNR CPU(Gold 1) <---> GPU(CUDA 1) : %f\n", - PSNR_DstGold1_DstCUDA1); - printf("PSNR CPU(Gold 2) <---> GPU(CUDA 2) : %f\n", - PSNR_DstGold2_DstCUDA2); - printf("PSNR CPU(Gold 2) <---> GPU(CUDA short): %f\n", - PSNR_DstGold2_DstCUDA16b); + printf("PSNR Original <---> CPU(Gold 1) : %f\n", PSNR_Src_DstGold1); + printf("PSNR Original <---> CPU(Gold 2) : %f\n", PSNR_Src_DstGold2); + printf("PSNR Original <---> GPU(CUDA 1) : %f\n", PSNR_Src_DstCUDA1); + printf("PSNR Original <---> GPU(CUDA 2) : %f\n", PSNR_Src_DstCUDA2); + printf("PSNR Original <---> GPU(CUDA short): %f\n", PSNR_Src_DstCUDAshort); + printf("PSNR CPU(Gold 1) <---> GPU(CUDA 1) : %f\n", PSNR_DstGold1_DstCUDA1); + printf("PSNR CPU(Gold 2) <---> GPU(CUDA 2) : %f\n", PSNR_DstGold2_DstCUDA2); + printf("PSNR CPU(Gold 2) <---> GPU(CUDA short): %f\n", PSNR_DstGold2_DstCUDA16b); - bool bTestResult = (PSNR_DstGold1_DstCUDA1 > PSNR_THRESHOLD_EQUAL && - PSNR_DstGold2_DstCUDA2 > PSNR_THRESHOLD_EQUAL && - PSNR_DstGold2_DstCUDA16b > PSNR_THRESHOLD_EQUAL); + bool bTestResult = (PSNR_DstGold1_DstCUDA1 > PSNR_THRESHOLD_EQUAL && PSNR_DstGold2_DstCUDA2 > PSNR_THRESHOLD_EQUAL + && PSNR_DstGold2_DstCUDA16b > PSNR_THRESHOLD_EQUAL); - // - // Finalization - // + // + // Finalization + // - // release byte planes - FreePlane(ImgSrc); - FreePlane(ImgDstGold1); - FreePlane(ImgDstGold2); - FreePlane(ImgDstCUDA1); - FreePlane(ImgDstCUDA2); - FreePlane(ImgDstCUDAshort); + // release byte planes + FreePlane(ImgSrc); + FreePlane(ImgDstGold1); + FreePlane(ImgDstGold2); + FreePlane(ImgDstCUDA1); + FreePlane(ImgDstCUDA2); + FreePlane(ImgDstCUDAshort); - // finalize - printf("\nTest Summary...\n"); + // finalize + printf("\nTest Summary...\n"); - if (!bTestResult) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (!bTestResult) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel1.cuh b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel1.cuh index 0176c054..a5b73f9c 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel1.cuh +++ b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel1.cuh @@ -47,19 +47,23 @@ namespace cg = cooperative_groups; #include "Common.h" /** -* This unitary matrix performs discrete cosine transform of rows of the matrix -* to the left -*/ + * This unitary matrix performs discrete cosine transform of rows of the matrix + * to the left + */ __constant__ float DCTv8matrix[] = { - 0.3535533905932738f, 0.4903926402016152f, 0.4619397662556434f, 0.4157348061512726f, 0.3535533905932738f, 0.2777851165098011f, 0.1913417161825449f, 0.0975451610080642f, - 0.3535533905932738f, 0.4157348061512726f, 0.1913417161825449f, -0.0975451610080641f, -0.3535533905932737f, -0.4903926402016152f, -0.4619397662556434f, -0.2777851165098011f, - 0.3535533905932738f, 0.2777851165098011f, -0.1913417161825449f, -0.4903926402016152f, -0.3535533905932738f, 0.0975451610080642f, 0.4619397662556433f, 0.4157348061512727f, - 
0.3535533905932738f, 0.0975451610080642f, -0.4619397662556434f, -0.2777851165098011f, 0.3535533905932737f, 0.4157348061512727f, -0.1913417161825450f, -0.4903926402016153f, - 0.3535533905932738f, -0.0975451610080641f, -0.4619397662556434f, 0.2777851165098009f, 0.3535533905932738f, -0.4157348061512726f, -0.1913417161825453f, 0.4903926402016152f, - 0.3535533905932738f, -0.2777851165098010f, -0.1913417161825452f, 0.4903926402016153f, -0.3535533905932733f, -0.0975451610080649f, 0.4619397662556437f, -0.4157348061512720f, - 0.3535533905932738f, -0.4157348061512727f, 0.1913417161825450f, 0.0975451610080640f, -0.3535533905932736f, 0.4903926402016152f, -0.4619397662556435f, 0.2777851165098022f, - 0.3535533905932738f, -0.4903926402016152f, 0.4619397662556433f, -0.4157348061512721f, 0.3535533905932733f, -0.2777851165098008f, 0.1913417161825431f, -0.0975451610080625f -}; + 0.3535533905932738f, 0.4903926402016152f, 0.4619397662556434f, 0.4157348061512726f, 0.3535533905932738f, + 0.2777851165098011f, 0.1913417161825449f, 0.0975451610080642f, 0.3535533905932738f, 0.4157348061512726f, + 0.1913417161825449f, -0.0975451610080641f, -0.3535533905932737f, -0.4903926402016152f, -0.4619397662556434f, + -0.2777851165098011f, 0.3535533905932738f, 0.2777851165098011f, -0.1913417161825449f, -0.4903926402016152f, + -0.3535533905932738f, 0.0975451610080642f, 0.4619397662556433f, 0.4157348061512727f, 0.3535533905932738f, + 0.0975451610080642f, -0.4619397662556434f, -0.2777851165098011f, 0.3535533905932737f, 0.4157348061512727f, + -0.1913417161825450f, -0.4903926402016153f, 0.3535533905932738f, -0.0975451610080641f, -0.4619397662556434f, + 0.2777851165098009f, 0.3535533905932738f, -0.4157348061512726f, -0.1913417161825453f, 0.4903926402016152f, + 0.3535533905932738f, -0.2777851165098010f, -0.1913417161825452f, 0.4903926402016153f, -0.3535533905932733f, + -0.0975451610080649f, 0.4619397662556437f, -0.4157348061512720f, 0.3535533905932738f, -0.4157348061512727f, + 0.1913417161825450f, 0.0975451610080640f, -0.3535533905932736f, 0.4903926402016152f, -0.4619397662556435f, + 0.2777851165098022f, 0.3535533905932738f, -0.4903926402016152f, 0.4619397662556433f, -0.4157348061512721f, + 0.3535533905932733f, -0.2777851165098008f, 0.1913417161825431f, -0.0975451610080625f}; // Temporary blocks __shared__ float CurBlockLocal1[BLOCK_SIZE2]; @@ -80,73 +84,70 @@ __shared__ float CurBlockLocal2[BLOCK_SIZE2]; * * \return None */ -__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, - int OffsetYBlocks, cudaTextureObject_t TexSrc) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Block index - const int bx = blockIdx.x + OffsetXBlocks; - const int by = blockIdx.y + OffsetYBlocks; +__global__ void +CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks, cudaTextureObject_t TexSrc) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Block index + const int bx = blockIdx.x + OffsetXBlocks; + const int by = blockIdx.y + OffsetYBlocks; - // Thread index (current coefficient) - const int tx = threadIdx.x; - const int ty = threadIdx.y; + // Thread index (current coefficient) + const int tx = threadIdx.x; + const int ty = threadIdx.y; - // Texture coordinates - const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f; - const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f; + // Texture coordinates + const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f; + const float tex_y = (float)((by << 
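The 64 literals in DCTv8matrix above are consistent with the orthonormal 8-point DCT-II basis, stored row-major with spatial index n and frequency index k:

\[
\texttt{DCTv8matrix}[8n + k] = c_k \cos\frac{(2n+1)k\pi}{16},
\qquad c_0 = \frac{1}{2\sqrt{2}}, \quad c_k = \frac{1}{2} \ (k \ge 1).
\]

For example, the entry at (n = 0, k = 1) is 0.5 * cos(pi/16), approximately 0.4903926402, matching the second literal, and every entry in column k = 0 is 1/(2*sqrt(2)), approximately 0.3535533906.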
BLOCK_SIZE_LOG2) + ty) + 0.5f; - // copy current image pixel to the first block - CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = - tex2D(TexSrc, tex_x, tex_y); + // copy current image pixel to the first block + CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = tex2D(TexSrc, tex_x, tex_y); - // synchronize threads to make sure the block is copied - cg::sync(cta); + // synchronize threads to make sure the block is copied + cg::sync(cta); - // calculate the multiplication of DCTv8matrixT * A and place it in the second - // block - float curelem = 0; - int DCTv8matrixIndex = 0 * BLOCK_SIZE + ty; - int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx; + // calculate the multiplication of DCTv8matrixT * A and place it in the second + // block + float curelem = 0; + int DCTv8matrixIndex = 0 * BLOCK_SIZE + ty; + int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx; #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) { - curelem += - DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index]; - DCTv8matrixIndex += BLOCK_SIZE; - CurBlockLocal1Index += BLOCK_SIZE; - } + for (int i = 0; i < BLOCK_SIZE; i++) { + curelem += DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index]; + DCTv8matrixIndex += BLOCK_SIZE; + CurBlockLocal1Index += BLOCK_SIZE; + } - CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; + CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; - // synchronize threads to make sure the first 2 matrices are multiplied and - // the result is stored in the second block - cg::sync(cta); + // synchronize threads to make sure the first 2 matrices are multiplied and + // the result is stored in the second block + cg::sync(cta); - // calculate the multiplication of (DCTv8matrixT * A) * DCTv8matrix and place - // it in the first block - curelem = 0; - int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0; - DCTv8matrixIndex = 0 * BLOCK_SIZE + tx; + // calculate the multiplication of (DCTv8matrixT * A) * DCTv8matrix and place + // it in the first block + curelem = 0; + int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0; + DCTv8matrixIndex = 0 * BLOCK_SIZE + tx; #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) { - curelem += - CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex]; - CurBlockLocal2Index += 1; - DCTv8matrixIndex += BLOCK_SIZE; - } + for (int i = 0; i < BLOCK_SIZE; i++) { + curelem += CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex]; + CurBlockLocal2Index += 1; + DCTv8matrixIndex += BLOCK_SIZE; + } - CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; + CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; - // synchronize threads to make sure the matrices are multiplied and the result - // is stored back in the first block - cg::sync(cta); + // synchronize threads to make sure the matrices are multiplied and the result + // is stored back in the first block + cg::sync(cta); - // copy current coefficient to its place in the result array - Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + - ((bx << BLOCK_SIZE_LOG2) + tx)] = - CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx]; + // copy current coefficient to its place in the result array + Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + ((bx << BLOCK_SIZE_LOG2) + tx)] = + CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx]; } /** @@ -164,71 +165,68 @@ __global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, * * \return None */ -__global__ void CUDAkernel1IDCT(float *Dst, int ImgWidth, int OffsetXBlocks, - int OffsetYBlocks, cudaTextureObject_t TexSrc) { - // 
Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Block index - int bx = blockIdx.x + OffsetXBlocks; - int by = blockIdx.y + OffsetYBlocks; +__global__ void +CUDAkernel1IDCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks, cudaTextureObject_t TexSrc) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Block index + int bx = blockIdx.x + OffsetXBlocks; + int by = blockIdx.y + OffsetYBlocks; - // Thread index (current image pixel) - int tx = threadIdx.x; - int ty = threadIdx.y; + // Thread index (current image pixel) + int tx = threadIdx.x; + int ty = threadIdx.y; - // Texture coordinates - const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f; - const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f; + // Texture coordinates + const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f; + const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f; - // copy current image pixel to the first block - CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = - tex2D(TexSrc, tex_x, tex_y); + // copy current image pixel to the first block + CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = tex2D(TexSrc, tex_x, tex_y); - // synchronize threads to make sure the block is copied - cg::sync(cta); + // synchronize threads to make sure the block is copied + cg::sync(cta); - // calculate the multiplication of DCTv8matrix * A and place it in the second - // block - float curelem = 0; - int DCTv8matrixIndex = (ty << BLOCK_SIZE_LOG2) + 0; - int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx; + // calculate the multiplication of DCTv8matrix * A and place it in the second + // block + float curelem = 0; + int DCTv8matrixIndex = (ty << BLOCK_SIZE_LOG2) + 0; + int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx; #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) { - curelem += - DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index]; - DCTv8matrixIndex += 1; - CurBlockLocal1Index += BLOCK_SIZE; - } + for (int i = 0; i < BLOCK_SIZE; i++) { + curelem += DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index]; + DCTv8matrixIndex += 1; + CurBlockLocal1Index += BLOCK_SIZE; + } - CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; + CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; - // synchronize threads to make sure the first 2 matrices are multiplied and - // the result is stored in the second block - cg::sync(cta); + // synchronize threads to make sure the first 2 matrices are multiplied and + // the result is stored in the second block + cg::sync(cta); - // calculate the multiplication of (DCTv8matrix * A) * DCTv8matrixT and place - // it in the first block - curelem = 0; - int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0; - DCTv8matrixIndex = (tx << BLOCK_SIZE_LOG2) + 0; + // calculate the multiplication of (DCTv8matrix * A) * DCTv8matrixT and place + // it in the first block + curelem = 0; + int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0; + DCTv8matrixIndex = (tx << BLOCK_SIZE_LOG2) + 0; #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) { - curelem += - CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex]; - CurBlockLocal2Index += 1; - DCTv8matrixIndex += 1; - } + for (int i = 0; i < BLOCK_SIZE; i++) { + curelem += CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex]; + CurBlockLocal2Index += 1; + DCTv8matrixIndex += 1; + } - CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = curelem; + CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx] = 
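Reading the two unrolled loops of CUDAkernel1DCT together: the first pass accumulates DCTv8matrix[i*8 + ty] * block[i*8 + tx], i.e. it forms A^T * X in CurBlockLocal2 (matching the kernel's own "DCTv8matrixT * A" comment), and the second pass multiplies that result by A, so the kernel computes the separable 2-D transform D = A^T X A; CUDAkernel1IDCT swaps the index roles to compute A Y A^T. A host-side sketch of the forward transform for checking a single 8x8 block (a reference under that reading, not code from the sample):

    // D = A^T * X * A for one 8x8 block, where A[n][k] = DCTv8matrix[8*n + k].
    void Reference8x8DCT(const float A[8][8], const float X[8][8], float D[8][8])
    {
        float T[8][8]; // T = A^T * X
        for (int k = 0; k < 8; k++)
            for (int x = 0; x < 8; x++) {
                float s = 0.0f;
                for (int n = 0; n < 8; n++)
                    s += A[n][k] * X[n][x];
                T[k][x] = s;
            }
        for (int k = 0; k < 8; k++) // D = T * A
            for (int l = 0; l < 8; l++) {
                float s = 0.0f;
                for (int x = 0; x < 8; x++)
                    s += T[k][x] * A[x][l];
                D[k][l] = s;
            }
    }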
curelem; - // synchronize threads to make sure the matrices are multiplied and the result - // is stored back in the first block - cg::sync(cta); + // synchronize threads to make sure the matrices are multiplied and the result + // is stored back in the first block + cg::sync(cta); - // copy current coefficient to its place in the result array - Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + - ((bx << BLOCK_SIZE_LOG2) + tx)] = - CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx]; + // copy current coefficient to its place in the result array + Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + ((bx << BLOCK_SIZE_LOG2) + tx)] = + CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx]; } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel2.cuh b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel2.cuh index 2b5f8b58..e2792996 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel2.cuh +++ b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel2.cuh @@ -47,41 +47,41 @@ namespace cg = cooperative_groups; #include "Common.h" // Used in forward and inverse DCT -#define C_a 1.387039845322148f //!< a = (2^0.5) * cos( pi / 16); -#define C_b 1.306562964876377f //!< b = (2^0.5) * cos( pi / 8); -#define C_c 1.175875602419359f //!< c = (2^0.5) * cos(3 * pi / 16); -#define C_d 0.785694958387102f //!< d = (2^0.5) * cos(5 * pi / 16); -#define C_e 0.541196100146197f //!< e = (2^0.5) * cos(3 * pi / 8); -#define C_f 0.275899379282943f //!< f = (2^0.5) * cos(7 * pi / 16); +#define C_a 1.387039845322148f //!< a = (2^0.5) * cos( pi / 16); +#define C_b 1.306562964876377f //!< b = (2^0.5) * cos( pi / 8); +#define C_c 1.175875602419359f //!< c = (2^0.5) * cos(3 * pi / 16); +#define C_d 0.785694958387102f //!< d = (2^0.5) * cos(5 * pi / 16); +#define C_e 0.541196100146197f //!< e = (2^0.5) * cos(3 * pi / 8); +#define C_f 0.275899379282943f //!< f = (2^0.5) * cos(7 * pi / 16); /** -* Normalization constant that is used in forward and inverse DCT -*/ -#define C_norm 0.3535533905932737f // 1 / (8^0.5) + * Normalization constant that is used in forward and inverse DCT + */ +#define C_norm 0.3535533905932737f // 1 / (8^0.5) /** -* Width of data block (2nd kernel) -*/ + * Width of data block (2nd kernel) + */ #define KER2_BLOCK_WIDTH 32 /** -* Height of data block (2nd kernel) -*/ + * Height of data block (2nd kernel) + */ #define KER2_BLOCK_HEIGHT 16 /** -* LOG2 of width of data block (2nd kernel) -*/ + * LOG2 of width of data block (2nd kernel) + */ #define KER2_BW_LOG2 5 /** -* LOG2 of height of data block (2nd kernel) -*/ + * LOG2 of height of data block (2nd kernel) + */ #define KER2_BH_LOG2 4 /** -* Stride of shared memory buffer (2nd kernel) -*/ + * Stride of shared memory buffer (2nd kernel) + */ #define KER2_SMEMBLOCK_STRIDE (KER2_BLOCK_WIDTH + 1) /** @@ -93,39 +93,40 @@ namespace cg = cooperative_groups; * * \return None */ -__device__ void CUDAsubroutineInplaceDCTvector(float *Vect0, int Step) { - float *Vect1 = Vect0 + Step; - float *Vect2 = Vect1 + Step; - float *Vect3 = Vect2 + Step; - float *Vect4 = Vect3 + Step; - float *Vect5 = Vect4 + Step; - float *Vect6 = Vect5 + Step; - float *Vect7 = Vect6 + Step; +__device__ void CUDAsubroutineInplaceDCTvector(float *Vect0, int Step) +{ + float *Vect1 = Vect0 + Step; + float *Vect2 = Vect1 + Step; + float *Vect3 = Vect2 + Step; + float *Vect4 = Vect3 + Step; + float *Vect5 = Vect4 + Step; + float *Vect6 = Vect5 + Step; + float *Vect7 = Vect6 + Step; - float X07P = (*Vect0) + (*Vect7); - float X16P = (*Vect1) + (*Vect6); - float X25P = 
(*Vect2) + (*Vect5); - float X34P = (*Vect3) + (*Vect4); + float X07P = (*Vect0) + (*Vect7); + float X16P = (*Vect1) + (*Vect6); + float X25P = (*Vect2) + (*Vect5); + float X34P = (*Vect3) + (*Vect4); - float X07M = (*Vect0) - (*Vect7); - float X61M = (*Vect6) - (*Vect1); - float X25M = (*Vect2) - (*Vect5); - float X43M = (*Vect4) - (*Vect3); + float X07M = (*Vect0) - (*Vect7); + float X61M = (*Vect6) - (*Vect1); + float X25M = (*Vect2) - (*Vect5); + float X43M = (*Vect4) - (*Vect3); - float X07P34PP = X07P + X34P; - float X07P34PM = X07P - X34P; - float X16P25PP = X16P + X25P; - float X16P25PM = X16P - X25P; + float X07P34PP = X07P + X34P; + float X07P34PM = X07P - X34P; + float X16P25PP = X16P + X25P; + float X16P25PM = X16P - X25P; - (*Vect0) = C_norm * (X07P34PP + X16P25PP); - (*Vect2) = C_norm * (C_b * X07P34PM + C_e * X16P25PM); - (*Vect4) = C_norm * (X07P34PP - X16P25PP); - (*Vect6) = C_norm * (C_e * X07P34PM - C_b * X16P25PM); + (*Vect0) = C_norm * (X07P34PP + X16P25PP); + (*Vect2) = C_norm * (C_b * X07P34PM + C_e * X16P25PM); + (*Vect4) = C_norm * (X07P34PP - X16P25PP); + (*Vect6) = C_norm * (C_e * X07P34PM - C_b * X16P25PM); - (*Vect1) = C_norm * (C_a * X07M - C_c * X61M + C_d * X25M - C_f * X43M); - (*Vect3) = C_norm * (C_c * X07M + C_f * X61M - C_a * X25M + C_d * X43M); - (*Vect5) = C_norm * (C_d * X07M + C_a * X61M + C_f * X25M - C_c * X43M); - (*Vect7) = C_norm * (C_f * X07M + C_d * X61M + C_c * X25M + C_a * X43M); + (*Vect1) = C_norm * (C_a * X07M - C_c * X61M + C_d * X25M - C_f * X43M); + (*Vect3) = C_norm * (C_c * X07M + C_f * X61M - C_a * X25M + C_d * X43M); + (*Vect5) = C_norm * (C_d * X07M + C_a * X61M + C_f * X25M - C_c * X43M); + (*Vect7) = C_norm * (C_f * X07M + C_d * X61M + C_c * X25M + C_a * X43M); } /** @@ -137,44 +138,41 @@ __device__ void CUDAsubroutineInplaceDCTvector(float *Vect0, int Step) { * * \return None */ -__device__ void CUDAsubroutineInplaceIDCTvector(float *Vect0, int Step) { - float *Vect1 = Vect0 + Step; - float *Vect2 = Vect1 + Step; - float *Vect3 = Vect2 + Step; - float *Vect4 = Vect3 + Step; - float *Vect5 = Vect4 + Step; - float *Vect6 = Vect5 + Step; - float *Vect7 = Vect6 + Step; +__device__ void CUDAsubroutineInplaceIDCTvector(float *Vect0, int Step) +{ + float *Vect1 = Vect0 + Step; + float *Vect2 = Vect1 + Step; + float *Vect3 = Vect2 + Step; + float *Vect4 = Vect3 + Step; + float *Vect5 = Vect4 + Step; + float *Vect6 = Vect5 + Step; + float *Vect7 = Vect6 + Step; - float Y04P = (*Vect0) + (*Vect4); - float Y2b6eP = C_b * (*Vect2) + C_e * (*Vect6); + float Y04P = (*Vect0) + (*Vect4); + float Y2b6eP = C_b * (*Vect2) + C_e * (*Vect6); - float Y04P2b6ePP = Y04P + Y2b6eP; - float Y04P2b6ePM = Y04P - Y2b6eP; - float Y7f1aP3c5dPP = - C_f * (*Vect7) + C_a * (*Vect1) + C_c * (*Vect3) + C_d * (*Vect5); - float Y7a1fM3d5cMP = - C_a * (*Vect7) - C_f * (*Vect1) + C_d * (*Vect3) - C_c * (*Vect5); + float Y04P2b6ePP = Y04P + Y2b6eP; + float Y04P2b6ePM = Y04P - Y2b6eP; + float Y7f1aP3c5dPP = C_f * (*Vect7) + C_a * (*Vect1) + C_c * (*Vect3) + C_d * (*Vect5); + float Y7a1fM3d5cMP = C_a * (*Vect7) - C_f * (*Vect1) + C_d * (*Vect3) - C_c * (*Vect5); - float Y04M = (*Vect0) - (*Vect4); - float Y2e6bM = C_e * (*Vect2) - C_b * (*Vect6); + float Y04M = (*Vect0) - (*Vect4); + float Y2e6bM = C_e * (*Vect2) - C_b * (*Vect6); - float Y04M2e6bMP = Y04M + Y2e6bM; - float Y04M2e6bMM = Y04M - Y2e6bM; - float Y1c7dM3f5aPM = - C_c * (*Vect1) - C_d * (*Vect7) - C_f * (*Vect3) - C_a * (*Vect5); - float Y1d7cP3a5fMM = - C_d * (*Vect1) + C_c * (*Vect7) - C_a * (*Vect3) + C_f 
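A quick sanity check on CUDAsubroutineInplaceDCTvector: expanding X07P34PP + X16P25PP gives (x0 + x7) + (x3 + x4) + (x1 + x6) + (x2 + x5), so the first output reduces to the DC term

\[
(*\texttt{Vect0}) = \texttt{C\_norm}\,(X07P34PP + X16P25PP) = \frac{1}{\sqrt{8}} \sum_{i=0}^{7} x_i,
\]

which agrees with the all-equal k = 0 column of DCTv8matrix. The remaining outputs are the standard even/odd decomposition of the 8-point DCT-II built from the C_a through C_f cosines defined above.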
* (*Vect5); + float Y04M2e6bMP = Y04M + Y2e6bM; + float Y04M2e6bMM = Y04M - Y2e6bM; + float Y1c7dM3f5aPM = C_c * (*Vect1) - C_d * (*Vect7) - C_f * (*Vect3) - C_a * (*Vect5); + float Y1d7cP3a5fMM = C_d * (*Vect1) + C_c * (*Vect7) - C_a * (*Vect3) + C_f * (*Vect5); - (*Vect0) = C_norm * (Y04P2b6ePP + Y7f1aP3c5dPP); - (*Vect7) = C_norm * (Y04P2b6ePP - Y7f1aP3c5dPP); - (*Vect4) = C_norm * (Y04P2b6ePM + Y7a1fM3d5cMP); - (*Vect3) = C_norm * (Y04P2b6ePM - Y7a1fM3d5cMP); + (*Vect0) = C_norm * (Y04P2b6ePP + Y7f1aP3c5dPP); + (*Vect7) = C_norm * (Y04P2b6ePP - Y7f1aP3c5dPP); + (*Vect4) = C_norm * (Y04P2b6ePM + Y7a1fM3d5cMP); + (*Vect3) = C_norm * (Y04P2b6ePM - Y7a1fM3d5cMP); - (*Vect1) = C_norm * (Y04M2e6bMP + Y1c7dM3f5aPM); - (*Vect5) = C_norm * (Y04M2e6bMM - Y1d7cP3a5fMM); - (*Vect2) = C_norm * (Y04M2e6bMM + Y1d7cP3a5fMM); - (*Vect6) = C_norm * (Y04M2e6bMP - Y1c7dM3f5aPM); + (*Vect1) = C_norm * (Y04M2e6bMP + Y1c7dM3f5aPM); + (*Vect5) = C_norm * (Y04M2e6bMM - Y1d7cP3a5fMM); + (*Vect2) = C_norm * (Y04M2e6bMM + Y1d7cP3a5fMM); + (*Vect6) = C_norm * (Y04M2e6bMP - Y1c7dM3f5aPM); } /** @@ -192,40 +190,38 @@ __device__ void CUDAsubroutineInplaceIDCTvector(float *Vect0, int Step) { * \return None */ -__global__ void CUDAkernel2DCT(float *dst, float *src, int ImgStride) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void CUDAkernel2DCT(float *dst, float *src, int ImgStride) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE]; + __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE]; - int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x; - int OffsThreadInCol = threadIdx.z * BLOCK_SIZE; - src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + - blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow; - dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + - blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow; - float *bl_ptr = - block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow; + int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x; + int OffsThreadInCol = threadIdx.z * BLOCK_SIZE; + src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + + OffsThreadInRow; + dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + + OffsThreadInRow; + float *bl_ptr = block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow; #pragma unroll - for (unsigned int i = 0; i < BLOCK_SIZE; i++) - bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride]; + for (unsigned int i = 0; i < BLOCK_SIZE; i++) + bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride]; - cg::sync(cta); - // process rows - CUDAsubroutineInplaceDCTvector( - block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + - OffsThreadInRow - threadIdx.x, - 1); + cg::sync(cta); + // process rows + CUDAsubroutineInplaceDCTvector( + block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow - threadIdx.x, 1); - cg::sync(cta); - // process columns - CUDAsubroutineInplaceDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE); + cg::sync(cta); + // process columns + CUDAsubroutineInplaceDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE); - cg::sync(cta); - for (unsigned int i = 0; i < BLOCK_SIZE; i++) - dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE]; + cg::sync(cta); + for (unsigned int i = 0; i < BLOCK_SIZE; i++) + dst[i * ImgStride] 
= bl_ptr[i * KER2_SMEMBLOCK_STRIDE]; } /** @@ -242,39 +238,37 @@ __global__ void CUDAkernel2DCT(float *dst, float *src, int ImgStride) { * \return None */ -__global__ void CUDAkernel2IDCT(float *dst, float *src, int ImgStride) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void CUDAkernel2IDCT(float *dst, float *src, int ImgStride) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE]; + __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE]; - int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x; - int OffsThreadInCol = threadIdx.z * BLOCK_SIZE; - src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + - blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow; - dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + - blockIdx.x * KER2_BLOCK_WIDTH + OffsThreadInRow; - float *bl_ptr = - block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow; + int OffsThreadInRow = threadIdx.y * BLOCK_SIZE + threadIdx.x; + int OffsThreadInCol = threadIdx.z * BLOCK_SIZE; + src += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + + OffsThreadInRow; + dst += FMUL(blockIdx.y * KER2_BLOCK_HEIGHT + OffsThreadInCol, ImgStride) + blockIdx.x * KER2_BLOCK_WIDTH + + OffsThreadInRow; + float *bl_ptr = block + OffsThreadInCol * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow; #pragma unroll - for (unsigned int i = 0; i < BLOCK_SIZE; i++) - bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride]; + for (unsigned int i = 0; i < BLOCK_SIZE; i++) + bl_ptr[i * KER2_SMEMBLOCK_STRIDE] = src[i * ImgStride]; - cg::sync(cta); - // process rows - CUDAsubroutineInplaceIDCTvector( - block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + - OffsThreadInRow - threadIdx.x, - 1); + cg::sync(cta); + // process rows + CUDAsubroutineInplaceIDCTvector( + block + (OffsThreadInCol + threadIdx.x) * KER2_SMEMBLOCK_STRIDE + OffsThreadInRow - threadIdx.x, 1); - cg::sync(cta); - // process columns - CUDAsubroutineInplaceIDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE); + cg::sync(cta); + // process columns + CUDAsubroutineInplaceIDCTvector(bl_ptr, KER2_SMEMBLOCK_STRIDE); - cg::sync(cta); + cg::sync(cta); - for (unsigned int i = 0; i < BLOCK_SIZE; i++) - dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE]; + for (unsigned int i = 0; i < BLOCK_SIZE; i++) + dst[i * ImgStride] = bl_ptr[i * KER2_SMEMBLOCK_STRIDE]; } diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_quantization.cuh b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_quantization.cuh index c1d35a62..16405d83 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_quantization.cuh +++ b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_quantization.cuh @@ -41,18 +41,11 @@ namespace cg = cooperative_groups; #include "Common.h" /** -* JPEG quality=0_of_12 quantization matrix -*/ -__constant__ short Q[] = { - 32, 33, 51, 81, 66, 39, 34, 17, - 33, 36, 48, 47, 28, 23, 12, 12, - 51, 48, 47, 28, 23, 12, 12, 12, - 81, 47, 28, 23, 12, 12, 12, 12, - 66, 28, 23, 12, 12, 12, 12, 12, - 39, 23, 12, 12, 12, 12, 12, 12, - 34, 12, 12, 12, 12, 12, 12, 12, - 17, 12, 12, 12, 12, 12, 12, 12 -}; + * JPEG quality=0_of_12 quantization matrix + */ +__constant__ short Q[] = {32, 33, 51, 81, 66, 39, 34, 17, 33, 36, 48, 47, 28, 23, 12, 12, 51, 48, 47, 28, 23, 12, + 12, 12, 81, 47, 28, 23, 12, 12, 12, 12, 66, 28, 23, 12, 
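The indexing in CUDAkernel2DCT and CUDAkernel2IDCT fixes the launch shape: threadIdx.x and threadIdx.y jointly address a 32-element row, threadIdx.z selects an 8-row slice, and each block covers one 32x16 tile. A launch sketch consistent with that indexing, assuming BLOCK_SIZE == 8, image dimensions divisible by the tile size, and hypothetical host-side names (d_dst, d_src, ImgWidth, ImgHeight):

    dim3 threads(BLOCK_SIZE,                      // x: lane within an 8-point vector
                 KER2_BLOCK_WIDTH / BLOCK_SIZE,   // y: 8-lane group within the 32-wide tile
                 KER2_BLOCK_HEIGHT / BLOCK_SIZE); // z: 8-row slice of the 16-tall tile
    dim3 grid(ImgWidth / KER2_BLOCK_WIDTH, ImgHeight / KER2_BLOCK_HEIGHT);
    CUDAkernel2DCT<<<grid, threads>>>(d_dst, d_src, ImgStride);

The shared buffer is padded by one column (KER2_SMEMBLOCK_STRIDE = KER2_BLOCK_WIDTH + 1) so that the column-wise DCT passes step through distinct shared-memory banks.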
12, 12, 12, 12, 39, 23, 12, 12, + 12, 12, 12, 12, 34, 12, 12, 12, 12, 12, 12, 12, 17, 12, 12, 12, 12, 12, 12, 12}; /** ************************************************************************** @@ -64,26 +57,26 @@ __constant__ short Q[] = { * * \return None */ -__global__ void CUDAkernelQuantizationFloat(float *SrcDst, int Stride) { - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; +__global__ void CUDAkernelQuantizationFloat(float *SrcDst, int Stride) +{ + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; - // Thread index (current coefficient) - int tx = threadIdx.x; - int ty = threadIdx.y; + // Thread index (current coefficient) + int tx = threadIdx.x; + int ty = threadIdx.y; - // copy current coefficient to the local variable - float curCoef = - SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)]; - float curQuant = (float)Q[ty * BLOCK_SIZE + tx]; + // copy current coefficient to the local variable + float curCoef = SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)]; + float curQuant = (float)Q[ty * BLOCK_SIZE + tx]; - // quantize the current coefficient - float quantized = roundf(curCoef / curQuant); - curCoef = quantized * curQuant; + // quantize the current coefficient + float quantized = roundf(curCoef / curQuant); + curCoef = quantized * curQuant; - // copy quantized coefficient back to the DCT-plane - SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)] = curCoef; + // copy quantized coefficient back to the DCT-plane + SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)] = curCoef; } /** @@ -96,37 +89,38 @@ __global__ void CUDAkernelQuantizationFloat(float *SrcDst, int Stride) { * * \return None */ -__global__ void CUDAkernelQuantizationShort(short *SrcDst, int Stride) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; +__global__ void CUDAkernelQuantizationShort(short *SrcDst, int Stride) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; - // Thread index (current coefficient) - int tx = threadIdx.x; - int ty = threadIdx.y; + // Thread index (current coefficient) + int tx = threadIdx.x; + int ty = threadIdx.y; - // copy current coefficient to the local variable - short curCoef = - SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)]; - short curQuant = Q[ty * BLOCK_SIZE + tx]; + // copy current coefficient to the local variable + short curCoef = SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)]; + short curQuant = Q[ty * BLOCK_SIZE + tx]; - // quantize the current coefficient - if (curCoef < 0) { - curCoef = -curCoef; - curCoef += curQuant >> 1; - curCoef /= curQuant; - curCoef = -curCoef; - } else { - curCoef += curQuant >> 1; - curCoef /= curQuant; - } + // quantize the current coefficient + if (curCoef < 0) { + curCoef = -curCoef; + curCoef += curQuant >> 1; + curCoef /= curQuant; + curCoef = -curCoef; + } + else { + curCoef += curQuant >> 1; + curCoef /= curQuant; + } - cg::sync(cta); + cg::sync(cta); - curCoef = curCoef * curQuant; + curCoef = curCoef * curQuant; - // copy quantized coefficient back to the DCT-plane - SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)] = curCoef; + // copy quantized coefficient back to the DCT-plane + SrcDst[(by * BLOCK_SIZE + ty) * Stride + (bx * BLOCK_SIZE + tx)] = curCoef; } diff --git 
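Both quantization kernels round each coefficient to the nearest multiple of the matching Q entry; CUDAkernelQuantizationShort does so in integer arithmetic with an explicit half-step and sign handling. A scalar model of that arithmetic with one worked value (an illustration, not code from the sample):

    // Round c to the nearest multiple of q, half away from zero.
    short QuantizeModel(short c, short q)
    {
        short m = (c < 0) ? (short)(-((-c + (q >> 1)) / q)) : (short)((c + (q >> 1)) / q);
        return (short)(m * q);
    }
    // c = 100, q = 32: (100 + 16) / 32 = 3, so the result is 96; the float
    // kernel's roundf(100.0f / 32.0f) * 32.0f yields the same value.
    // c = -100: negate, (100 + 16) / 32 = 3, negate back, result -96.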
a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_short.cuh b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_short.cuh index fbe51063..a0bcf73a 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_short.cuh +++ b/Samples/2_Concepts_and_Techniques/dct8x8/dct8x8_kernel_short.cuh @@ -46,33 +46,33 @@ namespace cg = cooperative_groups; #include "Common.h" /** -* Width of data block (short kernel) -*/ + * Width of data block (short kernel) + */ #define KERS_BLOCK_WIDTH 32 /** -* Height of data block (short kernel) -*/ + * Height of data block (short kernel) + */ #define KERS_BLOCK_HEIGHT 32 /** -* LOG2 of width of data block (short kernel) -*/ + * LOG2 of width of data block (short kernel) + */ #define KERS_BW_LOG2 5 /** -* LOG2 of height of data block (short kernel) -*/ + * LOG2 of height of data block (short kernel) + */ #define KERS_BH_LOG2 5 /** -* Stride of shared memory buffer (short kernel) -*/ + * Stride of shared memory buffer (short kernel) + */ #define KERS_SMEMBLOCK_STRIDE (KERS_BLOCK_WIDTH + 2) /** -* Half of data block width (short kernel) -*/ + * Half of data block width (short kernel) + */ #define KERS_BLOCK_WIDTH_HALF (KERS_BLOCK_WIDTH / 2) #define SIN_1_4 0x5A82 @@ -91,25 +91,27 @@ namespace cg = cooperative_groups; #define OCOS_7_16 0x063E /** -* Package of 2 shorts into 1 int - designed to perform i/o by integers to avoid -* bank conflicts -*/ -union PackedShorts { - struct __align__(8) { - short hShort1; - short hShort2; - }; - unsigned int hInt; + * Package of 2 shorts into 1 int - designed to perform i/o by integers to avoid + * bank conflicts + */ +union PackedShorts +{ + struct __align__(8) + { + short hShort1; + short hShort2; + }; + unsigned int hInt; }; /** -* Converts fixed point value to short value -*/ + * Converts fixed point value to short value + */ __device__ inline short unfixh(int x) { return (short)((x + 0x8000) >> 16); } /** -* Converts fixed point value to short value -*/ + * Converts fixed point value to short value + */ __device__ inline int unfixo(int x) { return (x + 0x1000) >> 13; } /** @@ -122,74 +124,75 @@ __device__ inline int unfixo(int x) { return (x + 0x1000) >> 13; } * * \return None */ -__device__ void CUDAshortInplaceDCT(short *SrcDst, int Stride) { - int in0, in1, in2, in3, in4, in5, in6, in7; - int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int tmp10, tmp11, tmp12, tmp13; - int tmp14, tmp15, tmp16, tmp17; - int tmp25, tmp26; +__device__ void CUDAshortInplaceDCT(short *SrcDst, int Stride) +{ + int in0, in1, in2, in3, in4, in5, in6, in7; + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int tmp14, tmp15, tmp16, tmp17; + int tmp25, tmp26; - int DoubleStride = Stride << 1; + int DoubleStride = Stride << 1; - short *DstPtr = SrcDst; - in0 = *DstPtr; - DstPtr += Stride; - in1 = *DstPtr; - DstPtr += Stride; - in2 = *DstPtr; - DstPtr += Stride; - in3 = *DstPtr; - DstPtr += Stride; - in4 = *DstPtr; - DstPtr += Stride; - in5 = *DstPtr; - DstPtr += Stride; - in6 = *DstPtr; - DstPtr += Stride; - in7 = *DstPtr; + short *DstPtr = SrcDst; + in0 = *DstPtr; + DstPtr += Stride; + in1 = *DstPtr; + DstPtr += Stride; + in2 = *DstPtr; + DstPtr += Stride; + in3 = *DstPtr; + DstPtr += Stride; + in4 = *DstPtr; + DstPtr += Stride; + in5 = *DstPtr; + DstPtr += Stride; + in6 = *DstPtr; + DstPtr += Stride; + in7 = *DstPtr; - tmp0 = in7 + in0; - tmp1 = in6 + in1; - tmp2 = in5 + in2; - tmp3 = in4 + in3; - tmp4 = in3 - in4; - tmp5 = in2 - in5; - tmp6 = in1 - in6; - tmp7 = in0 - in7; + tmp0 = in7 
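The SIN_*, COS_*, OSIN_* and OCOS_* constants above are Q15 fixed-point samples of the DCT cosines; for instance 0x5A82 = 23170, which is approximately sin(pi/4) * 2^15. The two helpers then round the fractional bits away:

\[
\texttt{unfixh}(x) = \left\lfloor \frac{x + 2^{15}}{2^{16}} \right\rfloor \approx \operatorname{round}\left(\frac{x}{2^{16}}\right),
\qquad
\texttt{unfixo}(x) = \left\lfloor \frac{x + 2^{12}}{2^{13}} \right\rfloor,
\]

so unfixo keeps three extra fractional bits for intermediate butterfly terms; the tmp4 <<= 2 and tmp7 <<= 2 shifts in the DCT body appear to align the purely integer terms with that scaling. The PackedShorts union, per its own comment, lets each thread move two 16-bit values in one 32-bit transaction to avoid bank conflicts.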
+ in0; + tmp1 = in6 + in1; + tmp2 = in5 + in2; + tmp3 = in4 + in3; + tmp4 = in3 - in4; + tmp5 = in2 - in5; + tmp6 = in1 - in6; + tmp7 = in0 - in7; - tmp10 = tmp3 + tmp0; - tmp11 = tmp2 + tmp1; - tmp12 = tmp1 - tmp2; - tmp13 = tmp0 - tmp3; + tmp10 = tmp3 + tmp0; + tmp11 = tmp2 + tmp1; + tmp12 = tmp1 - tmp2; + tmp13 = tmp0 - tmp3; - tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4)); - tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4)); + tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4)); + tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4)); - tmp4 <<= 2; - tmp7 <<= 2; + tmp4 <<= 2; + tmp7 <<= 2; - tmp14 = tmp4 + tmp15; - tmp25 = tmp4 - tmp15; - tmp26 = tmp7 - tmp16; - tmp17 = tmp7 + tmp16; + tmp14 = tmp4 + tmp15; + tmp25 = tmp4 - tmp15; + tmp26 = tmp7 - tmp16; + tmp17 = tmp7 + tmp16; - DstPtr = SrcDst; - *DstPtr = unfixh(FMUL(tmp10 + tmp11, SIN_1_4)); - DstPtr += DoubleStride; - *DstPtr = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8)); - DstPtr += DoubleStride; - *DstPtr = unfixh(FMUL(tmp10 - tmp11, COS_1_4)); - DstPtr += DoubleStride; - *DstPtr = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8)); + DstPtr = SrcDst; + *DstPtr = unfixh(FMUL(tmp10 + tmp11, SIN_1_4)); + DstPtr += DoubleStride; + *DstPtr = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8)); + DstPtr += DoubleStride; + *DstPtr = unfixh(FMUL(tmp10 - tmp11, COS_1_4)); + DstPtr += DoubleStride; + *DstPtr = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8)); - DstPtr = SrcDst + Stride; - *DstPtr = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16)); - DstPtr += DoubleStride; - *DstPtr = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16)); - DstPtr += DoubleStride; - *DstPtr = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16)); - DstPtr += DoubleStride; - *DstPtr = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16)); + DstPtr = SrcDst + Stride; + *DstPtr = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16)); + DstPtr += DoubleStride; + *DstPtr = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16)); + DstPtr += DoubleStride; + *DstPtr = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16)); + DstPtr += DoubleStride; + *DstPtr = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16)); } /** @@ -201,67 +204,68 @@ __device__ void CUDAshortInplaceDCT(short *SrcDst, int Stride) { * * \return None */ -__device__ void CUDAshortInplaceDCT(unsigned int *V8) { - int in0, in1, in2, in3, in4, in5, in6, in7; - int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int tmp10, tmp11, tmp12, tmp13; - int tmp14, tmp15, tmp16, tmp17; - int tmp25, tmp26; - PackedShorts sh0, sh1, sh2, sh3; +__device__ void CUDAshortInplaceDCT(unsigned int *V8) +{ + int in0, in1, in2, in3, in4, in5, in6, in7; + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int tmp14, tmp15, tmp16, tmp17; + int tmp25, tmp26; + PackedShorts sh0, sh1, sh2, sh3; - sh0.hInt = V8[0]; - sh1.hInt = V8[1]; - sh2.hInt = V8[2]; - sh3.hInt = V8[3]; - in0 = sh0.hShort1; - in1 = sh0.hShort2; - in2 = sh1.hShort1; - in3 = sh1.hShort2; - in4 = sh2.hShort1; - in5 = sh2.hShort2; - in6 = sh3.hShort1; - in7 = sh3.hShort2; + sh0.hInt = V8[0]; + sh1.hInt = V8[1]; + sh2.hInt = V8[2]; + sh3.hInt = V8[3]; + in0 = sh0.hShort1; + in1 = sh0.hShort2; + in2 = sh1.hShort1; + in3 = sh1.hShort2; + in4 = sh2.hShort1; + in5 = sh2.hShort2; + in6 = sh3.hShort1; + in7 = sh3.hShort2; - tmp0 = in7 + in0; - tmp1 = in6 + in1; - tmp2 = in5 + in2; - tmp3 = in4 + in3; - tmp4 = in3 - in4; - tmp5 = in2 - in5; - tmp6 = in1 - in6; - tmp7 = in0 - in7; + tmp0 = in7 + in0; + tmp1 = in6 + in1; + 
tmp2 = in5 + in2; + tmp3 = in4 + in3; + tmp4 = in3 - in4; + tmp5 = in2 - in5; + tmp6 = in1 - in6; + tmp7 = in0 - in7; - tmp10 = tmp3 + tmp0; - tmp11 = tmp2 + tmp1; - tmp12 = tmp1 - tmp2; - tmp13 = tmp0 - tmp3; + tmp10 = tmp3 + tmp0; + tmp11 = tmp2 + tmp1; + tmp12 = tmp1 - tmp2; + tmp13 = tmp0 - tmp3; - sh0.hShort1 = unfixh(FMUL(tmp10 + tmp11, SIN_1_4)); - sh2.hShort1 = unfixh(FMUL(tmp10 - tmp11, COS_1_4)); + sh0.hShort1 = unfixh(FMUL(tmp10 + tmp11, SIN_1_4)); + sh2.hShort1 = unfixh(FMUL(tmp10 - tmp11, COS_1_4)); - sh1.hShort1 = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8)); - sh3.hShort1 = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8)); + sh1.hShort1 = unfixh(FMUL(tmp13, COS_1_8) + FMUL(tmp12, SIN_1_8)); + sh3.hShort1 = unfixh(FMUL(tmp13, SIN_1_8) - FMUL(tmp12, COS_1_8)); - tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4)); - tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4)); + tmp16 = unfixo(FMUL(tmp6 + tmp5, SIN_1_4)); + tmp15 = unfixo(FMUL(tmp6 - tmp5, COS_1_4)); - tmp4 <<= 2; - tmp7 <<= 2; + tmp4 <<= 2; + tmp7 <<= 2; - tmp14 = tmp4 + tmp15; - tmp25 = tmp4 - tmp15; - tmp26 = tmp7 - tmp16; - tmp17 = tmp7 + tmp16; + tmp14 = tmp4 + tmp15; + tmp25 = tmp4 - tmp15; + tmp26 = tmp7 - tmp16; + tmp17 = tmp7 + tmp16; - sh0.hShort2 = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16)); - sh3.hShort2 = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16)); - sh2.hShort2 = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16)); - sh1.hShort2 = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16)); + sh0.hShort2 = unfixh(FMUL(tmp17, OCOS_1_16) + FMUL(tmp14, OSIN_1_16)); + sh3.hShort2 = unfixh(FMUL(tmp17, OCOS_7_16) - FMUL(tmp14, OSIN_7_16)); + sh2.hShort2 = unfixh(FMUL(tmp26, OCOS_5_16) + FMUL(tmp25, OSIN_5_16)); + sh1.hShort2 = unfixh(FMUL(tmp26, OCOS_3_16) - FMUL(tmp25, OSIN_3_16)); - V8[0] = sh0.hInt; - V8[1] = sh1.hInt; - V8[2] = sh2.hInt; - V8[3] = sh3.hInt; + V8[0] = sh0.hInt; + V8[1] = sh1.hInt; + V8[2] = sh2.hInt; + V8[3] = sh3.hInt; } /** @@ -274,73 +278,74 @@ __device__ void CUDAshortInplaceDCT(unsigned int *V8) { * * \return None */ -__device__ void CUDAshortInplaceIDCT(short *SrcDst, int Stride) { - int in0, in1, in2, in3, in4, in5, in6, in7; - int tmp10, tmp11, tmp12, tmp13; - int tmp20, tmp21, tmp22, tmp23; - int tmp30, tmp31; - int tmp40, tmp41, tmp42, tmp43; - int tmp50, tmp51, tmp52, tmp53; +__device__ void CUDAshortInplaceIDCT(short *SrcDst, int Stride) +{ + int in0, in1, in2, in3, in4, in5, in6, in7; + int tmp10, tmp11, tmp12, tmp13; + int tmp20, tmp21, tmp22, tmp23; + int tmp30, tmp31; + int tmp40, tmp41, tmp42, tmp43; + int tmp50, tmp51, tmp52, tmp53; - short *DstPtr = SrcDst; - in0 = *DstPtr; - DstPtr += Stride; - in1 = *DstPtr; - DstPtr += Stride; - in2 = *DstPtr; - DstPtr += Stride; - in3 = *DstPtr; - DstPtr += Stride; - in4 = *DstPtr; - DstPtr += Stride; - in5 = *DstPtr; - DstPtr += Stride; - in6 = *DstPtr; - DstPtr += Stride; - in7 = *DstPtr; + short *DstPtr = SrcDst; + in0 = *DstPtr; + DstPtr += Stride; + in1 = *DstPtr; + DstPtr += Stride; + in2 = *DstPtr; + DstPtr += Stride; + in3 = *DstPtr; + DstPtr += Stride; + in4 = *DstPtr; + DstPtr += Stride; + in5 = *DstPtr; + DstPtr += Stride; + in6 = *DstPtr; + DstPtr += Stride; + in7 = *DstPtr; - tmp10 = FMUL(in0 + in4, COS_1_4); - tmp11 = FMUL(in0 - in4, COS_1_4); - tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8); - tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8); + tmp10 = FMUL(in0 + in4, COS_1_4); + tmp11 = FMUL(in0 - in4, COS_1_4); + tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8); + tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, 
COS_1_8); - tmp20 = tmp10 + tmp13; - tmp21 = tmp11 + tmp12; - tmp22 = tmp11 - tmp12; - tmp23 = tmp10 - tmp13; + tmp20 = tmp10 + tmp13; + tmp21 = tmp11 + tmp12; + tmp22 = tmp11 - tmp12; + tmp23 = tmp10 - tmp13; - tmp30 = unfixo(FMUL(in3 + in5, COS_1_4)); - tmp31 = unfixo(FMUL(in3 - in5, COS_1_4)); + tmp30 = unfixo(FMUL(in3 + in5, COS_1_4)); + tmp31 = unfixo(FMUL(in3 - in5, COS_1_4)); - in1 <<= 2; - in7 <<= 2; + in1 <<= 2; + in7 <<= 2; - tmp40 = in1 + tmp30; - tmp41 = in7 + tmp31; - tmp42 = in1 - tmp30; - tmp43 = in7 - tmp31; + tmp40 = in1 + tmp30; + tmp41 = in7 + tmp31; + tmp42 = in1 - tmp30; + tmp43 = in7 - tmp31; - tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16); - tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16); - tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16); - tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16); + tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16); + tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16); + tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16); + tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16); - DstPtr = SrcDst; - *DstPtr = unfixh(tmp20 + tmp50); - DstPtr += Stride; - *DstPtr = unfixh(tmp21 + tmp53); - DstPtr += Stride; - *DstPtr = unfixh(tmp22 + tmp52); - DstPtr += Stride; - *DstPtr = unfixh(tmp23 + tmp51); - DstPtr += Stride; - *DstPtr = unfixh(tmp23 - tmp51); - DstPtr += Stride; - *DstPtr = unfixh(tmp22 - tmp52); - DstPtr += Stride; - *DstPtr = unfixh(tmp21 - tmp53); - DstPtr += Stride; - *DstPtr = unfixh(tmp20 - tmp50); + DstPtr = SrcDst; + *DstPtr = unfixh(tmp20 + tmp50); + DstPtr += Stride; + *DstPtr = unfixh(tmp21 + tmp53); + DstPtr += Stride; + *DstPtr = unfixh(tmp22 + tmp52); + DstPtr += Stride; + *DstPtr = unfixh(tmp23 + tmp51); + DstPtr += Stride; + *DstPtr = unfixh(tmp23 - tmp51); + DstPtr += Stride; + *DstPtr = unfixh(tmp22 - tmp52); + DstPtr += Stride; + *DstPtr = unfixh(tmp21 - tmp53); + DstPtr += Stride; + *DstPtr = unfixh(tmp20 - tmp50); } /** @@ -352,67 +357,68 @@ __device__ void CUDAshortInplaceIDCT(short *SrcDst, int Stride) { * * \return None */ -__device__ void CUDAshortInplaceIDCT(unsigned int *V8) { - int in0, in1, in2, in3, in4, in5, in6, in7; - int tmp10, tmp11, tmp12, tmp13; - int tmp20, tmp21, tmp22, tmp23; - int tmp30, tmp31; - int tmp40, tmp41, tmp42, tmp43; - int tmp50, tmp51, tmp52, tmp53; - PackedShorts sh0, sh1, sh2, sh3; +__device__ void CUDAshortInplaceIDCT(unsigned int *V8) +{ + int in0, in1, in2, in3, in4, in5, in6, in7; + int tmp10, tmp11, tmp12, tmp13; + int tmp20, tmp21, tmp22, tmp23; + int tmp30, tmp31; + int tmp40, tmp41, tmp42, tmp43; + int tmp50, tmp51, tmp52, tmp53; + PackedShorts sh0, sh1, sh2, sh3; - sh0.hInt = V8[0]; - sh1.hInt = V8[1]; - sh2.hInt = V8[2]; - sh3.hInt = V8[3]; - in0 = sh0.hShort1; - in1 = sh0.hShort2; - in2 = sh1.hShort1; - in3 = sh1.hShort2; - in4 = sh2.hShort1; - in5 = sh2.hShort2; - in6 = sh3.hShort1; - in7 = sh3.hShort2; + sh0.hInt = V8[0]; + sh1.hInt = V8[1]; + sh2.hInt = V8[2]; + sh3.hInt = V8[3]; + in0 = sh0.hShort1; + in1 = sh0.hShort2; + in2 = sh1.hShort1; + in3 = sh1.hShort2; + in4 = sh2.hShort1; + in5 = sh2.hShort2; + in6 = sh3.hShort1; + in7 = sh3.hShort2; - tmp10 = FMUL(in0 + in4, COS_1_4); - tmp11 = FMUL(in0 - in4, COS_1_4); - tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8); - tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8); + tmp10 = FMUL(in0 + in4, COS_1_4); + tmp11 = FMUL(in0 - in4, COS_1_4); + tmp12 = FMUL(in2, SIN_1_8) - FMUL(in6, COS_1_8); + tmp13 = FMUL(in6, SIN_1_8) + FMUL(in2, COS_1_8); - tmp20 = tmp10 + tmp13; - tmp21 
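The write-back order of CUDAshortInplaceIDCT makes the classic IDCT symmetry explicit: outputs k and 7 - k are the sum and difference of one even-part/odd-part pair,

\[
x_k = e_k + o_k, \qquad x_{7-k} = e_k - o_k, \qquad k = 0, \dots, 3,
\]

with e_k = tmp20 ... tmp23 built from the even-frequency inputs (in0, in2, in4, in6) and o_k drawn from tmp50 ... tmp53 in permuted order (tmp50, tmp53, tmp52, tmp51), each rounded by unfixh.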
= tmp11 + tmp12; - tmp22 = tmp11 - tmp12; - tmp23 = tmp10 - tmp13; + tmp20 = tmp10 + tmp13; + tmp21 = tmp11 + tmp12; + tmp22 = tmp11 - tmp12; + tmp23 = tmp10 - tmp13; - tmp30 = unfixo(FMUL(in3 + in5, COS_1_4)); - tmp31 = unfixo(FMUL(in3 - in5, COS_1_4)); + tmp30 = unfixo(FMUL(in3 + in5, COS_1_4)); + tmp31 = unfixo(FMUL(in3 - in5, COS_1_4)); - in1 <<= 2; - in7 <<= 2; + in1 <<= 2; + in7 <<= 2; - tmp40 = in1 + tmp30; - tmp41 = in7 + tmp31; - tmp42 = in1 - tmp30; - tmp43 = in7 - tmp31; + tmp40 = in1 + tmp30; + tmp41 = in7 + tmp31; + tmp42 = in1 - tmp30; + tmp43 = in7 - tmp31; - tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16); - tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16); - tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16); - tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16); + tmp50 = FMUL(tmp40, OCOS_1_16) + FMUL(tmp41, OSIN_1_16); + tmp51 = FMUL(tmp40, OSIN_1_16) - FMUL(tmp41, OCOS_1_16); + tmp52 = FMUL(tmp42, OCOS_5_16) + FMUL(tmp43, OSIN_5_16); + tmp53 = FMUL(tmp42, OSIN_5_16) - FMUL(tmp43, OCOS_5_16); - sh0.hShort1 = unfixh(tmp20 + tmp50); - sh0.hShort2 = unfixh(tmp21 + tmp53); - sh1.hShort1 = unfixh(tmp22 + tmp52); - sh1.hShort2 = unfixh(tmp23 + tmp51); - sh2.hShort1 = unfixh(tmp23 - tmp51); - sh2.hShort2 = unfixh(tmp22 - tmp52); - sh3.hShort1 = unfixh(tmp21 - tmp53); - sh3.hShort2 = unfixh(tmp20 - tmp50); + sh0.hShort1 = unfixh(tmp20 + tmp50); + sh0.hShort2 = unfixh(tmp21 + tmp53); + sh1.hShort1 = unfixh(tmp22 + tmp52); + sh1.hShort2 = unfixh(tmp23 + tmp51); + sh2.hShort1 = unfixh(tmp23 - tmp51); + sh2.hShort2 = unfixh(tmp22 - tmp52); + sh3.hShort1 = unfixh(tmp21 - tmp53); + sh3.hShort2 = unfixh(tmp20 - tmp50); - V8[0] = sh0.hInt; - V8[1] = sh1.hInt; - V8[2] = sh2.hInt; - V8[3] = sh3.hInt; + V8[0] = sh0.hInt; + V8[1] = sh1.hInt; + V8[2] = sh2.hInt; + V8[3] = sh3.hInt; } /** @@ -431,53 +437,46 @@ __device__ void CUDAshortInplaceIDCT(unsigned int *V8) { */ #define IMAD(a, b, c) (((a) * (b)) + (c)) -#define IMUL(a, b) ((a) * (b)) +#define IMUL(a, b) ((a) * (b)) -__global__ void CUDAkernelShortDCT(short *SrcDst, int ImgStride) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ short block[KERS_BLOCK_HEIGHT * KERS_SMEMBLOCK_STRIDE]; - int OffsThreadInRow = FMUL(threadIdx.y, BLOCK_SIZE) + threadIdx.x; - int OffsThreadInCol = FMUL(threadIdx.z, BLOCK_SIZE); - int OffsThrRowPermuted = - (OffsThreadInRow & 0xFFFFFFE0) | - ((OffsThreadInRow << 1) | (OffsThreadInRow >> 4) & 0x1) & 0x1F; +__global__ void CUDAkernelShortDCT(short *SrcDst, int ImgStride) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ short block[KERS_BLOCK_HEIGHT * KERS_SMEMBLOCK_STRIDE]; + int OffsThreadInRow = FMUL(threadIdx.y, BLOCK_SIZE) + threadIdx.x; + int OffsThreadInCol = FMUL(threadIdx.z, BLOCK_SIZE); + int OffsThrRowPermuted = + (OffsThreadInRow & 0xFFFFFFE0) | ((OffsThreadInRow << 1) | (OffsThreadInRow >> 4) & 0x1) & 0x1F; - SrcDst += - IMAD(IMAD(blockIdx.y, KERS_BLOCK_HEIGHT, OffsThreadInCol), ImgStride, - IMAD(blockIdx.x, KERS_BLOCK_WIDTH, OffsThreadInRow * 2)); - short *bl_ptr = - block + IMAD(OffsThreadInCol, KERS_SMEMBLOCK_STRIDE, OffsThreadInRow * 2); + SrcDst += IMAD(IMAD(blockIdx.y, KERS_BLOCK_HEIGHT, OffsThreadInCol), + ImgStride, + IMAD(blockIdx.x, KERS_BLOCK_WIDTH, OffsThreadInRow * 2)); + short *bl_ptr = block + IMAD(OffsThreadInCol, KERS_SMEMBLOCK_STRIDE, OffsThreadInRow * 2); - // load data to shared memory (only first half of threads in each row performs - // data moving (each 
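The OffsThrRowPermuted expression rotates the low five bits of the row offset left by one, so the 32 lanes of a row touch the shared short array in the order 0, 2, 4, ..., 30, 1, 3, ..., 31; a plausible reading is that this keeps the 16-bit column accesses free of shared-memory bank conflicts. An equivalent form with explicit parentheses (hypothetical helper name):

    // Lane r maps to ((r << 1) | (r >> 4)) within its 32-lane group:
    // 0,1,...,15,16,...,31  ->  0,2,...,30,1,...,31
    __host__ __device__ inline int PermuteRowOffset(int r)
    {
        return (r & ~0x1F) | (((r << 1) | ((r >> 4) & 0x1)) & 0x1F);
    }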
thread moves 2 shorts) - if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { + // load data to shared memory (only first half of threads in each row performs + // data moving (each thread moves 2 shorts) + if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) - ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)] = - ((int *)SrcDst)[i * (ImgStride / 2)]; - } + for (int i = 0; i < BLOCK_SIZE; i++) + ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)] = ((int *)SrcDst)[i * (ImgStride / 2)]; + } - cg::sync(cta); - CUDAshortInplaceDCT( - block + OffsThreadInCol * KERS_SMEMBLOCK_STRIDE + OffsThrRowPermuted, - KERS_SMEMBLOCK_STRIDE); - cg::sync(cta); - CUDAshortInplaceDCT((unsigned int *)(block + - OffsThreadInRow * KERS_SMEMBLOCK_STRIDE + - OffsThreadInCol)); - cg::sync(cta); + cg::sync(cta); + CUDAshortInplaceDCT(block + OffsThreadInCol * KERS_SMEMBLOCK_STRIDE + OffsThrRowPermuted, KERS_SMEMBLOCK_STRIDE); + cg::sync(cta); + CUDAshortInplaceDCT((unsigned int *)(block + OffsThreadInRow * KERS_SMEMBLOCK_STRIDE + OffsThreadInCol)); + cg::sync(cta); - // store data to global memory (only first half of threads in each row - // performs data moving (each thread moves 2 shorts) - if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { + // store data to global memory (only first half of threads in each row + // performs data moving (each thread moves 2 shorts) + if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) - ((int *)SrcDst)[i * (ImgStride / 2)] = - ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)]; - } + for (int i = 0; i < BLOCK_SIZE; i++) + ((int *)SrcDst)[i * (ImgStride / 2)] = ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)]; + } } /** @@ -495,50 +494,43 @@ __global__ void CUDAkernelShortDCT(short *SrcDst, int ImgStride) { * \return None */ -__global__ void CUDAkernelShortIDCT(short *SrcDst, int ImgStride) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ short block[KERS_BLOCK_HEIGHT * KERS_SMEMBLOCK_STRIDE]; +__global__ void CUDAkernelShortIDCT(short *SrcDst, int ImgStride) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ short block[KERS_BLOCK_HEIGHT * KERS_SMEMBLOCK_STRIDE]; - int OffsThreadInRow = IMAD(threadIdx.y, BLOCK_SIZE, threadIdx.x); - int OffsThreadInCol = IMUL(threadIdx.z, BLOCK_SIZE); - int OffsThrRowPermuted = - (OffsThreadInRow & 0xFFFFFFE0) | - ((OffsThreadInRow << 1) | (OffsThreadInRow >> 4) & 0x1) & 0x1F; + int OffsThreadInRow = IMAD(threadIdx.y, BLOCK_SIZE, threadIdx.x); + int OffsThreadInCol = IMUL(threadIdx.z, BLOCK_SIZE); + int OffsThrRowPermuted = + (OffsThreadInRow & 0xFFFFFFE0) | ((OffsThreadInRow << 1) | (OffsThreadInRow >> 4) & 0x1) & 0x1F; - SrcDst += - IMAD(IMAD(blockIdx.y, KERS_BLOCK_HEIGHT, OffsThreadInCol), ImgStride, - IMAD(blockIdx.x, KERS_BLOCK_WIDTH, OffsThreadInRow * 2)); - short *bl_ptr = - block + IMAD(OffsThreadInCol, KERS_SMEMBLOCK_STRIDE, OffsThreadInRow * 2); + SrcDst += IMAD(IMAD(blockIdx.y, KERS_BLOCK_HEIGHT, OffsThreadInCol), + ImgStride, + IMAD(blockIdx.x, KERS_BLOCK_WIDTH, OffsThreadInRow * 2)); + short *bl_ptr = block + IMAD(OffsThreadInCol, KERS_SMEMBLOCK_STRIDE, OffsThreadInRow * 2); - // load data to shared memory (only first half of threads in each row performs - // data moving (each thread moves 2 shorts) - if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { + // load data to shared memory (only first half of threads in each row performs + // data moving (each thread 
moves 2 shorts) + if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) - ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)] = - ((int *)SrcDst)[i * (ImgStride / 2)]; - } + for (int i = 0; i < BLOCK_SIZE; i++) + ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)] = ((int *)SrcDst)[i * (ImgStride / 2)]; + } - cg::sync(cta); - CUDAshortInplaceIDCT( - block + OffsThreadInCol * KERS_SMEMBLOCK_STRIDE + OffsThrRowPermuted, - KERS_SMEMBLOCK_STRIDE); - cg::sync(cta); - CUDAshortInplaceIDCT( - (unsigned int *)(block + OffsThreadInRow * KERS_SMEMBLOCK_STRIDE + - OffsThreadInCol)); - cg::sync(cta); + cg::sync(cta); + CUDAshortInplaceIDCT(block + OffsThreadInCol * KERS_SMEMBLOCK_STRIDE + OffsThrRowPermuted, KERS_SMEMBLOCK_STRIDE); + cg::sync(cta); + CUDAshortInplaceIDCT((unsigned int *)(block + OffsThreadInRow * KERS_SMEMBLOCK_STRIDE + OffsThreadInCol)); + cg::sync(cta); - // store data to global memory (only first half of threads in each row - // performs data moving (each thread moves 2 shorts) - if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { + // store data to global memory (only first half of threads in each row + // performs data moving (each thread moves 2 shorts) + if (OffsThreadInRow < KERS_BLOCK_WIDTH_HALF) { #pragma unroll - for (int i = 0; i < BLOCK_SIZE; i++) - ((int *)SrcDst)[i * (ImgStride / 2)] = - ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)]; - } + for (int i = 0; i < BLOCK_SIZE; i++) + ((int *)SrcDst)[i * (ImgStride / 2)] = ((int *)bl_ptr)[i * (KERS_SMEMBLOCK_STRIDE / 2)]; + } } diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large.cuh b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large.cuh index 2eaba7bb..f7249ca1 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large.cuh +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large.cuh @@ -45,75 +45,114 @@ namespace cg = cooperative_groups; //////////////////////////////////////////////////////////////////////////////// //! Write data to global memory //////////////////////////////////////////////////////////////////////////////// -__device__ void writeToGmem( - const unsigned int tid, const unsigned int tid_2, - const unsigned int num_threads_active, const unsigned int num_blocks_mult, - float *g_left_one, float *g_right_one, unsigned int *g_pos_one, - float *g_left_mult, float *g_right_mult, unsigned int *g_left_count_mult, - unsigned int *g_right_count_mult, float *s_left, float *s_right, - unsigned short *s_left_count, unsigned short *s_right_count, - unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum, - unsigned short *s_compaction_list, unsigned short *s_cl_helper, - unsigned int offset_mult_lambda); +__device__ void writeToGmem(const unsigned int tid, + const unsigned int tid_2, + const unsigned int num_threads_active, + const unsigned int num_blocks_mult, + float *g_left_one, + float *g_right_one, + unsigned int *g_pos_one, + float *g_left_mult, + float *g_right_mult, + unsigned int *g_left_count_mult, + unsigned int *g_right_count_mult, + float *s_left, + float *s_right, + unsigned short *s_left_count, + unsigned short *s_right_count, + unsigned int *g_blocks_mult, + unsigned int *g_blocks_mult_sum, + unsigned short *s_compaction_list, + unsigned short *s_cl_helper, + unsigned int offset_mult_lambda); //////////////////////////////////////////////////////////////////////////////// //! 
Perform final stream compaction before writing out data //////////////////////////////////////////////////////////////////////////////// -__device__ void compactStreamsFinal( - const unsigned int tid, const unsigned int tid_2, - const unsigned int num_threads_active, unsigned int &offset_mult_lambda, - float *s_left, float *s_right, unsigned short *s_left_count, - unsigned short *s_right_count, unsigned short *s_cl_one, - unsigned short *s_cl_mult, unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, unsigned int is_one_lambda, - unsigned int is_one_lambda_2, float &left, float &right, float &left_2, - float &right_2, unsigned int &left_count, unsigned int &right_count, - unsigned int &left_count_2, unsigned int &right_count_2, - unsigned int c_block_iend, unsigned int c_sum_block, - unsigned int c_block_iend_2, unsigned int c_sum_block_2, - cg::thread_block cta); +__device__ void compactStreamsFinal(const unsigned int tid, + const unsigned int tid_2, + const unsigned int num_threads_active, + unsigned int &offset_mult_lambda, + float *s_left, + float *s_right, + unsigned short *s_left_count, + unsigned short *s_right_count, + unsigned short *s_cl_one, + unsigned short *s_cl_mult, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + unsigned int is_one_lambda, + unsigned int is_one_lambda_2, + float &left, + float &right, + float &left_2, + float &right_2, + unsigned int &left_count, + unsigned int &right_count, + unsigned int &left_count_2, + unsigned int &right_count_2, + unsigned int c_block_iend, + unsigned int c_sum_block, + unsigned int c_block_iend_2, + unsigned int c_sum_block_2, + cg::thread_block cta); //////////////////////////////////////////////////////////////////////////////// //! Perform scan to compact list of block start addresses //////////////////////////////////////////////////////////////////////////////// -__device__ void scanCompactBlocksStartAddress( - const unsigned int tid, const unsigned int tid_2, - const unsigned int num_threads_compaction, unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, cg::thread_block cta); +__device__ void scanCompactBlocksStartAddress(const unsigned int tid, + const unsigned int tid_2, + const unsigned int num_threads_compaction, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + cg::thread_block cta); //////////////////////////////////////////////////////////////////////////////// //! Perform scan to obtain number of eigenvalues before a specific block //////////////////////////////////////////////////////////////////////////////// -__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2, +__device__ void scanSumBlocks(const unsigned int tid, + const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, - unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, - cg::thread_block cta); + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + cg::thread_block cta); //////////////////////////////////////////////////////////////////////////////// //! Perform initial scan for compaction of intervals containing one and //! 
multiple eigenvalues; also do initial scan to build blocks //////////////////////////////////////////////////////////////////////////////// -__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2, +__device__ void scanInitial(const unsigned int tid, + const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, - unsigned short *s_cl_one, unsigned short *s_cl_mult, - unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, cg::thread_block cta); + unsigned short *s_cl_one, + unsigned short *s_cl_mult, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + cg::thread_block cta); //////////////////////////////////////////////////////////////////////////////// //! Store all non-empty intervals resulting from the subdivision of the interval //! currently processed by the thread //! @param addr address where to store //////////////////////////////////////////////////////////////////////////////// -__device__ void storeNonEmptyIntervalsLarge( - unsigned int addr, const unsigned int num_threads_active, float *s_left, - float *s_right, unsigned short *s_left_count, unsigned short *s_right_count, - float left, float mid, float right, const unsigned short left_count, - const unsigned short mid_count, const unsigned short right_count, - float epsilon, unsigned int &compact_second_chunk, - unsigned short *s_compaction_list, unsigned int &is_active_second); +__device__ void storeNonEmptyIntervalsLarge(unsigned int addr, + const unsigned int num_threads_active, + float *s_left, + float *s_right, + unsigned short *s_left_count, + unsigned short *s_right_count, + float left, + float mid, + float right, + const unsigned short left_count, + const unsigned short mid_count, + const unsigned short right_count, + float epsilon, + unsigned int &compact_second_chunk, + unsigned short *s_compaction_list, + unsigned int &is_active_second); //////////////////////////////////////////////////////////////////////////////// //! Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix @@ -127,679 +166,800 @@ __device__ void storeNonEmptyIntervalsLarge( //! @param lu_eig_count number of eigenvalues that are smaller than \a lu //! 
@param epsilon desired accuracy of eigenvalues to compute //////////////////////////////////////////////////////////////////////////////// -__global__ void bisectKernelLarge( - float *g_d, float *g_s, const unsigned int n, const float lg, - const float ug, const unsigned int lg_eig_count, - const unsigned int ug_eig_count, float epsilon, unsigned int *g_num_one, - unsigned int *g_num_blocks_mult, float *g_left_one, float *g_right_one, - unsigned int *g_pos_one, float *g_left_mult, float *g_right_mult, - unsigned int *g_left_count_mult, unsigned int *g_right_count_mult, - unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - const unsigned int tid = threadIdx.x; +__global__ void bisectKernelLarge(float *g_d, + float *g_s, + const unsigned int n, + const float lg, + const float ug, + const unsigned int lg_eig_count, + const unsigned int ug_eig_count, + float epsilon, + unsigned int *g_num_one, + unsigned int *g_num_blocks_mult, + float *g_left_one, + float *g_right_one, + unsigned int *g_pos_one, + float *g_left_mult, + float *g_right_mult, + unsigned int *g_left_count_mult, + unsigned int *g_right_count_mult, + unsigned int *g_blocks_mult, + unsigned int *g_blocks_mult_sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + const unsigned int tid = threadIdx.x; - // intervals (store left and right because the subdivision tree is in general - // not dense - __shared__ float s_left[2 * MAX_THREADS_BLOCK + 1]; - __shared__ float s_right[2 * MAX_THREADS_BLOCK + 1]; + // intervals (store left and right because the subdivision tree is in general + // not dense + __shared__ float s_left[2 * MAX_THREADS_BLOCK + 1]; + __shared__ float s_right[2 * MAX_THREADS_BLOCK + 1]; - // number of eigenvalues that are smaller than s_left / s_right - // (correspondence is realized via indices) - __shared__ unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1]; - __shared__ unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1]; + // number of eigenvalues that are smaller than s_left / s_right + // (correspondence is realized via indices) + __shared__ unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1]; + __shared__ unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1]; - // helper for stream compaction - __shared__ unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; + // helper for stream compaction + __shared__ unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; - // state variables for whole block - // if 0 then compaction of second chunk of child intervals is not necessary - // (because all intervals had exactly one non-dead child) - __shared__ unsigned int compact_second_chunk; - // if 1 then all threads are converged - __shared__ unsigned int all_threads_converged; + // state variables for whole block + // if 0 then compaction of second chunk of child intervals is not necessary + // (because all intervals had exactly one non-dead child) + __shared__ unsigned int compact_second_chunk; + // if 1 then all threads are converged + __shared__ unsigned int all_threads_converged; - // number of currently active threads - __shared__ unsigned int num_threads_active; + // number of currently active threads + __shared__ unsigned int num_threads_active; - // number of threads to use for stream compaction - __shared__ unsigned int num_threads_compaction; + // number of threads to use for stream compaction + __shared__ unsigned int num_threads_compaction; - // helper 
for exclusive scan - unsigned short *s_compaction_list_exc = s_compaction_list + 1; + // helper for exclusive scan + unsigned short *s_compaction_list_exc = s_compaction_list + 1;

- // variables for currently processed interval - // left and right limit of active interval - float left = 0.0f; - float right = 0.0f; - unsigned int left_count = 0; - unsigned int right_count = 0; - // midpoint of active interval - float mid = 0.0f; - // number of eigenvalues smaller then mid - unsigned int mid_count = 0; - // helper for stream compaction (tracking of threads generating second child) - unsigned int is_active_second = 0; + // variables for currently processed interval + // left and right limit of active interval + float left = 0.0f; + float right = 0.0f; + unsigned int left_count = 0; + unsigned int right_count = 0; + // midpoint of active interval + float mid = 0.0f; + // number of eigenvalues smaller than mid + unsigned int mid_count = 0; + // helper for stream compaction (tracking of threads generating second child) + unsigned int is_active_second = 0;

- // initialize lists - s_compaction_list[tid] = 0; - s_left[tid] = 0; - s_right[tid] = 0; - s_left_count[tid] = 0; - s_right_count[tid] = 0; - - cg::sync(cta); - - // set up initial configuration - if (0 == tid) { - s_left[0] = lg; - s_right[0] = ug; - s_left_count[0] = lg_eig_count; - s_right_count[0] = ug_eig_count; - - compact_second_chunk = 0; - num_threads_active = 1; - - num_threads_compaction = 1; - - all_threads_converged = 1; - } - - cg::sync(cta); - - // for all active threads read intervals from the last level - // the number of (worst case) active threads per level l is 2^l - while (true) { - subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count, - num_threads_active, left, right, left_count, - right_count, mid, all_threads_converged); + // initialize lists + s_compaction_list[tid] = 0; + s_left[tid] = 0; + s_right[tid] = 0; + s_left_count[tid] = 0; + s_right_count[tid] = 0;

cg::sync(cta);

- // check if done - if (1 == all_threads_converged) { - break; - } - - // compute number of eigenvalues smaller than mid - // use all threads for reading the necessary matrix data from global - // memory - // use s_left and s_right as scratch space for diagonal and - // superdiagonal of matrix - mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, threadIdx.x, - num_threads_active, s_left, - s_right, (left == right), cta); - - cg::sync(cta); - - // store intervals - // for all threads store the first child interval in a continuous chunk of - // memory, and the second child interval -- if it exists -- in a second - // chunk; it is likely that all threads reach convergence up to - // \a epsilon at the same level; furthermore, for higher level most / all - // threads will have only one child, storing the first child compactly will - // (first) avoid to perform a compaction step on the first chunk, (second) - // make it for higher levels (when all threads / intervals have - // exactly one child) unnecessary to perform a compaction of the second - // chunk - if (tid < num_threads_active) { - if (left != right) { - // store intervals - storeNonEmptyIntervalsLarge(tid, num_threads_active, s_left, s_right, - s_left_count, s_right_count, left, mid, - right, left_count, mid_count, right_count, - epsilon, compact_second_chunk, - s_compaction_list_exc, is_active_second); - } else { - // re-write converged interval (has to be stored again because s_left - // and s_right are used as scratch space for - //
computeNumSmallerEigenvalsLarge() - s_left[tid] = left; - s_right[tid] = left; - s_left_count[tid] = left_count; - s_right_count[tid] = right_count; - - is_active_second = 0; - } - } - - // necessary so that compact_second_chunk is up-to-date - cg::sync(cta); - - // perform compaction of chunk where second children are stored - // scan of (num_threads_active / 2) elements, thus at most - // (num_threads_active / 4) threads are needed - if (compact_second_chunk > 0) { - // create indices for compaction - createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, - cta); - - compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right, - mid_count, right_count, s_compaction_list, - num_threads_active, is_active_second); - } - - cg::sync(cta); - - // update state variables + // set up initial configuration if (0 == tid) { - // update number of active threads with result of reduction - num_threads_active += s_compaction_list[num_threads_active]; - num_threads_compaction = ceilPow2(num_threads_active); + s_left[0] = lg; + s_right[0] = ug; + s_left_count[0] = lg_eig_count; + s_right_count[0] = ug_eig_count;

- compact_second_chunk = 0; - all_threads_converged = 1; + compact_second_chunk = 0; + num_threads_active = 1; + + num_threads_compaction = 1; + + all_threads_converged = 1; }

cg::sync(cta);

- if (num_threads_compaction > blockDim.x) { - break; + // for all active threads read intervals from the last level + // the number of (worst case) active threads per level l is 2^l + while (true) { + subdivideActiveInterval(tid, + s_left, + s_right, + s_left_count, + s_right_count, + num_threads_active, + left, + right, + left_count, + right_count, + mid, + all_threads_converged); + + cg::sync(cta); + + // check if done + if (1 == all_threads_converged) { + break; + } + + // compute number of eigenvalues smaller than mid + // use all threads for reading the necessary matrix data from global + // memory + // use s_left and s_right as scratch space for diagonal and + // superdiagonal of matrix + mid_count = computeNumSmallerEigenvalsLarge( + g_d, g_s, n, mid, threadIdx.x, num_threads_active, s_left, s_right, (left == right), cta); + + cg::sync(cta); + + // store intervals + // for all threads store the first child interval in a contiguous chunk of + // memory, and the second child interval -- if it exists -- in a second + // chunk; it is likely that all threads reach convergence up to + // \a epsilon at the same level; furthermore, for higher levels most / all + // threads will have only one child, storing the first child compactly will + // (first) avoid performing a compaction step on the first chunk, (second) + // make it unnecessary at higher levels (when all threads / intervals have + // exactly one child) to perform a compaction of the second + // chunk + if (tid < num_threads_active) { + if (left != right) { + // store intervals + storeNonEmptyIntervalsLarge(tid, + num_threads_active, + s_left, + s_right, + s_left_count, + s_right_count, + left, + mid, + right, + left_count, + mid_count, + right_count, + epsilon, + compact_second_chunk, + s_compaction_list_exc, + is_active_second); + } + else { + // re-write converged interval (has to be stored again because s_left + // and s_right are used as scratch space for + // computeNumSmallerEigenvalsLarge()) + s_left[tid] = left; + s_right[tid] = left; + s_left_count[tid] = left_count; + s_right_count[tid] = right_count; + + is_active_second = 0; + } + } + + // necessary so that compact_second_chunk is up-to-date + cg::sync(cta); + 
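+ // Worked example of the compaction step (illustrative values only): if four
+ // active threads set second-child flags [0, 1, 0, 1] in s_compaction_list_exc,
+ // the exclusive scan below yields write offsets [0, 0, 1, 1] and a total of 2,
+ // so the two flagged threads store their second child interval at offsets 0
+ // and 1 past the first chunk, and num_threads_active later grows by that total.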
+ // perform compaction of chunk where second children are stored + // scan of (num_threads_active / 2) elements, thus at most + // (num_threads_active / 4) threads are needed + if (compact_second_chunk > 0) { + // create indices for compaction + createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, cta); + + compactIntervals(s_left, + s_right, + s_left_count, + s_right_count, + mid, + right, + mid_count, + right_count, + s_compaction_list, + num_threads_active, + is_active_second); + } + + cg::sync(cta); + + // update state variables + if (0 == tid) { + // update number of active threads with result of reduction + num_threads_active += s_compaction_list[num_threads_active]; + num_threads_compaction = ceilPow2(num_threads_active); + + compact_second_chunk = 0; + all_threads_converged = 1; + } + + cg::sync(cta); + + if (num_threads_compaction > blockDim.x) { + break; + } } - } - cg::sync(cta); + cg::sync(cta); - // generate two lists of intervals; one with intervals that contain one - // eigenvalue (or are converged), and one with intervals that need further - // subdivision + // generate two lists of intervals; one with intervals that contain one + // eigenvalue (or are converged), and one with intervals that need further + // subdivision - // perform two scans in parallel + // perform two scans in parallel - unsigned int left_count_2; - unsigned int right_count_2; + unsigned int left_count_2; + unsigned int right_count_2; - unsigned int tid_2 = tid + blockDim.x; + unsigned int tid_2 = tid + blockDim.x; - // cache in per thread registers so that s_left_count and s_right_count - // can be used for scans - left_count = s_left_count[tid]; - right_count = s_right_count[tid]; + // cache in per thread registers so that s_left_count and s_right_count + // can be used for scans + left_count = s_left_count[tid]; + right_count = s_right_count[tid]; - // some threads have to cache data for two intervals - if (tid_2 < num_threads_active) { - left_count_2 = s_left_count[tid_2]; - right_count_2 = s_right_count[tid_2]; - } + // some threads have to cache data for two intervals + if (tid_2 < num_threads_active) { + left_count_2 = s_left_count[tid_2]; + right_count_2 = s_right_count[tid_2]; + } - // compaction list for intervals containing one and multiple eigenvalues - // do not affect first element for exclusive scan - unsigned short *s_cl_one = s_left_count + 1; - unsigned short *s_cl_mult = s_right_count + 1; + // compaction list for intervals containing one and multiple eigenvalues + // do not affect first element for exclusive scan + unsigned short *s_cl_one = s_left_count + 1; + unsigned short *s_cl_mult = s_right_count + 1; - // compaction list for generating blocks of intervals containing multiple - // eigenvalues - unsigned short *s_cl_blocking = s_compaction_list_exc; - // helper compaction list for generating blocks of intervals - __shared__ unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1]; + // compaction list for generating blocks of intervals containing multiple + // eigenvalues + unsigned short *s_cl_blocking = s_compaction_list_exc; + // helper compaction list for generating blocks of intervals + __shared__ unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1]; - if (0 == tid) { - // set to 0 for exclusive scan - s_left_count[0] = 0; - s_right_count[0] = 0; - } + if (0 == tid) { + // set to 0 for exclusive scan + s_left_count[0] = 0; + s_right_count[0] = 0; + } - cg::sync(cta); + cg::sync(cta); - // flag if interval contains one or multiple eigenvalues - unsigned int 
is_one_lambda = 0; - unsigned int is_one_lambda_2 = 0; + // flag if interval contains one or multiple eigenvalues + unsigned int is_one_lambda = 0; + unsigned int is_one_lambda_2 = 0; - // number of eigenvalues in the interval - unsigned int multiplicity = right_count - left_count; - is_one_lambda = (1 == multiplicity); + // number of eigenvalues in the interval + unsigned int multiplicity = right_count - left_count; + is_one_lambda = (1 == multiplicity); - s_cl_one[tid] = is_one_lambda; - s_cl_mult[tid] = (!is_one_lambda); - - // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero) - s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity; - s_cl_helper[tid] = 0; - - if (tid_2 < num_threads_active) { - unsigned int multiplicity = right_count_2 - left_count_2; - is_one_lambda_2 = (1 == multiplicity); - - s_cl_one[tid_2] = is_one_lambda_2; - s_cl_mult[tid_2] = (!is_one_lambda_2); + s_cl_one[tid] = is_one_lambda; + s_cl_mult[tid] = (!is_one_lambda); // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero) - s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity; - s_cl_helper[tid_2] = 0; - } else if (tid_2 < (2 * MAX_THREADS_BLOCK + 1)) { - // clear - s_cl_blocking[tid_2] = 0; - s_cl_helper[tid_2] = 0; - } + s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity; + s_cl_helper[tid] = 0; - scanInitial(tid, tid_2, num_threads_active, num_threads_compaction, s_cl_one, - s_cl_mult, s_cl_blocking, s_cl_helper, cta); + if (tid_2 < num_threads_active) { + unsigned int multiplicity = right_count_2 - left_count_2; + is_one_lambda_2 = (1 == multiplicity); - scanSumBlocks(tid, tid_2, num_threads_active, num_threads_compaction, - s_cl_blocking, s_cl_helper, cta); + s_cl_one[tid_2] = is_one_lambda_2; + s_cl_mult[tid_2] = (!is_one_lambda_2); - // end down sweep of scan - cg::sync(cta); + // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero) + s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 
0 : multiplicity; + s_cl_helper[tid_2] = 0; + } + else if (tid_2 < (2 * MAX_THREADS_BLOCK + 1)) { + // clear + s_cl_blocking[tid_2] = 0; + s_cl_helper[tid_2] = 0; + } - unsigned int c_block_iend = 0; - unsigned int c_block_iend_2 = 0; - unsigned int c_sum_block = 0; - unsigned int c_sum_block_2 = 0; + scanInitial( + tid, tid_2, num_threads_active, num_threads_compaction, s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper, cta); - // for each thread / interval that corresponds to root node of interval block - // store start address of block and total number of eigenvalues in all blocks - // before this block (particular thread is irrelevant, constraint is to - // have a subset of threads so that one and only one of them is in each - // interval) - if (1 == s_cl_helper[tid]) { - c_block_iend = s_cl_mult[tid] + 1; - c_sum_block = s_cl_blocking[tid]; - } + scanSumBlocks(tid, tid_2, num_threads_active, num_threads_compaction, s_cl_blocking, s_cl_helper, cta); - if (1 == s_cl_helper[tid_2]) { - c_block_iend_2 = s_cl_mult[tid_2] + 1; - c_sum_block_2 = s_cl_blocking[tid_2]; - } + // end down sweep of scan + cg::sync(cta); - scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction, - s_cl_blocking, s_cl_helper, cta); + unsigned int c_block_iend = 0; + unsigned int c_block_iend_2 = 0; + unsigned int c_sum_block = 0; + unsigned int c_sum_block_2 = 0; - // finished second scan for s_cl_blocking - cg::sync(cta); + // for each thread / interval that corresponds to root node of interval block + // store start address of block and total number of eigenvalues in all blocks + // before this block (particular thread is irrelevant, constraint is to + // have a subset of threads so that one and only one of them is in each + // interval) + if (1 == s_cl_helper[tid]) { + c_block_iend = s_cl_mult[tid] + 1; + c_sum_block = s_cl_blocking[tid]; + } - // determine the global results - __shared__ unsigned int num_blocks_mult; - __shared__ unsigned int num_mult; - __shared__ unsigned int offset_mult_lambda; + if (1 == s_cl_helper[tid_2]) { + c_block_iend_2 = s_cl_mult[tid_2] + 1; + c_sum_block_2 = s_cl_blocking[tid_2]; + } - if (0 == tid) { - num_blocks_mult = s_cl_blocking[num_threads_active - 1]; - offset_mult_lambda = s_cl_one[num_threads_active - 1]; - num_mult = s_cl_mult[num_threads_active - 1]; + scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction, s_cl_blocking, s_cl_helper, cta); - *g_num_one = offset_mult_lambda; - *g_num_blocks_mult = num_blocks_mult; - } + // finished second scan for s_cl_blocking + cg::sync(cta); - cg::sync(cta); + // determine the global results + __shared__ unsigned int num_blocks_mult; + __shared__ unsigned int num_mult; + __shared__ unsigned int offset_mult_lambda; - float left_2, right_2; - --s_cl_one; - --s_cl_mult; - --s_cl_blocking; - compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda, - s_left, s_right, s_left_count, s_right_count, s_cl_one, - s_cl_mult, s_cl_blocking, s_cl_helper, is_one_lambda, - is_one_lambda_2, left, right, left_2, right_2, left_count, - right_count, left_count_2, right_count_2, c_block_iend, - c_sum_block, c_block_iend_2, c_sum_block_2, cta); + if (0 == tid) { + num_blocks_mult = s_cl_blocking[num_threads_active - 1]; + offset_mult_lambda = s_cl_one[num_threads_active - 1]; + num_mult = s_cl_mult[num_threads_active - 1]; - cg::sync(cta); + *g_num_one = offset_mult_lambda; + *g_num_blocks_mult = num_blocks_mult; + } - // final adjustment before writing out data to global memory - if (0 == tid) { - 
s_cl_blocking[num_blocks_mult] = num_mult; - s_cl_helper[0] = 0; - } + cg::sync(cta); - cg::sync(cta); + float left_2, right_2; + --s_cl_one; + --s_cl_mult; + --s_cl_blocking; + compactStreamsFinal(tid, + tid_2, + num_threads_active, + offset_mult_lambda, + s_left, + s_right, + s_left_count, + s_right_count, + s_cl_one, + s_cl_mult, + s_cl_blocking, + s_cl_helper, + is_one_lambda, + is_one_lambda_2, + left, + right, + left_2, + right_2, + left_count, + right_count, + left_count_2, + right_count_2, + c_block_iend, + c_sum_block, + c_block_iend_2, + c_sum_block_2, + cta); - // write to global memory - writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, g_left_one, - g_right_one, g_pos_one, g_left_mult, g_right_mult, - g_left_count_mult, g_right_count_mult, s_left, s_right, - s_left_count, s_right_count, g_blocks_mult, g_blocks_mult_sum, - s_compaction_list, s_cl_helper, offset_mult_lambda); + cg::sync(cta); + + // final adjustment before writing out data to global memory + if (0 == tid) { + s_cl_blocking[num_blocks_mult] = num_mult; + s_cl_helper[0] = 0; + } + + cg::sync(cta); + + // write to global memory + writeToGmem(tid, + tid_2, + num_threads_active, + num_blocks_mult, + g_left_one, + g_right_one, + g_pos_one, + g_left_mult, + g_right_mult, + g_left_count_mult, + g_right_count_mult, + s_left, + s_right, + s_left_count, + s_right_count, + g_blocks_mult, + g_blocks_mult_sum, + s_compaction_list, + s_cl_helper, + offset_mult_lambda); } //////////////////////////////////////////////////////////////////////////////// //! Write data to global memory //////////////////////////////////////////////////////////////////////////////// -__device__ void writeToGmem( - const unsigned int tid, const unsigned int tid_2, - const unsigned int num_threads_active, const unsigned int num_blocks_mult, - float *g_left_one, float *g_right_one, unsigned int *g_pos_one, - float *g_left_mult, float *g_right_mult, unsigned int *g_left_count_mult, - unsigned int *g_right_count_mult, float *s_left, float *s_right, - unsigned short *s_left_count, unsigned short *s_right_count, - unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum, - unsigned short *s_compaction_list, unsigned short *s_cl_helper, - unsigned int offset_mult_lambda) { - if (tid < offset_mult_lambda) { - g_left_one[tid] = s_left[tid]; - g_right_one[tid] = s_right[tid]; - // right count can be used to order eigenvalues without sorting - g_pos_one[tid] = s_right_count[tid]; - } else { - g_left_mult[tid - offset_mult_lambda] = s_left[tid]; - g_right_mult[tid - offset_mult_lambda] = s_right[tid]; - g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid]; - g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid]; - } - - if (tid_2 < num_threads_active) { - if (tid_2 < offset_mult_lambda) { - g_left_one[tid_2] = s_left[tid_2]; - g_right_one[tid_2] = s_right[tid_2]; - // right count can be used to order eigenvalues without sorting - g_pos_one[tid_2] = s_right_count[tid_2]; - } else { - g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2]; - g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2]; - g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2]; - g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2]; +__device__ void writeToGmem(const unsigned int tid, + const unsigned int tid_2, + const unsigned int num_threads_active, + const unsigned int num_blocks_mult, + float *g_left_one, + float *g_right_one, + unsigned int *g_pos_one, + float *g_left_mult, + float *g_right_mult, + unsigned int 
*g_left_count_mult, + unsigned int *g_right_count_mult, + float *s_left, + float *s_right, + unsigned short *s_left_count, + unsigned short *s_right_count, + unsigned int *g_blocks_mult, + unsigned int *g_blocks_mult_sum, + unsigned short *s_compaction_list, + unsigned short *s_cl_helper, + unsigned int offset_mult_lambda) +{ + if (tid < offset_mult_lambda) { + g_left_one[tid] = s_left[tid]; + g_right_one[tid] = s_right[tid]; + // right count can be used to order eigenvalues without sorting + g_pos_one[tid] = s_right_count[tid]; + } + else { + g_left_mult[tid - offset_mult_lambda] = s_left[tid]; + g_right_mult[tid - offset_mult_lambda] = s_right[tid]; + g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid]; + g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid]; }

- } // end writing out data + if (tid_2 < num_threads_active) { + if (tid_2 < offset_mult_lambda) { + g_left_one[tid_2] = s_left[tid_2]; + g_right_one[tid_2] = s_right[tid_2]; + // right count can be used to order eigenvalues without sorting + g_pos_one[tid_2] = s_right_count[tid_2]; + } + else { + g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2]; + g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2]; + g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2]; + g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2]; + }

- // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out - // s_compaction_list we write the exclusive scan result - if (tid <= num_blocks_mult) { - g_blocks_mult[tid] = s_compaction_list[tid]; - g_blocks_mult_sum[tid] = s_cl_helper[tid]; - } + } // end writing out data

- if (tid_2 <= num_blocks_mult) { - g_blocks_mult[tid_2] = s_compaction_list[tid_2]; - g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2]; - } + // note that s_cl_blocking = s_compaction_list + 1; that is, by writing out + // s_compaction_list we write the exclusive scan result + if (tid <= num_blocks_mult) { + g_blocks_mult[tid] = s_compaction_list[tid]; + g_blocks_mult_sum[tid] = s_cl_helper[tid]; + } + + if (tid_2 <= num_blocks_mult) { + g_blocks_mult[tid_2] = s_compaction_list[tid_2]; + g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2]; + } } //////////////////////////////////////////////////////////////////////////////// //!
Perform final stream compaction before writing data to global memory //////////////////////////////////////////////////////////////////////////////// -__device__ void compactStreamsFinal( - const unsigned int tid, const unsigned int tid_2, - const unsigned int num_threads_active, unsigned int &offset_mult_lambda, - float *s_left, float *s_right, unsigned short *s_left_count, - unsigned short *s_right_count, unsigned short *s_cl_one, - unsigned short *s_cl_mult, unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, unsigned int is_one_lambda, - unsigned int is_one_lambda_2, float &left, float &right, float &left_2, - float &right_2, unsigned int &left_count, unsigned int &right_count, - unsigned int &left_count_2, unsigned int &right_count_2, - unsigned int c_block_iend, unsigned int c_sum_block, - unsigned int c_block_iend_2, unsigned int c_sum_block_2, - cg::thread_block cta) { - // cache data before performing compaction - left = s_left[tid]; - right = s_right[tid]; +__device__ void compactStreamsFinal(const unsigned int tid, + const unsigned int tid_2, + const unsigned int num_threads_active, + unsigned int &offset_mult_lambda, + float *s_left, + float *s_right, + unsigned short *s_left_count, + unsigned short *s_right_count, + unsigned short *s_cl_one, + unsigned short *s_cl_mult, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + unsigned int is_one_lambda, + unsigned int is_one_lambda_2, + float &left, + float &right, + float &left_2, + float &right_2, + unsigned int &left_count, + unsigned int &right_count, + unsigned int &left_count_2, + unsigned int &right_count_2, + unsigned int c_block_iend, + unsigned int c_sum_block, + unsigned int c_block_iend_2, + unsigned int c_sum_block_2, + cg::thread_block cta) +{ + // cache data before performing compaction + left = s_left[tid]; + right = s_right[tid]; - if (tid_2 < num_threads_active) { - left_2 = s_left[tid_2]; - right_2 = s_right[tid_2]; - } - - cg::sync(cta); - - // determine addresses for intervals containing multiple eigenvalues and - // addresses for blocks of intervals - unsigned int ptr_w = 0; - unsigned int ptr_w_2 = 0; - unsigned int ptr_blocking_w = 0; - unsigned int ptr_blocking_w_2 = 0; - - ptr_w = (1 == is_one_lambda) ? s_cl_one[tid] - : s_cl_mult[tid] + offset_mult_lambda; - - if (0 != c_block_iend) { - ptr_blocking_w = s_cl_blocking[tid]; - } - - if (tid_2 < num_threads_active) { - ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2] - : s_cl_mult[tid_2] + offset_mult_lambda; - - if (0 != c_block_iend_2) { - ptr_blocking_w_2 = s_cl_blocking[tid_2]; + if (tid_2 < num_threads_active) { + left_2 = s_left[tid_2]; + right_2 = s_right[tid_2]; } - } - cg::sync(cta); + cg::sync(cta); - // store compactly in shared mem - s_left[ptr_w] = left; - s_right[ptr_w] = right; - s_left_count[ptr_w] = left_count; - s_right_count[ptr_w] = right_count; + // determine addresses for intervals containing multiple eigenvalues and + // addresses for blocks of intervals + unsigned int ptr_w = 0; + unsigned int ptr_w_2 = 0; + unsigned int ptr_blocking_w = 0; + unsigned int ptr_blocking_w_2 = 0; - if (0 != c_block_iend) { - s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1; - s_cl_helper[ptr_blocking_w + 1] = c_sum_block; - } + ptr_w = (1 == is_one_lambda) ? s_cl_one[tid] : s_cl_mult[tid] + offset_mult_lambda; + + if (0 != c_block_iend) { + ptr_blocking_w = s_cl_blocking[tid]; + } + + if (tid_2 < num_threads_active) { + ptr_w_2 = (1 == is_one_lambda_2) ? 
s_cl_one[tid_2] : s_cl_mult[tid_2] + offset_mult_lambda; + + if (0 != c_block_iend_2) { + ptr_blocking_w_2 = s_cl_blocking[tid_2]; + } + } + + cg::sync(cta); - if (tid_2 < num_threads_active) { // store compactly in shared mem - s_left[ptr_w_2] = left_2; - s_right[ptr_w_2] = right_2; - s_left_count[ptr_w_2] = left_count_2; - s_right_count[ptr_w_2] = right_count_2; + s_left[ptr_w] = left; + s_right[ptr_w] = right; + s_left_count[ptr_w] = left_count; + s_right_count[ptr_w] = right_count; - if (0 != c_block_iend_2) { - s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1; - s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2; + if (0 != c_block_iend) { + s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1; + s_cl_helper[ptr_blocking_w + 1] = c_sum_block; + } + + if (tid_2 < num_threads_active) { + // store compactly in shared mem + s_left[ptr_w_2] = left_2; + s_right[ptr_w_2] = right_2; + s_left_count[ptr_w_2] = left_count_2; + s_right_count[ptr_w_2] = right_count_2; + + if (0 != c_block_iend_2) { + s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1; + s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2; + } } - } } //////////////////////////////////////////////////////////////////////////////// //! Compute addresses to obtain compact list of block start addresses //////////////////////////////////////////////////////////////////////////////// -__device__ void scanCompactBlocksStartAddress( - const unsigned int tid, const unsigned int tid_2, - const unsigned int num_threads_compaction, unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, cg::thread_block cta) { - // prepare for second step of block generation: compaction of the block - // list itself to efficiently write out these - s_cl_blocking[tid] = s_cl_helper[tid]; +__device__ void scanCompactBlocksStartAddress(const unsigned int tid, + const unsigned int tid_2, + const unsigned int num_threads_compaction, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + cg::thread_block cta) +{ + // prepare for second step of block generation: compaction of the block + // list itself to efficiently write out these + s_cl_blocking[tid] = s_cl_helper[tid]; - if (tid_2 < num_threads_compaction) { - s_cl_blocking[tid_2] = s_cl_helper[tid_2]; - } - - cg::sync(cta); - - // additional scan to compact s_cl_blocking that permits to generate a - // compact list of eigenvalue blocks each one containing about - // MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be - // processed by one thread block in a subsequent processing step - - unsigned int offset = 1; - - // build scan tree - for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) { - cg::sync(cta); - - if (tid < d) { - unsigned int ai = offset * (2 * tid + 1) - 1; - unsigned int bi = offset * (2 * tid + 2) - 1; - s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; + if (tid_2 < num_threads_compaction) { + s_cl_blocking[tid_2] = s_cl_helper[tid_2]; } - offset <<= 1; - } - - // traverse down tree: first down to level 2 across - for (int d = 2; d < num_threads_compaction; d <<= 1) { - offset >>= 1; cg::sync(cta); - // - if (tid < (d - 1)) { - unsigned int ai = offset * (tid + 1) - 1; - unsigned int bi = ai + (offset >> 1); - s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; + // additional scan to compact s_cl_blocking that permits to generate a + // compact list of eigenvalue blocks each one containing about + // MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be + // processed by one thread block in a subsequent processing 
step) + + unsigned int offset = 1; + + // build scan tree + for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) { + cg::sync(cta); + + if (tid < d) { + unsigned int ai = offset * (2 * tid + 1) - 1; + unsigned int bi = offset * (2 * tid + 2) - 1; + s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; + } + + offset <<= 1; + } + + // traverse down tree: first down to level 2 across + for (int d = 2; d < num_threads_compaction; d <<= 1) { + offset >>= 1; + cg::sync(cta); + + // + if (tid < (d - 1)) { + unsigned int ai = offset * (tid + 1) - 1; + unsigned int bi = ai + (offset >> 1); + s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; + } } } //////////////////////////////////////////////////////////////////////////////// //! Perform scan to obtain number of eigenvalues before a specific block //////////////////////////////////////////////////////////////////////////////// -__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2, +__device__ void scanSumBlocks(const unsigned int tid, + const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, - unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, - cg::thread_block cta) { - unsigned int offset = 1; + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + cg::thread_block cta) +{ + unsigned int offset = 1;

- // first step of scan to build the sum of elements within each block - // build up tree - for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) { - cg::sync(cta); + // first step of scan to build the sum of elements within each block + // build up tree + for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) { + cg::sync(cta);

- if (tid < d) { - unsigned int ai = offset * (2 * tid + 1) - 1; - unsigned int bi = offset * (2 * tid + 2) - 1; + if (tid < d) { + unsigned int ai = offset * (2 * tid + 1) - 1; + unsigned int bi = offset * (2 * tid + 2) - 1;

- s_cl_blocking[bi] += s_cl_blocking[ai]; + s_cl_blocking[bi] += s_cl_blocking[ai]; + } + + offset *= 2; }

- offset *= 2; - } + // first step of scan to build the sum of elements within each block + // traverse down tree + for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) { + offset >>= 1; + cg::sync(cta); + + if (tid < (d - 1)) { + unsigned int ai = offset * (tid + 1) - 1; + unsigned int bi = ai + (offset >> 1); + + s_cl_blocking[bi] += s_cl_blocking[ai]; + } + }

- // first step of scan to build the sum of elements within each block - // traverse down tree - for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) { - offset >>= 1; cg::sync(cta);

- if (tid < (d - 1)) { - unsigned int ai = offset * (tid + 1) - 1; - unsigned int bi = ai + (offset >> 1); - - s_cl_blocking[bi] += s_cl_blocking[ai]; + if (0 == tid) { + // move last element of scan to last element that is valid + // necessary because the number of threads employed for scan is a power + // of two and not necessarily the number of active threads + s_cl_helper[num_threads_active - 1] = s_cl_helper[num_threads_compaction - 1]; + s_cl_blocking[num_threads_active - 1] = s_cl_blocking[num_threads_compaction - 1]; } - } - - cg::sync(cta); - - if (0 == tid) { - // move last element of scan to last element that is valid - // necessary because the number of threads employed for scan is a power - // of two and not necessarily the number of active threasd - s_cl_helper[num_threads_active - 1] = - s_cl_helper[num_threads_compaction - 1]; - s_cl_blocking[num_threads_active - 1] = -
s_cl_blocking[num_threads_compaction - 1]; - } } //////////////////////////////////////////////////////////////////////////////// //! Perform initial scan for compaction of intervals containing one and //! multiple eigenvalues; also do initial scan to build blocks //////////////////////////////////////////////////////////////////////////////// -__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2, +__device__ void scanInitial(const unsigned int tid, + const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, - unsigned short *s_cl_one, unsigned short *s_cl_mult, - unsigned short *s_cl_blocking, - unsigned short *s_cl_helper, cg::thread_block cta) { - // perform scan to compactly write out the intervals containing one and - // multiple eigenvalues - // also generate tree for blocking of intervals containing multiple - // eigenvalues + unsigned short *s_cl_one, + unsigned short *s_cl_mult, + unsigned short *s_cl_blocking, + unsigned short *s_cl_helper, + cg::thread_block cta) +{ + // perform scan to compactly write out the intervals containing one and + // multiple eigenvalues + // also generate tree for blocking of intervals containing multiple + // eigenvalues

- unsigned int offset = 1; + unsigned int offset = 1;

- // build scan tree - for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) { - cg::sync(cta); + // build scan tree + for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) { + cg::sync(cta);

- if (tid < d) { - unsigned int ai = offset * (2 * tid + 1); - unsigned int bi = offset * (2 * tid + 2) - 1; + if (tid < d) { + unsigned int ai = offset * (2 * tid + 1); + unsigned int bi = offset * (2 * tid + 2) - 1;

- s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1]; - s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1]; + s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1]; + s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

- // s_cl_helper is binary and zero for an internal node and 1 for a - // root node of a tree corresponding to a block - // s_cl_blocking contains the number of nodes in each sub-tree at each - // iteration, the data has to be kept to compute the total number of - // eigenvalues per block that, in turn, is needed to efficiently - // write out data in the second step - if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) { - // check how many childs are non terminated - if (s_cl_helper[ai - 1] == 1) { - // mark as terminated - s_cl_helper[bi] = 1; - } else if (s_cl_helper[bi] == 1) { - // mark as terminated - s_cl_helper[ai - 1] = 1; - } else // both childs are non-terminated - { - unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1]; + // s_cl_helper is binary and zero for an internal node and 1 for a + // root node of a tree corresponding to a block + // s_cl_blocking contains the number of nodes in each sub-tree at each + // iteration, the data has to be kept to compute the total number of + // eigenvalues per block that, in turn, is needed to efficiently + // write out data in the second step + if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) { + // check how many children are non-terminated + if (s_cl_helper[ai - 1] == 1) { + // mark as terminated + s_cl_helper[bi] = 1; + } + else if (s_cl_helper[bi] == 1) { + // mark as terminated + s_cl_helper[ai - 1] = 1; + } + else // both children are non-terminated + { + unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];

- if (temp > MAX_THREADS_BLOCK) { - // the two child trees have to form separate blocks, terminate
trees - s_cl_helper[ai - 1] = 1; - s_cl_helper[bi] = 1; - } else { - // build up tree by joining subtrees - s_cl_blocking[bi] = temp; - s_cl_blocking[ai - 1] = 0; - } + if (temp > MAX_THREADS_BLOCK) { + // the two child trees have to form separate blocks, terminate trees + s_cl_helper[ai - 1] = 1; + s_cl_helper[bi] = 1; + } + else { + // build up tree by joining subtrees + s_cl_blocking[bi] = temp; + s_cl_blocking[ai - 1] = 0; + } + } + } // end s_cl_helper update } - } // end s_cl_helper update + + offset <<= 1; } - offset <<= 1; - } + // traverse down tree, this only for stream compaction, not for block + // construction + for (int d = 2; d < num_threads_compaction; d <<= 1) { + offset >>= 1; + cg::sync(cta); - // traverse down tree, this only for stream compaction, not for block - // construction - for (int d = 2; d < num_threads_compaction; d <<= 1) { - offset >>= 1; - cg::sync(cta); + // + if (tid < (d - 1)) { + unsigned int ai = offset * (tid + 1) - 1; + unsigned int bi = ai + (offset >> 1); - // - if (tid < (d - 1)) { - unsigned int ai = offset * (tid + 1) - 1; - unsigned int bi = ai + (offset >> 1); - - s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai]; - s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai]; + s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai]; + s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai]; + } } - } } //////////////////////////////////////////////////////////////////////////////// //! Store all non-empty intervals resulting from the subdivision of the interval //! currently processed by the thread //////////////////////////////////////////////////////////////////////////////// -__device__ void storeNonEmptyIntervalsLarge( - unsigned int addr, const unsigned int num_threads_active, float *s_left, - float *s_right, unsigned short *s_left_count, unsigned short *s_right_count, - float left, float mid, float right, const unsigned short left_count, - const unsigned short mid_count, const unsigned short right_count, - float epsilon, unsigned int &compact_second_chunk, - unsigned short *s_compaction_list, unsigned int &is_active_second) { - // check if both child intervals are valid - if ((left_count != mid_count) && (mid_count != right_count)) { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid, - left_count, mid_count, epsilon); +__device__ void storeNonEmptyIntervalsLarge(unsigned int addr, + const unsigned int num_threads_active, + float *s_left, + float *s_right, + unsigned short *s_left_count, + unsigned short *s_right_count, + float left, + float mid, + float right, + const unsigned short left_count, + const unsigned short mid_count, + const unsigned short right_count, + float epsilon, + unsigned int &compact_second_chunk, + unsigned short *s_compaction_list, + unsigned int &is_active_second) +{ + // check if both child intervals are valid + if ((left_count != mid_count) && (mid_count != right_count)) { + storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid, left_count, mid_count, epsilon); - is_active_second = 1; - s_compaction_list[threadIdx.x] = 1; - atomicExch(&compact_second_chunk, 1); - } else { - // only one non-empty child interval - - // mark that no second child - is_active_second = 0; - s_compaction_list[threadIdx.x] = 0; - - // store the one valid child interval - if (left_count != mid_count) { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, - mid, left_count, mid_count, epsilon); - } else { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, mid, - right, mid_count, right_count, 
epsilon); + is_active_second = 1; + s_compaction_list[threadIdx.x] = 1; + atomicExch(&compact_second_chunk, 1); + } + else { + // only one non-empty child interval + + // mark that no second child + is_active_second = 0; + s_compaction_list[threadIdx.x] = 0; + + // store the one valid child interval + if (left_count != mid_count) { + storeInterval( + addr, s_left, s_right, s_left_count, s_right_count, left, mid, left_count, mid_count, epsilon); + } + else { + storeInterval( + addr, s_left, s_right, s_left_count, s_right_count, mid, right, mid_count, right_count, epsilon); + } } - } } -#endif // #ifndef _BISECT_KERNEL_LARGE_H_ +#endif // #ifndef _BISECT_KERNEL_LARGE_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_multi.cuh b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_multi.cuh index e7895fea..c9884db8 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_multi.cuh +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_multi.cuh @@ -61,163 +61,208 @@ namespace cg = cooperative_groups; //! @param g_pos index of eigenvalue (in ascending order) //! @param precision desired precision of eigenvalues //////////////////////////////////////////////////////////////////////////////// -__global__ void bisectKernelLarge_MultIntervals( - float *g_d, float *g_s, const unsigned int n, unsigned int *blocks_mult, - unsigned int *blocks_mult_sum, float *g_left, float *g_right, - unsigned int *g_left_count, unsigned int *g_right_count, float *g_lambda, - unsigned int *g_pos, float precision) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - const unsigned int tid = threadIdx.x; +__global__ void bisectKernelLarge_MultIntervals(float *g_d, + float *g_s, + const unsigned int n, + unsigned int *blocks_mult, + unsigned int *blocks_mult_sum, + float *g_left, + float *g_right, + unsigned int *g_left_count, + unsigned int *g_right_count, + float *g_lambda, + unsigned int *g_pos, + float precision) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + const unsigned int tid = threadIdx.x; - // left and right limits of interval - __shared__ float s_left[2 * MAX_THREADS_BLOCK]; - __shared__ float s_right[2 * MAX_THREADS_BLOCK]; + // left and right limits of interval + __shared__ float s_left[2 * MAX_THREADS_BLOCK]; + __shared__ float s_right[2 * MAX_THREADS_BLOCK]; - // number of eigenvalues smaller than interval limits - __shared__ unsigned int s_left_count[2 * MAX_THREADS_BLOCK]; - __shared__ unsigned int s_right_count[2 * MAX_THREADS_BLOCK]; + // number of eigenvalues smaller than interval limits + __shared__ unsigned int s_left_count[2 * MAX_THREADS_BLOCK]; + __shared__ unsigned int s_right_count[2 * MAX_THREADS_BLOCK]; - // helper array for chunk compaction of second chunk - __shared__ unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; - // compaction list helper for exclusive scan - unsigned int *s_compaction_list_exc = s_compaction_list + 1; + // helper array for chunk compaction of second chunk + __shared__ unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1]; + // compaction list helper for exclusive scan + unsigned int *s_compaction_list_exc = s_compaction_list + 1; - // flag if all threads are converged - __shared__ unsigned int all_threads_converged; - // number of active threads - __shared__ unsigned int num_threads_active; - // number of threads to employ for compaction - __shared__ unsigned int num_threads_compaction; - // 
flag if second chunk has to be compacted - __shared__ unsigned int compact_second_chunk; + // flag if all threads are converged + __shared__ unsigned int all_threads_converged; + // number of active threads + __shared__ unsigned int num_threads_active; + // number of threads to employ for compaction + __shared__ unsigned int num_threads_compaction; + // flag if second chunk has to be compacted + __shared__ unsigned int compact_second_chunk; - // parameters of block of intervals processed by this block of threads - __shared__ unsigned int c_block_start; - __shared__ unsigned int c_block_end; - __shared__ unsigned int c_block_offset_output; + // parameters of block of intervals processed by this block of threads + __shared__ unsigned int c_block_start; + __shared__ unsigned int c_block_end; + __shared__ unsigned int c_block_offset_output; - // midpoint of currently active interval of the thread - float mid = 0.0f; - // number of eigenvalues smaller than \a mid - unsigned int mid_count = 0; - // current interval parameter - float left; - float right; - unsigned int left_count; - unsigned int right_count; - // helper for compaction, keep track which threads have a second child - unsigned int is_active_second = 0; + // midpoint of currently active interval of the thread + float mid = 0.0f; + // number of eigenvalues smaller than \a mid + unsigned int mid_count = 0; + // current interval parameter + float left; + float right; + unsigned int left_count; + unsigned int right_count; + // helper for compaction, keep track which threads have a second child + unsigned int is_active_second = 0; - // initialize common start conditions - if (0 == tid) { - c_block_start = blocks_mult[blockIdx.x]; - c_block_end = blocks_mult[blockIdx.x + 1]; - c_block_offset_output = blocks_mult_sum[blockIdx.x]; - - num_threads_active = c_block_end - c_block_start; - s_compaction_list[0] = 0; - num_threads_compaction = ceilPow2(num_threads_active); - - all_threads_converged = 1; - compact_second_chunk = 0; - } - - cg::sync(cta); - - // read data into shared memory - if (tid < num_threads_active) { - s_left[tid] = g_left[c_block_start + tid]; - s_right[tid] = g_right[c_block_start + tid]; - s_left_count[tid] = g_left_count[c_block_start + tid]; - s_right_count[tid] = g_right_count[c_block_start + tid]; - } - - cg::sync(cta); - - // do until all threads converged - while (true) { - // for (int iter=0; iter < 0; iter++) { - - // subdivide interval if currently active and not already converged - subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count, - num_threads_active, left, right, left_count, - right_count, mid, all_threads_converged); - - cg::sync(cta); - - // stop if all eigenvalues have been found - if (1 == all_threads_converged) { - break; - } - - // compute number of eigenvalues smaller than mid for active and not - // converged intervals, use all threads for loading data from gmem and - // s_left and s_right as scratch space to store the data load from gmem - // in shared memory - mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, tid, - num_threads_active, s_left, - s_right, (left == right), cta); - - cg::sync(cta); - - if (tid < num_threads_active) { - // store intervals - if (left != right) { - storeNonEmptyIntervals(tid, num_threads_active, s_left, s_right, - s_left_count, s_right_count, left, mid, right, - left_count, mid_count, right_count, precision, - compact_second_chunk, s_compaction_list_exc, - is_active_second); - } else { - storeIntervalConverged( - s_left, s_right, 
s_left_count, s_right_count, left, mid, right, - left_count, mid_count, right_count, s_compaction_list_exc, - compact_second_chunk, num_threads_active, is_active_second); - } - } - - cg::sync(cta); - - // compact second chunk of intervals if any of the threads generated - // two child intervals - if (1 == compact_second_chunk) { - createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, - cta); - - compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right, - mid_count, right_count, s_compaction_list, - num_threads_active, is_active_second); - } - - cg::sync(cta); - - // update state variables + // initialize common start conditions if (0 == tid) { - num_threads_active += s_compaction_list[num_threads_active]; - num_threads_compaction = ceilPow2(num_threads_active); + c_block_start = blocks_mult[blockIdx.x]; + c_block_end = blocks_mult[blockIdx.x + 1]; + c_block_offset_output = blocks_mult_sum[blockIdx.x]; - compact_second_chunk = 0; - all_threads_converged = 1; + num_threads_active = c_block_end - c_block_start; + s_compaction_list[0] = 0; + num_threads_compaction = ceilPow2(num_threads_active); + + all_threads_converged = 1; + compact_second_chunk = 0; } cg::sync(cta); - // clear - s_compaction_list_exc[threadIdx.x] = 0; - s_compaction_list_exc[threadIdx.x + blockDim.x] = 0; + // read data into shared memory + if (tid < num_threads_active) { + s_left[tid] = g_left[c_block_start + tid]; + s_right[tid] = g_right[c_block_start + tid]; + s_left_count[tid] = g_left_count[c_block_start + tid]; + s_right_count[tid] = g_right_count[c_block_start + tid]; + } cg::sync(cta); - } // end until all threads converged + // do until all threads converged + while (true) { + // for (int iter=0; iter < 0; iter++) { - // write data back to global memory - if (tid < num_threads_active) { - unsigned int addr = c_block_offset_output + tid; + // subdivide interval if currently active and not already converged + subdivideActiveInterval(tid, + s_left, + s_right, + s_left_count, + s_right_count, + num_threads_active, + left, + right, + left_count, + right_count, + mid, + all_threads_converged); - g_lambda[addr] = s_left[tid]; - g_pos[addr] = s_right_count[tid]; - } + cg::sync(cta); + + // stop if all eigenvalues have been found + if (1 == all_threads_converged) { + break; + } + + // compute number of eigenvalues smaller than mid for active and not + // converged intervals, use all threads for loading data from gmem and + // s_left and s_right as scratch space to store the data load from gmem + // in shared memory + mid_count = computeNumSmallerEigenvalsLarge( + g_d, g_s, n, mid, tid, num_threads_active, s_left, s_right, (left == right), cta); + + cg::sync(cta); + + if (tid < num_threads_active) { + // store intervals + if (left != right) { + storeNonEmptyIntervals(tid, + num_threads_active, + s_left, + s_right, + s_left_count, + s_right_count, + left, + mid, + right, + left_count, + mid_count, + right_count, + precision, + compact_second_chunk, + s_compaction_list_exc, + is_active_second); + } + else { + storeIntervalConverged(s_left, + s_right, + s_left_count, + s_right_count, + left, + mid, + right, + left_count, + mid_count, + right_count, + s_compaction_list_exc, + compact_second_chunk, + num_threads_active, + is_active_second); + } + } + + cg::sync(cta); + + // compact second chunk of intervals if any of the threads generated + // two child intervals + if (1 == compact_second_chunk) { + createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, cta); + + 
compactIntervals(s_left, + s_right, + s_left_count, + s_right_count, + mid, + right, + mid_count, + right_count, + s_compaction_list, + num_threads_active, + is_active_second); + } + + cg::sync(cta); + + // update state variables + if (0 == tid) { + num_threads_active += s_compaction_list[num_threads_active]; + num_threads_compaction = ceilPow2(num_threads_active); + + compact_second_chunk = 0; + all_threads_converged = 1; + } + + cg::sync(cta); + + // clear + s_compaction_list_exc[threadIdx.x] = 0; + s_compaction_list_exc[threadIdx.x + blockDim.x] = 0; + + cg::sync(cta); + + } // end until all threads converged + + // write data back to global memory + if (tid < num_threads_active) { + unsigned int addr = c_block_offset_output + tid; + + g_lambda[addr] = s_left[tid]; + g_pos[addr] = s_right_count[tid]; + } } -#endif // #ifndef _BISECT_KERNEL_LARGE_MULTI_H_ +#endif // #ifndef _BISECT_KERNEL_LARGE_MULTI_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_onei.cuh b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_onei.cuh index 00b2d686..a08bd9e8 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_onei.cuh +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_large_onei.cuh @@ -57,104 +57,111 @@ namespace cg = cooperative_groups; //! right interval limit //! @param precision desired precision of eigenvalues //////////////////////////////////////////////////////////////////////////////// -__global__ void bisectKernelLarge_OneIntervals( - float *g_d, float *g_s, const unsigned int n, unsigned int num_intervals, - float *g_left, float *g_right, unsigned int *g_pos, float precision) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x; +__global__ void bisectKernelLarge_OneIntervals(float *g_d, + float *g_s, + const unsigned int n, + unsigned int num_intervals, + float *g_left, + float *g_right, + unsigned int *g_pos, + float precision) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;

- __shared__ float s_left_scratch[MAX_THREADS_BLOCK]; - __shared__ float s_right_scratch[MAX_THREADS_BLOCK]; + __shared__ float s_left_scratch[MAX_THREADS_BLOCK]; + __shared__ float s_right_scratch[MAX_THREADS_BLOCK];

- // active interval of thread - // left and right limit of current interval - float left, right; - // number of threads smaller than the right limit (also corresponds to the - // global index of the eigenvalues contained in the active interval) - unsigned int right_count; - // flag if current thread converged - unsigned int converged = 0; - // midpoint when current interval is subdivided - float mid = 0.0f; - // number of eigenvalues less than mid - unsigned int mid_count = 0; + // active interval of thread + // left and right limit of current interval + float left, right; + // number of eigenvalues smaller than the right limit (also corresponds to the + // global index of the eigenvalues contained in the active interval) + unsigned int right_count; + // flag if current thread converged + unsigned int converged = 0; + // midpoint when current interval is subdivided + float mid = 0.0f; + // number of eigenvalues less than mid + unsigned int mid_count = 0;

- // read data from global memory - if (gtid < num_intervals) { - left = g_left[gtid]; - right = g_right[gtid]; - right_count = g_pos[gtid]; - } - - // flag to
determine if all threads converged to eigenvalue - __shared__ unsigned int converged_all_threads; - - // initialized shared flag - if (0 == threadIdx.x) { - converged_all_threads = 0; - } - - cg::sync(cta); - - // process until all threads converged to an eigenvalue - // while( 0 == converged_all_threads) { - while (true) { - atomicExch(&converged_all_threads, 1); - - // update midpoint for all active threads - if ((gtid < num_intervals) && (0 == converged)) { - mid = computeMidpoint(left, right); + // read data from global memory + if (gtid < num_intervals) { + left = g_left[gtid]; + right = g_right[gtid]; + right_count = g_pos[gtid]; } - // find number of eigenvalues that are smaller than midpoint - mid_count = computeNumSmallerEigenvalsLarge( - g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch, - converged, cta); + // flag to determine if all threads converged to eigenvalue + __shared__ unsigned int converged_all_threads; - cg::sync(cta); - - // for all active threads - if ((gtid < num_intervals) && (0 == converged)) { - // udpate intervals -- always one child interval survives - if (right_count == mid_count) { - right = mid; - } else { - left = mid; - } - - // check for convergence - float t0 = right - left; - float t1 = max(abs(right), abs(left)) * precision; - - if (t0 < min(precision, t1)) { - float lambda = computeMidpoint(left, right); - left = lambda; - right = lambda; - - converged = 1; - } else { - atomicExch(&converged_all_threads, 0); - } + // initialize shared flag + if (0 == threadIdx.x) { + converged_all_threads = 0; } cg::sync(cta); - if (1 == converged_all_threads) { - break; + // process until all threads converged to an eigenvalue + // while( 0 == converged_all_threads) { + while (true) { + atomicExch(&converged_all_threads, 1); + + // update midpoint for all active threads + if ((gtid < num_intervals) && (0 == converged)) { + mid = computeMidpoint(left, right); + } + + // find number of eigenvalues that are smaller than midpoint + mid_count = computeNumSmallerEigenvalsLarge( + g_d, g_s, n, mid, gtid, num_intervals, s_left_scratch, s_right_scratch, converged, cta); + + cg::sync(cta); + + // for all active threads + if ((gtid < num_intervals) && (0 == converged)) { + // update intervals -- always one child interval survives + if (right_count == mid_count) { + right = mid; + } + else { + left = mid; + } + + // check for convergence + float t0 = right - left; + float t1 = max(abs(right), abs(left)) * precision; + + if (t0 < min(precision, t1)) { + float lambda = computeMidpoint(left, right); + left = lambda; + right = lambda; + + converged = 1; + } + else { + atomicExch(&converged_all_threads, 0); + } + } + + cg::sync(cta); + + if (1 == converged_all_threads) { + break; + } + + cg::sync(cta); } + // write data back to global memory cg::sync(cta); - } - // write data back to global memory - cg::sync(cta); - - if (gtid < num_intervals) { - // intervals converged so left and right interval limit are both identical - // and identical to the eigenvalue - g_left[gtid] = left; - } + if (gtid < num_intervals) { + // intervals converged so left and right interval limit are both identical + // and identical to the eigenvalue + g_left[gtid] = left; + } } -#endif // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_ +#endif // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_small.cuh b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_small.cuh index 1c01ec67..eee40cd6 100644 --- 
a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_small.cuh +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_kernel_small.cuh @@ -53,176 +53,219 @@ namespace cg = cooperative_groups; //! @param lu_eig_count number of eigenvalues that are smaller than \a lu //! @param epsilon desired accuracy of eigenvalues to compute //////////////////////////////////////////////////////////////////////////////// -__global__ void bisectKernel(float *g_d, float *g_s, const unsigned int n, - float *g_left, float *g_right, - unsigned int *g_left_count, - unsigned int *g_right_count, const float lg, - const float ug, const unsigned int lg_eig_count, - const unsigned int ug_eig_count, float epsilon) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // intervals (store left and right because the subdivision tree is in general - // not dense - __shared__ float s_left[MAX_THREADS_BLOCK_SMALL_MATRIX]; - __shared__ float s_right[MAX_THREADS_BLOCK_SMALL_MATRIX]; +__global__ void bisectKernel(float *g_d, + float *g_s, + const unsigned int n, + float *g_left, + float *g_right, + unsigned int *g_left_count, + unsigned int *g_right_count, + const float lg, + const float ug, + const unsigned int lg_eig_count, + const unsigned int ug_eig_count, + float epsilon) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // intervals (store left and right because the subdivision tree is in general + // not dense + __shared__ float s_left[MAX_THREADS_BLOCK_SMALL_MATRIX]; + __shared__ float s_right[MAX_THREADS_BLOCK_SMALL_MATRIX]; - // number of eigenvalues that are smaller than s_left / s_right - // (correspondence is realized via indices) - __shared__ unsigned int s_left_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; - __shared__ unsigned int s_right_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; + // number of eigenvalues that are smaller than s_left / s_right + // (correspondence is realized via indices) + __shared__ unsigned int s_left_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; + __shared__ unsigned int s_right_count[MAX_THREADS_BLOCK_SMALL_MATRIX]; - // helper for stream compaction - __shared__ unsigned int s_compaction_list[MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; + // helper for stream compaction + __shared__ unsigned int s_compaction_list[MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; - // state variables for whole block - // if 0 then compaction of second chunk of child intervals is not necessary - // (because all intervals had exactly one non-dead child) - __shared__ unsigned int compact_second_chunk; - __shared__ unsigned int all_threads_converged; + // state variables for whole block + // if 0 then compaction of second chunk of child intervals is not necessary + // (because all intervals had exactly one non-dead child) + __shared__ unsigned int compact_second_chunk; + __shared__ unsigned int all_threads_converged; - // number of currently active threads - __shared__ unsigned int num_threads_active; + // number of currently active threads + __shared__ unsigned int num_threads_active; - // number of threads to use for stream compaction - __shared__ unsigned int num_threads_compaction; + // number of threads to use for stream compaction + __shared__ unsigned int num_threads_compaction; - // helper for exclusive scan - unsigned int *s_compaction_list_exc = s_compaction_list + 1; + // helper for exclusive scan + unsigned int *s_compaction_list_exc = s_compaction_list + 1; - // variables for currently processed interval - // left and right limit of active interval 
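The bookkeeping below is the heart of the bisection: every interval carries the number of eigenvalues that lie below its left and right limits, so an interval holds right_count - left_count eigenvalues and can be split once the count below its midpoint is known. A host-side sketch of one subdivision step, with assumed names rather than code from the sample:

#include <vector>

// An interval of the bisection tree: limits plus eigenvalue counts below them.
struct Interval
{
    float        left, right;
    unsigned int left_count, right_count;
};

// Split one interval at 'mid' given mid_count = #eigenvalues below mid;
// children that contain no eigenvalue are discarded (the "dead" children).
void subdivide(const Interval &iv, float mid, unsigned int mid_count, std::vector<Interval> &out)
{
    if (mid_count > iv.left_count) {
        out.push_back({iv.left, mid, iv.left_count, mid_count});
    }
    if (iv.right_count > mid_count) {
        out.push_back({mid, iv.right, mid_count, iv.right_count});
    }
}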
- float left = 0.0f; - float right = 0.0f; - unsigned int left_count = 0; - unsigned int right_count = 0; - // midpoint of active interval - float mid = 0.0f; - // number of eigenvalues smaller then mid - unsigned int mid_count = 0; - // affected from compaction - unsigned int is_active_second = 0; + // variables for currently processed interval + // left and right limit of active interval + float left = 0.0f; + float right = 0.0f; + unsigned int left_count = 0; + unsigned int right_count = 0; + // midpoint of active interval + float mid = 0.0f; + // number of eigenvalues smaller than mid + unsigned int mid_count = 0; + // affected by compaction + unsigned int is_active_second = 0; - s_compaction_list[threadIdx.x] = 0; - s_left[threadIdx.x] = 0; - s_right[threadIdx.x] = 0; - s_left_count[threadIdx.x] = 0; - s_right_count[threadIdx.x] = 0; - - cg::sync(cta); - - // set up initial configuration - if (0 == threadIdx.x) { - s_left[0] = lg; - s_right[0] = ug; - s_left_count[0] = lg_eig_count; - s_right_count[0] = ug_eig_count; - - compact_second_chunk = 0; - num_threads_active = 1; - - num_threads_compaction = 1; - } - - // for all active threads read intervals from the last level - // the number of (worst case) active threads per level l is 2^l - while (true) { - all_threads_converged = 1; - cg::sync(cta); - - is_active_second = 0; - subdivideActiveInterval(threadIdx.x, s_left, s_right, s_left_count, - s_right_count, num_threads_active, left, right, - left_count, right_count, mid, - all_threads_converged); - - cg::sync(cta); - - // check if done - if (1 == all_threads_converged) { - break; - } - - cg::sync(cta); - - // compute number of eigenvalues smaller than mid - // use all threads for reading the necessary matrix data from global - // memory - // use s_left and s_right as scratch space for diagonal and - // superdiagonal of matrix - mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, threadIdx.x, - num_threads_active, s_left, s_right, - (left == right), cta); - - cg::sync(cta); - - // store intervals - // for all threads store the first child interval in a continuous chunk of - // memory, and the second child interval -- if it exists -- in a second - // chunk; it is likely that all threads reach convergence up to - // \a epsilon at the same level; furthermore, for higher level most / all - // threads will have only one child, storing the first child compactly will - // (first) avoid to perform a compaction step on the first chunk, (second) - // make it for higher levels (when all threads / intervals have - // exactly one child) unnecessary to perform a compaction of the second - // chunk - if (threadIdx.x < num_threads_active) { - if (left != right) { - // store intervals - storeNonEmptyIntervals(threadIdx.x, num_threads_active, s_left, s_right, - s_left_count, s_right_count, left, mid, right, - left_count, mid_count, right_count, epsilon, - compact_second_chunk, s_compaction_list_exc, - is_active_second); - } else { - storeIntervalConverged( - s_left, s_right, s_left_count, s_right_count, left, mid, right, - left_count, mid_count, right_count, s_compaction_list_exc, - compact_second_chunk, num_threads_active, is_active_second); - } - } - - // necessary so that compact_second_chunk is up-to-date - cg::sync(cta); - - // perform compaction of chunk where second children are stored - // scan of (num_threads_active / 2) elements, thus at most - // (num_threads_active / 4) threads are needed - if (compact_second_chunk > 0) { - createIndicesCompaction(s_compaction_list_exc, 
num_threads_compaction, - cta); - - compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right, - mid_count, right_count, s_compaction_list, - num_threads_active, is_active_second); - } + s_compaction_list[threadIdx.x] = 0; + s_left[threadIdx.x] = 0; + s_right[threadIdx.x] = 0; + s_left_count[threadIdx.x] = 0; + s_right_count[threadIdx.x] = 0; cg::sync(cta); + // set up initial configuration if (0 == threadIdx.x) { - // update number of active threads with result of reduction - num_threads_active += s_compaction_list[num_threads_active]; + s_left[0] = lg; + s_right[0] = ug; + s_left_count[0] = lg_eig_count; + s_right_count[0] = ug_eig_count; - num_threads_compaction = ceilPow2(num_threads_active); + compact_second_chunk = 0; + num_threads_active = 1; - compact_second_chunk = 0; + num_threads_compaction = 1; + } + + // for all active threads read intervals from the last level + // the number of (worst case) active threads per level l is 2^l + while (true) { + all_threads_converged = 1; + cg::sync(cta); + + is_active_second = 0; + subdivideActiveInterval(threadIdx.x, + s_left, + s_right, + s_left_count, + s_right_count, + num_threads_active, + left, + right, + left_count, + right_count, + mid, + all_threads_converged); + + cg::sync(cta); + + // check if done + if (1 == all_threads_converged) { + break; + } + + cg::sync(cta); + + // compute number of eigenvalues smaller than mid + // use all threads for reading the necessary matrix data from global + // memory + // use s_left and s_right as scratch space for diagonal and + // superdiagonal of matrix + mid_count = computeNumSmallerEigenvals( + g_d, g_s, n, mid, threadIdx.x, num_threads_active, s_left, s_right, (left == right), cta); + + cg::sync(cta); + + // store intervals + // for all threads store the first child interval in a continuous chunk of + // memory, and the second child interval -- if it exists -- in a second + // chunk; it is likely that all threads reach convergence up to + // \a epsilon at the same level; furthermore, at higher levels most / all + // threads will have only one child, so storing the first child compactly + // (first) avoids a compaction step on the first chunk and (second) makes + // a compaction of the second chunk unnecessary at higher levels (when all + // threads / intervals have exactly one child) + if (threadIdx.x < num_threads_active) { + if (left != right) { + // store intervals + storeNonEmptyIntervals(threadIdx.x, + num_threads_active, + s_left, + s_right, + s_left_count, + s_right_count, + left, + mid, + right, + left_count, + mid_count, + right_count, + epsilon, + compact_second_chunk, + s_compaction_list_exc, + is_active_second); + } + else { + storeIntervalConverged(s_left, + s_right, + s_left_count, + s_right_count, + left, + mid, + right, + left_count, + mid_count, + right_count, + s_compaction_list_exc, + compact_second_chunk, + num_threads_active, + is_active_second); + } + } + + // necessary so that compact_second_chunk is up-to-date + cg::sync(cta); + + // perform compaction of chunk where second children are stored + // scan of (num_threads_active / 2) elements, thus at most + // (num_threads_active / 4) threads are needed + if (compact_second_chunk > 0) { + createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, cta); + + compactIntervals(s_left, + s_right, + s_left_count, + s_right_count, + mid, + right, + mid_count, + right_count, + s_compaction_list, + num_threads_active, + is_active_second); + } + + cg::sync(cta); + + if (0 == 
threadIdx.x) { + // update number of active threads with result of reduction + num_threads_active += s_compaction_list[num_threads_active]; + + num_threads_compaction = ceilPow2(num_threads_active); + + compact_second_chunk = 0; + } + + cg::sync(cta); } cg::sync(cta); - } - cg::sync(cta); + // write resulting intervals to global mem + // for all threads write if they have been converged to an eigenvalue to + // a separate array - // write resulting intervals to global mem - // for all threads write if they have been converged to an eigenvalue to - // a separate array - - // at most n valid intervals - if (threadIdx.x < n) { - // intervals converged so left and right limit are identical - g_left[threadIdx.x] = s_left[threadIdx.x]; - // left count is sufficient to have global order - g_left_count[threadIdx.x] = s_left_count[threadIdx.x]; - } + // at most n valid intervals + if (threadIdx.x < n) { + // intervals converged so left and right limit are identical + g_left[threadIdx.x] = s_left[threadIdx.x]; + // left count is sufficient to have global order + g_left_count[threadIdx.x] = s_left_count[threadIdx.x]; + } } -#endif // #ifndef _BISECT_KERNEL_SMALL_H_ +#endif // #ifndef _BISECT_KERNEL_SMALL_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cu b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cu index 204ed6ad..6b1c2021 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cu +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cu @@ -28,117 +28,103 @@ /* Computation of eigenvalues of a large symmetric, tridiagonal matrix */ // includes, system -#include -#include -#include -#include #include +#include +#include +#include +#include // includes, project -#include "helper_functions.h" -#include "helper_cuda.h" +#include "bisect_large.cuh" #include "config.h" +#include "helper_cuda.h" +#include "helper_functions.h" +#include "matlab.h" #include "structs.h" #include "util.h" -#include "matlab.h" - -#include "bisect_large.cuh" // includes, kernels #include "bisect_kernel_large.cuh" -#include "bisect_kernel_large_onei.cuh" #include "bisect_kernel_large_multi.cuh" +#include "bisect_kernel_large_onei.cuh" //////////////////////////////////////////////////////////////////////////////// //! Initialize variables and memory for result //! @param result handles to memory //! 
@param matrix_size size of the matrix //////////////////////////////////////////////////////////////////////////////// -void initResultDataLargeMatrix(ResultDataLarge &result, - const unsigned int mat_size) { - // helper variables to initialize memory - unsigned int zero = 0; - unsigned int mat_size_f = sizeof(float) * mat_size; - unsigned int mat_size_ui = sizeof(unsigned int) * mat_size; +void initResultDataLargeMatrix(ResultDataLarge &result, const unsigned int mat_size) +{ + // helper variables to initialize memory + unsigned int zero = 0; + unsigned int mat_size_f = sizeof(float) * mat_size; + unsigned int mat_size_ui = sizeof(unsigned int) * mat_size; - float *tempf = (float *)malloc(mat_size_f); - unsigned int *tempui = (unsigned int *)malloc(mat_size_ui); + float *tempf = (float *)malloc(mat_size_f); + unsigned int *tempui = (unsigned int *)malloc(mat_size_ui); - for (unsigned int i = 0; i < mat_size; ++i) { - tempf[i] = 0.0f; - tempui[i] = 0; - } + for (unsigned int i = 0; i < mat_size; ++i) { + tempf[i] = 0.0f; + tempui[i] = 0; + } - // number of intervals containing only one eigenvalue after the first step - checkCudaErrors(cudaMalloc((void **)&result.g_num_one, sizeof(unsigned int))); - checkCudaErrors(cudaMemcpy(result.g_num_one, &zero, sizeof(unsigned int), - cudaMemcpyHostToDevice)); + // number of intervals containing only one eigenvalue after the first step + checkCudaErrors(cudaMalloc((void **)&result.g_num_one, sizeof(unsigned int))); + checkCudaErrors(cudaMemcpy(result.g_num_one, &zero, sizeof(unsigned int), cudaMemcpyHostToDevice)); - // number of (thread) blocks of intervals with multiple eigenvalues after - // the first iteration - checkCudaErrors( - cudaMalloc((void **)&result.g_num_blocks_mult, sizeof(unsigned int))); - checkCudaErrors(cudaMemcpy(result.g_num_blocks_mult, &zero, - sizeof(unsigned int), cudaMemcpyHostToDevice)); + // number of (thread) blocks of intervals with multiple eigenvalues after + // the first iteration + checkCudaErrors(cudaMalloc((void **)&result.g_num_blocks_mult, sizeof(unsigned int))); + checkCudaErrors(cudaMemcpy(result.g_num_blocks_mult, &zero, sizeof(unsigned int), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc((void **)&result.g_left_one, mat_size_f)); - checkCudaErrors(cudaMalloc((void **)&result.g_right_one, mat_size_f)); - checkCudaErrors(cudaMalloc((void **)&result.g_pos_one, mat_size_ui)); + checkCudaErrors(cudaMalloc((void **)&result.g_left_one, mat_size_f)); + checkCudaErrors(cudaMalloc((void **)&result.g_right_one, mat_size_f)); + checkCudaErrors(cudaMalloc((void **)&result.g_pos_one, mat_size_ui)); - checkCudaErrors(cudaMalloc((void **)&result.g_left_mult, mat_size_f)); - checkCudaErrors(cudaMalloc((void **)&result.g_right_mult, mat_size_f)); - checkCudaErrors(cudaMalloc((void **)&result.g_left_count_mult, mat_size_ui)); - checkCudaErrors(cudaMalloc((void **)&result.g_right_count_mult, mat_size_ui)); + checkCudaErrors(cudaMalloc((void **)&result.g_left_mult, mat_size_f)); + checkCudaErrors(cudaMalloc((void **)&result.g_right_mult, mat_size_f)); + checkCudaErrors(cudaMalloc((void **)&result.g_left_count_mult, mat_size_ui)); + checkCudaErrors(cudaMalloc((void **)&result.g_right_count_mult, mat_size_ui)); - checkCudaErrors( - cudaMemcpy(result.g_left_one, tempf, mat_size_f, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_right_one, tempf, mat_size_f, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_pos_one, tempui, mat_size_ui, - cudaMemcpyHostToDevice)); + 
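The zero-initialization below stages zeros through host buffers; the same effect could be had with cudaMemset, since the all-zero byte pattern encodes both 0.0f and 0u. A hypothetical helper sketch, not part of the sample:

#include <cuda_runtime.h>
#include "helper_cuda.h" // checkCudaErrors, as used throughout this sample

// Allocate 'bytes' bytes of device memory and zero them in place. cudaMemset
// fills bytewise; all-zero bytes are valid 0.0f and 0u values, so this matches
// the tempf/tempui staging copies without the extra host allocations.
static void *deviceAllocZeroed(size_t bytes)
{
    void *ptr = nullptr;
    checkCudaErrors(cudaMalloc(&ptr, bytes));
    checkCudaErrors(cudaMemset(ptr, 0, bytes));
    return ptr;
}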
checkCudaErrors(cudaMemcpy(result.g_left_one, tempf, mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_right_one, tempf, mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_pos_one, tempui, mat_size_ui, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_left_mult, tempf, mat_size_f, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_right_mult, tempf, mat_size_f, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_left_count_mult, tempui, mat_size_ui, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_right_count_mult, tempui, mat_size_ui, - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_left_mult, tempf, mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_right_mult, tempf, mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_left_count_mult, tempui, mat_size_ui, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_right_count_mult, tempui, mat_size_ui, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult, mat_size_ui)); - checkCudaErrors(cudaMemcpy(result.g_blocks_mult, tempui, mat_size_ui, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult_sum, mat_size_ui)); - checkCudaErrors(cudaMemcpy(result.g_blocks_mult_sum, tempui, mat_size_ui, - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult, mat_size_ui)); + checkCudaErrors(cudaMemcpy(result.g_blocks_mult, tempui, mat_size_ui, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)&result.g_blocks_mult_sum, mat_size_ui)); + checkCudaErrors(cudaMemcpy(result.g_blocks_mult_sum, tempui, mat_size_ui, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc((void **)&result.g_lambda_mult, mat_size_f)); - checkCudaErrors(cudaMemcpy(result.g_lambda_mult, tempf, mat_size_f, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc((void **)&result.g_pos_mult, mat_size_ui)); - checkCudaErrors(cudaMemcpy(result.g_pos_mult, tempf, mat_size_ui, - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)&result.g_lambda_mult, mat_size_f)); + checkCudaErrors(cudaMemcpy(result.g_lambda_mult, tempf, mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)&result.g_pos_mult, mat_size_ui)); + checkCudaErrors(cudaMemcpy(result.g_pos_mult, tempui, mat_size_ui, cudaMemcpyHostToDevice)); + + // release the host-side staging buffers (leaked in the original code) + free(tempf); + free(tempui); } //////////////////////////////////////////////////////////////////////////////// //! Cleanup result memory //! 
@param result handles to memory //////////////////////////////////////////////////////////////////////////////// -void cleanupResultDataLargeMatrix(ResultDataLarge &result) { - checkCudaErrors(cudaFree(result.g_num_one)); - checkCudaErrors(cudaFree(result.g_num_blocks_mult)); - checkCudaErrors(cudaFree(result.g_left_one)); - checkCudaErrors(cudaFree(result.g_right_one)); - checkCudaErrors(cudaFree(result.g_pos_one)); - checkCudaErrors(cudaFree(result.g_left_mult)); - checkCudaErrors(cudaFree(result.g_right_mult)); - checkCudaErrors(cudaFree(result.g_left_count_mult)); - checkCudaErrors(cudaFree(result.g_right_count_mult)); - checkCudaErrors(cudaFree(result.g_blocks_mult)); - checkCudaErrors(cudaFree(result.g_blocks_mult_sum)); - checkCudaErrors(cudaFree(result.g_lambda_mult)); - checkCudaErrors(cudaFree(result.g_pos_mult)); +void cleanupResultDataLargeMatrix(ResultDataLarge &result) +{ + checkCudaErrors(cudaFree(result.g_num_one)); + checkCudaErrors(cudaFree(result.g_num_blocks_mult)); + checkCudaErrors(cudaFree(result.g_left_one)); + checkCudaErrors(cudaFree(result.g_right_one)); + checkCudaErrors(cudaFree(result.g_pos_one)); + checkCudaErrors(cudaFree(result.g_left_mult)); + checkCudaErrors(cudaFree(result.g_right_mult)); + checkCudaErrors(cudaFree(result.g_left_count_mult)); + checkCudaErrors(cudaFree(result.g_right_count_mult)); + checkCudaErrors(cudaFree(result.g_blocks_mult)); + checkCudaErrors(cudaFree(result.g_blocks_mult_sum)); + checkCudaErrors(cudaFree(result.g_lambda_mult)); + checkCudaErrors(cudaFree(result.g_pos_mult)); } //////////////////////////////////////////////////////////////////////////////// @@ -151,109 +137,130 @@ void cleanupResultDataLargeMatrix(ResultDataLarge &result) { //! @param ug upper limit of Gerschgorin interval //! 
@param iterations number of iterations (for timing) //////////////////////////////////////////////////////////////////////////////// -void computeEigenvaluesLargeMatrix(const InputData &input, +void computeEigenvaluesLargeMatrix(const InputData &input, const ResultDataLarge &result, - const unsigned int mat_size, - const float precision, const float lg, - const float ug, - const unsigned int iterations) { - dim3 blocks(1, 1, 1); - dim3 threads(MAX_THREADS_BLOCK, 1, 1); + const unsigned int mat_size, + const float precision, + const float lg, + const float ug, + const unsigned int iterations) +{ + dim3 blocks(1, 1, 1); + dim3 threads(MAX_THREADS_BLOCK, 1, 1); - StopWatchInterface *timer_step1 = NULL; - StopWatchInterface *timer_step2_one = NULL; - StopWatchInterface *timer_step2_mult = NULL; - StopWatchInterface *timer_total = NULL; - sdkCreateTimer(&timer_step1); - sdkCreateTimer(&timer_step2_one); - sdkCreateTimer(&timer_step2_mult); - sdkCreateTimer(&timer_total); + StopWatchInterface *timer_step1 = NULL; + StopWatchInterface *timer_step2_one = NULL; + StopWatchInterface *timer_step2_mult = NULL; + StopWatchInterface *timer_total = NULL; + sdkCreateTimer(&timer_step1); + sdkCreateTimer(&timer_step2_one); + sdkCreateTimer(&timer_step2_mult); + sdkCreateTimer(&timer_total); - sdkStartTimer(&timer_total); + sdkStartTimer(&timer_total); - // do for multiple iterations to improve timing accuracy - for (unsigned int iter = 0; iter < iterations; ++iter) { - sdkStartTimer(&timer_step1); - bisectKernelLarge<<<blocks, threads>>>( - input.g_a, input.g_b, mat_size, lg, ug, 0, mat_size, precision, - result.g_num_one, result.g_num_blocks_mult, result.g_left_one, - result.g_right_one, result.g_pos_one, result.g_left_mult, - result.g_right_mult, result.g_left_count_mult, - result.g_right_count_mult, result.g_blocks_mult, - result.g_blocks_mult_sum); + // do for multiple iterations to improve timing accuracy + for (unsigned int iter = 0; iter < iterations; ++iter) { + sdkStartTimer(&timer_step1); + bisectKernelLarge<<<blocks, threads>>>(input.g_a, + input.g_b, + mat_size, + lg, + ug, + 0, + mat_size, + precision, + result.g_num_one, + result.g_num_blocks_mult, + result.g_left_one, + result.g_right_one, + result.g_pos_one, + result.g_left_mult, + result.g_right_mult, + result.g_left_count_mult, + result.g_right_count_mult, + result.g_blocks_mult, + result.g_blocks_mult_sum); - getLastCudaError("Kernel launch failed."); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer_step1); + getLastCudaError("Kernel launch failed."); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer_step1); - // get the number of intervals containing one eigenvalue after the first - // processing step - unsigned int num_one_intervals; - checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one, - sizeof(unsigned int), cudaMemcpyDeviceToHost)); + // get the number of intervals containing one eigenvalue after the first + // processing step + unsigned int num_one_intervals; + checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - dim3 grid_onei; - grid_onei.x = getNumBlocksLinear(num_one_intervals, MAX_THREADS_BLOCK); - dim3 threads_onei; - // use always max number of available threads to better balance load times - // for matrix data - threads_onei.x = MAX_THREADS_BLOCK; + dim3 grid_onei; + grid_onei.x = getNumBlocksLinear(num_one_intervals, MAX_THREADS_BLOCK); + dim3 threads_onei; + // always use the max number of available threads to better balance load + // times for matrix 
data + threads_onei.x = MAX_THREADS_BLOCK; - // compute eigenvalues for intervals that contained only one eigenvalue - // after the first processing step - sdkStartTimer(&timer_step2_one); + // compute eigenvalues for intervals that contained only one eigenvalue + // after the first processing step + sdkStartTimer(&timer_step2_one); - bisectKernelLarge_OneIntervals<<<grid_onei, threads_onei>>>( - input.g_a, input.g_b, mat_size, num_one_intervals, result.g_left_one, - result.g_right_one, result.g_pos_one, precision); + bisectKernelLarge_OneIntervals<<<grid_onei, threads_onei>>>(input.g_a, + input.g_b, + mat_size, + num_one_intervals, + result.g_left_one, + result.g_right_one, + result.g_pos_one, + precision); - getLastCudaError("bisectKernelLarge_OneIntervals() FAILED."); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer_step2_one); + getLastCudaError("bisectKernelLarge_OneIntervals() FAILED."); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer_step2_one); - // process intervals that contained more than one eigenvalue after - // the first processing step + // process intervals that contained more than one eigenvalue after + // the first processing step - // get the number of blocks of intervals that contain, in total when - // each interval contains only one eigenvalue, not more than - // MAX_THREADS_BLOCK threads - unsigned int num_blocks_mult = 0; - checkCudaErrors(cudaMemcpy(&num_blocks_mult, result.g_num_blocks_mult, - sizeof(unsigned int), cudaMemcpyDeviceToHost)); + // get the number of blocks of intervals that contain, in total when + // each interval contains only one eigenvalue, not more than + // MAX_THREADS_BLOCK threads + unsigned int num_blocks_mult = 0; + checkCudaErrors( + cudaMemcpy(&num_blocks_mult, result.g_num_blocks_mult, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - // setup the execution environment - dim3 grid_mult(num_blocks_mult, 1, 1); - dim3 threads_mult(MAX_THREADS_BLOCK, 1, 1); + // set up the execution environment + dim3 grid_mult(num_blocks_mult, 1, 1); + dim3 threads_mult(MAX_THREADS_BLOCK, 1, 1); - sdkStartTimer(&timer_step2_mult); + sdkStartTimer(&timer_step2_mult); - bisectKernelLarge_MultIntervals<<<grid_mult, threads_mult>>>( - input.g_a, input.g_b, mat_size, result.g_blocks_mult, - result.g_blocks_mult_sum, result.g_left_mult, result.g_right_mult, - result.g_left_count_mult, result.g_right_count_mult, - result.g_lambda_mult, result.g_pos_mult, precision); + bisectKernelLarge_MultIntervals<<<grid_mult, threads_mult>>>(input.g_a, + input.g_b, + mat_size, + result.g_blocks_mult, + result.g_blocks_mult_sum, + result.g_left_mult, + result.g_right_mult, + result.g_left_count_mult, + result.g_right_count_mult, + result.g_lambda_mult, + result.g_pos_mult, + precision); - getLastCudaError("bisectKernelLarge_MultIntervals() FAILED."); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer_step2_mult); - } + getLastCudaError("bisectKernelLarge_MultIntervals() FAILED."); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer_step2_mult); + } - sdkStopTimer(&timer_total); + sdkStopTimer(&timer_total); - printf("Average time step 1: %f ms\n", - sdkGetTimerValue(&timer_step1) / (float)iterations); - printf("Average time step 2, one intervals: %f ms\n", - sdkGetTimerValue(&timer_step2_one) / (float)iterations); - printf("Average time step 2, mult intervals: %f ms\n", - sdkGetTimerValue(&timer_step2_mult) / (float)iterations); + printf("Average time step 1: %f ms\n", sdkGetTimerValue(&timer_step1) / (float)iterations); + printf("Average time step 2, one intervals: %f ms\n", sdkGetTimerValue(&timer_step2_one) / 
(float)iterations); + printf("Average time step 2, mult intervals: %f ms\n", sdkGetTimerValue(&timer_step2_mult) / (float)iterations); - printf("Average time TOTAL: %f ms\n", - sdkGetTimerValue(&timer_total) / (float)iterations); + printf("Average time TOTAL: %f ms\n", sdkGetTimerValue(&timer_total) / (float)iterations); - sdkDeleteTimer(&timer_step1); - sdkDeleteTimer(&timer_step2_one); - sdkDeleteTimer(&timer_step2_mult); - sdkDeleteTimer(&timer_total); + sdkDeleteTimer(&timer_step1); + sdkDeleteTimer(&timer_step2_one); + sdkDeleteTimer(&timer_step2_mult); + sdkDeleteTimer(&timer_total); } //////////////////////////////////////////////////////////////////////////////// @@ -264,105 +271,98 @@ void computeEigenvaluesLargeMatrix(const InputData &input, //! @param mat_size matrix size //! @param filename output filename //////////////////////////////////////////////////////////////////////////////// -bool processResultDataLargeMatrix(const InputData &input, +bool processResultDataLargeMatrix(const InputData &input, const ResultDataLarge &result, - const unsigned int mat_size, - const char *filename, - const unsigned int user_defined, - char *exec_path) { - bool bCompareResult = false; - const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size; - const unsigned int mat_size_f = sizeof(float) * mat_size; + const unsigned int mat_size, + const char *filename, + const unsigned int user_defined, + char *exec_path) +{ + bool bCompareResult = false; + const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size; + const unsigned int mat_size_f = sizeof(float) * mat_size; - // copy data from intervals that contained more than one eigenvalue after - // the first processing step - float *lambda_mult = (float *)malloc(sizeof(float) * mat_size); - checkCudaErrors(cudaMemcpy(lambda_mult, result.g_lambda_mult, - sizeof(float) * mat_size, cudaMemcpyDeviceToHost)); - unsigned int *pos_mult = - (unsigned int *)malloc(sizeof(unsigned int) * mat_size); - checkCudaErrors(cudaMemcpy(pos_mult, result.g_pos_mult, - sizeof(unsigned int) * mat_size, - cudaMemcpyDeviceToHost)); + // copy data from intervals that contained more than one eigenvalue after + // the first processing step + float *lambda_mult = (float *)malloc(sizeof(float) * mat_size); + checkCudaErrors(cudaMemcpy(lambda_mult, result.g_lambda_mult, sizeof(float) * mat_size, cudaMemcpyDeviceToHost)); + unsigned int *pos_mult = (unsigned int *)malloc(sizeof(unsigned int) * mat_size); + checkCudaErrors(cudaMemcpy(pos_mult, result.g_pos_mult, sizeof(unsigned int) * mat_size, cudaMemcpyDeviceToHost)); - unsigned int *blocks_mult_sum = - (unsigned int *)malloc(sizeof(unsigned int) * mat_size); - checkCudaErrors(cudaMemcpy(blocks_mult_sum, result.g_blocks_mult_sum, - sizeof(unsigned int) * mat_size, - cudaMemcpyDeviceToHost)); + unsigned int *blocks_mult_sum = (unsigned int *)malloc(sizeof(unsigned int) * mat_size); + checkCudaErrors( + cudaMemcpy(blocks_mult_sum, result.g_blocks_mult_sum, sizeof(unsigned int) * mat_size, cudaMemcpyDeviceToHost)); - unsigned int num_one_intervals; - checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one, - sizeof(unsigned int), cudaMemcpyDeviceToHost)); + unsigned int num_one_intervals; + checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - unsigned int sum_blocks_mult = mat_size - num_one_intervals; + unsigned int sum_blocks_mult = mat_size - num_one_intervals; - // copy data for intervals that contained one eigenvalue after the first - // 
processing step - float *left_one = (float *)malloc(mat_size_f); - float *right_one = (float *)malloc(mat_size_f); - unsigned int *pos_one = (unsigned int *)malloc(mat_size_ui); - checkCudaErrors(cudaMemcpy(left_one, result.g_left_one, mat_size_f, - cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(right_one, result.g_right_one, mat_size_f, - cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(pos_one, result.g_pos_one, mat_size_ui, - cudaMemcpyDeviceToHost)); + // copy data for intervals that contained one eigenvalue after the first + // processing step + float *left_one = (float *)malloc(mat_size_f); + float *right_one = (float *)malloc(mat_size_f); + unsigned int *pos_one = (unsigned int *)malloc(mat_size_ui); + checkCudaErrors(cudaMemcpy(left_one, result.g_left_one, mat_size_f, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(right_one, result.g_right_one, mat_size_f, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(pos_one, result.g_pos_one, mat_size_ui, cudaMemcpyDeviceToHost)); - // extract eigenvalues - float *eigenvals = (float *)malloc(mat_size_f); + // extract eigenvalues + float *eigenvals = (float *)malloc(mat_size_f); - // singleton intervals generated in the second step - for (unsigned int i = 0; i < sum_blocks_mult; ++i) { - eigenvals[pos_mult[i] - 1] = lambda_mult[i]; - } - - // singleton intervals generated in the first step - unsigned int index = 0; - - for (unsigned int i = 0; i < num_one_intervals; ++i, ++index) { - eigenvals[pos_one[i] - 1] = left_one[i]; - } - - if (1 == user_defined) { - // store result - writeTridiagSymMatlab(filename, input.a, input.b + 1, eigenvals, mat_size); - // getLastCudaError( sdkWriteFilef( filename, eigenvals, mat_size, 0.0f)); - - printf("User requests non-default argument(s), skipping self-check!\n"); - bCompareResult = true; - } else { - // compare with reference solution - - float *reference = NULL; - unsigned int input_data_size = 0; - - char *ref_path = sdkFindFilePath("reference.dat", exec_path); - assert(NULL != ref_path); - sdkReadFile(ref_path, &reference, &input_data_size, false); - assert(input_data_size == mat_size); - - // there's an imprecision of Sturm count computation which makes an - // additional offset necessary - float tolerance = 1.0e-5f + 5.0e-6f; - - if (sdkCompareL2fe(reference, eigenvals, mat_size, tolerance) == true) { - bCompareResult = true; - } else { - bCompareResult = false; + // singleton intervals generated in the second step + for (unsigned int i = 0; i < sum_blocks_mult; ++i) { + eigenvals[pos_mult[i] - 1] = lambda_mult[i]; } - free(ref_path); - free(reference); - } + // singleton intervals generated in the first step + unsigned int index = 0; - freePtr(eigenvals); - freePtr(lambda_mult); - freePtr(pos_mult); - freePtr(blocks_mult_sum); - freePtr(left_one); - freePtr(right_one); - freePtr(pos_one); + for (unsigned int i = 0; i < num_one_intervals; ++i, ++index) { + eigenvals[pos_one[i] - 1] = left_one[i]; + } - return bCompareResult; + if (1 == user_defined) { + // store result + writeTridiagSymMatlab(filename, input.a, input.b + 1, eigenvals, mat_size); + // getLastCudaError( sdkWriteFilef( filename, eigenvals, mat_size, 0.0f)); + + printf("User requests non-default argument(s), skipping self-check!\n"); + bCompareResult = true; + } + else { + // compare with reference solution + + float *reference = NULL; + unsigned int input_data_size = 0; + + char *ref_path = sdkFindFilePath("reference.dat", exec_path); + assert(NULL != ref_path); + sdkReadFile(ref_path, &reference, 
&input_data_size, false); + assert(input_data_size == mat_size); + + // there's an imprecision of Sturm count computation which makes an + // additional offset necessary + float tolerance = 1.0e-5f + 5.0e-6f; + + if (sdkCompareL2fe(reference, eigenvals, mat_size, tolerance) == true) { + bCompareResult = true; + } + else { + bCompareResult = false; + } + + free(ref_path); + free(reference); + } + + freePtr(eigenvals); + freePtr(lambda_mult); + freePtr(pos_mult); + freePtr(blocks_mult_sum); + freePtr(left_one); + freePtr(right_one); + freePtr(pos_one); + + return bCompareResult; } diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cuh b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cuh index d65bfd43..c4085055 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cuh +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_large.cuh @@ -30,53 +30,56 @@ #ifndef _BISECT_LARGE_CUH_ #define _BISECT_LARGE_CUH_ -extern "C" { +#include "structs.h" -//////////////////////////////////////////////////////////////////////////////// -//! Run the kernels to compute the eigenvalues for large matrices -//! @param input handles to input data -//! @param result handles to result data -//! @param mat_size matrix size -//! @param precision desired precision of eigenvalues -//! @param lg lower limit of Gerschgorin interval -//! @param ug upper limit of Gerschgorin interval -//! @param iterations number of iterations (for timing) -//////////////////////////////////////////////////////////////////////////////// -void computeEigenvaluesLargeMatrix(const InputData &input, - const ResultDataLarge &result, - const unsigned int mat_size, - const float precision, const float lg, - const float ug, - const unsigned int iterations); +extern "C" +{ -//////////////////////////////////////////////////////////////////////////////// -//! Initialize variables and memory for result -//! @param result handles to memory -//! @param matr_size size of the matrix -//////////////////////////////////////////////////////////////////////////////// -void initResultDataLargeMatrix(ResultDataLarge &result, - const unsigned int mat_size); + //////////////////////////////////////////////////////////////////////////////// + //! Run the kernels to compute the eigenvalues for large matrices + //! @param input handles to input data + //! @param result handles to result data + //! @param mat_size matrix size + //! @param precision desired precision of eigenvalues + //! @param lg lower limit of Gerschgorin interval + //! @param ug upper limit of Gerschgorin interval + //! @param iterations number of iterations (for timing) + //////////////////////////////////////////////////////////////////////////////// + void computeEigenvaluesLargeMatrix(const InputData &input, + const ResultDataLarge &result, + const unsigned int mat_size, + const float precision, + const float lg, + const float ug, + const unsigned int iterations); -//////////////////////////////////////////////////////////////////////////////// -//! Cleanup result memory -//! @param result handles to memory -//////////////////////////////////////////////////////////////////////////////// -void cleanupResultDataLargeMatrix(ResultDataLarge &result); + //////////////////////////////////////////////////////////////////////////////// + //! Initialize variables and memory for result + //! @param result handles to memory + //! 
@param mat_size size of the matrix + //////////////////////////////////////////////////////////////////////////////// + void initResultDataLargeMatrix(ResultDataLarge &result, const unsigned int mat_size); -//////////////////////////////////////////////////////////////////////////////// -//! Cleanup result memory -//! @param result handles to memory -//////////////////////////////////////////////////////////////////////////////// -void cleanupResultDataLargeMatrix(ResultDataLarge &result); + //////////////////////////////////////////////////////////////////////////////// + //! Cleanup result memory + //! @param result handles to memory + //////////////////////////////////////////////////////////////////////////////// + void cleanupResultDataLargeMatrix(ResultDataLarge &result); + + //////////////////////////////////////////////////////////////////////////////// + //! Process the result, that is obtain result from device and do simple sanity + //! checking + //! @param input handles to input data + //! @param result handles to result data + //! @param mat_size matrix size + //! @param filename output filename + //////////////////////////////////////////////////////////////////////////////// + bool processResultDataLargeMatrix(const InputData &input, + const ResultDataLarge &result, + const unsigned int mat_size, + const char *filename, + const unsigned int user_defined, + char *exec_path); }; -#endif // #ifndef _BISECT_LARGE_CUH_ +#endif // #ifndef _BISECT_LARGE_CUH_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cu b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cu index 4da03c34..baefc64c 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cu +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cu @@ -28,18 +28,18 @@ /* Computation of eigenvalues of a small symmetric, tridiagonal matrix */ // includes, system -#include -#include -#include -#include #include +#include +#include +#include +#include // includes, project -#include "helper_functions.h" -#include "helper_cuda.h" #include "config.h" -#include "structs.h" +#include "helper_cuda.h" +#include "helper_functions.h" #include "matlab.h" +#include "structs.h" // includes, kernels #include "bisect_kernel_small.cuh" @@ -58,32 +58,42 @@ //! @param precision desired precision of eigenvalues //! 
@param iterations number of iterations for timing //////////////////////////////////////////////////////////////////////////////// -void computeEigenvaluesSmallMatrix(const InputData &input, - ResultDataSmall &result, - const unsigned int mat_size, const float lg, - const float ug, const float precision, - const unsigned int iterations) { - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); - sdkStartTimer(&timer); +void computeEigenvaluesSmallMatrix(const InputData &input, + ResultDataSmall &result, + const unsigned int mat_size, + const float lg, + const float ug, + const float precision, + const unsigned int iterations) +{ + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkStartTimer(&timer); - for (unsigned int i = 0; i < iterations; ++i) { - dim3 blocks(1, 1, 1); - dim3 threads(MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1); + for (unsigned int i = 0; i < iterations; ++i) { + dim3 blocks(1, 1, 1); + dim3 threads(MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1); - bisectKernel<<<blocks, threads>>>(input.g_a, input.g_b, mat_size, - result.g_left, result.g_right, - result.g_left_count, result.g_right_count, - lg, ug, 0, mat_size, precision); - } + bisectKernel<<<blocks, threads>>>(input.g_a, + input.g_b, + mat_size, + result.g_left, + result.g_right, + result.g_left_count, + result.g_right_count, + lg, + ug, + 0, + mat_size, + precision); + } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer); - getLastCudaError("Kernel launch failed"); - printf("Average time: %f ms (%i iterations)\n", - sdkGetTimerValue(&timer) / (float)iterations, iterations); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer); + getLastCudaError("Kernel launch failed"); + printf("Average time: %f ms (%i iterations)\n", sdkGetTimerValue(&timer) / (float)iterations, iterations); - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); } //////////////////////////////////////////////////////////////////////////////// @@ -91,56 +101,51 @@ void computeEigenvaluesSmallMatrix(const InputData &input, //! @param result handles to the necessary memory //! 
@param mat_size matrix_size //////////////////////////////////////////////////////////////////////////////// -void initResultSmallMatrix(ResultDataSmall &result, - const unsigned int mat_size) { - result.mat_size_f = sizeof(float) * mat_size; - result.mat_size_ui = sizeof(unsigned int) * mat_size; +void initResultSmallMatrix(ResultDataSmall &result, const unsigned int mat_size) +{ + result.mat_size_f = sizeof(float) * mat_size; + result.mat_size_ui = sizeof(unsigned int) * mat_size; - result.eigenvalues = (float *)malloc(result.mat_size_f); + result.eigenvalues = (float *)malloc(result.mat_size_f); - // helper variables - result.zero_f = (float *)malloc(result.mat_size_f); - result.zero_ui = (unsigned int *)malloc(result.mat_size_ui); + // helper variables + result.zero_f = (float *)malloc(result.mat_size_f); + result.zero_ui = (unsigned int *)malloc(result.mat_size_ui); - for (unsigned int i = 0; i < mat_size; ++i) { - result.zero_f[i] = 0.0f; - result.zero_ui[i] = 0; + for (unsigned int i = 0; i < mat_size; ++i) { + result.zero_f[i] = 0.0f; + result.zero_ui[i] = 0; - result.eigenvalues[i] = 0.0f; - } + result.eigenvalues[i] = 0.0f; + } - checkCudaErrors(cudaMalloc((void **)&result.g_left, result.mat_size_f)); - checkCudaErrors(cudaMalloc((void **)&result.g_right, result.mat_size_f)); + checkCudaErrors(cudaMalloc((void **)&result.g_left, result.mat_size_f)); + checkCudaErrors(cudaMalloc((void **)&result.g_right, result.mat_size_f)); - checkCudaErrors( - cudaMalloc((void **)&result.g_left_count, result.mat_size_ui)); - checkCudaErrors( - cudaMalloc((void **)&result.g_right_count, result.mat_size_ui)); + checkCudaErrors(cudaMalloc((void **)&result.g_left_count, result.mat_size_ui)); + checkCudaErrors(cudaMalloc((void **)&result.g_right_count, result.mat_size_ui)); - // initialize result memory - checkCudaErrors(cudaMemcpy(result.g_left, result.zero_f, result.mat_size_f, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_right, result.zero_f, result.mat_size_f, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_right_count, result.zero_ui, - result.mat_size_ui, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(result.g_left_count, result.zero_ui, - result.mat_size_ui, cudaMemcpyHostToDevice)); + // initialize result memory + checkCudaErrors(cudaMemcpy(result.g_left, result.zero_f, result.mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_right, result.zero_f, result.mat_size_f, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_right_count, result.zero_ui, result.mat_size_ui, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(result.g_left_count, result.zero_ui, result.mat_size_ui, cudaMemcpyHostToDevice)); } //////////////////////////////////////////////////////////////////////////////// //! Cleanup memory and variables for result for small matrices //! 
@param result handle to variables //////////////////////////////////////////////////////////////////////////////// -void cleanupResultSmallMatrix(ResultDataSmall &result) { - freePtr(result.eigenvalues); - freePtr(result.zero_f); - freePtr(result.zero_ui); +void cleanupResultSmallMatrix(ResultDataSmall &result) +{ + freePtr(result.eigenvalues); + freePtr(result.zero_f); + freePtr(result.zero_ui); - checkCudaErrors(cudaFree(result.g_left)); - checkCudaErrors(cudaFree(result.g_right)); - checkCudaErrors(cudaFree(result.g_left_count)); - checkCudaErrors(cudaFree(result.g_right_count)); + checkCudaErrors(cudaFree(result.g_left)); + checkCudaErrors(cudaFree(result.g_right)); + checkCudaErrors(cudaFree(result.g_left_count)); + checkCudaErrors(cudaFree(result.g_right_count)); } //////////////////////////////////////////////////////////////////////////////// @@ -151,32 +156,31 @@ void cleanupResultSmallMatrix(ResultDataSmall &result) { //! @param mat_size matrix size //! @param filename output filename //////////////////////////////////////////////////////////////////////////////// -void processResultSmallMatrix(const InputData &input, +void processResultSmallMatrix(const InputData &input, const ResultDataSmall &result, - const unsigned int mat_size, - const char *filename) { - const unsigned int mat_size_f = sizeof(float) * mat_size; - const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size; + const unsigned int mat_size, + const char *filename) +{ + const unsigned int mat_size_f = sizeof(float) * mat_size; + const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size; - // copy data back to host - float *left = (float *)malloc(mat_size_f); - unsigned int *left_count = (unsigned int *)malloc(mat_size_ui); + // copy data back to host + float *left = (float *)malloc(mat_size_f); + unsigned int *left_count = (unsigned int *)malloc(mat_size_ui); - checkCudaErrors( - cudaMemcpy(left, result.g_left, mat_size_f, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(left_count, result.g_left_count, mat_size_ui, - cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(left, result.g_left, mat_size_f, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(left_count, result.g_left_count, mat_size_ui, cudaMemcpyDeviceToHost)); - float *eigenvalues = (float *)malloc(mat_size_f); + float *eigenvalues = (float *)malloc(mat_size_f); - for (unsigned int i = 0; i < mat_size; ++i) { - eigenvalues[left_count[i]] = left[i]; - } + for (unsigned int i = 0; i < mat_size; ++i) { + eigenvalues[left_count[i]] = left[i]; + } - // save result in matlab format - writeTridiagSymMatlab(filename, input.a, input.b + 1, eigenvalues, mat_size); + // save result in matlab format + writeTridiagSymMatlab(filename, input.a, input.b + 1, eigenvalues, mat_size); - freePtr(left); - freePtr(left_count); - freePtr(eigenvalues); + freePtr(left); + freePtr(left_count); + freePtr(eigenvalues); } diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cuh b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cuh index d373b128..a064de6d 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cuh +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_small.cuh @@ -30,51 +30,53 @@ #ifndef _BISECT_SMALL_CUH_ #define _BISECT_SMALL_CUH_ -extern "C" { +extern "C" +{ -//////////////////////////////////////////////////////////////////////////////// -//! Determine eigenvalues for matrices smaller than MAX_SMALL_MATRIX -//! @param TimingIterations number of iterations for timing -//! 
@param input handles to input data of kernel -//! @param result handles to result of kernel -//! @param mat_size matrix size -//! @param lg lower limit of Gerschgorin interval -//! @param ug upper limit of Gerschgorin interval -//! @param precision desired precision of eigenvalues -//! @param iterations number of iterations for timing -//////////////////////////////////////////////////////////////////////////////// -void computeEigenvaluesSmallMatrix(const InputData &input, - ResultDataSmall &result, - const unsigned int mat_size, const float lg, - const float ug, const float precision, - const unsigned int iterations); + //////////////////////////////////////////////////////////////////////////////// + //! Determine eigenvalues for matrices smaller than MAX_SMALL_MATRIX + //! @param input handles to input data of kernel + //! @param result handles to result of kernel + //! @param mat_size matrix size + //! @param lg lower limit of Gerschgorin interval + //! @param ug upper limit of Gerschgorin interval + //! @param precision desired precision of eigenvalues + //! @param iterations number of iterations for timing + //////////////////////////////////////////////////////////////////////////////// + void computeEigenvaluesSmallMatrix(const InputData &input, + ResultDataSmall &result, + const unsigned int mat_size, + const float lg, + const float ug, + const float precision, + const unsigned int iterations); -//////////////////////////////////////////////////////////////////////////////// -//! Initialize variables and memory for the result for small matrices -//! @param result handles to the necessary memory -//! @param mat_size matrix_size -//////////////////////////////////////////////////////////////////////////////// -void initResultSmallMatrix(ResultDataSmall &result, - const unsigned int mat_size); + //////////////////////////////////////////////////////////////////////////////// + //! Initialize variables and memory for the result for small matrices + //! @param result handles to the necessary memory + //! @param mat_size matrix_size + //////////////////////////////////////////////////////////////////////////////// + void initResultSmallMatrix(ResultDataSmall &result, const unsigned int mat_size); -//////////////////////////////////////////////////////////////////////////////// -//! Cleanup memory and variables for result for small matrices -//! @param result handle to variables -//////////////////////////////////////////////////////////////////////////////// -void cleanupResultSmallMatrix(ResultDataSmall &result); + //////////////////////////////////////////////////////////////////////////////// + //! Cleanup memory and variables for result for small matrices + //! @param result handle to variables + //////////////////////////////////////////////////////////////////////////////// + void cleanupResultSmallMatrix(ResultDataSmall &result); -//////////////////////////////////////////////////////////////////////////////// -//! Process the result obtained on the device, that is transfer to host and -//! perform basic sanity checking -//! @param input handles to input data -//! @param result handles to result variables -//! @param mat_size matrix size -//! 
@param filename output filename -//////////////////////////////////////////////////////////////////////////////// -void processResultSmallMatrix(const InputData &input, - const ResultDataSmall &result, - const unsigned int mat_size, - const char *filename); + //////////////////////////////////////////////////////////////////////////////// + //! Process the result obtained on the device, that is transfer to host and + //! perform basic sanity checking + //! @param input handles to input data + //! @param result handles to result variables + //! @param mat_size matrix size + //! @param filename output filename + //////////////////////////////////////////////////////////////////////////////// + void processResultSmallMatrix(const InputData &input, + const ResultDataSmall &result, + const unsigned int mat_size, + const char *filename); } -#endif // #ifndef _BISECT_SMALL_CUH_ +#endif // #ifndef _BISECT_SMALL_CUH_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_util.cu b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_util.cu index d443889b..4551f610 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_util.cu +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/bisect_util.cu @@ -42,30 +42,32 @@ namespace cg = cooperative_groups; //! Compute the next lower power of two of n //! @param n number for which next lower power of two is sought //////////////////////////////////////////////////////////////////////////////// -__device__ inline int floorPow2(int n) { - // early out if already power of two - if (0 == (n & (n - 1))) { - return n; - } +__device__ inline int floorPow2(int n) +{ + // early out if already power of two + if (0 == (n & (n - 1))) { + return n; + } - int exp; - frexp((float)n, &exp); - return (1 << (exp - 1)); + int exp; + frexp((float)n, &exp); + return (1 << (exp - 1)); } //////////////////////////////////////////////////////////////////////////////// //! Compute the next higher power of two of n //! @param n number for which next higher power of two is sought //////////////////////////////////////////////////////////////////////////////// -__device__ inline int ceilPow2(int n) { - // early out if already power of two - if (0 == (n & (n - 1))) { - return n; - } +__device__ inline int ceilPow2(int n) +{ + // early out if already power of two + if (0 == (n & (n - 1))) { + return n; + } - int exp; - frexp((float)n, &exp); - return (1 << exp); + int exp; + frexp((float)n, &exp); + return (1 << exp); } //////////////////////////////////////////////////////////////////////////////// @@ -74,16 +76,18 @@ __device__ inline int ceilPow2(int n) { //! @param left left / lower limit of interval //! @param right right / upper limit of interval //////////////////////////////////////////////////////////////////////////////// -__device__ inline float computeMidpoint(const float left, const float right) { - float mid; +__device__ inline float computeMidpoint(const float left, const float right) +{ + float mid; - if (sign_f(left) == sign_f(right)) { - mid = left + (right - left) * 0.5f; - } else { - mid = (left + right) * 0.5f; - } + if (sign_f(left) == sign_f(right)) { + mid = left + (right - left) * 0.5f; + } + else { + mid = (left + right) * 0.5f; + } - return mid; + return mid; } //////////////////////////////////////////////////////////////////////////////// @@ -102,29 +106,37 @@ __device__ inline float computeMidpoint(const float left, const float right) { //! 
@param precision desired precision for eigenvalues //////////////////////////////////////////////////////////////////////////////// template -__device__ void storeInterval(unsigned int addr, float *s_left, float *s_right, - T *s_left_count, T *s_right_count, float left, - float right, S left_count, S right_count, - float precision) { - s_left_count[addr] = left_count; - s_right_count[addr] = right_count; +__device__ void storeInterval(unsigned int addr, + float *s_left, + float *s_right, + T *s_left_count, + T *s_right_count, + float left, + float right, + S left_count, + S right_count, + float precision) +{ + s_left_count[addr] = left_count; + s_right_count[addr] = right_count; - // check if interval converged - float t0 = abs(right - left); - float t1 = max(abs(left), abs(right)) * precision; + // check if interval converged + float t0 = abs(right - left); + float t1 = max(abs(left), abs(right)) * precision; - if (t0 <= max(MIN_ABS_INTERVAL, t1)) { - // compute mid point - float lambda = computeMidpoint(left, right); + if (t0 <= max(MIN_ABS_INTERVAL, t1)) { + // compute mid point + float lambda = computeMidpoint(left, right); - // mark as converged - s_left[addr] = lambda; - s_right[addr] = lambda; - } else { - // store current limits - s_left[addr] = left; - s_right[addr] = right; - } + // mark as converged + s_left[addr] = lambda; + s_right[addr] = lambda; + } + else { + // store current limits + s_left[addr] = left; + s_right[addr] = right; + } } //////////////////////////////////////////////////////////////////////////////// @@ -145,35 +157,42 @@ __device__ void storeInterval(unsigned int addr, float *s_left, float *s_right, //! @param converged flag if the current thread is already converged (that //! is count does not have to be computed) //////////////////////////////////////////////////////////////////////////////// -__device__ inline unsigned int computeNumSmallerEigenvals( - float *g_d, float *g_s, const unsigned int n, const float x, - const unsigned int tid, const unsigned int num_intervals_active, float *s_d, - float *s_s, unsigned int converged, cg::thread_block cta) { - float delta = 1.0f; - unsigned int count = 0; +__device__ inline unsigned int computeNumSmallerEigenvals(float *g_d, + float *g_s, + const unsigned int n, + const float x, + const unsigned int tid, + const unsigned int num_intervals_active, + float *s_d, + float *s_s, + unsigned int converged, + cg::thread_block cta) +{ + float delta = 1.0f; + unsigned int count = 0; - cg::sync(cta); + cg::sync(cta); - // read data into shared memory - if (threadIdx.x < n) { - s_d[threadIdx.x] = *(g_d + threadIdx.x); - s_s[threadIdx.x] = *(g_s + threadIdx.x - 1); - } - - cg::sync(cta); - - // perform loop only for active threads - if ((tid < num_intervals_active) && (0 == converged)) { - // perform (optimized) Gaussian elimination to determine the number - // of eigenvalues that are smaller than n - for (unsigned int k = 0; k < n; ++k) { - delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; - count += (delta < 0) ? 
1 : 0; + // read data into shared memory + if (threadIdx.x < n) { + s_d[threadIdx.x] = *(g_d + threadIdx.x); + s_s[threadIdx.x] = *(g_s + threadIdx.x - 1); } - } // end if thread currently processing an interval + cg::sync(cta); - return count; + // perform loop only for active threads + if ((tid < num_intervals_active) && (0 == converged)) { + // perform (optimized) Gaussian elimination to determine the number + // of eigenvalues that are smaller than n + for (unsigned int k = 0; k < n; ++k) { + delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; + count += (delta < 0) ? 1 : 0; + } + + } // end if thread currently processing an interval + + return count; } //////////////////////////////////////////////////////////////////////////////// @@ -194,42 +213,49 @@ __device__ inline unsigned int computeNumSmallerEigenvals( //! @param converged flag if the current thread is already converged (that //! is count does not have to be computed) //////////////////////////////////////////////////////////////////////////////// -__device__ inline unsigned int computeNumSmallerEigenvalsLarge( - float *g_d, float *g_s, const unsigned int n, const float x, - const unsigned int tid, const unsigned int num_intervals_active, float *s_d, - float *s_s, unsigned int converged, cg::thread_block cta) { - float delta = 1.0f; - unsigned int count = 0; +__device__ inline unsigned int computeNumSmallerEigenvalsLarge(float *g_d, + float *g_s, + const unsigned int n, + const float x, + const unsigned int tid, + const unsigned int num_intervals_active, + float *s_d, + float *s_s, + unsigned int converged, + cg::thread_block cta) +{ + float delta = 1.0f; + unsigned int count = 0; - unsigned int rem = n; + unsigned int rem = n; - // do until whole diagonal and superdiagonal has been loaded and processed - for (unsigned int i = 0; i < n; i += blockDim.x) { - cg::sync(cta); + // do until whole diagonal and superdiagonal has been loaded and processed + for (unsigned int i = 0; i < n; i += blockDim.x) { + cg::sync(cta); - // read new chunk of data into shared memory - if ((i + threadIdx.x) < n) { - s_d[threadIdx.x] = *(g_d + i + threadIdx.x); - s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1); + // read new chunk of data into shared memory + if ((i + threadIdx.x) < n) { + s_d[threadIdx.x] = *(g_d + i + threadIdx.x); + s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1); + } + + cg::sync(cta); + + if (tid < num_intervals_active) { + // perform (optimized) Gaussian elimination to determine the number + // of eigenvalues that are smaller than n + for (unsigned int k = 0; k < min(rem, blockDim.x); ++k) { + delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; + // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta; + count += (delta < 0) ? 1 : 0; + } + + } // end if thread currently processing an interval + + rem -= blockDim.x; } - cg::sync(cta); - - if (tid < num_intervals_active) { - // perform (optimized) Gaussian elimination to determine the number - // of eigenvalues that are smaller than n - for (unsigned int k = 0; k < min(rem, blockDim.x); ++k) { - delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; - // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta; - count += (delta < 0) ? 1 : 0; - } - - } // end if thread currently processing an interval - - rem -= blockDim.x; - } - - return count; + return count; } //////////////////////////////////////////////////////////////////////////////// @@ -258,39 +284,51 @@ __device__ inline unsigned int computeNumSmallerEigenvalsLarge( //! 
@is_active_interval mark is thread has a second non-empty child interval //////////////////////////////////////////////////////////////////////////////// template -__device__ void storeNonEmptyIntervals( - unsigned int addr, const unsigned int num_threads_active, float *s_left, - float *s_right, T *s_left_count, T *s_right_count, float left, float mid, - float right, const S left_count, const S mid_count, const S right_count, - float precision, unsigned int &compact_second_chunk, - T *s_compaction_list_exc, unsigned int &is_active_second) { - // check if both child intervals are valid - if ((left_count != mid_count) && (mid_count != right_count)) { - // store the left interval - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid, - left_count, mid_count, precision); +__device__ void storeNonEmptyIntervals(unsigned int addr, + const unsigned int num_threads_active, + float *s_left, + float *s_right, + T *s_left_count, + T *s_right_count, + float left, + float mid, + float right, + const S left_count, + const S mid_count, + const S right_count, + float precision, + unsigned int &compact_second_chunk, + T *s_compaction_list_exc, + unsigned int &is_active_second) +{ + // check if both child intervals are valid + if ((left_count != mid_count) && (mid_count != right_count)) { + // store the left interval + storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, mid, left_count, mid_count, precision); - // mark that a second interval has been generated, only stored after - // stream compaction of second chunk - is_active_second = 1; - s_compaction_list_exc[threadIdx.x] = 1; - atomicExch(&compact_second_chunk, 1); - } else { - // only one non-empty child interval - - // mark that no second child - is_active_second = 0; - s_compaction_list_exc[threadIdx.x] = 0; - - // store the one valid child interval - if (left_count != mid_count) { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, left, - mid, left_count, mid_count, precision); - } else { - storeInterval(addr, s_left, s_right, s_left_count, s_right_count, mid, - right, mid_count, right_count, precision); + // mark that a second interval has been generated, only stored after + // stream compaction of second chunk + is_active_second = 1; + s_compaction_list_exc[threadIdx.x] = 1; + atomicExch(&compact_second_chunk, 1); + } + else { + // only one non-empty child interval + + // mark that no second child + is_active_second = 0; + s_compaction_list_exc[threadIdx.x] = 0; + + // store the one valid child interval + if (left_count != mid_count) { + storeInterval( + addr, s_left, s_right, s_left_count, s_right_count, left, mid, left_count, mid_count, precision); + } + else { + storeInterval( + addr, s_left, s_right, s_left_count, s_right_count, mid, right, mid_count, right_count, precision); + } } - } } //////////////////////////////////////////////////////////////////////////////// //! Create indices for compaction, that is process \a s_compaction_list_exc @@ -303,42 +341,40 @@ __device__ void storeNonEmptyIntervals( //! 
@param num_threads_compaction number of threads to employ for compaction //////////////////////////////////////////////////////////////////////////////// template -__device__ void createIndicesCompaction(T *s_compaction_list_exc, - unsigned int num_threads_compaction, - cg::thread_block cta) { - unsigned int offset = 1; - const unsigned int tid = threadIdx.x; +__device__ void +createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction, cg::thread_block cta) +{ + unsigned int offset = 1; + const unsigned int tid = threadIdx.x; - // higher levels of scan tree - for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) { - cg::sync(cta); + // higher levels of scan tree + for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) { + cg::sync(cta); - if (tid < d) { - unsigned int ai = offset * (2 * tid + 1) - 1; - unsigned int bi = offset * (2 * tid + 2) - 1; + if (tid < d) { + unsigned int ai = offset * (2 * tid + 1) - 1; + unsigned int bi = offset * (2 * tid + 2) - 1; - s_compaction_list_exc[bi] = - s_compaction_list_exc[bi] + s_compaction_list_exc[ai]; + s_compaction_list_exc[bi] = s_compaction_list_exc[bi] + s_compaction_list_exc[ai]; + } + + offset <<= 1; } - offset <<= 1; - } + // traverse down tree: first down to level 2 across + for (int d = 2; d < num_threads_compaction; d <<= 1) { + offset >>= 1; + cg::sync(cta); - // traverse down tree: first down to level 2 across - for (int d = 2; d < num_threads_compaction; d <<= 1) { - offset >>= 1; - cg::sync(cta); + if (tid < (d - 1)) { + unsigned int ai = offset * (tid + 1) - 1; + unsigned int bi = ai + (offset >> 1); - if (tid < (d - 1)) { - unsigned int ai = offset * (tid + 1) - 1; - unsigned int bi = ai + (offset >> 1); - - s_compaction_list_exc[bi] = - s_compaction_list_exc[bi] + s_compaction_list_exc[ai]; + s_compaction_list_exc[bi] = s_compaction_list_exc[bi] + s_compaction_list_exc[ai]; + } } - } - cg::sync(cta); + cg::sync(cta); } /////////////////////////////////////////////////////////////////////////////// @@ -359,24 +395,30 @@ __device__ void createIndicesCompaction(T *s_compaction_list_exc, //! 
@is_active_interval mark is thread has a second non-empty child interval /////////////////////////////////////////////////////////////////////////////// template -__device__ void compactIntervals(float *s_left, float *s_right, T *s_left_count, - T *s_right_count, float mid, float right, +__device__ void compactIntervals(float *s_left, + float *s_right, + T *s_left_count, + T *s_right_count, + float mid, + float right, unsigned int mid_count, - unsigned int right_count, T *s_compaction_list, + unsigned int right_count, + T *s_compaction_list, unsigned int num_threads_active, - unsigned int is_active_second) { - const unsigned int tid = threadIdx.x; + unsigned int is_active_second) +{ + const unsigned int tid = threadIdx.x; - // perform compaction / copy data for all threads where the second - // child is not dead - if ((tid < num_threads_active) && (1 == is_active_second)) { - unsigned int addr_w = num_threads_active + s_compaction_list[tid]; + // perform compaction / copy data for all threads where the second + // child is not dead + if ((tid < num_threads_active) && (1 == is_active_second)) { + unsigned int addr_w = num_threads_active + s_compaction_list[tid]; - s_left[addr_w] = mid; - s_right[addr_w] = right; - s_left_count[addr_w] = mid_count; - s_right_count[addr_w] = right_count; - } + s_left[addr_w] = mid; + s_right[addr_w] = right; + s_left_count[addr_w] = mid_count; + s_right_count[addr_w] = right_count; + } } /////////////////////////////////////////////////////////////////////////////// @@ -401,86 +443,102 @@ __device__ void compactIntervals(float *s_left, float *s_right, T *s_left_count, //! @param num_threads_active number of active threads / intervals /////////////////////////////////////////////////////////////////////////////// template -__device__ void storeIntervalConverged(float *s_left, float *s_right, - T *s_left_count, T *s_right_count, - float &left, float &mid, float &right, - S &left_count, S &mid_count, - S &right_count, T *s_compaction_list_exc, - unsigned int &compact_second_chunk, - const unsigned int num_threads_active) { - const unsigned int tid = threadIdx.x; - const unsigned int multiplicity = right_count - left_count; +__device__ void storeIntervalConverged(float *s_left, + float *s_right, + T *s_left_count, + T *s_right_count, + float &left, + float &mid, + float &right, + S &left_count, + S &mid_count, + S &right_count, + T *s_compaction_list_exc, + unsigned int &compact_second_chunk, + const unsigned int num_threads_active) +{ + const unsigned int tid = threadIdx.x; + const unsigned int multiplicity = right_count - left_count; - // check multiplicity of eigenvalue - if (1 == multiplicity) { - // just re-store intervals, simple eigenvalue - s_left[tid] = left; - s_right[tid] = right; - s_left_count[tid] = left_count; - s_right_count[tid] = right_count; + // check multiplicity of eigenvalue + if (1 == multiplicity) { + // just re-store intervals, simple eigenvalue + s_left[tid] = left; + s_right[tid] = right; + s_left_count[tid] = left_count; + s_right_count[tid] = right_count; - // mark that no second child / clear - s_right_count[tid + num_threads_active] = 0; - s_compaction_list_exc[tid] = 0; - } else { - // number of eigenvalues after the split less than mid - mid_count = left_count + (multiplicity >> 1); + // mark that no second child / clear + s_right_count[tid + num_threads_active] = 0; + s_compaction_list_exc[tid] = 0; + } + else { + // number of eigenvalues after the split less than mid + mid_count = left_count + (multiplicity >> 1); - // store 
left interval - s_left[tid] = left; - s_right[tid] = right; - s_left_count[tid] = left_count; - s_right_count[tid] = mid_count; + // store left interval + s_left[tid] = left; + s_right[tid] = right; + s_left_count[tid] = left_count; + s_right_count[tid] = mid_count; - mid = left; + mid = left; - // mark that second child interval exists - s_right_count[tid + num_threads_active] = right_count; - s_compaction_list_exc[tid] = 1; - compact_second_chunk = 1; - } + // mark that second child interval exists + s_right_count[tid + num_threads_active] = right_count; + s_compaction_list_exc[tid] = 1; + compact_second_chunk = 1; + } } template -__device__ void storeIntervalConverged(float *s_left, float *s_right, - T *s_left_count, T *s_right_count, - float &left, float &mid, float &right, - S &left_count, S &mid_count, - S &right_count, T *s_compaction_list_exc, - unsigned int &compact_second_chunk, +__device__ void storeIntervalConverged(float *s_left, + float *s_right, + T *s_left_count, + T *s_right_count, + float &left, + float &mid, + float &right, + S &left_count, + S &mid_count, + S &right_count, + T *s_compaction_list_exc, + unsigned int &compact_second_chunk, const unsigned int num_threads_active, - unsigned int &is_active_second) { - const unsigned int tid = threadIdx.x; - const unsigned int multiplicity = right_count - left_count; + unsigned int &is_active_second) +{ + const unsigned int tid = threadIdx.x; + const unsigned int multiplicity = right_count - left_count; - // check multiplicity of eigenvalue - if (1 == multiplicity) { - // just re-store intervals, simple eigenvalue - s_left[tid] = left; - s_right[tid] = right; - s_left_count[tid] = left_count; - s_right_count[tid] = right_count; + // check multiplicity of eigenvalue + if (1 == multiplicity) { + // just re-store intervals, simple eigenvalue + s_left[tid] = left; + s_right[tid] = right; + s_left_count[tid] = left_count; + s_right_count[tid] = right_count; - // mark that no second child / clear - is_active_second = 0; - s_compaction_list_exc[tid] = 0; - } else { - // number of eigenvalues after the split less than mid - mid_count = left_count + (multiplicity >> 1); + // mark that no second child / clear + is_active_second = 0; + s_compaction_list_exc[tid] = 0; + } + else { + // number of eigenvalues after the split less than mid + mid_count = left_count + (multiplicity >> 1); - // store left interval - s_left[tid] = left; - s_right[tid] = right; - s_left_count[tid] = left_count; - s_right_count[tid] = mid_count; + // store left interval + s_left[tid] = left; + s_right[tid] = right; + s_left_count[tid] = left_count; + s_right_count[tid] = mid_count; - mid = left; + mid = left; - // mark that second child interval exists - is_active_second = 1; - s_compaction_list_exc[tid] = 1; - compact_second_chunk = 1; - } + // mark that second child interval exists + is_active_second = 1; + s_compaction_list_exc[tid] = 1; + compact_second_chunk = 1; + } } /////////////////////////////////////////////////////////////////////////////// @@ -501,29 +559,38 @@ __device__ void storeIntervalConverged(float *s_left, float *s_right, //! 
converged /////////////////////////////////////////////////////////////////////////////// template -__device__ void subdivideActiveInterval( - const unsigned int tid, float *s_left, float *s_right, T *s_left_count, - T *s_right_count, const unsigned int num_threads_active, float &left, - float &right, unsigned int &left_count, unsigned int &right_count, - float &mid, unsigned int &all_threads_converged) { - // for all active threads - if (tid < num_threads_active) { - left = s_left[tid]; - right = s_right[tid]; - left_count = s_left_count[tid]; - right_count = s_right_count[tid]; +__device__ void subdivideActiveInterval(const unsigned int tid, + float *s_left, + float *s_right, + T *s_left_count, + T *s_right_count, + const unsigned int num_threads_active, + float &left, + float &right, + unsigned int &left_count, + unsigned int &right_count, + float &mid, + unsigned int &all_threads_converged) +{ + // for all active threads + if (tid < num_threads_active) { + left = s_left[tid]; + right = s_right[tid]; + left_count = s_left_count[tid]; + right_count = s_right_count[tid]; - // check if thread already converged - if (left != right) { - mid = computeMidpoint(left, right); - atomicExch(&all_threads_converged, 0); - } else if ((right_count - left_count) > 1) { - // mark as not converged if multiple eigenvalues enclosed - // duplicate interval in storeIntervalsConverged() - atomicExch(&all_threads_converged, 0); - } + // check if thread already converged + if (left != right) { + mid = computeMidpoint(left, right); + atomicExch(&all_threads_converged, 0); + } + else if ((right_count - left_count) > 1) { + // mark as not converged if multiple eigenvalues enclosed + // duplicate interval in storeIntervalsConverged() + atomicExch(&all_threads_converged, 0); + } - } // end for all active threads + } // end for all active threads } -#endif // #ifndef _BISECT_UTIL_H_ +#endif // #ifndef _BISECT_UTIL_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/config.h b/Samples/2_Concepts_and_Techniques/eigenvalues/config.h index 497f9e62..791e0907 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/config.h +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/config.h @@ -33,9 +33,9 @@ // should be power of two #define MAX_THREADS_BLOCK 256 -#define MAX_SMALL_MATRIX 512 +#define MAX_SMALL_MATRIX 512 #define MAX_THREADS_BLOCK_SMALL_MATRIX 512 #define MIN_ABS_INTERVAL 5.0e-37 -#endif // #ifndef _CONFIG_H_ +#endif // #ifndef _CONFIG_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.cpp b/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.cpp index 37c994e5..ff50ad78 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.cpp +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.cpp @@ -27,13 +27,15 @@ /* Computation of Gerschgorin interval for symmetric, tridiagonal matrix */ + +#include "gerschgorin.h" + +#include +#include #include #include -#include -#include #include "util.h" -#include "gerschgorin.h" //////////////////////////////////////////////////////////////////////////////// //! Compute Gerschgorin interval for symmetric, tridiagonal matrix @@ -43,40 +45,40 @@ //! @param lg lower limit of Gerschgorin interval //! 
@param ug upper limit of Gerschgorin interval //////////////////////////////////////////////////////////////////////////////// -void computeGerschgorin(float *d, float *s, unsigned int n, float &lg, - float &ug) { - lg = FLT_MAX; - ug = -FLT_MAX; +void computeGerschgorin(float *d, float *s, unsigned int n, float &lg, float &ug) +{ + lg = FLT_MAX; + ug = -FLT_MAX; - // compute bounds - for (unsigned int i = 1; i < (n - 1); ++i) { - // sum over the absolute values of all elements of row i - float sum_abs_ni = fabsf(s[i - 1]) + fabsf(s[i]); + // compute bounds + for (unsigned int i = 1; i < (n - 1); ++i) { + // sum over the absolute values of all elements of row i + float sum_abs_ni = fabsf(s[i - 1]) + fabsf(s[i]); - lg = min(lg, d[i] - sum_abs_ni); - ug = max(ug, d[i] + sum_abs_ni); - } + lg = min(lg, d[i] - sum_abs_ni); + ug = max(ug, d[i] + sum_abs_ni); + } - // first and last row, only one superdiagonal element + // first and last row, only one superdiagonal element - // first row - lg = min(lg, d[0] - fabsf(s[0])); - ug = max(ug, d[0] + fabsf(s[0])); + // first row + lg = min(lg, d[0] - fabsf(s[0])); + ug = max(ug, d[0] + fabsf(s[0])); - // last row - lg = min(lg, d[n - 1] - fabsf(s[n - 2])); - ug = max(ug, d[n - 1] + fabsf(s[n - 2])); + // last row + lg = min(lg, d[n - 1] - fabsf(s[n - 2])); + ug = max(ug, d[n - 1] + fabsf(s[n - 2])); - // increase interval to avoid side effects of fp arithmetic - float bnorm = max(fabsf(ug), fabsf(lg)); + // increase interval to avoid side effects of fp arithmetic + float bnorm = max(fabsf(ug), fabsf(lg)); - // these values depend on the implementation of floating count that is - // employed in the following - float psi_0 = 11 * FLT_EPSILON * bnorm; - float psi_n = 11 * FLT_EPSILON * bnorm; + // these values depend on the implementation of floating count that is + // employed in the following + float psi_0 = 11 * FLT_EPSILON * bnorm; + float psi_n = 11 * FLT_EPSILON * bnorm; - lg = lg - bnorm * 2 * n * FLT_EPSILON - psi_0; - ug = ug + bnorm * 2 * n * FLT_EPSILON + psi_n; + lg = lg - bnorm * 2 * n * FLT_EPSILON - psi_0; + ug = ug + bnorm * 2 * n * FLT_EPSILON + psi_n; - ug = max(lg, ug); + ug = max(lg, ug); } diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.h b/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.h index 4d3de6ad..39dee5bd 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.h +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/gerschgorin.h @@ -38,7 +38,6 @@ //! @param lg lower limit of Gerschgorin interval //! 
@param ug upper limit of Gerschgorin interval //////////////////////////////////////////////////////////////////////////////// -extern "C" void computeGerschgorin(float *d, float *s, unsigned int n, - float &lg, float &ug); +extern "C" void computeGerschgorin(float *d, float *s, unsigned int n, float &lg, float &ug); -#endif // #ifndef _GERSCHGORIN_H_ +#endif // #ifndef _GERSCHGORIN_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/main.cu b/Samples/2_Concepts_and_Techniques/eigenvalues/main.cu index 5d0b33e5..4cfdd8db 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/main.cu +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/main.cu @@ -30,24 +30,24 @@ */ // includes, system -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include // includes, project -#include #include -#include "config.h" -#include "structs.h" -#include "matlab.h" -#include "util.h" -#include "gerschgorin.h" +#include -#include "bisect_small.cuh" #include "bisect_large.cuh" +#include "bisect_small.cuh" +#include "config.h" +#include "gerschgorin.h" +#include "matlab.h" +#include "structs.h" +#include "util.h" //////////////////////////////////////////////////////////////////////////////// // declaration, forward @@ -56,15 +56,16 @@ bool runTest(int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - bool bQAResults = false; +int main(int argc, char **argv) +{ + bool bQAResults = false; - printf("Starting eigenvalues\n"); + printf("Starting eigenvalues\n"); - bQAResults = runTest(argc, argv); - printf("Test %s\n", bQAResults ? "Succeeded!" : "Failed!"); + bQAResults = runTest(argc, argv); + printf("Test %s\n", bQAResults ? "Succeeded!" : "Failed!"); - exit(bQAResults ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bQAResults ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// @@ -75,69 +76,67 @@ int main(int argc, char **argv) { //! @param user_defined 1 if the matrix size has been requested by the user, //! 
0 if the default size //////////////////////////////////////////////////////////////////////////////// -void initInputData(InputData &input, char *exec_path, - const unsigned int mat_size, - const unsigned int user_defined) { - // allocate memory - input.a = (float *)malloc(sizeof(float) * mat_size); - input.b = (float *)malloc(sizeof(float) * mat_size); +void initInputData(InputData &input, char *exec_path, const unsigned int mat_size, const unsigned int user_defined) +{ + // allocate memory + input.a = (float *)malloc(sizeof(float) * mat_size); + input.b = (float *)malloc(sizeof(float) * mat_size); - if (1 == user_defined) { - // initialize diagonal and superdiagonal entries with random values - srand(278217421); + if (1 == user_defined) { + // initialize diagonal and superdiagonal entries with random values + srand(278217421); - // srand( clock()); - for (unsigned int i = 0; i < mat_size; ++i) { - input.a[i] = (float)(2.0 * (((double)rand() / (double)RAND_MAX) - 0.5)); - input.b[i] = (float)(2.0 * (((double)rand() / (double)RAND_MAX) - 0.5)); + // srand( clock()); + for (unsigned int i = 0; i < mat_size; ++i) { + input.a[i] = (float)(2.0 * (((double)rand() / (double)RAND_MAX) - 0.5)); + input.b[i] = (float)(2.0 * (((double)rand() / (double)RAND_MAX) - 0.5)); + } + + // the first element of s is used as padding on the device (thus the + // whole vector is copied to the device but the kernels are launched + // with (s+1) as start address + input.b[0] = 0.0f; + } + else { + // read default matrix + unsigned int input_data_size = mat_size; + char *diag_path = sdkFindFilePath("diagonal.dat", exec_path); + assert(NULL != diag_path); + sdkReadFile(diag_path, &(input.a), &input_data_size, false); + + char *sdiag_path = sdkFindFilePath("superdiagonal.dat", exec_path); + assert(NULL != sdiag_path); + sdkReadFile(sdiag_path, &(input.b), &input_data_size, false); + + free(diag_path); + free(sdiag_path); } - // the first element of s is used as padding on the device (thus the - // whole vector is copied to the device but the kernels are launched - // with (s+1) as start address - input.b[0] = 0.0f; - } else { - // read default matrix - unsigned int input_data_size = mat_size; - char *diag_path = sdkFindFilePath("diagonal.dat", exec_path); - assert(NULL != diag_path); - sdkReadFile(diag_path, &(input.a), &input_data_size, false); + // allocate device memory for input + checkCudaErrors(cudaMalloc((void **)&(input.g_a), sizeof(float) * mat_size)); + checkCudaErrors(cudaMalloc((void **)&(input.g_b_raw), sizeof(float) * mat_size)); - char *sdiag_path = sdkFindFilePath("superdiagonal.dat", exec_path); - assert(NULL != sdiag_path); - sdkReadFile(sdiag_path, &(input.b), &input_data_size, false); + // copy data to device + checkCudaErrors(cudaMemcpy(input.g_a, input.a, sizeof(float) * mat_size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(input.g_b_raw, input.b, sizeof(float) * mat_size, cudaMemcpyHostToDevice)); - free(diag_path); - free(sdiag_path); - } - - // allocate device memory for input - checkCudaErrors(cudaMalloc((void **)&(input.g_a), sizeof(float) * mat_size)); - checkCudaErrors( - cudaMalloc((void **)&(input.g_b_raw), sizeof(float) * mat_size)); - - // copy data to device - checkCudaErrors(cudaMemcpy(input.g_a, input.a, sizeof(float) * mat_size, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(input.g_b_raw, input.b, sizeof(float) * mat_size, - cudaMemcpyHostToDevice)); - - input.g_b = input.g_b_raw + 1; + input.g_b = input.g_b_raw + 1; } 
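// ----------------------------------------------------------------------------
// Illustration (a minimal sketch, not a hunk of this patch): the padding
// convention that initInputData() above relies on. input.b[0] is zeroed on
// the host and copied to the device along with the rest of the buffer, so
// kernels that receive g_b = g_b_raw + 1 can read g_b[k - 1] at k == 0 and
// hit the zero pad instead of unallocated memory. The helper name and its
// parameters below are invented for the example.
// ----------------------------------------------------------------------------

#include <cuda_runtime.h>

static void uploadPaddedSuperdiagonal(const float *b_host,   // b_host[0] must already be 0.0f
                                      unsigned int mat_size, // pad element plus mat_size - 1 real entries
                                      float      **g_b_raw,  // full device buffer, kept for cudaFree()
                                      float      **g_b)      // pointer handed to the kernels
{
    cudaMalloc((void **)g_b_raw, sizeof(float) * mat_size);
    cudaMemcpy(*g_b_raw, b_host, sizeof(float) * mat_size, cudaMemcpyHostToDevice);

    // one element past the pad, so an access at logical index -1 stays in bounds
    *g_b = *g_b_raw + 1;
}

// ----------------------------------------------------------------------------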
////////////////////////////////////////////////////////////////////////////////
//! Clean up input data, in particular allocated memory
//! @param input handles to the input data
////////////////////////////////////////////////////////////////////////////////
-void cleanupInputData(InputData &input) {
-  freePtr(input.a);
-  freePtr(input.b);
+void cleanupInputData(InputData &input)
+{
+    freePtr(input.a);
+    freePtr(input.b);

-  checkCudaErrors(cudaFree(input.g_a));
-  input.g_a = NULL;
-  checkCudaErrors(cudaFree(input.g_b_raw));
-  input.g_b_raw = NULL;
-  input.g_b = NULL;
+    checkCudaErrors(cudaFree(input.g_a));
+    input.g_a = NULL;
+    checkCudaErrors(cudaFree(input.g_b_raw));
+    input.g_b_raw = NULL;
+    input.g_b = NULL;
}

////////////////////////////////////////////////////////////////////////////////
@@ -147,26 +146,26 @@ void cleanupInputData(InputData &input) {
//! @param matrix_size size of matrix, updated if specific size specified on
//! command line
////////////////////////////////////////////////////////////////////////////////
-void getMatrixSize(int argc, char **argv, unsigned int &mat_size,
-                   unsigned int &user_defined) {
-  int temp = -1;
+void getMatrixSize(int argc, char **argv, unsigned int &mat_size, unsigned int &user_defined)
+{
+    int temp = -1;

-  if (checkCmdLineFlag(argc, (const char **)argv, "matrix-size")) {
-    temp = getCmdLineArgumentInt(argc, (const char **)argv, "matrix-size");
-  }
+    if (checkCmdLineFlag(argc, (const char **)argv, "matrix-size")) {
+        temp = getCmdLineArgumentInt(argc, (const char **)argv, "matrix-size");
+    }

-  if (temp > 0) {
-    mat_size = (unsigned int)temp;
-    // data type short is used in the kernel
-    assert(mat_size < (1 << 16));
+    if (temp > 0) {
+        mat_size = (unsigned int)temp;
+        // data type short is used in the kernel
+        assert(mat_size < (1 << 16));

-    // mat_size should be large than 2
-    assert(mat_size >= 2);
+        // mat_size must be at least 2
+        assert(mat_size >= 2);

-    user_defined = 1;
-  }
+        user_defined = 1;
+    }

-  printf("Matrix size: %i x %i\n", mat_size, mat_size);
+    printf("Matrix size: %i x %i\n", mat_size, mat_size);
}

////////////////////////////////////////////////////////////////////////////////
@@ -178,21 +177,21 @@ void getMatrixSize(int argc, char **argv, unsigned int &mat_size,
//! @param user_defined 1 if the precision has been requested by the user,
//! 0 if the default size
////////////////////////////////////////////////////////////////////////////////
-void getPrecision(int argc, char **argv, float &precision,
-                  unsigned int &user_defined) {
-  float temp = -1.0f;
+void getPrecision(int argc, char **argv, float &precision, unsigned int &user_defined)
+{
+    float temp = -1.0f;

-  if (checkCmdLineFlag(argc, (const char **)argv, "precision")) {
-    temp = getCmdLineArgumentFloat(argc, (const char **)argv, "precision");
-    printf("Precision is between [0.001, 0.000001]\n");
-  }
+    if (checkCmdLineFlag(argc, (const char **)argv, "precision")) {
+        temp = getCmdLineArgumentFloat(argc, (const char **)argv, "precision");
+        printf("Precision is between [0.001, 0.000001]\n");
+    }

-  if (temp > 1e-6 && temp <= 0.001) {
-    precision = temp;
-    user_defined = 1;
-  }
+    if (temp > 1e-6 && temp <= 0.001) {
+        precision = temp;
+        user_defined = 1;
+    }

-  printf("Precision: %f\n", precision);
+    printf("Precision: %f\n", precision);
}

////////////////////////////////////////////////////////////////////////////////
@@ -202,18 +201,19 @@ void getPrecision(int argc, char **argv, float &precision,
//! 
@param iters_timing number of timing iterations, updated if user //! specific value //////////////////////////////////////////////////////////////////////////////// -void getItersTiming(int argc, char **argv, unsigned int &iters_timing) { - int temp = -1; +void getItersTiming(int argc, char **argv, unsigned int &iters_timing) +{ + int temp = -1; - if (checkCmdLineFlag(argc, (const char **)argv, "iters-timing")) { - temp = getCmdLineArgumentInt(argc, (const char **)argv, "iters-timing"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "iters-timing")) { + temp = getCmdLineArgumentInt(argc, (const char **)argv, "iters-timing"); + } - if (temp > 0) { - iters_timing = temp; - } + if (temp > 0) { + iters_timing = temp; + } - printf("Iterations to be timed: %i\n", iters_timing); + printf("Iterations to be timed: %i\n", iters_timing); } //////////////////////////////////////////////////////////////////////////////// @@ -224,102 +224,102 @@ void getItersTiming(int argc, char **argv, unsigned int &iters_timing) { //! @param filename filename of result file, updated if user specified //! filename //////////////////////////////////////////////////////////////////////////////// -void getResultFilename(int argc, char **argv, char *&filename) { - char *temp = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "filename-result", &temp); +void getResultFilename(int argc, char **argv, char *&filename) +{ + char *temp = NULL; + getCmdLineArgumentString(argc, (const char **)argv, "filename-result", &temp); - if (NULL != temp) { - filename = (char *)malloc(sizeof(char) * strlen(temp)); - strcpy(filename, temp); + if (NULL != temp) { + filename = (char *)malloc(sizeof(char) * strlen(temp)); + strcpy(filename, temp); - free(temp); - } + free(temp); + } - printf("Result filename: '%s'\n", filename); + printf("Result filename: '%s'\n", filename); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -bool runTest(int argc, char **argv) { - bool bCompareResult = false; +bool runTest(int argc, char **argv) +{ + bool bCompareResult = false; - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - StopWatchInterface *timer = NULL; - StopWatchInterface *timer_total = NULL; - sdkCreateTimer(&timer); - sdkCreateTimer(&timer_total); + StopWatchInterface *timer = NULL; + StopWatchInterface *timer_total = NULL; + sdkCreateTimer(&timer); + sdkCreateTimer(&timer_total); - // default - unsigned int mat_size = 2048; - // flag if the matrix size is due to explicit user request - unsigned int user_defined = 0; - // desired precision of eigenvalues - float precision = 0.00001f; - unsigned int iters_timing = 100; - char *result_file = (char *)"eigenvalues.dat"; + // default + unsigned int mat_size = 2048; + // flag if the matrix size is due to explicit user request + unsigned int user_defined = 0; + // desired precision of eigenvalues + float precision = 0.00001f; + unsigned int iters_timing = 100; + char *result_file = (char *)"eigenvalues.dat"; - // check if there is a command line request for the matrix size - getMatrixSize(argc, argv, mat_size, user_defined); + // check if there is a command line request for the matrix size + getMatrixSize(argc, argv, mat_size, user_defined); - // check if user requested specific precision - getPrecision(argc, argv, precision, user_defined); + // check if user requested specific precision + getPrecision(argc, argv, precision, user_defined); - // check if user requested specific number of iterations for timing - getItersTiming(argc, argv, iters_timing); + // check if user requested specific number of iterations for timing + getItersTiming(argc, argv, iters_timing); - // file name for result file - getResultFilename(argc, argv, result_file); + // file name for result file + getResultFilename(argc, argv, result_file); - // set up input - InputData input; - initInputData(input, argv[0], mat_size, user_defined); + // set up input + InputData input; + initInputData(input, argv[0], mat_size, user_defined); - // compute Gerschgorin interval - float lg = FLT_MAX; - float ug = -FLT_MAX; - computeGerschgorin(input.a, input.b + 1, mat_size, lg, ug); - printf("Gerschgorin interval: %f / %f\n", lg, ug); + // compute Gerschgorin interval + float lg = FLT_MAX; + float ug = -FLT_MAX; + computeGerschgorin(input.a, input.b + 1, mat_size, lg, ug); + printf("Gerschgorin interval: %f / %f\n", lg, ug); - // two kernels, for small matrices a lot of overhead can be avoided - if (mat_size <= MAX_SMALL_MATRIX) { - // initialize memory for result - ResultDataSmall result; - initResultSmallMatrix(result, mat_size); + // two kernels, for small matrices a lot of overhead can be avoided + if (mat_size <= MAX_SMALL_MATRIX) { + // initialize memory for result + ResultDataSmall result; + initResultSmallMatrix(result, mat_size); - // run the kernel - computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision, - iters_timing); + // run the kernel + computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision, iters_timing); - // get the result from the device and do some sanity checks, - // save the result - processResultSmallMatrix(input, result, mat_size, result_file); + // get the result from the device and do some sanity checks, + // save the result + processResultSmallMatrix(input, result, mat_size, result_file); - // clean up - 
cleanupResultSmallMatrix(result); + // clean up + cleanupResultSmallMatrix(result); - printf("User requests non-default argument(s), skipping self-check!\n"); - bCompareResult = true; - } else { - // initialize memory for result - ResultDataLarge result; - initResultDataLargeMatrix(result, mat_size); + printf("User requests non-default argument(s), skipping self-check!\n"); + bCompareResult = true; + } + else { + // initialize memory for result + ResultDataLarge result; + initResultDataLargeMatrix(result, mat_size); - // run the kernel - computeEigenvaluesLargeMatrix(input, result, mat_size, precision, lg, ug, - iters_timing); + // run the kernel + computeEigenvaluesLargeMatrix(input, result, mat_size, precision, lg, ug, iters_timing); - // get the result from the device and do some sanity checks - // save the result if user specified matrix size - bCompareResult = processResultDataLargeMatrix( - input, result, mat_size, result_file, user_defined, argv[0]); + // get the result from the device and do some sanity checks + // save the result if user specified matrix size + bCompareResult = processResultDataLargeMatrix(input, result, mat_size, result_file, user_defined, argv[0]); - // cleanup - cleanupResultDataLargeMatrix(result); - } + // cleanup + cleanupResultDataLargeMatrix(result); + } - cleanupInputData(input); + cleanupInputData(input); - return bCompareResult; + return bCompareResult; } diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.cpp b/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.cpp index 3c8076f1..485e33ff 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.cpp +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.cpp @@ -26,20 +26,21 @@ */ //! includes, system -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include #include +#include // includes, projcet #include "matlab.h" // namespace, unnamed -namespace {} // end namespace, unnamed +namespace { +} // namespace /////////////////////////////////////////////////////////////////////////////// //! Write a tridiagonal, symmetric matrix in vector representation and @@ -52,18 +53,18 @@ namespace {} // end namespace, unnamed //! if these are sorted in ascending order //! @param n size of the matrix /////////////////////////////////////////////////////////////////////////////// -void writeTridiagSymMatlab(const char *filename, float *d, float *s, - float *eigenvals, const unsigned int n) { - std::ofstream file(filename, std::ios::out); +void writeTridiagSymMatlab(const char *filename, float *d, float *s, float *eigenvals, const unsigned int n) +{ + std::ofstream file(filename, std::ios::out); - // write diagonal entries - writeVectorMatlab(file, "d", d, n); + // write diagonal entries + writeVectorMatlab(file, "d", d, n); - // write superdiagonal entries - writeVectorMatlab(file, "s", s, n - 1); + // write superdiagonal entries + writeVectorMatlab(file, "s", s, n - 1); - // write eigenvalues - writeVectorMatlab(file, "eigvals", eigenvals, n); + // write eigenvalues + writeVectorMatlab(file, "eigvals", eigenvals, n); - file.close(); + file.close(); } diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.h b/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.h index d82adcf3..e4b60358 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.h +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/matlab.h @@ -26,17 +26,17 @@ */ /* Header for utility functionality. -* Host code. -*/ + * Host code. 
+ */ #ifndef _MATLAB_H_ #define _MATLAB_H_ // includes, system -#include -#include -#include #include +#include +#include +#include // includes, project @@ -51,8 +51,7 @@ //! if these are sorted in ascending order //! @param n size of the matrix //////////////////////////////////////////////////////////////////////////////// -extern "C" void writeTridiagSymMatlab(const char *filename, float *d, float *s, - float *eigenvals, const unsigned int n); +extern "C" void writeTridiagSymMatlab(const char *filename, float *d, float *s, float *eigenvals, const unsigned int n); //////////////////////////////////////////////////////////////////////////////// //! Write matrix to a file in Matlab format @@ -61,9 +60,7 @@ extern "C" void writeTridiagSymMatlab(const char *filename, float *d, float *s, //! @param mat matrix to write to the file //! @param mat_size size of the (square) matrix \a mat //////////////////////////////////////////////////////////////////////////////// -template -void writeMatrixMatlab(T &file, const char *mat_name, S *&mat, - const unsigned int mat_size); +template void writeMatrixMatlab(T &file, const char *mat_name, S *&mat, const unsigned int mat_size); //////////////////////////////////////////////////////////////////////////////// //! Write vector to a file in Matlab format @@ -72,9 +69,7 @@ void writeMatrixMatlab(T &file, const char *mat_name, S *&mat, //! @param vec matrix to write to the file //! @param vec_len length of the vector //////////////////////////////////////////////////////////////////////////////// -template -void writeVectorMatlab(T &file, const char *vec_name, S *&vec, - const unsigned int vec_len); +template void writeVectorMatlab(T &file, const char *vec_name, S *&vec, const unsigned int vec_len); // implementations @@ -85,24 +80,23 @@ void writeVectorMatlab(T &file, const char *vec_name, S *&vec, //! @param mat matrix to write to the file //! @param mat_size size of the (square) matrix \a mat //////////////////////////////////////////////////////////////////////////////// -template -void writeMatrixMatlab(T &file, const char *mat_name, S *&mat, - const unsigned int mat_size) { - const unsigned int pitch = sizeof(S) * mat_size; +template void writeMatrixMatlab(T &file, const char *mat_name, S *&mat, const unsigned int mat_size) +{ + const unsigned int pitch = sizeof(S) * mat_size; - file << mat_name << " = ["; + file << mat_name << " = ["; - for (unsigned int i = 0; i < mat_size; ++i) { - for (unsigned int j = 0; j < mat_size; ++j) { - file << getMatrix(mat, pitch, i, j) << " "; + for (unsigned int i = 0; i < mat_size; ++i) { + for (unsigned int j = 0; j < mat_size; ++j) { + file << getMatrix(mat, pitch, i, j) << " "; + } + + if (i != mat_size - 1) { + file << "; "; + } } - if (i != mat_size - 1) { - file << "; "; - } - } - - file << "];\n"; + file << "];\n"; } //////////////////////////////////////////////////////////////////////////////// @@ -112,16 +106,15 @@ void writeMatrixMatlab(T &file, const char *mat_name, S *&mat, //! @param vec matrix to write to the file //! 
@param vec_len length of the vector //////////////////////////////////////////////////////////////////////////////// -template -void writeVectorMatlab(T &file, const char *vec_name, S *&vec, - const unsigned int vec_len) { - file << vec_name << " = ["; +template void writeVectorMatlab(T &file, const char *vec_name, S *&vec, const unsigned int vec_len) +{ + file << vec_name << " = ["; - for (unsigned int i = 0; i < vec_len; ++i) { - file << vec[i] << " "; - } + for (unsigned int i = 0; i < vec_len; ++i) { + file << vec[i] << " "; + } - file << "];\n"; + file << "];\n"; } -#endif // _MATLAB_H_ +#endif // _MATLAB_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/structs.h b/Samples/2_Concepts_and_Techniques/eigenvalues/structs.h index 95cff54b..63eb944f 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/structs.h +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/structs.h @@ -30,100 +30,103 @@ #ifndef _STRUCTS_H_ #define _STRUCTS_H_ -struct InputData { - //! host side representation of diagonal - float *a; - //! host side representation superdiagonal - float *b; +struct InputData +{ + //! host side representation of diagonal + float *a; + //! host side representation superdiagonal + float *b; - //! device side representation of diagonal - float *g_a; - //! device side representation of superdiagonal - float *g_b; - //! helper variable pointing to the mem allocated for g_b which provides - //! space for one additional element of padding at the beginning - float *g_b_raw; + //! device side representation of diagonal + float *g_a; + //! device side representation of superdiagonal + float *g_b; + //! helper variable pointing to the mem allocated for g_b which provides + //! space for one additional element of padding at the beginning + float *g_b_raw; }; -struct ResultDataSmall { - //! eigenvalues (host side) - float *eigenvalues; +struct ResultDataSmall +{ + //! eigenvalues (host side) + float *eigenvalues; - // left interval limits at the end of the computation - float *g_left; + // left interval limits at the end of the computation + float *g_left; - // right interval limits at the end of the computation - float *g_right; + // right interval limits at the end of the computation + float *g_right; - // number of eigenvalues smaller than the left interval limit - unsigned int *g_left_count; + // number of eigenvalues smaller than the left interval limit + unsigned int *g_left_count; - // number of eigenvalues bigger than the right interval limit - unsigned int *g_right_count; + // number of eigenvalues bigger than the right interval limit + unsigned int *g_right_count; - //! flag if algorithm converged - unsigned int *g_converged; + //! flag if algorithm converged + unsigned int *g_converged; - // helper variables + // helper variables - unsigned int mat_size_f; - unsigned int mat_size_ui; + unsigned int mat_size_f; + unsigned int mat_size_ui; - float *zero_f; - unsigned int *zero_ui; + float *zero_f; + unsigned int *zero_ui; }; -struct ResultDataLarge { - // number of intervals containing one eigenvalue after the first step - unsigned int *g_num_one; +struct ResultDataLarge +{ + // number of intervals containing one eigenvalue after the first step + unsigned int *g_num_one; - // number of (thread) blocks of intervals containing multiple eigenvalues - // after the first step - unsigned int *g_num_blocks_mult; + // number of (thread) blocks of intervals containing multiple eigenvalues + // after the first step + unsigned int *g_num_blocks_mult; - //! 
left interval limits of intervals containing one eigenvalue after the - //! first iteration step - float *g_left_one; + //! left interval limits of intervals containing one eigenvalue after the + //! first iteration step + float *g_left_one; - //! right interval limits of intervals containing one eigenvalue after the - //! first iteration step - float *g_right_one; + //! right interval limits of intervals containing one eigenvalue after the + //! first iteration step + float *g_right_one; - //! interval indices (position in sorted listed of eigenvalues) - //! of intervals containing one eigenvalue after the first iteration step - unsigned int *g_pos_one; + //! interval indices (position in sorted listed of eigenvalues) + //! of intervals containing one eigenvalue after the first iteration step + unsigned int *g_pos_one; - //! left interval limits of intervals containing multiple eigenvalues - //! after the first iteration step - float *g_left_mult; + //! left interval limits of intervals containing multiple eigenvalues + //! after the first iteration step + float *g_left_mult; - //! right interval limits of intervals containing multiple eigenvalues - //! after the first iteration step - float *g_right_mult; + //! right interval limits of intervals containing multiple eigenvalues + //! after the first iteration step + float *g_right_mult; - //! number of eigenvalues less than the left limit of the eigenvalue - //! intervals containing multiple eigenvalues - unsigned int *g_left_count_mult; + //! number of eigenvalues less than the left limit of the eigenvalue + //! intervals containing multiple eigenvalues + unsigned int *g_left_count_mult; - //! number of eigenvalues less than the right limit of the eigenvalue - //! intervals containing multiple eigenvalues - unsigned int *g_right_count_mult; + //! number of eigenvalues less than the right limit of the eigenvalue + //! intervals containing multiple eigenvalues + unsigned int *g_right_count_mult; - //! start addresses in g_left_mult etc. of blocks of intervals containing - //! more than one eigenvalue after the first step - unsigned int *g_blocks_mult; + //! start addresses in g_left_mult etc. of blocks of intervals containing + //! more than one eigenvalue after the first step + unsigned int *g_blocks_mult; - //! accumulated number of intervals in g_left_mult etc. of blocks of - //! intervals containing more than one eigenvalue after the first step - unsigned int *g_blocks_mult_sum; + //! accumulated number of intervals in g_left_mult etc. of blocks of + //! intervals containing more than one eigenvalue after the first step + unsigned int *g_blocks_mult_sum; - //! eigenvalues that have been generated in the second step from intervals - //! that still contained multiple eigenvalues after the first step - float *g_lambda_mult; + //! eigenvalues that have been generated in the second step from intervals + //! that still contained multiple eigenvalues after the first step + float *g_lambda_mult; - //! eigenvalue index of intervals that have been generated in the second - //! processing step - unsigned int *g_pos_mult; + //! eigenvalue index of intervals that have been generated in the second + //! 
processing step + unsigned int *g_pos_mult; }; -#endif // #ifndef _STRUCTS_H_ +#endif // #ifndef _STRUCTS_H_ diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/util.h b/Samples/2_Concepts_and_Techniques/eigenvalues/util.h index 2d43b770..db03ce33 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/util.h +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/util.h @@ -33,12 +33,12 @@ //////////////////////////////////////////////////////////////////////////////// //! Safely free() for pointer //////////////////////////////////////////////////////////////////////////////// -template -inline void freePtr(T *&ptr) { - if (NULL != ptr) { - free(ptr); - ptr = NULL; - } +template inline void freePtr(T *&ptr) +{ + if (NULL != ptr) { + free(ptr); + ptr = NULL; + } } //////////////////////////////////////////////////////////////////////////////// @@ -50,9 +50,10 @@ __host__ __device__ #endif T - min(const T &lhs, const T &rhs) { + min(const T &lhs, const T &rhs) +{ - return (lhs < rhs) ? lhs : rhs; + return (lhs < rhs) ? lhs : rhs; } //////////////////////////////////////////////////////////////////////////////// @@ -64,9 +65,10 @@ __host__ __device__ #endif T - max(const T &lhs, const T &rhs) { + max(const T &lhs, const T &rhs) +{ - return (lhs < rhs) ? rhs : lhs; + return (lhs < rhs) ? rhs : lhs; } //////////////////////////////////////////////////////////////////////////////// @@ -78,8 +80,9 @@ __host__ __device__ #endif T - sign_i(const T &val) { - return (val < 0) ? -1 : 1; + sign_i(const T &val) +{ + return (val < 0) ? -1 : 1; } //////////////////////////////////////////////////////////////////////////////// @@ -89,8 +92,9 @@ __host__ __host__ __device__ #endif inline float - sign_f(const float &val) { - return (val < 0.0f) ? -1.0f : 1.0f; + sign_f(const float &val) +{ + return (val < 0.0f) ? -1.0f : 1.0f; } //////////////////////////////////////////////////////////////////////////////// @@ -100,8 +104,9 @@ __host__ __device__ __host__ __device__ #endif inline double - sign_d(const double &val) { - return (val < 0.0) ? -1.0 : 1.0; + sign_d(const double &val) +{ + return (val < 0.0) ? -1.0 : 1.0; } //////////////////////////////////////////////////////////////////////////////// @@ -112,22 +117,22 @@ template __host__ __device__ #endif void - swap(T &lhs, T &rhs) { + swap(T &lhs, T &rhs) +{ - T temp = rhs; - rhs = lhs; - lhs = temp; + T temp = rhs; + rhs = lhs; + lhs = temp; } /////////////////////////////////////////////////////////////////////////////// //! Get the number of blocks that are required to process \a num_threads with //! \a num_threads_blocks threads per block /////////////////////////////////////////////////////////////////////////////// -extern "C" inline unsigned int getNumBlocksLinear( - const unsigned int num_threads, const unsigned int num_threads_block) { - const unsigned int block_rem = - ((num_threads % num_threads_block) != 0) ? 1 : 0; - return (num_threads / num_threads_block) + block_rem; +extern "C" inline unsigned int getNumBlocksLinear(const unsigned int num_threads, const unsigned int num_threads_block) +{ + const unsigned int block_rem = ((num_threads % num_threads_block) != 0) ? 
1 : 0; + return (num_threads / num_threads_block) + block_rem; } -#endif // #ifndef _UTIL_H_ +#endif // #ifndef _UTIL_H_ diff --git a/Samples/2_Concepts_and_Techniques/histogram/histogram256.cu b/Samples/2_Concepts_and_Techniques/histogram/histogram256.cu index 9cc1fed4..570170c1 100644 --- a/Samples/2_Concepts_and_Techniques/histogram/histogram256.cu +++ b/Samples/2_Concepts_and_Techniques/histogram/histogram256.cu @@ -26,13 +26,14 @@ */ #include +#include #include #include #include -#include namespace cg = cooperative_groups; #include + #include "histogram_common.h" //////////////////////////////////////////////////////////////////////////////// @@ -40,59 +41,53 @@ namespace cg = cooperative_groups; //////////////////////////////////////////////////////////////////////////////// #define TAG_MASK 0xFFFFFFFFU -inline __device__ void addByte(uint *s_WarpHist, uint data, uint threadTag) { - atomicAdd(s_WarpHist + data, 1); +inline __device__ void addByte(uint *s_WarpHist, uint data, uint threadTag) { atomicAdd(s_WarpHist + data, 1); } + +inline __device__ void addWord(uint *s_WarpHist, uint data, uint tag) +{ + addByte(s_WarpHist, (data >> 0) & 0xFFU, tag); + addByte(s_WarpHist, (data >> 8) & 0xFFU, tag); + addByte(s_WarpHist, (data >> 16) & 0xFFU, tag); + addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); } -inline __device__ void addWord(uint *s_WarpHist, uint data, uint tag) { - addByte(s_WarpHist, (data >> 0) & 0xFFU, tag); - addByte(s_WarpHist, (data >> 8) & 0xFFU, tag); - addByte(s_WarpHist, (data >> 16) & 0xFFU, tag); - addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); -} - -__global__ void histogram256Kernel(uint *d_PartialHistograms, uint *d_Data, - uint dataCount) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Per-warp subhistogram storage - __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY]; - uint *s_WarpHist = - s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT; +__global__ void histogram256Kernel(uint *d_PartialHistograms, uint *d_Data, uint dataCount) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Per-warp subhistogram storage + __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY]; + uint *s_WarpHist = s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT; // Clear shared memory storage for current threadblock before processing #pragma unroll - for (uint i = 0; - i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); - i++) { - s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0; - } - - // Cycle through the entire data set, update subhistograms for each warp - const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE); - - cg::sync(cta); - - for (uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; - pos += UMUL(blockDim.x, gridDim.x)) { - uint data = d_Data[pos]; - addWord(s_WarpHist, data, tag); - } - - // Merge per-warp histograms into per-block and write to global memory - cg::sync(cta); - - for (uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; - bin += HISTOGRAM256_THREADBLOCK_SIZE) { - uint sum = 0; - - for (uint i = 0; i < WARP_COUNT; i++) { - sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK; + for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++) { + s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0; } - d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum; - } + // Cycle through the entire data set, update subhistograms for 
each warp + const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE); + + cg::sync(cta); + + for (uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; pos += UMUL(blockDim.x, gridDim.x)) { + uint data = d_Data[pos]; + addWord(s_WarpHist, data, tag); + } + + // Merge per-warp histograms into per-block and write to global memory + cg::sync(cta); + + for (uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE) { + uint sum = 0; + + for (uint i = 0; i < WARP_COUNT; i++) { + sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK; + } + + d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum; + } } //////////////////////////////////////////////////////////////////////////////// @@ -103,32 +98,31 @@ __global__ void histogram256Kernel(uint *d_PartialHistograms, uint *d_Data, //////////////////////////////////////////////////////////////////////////////// #define MERGE_THREADBLOCK_SIZE 256 -__global__ void mergeHistogram256Kernel(uint *d_Histogram, - uint *d_PartialHistograms, - uint histogramCount) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void mergeHistogram256Kernel(uint *d_Histogram, uint *d_PartialHistograms, uint histogramCount) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - uint sum = 0; + uint sum = 0; - for (uint i = threadIdx.x; i < histogramCount; i += MERGE_THREADBLOCK_SIZE) { - sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT]; - } - - __shared__ uint data[MERGE_THREADBLOCK_SIZE]; - data[threadIdx.x] = sum; - - for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - - if (threadIdx.x < stride) { - data[threadIdx.x] += data[threadIdx.x + stride]; + for (uint i = threadIdx.x; i < histogramCount; i += MERGE_THREADBLOCK_SIZE) { + sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT]; } - } - if (threadIdx.x == 0) { - d_Histogram[blockIdx.x] = data[0]; - } + __shared__ uint data[MERGE_THREADBLOCK_SIZE]; + data[threadIdx.x] = sum; + + for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + + if (threadIdx.x < stride) { + data[threadIdx.x] += data[threadIdx.x + stride]; + } + } + + if (threadIdx.x == 0) { + d_Histogram[blockIdx.x] = data[0]; + } } //////////////////////////////////////////////////////////////////////////////// @@ -136,28 +130,26 @@ __global__ void mergeHistogram256Kernel(uint *d_Histogram, //////////////////////////////////////////////////////////////////////////////// // histogram256kernel() intermediate results buffer static const uint PARTIAL_HISTOGRAM256_COUNT = 240; -static uint *d_PartialHistograms; +static uint *d_PartialHistograms; // Internal memory allocation -extern "C" void initHistogram256(void) { - checkCudaErrors(cudaMalloc( - (void **)&d_PartialHistograms, - PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT * sizeof(uint))); +extern "C" void initHistogram256(void) +{ + checkCudaErrors( + cudaMalloc((void **)&d_PartialHistograms, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT * sizeof(uint))); } // Internal memory deallocation -extern "C" void closeHistogram256(void) { - checkCudaErrors(cudaFree(d_PartialHistograms)); -} +extern "C" void closeHistogram256(void) { checkCudaErrors(cudaFree(d_PartialHistograms)); } -extern "C" void histogram256(uint *d_Histogram, void *d_Data, uint byteCount) { - assert(byteCount % sizeof(uint) == 0); - histogram256Kernel<<>>( - 
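
[Editor's example] The main loop above is a grid-stride loop: a fixed-size grid (PARTIAL_HISTOGRAM256_COUNT blocks) covers an input of any length by having each thread stride forward by the total thread count. The same pattern in a self-contained sketch (kernel name and sizes are illustrative only):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *data, int n, float s)
{
    // Each thread strides over the input by blockDim.x * gridDim.x,
    // exactly as histogram256Kernel() does with UMAD/UMUL.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        data[i] *= s;
}

int main()
{
    const int n = 1 << 20;
    float    *d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemset(d, 0, n * sizeof(float));
    scaleKernel<<<240, 256>>>(d, n, 2.0f); // fixed grid handles any n
    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}
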
d_PartialHistograms, (uint *)d_Data, byteCount / sizeof(uint)); - getLastCudaError("histogram256Kernel() execution failed\n"); +extern "C" void histogram256(uint *d_Histogram, void *d_Data, uint byteCount) +{ + assert(byteCount % sizeof(uint) == 0); + histogram256Kernel<<>>( + d_PartialHistograms, (uint *)d_Data, byteCount / sizeof(uint)); + getLastCudaError("histogram256Kernel() execution failed\n"); - mergeHistogram256Kernel<<>>( - d_Histogram, d_PartialHistograms, PARTIAL_HISTOGRAM256_COUNT); - getLastCudaError("mergeHistogram256Kernel() execution failed\n"); + mergeHistogram256Kernel<<>>( + d_Histogram, d_PartialHistograms, PARTIAL_HISTOGRAM256_COUNT); + getLastCudaError("mergeHistogram256Kernel() execution failed\n"); } diff --git a/Samples/2_Concepts_and_Techniques/histogram/histogram64.cu b/Samples/2_Concepts_and_Techniques/histogram/histogram64.cu index ce74195a..cc3051ea 100644 --- a/Samples/2_Concepts_and_Techniques/histogram/histogram64.cu +++ b/Samples/2_Concepts_and_Techniques/histogram/histogram64.cu @@ -26,13 +26,14 @@ */ #include +#include #include #include #include -#include namespace cg = cooperative_groups; #include + #include "histogram_common.h" //////////////////////////////////////////////////////////////////////////////// @@ -48,78 +49,77 @@ typedef uint4 data_t; // Main computation pass: compute gridDim.x partial histograms //////////////////////////////////////////////////////////////////////////////// // Count a byte into shared-memory storage -inline __device__ void addByte(uchar *s_ThreadBase, uint data) { - s_ThreadBase[UMUL(data, HISTOGRAM64_THREADBLOCK_SIZE)]++; +inline __device__ void addByte(uchar *s_ThreadBase, uint data) +{ + s_ThreadBase[UMUL(data, HISTOGRAM64_THREADBLOCK_SIZE)]++; } // Count four bytes of a word -inline __device__ void addWord(uchar *s_ThreadBase, uint data) { - // Only higher 6 bits of each byte matter, as this is a 64-bin histogram - addByte(s_ThreadBase, (data >> 2) & 0x3FU); - addByte(s_ThreadBase, (data >> 10) & 0x3FU); - addByte(s_ThreadBase, (data >> 18) & 0x3FU); - addByte(s_ThreadBase, (data >> 26) & 0x3FU); +inline __device__ void addWord(uchar *s_ThreadBase, uint data) +{ + // Only higher 6 bits of each byte matter, as this is a 64-bin histogram + addByte(s_ThreadBase, (data >> 2) & 0x3FU); + addByte(s_ThreadBase, (data >> 10) & 0x3FU); + addByte(s_ThreadBase, (data >> 18) & 0x3FU); + addByte(s_ThreadBase, (data >> 26) & 0x3FU); } -__global__ void histogram64Kernel(uint *d_PartialHistograms, data_t *d_Data, - uint dataCount) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Encode thread index in order to avoid bank conflicts in s_Hist[] access: - // each group of SHARED_MEMORY_BANKS threads accesses consecutive shared - // memory banks - // and the same bytes [0..3] within the banks - // Because of this permutation block size should be a multiple of 4 * - // SHARED_MEMORY_BANKS - const uint threadPos = ((threadIdx.x & ~(SHARED_MEMORY_BANKS * 4 - 1)) << 0) | - ((threadIdx.x & (SHARED_MEMORY_BANKS - 1)) << 2) | - ((threadIdx.x & (SHARED_MEMORY_BANKS * 3)) >> 4); +__global__ void histogram64Kernel(uint *d_PartialHistograms, data_t *d_Data, uint dataCount) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Encode thread index in order to avoid bank conflicts in s_Hist[] access: + // each group of SHARED_MEMORY_BANKS threads accesses consecutive shared + // memory banks + // and the same bytes [0..3] within the banks + // Because of this 
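
[Editor's example] histogram256() above decomposes the job into a wide pass producing PARTIAL_HISTOGRAM256_COUNT independent partial histograms and a merge pass summing them per bin. A CPU analogue of the merge step, with sizes shrunk for brevity (4 and 8 stand in for 240 and 256):

#include <cstdio>

int main()
{
    const unsigned PARTIALS = 4, BINS = 8;
    unsigned       partial[PARTIALS][BINS];
    for (unsigned p = 0; p < PARTIALS; p++)
        for (unsigned b = 0; b < BINS; b++)
            partial[p][b] = p + b; // arbitrary stand-in contents
    for (unsigned b = 0; b < BINS; b++) {
        unsigned sum = 0;
        for (unsigned p = 0; p < PARTIALS; p++)
            sum += partial[p][b]; // mergeHistogram256Kernel() does this per bin in parallel
        printf("bin %u -> %u\n", b, sum);
    }
    return 0;
}
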
permutation block size should be a multiple of 4 * + // SHARED_MEMORY_BANKS + const uint threadPos = ((threadIdx.x & ~(SHARED_MEMORY_BANKS * 4 - 1)) << 0) + | ((threadIdx.x & (SHARED_MEMORY_BANKS - 1)) << 2) + | ((threadIdx.x & (SHARED_MEMORY_BANKS * 3)) >> 4); - // Per-thread histogram storage - __shared__ uchar s_Hist[HISTOGRAM64_THREADBLOCK_SIZE * HISTOGRAM64_BIN_COUNT]; - uchar *s_ThreadBase = s_Hist + threadPos; + // Per-thread histogram storage + __shared__ uchar s_Hist[HISTOGRAM64_THREADBLOCK_SIZE * HISTOGRAM64_BIN_COUNT]; + uchar *s_ThreadBase = s_Hist + threadPos; // Initialize shared memory (writing 32-bit words) #pragma unroll - for (uint i = 0; i < (HISTOGRAM64_BIN_COUNT / 4); i++) { - ((uint *)s_Hist)[threadIdx.x + i * HISTOGRAM64_THREADBLOCK_SIZE] = 0; - } + for (uint i = 0; i < (HISTOGRAM64_BIN_COUNT / 4); i++) { + ((uint *)s_Hist)[threadIdx.x + i * HISTOGRAM64_THREADBLOCK_SIZE] = 0; + } - // Read data from global memory and submit to the shared-memory histogram - // Since histogram counters are byte-sized, every single thread can't do more - // than 255 submission - cg::sync(cta); + // Read data from global memory and submit to the shared-memory histogram + // Since histogram counters are byte-sized, every single thread can't do more + // than 255 submission + cg::sync(cta); - for (uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; - pos += UMUL(blockDim.x, gridDim.x)) { - data_t data = d_Data[pos]; - addWord(s_ThreadBase, data.x); - addWord(s_ThreadBase, data.y); - addWord(s_ThreadBase, data.z); - addWord(s_ThreadBase, data.w); - } + for (uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; pos += UMUL(blockDim.x, gridDim.x)) { + data_t data = d_Data[pos]; + addWord(s_ThreadBase, data.x); + addWord(s_ThreadBase, data.y); + addWord(s_ThreadBase, data.z); + addWord(s_ThreadBase, data.w); + } - // Accumulate per-thread histograms into per-block and write to global memory - cg::sync(cta); + // Accumulate per-thread histograms into per-block and write to global memory + cg::sync(cta); - if (threadIdx.x < HISTOGRAM64_BIN_COUNT) { - uchar *s_HistBase = - s_Hist + UMUL(threadIdx.x, HISTOGRAM64_THREADBLOCK_SIZE); + if (threadIdx.x < HISTOGRAM64_BIN_COUNT) { + uchar *s_HistBase = s_Hist + UMUL(threadIdx.x, HISTOGRAM64_THREADBLOCK_SIZE); - uint sum = 0; - uint pos = 4 * (threadIdx.x & (SHARED_MEMORY_BANKS - 1)); + uint sum = 0; + uint pos = 4 * (threadIdx.x & (SHARED_MEMORY_BANKS - 1)); #pragma unroll - for (uint i = 0; i < (HISTOGRAM64_THREADBLOCK_SIZE / 4); i++) { - sum += s_HistBase[pos + 0] + s_HistBase[pos + 1] + s_HistBase[pos + 2] + - s_HistBase[pos + 3]; - pos = (pos + 4) & (HISTOGRAM64_THREADBLOCK_SIZE - 1); - } + for (uint i = 0; i < (HISTOGRAM64_THREADBLOCK_SIZE / 4); i++) { + sum += s_HistBase[pos + 0] + s_HistBase[pos + 1] + s_HistBase[pos + 2] + s_HistBase[pos + 3]; + pos = (pos + 4) & (HISTOGRAM64_THREADBLOCK_SIZE - 1); + } - d_PartialHistograms[blockIdx.x * HISTOGRAM64_BIN_COUNT + threadIdx.x] = sum; - } + d_PartialHistograms[blockIdx.x * HISTOGRAM64_BIN_COUNT + threadIdx.x] = sum; + } } //////////////////////////////////////////////////////////////////////////////// @@ -130,32 +130,31 @@ __global__ void histogram64Kernel(uint *d_PartialHistograms, data_t *d_Data, //////////////////////////////////////////////////////////////////////////////// #define MERGE_THREADBLOCK_SIZE 256 -__global__ void mergeHistogram64Kernel(uint *d_Histogram, - uint *d_PartialHistograms, - uint histogramCount) { - // Handle to thread block group - 
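
[Editor's example] The threadPos encoding above only reshuffles the low six bits of threadIdx.x (swapping which bits pick the shared-memory bank and which pick the byte lane), so it must be a permutation within each 64-thread group. A quick host-side check of that property, not part of the patch:

#include <cstdio>

int main()
{
    const unsigned BANKS = 16; // SHARED_MEMORY_BANKS in histogram_common.h
    bool           seen[64] = {false};
    for (unsigned t = 0; t < 64; t++) {
        unsigned pos = ((t & ~(BANKS * 4 - 1)) << 0) | ((t & (BANKS - 1)) << 2) | ((t & (BANKS * 3)) >> 4);
        seen[pos] = true;
    }
    bool ok = true;
    for (unsigned i = 0; i < 64; i++)
        ok = ok && seen[i];
    printf("threadPos is %s over one 64-thread group\n", ok ? "a permutation" : "NOT a permutation");
    return 0;
}
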
cg::thread_block cta = cg::this_thread_block(); - __shared__ uint data[MERGE_THREADBLOCK_SIZE]; +__global__ void mergeHistogram64Kernel(uint *d_Histogram, uint *d_PartialHistograms, uint histogramCount) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint data[MERGE_THREADBLOCK_SIZE]; - uint sum = 0; + uint sum = 0; - for (uint i = threadIdx.x; i < histogramCount; i += MERGE_THREADBLOCK_SIZE) { - sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM64_BIN_COUNT]; - } - - data[threadIdx.x] = sum; - - for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - - if (threadIdx.x < stride) { - data[threadIdx.x] += data[threadIdx.x + stride]; + for (uint i = threadIdx.x; i < histogramCount; i += MERGE_THREADBLOCK_SIZE) { + sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM64_BIN_COUNT]; } - } - if (threadIdx.x == 0) { - d_Histogram[blockIdx.x] = data[0]; - } + data[threadIdx.x] = sum; + + for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + + if (threadIdx.x < stride) { + data[threadIdx.x] += data[threadIdx.x + stride]; + } + } + + if (threadIdx.x == 0) { + d_Histogram[blockIdx.x] = data[0]; + } } //////////////////////////////////////////////////////////////////////////////// @@ -165,41 +164,37 @@ __global__ void mergeHistogram64Kernel(uint *d_Histogram, // MAX_PARTIAL_HISTOGRAM64_COUNT == 32768 and HISTOGRAM64_THREADBLOCK_SIZE == 64 // amounts to max. 480MB of input data static const uint MAX_PARTIAL_HISTOGRAM64_COUNT = 32768; -static uint *d_PartialHistograms; +static uint *d_PartialHistograms; // Internal memory allocation -extern "C" void initHistogram64(void) { - assert(HISTOGRAM64_THREADBLOCK_SIZE % (4 * SHARED_MEMORY_BANKS) == 0); - checkCudaErrors(cudaMalloc( - (void **)&d_PartialHistograms, - MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint))); +extern "C" void initHistogram64(void) +{ + assert(HISTOGRAM64_THREADBLOCK_SIZE % (4 * SHARED_MEMORY_BANKS) == 0); + checkCudaErrors(cudaMalloc((void **)&d_PartialHistograms, + MAX_PARTIAL_HISTOGRAM64_COUNT * HISTOGRAM64_BIN_COUNT * sizeof(uint))); } // Internal memory deallocation -extern "C" void closeHistogram64(void) { - checkCudaErrors(cudaFree(d_PartialHistograms)); -} +extern "C" void closeHistogram64(void) { checkCudaErrors(cudaFree(d_PartialHistograms)); } // Round a / b to nearest higher integer value -inline uint iDivUp(uint a, uint b) { - return (a % b != 0) ? (a / b + 1) : (a / b); -} +inline uint iDivUp(uint a, uint b) { return (a % b != 0) ? 
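
[Editor's example] mergeHistogram64Kernel() ends with the classic shared-memory tree reduction: the active stride halves each step until thread 0 holds the block's sum. The same idiom in isolation (a sketch using plain __syncthreads() instead of cg::sync):

#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK 256

__global__ void blockSum(const unsigned *in, unsigned *out)
{
    __shared__ unsigned data[BLOCK];
    data[threadIdx.x] = in[threadIdx.x];
    __syncthreads();

    for (unsigned stride = BLOCK / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride)
            data[threadIdx.x] += data[threadIdx.x + stride];
        __syncthreads(); // each halving must complete before the next reads
    }

    if (threadIdx.x == 0)
        *out = data[0];
}

int main()
{
    unsigned h_in[BLOCK], *d_in, *d_out, h_out = 0;
    for (int i = 0; i < BLOCK; i++)
        h_in[i] = 1;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(unsigned));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    blockSum<<<1, BLOCK>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(unsigned), cudaMemcpyDeviceToHost);
    printf("sum = %u (expected %d)\n", h_out, BLOCK);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
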
(a / b + 1) : (a / b); } // Snap a to nearest lower multiple of b inline uint iSnapDown(uint a, uint b) { return a - a % b; } -extern "C" void histogram64(uint *d_Histogram, void *d_Data, uint byteCount) { - const uint histogramCount = iDivUp( - byteCount, HISTOGRAM64_THREADBLOCK_SIZE * iSnapDown(255, sizeof(data_t))); +extern "C" void histogram64(uint *d_Histogram, void *d_Data, uint byteCount) +{ + const uint histogramCount = iDivUp(byteCount, HISTOGRAM64_THREADBLOCK_SIZE * iSnapDown(255, sizeof(data_t))); - assert(byteCount % sizeof(data_t) == 0); - assert(histogramCount <= MAX_PARTIAL_HISTOGRAM64_COUNT); + assert(byteCount % sizeof(data_t) == 0); + assert(histogramCount <= MAX_PARTIAL_HISTOGRAM64_COUNT); - histogram64Kernel<<>>( - d_PartialHistograms, (data_t *)d_Data, byteCount / sizeof(data_t)); - getLastCudaError("histogram64Kernel() execution failed\n"); + histogram64Kernel<<>>( + d_PartialHistograms, (data_t *)d_Data, byteCount / sizeof(data_t)); + getLastCudaError("histogram64Kernel() execution failed\n"); - mergeHistogram64Kernel<<>>( - d_Histogram, d_PartialHistograms, histogramCount); - getLastCudaError("mergeHistogram64() execution failed\n"); + mergeHistogram64Kernel<<>>( + d_Histogram, d_PartialHistograms, histogramCount); + getLastCudaError("mergeHistogram64() execution failed\n"); } diff --git a/Samples/2_Concepts_and_Techniques/histogram/histogram_common.h b/Samples/2_Concepts_and_Techniques/histogram/histogram_common.h index 512cc30c..59c843ab 100644 --- a/Samples/2_Concepts_and_Techniques/histogram/histogram_common.h +++ b/Samples/2_Concepts_and_Techniques/histogram/histogram_common.h @@ -31,17 +31,17 @@ //////////////////////////////////////////////////////////////////////////////// // Common definitions //////////////////////////////////////////////////////////////////////////////// -#define HISTOGRAM64_BIN_COUNT 64 +#define HISTOGRAM64_BIN_COUNT 64 #define HISTOGRAM256_BIN_COUNT 256 -#define UINT_BITS 32 -typedef unsigned int uint; +#define UINT_BITS 32 +typedef unsigned int uint; typedef unsigned char uchar; //////////////////////////////////////////////////////////////////////////////// // GPU-specific common definitions //////////////////////////////////////////////////////////////////////////////// #define LOG2_WARP_SIZE 5U -#define WARP_SIZE (1U << LOG2_WARP_SIZE) +#define WARP_SIZE (1U << LOG2_WARP_SIZE) // May change on future hardware, so better parametrize the code #define SHARED_MEMORY_BANKS 16 @@ -59,7 +59,7 @@ typedef unsigned char uchar; // Shared memory per threadblock #define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT) -#define UMUL(a, b) ((a) * (b)) +#define UMUL(a, b) ((a) * (b)) #define UMAD(a, b, c) (UMUL((a), (b)) + (c)) //////////////////////////////////////////////////////////////////////////////// @@ -67,8 +67,7 @@ typedef unsigned char uchar; //////////////////////////////////////////////////////////////////////////////// extern "C" void histogram64CPU(uint *h_Histogram, void *h_Data, uint byteCount); -extern "C" void histogram256CPU(uint *h_Histogram, void *h_Data, - uint byteCount); +extern "C" void histogram256CPU(uint *h_Histogram, void *h_Data, uint byteCount); //////////////////////////////////////////////////////////////////////////////// // GPU histogram diff --git a/Samples/2_Concepts_and_Techniques/histogram/histogram_gold.cpp b/Samples/2_Concepts_and_Techniques/histogram/histogram_gold.cpp index c8b0ef2d..90d38327 100644 --- a/Samples/2_Concepts_and_Techniques/histogram/histogram_gold.cpp +++ 
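
[Editor's example] The grid size in histogram64() is derived from the two helpers above: iDivUp (ceiling division) and iSnapDown (round down to a multiple). Since the per-thread counters are single bytes, a thread may make at most 255 submissions, which iSnapDown(255, sizeof(uint4)) trims to a whole number of 16-byte loads. Worked host-side numbers (byteCount taken from main.cpp; the rest mirrors the wrapper):

#include <cstdio>

static unsigned iDivUp(unsigned a, unsigned b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
static unsigned iSnapDown(unsigned a, unsigned b) { return a - a % b; }

int main()
{
    const unsigned blockSize      = 64;                // HISTOGRAM64_THREADBLOCK_SIZE
    const unsigned bytesPerThread = iSnapDown(255, 16); // = 240, i.e. 15 uint4 loads
    const unsigned byteCount      = 64 * 1048576;       // 64 MB default input
    printf("partial histograms needed: %u\n", iDivUp(byteCount, blockSize * bytesPerThread));
    return 0;
}
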
b/Samples/2_Concepts_and_Techniques/histogram/histogram_gold.cpp @@ -26,34 +26,37 @@ */ #include + #include "histogram_common.h" -extern "C" void histogram64CPU(uint *h_Histogram, void *h_Data, - uint byteCount) { - for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) h_Histogram[i] = 0; +extern "C" void histogram64CPU(uint *h_Histogram, void *h_Data, uint byteCount) +{ + for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) + h_Histogram[i] = 0; - assert(sizeof(uint) == 4 && (byteCount % 4) == 0); + assert(sizeof(uint) == 4 && (byteCount % 4) == 0); - for (uint i = 0; i < (byteCount / 4); i++) { - uint data = ((uint *)h_Data)[i]; - h_Histogram[(data >> 2) & 0x3FU]++; - h_Histogram[(data >> 10) & 0x3FU]++; - h_Histogram[(data >> 18) & 0x3FU]++; - h_Histogram[(data >> 26) & 0x3FU]++; - } + for (uint i = 0; i < (byteCount / 4); i++) { + uint data = ((uint *)h_Data)[i]; + h_Histogram[(data >> 2) & 0x3FU]++; + h_Histogram[(data >> 10) & 0x3FU]++; + h_Histogram[(data >> 18) & 0x3FU]++; + h_Histogram[(data >> 26) & 0x3FU]++; + } } -extern "C" void histogram256CPU(uint *h_Histogram, void *h_Data, - uint byteCount) { - for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) h_Histogram[i] = 0; +extern "C" void histogram256CPU(uint *h_Histogram, void *h_Data, uint byteCount) +{ + for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) + h_Histogram[i] = 0; - assert(sizeof(uint) == 4 && (byteCount % 4) == 0); + assert(sizeof(uint) == 4 && (byteCount % 4) == 0); - for (uint i = 0; i < (byteCount / 4); i++) { - uint data = ((uint *)h_Data)[i]; - h_Histogram[(data >> 0) & 0xFFU]++; - h_Histogram[(data >> 8) & 0xFFU]++; - h_Histogram[(data >> 16) & 0xFFU]++; - h_Histogram[(data >> 24) & 0xFFU]++; - } + for (uint i = 0; i < (byteCount / 4); i++) { + uint data = ((uint *)h_Data)[i]; + h_Histogram[(data >> 0) & 0xFFU]++; + h_Histogram[(data >> 8) & 0xFFU]++; + h_Histogram[(data >> 16) & 0xFFU]++; + h_Histogram[(data >> 24) & 0xFFU]++; + } } diff --git a/Samples/2_Concepts_and_Techniques/histogram/main.cpp b/Samples/2_Concepts_and_Techniques/histogram/main.cpp index 124140bf..e4b9bcc7 100644 --- a/Samples/2_Concepts_and_Techniques/histogram/main.cpp +++ b/Samples/2_Concepts_and_Techniques/histogram/main.cpp @@ -26,205 +26,203 @@ */ /* -* This sample implements 64-bin histogram calculation -* of arbitrary-sized 8-bit data array -*/ + * This sample implements 64-bin histogram calculation + * of arbitrary-sized 8-bit data array + */ // CUDA Runtime #include // Utility and system includes #include -#include // helper for shared that are common to CUDA Samples +#include // helper for shared that are common to CUDA Samples // project include #include "histogram_common.h" -const int numRuns = 16; +const int numRuns = 16; const static char *sSDKsample = "[histogram]\0"; -int main(int argc, char **argv) { - uchar *h_Data; - uint *h_HistogramCPU, *h_HistogramGPU; - uchar *d_Data; - uint *d_Histogram; - StopWatchInterface *hTimer = NULL; - int PassFailFlag = 1; - uint byteCount = 64 * 1048576; - uint uiSizeMult = 1; +int main(int argc, char **argv) +{ + uchar *h_Data; + uint *h_HistogramCPU, *h_HistogramGPU; + uchar *d_Data; + uint *d_Histogram; + StopWatchInterface *hTimer = NULL; + int PassFailFlag = 1; + uint byteCount = 64 * 1048576; + uint uiSizeMult = 1; - cudaDeviceProp deviceProp; - deviceProp.major = 0; - deviceProp.minor = 0; + cudaDeviceProp deviceProp; + deviceProp.major = 0; + deviceProp.minor = 0; - // set logfile name and start logs - printf("[%s] - Starting...\n", sSDKsample); + // set logfile name and start logs + 
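
[Editor's example] The CPU reference just above bins each byte by its high six bits: for the lowest byte of a word, ((data >> 2) & 0x3F) is simply byte >> 2, matching the GPU kernel's addWord(). A one-screen check of that equivalence:

#include <cassert>
#include <cstdio>

int main()
{
    for (unsigned byte = 0; byte < 256; byte++) {
        unsigned word = byte; // byte sits in bits [7:0]
        assert(((word >> 2) & 0x3FU) == byte >> 2);
    }
    printf("64-bin index == byte >> 2 for every byte value\n");
    return 0;
}
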
printf("[%s] - Starting...\n", sSDKsample); - // Use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - int dev = findCudaDevice(argc, (const char **)argv); + // Use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int dev = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", - deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, - deviceProp.minor); + printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", + deviceProp.name, + deviceProp.multiProcessorCount, + deviceProp.major, + deviceProp.minor); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - // Optional Command-line multiplier to increase size of array to histogram - if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) { - uiSizeMult = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); - uiSizeMult = MAX(1, MIN(uiSizeMult, 10)); - byteCount *= uiSizeMult; - } + // Optional Command-line multiplier to increase size of array to histogram + if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) { + uiSizeMult = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); + uiSizeMult = MAX(1, MIN(uiSizeMult, 10)); + byteCount *= uiSizeMult; + } - printf("Initializing data...\n"); - printf("...allocating CPU memory.\n"); - h_Data = (uchar *)malloc(byteCount); - h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); - h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); + printf("Initializing data...\n"); + printf("...allocating CPU memory.\n"); + h_Data = (uchar *)malloc(byteCount); + h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); + h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); - printf("...generating input data\n"); - srand(2009); + printf("...generating input data\n"); + srand(2009); - for (uint i = 0; i < byteCount; i++) { - h_Data[i] = rand() % 256; - } + for (uint i = 0; i < byteCount; i++) { + h_Data[i] = rand() % 256; + } - printf("...allocating GPU memory and copying input data\n\n"); - checkCudaErrors(cudaMalloc((void **)&d_Data, byteCount)); - checkCudaErrors( - cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint))); - checkCudaErrors( - cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice)); + printf("...allocating GPU memory and copying input data\n\n"); + checkCudaErrors(cudaMalloc((void **)&d_Data, byteCount)); + checkCudaErrors(cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint))); + checkCudaErrors(cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice)); - { - printf("Starting up 64-bin histogram...\n\n"); - initHistogram64(); + { + printf("Starting up 64-bin histogram...\n\n"); + initHistogram64(); - printf("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", - byteCount, numRuns); + printf("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); + + for (int iter = -1; iter < numRuns; iter++) { + // iter == -1 -- warmup iteration + if (iter == 0) { + cudaDeviceSynchronize(); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } + + histogram64(d_Histogram, d_Data, byteCount); + } - for (int iter = -1; iter < numRuns; iter++) { - // iter == -1 -- warmup iteration - if (iter == 0) { cudaDeviceSynchronize(); - sdkResetTimer(&hTimer); - 
sdkStartTimer(&hTimer); - } + sdkStopTimer(&hTimer); + double dAvgSecs = 1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)numRuns; + printf("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", + dAvgSecs, + ((double)byteCount * 1.0e-6) / dAvgSecs); + printf("histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, " + "NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * (double)byteCount / dAvgSecs), + dAvgSecs, + byteCount, + 1, + HISTOGRAM64_THREADBLOCK_SIZE); - histogram64(d_Histogram, d_Data, byteCount); + printf("\nValidating GPU results...\n"); + printf(" ...reading back GPU results\n"); + checkCudaErrors( + cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost)); + + printf(" ...histogram64CPU()\n"); + histogram64CPU(h_HistogramCPU, h_Data, byteCount); + + printf(" ...comparing the results...\n"); + + for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) + if (h_HistogramGPU[i] != h_HistogramCPU[i]) { + PassFailFlag = 0; + } + + printf(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n"); + + printf("Shutting down 64-bin histogram...\n\n\n"); + closeHistogram64(); } - cudaDeviceSynchronize(); - sdkStopTimer(&hTimer); - double dAvgSecs = - 1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)numRuns; - printf("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, - ((double)byteCount * 1.0e-6) / dAvgSecs); - printf( - "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, " - "NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, - HISTOGRAM64_THREADBLOCK_SIZE); + { + printf("Initializing 256-bin histogram...\n"); + initHistogram256(); - printf("\nValidating GPU results...\n"); - printf(" ...reading back GPU results\n"); - checkCudaErrors(cudaMemcpy(h_HistogramGPU, d_Histogram, - HISTOGRAM64_BIN_COUNT * sizeof(uint), - cudaMemcpyDeviceToHost)); + printf("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); - printf(" ...histogram64CPU()\n"); - histogram64CPU(h_HistogramCPU, h_Data, byteCount); + for (int iter = -1; iter < numRuns; iter++) { + // iter == -1 -- warmup iteration + if (iter == 0) { + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } - printf(" ...comparing the results...\n"); + histogram256(d_Histogram, d_Data, byteCount); + } - for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++) - if (h_HistogramGPU[i] != h_HistogramCPU[i]) { - PassFailFlag = 0; - } + cudaDeviceSynchronize(); + sdkStopTimer(&hTimer); + double dAvgSecs = 1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)numRuns; + printf("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", + dAvgSecs, + ((double)byteCount * 1.0e-6) / dAvgSecs); + printf("histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, " + "NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * (double)byteCount / dAvgSecs), + dAvgSecs, + byteCount, + 1, + HISTOGRAM256_THREADBLOCK_SIZE); - printf(PassFailFlag ? 
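
[Editor's example] The figures printed above come from dAvgSecs = (timer milliseconds / 1000) / numRuns and throughput = 1e-6 * byteCount / dAvgSecs. Worked arithmetic with a hypothetical timer reading (the 80 ms total is invented for illustration):

#include <cstdio>

int main()
{
    const unsigned byteCount = 64 * 1048576; // 64 MB default input
    const int      numRuns   = 16;
    double         timerMs   = 80.0;                          // hypothetical total over all runs
    double         dAvgSecs  = 1.0e-3 * timerMs / numRuns;    // 0.005 s per run
    printf("throughput = %.4f MB/s\n", 1.0e-6 * byteCount / dAvgSecs);
    return 0;
}
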
" ...64-bin histograms match\n\n" - : " ***64-bin histograms do not match!!!***\n\n"); + printf("\nValidating GPU results...\n"); + printf(" ...reading back GPU results\n"); + checkCudaErrors( + cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost)); - printf("Shutting down 64-bin histogram...\n\n\n"); - closeHistogram64(); - } + printf(" ...histogram256CPU()\n"); + histogram256CPU(h_HistogramCPU, h_Data, byteCount); - { - printf("Initializing 256-bin histogram...\n"); - initHistogram256(); + printf(" ...comparing the results\n"); - printf("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", - byteCount, numRuns); + for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) + if (h_HistogramGPU[i] != h_HistogramCPU[i]) { + PassFailFlag = 0; + } - for (int iter = -1; iter < numRuns; iter++) { - // iter == -1 -- warmup iteration - if (iter == 0) { - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - } + printf(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n"); - histogram256(d_Histogram, d_Data, byteCount); + printf("Shutting down 256-bin histogram...\n\n\n"); + closeHistogram256(); } - cudaDeviceSynchronize(); - sdkStopTimer(&hTimer); - double dAvgSecs = - 1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)numRuns; - printf("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", - dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); - printf( - "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, " - "NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, - HISTOGRAM256_THREADBLOCK_SIZE); + printf("Shutting down...\n"); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cudaFree(d_Histogram)); + checkCudaErrors(cudaFree(d_Data)); + free(h_HistogramGPU); + free(h_HistogramCPU); + free(h_Data); - printf("\nValidating GPU results...\n"); - printf(" ...reading back GPU results\n"); - checkCudaErrors(cudaMemcpy(h_HistogramGPU, d_Histogram, - HISTOGRAM256_BIN_COUNT * sizeof(uint), - cudaMemcpyDeviceToHost)); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n\n"); - printf(" ...histogram256CPU()\n"); - histogram256CPU(h_HistogramCPU, h_Data, byteCount); + printf("%s - Test Summary\n", sSDKsample); - printf(" ...comparing the results\n"); + // pass or fail (for both 64 bit and 256 bit histograms) + if (!PassFailFlag) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) - if (h_HistogramGPU[i] != h_HistogramCPU[i]) { - PassFailFlag = 0; - } - - printf(PassFailFlag ? " ...256-bin histograms match\n\n" - : " ***256-bin histograms do not match!!!***\n\n"); - - printf("Shutting down 256-bin histogram...\n\n\n"); - closeHistogram256(); - } - - printf("Shutting down...\n"); - sdkDeleteTimer(&hTimer); - checkCudaErrors(cudaFree(d_Histogram)); - checkCudaErrors(cudaFree(d_Data)); - free(h_HistogramGPU); - free(h_HistogramCPU); - free(h_Data); - - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. 
" - "Results may vary when GPU Boost is enabled.\n\n"); - - printf("%s - Test Summary\n", sSDKsample); - - // pass or fail (for both 64 bit and 256 bit histograms) - if (!PassFailFlag) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } - - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/bmploader.cpp b/Samples/2_Concepts_and_Techniques/imageDenoising/bmploader.cpp index 46f4ba2d..3191c2b2 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/bmploader.cpp +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/bmploader.cpp @@ -29,100 +29,107 @@ #include #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#pragma warning(disable : 4996) // disable deprecated warning +#pragma warning(disable : 4996) // disable deprecated warning #endif #pragma pack(1) -typedef struct { - short type; - int size; - short reserved1; - short reserved2; - int offset; +typedef struct +{ + short type; + int size; + short reserved1; + short reserved2; + int offset; } BMPHeader; -typedef struct { - int size; - int width; - int height; - short planes; - short bitsPerPixel; - unsigned compression; - unsigned imageSize; - int xPelsPerMeter; - int yPelsPerMeter; - int clrUsed; - int clrImportant; +typedef struct +{ + int size; + int width; + int height; + short planes; + short bitsPerPixel; + unsigned compression; + unsigned imageSize; + int xPelsPerMeter; + int yPelsPerMeter; + int clrUsed; + int clrImportant; } BMPInfoHeader; // Isolated definition -typedef struct { unsigned char x, y, z, w; } uchar4; +typedef struct +{ + unsigned char x, y, z, w; +} uchar4; -extern "C" void LoadBMPFile(uchar4 **dst, int *width, int *height, - const char *name) { - BMPHeader hdr; - BMPInfoHeader infoHdr; - int x, y; +extern "C" void LoadBMPFile(uchar4 **dst, int *width, int *height, const char *name) +{ + BMPHeader hdr; + BMPInfoHeader infoHdr; + int x, y; - FILE *fd; + FILE *fd; - printf("Loading %s...\n", name); + printf("Loading %s...\n", name); - if (sizeof(uchar4) != 4) { - printf("***Bad uchar4 size***\n"); - exit(EXIT_SUCCESS); - } - - if (!(fd = fopen(name, "rb"))) { - printf("***BMP load error: file access denied***\n"); - exit(EXIT_SUCCESS); - } - - fread(&hdr, sizeof(hdr), 1, fd); - - if (hdr.type != 0x4D42) { - printf("***BMP load error: bad file format***\n"); - exit(EXIT_SUCCESS); - } - - fread(&infoHdr, sizeof(infoHdr), 1, fd); - - if (infoHdr.bitsPerPixel != 24) { - printf("***BMP load error: invalid color depth***\n"); - exit(EXIT_SUCCESS); - } - - if (infoHdr.compression) { - printf("***BMP load error: compressed image***\n"); - exit(EXIT_SUCCESS); - } - - *width = infoHdr.width; - *height = infoHdr.height; - *dst = (uchar4 *)malloc(*width * *height * 4); - - printf("BMP width: %u\n", infoHdr.width); - printf("BMP height: %u\n", infoHdr.height); - - fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR); - - for (y = 0; y < infoHdr.height; y++) { - for (x = 0; x < infoHdr.width; x++) { - (*dst)[(y * infoHdr.width + x)].z = fgetc(fd); - (*dst)[(y * infoHdr.width + x)].y = fgetc(fd); - (*dst)[(y * infoHdr.width + x)].x = fgetc(fd); + if (sizeof(uchar4) != 4) { + printf("***Bad uchar4 size***\n"); + exit(EXIT_SUCCESS); } - for (x = 0; x < (4 - (3 * infoHdr.width) % 4) % 4; x++) fgetc(fd); - } + if (!(fd = fopen(name, "rb"))) { + printf("***BMP load error: file access denied***\n"); + exit(EXIT_SUCCESS); + } - if (ferror(fd)) { - printf("***Unknown BMP load 
error.***\n"); - free(*dst); - exit(EXIT_SUCCESS); - } else - printf("BMP file loaded successfully!\n"); + fread(&hdr, sizeof(hdr), 1, fd); - fclose(fd); + if (hdr.type != 0x4D42) { + printf("***BMP load error: bad file format***\n"); + exit(EXIT_SUCCESS); + } + + fread(&infoHdr, sizeof(infoHdr), 1, fd); + + if (infoHdr.bitsPerPixel != 24) { + printf("***BMP load error: invalid color depth***\n"); + exit(EXIT_SUCCESS); + } + + if (infoHdr.compression) { + printf("***BMP load error: compressed image***\n"); + exit(EXIT_SUCCESS); + } + + *width = infoHdr.width; + *height = infoHdr.height; + *dst = (uchar4 *)malloc(*width * *height * 4); + + printf("BMP width: %u\n", infoHdr.width); + printf("BMP height: %u\n", infoHdr.height); + + fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR); + + for (y = 0; y < infoHdr.height; y++) { + for (x = 0; x < infoHdr.width; x++) { + (*dst)[(y * infoHdr.width + x)].z = fgetc(fd); + (*dst)[(y * infoHdr.width + x)].y = fgetc(fd); + (*dst)[(y * infoHdr.width + x)].x = fgetc(fd); + } + + for (x = 0; x < (4 - (3 * infoHdr.width) % 4) % 4; x++) + fgetc(fd); + } + + if (ferror(fd)) { + printf("***Unknown BMP load error.***\n"); + free(*dst); + exit(EXIT_SUCCESS); + } + else + printf("BMP file loaded successfully!\n"); + + fclose(fd); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.cu b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.cu index db35af62..ebce21be 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.cu +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.cu @@ -35,10 +35,11 @@ * See supplied whitepaper for more explanations. */ +#include #include #include #include -#include + #include "imageDenoising.h" //////////////////////////////////////////////////////////////////////////////// @@ -52,21 +53,21 @@ int iDivUp(int a, int b) { return ((a % b) != 0) ? 
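
[Editor's example] The inner loader loop above skips (4 - (3*width) % 4) % 4 filler bytes per row, because BMP rows of 24-bit pixels are padded to 4-byte boundaries. A small table of that padding for the first few widths:

#include <cstdio>

int main()
{
    for (int width = 1; width <= 8; width++) {
        int pad = (4 - (3 * width) % 4) % 4; // filler after 3*width pixel bytes
        printf("width %d: %d data bytes + %d pad\n", width, 3 * width, pad);
    }
    return 0;
}
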
(a / b + 1) : (a / b); } __device__ float lerpf(float a, float b, float c) { return a + (b - a) * c; } -__device__ float vecLen(float4 a, float4 b) { - return ((b.x - a.x) * (b.x - a.x) + (b.y - a.y) * (b.y - a.y) + - (b.z - a.z) * (b.z - a.z)); +__device__ float vecLen(float4 a, float4 b) +{ + return ((b.x - a.x) * (b.x - a.x) + (b.y - a.y) * (b.y - a.y) + (b.z - a.z) * (b.z - a.z)); } -__device__ TColor make_color(float r, float g, float b, float a) { - return ((int)(a * 255.0f) << 24) | ((int)(b * 255.0f) << 16) | - ((int)(g * 255.0f) << 8) | ((int)(r * 255.0f) << 0); +__device__ TColor make_color(float r, float g, float b, float a) +{ + return ((int)(a * 255.0f) << 24) | ((int)(b * 255.0f) << 16) | ((int)(g * 255.0f) << 8) | ((int)(r * 255.0f) << 0); } //////////////////////////////////////////////////////////////////////////////// // Global data handlers and parameters //////////////////////////////////////////////////////////////////////////////// // Texture object and channel descriptor for image texture -cudaTextureObject_t texImage; +cudaTextureObject_t texImage; cudaChannelFormatDesc uchar4tex = cudaCreateChannelDesc(); // CUDA array descriptor @@ -77,36 +78,35 @@ cudaArray *a_Src; //////////////////////////////////////////////////////////////////////////////// #include "imageDenoising_copy_kernel.cuh" #include "imageDenoising_knn_kernel.cuh" -#include "imageDenoising_nlm_kernel.cuh" #include "imageDenoising_nlm2_kernel.cuh" +#include "imageDenoising_nlm_kernel.cuh" -extern "C" cudaError_t CUDA_MallocArray(uchar4 **h_Src, int imageW, - int imageH) { - cudaError_t error; +extern "C" cudaError_t CUDA_MallocArray(uchar4 **h_Src, int imageW, int imageH) +{ + cudaError_t error; - error = cudaMallocArray(&a_Src, &uchar4tex, imageW, imageH); - error = cudaMemcpy2DToArray(a_Src, 0, 0, *h_Src, sizeof(uchar4) * imageW, - sizeof(uchar4) * imageW, imageH, - cudaMemcpyHostToDevice); + error = cudaMallocArray(&a_Src, &uchar4tex, imageW, imageH); + error = cudaMemcpy2DToArray( + a_Src, 0, 0, *h_Src, sizeof(uchar4) * imageW, sizeof(uchar4) * imageW, imageH, cudaMemcpyHostToDevice); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = a_Src; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = a_Src; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeNormalizedFloat; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeNormalizedFloat; - checkCudaErrors(cudaCreateTextureObject(&texImage, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texImage, &texRes, &texDescr, NULL)); - return error; + return error; } extern "C" cudaError_t CUDA_FreeArray() { return cudaFreeArray(a_Src); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.h b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.h index bd5c622a..6fb9dd99 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.h 
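
[Editor's example] CUDA_MallocArray() above binds a cudaArray to a texture object in three steps: fill the array, describe the resource, describe the sampling. The same sequence in a minimal standalone form, sampling float data instead of uchar4 (dimensions and contents are placeholders):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const int w = 4, h = 4;
    float     host[w * h] = {0};

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray_t           arr;
    cudaMallocArray(&arr, &desc, w, h);
    cudaMemcpy2DToArray(arr, 0, 0, host, w * sizeof(float), w * sizeof(float), h, cudaMemcpyHostToDevice);

    cudaResourceDesc res = {};
    res.resType          = cudaResourceTypeArray;
    res.res.array.array  = arr;

    cudaTextureDesc tex   = {};
    tex.normalizedCoords  = false;
    tex.filterMode        = cudaFilterModeLinear;
    tex.addressMode[0]    = cudaAddressModeClamp;
    tex.addressMode[1]    = cudaAddressModeClamp;
    tex.readMode          = cudaReadModeElementType;

    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &res, &tex, NULL);
    printf("texture object created: %llu\n", (unsigned long long)texObj);
    cudaDestroyTextureObject(texObj);
    cudaFreeArray(arr);
    return 0;
}
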
+++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising.h @@ -33,20 +33,18 @@ typedef unsigned int TColor; //////////////////////////////////////////////////////////////////////////////// // Filter configuration //////////////////////////////////////////////////////////////////////////////// -#define KNN_WINDOW_RADIUS 3 -#define NLM_WINDOW_RADIUS 3 -#define NLM_BLOCK_RADIUS 3 -#define KNN_WINDOW_AREA \ - ((2 * KNN_WINDOW_RADIUS + 1) * (2 * KNN_WINDOW_RADIUS + 1)) -#define NLM_WINDOW_AREA \ - ((2 * NLM_WINDOW_RADIUS + 1) * (2 * NLM_WINDOW_RADIUS + 1)) +#define KNN_WINDOW_RADIUS 3 +#define NLM_WINDOW_RADIUS 3 +#define NLM_BLOCK_RADIUS 3 +#define KNN_WINDOW_AREA ((2 * KNN_WINDOW_RADIUS + 1) * (2 * KNN_WINDOW_RADIUS + 1)) +#define NLM_WINDOW_AREA ((2 * NLM_WINDOW_RADIUS + 1) * (2 * NLM_WINDOW_RADIUS + 1)) #define INV_KNN_WINDOW_AREA (1.0f / (float)KNN_WINDOW_AREA) #define INV_NLM_WINDOW_AREA (1.0f / (float)NLM_WINDOW_AREA) #define KNN_WEIGHT_THRESHOLD 0.02f -#define KNN_LERP_THRESHOLD 0.79f +#define KNN_LERP_THRESHOLD 0.79f #define NLM_WEIGHT_THRESHOLD 0.10f -#define NLM_LERP_THRESHOLD 0.10f +#define NLM_LERP_THRESHOLD 0.10f #define BLOCKDIM_X 8 #define BLOCKDIM_Y 8 @@ -59,8 +57,7 @@ typedef unsigned int TColor; #endif // functions to load images -extern "C" void LoadBMPFile(uchar4 **dst, int *width, int *height, - const char *name); +extern "C" void LoadBMPFile(uchar4 **dst, int *width, int *height, const char *name); // CUDA wrapper functions for allocation/freeing texture arrays extern "C" cudaTextureObject_t texImage; @@ -69,21 +66,17 @@ extern "C" cudaError_t CUDA_MallocArray(uchar4 **h_Src, int imageW, int imageH); extern "C" cudaError_t CUDA_FreeArray(); // CUDA kernel functions -extern "C" void cuda_Copy(TColor *d_dst, int imageW, int imageH, - cudaTextureObject_t texImage); -extern "C" void cuda_KNN(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage); -extern "C" void cuda_KNNdiag(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage); -extern "C" void cuda_NLM(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage); -extern "C" void cuda_NLMdiag(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage); +extern "C" void cuda_Copy(TColor *d_dst, int imageW, int imageH, cudaTextureObject_t texImage); +extern "C" void cuda_KNN(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage); +extern "C" void +cuda_KNNdiag(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage); +extern "C" void cuda_NLM(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage); +extern "C" void +cuda_NLMdiag(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage); -extern "C" void cuda_NLM2(TColor *d_dst, int imageW, int imageH, float Noise, - float LerpC, cudaTextureObject_t texImage); -extern "C" void cuda_NLM2diag(TColor *d_dst, int imageW, int imageH, - float Noise, float LerpC, - cudaTextureObject_t texImage); +extern "C" void +cuda_NLM2(TColor *d_dst, int imageW, int imageH, float Noise, float LerpC, cudaTextureObject_t texImage); +extern "C" void +cuda_NLM2diag(TColor *d_dst, int imageW, int imageH, float Noise, float LerpC, cudaTextureObject_t texImage); #endif diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoisingGL.cpp 
b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoisingGL.cpp index 6936d955..a8ee88b8 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoisingGL.cpp +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoisingGL.cpp @@ -45,478 +45,387 @@ #endif // CUDA utilities and system includes -#include #include +#include // Includes #include #include #include + #include "imageDenoising.h" // includes, project -#include // includes for helper utility functions -#include // includes for cuda error checking and initialization +#include // includes for cuda error checking and initialization +#include // includes for helper utility functions const char *sSDKsample = "CUDA ImageDenoising"; -const char *filterMode[] = {"Passthrough", "KNN method", "NLM method", - "Quick NLM(NLM2) method", NULL}; +const char *filterMode[] = {"Passthrough", "KNN method", "NLM method", "Quick NLM(NLM2) method", NULL}; // Define the files that are to be save and the reference images for validation -const char *sOriginal[] = {"image_passthru.ppm", "image_knn.ppm", - "image_nlm.ppm", "image_nlm2.ppm", NULL}; +const char *sOriginal[] = {"image_passthru.ppm", "image_knn.ppm", "image_nlm.ppm", "image_nlm2.ppm", NULL}; -const char *sReference[] = {"ref_passthru.ppm", "ref_knn.ppm", "ref_nlm.ppm", - "ref_nlm2.ppm", NULL}; +const char *sReference[] = {"ref_passthru.ppm", "ref_knn.ppm", "ref_nlm.ppm", "ref_nlm2.ppm", NULL}; //////////////////////////////////////////////////////////////////////////////// // Global data handlers and parameters //////////////////////////////////////////////////////////////////////////////// // OpenGL PBO and texture "names" -GLuint gl_PBO, gl_Tex; -struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange +GLuint gl_PBO, gl_Tex; +struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange // Source image on the host side uchar4 *h_Src; -int imageW, imageH; -GLuint shader; +int imageW, imageH; +GLuint shader; //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int g_Kernel = 0; -bool g_FPS = false; -bool g_Diag = false; -StopWatchInterface *timer = NULL; +int g_Kernel = 0; +bool g_FPS = false; +bool g_Diag = false; +StopWatchInterface *timer = NULL; // Algorithms global parameters -const float noiseStep = 0.025f; -const float lerpStep = 0.025f; -static float knnNoise = 0.32f; -static float nlmNoise = 1.45f; -static float lerpC = 0.2f; +const float noiseStep = 0.025f; +const float lerpStep = 0.025f; +static float knnNoise = 0.32f; +static float nlmNoise = 1.45f; +static float lerpC = 0.2f; -const int frameN = 24; -int frameCounter = 0; +const int frameN = 24; +int frameCounter = 0; #define BUFFER_DATA(i) ((char *)0 + i) // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #define MAX_EPSILON_ERROR 5 -#define REFRESH_DELAY 10 // ms +#define REFRESH_DELAY 10 // ms void cleanup(); -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == 
fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "<%s>: %3.1f fps", filterMode[g_Kernel], ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "<%s>: %3.1f fps", filterMode[g_Kernel], ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - // fpsLimit = (int)MAX(ifps, 1.f); - sdkResetTimer(&timer); - } + // fpsLimit = (int)MAX(ifps, 1.f); + sdkResetTimer(&timer); + } } -void runImageFilters(TColor *d_dst) { - switch (g_Kernel) { +void runImageFilters(TColor *d_dst) +{ + switch (g_Kernel) { case 0: - cuda_Copy(d_dst, imageW, imageH, texImage); - break; + cuda_Copy(d_dst, imageW, imageH, texImage); + break; case 1: - if (!g_Diag) { - cuda_KNN(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC, - texImage); - } else { - cuda_KNNdiag(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC, - texImage); - } + if (!g_Diag) { + cuda_KNN(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC, texImage); + } + else { + cuda_KNNdiag(d_dst, imageW, imageH, 1.0f / (knnNoise * knnNoise), lerpC, texImage); + } - break; + break; case 2: - if (!g_Diag) { - cuda_NLM(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, - texImage); - } else { - cuda_NLMdiag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, - texImage); - } + if (!g_Diag) { + cuda_NLM(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, texImage); + } + else { + cuda_NLMdiag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, texImage); + } - break; + break; case 3: - if (!g_Diag) { - cuda_NLM2(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, - texImage); - } else { - cuda_NLM2diag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), - lerpC, texImage); - } + if (!g_Diag) { + cuda_NLM2(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, texImage); + } + else { + cuda_NLM2diag(d_dst, imageW, imageH, 1.0f / (nlmNoise * nlmNoise), lerpC, texImage); + } - break; - } - - getLastCudaError("Filtering kernel execution failed.\n"); -} - -void displayFunc(void) { - sdkStartTimer(&timer); - TColor *d_dst = NULL; - size_t num_bytes; - - if (frameCounter++ == 0) { - sdkResetTimer(&timer); - } - - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - getLastCudaError("cudaGraphicsMapResources failed"); - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_dst, &num_bytes, cuda_pbo_resource)); - getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); - - runImageFilters(d_dst); - - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - - // Common display code path - { - glClear(GL_COLOR_BUFFER_BIT); - - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, - GL_UNSIGNED_BYTE, BUFFER_DATA(0)); - glBegin(GL_TRIANGLES); - glTexCoord2f(0, 0); - glVertex2f(-1, -1); - glTexCoord2f(2, 0); - glVertex2f(+3, -1); - glTexCoord2f(0, 2); - glVertex2f(-1, +3); - glEnd(); - glFinish(); - } - - if (frameCounter == frameN) { - frameCounter = 0; - - if (g_FPS) { - printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001)); - g_FPS = false; + break; } - } - glutSwapBuffers(); - glutReportErrors(); - - sdkStopTimer(&timer); - - computeFPS(); + getLastCudaError("Filtering kernel execution failed.\n"); } -void timerEvent(int value) { - if (glutGetWindow()) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } +void displayFunc(void) +{ 
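
[Editor's example] Note that runImageFilters() hands the kernels 1.0f / (noise * noise), i.e. a precomputed inverse variance; the KNN/NLM kernels themselves live in the *_kernel.cuh headers and are not part of this hunk. The sketch below only illustrates the shape such a weight takes, exponential falloff scaled by the inverse noise, and is not the sample's actual formula:

#include <cstdio>
#include <math.h>

int main()
{
    const float knnNoise = 0.32f; // default from this file
    const float invNoise = 1.0f / (knnNoise * knnNoise);
    for (int i = 0; i <= 4; i++) {
        float dist = 0.05f * i; // hypothetical color-space distance
        printf("dist %.2f -> weight %.4f\n", dist, expf(-dist * invNoise));
    }
    return 0;
}
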
+ sdkStartTimer(&timer); + TColor *d_dst = NULL; + size_t num_bytes; + + if (frameCounter++ == 0) { + sdkResetTimer(&timer); + } + + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + getLastCudaError("cudaGraphicsMapResources failed"); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); + getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); + + runImageFilters(d_dst); + + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + + // Common display code path + { + glClear(GL_COLOR_BUFFER_BIT); + + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0)); + glBegin(GL_TRIANGLES); + glTexCoord2f(0, 0); + glVertex2f(-1, -1); + glTexCoord2f(2, 0); + glVertex2f(+3, -1); + glTexCoord2f(0, 2); + glVertex2f(-1, +3); + glEnd(); + glFinish(); + } + + if (frameCounter == frameN) { + frameCounter = 0; + + if (g_FPS) { + printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001)); + g_FPS = false; + } + } + + glutSwapBuffers(); + glutReportErrors(); + + sdkStopTimer(&timer); + + computeFPS(); } -void keyboard(unsigned char k, int /*x*/, int /*y*/) { - switch (k) { +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } +} + +void keyboard(unsigned char k, int /*x*/, int /*y*/) +{ + switch (k) { case 27: case 'q': case 'Q': #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif case '1': - printf("Passthrough.\n"); - g_Kernel = 0; - break; + printf("Passthrough.\n"); + g_Kernel = 0; + break; case '2': - printf("KNN method \n"); - g_Kernel = 1; - break; + printf("KNN method \n"); + g_Kernel = 1; + break; case '3': - printf("NLM method\n"); - g_Kernel = 2; - break; + printf("NLM method\n"); + g_Kernel = 2; + break; case '4': - printf("Quick NLM(NLM2) method\n"); - g_Kernel = 3; - break; + printf("Quick NLM(NLM2) method\n"); + g_Kernel = 3; + break; case '*': - printf(g_Diag ? "LERP highlighting mode.\n" : "Normal mode.\n"); - g_Diag = !g_Diag; - break; + printf(g_Diag ? 
"LERP highlighting mode.\n" : "Normal mode.\n"); + g_Diag = !g_Diag; + break; case 'n': - printf("Decrease noise level.\n"); - knnNoise -= noiseStep; - nlmNoise -= noiseStep; - break; + printf("Decrease noise level.\n"); + knnNoise -= noiseStep; + nlmNoise -= noiseStep; + break; case 'N': - printf("Increase noise level.\n"); - knnNoise += noiseStep; - nlmNoise += noiseStep; - break; + printf("Increase noise level.\n"); + knnNoise += noiseStep; + nlmNoise += noiseStep; + break; case 'l': - printf("Decrease LERP quotient.\n"); - lerpC = MAX(lerpC - lerpStep, 0.0f); - break; + printf("Decrease LERP quotient.\n"); + lerpC = MAX(lerpC - lerpStep, 0.0f); + break; case 'L': - printf("Increase LERP quotient.\n"); - lerpC = MIN(lerpC + lerpStep, 1.0f); - break; + printf("Increase LERP quotient.\n"); + lerpC = MIN(lerpC + lerpStep, 1.0f); + break; case 'f': case 'F': - g_FPS = true; - break; + g_FPS = true; + break; case '?': - printf("lerpC = %5.5f\n", lerpC); - printf("knnNoise = %5.5f\n", knnNoise); - printf("nlmNoise = %5.5f\n", nlmNoise); - break; - } + printf("lerpC = %5.5f\n", lerpC); + printf("knnNoise = %5.5f\n", knnNoise); + printf("nlmNoise = %5.5f\n", nlmNoise); + break; + } } -int initGL(int *argc, char **argv) { - printf("Initializing GLUT...\n"); - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(imageW, imageH); - glutInitWindowPosition(512 - imageW / 2, 384 - imageH / 2); - glutCreateWindow(argv[0]); - glutDisplayFunc(displayFunc); - glutKeyboardFunc(keyboard); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - printf("OpenGL window created.\n"); +int initGL(int *argc, char **argv) +{ + printf("Initializing GLUT...\n"); + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(imageW, imageH); + glutInitWindowPosition(512 - imageW / 2, 384 - imageH / 2); + glutCreateWindow(argv[0]); + glutDisplayFunc(displayFunc); + glutKeyboardFunc(keyboard); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + printf("OpenGL window created.\n"); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - if (!isGLVersionSupported(1, 5) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); - fprintf(stderr, "This sample requires:\n"); - fprintf(stderr, " OpenGL version 1.5\n"); - fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); - fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); - fflush(stderr); - return false; - } + if (!isGLVersionSupported(1, 5) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); + fprintf(stderr, "This sample requires:\n"); + fprintf(stderr, " OpenGL version 1.5\n"); + fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); + fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); + fflush(stderr); + return false; + } - return 0; + return 0; } // shader for displaying floating-point texture -static const char *shader_code = - "!!ARBfp1.0\n" - "TEX result.color, fragment.texcoord, texture[0], 2D; \n" - "END"; +static const char *shader_code = "!!ARBfp1.0\n" + "TEX result.color, fragment.texcoord, texture[0], 2D; \n" + "END"; -GLuint compileASMShader(GLenum program_type, const char *code) { - GLuint program_id; - glGenProgramsARB(1, &program_id); - glBindProgramARB(program_type, program_id); - 
glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, - (GLsizei)strlen(code), (GLubyte *)code); +GLuint compileASMShader(GLenum program_type, const char *code) +{ + GLuint program_id; + glGenProgramsARB(1, &program_id); + glBindProgramARB(program_type, program_id); + glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); - GLint error_pos; - glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); + GLint error_pos; + glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); - if (error_pos != -1) { - const GLubyte *error_string; - error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); - fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, - error_string); - return 0; - } + if (error_pos != -1) { + const GLubyte *error_string; + error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); + fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string); + return 0; + } - return program_id; + return program_id; } -void initOpenGLBuffers() { - printf("Creating GL texture...\n"); - glEnable(GL_TEXTURE_2D); - glGenTextures(1, &gl_Tex); - glBindTexture(GL_TEXTURE_2D, gl_Tex); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, imageW, imageH, 0, GL_RGBA, - GL_UNSIGNED_BYTE, h_Src); - printf("Texture created.\n"); +void initOpenGLBuffers() +{ + printf("Creating GL texture...\n"); + glEnable(GL_TEXTURE_2D); + glGenTextures(1, &gl_Tex); + glBindTexture(GL_TEXTURE_2D, gl_Tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, imageW, imageH, 0, GL_RGBA, GL_UNSIGNED_BYTE, h_Src); + printf("Texture created.\n"); - printf("Creating PBO...\n"); - glGenBuffers(1, &gl_PBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, gl_PBO); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, imageW * imageH * 4, h_Src, - GL_STREAM_COPY); - // While a PBO is registered to CUDA, it can't be used - // as the destination for OpenGL drawing calls. - // But in our particular case OpenGL is only used - // to display the content of the PBO, specified by CUDA kernels, - // so we need to register/unregister it only once. - // DEPRECATED: checkCudaErrors(cudaGLRegisterBufferObject(gl_PBO) ); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, gl_PBO, cudaGraphicsMapFlagsWriteDiscard)); - GLenum gl_error = glGetError(); + printf("Creating PBO...\n"); + glGenBuffers(1, &gl_PBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, gl_PBO); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, imageW * imageH * 4, h_Src, GL_STREAM_COPY); + // While a PBO is registered to CUDA, it can't be used + // as the destination for OpenGL drawing calls. + // But in our particular case OpenGL is only used + // to display the content of the PBO, specified by CUDA kernels, + // so we need to register/unregister it only once. 
+ // DEPRECATED: checkCudaErrors(cudaGLRegisterBufferObject(gl_PBO) ); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, cudaGraphicsMapFlagsWriteDiscard)); + GLenum gl_error = glGetError(); - if (gl_error != GL_NO_ERROR) { + if (gl_error != GL_NO_ERROR) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - char tmpStr[512]; - // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at - // the right line when the user double clicks on the error line in the - // Output pane. Like any compile error. - sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", __FILE__, __LINE__, - gluErrorString(gl_error)); - OutputDebugString(tmpStr); + char tmpStr[512]; + // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at + // the right line when the user double clicks on the error line in the + // Output pane. Like any compile error. + sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", __FILE__, __LINE__, gluErrorString(gl_error)); + OutputDebugString(tmpStr); #endif - fprintf(stderr, "GL Error in file '%s' in line %d :\n", __FILE__, __LINE__); - fprintf(stderr, "%s\n", gluErrorString(gl_error)); - exit(EXIT_FAILURE); - } - - printf("PBO created.\n"); - - // load shader program - shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); -} - -void cleanup() { - free(h_Src); - checkCudaErrors(CUDA_FreeArray()); - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); - - glDeleteProgramsARB(1, &shader); - - sdkDeleteTimer(&timer); -} - -void runAutoTest(int argc, char **argv, const char *filename, - int kernel_param) { - printf("[%s] - (automated testing w/ readback)\n", sSDKsample); - - int devID = findCudaDevice(argc, (const char **)argv); - - // First load the image, so we know what the size of the image (imageW and - // imageH) - printf("Allocating host and CUDA memory and loading image file...\n"); - const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); - - if (image_path == NULL) { - printf( - "imageDenoisingGL was unable to find and load image file " - ".\nExiting...\n"); - exit(EXIT_FAILURE); - } - - LoadBMPFile(&h_Src, &imageW, &imageH, image_path); - printf("Data init done.\n"); - - checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); - - TColor *d_dst = NULL; - unsigned char *h_dst = NULL; - checkCudaErrors( - cudaMalloc((void **)&d_dst, imageW * imageH * sizeof(TColor))); - h_dst = (unsigned char *)malloc(imageH * imageW * 4); - - { - g_Kernel = kernel_param; - printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]); - - runImageFilters(d_dst); - - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW * imageH * sizeof(TColor), - cudaMemcpyDeviceToHost)); - sdkSavePPM4ub(filename, h_dst, imageW, imageH); - } - - checkCudaErrors(CUDA_FreeArray()); - free(h_Src); - - checkCudaErrors(cudaFree(d_dst)); - free(h_dst); - - printf("\n[%s] -> Kernel %d, Saved: %s\n", sSDKsample, kernel_param, - filename); - - exit(g_TotalErrors == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); -} - -int main(int argc, char **argv) { - char *dump_file = NULL; - -#if defined(__linux__) - setenv("DISPLAY", ":0", 0); -#endif - - pArgc = &argc; - pArgv = argv; - - printf("%s Starting...\n\n", sSDKsample); - - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", - (char **)&dump_file); - - int kernel = 1; - - if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { - kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + fprintf(stderr, "GL Error in file '%s' in line %d :\n", __FILE__, __LINE__); + fprintf(stderr, "%s\n", gluErrorString(gl_error)); + exit(EXIT_FAILURE); } - runAutoTest(argc, argv, dump_file, kernel); - } else { - printf("[%s]\n", sSDKsample); + printf("PBO created.\n"); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n in OpenGL mode\n"); - printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); - printf(" > %s -device=n -qatest\n", argv[0]); - printf("exiting...\n"); - exit(EXIT_SUCCESS); - } + // load shader program + shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); +} + +void cleanup() +{ + free(h_Src); + checkCudaErrors(CUDA_FreeArray()); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); + + glDeleteProgramsARB(1, &shader); + + sdkDeleteTimer(&timer); +} + +void runAutoTest(int argc, char **argv, const char *filename, int kernel_param) +{ + printf("[%s] - (automated testing w/ readback)\n", sSDKsample); + + int devID = findCudaDevice(argc, (const char **)argv); // First load the image, so we know what the size of the image (imageW and // imageH) @@ -524,37 +433,117 @@ int main(int argc, char **argv) { const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { - printf( - "imageDenoisingGL was unable to find and load image file " - ".\nExiting...\n"); - exit(EXIT_FAILURE); + printf("imageDenoisingGL was unable to find and load image file " + ".\nExiting...\n"); + exit(EXIT_FAILURE); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); - initGL(&argc, argv); - findCudaDevice(argc, (const char **)argv); - checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); - initOpenGLBuffers(); - } + TColor *d_dst = NULL; + unsigned char *h_dst = NULL; + checkCudaErrors(cudaMalloc((void **)&d_dst, imageW * imageH * sizeof(TColor))); + h_dst = (unsigned char *)malloc(imageH * imageW * 4); - printf("Starting GLUT main loop...\n"); - printf("Press [1] to view noisy image\n"); - printf("Press [2] to view image restored with knn filter\n"); - printf("Press [3] to view image restored with nlm filter\n"); - printf("Press [4] to view image restored with modified nlm filter\n"); - printf( - "Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is " - "active\n"); - printf("Press [f] to print frame rate\n"); - printf("Press [?] 
to print Noise and Lerp Ct's\n"); - printf("Press [q] to exit\n"); + { + g_Kernel = kernel_param; + printf("[AutoTest]: %s <%s>\n", sSDKsample, filterMode[g_Kernel]); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); + runImageFilters(d_dst); - glutMainLoop(); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW * imageH * sizeof(TColor), cudaMemcpyDeviceToHost)); + sdkSavePPM4ub(filename, h_dst, imageW, imageH); + } + + checkCudaErrors(CUDA_FreeArray()); + free(h_Src); + + checkCudaErrors(cudaFree(d_dst)); + free(h_dst); + + printf("\n[%s] -> Kernel %d, Saved: %s\n", sSDKsample, kernel_param, filename); + + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + char *dump_file = NULL; + +#if defined(__linux__) + setenv("DISPLAY", ":0", 0); +#endif + + pArgc = &argc; + pArgv = argv; + + printf("%s Starting...\n\n", sSDKsample); + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&dump_file); + + int kernel = 1; + + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + kernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + } + + runAutoTest(argc, argv, dump_file, kernel); + } + else { + printf("[%s]\n", sSDKsample); + + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n in OpenGL mode\n"); + printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); + printf(" > %s -device=n -qatest\n", argv[0]); + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } + + // First load the image, so we know what the size of the image (imageW and + // imageH) + printf("Allocating host and CUDA memory and loading image file...\n"); + const char *image_path = sdkFindFilePath("portrait_noise.bmp", argv[0]); + + if (image_path == NULL) { + printf("imageDenoisingGL was unable to find and load image file " + ".\nExiting...\n"); + exit(EXIT_FAILURE); + } + + LoadBMPFile(&h_Src, &imageW, &imageH, image_path); + printf("Data init done.\n"); + + initGL(&argc, argv); + findCudaDevice(argc, (const char **)argv); + + checkCudaErrors(CUDA_MallocArray(&h_Src, imageW, imageH)); + + initOpenGLBuffers(); + } + + printf("Starting GLUT main loop...\n"); + printf("Press [1] to view noisy image\n"); + printf("Press [2] to view image restored with knn filter\n"); + printf("Press [3] to view image restored with nlm filter\n"); + printf("Press [4] to view image restored with modified nlm filter\n"); + printf("Press [*] to view smooth/edgy areas [RED/BLUE] Ct's when a filter is " + "active\n"); + printf("Press [f] to print frame rate\n"); + printf("Press [?] to print Noise and Lerp Ct's\n"); + printf("Press [q] to exit\n"); + + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + + glutMainLoop(); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_copy_kernel.cuh b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_copy_kernel.cuh index 1ddf8265..06badf3b 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_copy_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_copy_kernel.cuh @@ -25,24 +25,24 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -__global__ void Copy(TColor *dst, int imageW, int imageH, - cudaTextureObject_t texImage) { - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void Copy(TColor *dst, int imageW, int imageH, cudaTextureObject_t texImage) +{ + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix < imageW && iy < imageH) { - float4 fresult = tex2D<float4>(texImage, x, y); - dst[imageW * iy + ix] = make_color(fresult.x, fresult.y, fresult.z, 0); - } + if (ix < imageW && iy < imageH) { + float4 fresult = tex2D<float4>(texImage, x, y); + dst[imageW * iy + ix] = make_color(fresult.x, fresult.y, fresult.z, 0); + } } -extern "C" void cuda_Copy(TColor *d_dst, int imageW, int imageH, - cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void cuda_Copy(TColor *d_dst, int imageW, int imageH, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - Copy<<<grid, threads>>>(d_dst, imageW, imageH, texImage); + Copy<<<grid, threads>>>(d_dst, imageW, imageH, texImage); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_knn_kernel.cuh b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_knn_kernel.cuh index a58da5d4..de28d044 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_knn_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_knn_kernel.cuh @@ -28,119 +28,117 @@ //////////////////////////////////////////////////////////////////////////////// // KNN kernel //////////////////////////////////////////////////////////////////////////////// -__global__ void KNN(TColor *dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void KNN(TColor *dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix < imageW && iy < imageH) { - // Normalized counter for the weight threshold - float fCount = 0; - // Total sum of pixel weights - float sumWeights = 0; - // Result accumulator - float3 clr = {0, 0, 0}; - // Center of the KNN window - float4 clr00 = tex2D<float4>(texImage, x, y); + if (ix < imageW && iy < imageH) { + // Normalized counter for the weight threshold + float fCount = 0; + // Total sum of pixel weights + float sumWeights = 0; + // Result accumulator + float3 clr = {0, 0, 0}; + // Center of the KNN window + float4 clr00 = tex2D<float4>(texImage, x, y); - // Cycle through KNN window, surrounding (x, y) texel - for (float i = -KNN_WINDOW_RADIUS; i <= KNN_WINDOW_RADIUS; i++) - for (float j = -KNN_WINDOW_RADIUS; j <= KNN_WINDOW_RADIUS;
j++) { - float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); - float distanceIJ = vecLen(clr00, clrIJ); + // Cycle through KNN window, surrounding (x, y) texel + for (float i = -KNN_WINDOW_RADIUS; i <= KNN_WINDOW_RADIUS; i++) + for (float j = -KNN_WINDOW_RADIUS; j <= KNN_WINDOW_RADIUS; j++) { + float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); + float distanceIJ = vecLen(clr00, clrIJ); - // Derive final weight from color distance - float weightIJ = __expf( - -(distanceIJ * Noise + (i * i + j * j) * INV_KNN_WINDOW_AREA)); + // Derive final weight from color distance + float weightIJ = __expf(-(distanceIJ * Noise + (i * i + j * j) * INV_KNN_WINDOW_AREA)); - // Accumulate (x + j, y + i) texel color with computed weight - clr.x += clrIJ.x * weightIJ; - clr.y += clrIJ.y * weightIJ; - clr.z += clrIJ.z * weightIJ; + // Accumulate (x + j, y + i) texel color with computed weight + clr.x += clrIJ.x * weightIJ; + clr.y += clrIJ.y * weightIJ; + clr.z += clrIJ.z * weightIJ; - // Sum of weights for color normalization to [0..1] range - sumWeights += weightIJ; + // Sum of weights for color normalization to [0..1] range + sumWeights += weightIJ; - // Update weight counter, if KNN weight for current window texel - // exceeds the weight threshold - fCount += (weightIJ > KNN_WEIGHT_THRESHOLD) ? INV_KNN_WINDOW_AREA : 0; - } + // Update weight counter, if KNN weight for current window texel + // exceeds the weight threshold + fCount += (weightIJ > KNN_WEIGHT_THRESHOLD) ? INV_KNN_WINDOW_AREA : 0; + } - // Normalize result color by sum of weights - sumWeights = 1.0f / sumWeights; - clr.x *= sumWeights; - clr.y *= sumWeights; - clr.z *= sumWeights; + // Normalize result color by sum of weights + sumWeights = 1.0f / sumWeights; + clr.x *= sumWeights; + clr.y *= sumWeights; + clr.z *= sumWeights; - // Choose LERP quotient basing on how many texels - // within the KNN window exceeded the weight threshold - float lerpQ = (fCount > KNN_LERP_THRESHOLD) ? lerpC : 1.0f - lerpC; + // Choose LERP quotient basing on how many texels + // within the KNN window exceeded the weight threshold + float lerpQ = (fCount > KNN_LERP_THRESHOLD) ?
lerpC : 1.0f - lerpC; - // Write final result to global memory - clr.x = lerpf(clr.x, clr00.x, lerpQ); - clr.y = lerpf(clr.y, clr00.y, lerpQ); - clr.z = lerpf(clr.z, clr00.z, lerpQ); - dst[imageW * iy + ix] = make_color(clr.x, clr.y, clr.z, 0); - }; + // Write final result to global memory + clr.x = lerpf(clr.x, clr00.x, lerpQ); + clr.y = lerpf(clr.y, clr00.y, lerpQ); + clr.z = lerpf(clr.z, clr00.z, lerpQ); + dst[imageW * iy + ix] = make_color(clr.x, clr.y, clr.z, 0); + }; } -extern "C" void cuda_KNN(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void cuda_KNN(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - KNN<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); + KNN<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); } //////////////////////////////////////////////////////////////////////////////// // Stripped KNN kernel, only highlighting areas with different LERP directions //////////////////////////////////////////////////////////////////////////////// -__global__ void KNNdiag(TColor *dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void KNNdiag(TColor *dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix < imageW && iy < imageH) { - // Normalized counter for the weight threshold - float fCount = 0; - // Center of the KNN window - float4 clr00 = tex2D<float4>(texImage, x, y); + if (ix < imageW && iy < imageH) { + // Normalized counter for the weight threshold + float fCount = 0; + // Center of the KNN window + float4 clr00 = tex2D<float4>(texImage, x, y); - // Cycle through KNN window, surrounding (x, y) texel - for (float i = -KNN_WINDOW_RADIUS; i <= KNN_WINDOW_RADIUS; i++) - for (float j = -KNN_WINDOW_RADIUS; j <= KNN_WINDOW_RADIUS; j++) { - float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); - float distanceIJ = vecLen(clr00, clrIJ); + // Cycle through KNN window, surrounding (x, y) texel + for (float i = -KNN_WINDOW_RADIUS; i <= KNN_WINDOW_RADIUS; i++) + for (float j = -KNN_WINDOW_RADIUS; j <= KNN_WINDOW_RADIUS; j++) { + float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); + float distanceIJ = vecLen(clr00, clrIJ); - // Derive final weight from color and geometric distance - float weightIJ = __expf( - -(distanceIJ * Noise + (i * i + j * j) * INV_KNN_WINDOW_AREA)); + // Derive final weight from color and geometric distance + float weightIJ = __expf(-(distanceIJ * Noise + (i * i + j * j) * INV_KNN_WINDOW_AREA)); - // Update weight counter, if KNN weight for current window texel - // exceeds the weight threshold - fCount += - (weightIJ > KNN_WEIGHT_THRESHOLD) ?
INV_KNN_WINDOW_AREA : 0.0f; - } + // Update weight counter, if KNN weight for current window texel + // exceeds the weight threshold + fCount += (weightIJ > KNN_WEIGHT_THRESHOLD) ? INV_KNN_WINDOW_AREA : 0.0f; + } - // Choose LERP quotient basing on how many texels - // within the KNN window exceeded the weight threshold - float lerpQ = (fCount > KNN_LERP_THRESHOLD) ? 1.0f : 0; + // Choose LERP quotient basing on how many texels + // within the KNN window exceeded the weight threshold + float lerpQ = (fCount > KNN_LERP_THRESHOLD) ? 1.0f : 0; - // Write final result to global memory - dst[imageW * iy + ix] = make_color(lerpQ, 0, (1.0f - lerpQ), 0); - }; + // Write final result to global memory + dst[imageW * iy + ix] = make_color(lerpQ, 0, (1.0f - lerpQ), 0); + }; } -extern "C" void cuda_KNNdiag(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void +cuda_KNNdiag(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - KNNdiag<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); + KNNdiag<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm2_kernel.cuh b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm2_kernel.cuh index 84175a4b..97af5ae3 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm2_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm2_kernel.cuh @@ -48,170 +48,166 @@ namespace cg = cooperative_groups; -__global__ void NLM2(TColor *dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void NLM2(TColor *dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - // Weights cache - __shared__ float fWeights[BLOCKDIM_X * BLOCKDIM_Y]; + // Weights cache + __shared__ float fWeights[BLOCKDIM_X * BLOCKDIM_Y]; - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; - const float cx = blockDim.x * blockIdx.x + NLM_WINDOW_RADIUS + 0.5f; - const float cy = blockDim.x * blockIdx.y + NLM_WINDOW_RADIUS + 0.5f; + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; + const float cx = blockDim.x * blockIdx.x + NLM_WINDOW_RADIUS + 0.5f; + const float cy = blockDim.x * blockIdx.y + NLM_WINDOW_RADIUS + 0.5f; - if (ix < imageW && iy < imageH) { - // Find color distance from current texel to the center of NLM window - float weight = 0; + if (ix < imageW && iy < imageH) { + // Find color distance from current texel to the center of NLM window + float weight = 0; - for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS; n++) - for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS; m++) - weight +=
vecLen(tex2D<float4>(texImage, cx + m, cy + n), - tex2D<float4>(texImage, x + m, y + n)); + for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS; n++) + for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS; m++) + weight += vecLen(tex2D<float4>(texImage, cx + m, cy + n), tex2D<float4>(texImage, x + m, y + n)); - // Geometric distance from current texel to the center of NLM window - float dist = - (threadIdx.x - NLM_WINDOW_RADIUS) * (threadIdx.x - NLM_WINDOW_RADIUS) + - (threadIdx.y - NLM_WINDOW_RADIUS) * (threadIdx.y - NLM_WINDOW_RADIUS); + // Geometric distance from current texel to the center of NLM window + float dist = (threadIdx.x - NLM_WINDOW_RADIUS) * (threadIdx.x - NLM_WINDOW_RADIUS) + + (threadIdx.y - NLM_WINDOW_RADIUS) * (threadIdx.y - NLM_WINDOW_RADIUS); - // Derive final weight from color and geometric distance - weight = __expf(-(weight * Noise + dist * INV_NLM_WINDOW_AREA)); + // Derive final weight from color and geometric distance + weight = __expf(-(weight * Noise + dist * INV_NLM_WINDOW_AREA)); - // Write the result to shared memory - fWeights[threadIdx.y * BLOCKDIM_X + threadIdx.x] = weight; - // Wait until all the weights are ready - cg::sync(cta); + // Write the result to shared memory + fWeights[threadIdx.y * BLOCKDIM_X + threadIdx.x] = weight; + // Wait until all the weights are ready + cg::sync(cta); - // Normalized counter for the NLM weight threshold - float fCount = 0; - // Total sum of pixel weights - float sumWeights = 0; - // Result accumulator - float3 clr = {0, 0, 0}; + // Normalized counter for the NLM weight threshold + float fCount = 0; + // Total sum of pixel weights + float sumWeights = 0; + // Result accumulator + float3 clr = {0, 0, 0}; - int idx = 0; + int idx = 0; - // Cycle through NLM window, surrounding (x, y) texel - for (float i = -NLM_WINDOW_RADIUS; i <= NLM_WINDOW_RADIUS + 1; i++) - for (float j = -NLM_WINDOW_RADIUS; j <= NLM_WINDOW_RADIUS + 1; j++) { - // Load precomputed weight - float weightIJ = fWeights[idx++]; + // Cycle through NLM window, surrounding (x, y) texel + for (float i = -NLM_WINDOW_RADIUS; i <= NLM_WINDOW_RADIUS + 1; i++) + for (float j = -NLM_WINDOW_RADIUS; j <= NLM_WINDOW_RADIUS + 1; j++) { + // Load precomputed weight + float weightIJ = fWeights[idx++]; - // Accumulate (x + j, y + i) texel color with computed weight - float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); - clr.x += clrIJ.x * weightIJ; - clr.y += clrIJ.y * weightIJ; - clr.z += clrIJ.z * weightIJ; + // Accumulate (x + j, y + i) texel color with computed weight + float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); + clr.x += clrIJ.x * weightIJ; + clr.y += clrIJ.y * weightIJ; + clr.z += clrIJ.z * weightIJ; - // Sum of weights for color normalization to [0..1] range - sumWeights += weightIJ; + // Sum of weights for color normalization to [0..1] range + sumWeights += weightIJ; - // Update weight counter, if NLM weight for current window texel - // exceeds the weight threshold - fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; - } + // Update weight counter, if NLM weight for current window texel + // exceeds the weight threshold + fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ?
INV_NLM_WINDOW_AREA : 0; + } - // Normalize result color by sum of weights - sumWeights = 1.0f / sumWeights; - clr.x *= sumWeights; - clr.y *= sumWeights; - clr.z *= sumWeights; + // Normalize result color by sum of weights + sumWeights = 1.0f / sumWeights; + clr.x *= sumWeights; + clr.y *= sumWeights; + clr.z *= sumWeights; - // Choose LERP quotient basing on how many texels - // within the NLM window exceeded the weight threshold - float lerpQ = (fCount > NLM_LERP_THRESHOLD) ? lerpC : 1.0f - lerpC; + // Choose LERP quotient basing on how many texels + // within the NLM window exceeded the weight threshold + float lerpQ = (fCount > NLM_LERP_THRESHOLD) ? lerpC : 1.0f - lerpC; - // Write final result to global memory - float4 clr00 = tex2D<float4>(texImage, x, y); - clr.x = lerpf(clr.x, clr00.x, lerpQ); - clr.y = lerpf(clr.y, clr00.y, lerpQ); - clr.z = lerpf(clr.z, clr00.z, lerpQ); - dst[imageW * iy + ix] = make_color(clr.x, clr.y, clr.z, 0); - } + // Write final result to global memory + float4 clr00 = tex2D<float4>(texImage, x, y); + clr.x = lerpf(clr.x, clr00.x, lerpQ); + clr.y = lerpf(clr.y, clr00.y, lerpQ); + clr.z = lerpf(clr.z, clr00.z, lerpQ); + dst[imageW * iy + ix] = make_color(clr.x, clr.y, clr.z, 0); + } } -extern "C" void cuda_NLM2(TColor *d_dst, int imageW, int imageH, float Noise, - float LerpC, cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void cuda_NLM2(TColor *d_dst, int imageW, int imageH, float Noise, float LerpC, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - NLM2<<<grid, threads>>>(d_dst, imageW, imageH, Noise, LerpC, texImage); + NLM2<<<grid, threads>>>(d_dst, imageW, imageH, Noise, LerpC, texImage); } //////////////////////////////////////////////////////////////////////////////// // Stripped NLM2 kernel, only highlighting areas with different LERP directions //////////////////////////////////////////////////////////////////////////////// -__global__ void NLM2diag(TColor *dst, int imageW, int imageH, float Noise, - float LerpC, cudaTextureObject_t texImage) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void NLM2diag(TColor *dst, int imageW, int imageH, float Noise, float LerpC, cudaTextureObject_t texImage) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - // Weights cache - __shared__ float fWeights[BLOCKDIM_X * BLOCKDIM_Y]; + // Weights cache + __shared__ float fWeights[BLOCKDIM_X * BLOCKDIM_Y]; - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; - const float cx = blockDim.x * blockIdx.x + NLM_WINDOW_RADIUS + 0.5f; - const float cy = blockDim.x * blockIdx.y + NLM_WINDOW_RADIUS + 0.5f; + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; + const float cx = blockDim.x * blockIdx.x + NLM_WINDOW_RADIUS + 0.5f; + const float cy = blockDim.x * blockIdx.y + NLM_WINDOW_RADIUS + 0.5f; - if (ix < imageW && iy < imageH) { -
// Find color distance from current texel to the center of NLM window - float weight = 0; + if (ix < imageW && iy < imageH) { + // Find color distance from current texel to the center of NLM window + float weight = 0; - for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS + 1; n++) - for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS + 1; m++) - weight += vecLen(tex2D<float4>(texImage, cx + m, cy + n), - tex2D<float4>(texImage, x + m, y + n)); + for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS + 1; n++) + for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS + 1; m++) + weight += vecLen(tex2D<float4>(texImage, cx + m, cy + n), tex2D<float4>(texImage, x + m, y + n)); - // Geometric distance from current texel to the center of NLM window - float dist = - (threadIdx.x - NLM_WINDOW_RADIUS) * (threadIdx.x - NLM_WINDOW_RADIUS) + - (threadIdx.y - NLM_WINDOW_RADIUS) * (threadIdx.y - NLM_WINDOW_RADIUS); + // Geometric distance from current texel to the center of NLM window + float dist = (threadIdx.x - NLM_WINDOW_RADIUS) * (threadIdx.x - NLM_WINDOW_RADIUS) + + (threadIdx.y - NLM_WINDOW_RADIUS) * (threadIdx.y - NLM_WINDOW_RADIUS); - // Derive final weight from color and geometric distance - weight = __expf(-(weight * Noise + dist * INV_NLM_WINDOW_AREA)); + // Derive final weight from color and geometric distance + weight = __expf(-(weight * Noise + dist * INV_NLM_WINDOW_AREA)); - // Write the result to shared memory - fWeights[threadIdx.y * BLOCKDIM_X + threadIdx.x] = weight; - // Wait until all the weights are ready - cg::sync(cta); + // Write the result to shared memory + fWeights[threadIdx.y * BLOCKDIM_X + threadIdx.x] = weight; + // Wait until all the weights are ready + cg::sync(cta); - // Normalized counter for the NLM weight threshold - float fCount = 0; - int idx = 0; + // Normalized counter for the NLM weight threshold + float fCount = 0; + int idx = 0; - // Cycle through NLM window, surrounding (x, y) texel - for (float n = -NLM_WINDOW_RADIUS; n <= NLM_WINDOW_RADIUS + 1; n++) - for (float m = -NLM_WINDOW_RADIUS; m <= NLM_WINDOW_RADIUS + 1; m++) { - // Load precomputed weight - float weightIJ = fWeights[idx++]; + // Cycle through NLM window, surrounding (x, y) texel + for (float n = -NLM_WINDOW_RADIUS; n <= NLM_WINDOW_RADIUS + 1; n++) + for (float m = -NLM_WINDOW_RADIUS; m <= NLM_WINDOW_RADIUS + 1; m++) { + // Load precomputed weight + float weightIJ = fWeights[idx++]; - // Update weight counter, if NLM weight for current window texel - // exceeds the weight threshold - fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; - } + // Update weight counter, if NLM weight for current window texel + // exceeds the weight threshold + fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; + } - // Choose LERP quotient basing on how many texels - // within the NLM window exceeded the weight threshold - float lerpQ = (fCount > NLM_LERP_THRESHOLD) ?
1.0f : 0.0f; - // Write final result to global memory - dst[imageW * iy + ix] = make_color(lerpQ, 0, (1.0f - lerpQ), 0); - }; + // Write final result to global memory + dst[imageW * iy + ix] = make_color(lerpQ, 0, (1.0f - lerpQ), 0); + }; } -extern "C" void cuda_NLM2diag(TColor *d_dst, int imageW, int imageH, - float Noise, float LerpC, - cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void +cuda_NLM2diag(TColor *d_dst, int imageW, int imageH, float Noise, float LerpC, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - NLM2diag<<<grid, threads>>>(d_dst, imageW, imageH, Noise, LerpC, texImage); + NLM2diag<<<grid, threads>>>(d_dst, imageW, imageH, Noise, LerpC, texImage); } diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm_kernel.cuh b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm_kernel.cuh index 2e1b9732..60a2a5a6 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/imageDenoising_nlm_kernel.cuh @@ -28,126 +28,125 @@ //////////////////////////////////////////////////////////////////////////////// // NLM kernel //////////////////////////////////////////////////////////////////////////////// -__global__ void NLM(TColor *dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void NLM(TColor *dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix < imageW && iy < imageH) { - // Normalized counter for the NLM weight threshold - float fCount = 0; - // Total sum of pixel weights - float sumWeights = 0; - // Result accumulator - float3 clr = {0, 0, 0}; + if (ix < imageW && iy < imageH) { + // Normalized counter for the NLM weight threshold + float fCount = 0; + // Total sum of pixel weights + float sumWeights = 0; + // Result accumulator + float3 clr = {0, 0, 0}; - // Cycle through NLM window, surrounding (x, y) texel - for (float i = -NLM_WINDOW_RADIUS; i <= NLM_WINDOW_RADIUS; i++) - for (float j = -NLM_WINDOW_RADIUS; j <= NLM_WINDOW_RADIUS; j++) { - // Find color distance from (x, y) to (x + j, y + i) - float weightIJ = 0; + // Cycle through NLM window, surrounding (x, y) texel + for (float i = -NLM_WINDOW_RADIUS; i <= NLM_WINDOW_RADIUS; i++) + for (float j = -NLM_WINDOW_RADIUS; j <= NLM_WINDOW_RADIUS; j++) { + // Find color distance from (x, y) to (x + j, y + i) + float weightIJ = 0; - for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS; n++) - for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS; m++) - weightIJ += vecLen(tex2D<float4>(texImage, x + j + m, y + i + n), - tex2D<float4>(texImage, x + m, y +
n)); + for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS; n++) + for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS; m++) + weightIJ += vecLen(tex2D<float4>(texImage, x + j + m, y + i + n), + tex2D<float4>(texImage, x + m, y + n)); - // Derive final weight from color and geometric distance - weightIJ = - __expf(-(weightIJ * Noise + (i * i + j * j) * INV_NLM_WINDOW_AREA)); + // Derive final weight from color and geometric distance + weightIJ = __expf(-(weightIJ * Noise + (i * i + j * j) * INV_NLM_WINDOW_AREA)); - // Accumulate (x + j, y + i) texel color with computed weight - float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); - clr.x += clrIJ.x * weightIJ; - clr.y += clrIJ.y * weightIJ; - clr.z += clrIJ.z * weightIJ; + // Accumulate (x + j, y + i) texel color with computed weight + float4 clrIJ = tex2D<float4>(texImage, x + j, y + i); + clr.x += clrIJ.x * weightIJ; + clr.y += clrIJ.y * weightIJ; + clr.z += clrIJ.z * weightIJ; - // Sum of weights for color normalization to [0..1] range - sumWeights += weightIJ; + // Sum of weights for color normalization to [0..1] range + sumWeights += weightIJ; - // Update weight counter, if NLM weight for current window texel - // exceeds the weight threshold - fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; - } + // Update weight counter, if NLM weight for current window texel + // exceeds the weight threshold + fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; + } - // Normalize result color by sum of weights - sumWeights = 1.0f / sumWeights; - clr.x *= sumWeights; - clr.y *= sumWeights; - clr.z *= sumWeights; + // Normalize result color by sum of weights + sumWeights = 1.0f / sumWeights; + clr.x *= sumWeights; + clr.y *= sumWeights; + clr.z *= sumWeights; - // Choose LERP quotient basing on how many texels - // within the NLM window exceeded the weight threshold - float lerpQ = (fCount > NLM_LERP_THRESHOLD) ? lerpC : 1.0f - lerpC; + // Choose LERP quotient basing on how many texels + // within the NLM window exceeded the weight threshold + float lerpQ = (fCount > NLM_LERP_THRESHOLD) ?
lerpC : 1.0f - lerpC; - // Write final result to global memory - float4 clr00 = tex2D<float4>(texImage, x, y); - clr.x = lerpf(clr.x, clr00.x, lerpQ); - clr.y = lerpf(clr.y, clr00.y, lerpQ); - clr.z = lerpf(clr.z, clr00.z, lerpQ); - dst[imageW * iy + ix] = make_color(clr.x, clr.y, clr.z, 0); - } + // Write final result to global memory + float4 clr00 = tex2D<float4>(texImage, x, y); + clr.x = lerpf(clr.x, clr00.x, lerpQ); + clr.y = lerpf(clr.y, clr00.y, lerpQ); + clr.z = lerpf(clr.z, clr00.z, lerpQ); + dst[imageW * iy + ix] = make_color(clr.x, clr.y, clr.z, 0); + } } -extern "C" void cuda_NLM(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void cuda_NLM(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - NLM<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); + NLM<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); } //////////////////////////////////////////////////////////////////////////////// // Stripped NLM kernel, only highlighting areas with different LERP directions //////////////////////////////////////////////////////////////////////////////// -__global__ void NLMdiag(TColor *dst, unsigned int imageW, unsigned int imageH, - float Noise, float lerpC, - cudaTextureObject_t texImage) { - const int ix = blockDim.x * blockIdx.x + threadIdx.x; - const int iy = blockDim.y * blockIdx.y + threadIdx.y; - // Add half of a texel to always address exact texel centers - const float x = (float)ix + 0.5f; - const float y = (float)iy + 0.5f; +__global__ void +NLMdiag(TColor *dst, unsigned int imageW, unsigned int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + // Add half of a texel to always address exact texel centers + const float x = (float)ix + 0.5f; + const float y = (float)iy + 0.5f; - if (ix < imageW && iy < imageH) { - // Normalized counter for the weight threshold - float fCount = 0; + if (ix < imageW && iy < imageH) { + // Normalized counter for the weight threshold + float fCount = 0; - // Cycle through NLM window, surrounding (x, y) texel - for (float i = -NLM_WINDOW_RADIUS; i <= NLM_WINDOW_RADIUS; i++) - for (float j = -NLM_WINDOW_RADIUS; j <= NLM_WINDOW_RADIUS; j++) { - // Find color distance between (x, y) and (x + j, y + i) - float weightIJ = 0; + // Cycle through NLM window, surrounding (x, y) texel + for (float i = -NLM_WINDOW_RADIUS; i <= NLM_WINDOW_RADIUS; i++) + for (float j = -NLM_WINDOW_RADIUS; j <= NLM_WINDOW_RADIUS; j++) { + // Find color distance between (x, y) and (x + j, y + i) + float weightIJ = 0; - for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS; n++) - for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS; m++) - weightIJ += vecLen(tex2D<float4>(texImage, x + j + m, y + i + n), - tex2D<float4>(texImage, x + m, y + n)); + for (float n = -NLM_BLOCK_RADIUS; n <= NLM_BLOCK_RADIUS; n++) + for (float m = -NLM_BLOCK_RADIUS; m <= NLM_BLOCK_RADIUS; m++) + weightIJ += vecLen(tex2D<float4>(texImage, x + j + m, y + i + n), + tex2D<float4>(texImage, x + m, y + n)); - // Derive final weight from color and geometric
distance + weightIJ = __expf(-(weightIJ * Noise + (i * i + j * j) * INV_NLM_WINDOW_AREA)); - // Increase the weight threshold counter - fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; - } + // Increase the weight threshold counter + fCount += (weightIJ > NLM_WEIGHT_THRESHOLD) ? INV_NLM_WINDOW_AREA : 0; + } - // Choose LERP quotient basing on how many texels - // within the NLM window exceeded the LERP threshold - float lerpQ = (fCount > NLM_LERP_THRESHOLD) ? 1.0f : 0; + // Choose LERP quotient basing on how many texels + // within the NLM window exceeded the LERP threshold + float lerpQ = (fCount > NLM_LERP_THRESHOLD) ? 1.0f : 0; - // Write final result to global memory - dst[imageW * iy + ix] = make_color(lerpQ, 0, (1.0f - lerpQ), 0); - }; + // Write final result to global memory + dst[imageW * iy + ix] = make_color(lerpQ, 0, (1.0f - lerpQ), 0); + }; } -extern "C" void cuda_NLMdiag(TColor *d_dst, int imageW, int imageH, float Noise, - float lerpC, cudaTextureObject_t texImage) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +extern "C" void +cuda_NLMdiag(TColor *d_dst, int imageW, int imageH, float Noise, float lerpC, cudaTextureObject_t texImage) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - NLMdiag<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); + NLMdiag<<<grid, threads>>>(d_dst, imageW, imageH, Noise, lerpC, texImage); } diff --git a/Samples/2_Concepts_and_Techniques/inlinePTX/inlinePTX.cu b/Samples/2_Concepts_and_Techniques/inlinePTX/inlinePTX.cu index 26fec901..49a02d14 100644 --- a/Samples/2_Concepts_and_Techniques/inlinePTX/inlinePTX.cu +++ b/Samples/2_Concepts_and_Techniques/inlinePTX/inlinePTX.cu @@ -30,24 +30,23 @@ */ // System includes -#include <stdio.h> #include <assert.h> +#include <stdio.h> // CUDA runtime #include <cuda_runtime.h> // helper functions and utilities to work with CUDA -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> __global__ void sequence_gpu(int *d_ptr, int length) { int elemID = blockIdx.x * blockDim.x + threadIdx.x; - if (elemID < length) - { + if (elemID < length) { unsigned int laneid; - //This command gets the lane ID within the current warp + // This command gets the lane ID within the current warp asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); d_ptr[elemID] = laneid; } @@ -56,8 +55,7 @@ __global__ void sequence_gpu(int *d_ptr, int length) void sequence_cpu(int *h_ptr, int length) { - for (int elemID=0; elemID<length; elemID++) - { + for (int elemID = 0; elemID < length; elemID++) { h_ptr[elemID] = elemID % 32; } } sequence_gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N); checkCudaErrors(cudaGetLastError()); @@ -90,15 +87,13 @@ int main(int argc, char **argv) sequence_cpu(h_ptr, N); int *h_d_ptr; - checkCudaErrors(cudaMallocHost(&h_d_ptr, N *sizeof(int))); - checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMallocHost(&h_d_ptr, N * sizeof(int))); + checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N * sizeof(int), cudaMemcpyDeviceToHost)); bool bValid = true; - for (int i=0; i<N && bValid; i++) diff --git a/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX.cpp b/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX.cpp --- a/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX.cpp +++ b/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX.cpp // System includes -#include <stdio.h> #include <assert.h> +#include <stdio.h> // CUDA runtime #include <cuda_runtime.h> @@ -40,63 +40,69 @@ // helper functions and utilities to work with CUDA #include <helper_functions.h> -void sequence_cpu(int *h_ptr, int length) { - for (int elemID = 0; elemID < length; elemID++) { - h_ptr[elemID] = elemID % 32; - } -} - -int main(int argc, char **argv) { - printf("CUDA inline PTX assembler sample\n"); - - char *cubin, *kernel_file; - size_t cubinSize; - - kernel_file = sdkFindFilePath("inlinePTX_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - - CUmodule module = loadCUBIN(cubin, argc, argv); - - CUfunction kernel_addr;
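The inlinePTX hunks above center on a single idiom: moving the PTX special register %laneid into a C variable with one asm statement. A self-contained sketch of that idiom, hedged as an illustration (the kernel name and the warpSize remark below are not part of the patch):

__global__ void laneid_demo(unsigned int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        unsigned int laneid;
        // "=r" asks the compiler for a 32-bit register to receive %laneid.
        asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
        out[i] = laneid; // for 1-D blocks this equals threadIdx.x % warpSize (0..31)
    }
}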
- - checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "sequence_gpu")); - - const int N = 1000; - int *h_ptr = (int *)malloc(N * sizeof(int)); - - dim3 cudaBlockSize(256, 1, 1); - dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1); - - CUdeviceptr d_ptr; - checkCudaErrors(cuMemAlloc(&d_ptr, N * sizeof(int))); - - void *arr[] = {(void *)&d_ptr, (void *)&N}; - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - cudaBlockSize.x, cudaBlockSize.y, - cudaBlockSize.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &arr[0], /* arguments */ - 0)); - - checkCudaErrors(cuCtxSynchronize()); - - sequence_cpu(h_ptr, N); - - int *h_d_ptr = (int *)malloc(N * sizeof(int)); - checkCudaErrors(cuMemcpyDtoH(h_d_ptr, d_ptr, N * sizeof(int))); - - bool bValid = true; - - for (int i = 0; i < N && bValid; i++) { - if (h_ptr[i] != h_d_ptr[i]) { - bValid = false; +void sequence_cpu(int *h_ptr, int length) +{ + for (int elemID = 0; elemID < length; elemID++) { + h_ptr[elemID] = elemID % 32; } - } - - printf("Test %s.\n", bValid ? "Successful" : "Failed"); - - checkCudaErrors(cuMemFree(d_ptr)); - - return bValid ? EXIT_SUCCESS : EXIT_FAILURE; +} + +int main(int argc, char **argv) +{ + printf("CUDA inline PTX assembler sample\n"); + + char *cubin, *kernel_file; + size_t cubinSize; + + kernel_file = sdkFindFilePath("inlinePTX_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + + CUmodule module = loadCUBIN(cubin, argc, argv); + + CUfunction kernel_addr; + + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "sequence_gpu")); + + const int N = 1000; + int *h_ptr = (int *)malloc(N * sizeof(int)); + + dim3 cudaBlockSize(256, 1, 1); + dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1); + + CUdeviceptr d_ptr; + checkCudaErrors(cuMemAlloc(&d_ptr, N * sizeof(int))); + + void *arr[] = {(void *)&d_ptr, (void *)&N}; + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, + cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); + + checkCudaErrors(cuCtxSynchronize()); + + sequence_cpu(h_ptr, N); + + int *h_d_ptr = (int *)malloc(N * sizeof(int)); + checkCudaErrors(cuMemcpyDtoH(h_d_ptr, d_ptr, N * sizeof(int))); + + bool bValid = true; + + for (int i = 0; i < N && bValid; i++) { + if (h_ptr[i] != h_d_ptr[i]) { + bValid = false; + } + } + + printf("Test %s.\n", bValid ? "Successful" : "Failed"); + + checkCudaErrors(cuMemFree(d_ptr)); + + return bValid ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX_kernel.cu b/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX_kernel.cu index 3ecef91c..bbb2a905 100644 --- a/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX_kernel.cu +++ b/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/inlinePTX_kernel.cu @@ -25,15 +25,16 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -extern "C" __global__ void sequence_gpu(int *d_ptr, int length) { - int elemID = blockIdx.x * blockDim.x + threadIdx.x; +extern "C" __global__ void sequence_gpu(int *d_ptr, int length) +{ + int elemID = blockIdx.x * blockDim.x + threadIdx.x; - if (elemID < length) { - unsigned int laneid; + if (elemID < length) { + unsigned int laneid; - // This command gets the lane ID within the current warp - asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); + // This command gets the lane ID within the current warp + asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); - d_ptr[elemID] = laneid; - } + d_ptr[elemID] = laneid; + } } diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config.hpp index 055a2785..bbfc9a4e 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config.hpp @@ -1,8 +1,8 @@ // Boost config.hpp configuration header file ------------------------------// -// (C) Copyright John Maddock 2002. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2002. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org/libs/config for most recent version. @@ -19,52 +19,41 @@ // if we don't have a user config, then use the default location: #if !defined(BOOST_USER_CONFIG) && !defined(BOOST_NO_USER_CONFIG) -# define BOOST_USER_CONFIG +#define BOOST_USER_CONFIG #endif // include it first: #ifdef BOOST_USER_CONFIG -# include BOOST_USER_CONFIG +#include BOOST_USER_CONFIG #endif // if we don't have a compiler config set, try and find one: #if !defined(BOOST_COMPILER_CONFIG) && !defined(BOOST_NO_COMPILER_CONFIG) && !defined(BOOST_NO_CONFIG) -# include +#include #endif // if we have a compiler config, include it now: #ifdef BOOST_COMPILER_CONFIG -# include BOOST_COMPILER_CONFIG +#include BOOST_COMPILER_CONFIG #endif // if we don't have a std library config set, try and find one: #if !defined(BOOST_STDLIB_CONFIG) && !defined(BOOST_NO_STDLIB_CONFIG) && !defined(BOOST_NO_CONFIG) -# include +#include #endif // if we have a std library config, include it now: #ifdef BOOST_STDLIB_CONFIG -# include BOOST_STDLIB_CONFIG +#include BOOST_STDLIB_CONFIG #endif // if we don't have a platform config set, try and find one: #if !defined(BOOST_PLATFORM_CONFIG) && !defined(BOOST_NO_PLATFORM_CONFIG) && !defined(BOOST_NO_CONFIG) -# include +#include #endif // if we have a platform config, include it now: #ifdef BOOST_PLATFORM_CONFIG -# include BOOST_PLATFORM_CONFIG +#include BOOST_PLATFORM_CONFIG #endif // get config suffix code: #include -#endif // BOOST_CONFIG_HPP - - - - - - - - - - - +#endif // BOOST_CONFIG_HPP diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_prefix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_prefix.hpp index 49f42494..b140cb0e 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_prefix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_prefix.hpp @@ -1,6 +1,6 @@ -// (C) Copyright John Maddock 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2003. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // for C++ Builder the following options effect the ABI: @@ -21,7 +21,4 @@ // 8026 - functions taking class by value arguments are not expanded inline #pragma nopushoptwarn -# pragma option push -Vx -Ve -a8 -b -pc -Vmv -VC- -Vl- -w-8027 -w-8026 - - - +#pragma option push -Vx -Ve -a8 -b -pc -Vmv -VC- -Vl - -w -8027 -w-8026 diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_suffix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_suffix.hpp index 940535f3..c65a68fe 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_suffix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/borland_suffix.hpp @@ -1,12 +1,7 @@ -// (C) Copyright John Maddock 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -# pragma option pop +#pragma option pop #pragma nopushoptwarn - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_prefix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_prefix.hpp index 97f06cdc..c4ed6a06 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_prefix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_prefix.hpp @@ -1,6 +1,6 @@ -// (C) Copyright John Maddock 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // @@ -9,14 +9,12 @@ // code will no longer be binary compatible with the bjam built binaries // unless this header is included to force Boost code into a consistent ABI. // -// Note that inclusion of this header is only necessary for libraries with +// Note that inclusion of this header is only necessary for libraries with // separate source, header only libraries DO NOT need this as long as all // translation units are built with the same options. // #if defined(_M_X64) -# pragma pack(push,16) +#pragma pack(push, 16) #else -# pragma pack(push,8) +#pragma pack(push, 8) #endif - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_suffix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_suffix.hpp index a64d783e..0a67ee36 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_suffix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi/msvc_suffix.hpp @@ -1,8 +1,6 @@ -// (C) Copyright John Maddock 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #pragma pack(pop) - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_prefix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_prefix.hpp index 3b134749..577e0cc8 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_prefix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_prefix.hpp @@ -1,25 +1,24 @@ // abi_prefix header -------------------------------------------------------// // (c) Copyright John Maddock 2003 - + // Use, modification and distribution are subject to the Boost Software License, // Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt). #ifndef BOOST_CONFIG_ABI_PREFIX_HPP -# define BOOST_CONFIG_ABI_PREFIX_HPP +#define BOOST_CONFIG_ABI_PREFIX_HPP #else -# error double inclusion of header boost/config/abi_prefix.hpp is an error +#error double inclusion of header boost/config/abi_prefix.hpp is an error #endif #include <boost/config.hpp> // this must occur after all other includes and before any code appears: #ifdef BOOST_HAS_ABI_HEADERS -# include BOOST_ABI_PREFIX +#include BOOST_ABI_PREFIX #endif -#if defined( __BORLANDC__ ) +#if defined(__BORLANDC__) #pragma nopushoptwarn #endif - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_suffix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_suffix.hpp index 93916166..0fb1aaf0 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_suffix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/abi_suffix.hpp @@ -1,7 +1,7 @@ // abi_sufffix header -------------------------------------------------------// // (c) Copyright John Maddock 2003 - + // Use, modification and distribution are subject to the Boost Software License, // Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt). @@ -10,18 +10,16 @@ // <boost/config/abi_prefix.hpp>. #ifndef BOOST_CONFIG_ABI_PREFIX_HPP -# error Header boost/config/abi_suffix.hpp must only be used after boost/config/abi_prefix.hpp +#error Header boost/config/abi_suffix.hpp must only be used after boost/config/abi_prefix.hpp #else -# undef BOOST_CONFIG_ABI_PREFIX_HPP +#undef BOOST_CONFIG_ABI_PREFIX_HPP #endif // the suffix header occurs after all of our code: #ifdef BOOST_HAS_ABI_HEADERS -# include BOOST_ABI_SUFFIX +#include BOOST_ABI_SUFFIX #endif -#if defined( __BORLANDC__ ) +#if defined(__BORLANDC__) #pragma nopushoptwarn #endif - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/auto_link.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/auto_link.hpp index f2eb583f..36f71f34 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/auto_link.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/auto_link.hpp @@ -3,12 +3,12 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - /* - * LOCATION: see http://www.boost.org for most recent version. - * FILE auto_link.hpp - * VERSION see <boost/version.hpp> - * DESCRIPTION: Automatic library inclusion for Borland/Microsoft compilers. - */ +/* + * LOCATION: see http://www.boost.org for most recent version. + * FILE auto_link.hpp + * VERSION see <boost/version.hpp> + * DESCRIPTION: Automatic library inclusion for Borland/Microsoft compilers.
+ */ /************************************************************************* @@ -70,40 +70,38 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. ***************************************************************************/ #ifdef __cplusplus -# ifndef BOOST_CONFIG_HPP -# include <boost/config.hpp> -# endif +#ifndef BOOST_CONFIG_HPP +#include <boost/config.hpp> +#endif #elif defined(_MSC_VER) && !defined(__MWERKS__) && !defined(__EDG_VERSION__) // // C language compatability (no, honestly) // -# define BOOST_MSVC _MSC_VER -# define BOOST_STRINGIZE(X) BOOST_DO_STRINGIZE(X) -# define BOOST_DO_STRINGIZE(X) #X +#define BOOST_MSVC _MSC_VER +#define BOOST_STRINGIZE(X) BOOST_DO_STRINGIZE(X) +#define BOOST_DO_STRINGIZE(X) #X #endif // // Only include what follows for known and supported compilers: // -#if defined(BOOST_MSVC) \ - || defined(__BORLANDC__) \ - || (defined(__MWERKS__) && defined(_WIN32) && (__MWERKS__ >= 0x3000)) \ +#if defined(BOOST_MSVC) || defined(__BORLANDC__) || (defined(__MWERKS__) && defined(_WIN32) && (__MWERKS__ >= 0x3000)) \ || (defined(__ICL) && defined(_MSC_EXTENSIONS) && (_MSC_VER >= 1200)) #ifndef BOOST_VERSION_HPP -# include <boost/version.hpp> +#include <boost/version.hpp> #endif #ifndef BOOST_LIB_NAME -# error "Macro BOOST_LIB_NAME not set (internal error)" +#error "Macro BOOST_LIB_NAME not set (internal error)" #endif // // error check: // #if defined(__MSVC_RUNTIME_CHECKS) && !defined(_DEBUG) -# pragma message("Using the /RTC option without specifying a debug runtime will lead to linker errors") -# pragma message("Hint: go to the code generation options and switch to one of the debugging runtimes") -# error "Incompatible build options" +#pragma message("Using the /RTC option without specifying a debug runtime will lead to linker errors") +#pragma message("Hint: go to the code generation options and switch to one of the debugging runtimes") +#error "Incompatible build options" #endif // // select toolset if not defined already: @@ -112,58 +110,58 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. 
// Note: no compilers before 1200 are supported #if defined(BOOST_MSVC) && (BOOST_MSVC < 1300) -# ifdef UNDER_CE - // vc6: -# define BOOST_LIB_TOOLSET "evc4" -# else - // vc6: -# define BOOST_LIB_TOOLSET "vc6" -# endif +#ifdef UNDER_CE +// vc6: +#define BOOST_LIB_TOOLSET "evc4" +#else +// vc6: +#define BOOST_LIB_TOOLSET "vc6" +#endif #elif defined(BOOST_MSVC) && (BOOST_MSVC == 1300) - // vc7: -# define BOOST_LIB_TOOLSET "vc7" +// vc7: +#define BOOST_LIB_TOOLSET "vc7" #elif defined(BOOST_MSVC) && (BOOST_MSVC == 1310) - // vc71: -# define BOOST_LIB_TOOLSET "vc71" +// vc71: +#define BOOST_LIB_TOOLSET "vc71" #elif defined(BOOST_MSVC) && (BOOST_MSVC == 1400) - // vc80: -# define BOOST_LIB_TOOLSET "vc80" +// vc80: +#define BOOST_LIB_TOOLSET "vc80" #elif defined(BOOST_MSVC) && (BOOST_MSVC == 1500) - // vc90: -# define BOOST_LIB_TOOLSET "vc90" +// vc90: +#define BOOST_LIB_TOOLSET "vc90" #elif defined(BOOST_MSVC) && (BOOST_MSVC >= 1600) - // vc10: -# define BOOST_LIB_TOOLSET "vc100" +// vc10: +#define BOOST_LIB_TOOLSET "vc100" #elif defined(__BORLANDC__) - // CBuilder 6: -# define BOOST_LIB_TOOLSET "bcb" +// CBuilder 6: +#define BOOST_LIB_TOOLSET "bcb" #elif defined(__ICL) - // Intel C++, no version number: -# define BOOST_LIB_TOOLSET "iw" +// Intel C++, no version number: +#define BOOST_LIB_TOOLSET "iw" -#elif defined(__MWERKS__) && (__MWERKS__ <= 0x31FF ) +#elif defined(__MWERKS__) && (__MWERKS__ <= 0x31FF) - // Metrowerks CodeWarrior 8.x -# define BOOST_LIB_TOOLSET "cw8" +// Metrowerks CodeWarrior 8.x +#define BOOST_LIB_TOOLSET "cw8" -#elif defined(__MWERKS__) && (__MWERKS__ <= 0x32FF ) +#elif defined(__MWERKS__) && (__MWERKS__ <= 0x32FF) - // Metrowerks CodeWarrior 9.x -# define BOOST_LIB_TOOLSET "cw9" +// Metrowerks CodeWarrior 9.x +#define BOOST_LIB_TOOLSET "cw9" #endif #endif // BOOST_LIB_TOOLSET @@ -172,86 +170,88 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. 
// select thread opt: // #if defined(_MT) || defined(__MT__) -# define BOOST_LIB_THREAD_OPT "-mt" +#define BOOST_LIB_THREAD_OPT "-mt" #else -# define BOOST_LIB_THREAD_OPT +#define BOOST_LIB_THREAD_OPT #endif #if defined(_MSC_VER) || defined(__MWERKS__) -# ifdef _DLL +#ifdef _DLL -# if (defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) && (defined(_STLP_OWN_IOSTREAMS) || defined(__STL_OWN_IOSTREAMS)) +#if (defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) \ + && (defined(_STLP_OWN_IOSTREAMS) || defined(__STL_OWN_IOSTREAMS)) -# if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) -# define BOOST_LIB_RT_OPT "-gdp" -# elif defined(_DEBUG) -# define BOOST_LIB_RT_OPT "-gdp" -# pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") -# error "Build options aren't compatible with pre-built libraries" -# else -# define BOOST_LIB_RT_OPT "-p" -# endif +#if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) +#define BOOST_LIB_RT_OPT "-gdp" +#elif defined(_DEBUG) +#define BOOST_LIB_RT_OPT "-gdp" +#pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") +#error "Build options aren't compatible with pre-built libraries" +#else +#define BOOST_LIB_RT_OPT "-p" +#endif -# elif defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) +#elif defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) -# if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) -# define BOOST_LIB_RT_OPT "-gdpn" -# elif defined(_DEBUG) -# define BOOST_LIB_RT_OPT "-gdpn" -# pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") -# error "Build options aren't compatible with pre-built libraries" -# else -# define BOOST_LIB_RT_OPT "-pn" -# endif +#if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) +#define BOOST_LIB_RT_OPT "-gdpn" +#elif defined(_DEBUG) +#define BOOST_LIB_RT_OPT "-gdpn" +#pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") +#error "Build options aren't compatible with pre-built libraries" +#else +#define BOOST_LIB_RT_OPT "-pn" +#endif -# else +#else -# if defined(_DEBUG) -# define BOOST_LIB_RT_OPT "-gd" -# else -# define BOOST_LIB_RT_OPT -# endif +#if defined(_DEBUG) +#define BOOST_LIB_RT_OPT "-gd" +#else +#define BOOST_LIB_RT_OPT +#endif -# endif +#endif -# else +#else -# if (defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) && (defined(_STLP_OWN_IOSTREAMS) || defined(__STL_OWN_IOSTREAMS)) +#if (defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) \ + && (defined(_STLP_OWN_IOSTREAMS) || defined(__STL_OWN_IOSTREAMS)) -# if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) -# define BOOST_LIB_RT_OPT "-sgdp" -# elif defined(_DEBUG) -# define BOOST_LIB_RT_OPT "-sgdp" -# pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") -# error "Build options aren't compatible with pre-built libraries" -# else -# define BOOST_LIB_RT_OPT "-sp" -# endif +#if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) +#define BOOST_LIB_RT_OPT "-sgdp" +#elif defined(_DEBUG) +#define BOOST_LIB_RT_OPT "-sgdp" +#pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") +#error "Build options aren't compatible with pre-built libraries" +#else +#define BOOST_LIB_RT_OPT "-sp" +#endif -# elif defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) +#elif defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) -# if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) -# define BOOST_LIB_RT_OPT 
"-sgdpn" -# elif defined(_DEBUG) -# define BOOST_LIB_RT_OPT "-sgdpn" -# pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") -# error "Build options aren't compatible with pre-built libraries" -# else -# define BOOST_LIB_RT_OPT "-spn" -# endif +#if defined(_DEBUG) && (defined(__STL_DEBUG) || defined(_STLP_DEBUG)) +#define BOOST_LIB_RT_OPT "-sgdpn" +#elif defined(_DEBUG) +#define BOOST_LIB_RT_OPT "-sgdpn" +#pragma message("warning: STLPort debug versions are built with /D_STLP_DEBUG=1") +#error "Build options aren't compatible with pre-built libraries" +#else +#define BOOST_LIB_RT_OPT "-spn" +#endif -# else +#else -# if defined(_DEBUG) -# define BOOST_LIB_RT_OPT "-sgd" -# else -# define BOOST_LIB_RT_OPT "-s" -# endif +#if defined(_DEBUG) +#define BOOST_LIB_RT_OPT "-sgd" +#else +#define BOOST_LIB_RT_OPT "-s" +#endif -# endif +#endif -# endif +#endif #elif defined(__BORLANDC__) @@ -259,7 +259,7 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. // figure out whether we want the debug builds or not: // #if __BORLANDC__ > 0x561 -#pragma defineonoption BOOST_BORLAND_DEBUG -v +#pragma defineonoption BOOST_BORLAND_DEBUG - v #endif // // sanity check: @@ -268,23 +268,23 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. #error "Pre-built versions of the Boost libraries are not provided in STLPort-debug form" #endif -# ifdef _RTLDLL +#ifdef _RTLDLL -# ifdef BOOST_BORLAND_DEBUG -# define BOOST_LIB_RT_OPT "-d" -# else -# define BOOST_LIB_RT_OPT -# endif +#ifdef BOOST_BORLAND_DEBUG +#define BOOST_LIB_RT_OPT "-d" +#else +#define BOOST_LIB_RT_OPT +#endif -# else +#else -# ifdef BOOST_BORLAND_DEBUG -# define BOOST_LIB_RT_OPT "-sd" -# else -# define BOOST_LIB_RT_OPT "-s" -# endif +#ifdef BOOST_BORLAND_DEBUG +#define BOOST_LIB_RT_OPT "-sd" +#else +#define BOOST_LIB_RT_OPT "-s" +#endif -# endif +#endif #endif @@ -292,37 +292,37 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. // select linkage opt: // #if (defined(_DLL) || defined(_RTLDLL)) && defined(BOOST_DYN_LINK) -# define BOOST_LIB_PREFIX +#define BOOST_LIB_PREFIX #elif defined(BOOST_DYN_LINK) -# error "Mixing a dll boost library with a static runtime is a really bad idea..." +#error "Mixing a dll boost library with a static runtime is a really bad idea..." 
#else -# define BOOST_LIB_PREFIX "lib" +#define BOOST_LIB_PREFIX "lib" #endif // // now include the lib: // -#if defined(BOOST_LIB_NAME) \ - && defined(BOOST_LIB_PREFIX) \ - && defined(BOOST_LIB_TOOLSET) \ - && defined(BOOST_LIB_THREAD_OPT) \ - && defined(BOOST_LIB_RT_OPT) \ - && defined(BOOST_LIB_VERSION) +#if defined(BOOST_LIB_NAME) && defined(BOOST_LIB_PREFIX) && defined(BOOST_LIB_TOOLSET) \ + && defined(BOOST_LIB_THREAD_OPT) && defined(BOOST_LIB_RT_OPT) && defined(BOOST_LIB_VERSION) #ifndef BOOST_AUTO_LINK_NOMANGLE -# pragma comment(lib, BOOST_LIB_PREFIX BOOST_STRINGIZE(BOOST_LIB_NAME) "-" BOOST_LIB_TOOLSET BOOST_LIB_THREAD_OPT BOOST_LIB_RT_OPT "-" BOOST_LIB_VERSION ".lib") -# ifdef BOOST_LIB_DIAGNOSTIC -# pragma message ("Linking to lib file: " BOOST_LIB_PREFIX BOOST_STRINGIZE(BOOST_LIB_NAME) "-" BOOST_LIB_TOOLSET BOOST_LIB_THREAD_OPT BOOST_LIB_RT_OPT "-" BOOST_LIB_VERSION ".lib") -# endif +#pragma comment( \ + lib, \ + BOOST_LIB_PREFIX BOOST_STRINGIZE(BOOST_LIB_NAME) "-" BOOST_LIB_TOOLSET BOOST_LIB_THREAD_OPT BOOST_LIB_RT_OPT \ + "-" BOOST_LIB_VERSION ".lib") +#ifdef BOOST_LIB_DIAGNOSTIC +#pragma message("Linking to lib file: " BOOST_LIB_PREFIX BOOST_STRINGIZE( \ + BOOST_LIB_NAME) "-" BOOST_LIB_TOOLSET BOOST_LIB_THREAD_OPT BOOST_LIB_RT_OPT "-" BOOST_LIB_VERSION ".lib") +#endif #else -# pragma comment(lib, BOOST_STRINGIZE(BOOST_LIB_NAME) ".lib") -# ifdef BOOST_LIB_DIAGNOSTIC -# pragma message ("Linking to lib file: " BOOST_STRINGIZE(BOOST_LIB_NAME) ".lib") -# endif +#pragma comment(lib, BOOST_STRINGIZE(BOOST_LIB_NAME) ".lib") +#ifdef BOOST_LIB_DIAGNOSTIC +#pragma message("Linking to lib file: " BOOST_STRINGIZE(BOOST_LIB_NAME) ".lib") +#endif #endif #else -# error "some required macros where not defined (internal logic error)." +#error "some required macros were not defined (internal logic error)." #endif @@ -332,42 +332,31 @@ BOOST_LIB_VERSION: The Boost version, in the form x_y, for Boost version x.y. 
// finally undef any macros we may have set: // #ifdef BOOST_LIB_PREFIX -# undef BOOST_LIB_PREFIX +#undef BOOST_LIB_PREFIX #endif #if defined(BOOST_LIB_NAME) -# undef BOOST_LIB_NAME +#undef BOOST_LIB_NAME #endif -// Don't undef this one: it can be set by the user and should be the +// Don't undef this one: it can be set by the user and should be the // same for all libraries: -//#if defined(BOOST_LIB_TOOLSET) -//# undef BOOST_LIB_TOOLSET -//#endif +// #if defined(BOOST_LIB_TOOLSET) +// # undef BOOST_LIB_TOOLSET +// #endif #if defined(BOOST_LIB_THREAD_OPT) -# undef BOOST_LIB_THREAD_OPT +#undef BOOST_LIB_THREAD_OPT #endif #if defined(BOOST_LIB_RT_OPT) -# undef BOOST_LIB_RT_OPT +#undef BOOST_LIB_RT_OPT #endif #if defined(BOOST_LIB_LINK_OPT) -# undef BOOST_LIB_LINK_OPT +#undef BOOST_LIB_LINK_OPT #endif #if defined(BOOST_LIB_DEBUG_OPT) -# undef BOOST_LIB_DEBUG_OPT +#undef BOOST_LIB_DEBUG_OPT #endif #if defined(BOOST_DYN_LINK) -# undef BOOST_DYN_LINK +#undef BOOST_DYN_LINK #endif #if defined(BOOST_AUTO_LINK_NOMANGLE) -# undef BOOST_AUTO_LINK_NOMANGLE +#undef BOOST_AUTO_LINK_NOMANGLE #endif - - - - - - - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/borland.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/borland.hpp index 6a7b988d..cd09b17b 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/borland.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/borland.hpp @@ -13,123 +13,123 @@ // versions check: // we don't support Borland prior to version 5.4: #if __BORLANDC__ < 0x540 -# error "Compiler not supported or configured - please reconfigure" +#error "Compiler not supported or configured - please reconfigure" #endif // last known compiler version: #if (__BORLANDC__ > 0x613) -//# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -//# else -//# pragma message( "Unknown compiler version - please run the configure tests and report the results") -//# endif +// # if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +// # else +// # pragma message( "Unknown compiler version - please run the configure tests and report the results") +// # endif #elif (__BORLANDC__ == 0x600) -# error "CBuilderX preview compiler is no longer supported" +#error "CBuilderX preview compiler is no longer supported" #endif // // Support macros to help with standard library detection #if (__BORLANDC__ < 0x560) || defined(_USE_OLD_RW_STL) -# define BOOST_BCB_WITH_ROGUE_WAVE +#define BOOST_BCB_WITH_ROGUE_WAVE #elif __BORLANDC__ < 0x570 -# define BOOST_BCB_WITH_STLPORT +#define BOOST_BCB_WITH_STLPORT #else -# define BOOST_BCB_WITH_DINKUMWARE +#define BOOST_BCB_WITH_DINKUMWARE #endif // // Version 5.0 and below: -# if __BORLANDC__ <= 0x0550 +#if __BORLANDC__ <= 0x0550 // Borland C++Builder 4 and 5: -# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS -# if __BORLANDC__ == 0x0550 +#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS +#if __BORLANDC__ == 0x0550 // Borland C++Builder 5, command-line compiler 5.5: -# define BOOST_NO_OPERATORS_IN_NAMESPACE -# endif -# endif +#define BOOST_NO_OPERATORS_IN_NAMESPACE +#endif +#endif // Version 5.51 and below: #if (__BORLANDC__ <= 0x551) -# define BOOST_NO_CV_SPECIALIZATIONS -# define BOOST_NO_CV_VOID_SPECIALIZATIONS -# define BOOST_NO_DEDUCED_TYPENAME +#define BOOST_NO_CV_SPECIALIZATIONS +#define BOOST_NO_CV_VOID_SPECIALIZATIONS +#define 
BOOST_NO_DEDUCED_TYPENAME // workaround for missing WCHAR_MAX/WCHAR_MIN: #include <climits> #include <cwchar> #ifndef WCHAR_MAX -# define WCHAR_MAX 0xffff +#define WCHAR_MAX 0xffff #endif #ifndef WCHAR_MIN -# define WCHAR_MIN 0 +#define WCHAR_MIN 0 #endif #endif // Borland C++ Builder 6 and below: #if (__BORLANDC__ <= 0x564) -# ifdef NDEBUG - // fix broken <cstring> so that Boost.test works: -# include <cstring> -# undef strcmp -# endif - // fix broken errno declaration: -# include <errno.h> -# ifndef errno -# define errno errno -# endif +#ifdef NDEBUG +// fix broken <cstring> so that Boost.test works: +#include <cstring> +#undef strcmp +#endif +// fix broken errno declaration: +#include <errno.h> +#ifndef errno +#define errno errno +#endif #endif // // new bug in 5.61: #if (__BORLANDC__ >= 0x561) && (__BORLANDC__ <= 0x580) - // this seems to be needed by the command line compiler, but not the IDE: -# define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS +// this seems to be needed by the command line compiler, but not the IDE: +#define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS #endif // Borland C++ Builder 2006 Update 2 and below: #if (__BORLANDC__ <= 0x582) -# define BOOST_NO_SFINAE -# define BOOST_BCB_PARTIAL_SPECIALIZATION_BUG -# define BOOST_NO_TEMPLATE_TEMPLATES +#define BOOST_NO_SFINAE +#define BOOST_BCB_PARTIAL_SPECIALIZATION_BUG +#define BOOST_NO_TEMPLATE_TEMPLATES -# define BOOST_NO_PRIVATE_IN_AGGREGATE +#define BOOST_NO_PRIVATE_IN_AGGREGATE -# ifdef _WIN32 -# define BOOST_NO_SWPRINTF -# elif defined(linux) || defined(__linux__) || defined(__linux) - // we should really be able to do without this - // but the wcs* functions aren't imported into std:: -# define BOOST_NO_STDC_NAMESPACE - // _CPPUNWIND doesn't get automatically set for some reason: -# pragma defineonoption BOOST_CPPUNWIND -x -# endif +#ifdef _WIN32 +#define BOOST_NO_SWPRINTF +#elif defined(linux) || defined(__linux__) || defined(__linux) +// we should really be able to do without this +// but the wcs* functions aren't imported into std:: +#define BOOST_NO_STDC_NAMESPACE +// _CPPUNWIND doesn't get automatically set for some reason: +#pragma defineonoption BOOST_CPPUNWIND -x +#endif #endif -#if (__BORLANDC__ <= 0x613) // Beman has asked Alisdair for more info - // we shouldn't really need this - but too many things choke - // without it, this needs more investigation: -# define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS -# define BOOST_NO_IS_ABSTRACT -# define BOOST_NO_FUNCTION_TYPE_SPECIALIZATIONS -# define BOOST_NO_USING_TEMPLATE -# define BOOST_SP_NO_SP_CONVERTIBLE +#if (__BORLANDC__ <= 0x613) // Beman has asked Alisdair for more info + // we shouldn't really need this - but too many things choke + // without it, this needs more investigation: +#define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS +#define BOOST_NO_IS_ABSTRACT +#define BOOST_NO_FUNCTION_TYPE_SPECIALIZATIONS +#define BOOST_NO_USING_TEMPLATE +#define BOOST_SP_NO_SP_CONVERTIBLE // Temporary workaround #define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS #endif // Borland C++ Builder 2008 and below: -# define BOOST_NO_INTEGRAL_INT64_T -# define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL -# define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS -# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS -# define BOOST_NO_TWO_PHASE_NAME_LOOKUP -# define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE -# define BOOST_NO_NESTED_FRIENDSHIP -# define BOOST_NO_TYPENAME_WITH_CTOR +#define BOOST_NO_INTEGRAL_INT64_T +#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL +#define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS +#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS +#define 
BOOST_NO_TWO_PHASE_NAME_LOOKUP +#define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE +#define BOOST_NO_NESTED_FRIENDSHIP +#define BOOST_NO_TYPENAME_WITH_CTOR #if (__BORLANDC__ < 0x600) -# define BOOST_ILLEGAL_CV_REFERENCES +#define BOOST_ILLEGAL_CV_REFERENCES #endif // @@ -137,29 +137,29 @@ // // Borland C++ Builder 2008 and below: #if (__BORLANDC__ >= 0x599) -# pragma defineonoption BOOST_CODEGEAR_0X_SUPPORT -Ax +#pragma defineonoption BOOST_CODEGEAR_0X_SUPPORT -Ax #endif // // C++0x Macros: // -#if !defined( BOOST_CODEGEAR_0X_SUPPORT ) || (__BORLANDC__ < 0x610) -# define BOOST_NO_CHAR16_T -# define BOOST_NO_CHAR32_T -# define BOOST_NO_DECLTYPE -# define BOOST_NO_EXPLICIT_CONVERSION_OPERATORS -# define BOOST_NO_EXTERN_TEMPLATE -# define BOOST_NO_RVALUE_REFERENCES -# define BOOST_NO_SCOPED_ENUMS -# define BOOST_NO_STATIC_ASSERT +#if !defined(BOOST_CODEGEAR_0X_SUPPORT) || (__BORLANDC__ < 0x610) +#define BOOST_NO_CHAR16_T +#define BOOST_NO_CHAR32_T +#define BOOST_NO_DECLTYPE +#define BOOST_NO_EXPLICIT_CONVERSION_OPERATORS +#define BOOST_NO_EXTERN_TEMPLATE +#define BOOST_NO_RVALUE_REFERENCES +#define BOOST_NO_SCOPED_ENUMS +#define BOOST_NO_STATIC_ASSERT #else -# define BOOST_HAS_ALIGNOF -# define BOOST_HAS_CHAR16_T -# define BOOST_HAS_CHAR32_T -# define BOOST_HAS_DECLTYPE -# define BOOST_HAS_EXPLICIT_CONVERSION_OPS -# define BOOST_HAS_REF_QUALIFIER -# define BOOST_HAS_RVALUE_REFS -# define BOOST_HAS_STATIC_ASSERT +#define BOOST_HAS_ALIGNOF +#define BOOST_HAS_CHAR16_T +#define BOOST_HAS_CHAR32_T +#define BOOST_HAS_DECLTYPE +#define BOOST_HAS_EXPLICIT_CONVERSION_OPS +#define BOOST_HAS_REF_QUALIFIER +#define BOOST_HAS_RVALUE_REFS +#define BOOST_HAS_STATIC_ASSERT #endif #define BOOST_NO_AUTO_DECLARATIONS @@ -177,33 +177,33 @@ #define BOOST_NO_SCOPED_ENUMS #define BOOST_NO_SFINAE_EXPR #define BOOST_NO_TEMPLATE_ALIASES -#define BOOST_NO_UNICODE_LITERALS // UTF-8 still not supported +#define BOOST_NO_UNICODE_LITERALS // UTF-8 still not supported #define BOOST_NO_VARIADIC_TEMPLATES #if __BORLANDC__ >= 0x590 -# define BOOST_HAS_TR1_HASH +#define BOOST_HAS_TR1_HASH -# define BOOST_HAS_MACRO_USE_FACET +#define BOOST_HAS_MACRO_USE_FACET #endif // // Post 0x561 we have long long and stdint.h: #if __BORLANDC__ >= 0x561 -# ifndef __NO_LONG_LONG -# define BOOST_HAS_LONG_LONG -# else -# define BOOST_NO_LONG_LONG -# endif - // On non-Win32 platforms let the platform config figure this out: -# ifdef _WIN32 -# define BOOST_HAS_STDINT_H -# endif +#ifndef __NO_LONG_LONG +#define BOOST_HAS_LONG_LONG +#else +#define BOOST_NO_LONG_LONG +#endif +// On non-Win32 platforms let the platform config figure this out: +#ifdef _WIN32 +#define BOOST_HAS_STDINT_H +#endif #endif // Borland C++Builder 6 defaults to using STLPort. If _USE_OLD_RW_STL is // defined, then we have 0x560 or greater with the Rogue Wave implementation // which presumably has the std::DBL_MAX bug. 
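
For reviewers of the auto_link.hpp hunks above: the reformatting only re-indents the name-mangling logic, but the mechanism is easy to lose in the noise. The library name is assembled entirely from adjacent string literals, with a two-level stringize so that BOOST_LIB_NAME is macro-expanded before the # operator fires. A minimal, compilable sketch of that mechanism; the macro names and values below are hypothetical stand-ins, not taken from this patch:

    #include <cstdio>

    // Two-level stringize: the outer macro expands X, the inner one quotes it.
    #define DO_STRINGIZE(X) #X
    #define STRINGIZE(X) DO_STRINGIZE(X)

    // Hypothetical example values; auto_link.hpp derives these from the toolset.
    #define LIB_NAME boost_regex
    #define LIB_PREFIX "lib"   // static-link builds get the "lib" prefix
    #define LIB_TOOLSET "vc90"
    #define LIB_THREAD_OPT "-mt"
    #define LIB_RT_OPT "-gd"
    #define LIB_VERSION "1_39"

    int main()
    {
        // Adjacent literals concatenate, yielding "libboost_regex-vc90-mt-gd-1_39.lib",
        // the same kind of string the reformatted #pragma comment(lib, ...) hands to the linker.
        std::puts(LIB_PREFIX STRINGIZE(LIB_NAME) "-" LIB_TOOLSET LIB_THREAD_OPT LIB_RT_OPT "-" LIB_VERSION ".lib");
        return 0;
    }

A single-level stringize would emit the literal "LIB_NAME" instead of its expansion, which is why auto_link.hpp defines BOOST_STRINGIZE in terms of BOOST_DO_STRINGIZE.
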
-#if defined( BOOST_BCB_WITH_ROGUE_WAVE ) +#if defined(BOOST_BCB_WITH_ROGUE_WAVE) // is partly broken, some macros define symbols that are really in // namespace std, so you end up having to use illegal constructs like // std::DBL_MAX, as a fix we'll just include float.h and have done with: @@ -213,55 +213,52 @@ // __int64: // #if (__BORLANDC__ >= 0x530) && !defined(__STRICT_ANSI__) -# define BOOST_HAS_MS_INT64 +#define BOOST_HAS_MS_INT64 #endif // // check for exception handling support: // #if !defined(_CPPUNWIND) && !defined(BOOST_CPPUNWIND) && !defined(__EXCEPTIONS) -# define BOOST_NO_EXCEPTIONS +#define BOOST_NO_EXCEPTIONS #endif // // all versions have a <dirent.h>: // #ifndef __STRICT_ANSI__ -# define BOOST_HAS_DIRENT_H +#define BOOST_HAS_DIRENT_H #endif // // all versions support __declspec: // #ifndef __STRICT_ANSI__ -# define BOOST_HAS_DECLSPEC +#define BOOST_HAS_DECLSPEC #endif // // ABI fixing headers: // #if __BORLANDC__ != 0x600 // not implemented for version 6 compiler yet #ifndef BOOST_ABI_PREFIX -# define BOOST_ABI_PREFIX "boost/config/abi/borland_prefix.hpp" +#define BOOST_ABI_PREFIX "boost/config/abi/borland_prefix.hpp" #endif #ifndef BOOST_ABI_SUFFIX -# define BOOST_ABI_SUFFIX "boost/config/abi/borland_suffix.hpp" +#define BOOST_ABI_SUFFIX "boost/config/abi/borland_suffix.hpp" #endif #endif // // Disable Win32 support in ANSI mode: // #if __BORLANDC__ < 0x600 -# pragma defineonoption BOOST_DISABLE_WIN32 -A +#pragma defineonoption BOOST_DISABLE_WIN32 -A #elif defined(__STRICT_ANSI__) -# define BOOST_DISABLE_WIN32 +#define BOOST_DISABLE_WIN32 #endif // // MSVC compatibility mode does some nasty things: // TODO: look up if this doesn't apply to the whole 12xx range // #if defined(_MSC_VER) && (_MSC_VER <= 1200) -# define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP -# define BOOST_NO_VOID_RETURNS +#define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP +#define BOOST_NO_VOID_RETURNS #endif #define BOOST_COMPILER "Borland C++ version " BOOST_STRINGIZE(__BORLANDC__) - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/codegear.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/codegear.hpp index 698624ec..40c41e4d 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/codegear.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/codegear.hpp @@ -9,58 +9,58 @@ // CodeGear C++ compiler setup: -#if !defined( BOOST_WITH_CODEGEAR_WARNINGS ) +#if !defined(BOOST_WITH_CODEGEAR_WARNINGS) // these warnings occur frequently in optimized template code -# pragma warn -8004 // var assigned value, but never used -# pragma warn -8008 // condition always true/false -# pragma warn -8066 // dead code can never execute -# pragma warn -8104 // static members with ctors not threadsafe -# pragma warn -8105 // reference member in class without ctors +#pragma warn -8004 // var assigned value, but never used +#pragma warn -8008 // condition always true/false +#pragma warn -8066 // dead code can never execute +#pragma warn -8104 // static members with ctors not threadsafe +#pragma warn -8105 // reference member in class without ctors #endif // // versions check: // last known and checked version is 0x620 #if (__CODEGEARC__ > 0x620) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler 
version - please run the configure tests and report the results" +#else +#pragma message("Unknown compiler version - please run the configure tests and report the results") +#endif #endif // CodeGear C++ Builder 2009 #if (__CODEGEARC__ <= 0x613) -# define BOOST_NO_INTEGRAL_INT64_T -# define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS -# define BOOST_NO_PRIVATE_IN_AGGREGATE -# define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE - // we shouldn't really need this - but too many things choke - // without it, this needs more investigation: -# define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS -# define BOOST_SP_NO_SP_CONVERTIBLE +#define BOOST_NO_INTEGRAL_INT64_T +#define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS +#define BOOST_NO_PRIVATE_IN_AGGREGATE +#define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE +// we shouldn't really need this - but too many things choke +// without it, this needs more investigation: +#define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS +#define BOOST_SP_NO_SP_CONVERTIBLE #endif // CodeGear C++ Builder 2010 #if (__CODEGEARC__ <= 0x620) -# define BOOST_NO_TYPENAME_WITH_CTOR // Cannot use typename keyword when making temporaries of a dependant type -# define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL -# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS -# define BOOST_NO_NESTED_FRIENDSHIP // TC1 gives nested classes access rights as any other member -# define BOOST_NO_USING_TEMPLATE -# define BOOST_NO_TWO_PHASE_NAME_LOOKUP +#define BOOST_NO_TYPENAME_WITH_CTOR // Cannot use typename keyword when making temporaries of a dependent type +#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL +#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS +#define BOOST_NO_NESTED_FRIENDSHIP // TC1 gives nested classes access rights as any other member +#define BOOST_NO_USING_TEMPLATE +#define BOOST_NO_TWO_PHASE_NAME_LOOKUP // Temporary hack, until specific MPL preprocessed headers are generated -# define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS +#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS -# ifdef NDEBUG - // fix broken <cstring> so that Boost.test works: -# include <cstring> -# undef strcmp -# endif - // fix broken errno declaration: -# include <errno.h> -# ifndef errno -# define errno errno -# endif +#ifdef NDEBUG +// fix broken <cstring> so that Boost.test works: +#include <cstring> +#undef strcmp +#endif +// fix broken errno declaration: +#include <errno.h> +#ifndef errno +#define errno errno +#endif #endif // @@ -110,54 +110,53 @@ // On non-Win32 platforms let the platform config figure this out: #ifdef _WIN32 -# define BOOST_HAS_STDINT_H +#define BOOST_HAS_STDINT_H #endif // // __int64: // #if !defined(__STRICT_ANSI__) -# define BOOST_HAS_MS_INT64 +#define BOOST_HAS_MS_INT64 #endif // // check for exception handling support: // #if !defined(_CPPUNWIND) && !defined(BOOST_CPPUNWIND) && !defined(__EXCEPTIONS) -# define BOOST_NO_EXCEPTIONS +#define BOOST_NO_EXCEPTIONS #endif // // all versions have a <dirent.h>: // #if !defined(__STRICT_ANSI__) -# define BOOST_HAS_DIRENT_H +#define BOOST_HAS_DIRENT_H #endif // // all versions support __declspec: // #if !defined(__STRICT_ANSI__) -# define BOOST_HAS_DECLSPEC +#define BOOST_HAS_DECLSPEC #endif // // ABI fixing headers: // #ifndef BOOST_ABI_PREFIX -# define BOOST_ABI_PREFIX "boost/config/abi/borland_prefix.hpp" +#define BOOST_ABI_PREFIX "boost/config/abi/borland_prefix.hpp" #endif #ifndef BOOST_ABI_SUFFIX -# define BOOST_ABI_SUFFIX "boost/config/abi/borland_suffix.hpp" +#define BOOST_ABI_SUFFIX "boost/config/abi/borland_suffix.hpp" #endif // // Disable Win32 support in ANSI mode: // -# pragma defineonoption 
BOOST_DISABLE_WIN32 -A +#pragma defineonoption BOOST_DISABLE_WIN32 -A // // MSVC compatibility mode does some nasty things: // TODO: look up if this doesn't apply to the whole 12xx range // #if defined(_MSC_VER) && (_MSC_VER <= 1200) -# define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP -# define BOOST_NO_VOID_RETURNS +#define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP +#define BOOST_NO_VOID_RETURNS #endif #define BOOST_COMPILER "CodeGear C++ version " BOOST_STRINGIZE(__CODEGEARC__) - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/comeau.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/comeau.hpp index 278222dc..edfc37f8 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/comeau.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/comeau.hpp @@ -1,11 +1,11 @@ -// (C) Copyright John Maddock 2001. -// (C) Copyright Douglas Gregor 2001. -// (C) Copyright Peter Dimov 2001. -// (C) Copyright Aleksey Gurtovoy 2003. -// (C) Copyright Beman Dawes 2003. -// (C) Copyright Jens Maurer 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001. +// (C) Copyright Douglas Gregor 2001. +// (C) Copyright Peter Dimov 2001. +// (C) Copyright Aleksey Gurtovoy 2003. +// (C) Copyright Beman Dawes 2003. +// (C) Copyright Jens Maurer 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -16,27 +16,27 @@ #if (__COMO_VERSION__ <= 4245) -# if defined(_MSC_VER) && _MSC_VER <= 1300 -# if _MSC_VER > 100 - // only set this in non-strict mode: -# define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP -# endif -# endif +#if defined(_MSC_VER) && _MSC_VER <= 1300 +#if _MSC_VER > 100 +// only set this in non-strict mode: +#define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP +#endif +#endif // Void returns don't work when emulating VC 6 (Peter Dimov) // TODO: look up if this doesn't apply to the whole 12xx range -# if defined(_MSC_VER) && (_MSC_VER < 1300) -# define BOOST_NO_VOID_RETURNS -# endif +#if defined(_MSC_VER) && (_MSC_VER < 1300) +#define BOOST_NO_VOID_RETURNS +#endif -#endif // version 4245 +#endif // version 4245 // // enable __int64 support in VC emulation mode // -# if defined(_MSC_VER) && (_MSC_VER >= 1200) -# define BOOST_HAS_MS_INT64 -# endif +#if defined(_MSC_VER) && (_MSC_VER >= 1200) +#define BOOST_HAS_MS_INT64 +#endif #define BOOST_COMPILER "Comeau compiler version " BOOST_STRINGIZE(__COMO_VERSION__) @@ -44,16 +44,12 @@ // versions check: // we don't know Comeau prior to version 4245: #if __COMO_VERSION__ < 4245 -# error "Compiler not configured - please reconfigure" +#error "Compiler not configured - please reconfigure" #endif // // last known and checked version is 4245: #if (__COMO_VERSION__ > 4245) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -# endif +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +#endif #endif - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/common_edg.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/common_edg.hpp index 9dc4cef8..00f9d1d8 100644 --- 
a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/common_edg.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/common_edg.hpp @@ -1,10 +1,10 @@ -// (C) Copyright John Maddock 2001 - 2002. -// (C) Copyright Jens Maurer 2001. -// (C) Copyright David Abrahams 2002. -// (C) Copyright Aleksey Gurtovoy 2002. +// (C) Copyright John Maddock 2001 - 2002. +// (C) Copyright Jens Maurer 2001. +// (C) Copyright David Abrahams 2002. +// (C) Copyright Aleksey Gurtovoy 2002. // (C) Copyright Markus Schoepflin 2005. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -14,45 +14,45 @@ // // This is included from within the individual compiler mini-configs. -#ifndef __EDG_VERSION__ -# error This file requires that __EDG_VERSION__ be defined. +#ifndef __EDG_VERSION__ +#error This file requires that __EDG_VERSION__ be defined. #endif #if (__EDG_VERSION__ <= 238) -# define BOOST_NO_INTEGRAL_INT64_T -# define BOOST_NO_SFINAE +#define BOOST_NO_INTEGRAL_INT64_T +#define BOOST_NO_SFINAE #endif #if (__EDG_VERSION__ <= 240) -# define BOOST_NO_VOID_RETURNS +#define BOOST_NO_VOID_RETURNS #endif #if (__EDG_VERSION__ <= 241) && !defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) -# define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP +#define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP #endif #if (__EDG_VERSION__ <= 244) && !defined(BOOST_NO_TEMPLATE_TEMPLATES) -# define BOOST_NO_TEMPLATE_TEMPLATES -#endif +#define BOOST_NO_TEMPLATE_TEMPLATES +#endif #if (__EDG_VERSION__ < 300) && !defined(BOOST_NO_IS_ABSTRACT) -# define BOOST_NO_IS_ABSTRACT -#endif +#define BOOST_NO_IS_ABSTRACT +#endif #if (__EDG_VERSION__ <= 303) && !defined(BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL) -# define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL -#endif +#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL +#endif // See also kai.hpp which checks a Kai-specific symbol for EH -# if !defined(__KCC) && !defined(__EXCEPTIONS) -# define BOOST_NO_EXCEPTIONS -# endif +#if !defined(__KCC) && !defined(__EXCEPTIONS) +#define BOOST_NO_EXCEPTIONS +#endif -# if !defined(__NO_LONG_LONG) -# define BOOST_HAS_LONG_LONG -# else -# define BOOST_NO_LONG_LONG -# endif +#if !defined(__NO_LONG_LONG) +#define BOOST_HAS_LONG_LONG +#else +#define BOOST_NO_LONG_LONG +#endif // // C++0x features @@ -61,7 +61,7 @@ // #if (__EDG_VERSION__ <= 310) || !defined(BOOST_STRICT_CONFIG) // No support for initializer lists -# define BOOST_NO_INITIALIZER_LISTS +#define BOOST_NO_INITIALIZER_LISTS #endif #define BOOST_NO_AUTO_DECLARATIONS @@ -92,6 +92,3 @@ // However, some libraries have insufficient "long long" support // #define BOOST_HAS_LONG_LONG #endif - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/compaq_cxx.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/compaq_cxx.hpp index b44486c6..2953f35a 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/compaq_cxx.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/compaq_cxx.hpp @@ -1,6 +1,6 @@ -// (C) Copyright John Maddock 2001 - 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. 
(See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -14,6 +14,3 @@ // // versions check: // Nothing to do here? - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/digitalmars.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/digitalmars.hpp index a01b4c28..0235d445 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/digitalmars.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/digitalmars.hpp @@ -1,8 +1,8 @@ // Copyright (C) Christof Meerwald 2003 // Copyright (C) Dan Watkins 2003 // -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // Digital Mars C++ compiler setup: @@ -46,13 +46,13 @@ // #include #if !defined(__STL_IMPORT_VENDOR_CSTD) && !defined(_STLP_IMPORT_VENDOR_CSTD) -# define BOOST_NO_STDC_NAMESPACE +#define BOOST_NO_STDC_NAMESPACE #endif // check for exception handling support: #ifndef _CPPUNWIND -# define BOOST_NO_EXCEPTIONS +#define BOOST_NO_EXCEPTIONS #endif // @@ -87,7 +87,7 @@ // // last known and checked version is ...: #if (__DMC__ > 0x848) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -# endif +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +#endif #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc.hpp index 6cae94ca..29a6ee01 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc.hpp @@ -1,12 +1,12 @@ -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Darin Adler 2001 - 2002. -// (C) Copyright Jens Maurer 2001 - 2002. -// (C) Copyright Beman Dawes 2001 - 2003. -// (C) Copyright Douglas Gregor 2002. -// (C) Copyright David Abrahams 2002 - 2003. -// (C) Copyright Synge Todo 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Darin Adler 2001 - 2002. +// (C) Copyright Jens Maurer 2001 - 2002. +// (C) Copyright Beman Dawes 2001 - 2003. +// (C) Copyright Douglas Gregor 2002. +// (C) Copyright David Abrahams 2002 - 2003. +// (C) Copyright Synge Todo 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. 
@@ -14,63 +14,63 @@ // GNU C++ compiler setup: #if __GNUC__ < 3 -# if __GNUC_MINOR__ == 91 - // egcs 1.1 won't parse shared_ptr.hpp without this: -# define BOOST_NO_AUTO_PTR -# endif -# if __GNUC_MINOR__ < 95 - // - // Prior to gcc 2.95 member templates only partly - // work - define BOOST_MSVC6_MEMBER_TEMPLATES - // instead since inline member templates mostly work. - // -# define BOOST_NO_MEMBER_TEMPLATES -# if __GNUC_MINOR__ >= 9 -# define BOOST_MSVC6_MEMBER_TEMPLATES -# endif -# endif +#if __GNUC_MINOR__ == 91 +// egcs 1.1 won't parse shared_ptr.hpp without this: +#define BOOST_NO_AUTO_PTR +#endif +#if __GNUC_MINOR__ < 95 +// +// Prior to gcc 2.95 member templates only partly +// work - define BOOST_MSVC6_MEMBER_TEMPLATES +// instead since inline member templates mostly work. +// +#define BOOST_NO_MEMBER_TEMPLATES +#if __GNUC_MINOR__ >= 9 +#define BOOST_MSVC6_MEMBER_TEMPLATES +#endif +#endif -# if __GNUC_MINOR__ < 96 -# define BOOST_NO_SFINAE -# endif +#if __GNUC_MINOR__ < 96 +#define BOOST_NO_SFINAE +#endif -# if __GNUC_MINOR__ <= 97 -# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS -# define BOOST_NO_OPERATORS_IN_NAMESPACE -# endif +#if __GNUC_MINOR__ <= 97 +#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS +#define BOOST_NO_OPERATORS_IN_NAMESPACE +#endif -# define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE -# define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL -# define BOOST_NO_IS_ABSTRACT +#define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE +#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL +#define BOOST_NO_IS_ABSTRACT #elif __GNUC__ == 3 -# if defined (__PATHSCALE__) -# define BOOST_NO_TWO_PHASE_NAME_LOOKUP -# define BOOST_NO_IS_ABSTRACT -# endif - // - // gcc-3.x problems: - // - // Bug specific to gcc 3.1 and 3.2: - // -# if ((__GNUC_MINOR__ == 1) || (__GNUC_MINOR__ == 2)) -# define BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS -# endif -# if __GNUC_MINOR__ < 4 -# define BOOST_NO_IS_ABSTRACT -# endif +#if defined(__PATHSCALE__) +#define BOOST_NO_TWO_PHASE_NAME_LOOKUP +#define BOOST_NO_IS_ABSTRACT +#endif +// +// gcc-3.x problems: +// +// Bug specific to gcc 3.1 and 3.2: +// +#if ((__GNUC_MINOR__ == 1) || (__GNUC_MINOR__ == 2)) +#define BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS +#endif +#if __GNUC_MINOR__ < 4 +#define BOOST_NO_IS_ABSTRACT +#endif #endif #if __GNUC__ < 4 // // All problems to gcc-3.x and earlier here: // #define BOOST_NO_TWO_PHASE_NAME_LOOKUP -# ifdef __OPEN64__ -# define BOOST_NO_IS_ABSTRACT -# endif +#ifdef __OPEN64__ +#define BOOST_NO_IS_ABSTRACT +#endif #endif #ifndef __EXCEPTIONS -# define BOOST_NO_EXCEPTIONS +#define BOOST_NO_EXCEPTIONS #endif @@ -80,8 +80,8 @@ // later if no threading API is detected. 
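
The gcc.hpp hunks here fold the compiler version into a single comparable integer before gating features on it (for example the 4.4.1 scoped-enum check reformatted just below). A self-contained sketch of the idiom, with an arbitrary threshold chosen purely for illustration:

    #include <cstdio>

    // Collapse (major, minor, patchlevel) into one number: 4.4.1 -> 40401.
    // This mirrors the __GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__
    // checks reformatted in gcc.hpp.
    #define GCC_VERSION_CODE (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

    int main()
    {
    #if defined(__GNUC__) && (GCC_VERSION_CODE >= 40401)
        std::printf("gcc %d.%d.%d: at or above 4.4.1\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
    #else
        std::puts("older gcc, or not gcc at all");
    #endif
        return 0;
    }

Composing the version this way keeps each gate a single integer comparison instead of the nested major/minor conditionals used for the older two-component checks.
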
// #if !defined(__MINGW32__) && !defined(linux) && !defined(__linux) && !defined(__linux__) -# define BOOST_HAS_THREADS -#endif +#define BOOST_HAS_THREADS +#endif // // gcc has "long long" @@ -91,17 +91,17 @@ // // gcc implements the named return value optimization since version 3.1 // -#if __GNUC__ > 3 || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 1 ) +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) #define BOOST_HAS_NRVO #endif // // RTTI and typeinfo detection is possible post gcc-4.3: // #if __GNUC__ * 100 + __GNUC_MINOR__ >= 403 -# ifndef __GXX_RTTI -# define BOOST_NO_TYPEID -# define BOOST_NO_RTTI -# endif +#ifndef __GXX_RTTI +#define BOOST_NO_TYPEID +#define BOOST_NO_RTTI +#endif #endif // C++0x features not implemented in any GCC version @@ -120,85 +120,83 @@ // C++0x features are only enabled when -std=c++0x or -std=gnu++0x are // passed on the command line, which in turn defines // __GXX_EXPERIMENTAL_CXX0X__. -# define BOOST_HAS_DECLTYPE -# define BOOST_HAS_RVALUE_REFS -# define BOOST_HAS_STATIC_ASSERT -# define BOOST_HAS_VARIADIC_TMPL +#define BOOST_HAS_DECLTYPE +#define BOOST_HAS_RVALUE_REFS +#define BOOST_HAS_STATIC_ASSERT +#define BOOST_HAS_VARIADIC_TMPL #else -# define BOOST_NO_DECLTYPE -# define BOOST_NO_FUNCTION_TEMPLATE_DEFAULT_ARGS -# define BOOST_NO_RVALUE_REFERENCES -# define BOOST_NO_STATIC_ASSERT +#define BOOST_NO_DECLTYPE +#define BOOST_NO_FUNCTION_TEMPLATE_DEFAULT_ARGS +#define BOOST_NO_RVALUE_REFERENCES +#define BOOST_NO_STATIC_ASSERT -// Variadic templates compiler: +// Variadic templates compiler: // http://www.generic-programming.org/~dgregor/cpp/variadic-templates.html -# ifdef __VARIADIC_TEMPLATES -# define BOOST_HAS_VARIADIC_TMPL -# else -# define BOOST_NO_VARIADIC_TEMPLATES -# endif +#ifdef __VARIADIC_TEMPLATES +#define BOOST_HAS_VARIADIC_TMPL +#else +#define BOOST_NO_VARIADIC_TEMPLATES +#endif #endif // C++0x features in 4.4.n and later // #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4) || !defined(__GXX_EXPERIMENTAL_CXX0X__) -# define BOOST_NO_AUTO_DECLARATIONS -# define BOOST_NO_AUTO_MULTIDECLARATIONS -# define BOOST_NO_CHAR16_T -# define BOOST_NO_CHAR32_T -# define BOOST_NO_DEFAULTED_FUNCTIONS -# define BOOST_NO_DELETED_FUNCTIONS -# define BOOST_NO_INITIALIZER_LISTS -# define BOOST_NO_SCOPED_ENUMS +#define BOOST_NO_AUTO_DECLARATIONS +#define BOOST_NO_AUTO_MULTIDECLARATIONS +#define BOOST_NO_CHAR16_T +#define BOOST_NO_CHAR32_T +#define BOOST_NO_DEFAULTED_FUNCTIONS +#define BOOST_NO_DELETED_FUNCTIONS +#define BOOST_NO_INITIALIZER_LISTS +#define BOOST_NO_SCOPED_ENUMS #endif #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4) -# define BOOST_NO_SFINAE_EXPR +#define BOOST_NO_SFINAE_EXPR #endif // C++0x features in 4.4.1 and later // -#if (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__ < 40401) || !defined(__GXX_EXPERIMENTAL_CXX0X__) +#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ < 40401) || !defined(__GXX_EXPERIMENTAL_CXX0X__) // scoped enums have a serious bug in 4.4.0, so define BOOST_NO_SCOPED_ENUMS before 4.4.1 // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38064 -# define BOOST_NO_SCOPED_ENUMS +#define BOOST_NO_SCOPED_ENUMS #endif // C++0x features in 4.5.n and later // #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5) || !defined(__GXX_EXPERIMENTAL_CXX0X__) -# define BOOST_NO_EXPLICIT_CONVERSION_OPERATORS +#define BOOST_NO_EXPLICIT_CONVERSION_OPERATORS #endif // ConceptGCC compiler: // http://www.generic-programming.org/software/ConceptGCC/ #ifdef __GXX_CONCEPTS__ -# define 
BOOST_HAS_CONCEPTS -# define BOOST_COMPILER "ConceptGCC version " __VERSION__ +#define BOOST_HAS_CONCEPTS +#define BOOST_COMPILER "ConceptGCC version " __VERSION__ #else -# define BOOST_NO_CONCEPTS +#define BOOST_NO_CONCEPTS #endif #ifndef BOOST_COMPILER -# define BOOST_COMPILER "GNU C++ version " __VERSION__ +#define BOOST_COMPILER "GNU C++ version " __VERSION__ #endif // // versions check: // we don't know gcc prior to version 2.90: #if (__GNUC__ == 2) && (__GNUC_MINOR__ < 90) -# error "Compiler not configured - please reconfigure" +#error "Compiler not configured - please reconfigure" #endif // // last known and checked version is 4.4 (Pre-release): #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 4)) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -# else +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +#else // we don't emit warnings here anymore since there are no defect macros defined for // gcc post 3.4, so any failures are gcc regressions... -//# warning "Unknown compiler version - please run the configure tests and report the results" -# endif +// # warning "Unknown compiler version - please run the configure tests and report the results" +#endif #endif - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc_xml.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc_xml.hpp index 5dd67c76..a6dafab0 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc_xml.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/gcc_xml.hpp @@ -1,15 +1,15 @@ -// (C) Copyright John Maddock 2006. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2006. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. // GCC-XML C++ compiler setup: -# if !defined(__GCCXML_GNUC__) || ((__GCCXML_GNUC__ <= 3) && (__GCCXML_GNUC_MINOR__ <= 3)) -# define BOOST_NO_IS_ABSTRACT -# endif +#if !defined(__GCCXML_GNUC__) || ((__GCCXML_GNUC__ <= 3) && (__GCCXML_GNUC_MINOR__ <= 3)) +#define BOOST_NO_IS_ABSTRACT +#endif // // Threading support: Turn this on unconditionally here (except for @@ -17,8 +17,8 @@ // later if no threading API is detected. // #if !defined(__MINGW32__) && !defined(_MSC_VER) && !defined(linux) && !defined(__linux) && !defined(__linux__) -# define BOOST_HAS_THREADS -#endif +#define BOOST_HAS_THREADS +#endif // // gcc has "long long" @@ -26,5 +26,3 @@ #define BOOST_HAS_LONG_LONG #define BOOST_COMPILER "GCC-XML C++ version " __GCCXML__ - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/greenhills.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/greenhills.hpp index 038b6b2b..c9fc6484 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/greenhills.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/greenhills.hpp @@ -1,6 +1,6 @@ -// (C) Copyright John Maddock 2001. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -15,14 +15,12 @@ // versions check: // we don't support Greenhills prior to version 0: #if __ghs < 0 -# error "Compiler not supported or configured - please reconfigure" +#error "Compiler not supported or configured - please reconfigure" #endif // // last known and checked version is 0: #if (__ghs > 0) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -# endif +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +#endif #endif - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/hp_acc.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/hp_acc.hpp index 98e7772a..4e009508 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/hp_acc.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/hp_acc.hpp @@ -1,11 +1,11 @@ -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Jens Maurer 2001 - 2003. -// (C) Copyright Aleksey Gurtovoy 2002. -// (C) Copyright David Abrahams 2002 - 2003. -// (C) Copyright Toon Knapen 2003. +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Jens Maurer 2001 - 2003. +// (C) Copyright Aleksey Gurtovoy 2002. +// (C) Copyright David Abrahams 2002 - 2003. +// (C) Copyright Toon Knapen 2003. // (C) Copyright Boris Gubenko 2006 - 2007. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. 
@@ -17,42 +17,42 @@ #endif #if (__HP_aCC <= 33100) -# define BOOST_NO_INTEGRAL_INT64_T -# define BOOST_NO_OPERATORS_IN_NAMESPACE -# if !defined(_NAMESPACE_STD) -# define BOOST_NO_STD_LOCALE -# define BOOST_NO_STRINGSTREAM -# endif +#define BOOST_NO_INTEGRAL_INT64_T +#define BOOST_NO_OPERATORS_IN_NAMESPACE +#if !defined(_NAMESPACE_STD) +#define BOOST_NO_STD_LOCALE +#define BOOST_NO_STRINGSTREAM +#endif #endif #if (__HP_aCC <= 33300) // member templates are sufficiently broken that we disable them for now -# define BOOST_NO_MEMBER_TEMPLATES -# define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS -# define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE +#define BOOST_NO_MEMBER_TEMPLATES +#define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS +#define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE #endif #if (__HP_aCC <= 38000) -# define BOOST_NO_TWO_PHASE_NAME_LOOKUP +#define BOOST_NO_TWO_PHASE_NAME_LOOKUP #endif #if (__HP_aCC > 50000) && (__HP_aCC < 60000) -# define BOOST_NO_UNREACHABLE_RETURN_DETECTION -# define BOOST_NO_TEMPLATE_TEMPLATES -# define BOOST_NO_SWPRINTF -# define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS -# define BOOST_NO_IS_ABSTRACT -# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS -#endif +#define BOOST_NO_UNREACHABLE_RETURN_DETECTION +#define BOOST_NO_TEMPLATE_TEMPLATES +#define BOOST_NO_SWPRINTF +#define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS +#define BOOST_NO_IS_ABSTRACT +#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS +#endif // optional features rather than defects: #if (__HP_aCC >= 33900) -# define BOOST_HAS_LONG_LONG -# define BOOST_HAS_PARTIAL_STD_ALLOCATOR +#define BOOST_HAS_LONG_LONG +#define BOOST_HAS_PARTIAL_STD_ALLOCATOR #endif -#if (__HP_aCC >= 50000 ) && (__HP_aCC <= 53800 ) || (__HP_aCC < 31300 ) -# define BOOST_NO_MEMBER_TEMPLATE_KEYWORD +#if (__HP_aCC >= 50000) && (__HP_aCC <= 53800) || (__HP_aCC < 31300) +#define BOOST_NO_MEMBER_TEMPLATE_KEYWORD #endif // This macro should not be defined when compiling in strict ansi @@ -61,7 +61,7 @@ // of aCC6 compiler will provide predefined macros reflecting the // compilation options, including the standard mode. 
#if (__HP_aCC >= 60000) || ((__HP_aCC > 38000) && defined(__hpxstd98)) -# define BOOST_NO_TWO_PHASE_NAME_LOOKUP +#define BOOST_NO_TWO_PHASE_NAME_LOOKUP #endif #define BOOST_COMPILER "HP aCC version " BOOST_STRINGIZE(__HP_aCC) @@ -70,19 +70,19 @@ // versions check: // we don't support HP aCC prior to version 33000: #if __HP_aCC < 33000 -# error "Compiler not supported or configured - please reconfigure" +#error "Compiler not supported or configured - please reconfigure" #endif // // Extended checks for supporting aCC on PA-RISC #if __HP_aCC > 30000 && __HP_aCC < 50000 -# if __HP_aCC < 38000 - // versions prior to version A.03.80 not supported -# error "Compiler version not supported - version A.03.80 or higher is required" -# elif !defined(__hpxstd98) - // must compile using the option +hpxstd98 with version A.03.80 and above -# error "Compiler option '+hpxstd98' is required for proper support" -# endif //PA-RISC +#if __HP_aCC < 38000 +// versions prior to version A.03.80 not supported +#error "Compiler version not supported - version A.03.80 or higher is required" +#elif !defined(__hpxstd98) +// must compile using the option +hpxstd98 with version A.03.80 and above +#error "Compiler option '+hpxstd98' is required for proper support" +#endif // PA-RISC #endif // @@ -121,7 +121,7 @@ // last known and checked version for HP-UX/ia64 is 61300 // last known and checked version for PA-RISC is 38000 #if ((__HP_aCC > 61300) || ((__HP_aCC > 38000) && defined(__hpxstd98))) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -# endif +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +#endif #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/intel.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/intel.hpp index 531242e9..eece4d8f 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/intel.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/intel.hpp @@ -17,32 +17,32 @@ #include "boost/config/compiler/common_edg.hpp" #if defined(__INTEL_COMPILER) -# define BOOST_INTEL_CXX_VERSION __INTEL_COMPILER +#define BOOST_INTEL_CXX_VERSION __INTEL_COMPILER #elif defined(__ICL) -# define BOOST_INTEL_CXX_VERSION __ICL +#define BOOST_INTEL_CXX_VERSION __ICL #elif defined(__ICC) -# define BOOST_INTEL_CXX_VERSION __ICC +#define BOOST_INTEL_CXX_VERSION __ICC #elif defined(__ECC) -# define BOOST_INTEL_CXX_VERSION __ECC +#define BOOST_INTEL_CXX_VERSION __ECC #endif #define BOOST_COMPILER "Intel C++ version " BOOST_STRINGIZE(BOOST_INTEL_CXX_VERSION) -#define BOOST_INTEL BOOST_INTEL_CXX_VERSION +#define BOOST_INTEL BOOST_INTEL_CXX_VERSION #if defined(_WIN32) || defined(_WIN64) -# define BOOST_INTEL_WIN BOOST_INTEL +#define BOOST_INTEL_WIN BOOST_INTEL #else -# define BOOST_INTEL_LINUX BOOST_INTEL +#define BOOST_INTEL_LINUX BOOST_INTEL #endif #if (BOOST_INTEL_CXX_VERSION <= 500) && defined(_MSC_VER) -# define BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS -# define BOOST_NO_TEMPLATE_TEMPLATES +#define BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS +#define BOOST_NO_TEMPLATE_TEMPLATES #endif #if (BOOST_INTEL_CXX_VERSION <= 600) -# if defined(_MSC_VER) && (_MSC_VER <= 1300) // added check for <= VC 7 (Peter Dimov) +#if defined(_MSC_VER) && (_MSC_VER <= 1300) // added check for <= VC 7 (Peter Dimov) // Boost libraries assume strong standard conformance unless otherwise // indicated 
by a config macro. As configured by Intel, the EDG front-end @@ -55,25 +55,25 @@ // Thus BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP will not be defined, even if // the compiler option is not enabled. -# define BOOST_NO_SWPRINTF -# endif +#define BOOST_NO_SWPRINTF +#endif // Void returns, 64 bit integrals don't work when emulating VC 6 (Peter Dimov) -# if defined(_MSC_VER) && (_MSC_VER <= 1200) -# define BOOST_NO_VOID_RETURNS -# define BOOST_NO_INTEGRAL_INT64_T -# endif +#if defined(_MSC_VER) && (_MSC_VER <= 1200) +#define BOOST_NO_VOID_RETURNS +#define BOOST_NO_INTEGRAL_INT64_T +#endif #endif #if (BOOST_INTEL_CXX_VERSION <= 710) && defined(_WIN32) -# define BOOST_NO_POINTER_TO_MEMBER_TEMPLATE_PARAMETERS +#define BOOST_NO_POINTER_TO_MEMBER_TEMPLATE_PARAMETERS #endif // See http://aspn.activestate.com/ASPN/Mail/Message/boost/1614864 #if BOOST_INTEL_CXX_VERSION < 600 -# define BOOST_NO_INTRINSIC_WCHAR_T +#define BOOST_NO_INTRINSIC_WCHAR_T #else // We should test the macro _WCHAR_T_DEFINED to check if the compiler // supports wchar_t natively. *BUT* there is a problem here: the standard @@ -83,9 +83,9 @@ // or not. // Under UNIX, the situation is exactly the same, but the macro _WCHAR_T // is used instead. -# if ((_WCHAR_T_DEFINED + 0) == 0) && ((_WCHAR_T + 0) == 0) -# define BOOST_NO_INTRINSIC_WCHAR_T -# endif +#if ((_WCHAR_T_DEFINED + 0) == 0) && ((_WCHAR_T + 0) == 0) +#define BOOST_NO_INTRINSIC_WCHAR_T +#endif #endif #if defined(__GNUC__) && !defined(BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL) @@ -95,9 +95,9 @@ // later than that if they are set up to emulate gcc 3.2 // or earlier): // -# if ((__GNUC__ == 3) && (__GNUC_MINOR__ <= 2)) || (BOOST_INTEL < 900) || (__INTEL_COMPILER_BUILD_DATE < 20050912) -# define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL -# endif +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ <= 2)) || (BOOST_INTEL < 900) || (__INTEL_COMPILER_BUILD_DATE < 20050912) +#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL +#endif #endif #if (defined(__GNUC__) && (__GNUC__ < 4)) || defined(_WIN32) || (BOOST_INTEL_CXX_VERSION <= 1110) // GCC or VC emulation: @@ -111,63 +111,69 @@ // #if defined(BOOST_NO_INTRINSIC_WCHAR_T) #include -template< typename T > struct assert_no_intrinsic_wchar_t; -template<> struct assert_no_intrinsic_wchar_t<wchar_t> { typedef void type; }; +template <typename T> struct assert_no_intrinsic_wchar_t; +template <> struct assert_no_intrinsic_wchar_t<wchar_t> +{ + typedef void type; +}; // if you see an error here then you need to unset BOOST_NO_INTRINSIC_WCHAR_T // where it is defined above: typedef assert_no_intrinsic_wchar_t<unsigned short>::type assert_no_intrinsic_wchar_t_; #else -template< typename T > struct assert_intrinsic_wchar_t; -template<> struct assert_intrinsic_wchar_t<wchar_t> {}; +template <typename T> struct assert_intrinsic_wchar_t; +template <> struct assert_intrinsic_wchar_t<wchar_t> +{ +}; // if you see an error here then define BOOST_NO_INTRINSIC_WCHAR_T on the command line: -template<> struct assert_intrinsic_wchar_t<unsigned short> {}; +template <> struct assert_intrinsic_wchar_t<unsigned short> +{ +}; #endif -#if _MSC_VER+0 >= 1000 -# if _MSC_VER >= 1200 -# define BOOST_HAS_MS_INT64 -# endif -# define BOOST_NO_SWPRINTF -# define BOOST_NO_TWO_PHASE_NAME_LOOKUP +#if _MSC_VER + 0 >= 1000 +#if _MSC_VER >= 1200 +#define BOOST_HAS_MS_INT64 +#endif +#define BOOST_NO_SWPRINTF +#define BOOST_NO_TWO_PHASE_NAME_LOOKUP #elif defined(_WIN32) -# define BOOST_DISABLE_WIN32 +#define BOOST_DISABLE_WIN32 #endif // I checked version 6.0 build 020312Z, it implements the NRVO. 
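
The intel.hpp hunk above reformats a classic configuration probe: declare a class template but never define it, then define only the specialization for the case you expect, so instantiating anything else is a hard compile error. A minimal sketch of the pattern with simplified names (the real header also probes unsigned short, to catch compilers whose wchar_t is a typedef rather than a distinct type):

    // Primary template is declared but deliberately left incomplete.
    template <typename T>
    struct assert_distinct_wchar_t;

    // Only the expected case is given a definition.
    template <>
    struct assert_distinct_wchar_t<wchar_t> {};

    int main()
    {
        assert_distinct_wchar_t<wchar_t> ok; // compiles: matches the specialization
        // assert_distinct_wchar_t<unsigned short> bad; // error on conforming
        // compilers: the primary template is an incomplete type
        (void)ok;
        return 0;
    }

In intel.hpp's intrinsic-wchar_t branch there are specializations for both wchar_t and unsigned short; if the compiler treats the two as the same type, the duplicate definition is the compile error that tells you to define BOOST_NO_INTRINSIC_WCHAR_T.
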
// Correct this as you find out which version of the compiler
// implemented the NRVO first. (Daniel Frey)
#if (BOOST_INTEL_CXX_VERSION >= 600)
-# define BOOST_HAS_NRVO
+#define BOOST_HAS_NRVO
#endif
//
// versions check:
// we don't support Intel prior to version 5.0:
#if BOOST_INTEL_CXX_VERSION < 500
-# error "Compiler not supported or configured - please reconfigure"
+#error "Compiler not supported or configured - please reconfigure"
#endif
// Intel on MacOS requires
#if defined(__APPLE__) && defined(__INTEL_COMPILER)
-# define BOOST_NO_TWO_PHASE_NAME_LOOKUP
+#define BOOST_NO_TWO_PHASE_NAME_LOOKUP
#endif
// Intel on Altix Itanium
#if defined(__itanium__) && defined(__INTEL_COMPILER)
-# define BOOST_NO_TWO_PHASE_NAME_LOOKUP
+#define BOOST_NO_TWO_PHASE_NAME_LOOKUP
#endif
//
// last known and checked version:
#if (BOOST_INTEL_CXX_VERSION > 1110)
-# if defined(BOOST_ASSERT_CONFIG)
-# error "Unknown compiler version - please run the configure tests and report the results"
-# elif defined(_MSC_VER)
+#if defined(BOOST_ASSERT_CONFIG)
+#error "Unknown compiler version - please run the configure tests and report the results"
+#elif defined(_MSC_VER)
//
// We don't emit this warning any more, since we have so few
// defect macros set anyway (just the one).
//
-//# pragma message("Unknown compiler version - please run the configure tests and report the results")
-# endif
+// # pragma message("Unknown compiler version - please run the configure tests and report the results")
+#endif
#endif
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/kai.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/kai.hpp
index ea06f9f4..fee3db69 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/kai.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/kai.hpp
@@ -1,8 +1,8 @@
-// (C) Copyright John Maddock 2001.
-// (C) Copyright David Abrahams 2002.
-// (C) Copyright Aleksey Gurtovoy 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001.
+// (C) Copyright David Abrahams 2002.
+// (C) Copyright Aleksey Gurtovoy 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
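The intel.hpp hunk above relies on a neat compile-time trick: whether wchar_t is a distinct built-in type is detected by colliding template specializations. The following is a minimal self-contained sketch of that idiom, under hypothetical demo_* names (it is not part of the patch):

    // If wchar_t is a distinct built-in type (the normal case), these are two
    // different specializations and the translation unit compiles. If wchar_t
    // were a typedef for unsigned short, the specializations would collide and
    // compilation would fail - exactly the signal the config header exploits.
    template <typename T> struct demo_intrinsic_wchar_t;
    template <> struct demo_intrinsic_wchar_t<wchar_t> {};
    template <> struct demo_intrinsic_wchar_t<unsigned short> {};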
@@ -11,23 +11,20 @@
#include "boost/config/compiler/common_edg.hpp"
-# if (__KCC_VERSION <= 4001) || !defined(BOOST_STRICT_CONFIG)
- // at least on Sun, the contents of is not in namespace std
-# define BOOST_NO_STDC_NAMESPACE
-# endif
+#if (__KCC_VERSION <= 4001) || !defined(BOOST_STRICT_CONFIG)
+// at least on Sun, the contents of is not in namespace std
+#define BOOST_NO_STDC_NAMESPACE
+#endif
// see also common_edg.hpp which needs a special check for __KCC
-# if !defined(_EXCEPTIONS)
-# define BOOST_NO_EXCEPTIONS
-# endif
+#if !defined(_EXCEPTIONS)
+#define BOOST_NO_EXCEPTIONS
+#endif
//
// last known and checked version is 4001:
#if (__KCC_VERSION > 4001)
-# if defined(BOOST_ASSERT_CONFIG)
-# error "Unknown compiler version - please run the configure tests and report the results"
-# endif
+#if defined(BOOST_ASSERT_CONFIG)
+#error "Unknown compiler version - please run the configure tests and report the results"
+#endif
#endif
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/metrowerks.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/metrowerks.hpp
index aeba7f80..7412539c 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/metrowerks.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/metrowerks.hpp
@@ -1,11 +1,11 @@
-// (C) Copyright John Maddock 2001.
-// (C) Copyright Darin Adler 2001.
-// (C) Copyright Peter Dimov 2001.
-// (C) Copyright David Abrahams 2001 - 2002.
-// (C) Copyright Beman Dawes 2001 - 2003.
-// (C) Copyright Stefan Slapeta 2004.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001.
+// (C) Copyright Darin Adler 2001.
+// (C) Copyright Peter Dimov 2001.
+// (C) Copyright David Abrahams 2001 - 2002.
+// (C) Copyright Beman Dawes 2001 - 2003.
+// (C) Copyright Stefan Slapeta 2004.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
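kai.hpp above defines BOOST_NO_EXCEPTIONS when the compiler is built without _EXCEPTIONS. As a hedged sketch of how such a macro is typically consumed downstream (DEMO_NO_EXCEPTIONS is a hypothetical stand-in, not a macro from this patch):

    #include <cstdlib>

    void demo_fail()
    {
    #ifndef DEMO_NO_EXCEPTIONS
        throw 42; // exception support compiled in: raise
    #else
        std::abort(); // no EH support: terminate instead
    #endif
    }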
@@ -13,73 +13,73 @@
// Metrowerks C++ compiler setup:
// locale support is disabled when linking with the dynamic runtime
-# ifdef _MSL_NO_LOCALE
-# define BOOST_NO_STD_LOCALE
-# endif
+#ifdef _MSL_NO_LOCALE
+#define BOOST_NO_STD_LOCALE
+#endif
-# if __MWERKS__ <= 0x2301 // 5.3
-# define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
-# define BOOST_NO_POINTER_TO_MEMBER_CONST
-# define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
-# define BOOST_NO_MEMBER_TEMPLATE_KEYWORD
-# endif
+#if __MWERKS__ <= 0x2301 // 5.3
+#define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+#define BOOST_NO_POINTER_TO_MEMBER_CONST
+#define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
+#define BOOST_NO_MEMBER_TEMPLATE_KEYWORD
+#endif
-# if __MWERKS__ <= 0x2401 // 6.2
-//# define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
-# endif
+#if __MWERKS__ <= 0x2401 // 6.2
+// # define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+#endif
-# if(__MWERKS__ <= 0x2407) // 7.x
-# define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS
-# define BOOST_NO_UNREACHABLE_RETURN_DETECTION
-# endif
+#if (__MWERKS__ <= 0x2407) // 7.x
+#define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS
+#define BOOST_NO_UNREACHABLE_RETURN_DETECTION
+#endif
-# if(__MWERKS__ <= 0x3003) // 8.x
-# define BOOST_NO_SFINAE
-# endif
+#if (__MWERKS__ <= 0x3003) // 8.x
+#define BOOST_NO_SFINAE
+#endif
// the "|| !defined(BOOST_STRICT_CONFIG)" part should apply to the last
// tested version *only*:
-# if(__MWERKS__ <= 0x3207) || !defined(BOOST_STRICT_CONFIG) // 9.6
-# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
-# define BOOST_NO_IS_ABSTRACT
-# endif
+#if (__MWERKS__ <= 0x3207) || !defined(BOOST_STRICT_CONFIG) // 9.6
+#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
+#define BOOST_NO_IS_ABSTRACT
+#endif
#if !__option(wchar_type)
-# define BOOST_NO_INTRINSIC_WCHAR_T
+#define BOOST_NO_INTRINSIC_WCHAR_T
#endif
#if !__option(exceptions)
-# define BOOST_NO_EXCEPTIONS
+#define BOOST_NO_EXCEPTIONS
#endif
#if (__INTEL__ && _WIN32) || (__POWERPC__ && macintosh)
-# if __MWERKS__ == 0x3000
-# define BOOST_COMPILER_VERSION 8.0
-# elif __MWERKS__ == 0x3001
-# define BOOST_COMPILER_VERSION 8.1
-# elif __MWERKS__ == 0x3002
-# define BOOST_COMPILER_VERSION 8.2
-# elif __MWERKS__ == 0x3003
-# define BOOST_COMPILER_VERSION 8.3
-# elif __MWERKS__ == 0x3200
-# define BOOST_COMPILER_VERSION 9.0
-# elif __MWERKS__ == 0x3201
-# define BOOST_COMPILER_VERSION 9.1
-# elif __MWERKS__ == 0x3202
-# define BOOST_COMPILER_VERSION 9.2
-# elif __MWERKS__ == 0x3204
-# define BOOST_COMPILER_VERSION 9.3
-# elif __MWERKS__ == 0x3205
-# define BOOST_COMPILER_VERSION 9.4
-# elif __MWERKS__ == 0x3206
-# define BOOST_COMPILER_VERSION 9.5
-# elif __MWERKS__ == 0x3207
-# define BOOST_COMPILER_VERSION 9.6
-# else
-# define BOOST_COMPILER_VERSION __MWERKS__
-# endif
+#if __MWERKS__ == 0x3000
+#define BOOST_COMPILER_VERSION 8.0
+#elif __MWERKS__ == 0x3001
+#define BOOST_COMPILER_VERSION 8.1
+#elif __MWERKS__ == 0x3002
+#define BOOST_COMPILER_VERSION 8.2
+#elif __MWERKS__ == 0x3003
+#define BOOST_COMPILER_VERSION 8.3
+#elif __MWERKS__ == 0x3200
+#define BOOST_COMPILER_VERSION 9.0
+#elif __MWERKS__ == 0x3201
+#define BOOST_COMPILER_VERSION 9.1
+#elif __MWERKS__ == 0x3202
+#define BOOST_COMPILER_VERSION 9.2
+#elif __MWERKS__ == 0x3204
+#define BOOST_COMPILER_VERSION 9.3
+#elif __MWERKS__ == 0x3205
+#define BOOST_COMPILER_VERSION 9.4
+#elif __MWERKS__ == 0x3206
+#define BOOST_COMPILER_VERSION 9.5
+#elif __MWERKS__ == 0x3207
+#define BOOST_COMPILER_VERSION 9.6
#else
-# define BOOST_COMPILER_VERSION __MWERKS__
+#endif
+#else
+#define BOOST_COMPILER_VERSION __MWERKS__
#endif
//
@@ -88,9 +88,9 @@
// See boost\config\suffix.hpp for BOOST_NO_LONG_LONG
//
#if __MWERKS__ > 0x3206 && __option(rvalue_refs)
-# define BOOST_HAS_RVALUE_REFS
+#define BOOST_HAS_RVALUE_REFS
#else
-# define BOOST_NO_RVALUE_REFERENCES
+#define BOOST_NO_RVALUE_REFERENCES
#endif
#define BOOST_NO_AUTO_DECLARATIONS
#define BOOST_NO_AUTO_MULTIDECLARATIONS
@@ -121,19 +121,12 @@
// versions check:
// we don't support Metrowerks prior to version 5.3:
#if __MWERKS__ < 0x2301
-# error "Compiler not supported or configured - please reconfigure"
+#error "Compiler not supported or configured - please reconfigure"
#endif
//
// last known and checked version:
#if (__MWERKS__ > 0x3205)
-# if defined(BOOST_ASSERT_CONFIG)
-# error "Unknown compiler version - please run the configure tests and report the results"
-# endif
+#if defined(BOOST_ASSERT_CONFIG)
+#error "Unknown compiler version - please run the configure tests and report the results"
+#endif
#endif
-
-
-
-
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/mpw.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/mpw.hpp
index 4db14dde..21ec2241 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/mpw.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/mpw.hpp
@@ -1,37 +1,37 @@
-// (C) Copyright John Maddock 2001 - 2002.
-// (C) Copyright Aleksey Gurtovoy 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2002.
+// (C) Copyright Aleksey Gurtovoy 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
// MPW C++ compilers setup:
-# if defined(__SC__)
-# define BOOST_COMPILER "MPW SCpp version " BOOST_STRINGIZE(__SC__)
-# elif defined(__MRC__)
-# define BOOST_COMPILER "MPW MrCpp version " BOOST_STRINGIZE(__MRC__)
-# else
-# error "Using MPW compiler configuration by mistake. Please update."
-# endif
+#if defined(__SC__)
+#define BOOST_COMPILER "MPW SCpp version " BOOST_STRINGIZE(__SC__)
+#elif defined(__MRC__)
+#define BOOST_COMPILER "MPW MrCpp version " BOOST_STRINGIZE(__MRC__)
+#else
+#error "Using MPW compiler configuration by mistake. Please update."
+#endif
//
// MPW 8.90:
//
#if (MPW_CPLUS <= 0x890) || !defined(BOOST_STRICT_CONFIG)
-# define BOOST_NO_CV_SPECIALIZATIONS
-# define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS
-# define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
-# define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
-# define BOOST_NO_INTRINSIC_WCHAR_T
-# define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
-# define BOOST_NO_USING_TEMPLATE
+#define BOOST_NO_CV_SPECIALIZATIONS
+#define BOOST_NO_DEPENDENT_NESTED_DERIVATIONS
+#define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
+#define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
+#define BOOST_NO_INTRINSIC_WCHAR_T
+#define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
+#define BOOST_NO_USING_TEMPLATE
-# define BOOST_NO_CWCHAR
-# define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS
+#define BOOST_NO_CWCHAR
+#define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS
-# define BOOST_NO_STD_ALLOCATOR /* actually a bug with const reference overloading */
+#define BOOST_NO_STD_ALLOCATOR /* actually a bug with const reference overloading */
#endif
@@ -68,14 +68,12 @@
// versions check:
// we don't support MPW prior to version 8.9:
#if MPW_CPLUS < 0x890
-# error "Compiler not supported or configured - please reconfigure"
+#error "Compiler not supported or configured - please reconfigure"
#endif
//
// last known and checked version is 0x890:
#if (MPW_CPLUS > 0x890)
-# if defined(BOOST_ASSERT_CONFIG)
-# error "Unknown compiler version - please run the configure tests and report the results"
-# endif
+#if defined(BOOST_ASSERT_CONFIG)
+#error "Unknown compiler version - please run the configure tests and report the results"
+#endif
#endif
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/pgi.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/pgi.hpp
index e40553ef..64225a02 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/pgi.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/pgi.hpp
@@ -1,6 +1,6 @@
// (C) Copyright Noel Belcourt 2007.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -8,7 +8,7 @@
// PGI C++ compiler setup:
#define BOOST_COMPILER_VERSION __PGIC__##__PGIC_MINOR__
-#define BOOST_COMPILER "PGI compiler version " BOOST_STRINGIZE(_COMPILER_VERSION)
+#define BOOST_COMPILER "PGI compiler version " BOOST_STRINGIZE(_COMPILER_VERSION)
//
// Threading support:
@@ -18,13 +18,13 @@
#if (__PGIC__ >= 7)
-#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL
+#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL
#define BOOST_NO_TWO_PHASE_NAME_LOOKUP
#define BOOST_NO_SWPRINTF
#else
-# error "Pgi compiler not configured - please reconfigure"
+#error "Pgi compiler not configured - please reconfigure"
#endif
//
@@ -59,4 +59,3 @@
//
// version check:
// probably nothing to do here?
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sgi_mipspro.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sgi_mipspro.hpp
index 90688314..fd85fc27 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sgi_mipspro.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sgi_mipspro.hpp
@@ -1,6 +1,6 @@
-// (C) Copyright John Maddock 2001 - 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -25,5 +25,3 @@
//
// version check:
// probably nothing to do here?
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sunpro_cc.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sunpro_cc.hpp
index f5184887..50914237 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sunpro_cc.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/sunpro_cc.hpp
@@ -1,73 +1,73 @@
-// (C) Copyright John Maddock 2001.
-// (C) Copyright Jens Maurer 2001 - 2003.
-// (C) Copyright Peter Dimov 2002.
-// (C) Copyright Aleksey Gurtovoy 2002 - 2003.
-// (C) Copyright David Abrahams 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001.
+// (C) Copyright Jens Maurer 2001 - 2003.
+// (C) Copyright Peter Dimov 2002.
+// (C) Copyright Aleksey Gurtovoy 2002 - 2003.
+// (C) Copyright David Abrahams 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
// Sun C++ compiler setup:
-# if __SUNPRO_CC <= 0x500
-# define BOOST_NO_MEMBER_TEMPLATES
-# define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
-# endif
+#if __SUNPRO_CC <= 0x500
+#define BOOST_NO_MEMBER_TEMPLATES
+#define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+#endif
-# if (__SUNPRO_CC <= 0x520)
- //
- // Sunpro 5.2 and earler:
- //
- // although sunpro 5.2 supports the syntax for
- // inline initialization it often gets the value
- // wrong, especially where the value is computed
- // from other constants (J Maddock 6th May 2001)
-# define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
+#if (__SUNPRO_CC <= 0x520)
+//
+// Sunpro 5.2 and earler:
+//
+// although sunpro 5.2 supports the syntax for
+// inline initialization it often gets the value
+// wrong, especially where the value is computed
+// from other constants (J Maddock 6th May 2001)
+#define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
- // Although sunpro 5.2 supports the syntax for
- // partial specialization, it often seems to
- // bind to the wrong specialization. Better
- // to disable it until suppport becomes more stable
- // (J Maddock 6th May 2001).
-# define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
-# endif
+// Although sunpro 5.2 supports the syntax for
+// partial specialization, it often seems to
+// bind to the wrong specialization. Better
+// to disable it until suppport becomes more stable
+// (J Maddock 6th May 2001).
+#define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
+#endif
-# if (__SUNPRO_CC <= 0x530)
- // Requesting debug info (-g) with Boost.Python results
- // in an internal compiler error for "static const"
- // initialized in-class.
- // >> Assertion: (../links/dbg_cstabs.cc, line 611)
- // while processing ../test.cpp at line 0.
- // (Jens Maurer according to Gottfried Ganssauge 04 Mar 2002)
-# define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
+#if (__SUNPRO_CC <= 0x530)
+// Requesting debug info (-g) with Boost.Python results
+// in an internal compiler error for "static const"
+// initialized in-class.
+// >> Assertion: (../links/dbg_cstabs.cc, line 611)
+// while processing ../test.cpp at line 0.
+// (Jens Maurer according to Gottfried Ganssauge 04 Mar 2002)
+#define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
- // SunPro 5.3 has better support for partial specialization,
- // but breaks when compiling std::less >
- // (Jens Maurer 4 Nov 2001).
+// SunPro 5.3 has better support for partial specialization,
+// but breaks when compiling std::less >
+// (Jens Maurer 4 Nov 2001).
- // std::less specialization fixed as reported by George
- // Heintzelman; partial specialization re-enabled
- // (Peter Dimov 17 Jan 2002)
+// std::less specialization fixed as reported by George
+// Heintzelman; partial specialization re-enabled
+// (Peter Dimov 17 Jan 2002)
-//# define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
+// # define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
- // integral constant expressions with 64 bit numbers fail
-# define BOOST_NO_INTEGRAL_INT64_T
-# endif
+// integral constant expressions with 64 bit numbers fail
+#define BOOST_NO_INTEGRAL_INT64_T
+#endif
-# if (__SUNPRO_CC < 0x570)
-# define BOOST_NO_TEMPLATE_TEMPLATES
- // see http://lists.boost.org/MailArchives/boost/msg47184.php
- // and http://lists.boost.org/MailArchives/boost/msg47220.php
-# define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
-# define BOOST_NO_SFINAE
-# define BOOST_NO_ARRAY_TYPE_SPECIALIZATIONS
-# endif
-# if (__SUNPRO_CC <= 0x580)
-# define BOOST_NO_IS_ABSTRACT
-# endif
+#if (__SUNPRO_CC < 0x570)
+#define BOOST_NO_TEMPLATE_TEMPLATES
+// see http://lists.boost.org/MailArchives/boost/msg47184.php
+// and http://lists.boost.org/MailArchives/boost/msg47220.php
+#define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
+#define BOOST_NO_SFINAE
+#define BOOST_NO_ARRAY_TYPE_SPECIALIZATIONS
+#endif
+#if (__SUNPRO_CC <= 0x580)
+#define BOOST_NO_IS_ABSTRACT
+#endif
//
// Issues that effect all known versions:
@@ -79,10 +79,10 @@
// C++0x features
//
-#if(__SUNPRO_CC >= 0x590)
-# define BOOST_HAS_LONG_LONG
+#if (__SUNPRO_CC >= 0x590)
+#define BOOST_HAS_LONG_LONG
#else
-# define BOOST_NO_LONG_LONG
+#define BOOST_NO_LONG_LONG
#endif
#define BOOST_NO_AUTO_DECLARATIONS
@@ -124,7 +124,7 @@
//
// last known and checked version is 0x590:
#if (__SUNPRO_CC > 0x590)
-# if defined(BOOST_ASSERT_CONFIG)
-# error "Unknown compiler version - please run the configure tests and report the results"
-# endif
+#if defined(BOOST_ASSERT_CONFIG)
+#error "Unknown compiler version - please run the configure tests and report the results"
+#endif
#endif
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/vacpp.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/vacpp.hpp
index 01956d3a..896c97d1 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/vacpp.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/vacpp.hpp
@@ -1,10 +1,10 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Toon Knapen 2001 - 2003.
-// (C) Copyright Lie-Quan Lee 2001.
-// (C) Copyright Markus Schoepflin 2002 - 2003.
-// (C) Copyright Beman Dawes 2002 - 2003.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Toon Knapen 2001 - 2003.
+// (C) Copyright Lie-Quan Lee 2001.
+// (C) Copyright Markus Schoepflin 2002 - 2003.
+// (C) Copyright Beman Dawes 2002 - 2003.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -12,29 +12,29 @@
// Visual Age (IBM) C++ compiler setup:
#if __IBMCPP__ <= 501
-# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
-# define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS
+#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
+#define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS
#endif
-#if (__IBMCPP__ <= 502)
+#if (__IBMCPP__ <= 502)
// Actually the compiler supports inclass member initialization but it
// requires a definition for the class member and it doesn't recognize
// it as an integral constant expression when used as a template argument.
-# define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
-# define BOOST_NO_INTEGRAL_INT64_T
-# define BOOST_NO_MEMBER_TEMPLATE_KEYWORD
+#define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
+#define BOOST_NO_INTEGRAL_INT64_T
+#define BOOST_NO_MEMBER_TEMPLATE_KEYWORD
#endif
#if (__IBMCPP__ <= 600) || !defined(BOOST_STRICT_CONFIG)
-# define BOOST_NO_POINTER_TO_MEMBER_TEMPLATE_PARAMETERS
-# define BOOST_NO_INITIALIZER_LISTS
+#define BOOST_NO_POINTER_TO_MEMBER_TEMPLATE_PARAMETERS
+#define BOOST_NO_INITIALIZER_LISTS
#endif
//
// On AIX thread support seems to be indicated by _THREAD_SAFE:
//
#ifdef _THREAD_SAFE
-# define BOOST_HAS_THREADS
+#define BOOST_HAS_THREADS
#endif
#define BOOST_COMPILER "IBM Visual Age version " BOOST_STRINGIZE(__IBMCPP__)
@@ -48,9 +48,9 @@
//
// last known and checked version is 600:
#if (__IBMCPP__ > 1010)
-# if defined(BOOST_ASSERT_CONFIG)
-# error "Unknown compiler version - please run the configure tests and report the results"
-# endif
+#if defined(BOOST_ASSERT_CONFIG)
+#error "Unknown compiler version - please run the configure tests and report the results"
+#endif
#endif
// Some versions of the compiler have issues with default arguments on partial specializations
@@ -83,6 +83,3 @@
#define BOOST_NO_TEMPLATE_ALIASES
#define BOOST_NO_UNICODE_LITERALS
#define BOOST_NO_VARIADIC_TEMPLATES
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/visualc.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/visualc.hpp
index 990901f0..6134b5c0 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/visualc.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/compiler/visualc.hpp
@@ -1,11 +1,11 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Darin Adler 2001 - 2002.
-// (C) Copyright Peter Dimov 2001.
-// (C) Copyright Aleksey Gurtovoy 2002.
-// (C) Copyright David Abrahams 2002 - 2003.
-// (C) Copyright Beman Dawes 2002 - 2003.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Darin Adler 2001 - 2002.
+// (C) Copyright Peter Dimov 2001.
+// (C) Copyright Aleksey Gurtovoy 2002.
+// (C) Copyright David Abrahams 2002 - 2003.
+// (C) Copyright Beman Dawes 2002 - 2003.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -15,133 +15,133 @@
#define BOOST_MSVC _MSC_VER
#if _MSC_FULL_VER > 100000000
-# define BOOST_MSVC_FULL_VER _MSC_FULL_VER
+#define BOOST_MSVC_FULL_VER _MSC_FULL_VER
#else
-# define BOOST_MSVC_FULL_VER (_MSC_FULL_VER * 10)
+#define BOOST_MSVC_FULL_VER (_MSC_FULL_VER * 10)
#endif
// turn off the warnings before we #include anything
-#pragma warning( disable : 4503 ) // warning: decorated name length exceeded
+#pragma warning(disable : 4503) // warning: decorated name length exceeded
-#if _MSC_VER < 1300 // 1200 == VC++ 6.0, 1200-1202 == eVC++4
-# pragma warning( disable : 4786 ) // ident trunc to '255' chars in debug info
-# define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
-# define BOOST_NO_VOID_RETURNS
-# define BOOST_NO_EXCEPTION_STD_NAMESPACE
+#if _MSC_VER < 1300 // 1200 == VC++ 6.0, 1200-1202 == eVC++4
+#pragma warning(disable : 4786) // ident trunc to '255' chars in debug info
+#define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
+#define BOOST_NO_VOID_RETURNS
+#define BOOST_NO_EXCEPTION_STD_NAMESPACE
-# if BOOST_MSVC == 1202
-# define BOOST_NO_STD_TYPEINFO
-# endif
-
- // disable min/max macro defines on vc6:
- //
+#if BOOST_MSVC == 1202
+#define BOOST_NO_STD_TYPEINFO
#endif
-#if (_MSC_VER <= 1300) // 1300 == VC++ 7.0
+// disable min/max macro defines on vc6:
+//
+#endif
-# if !defined(_MSC_EXTENSIONS) && !defined(BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS) // VC7 bug with /Za
-# define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
-# endif
+#if (_MSC_VER <= 1300) // 1300 == VC++ 7.0
-# define BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS
-# define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
-# define BOOST_NO_PRIVATE_IN_AGGREGATE
-# define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP
-# define BOOST_NO_INTEGRAL_INT64_T
-# define BOOST_NO_DEDUCED_TYPENAME
-# define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE
+#if !defined(_MSC_EXTENSIONS) && !defined(BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS) // VC7 bug with /Za
+#define BOOST_NO_DEPENDENT_TYPES_IN_TEMPLATE_VALUE_PARAMETERS
+#endif
+
+#define BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS
+#define BOOST_NO_INCLASS_MEMBER_INITIALIZATION
+#define BOOST_NO_PRIVATE_IN_AGGREGATE
+#define BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP
+#define BOOST_NO_INTEGRAL_INT64_T
+#define BOOST_NO_DEDUCED_TYPENAME
+#define BOOST_NO_USING_DECLARATION_OVERLOADS_FROM_TYPENAME_BASE
// VC++ 6/7 has member templates but they have numerous problems including
// cases of silent failure, so for safety we define:
-# define BOOST_NO_MEMBER_TEMPLATES
+#define BOOST_NO_MEMBER_TEMPLATES
// For VC++ experts wishing to attempt workarounds, we define:
-# define BOOST_MSVC6_MEMBER_TEMPLATES
+#define BOOST_MSVC6_MEMBER_TEMPLATES
-# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
-# define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
-# define BOOST_NO_CV_VOID_SPECIALIZATIONS
-# define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
-# define BOOST_NO_USING_TEMPLATE
-# define BOOST_NO_SWPRINTF
-# define BOOST_NO_TEMPLATE_TEMPLATES
-# define BOOST_NO_SFINAE
-# define BOOST_NO_POINTER_TO_MEMBER_TEMPLATE_PARAMETERS
-# define BOOST_NO_IS_ABSTRACT
-# define BOOST_NO_FUNCTION_TYPE_SPECIALIZATIONS
+#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
+#define BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION
+#define BOOST_NO_CV_VOID_SPECIALIZATIONS
+#define BOOST_NO_FUNCTION_TEMPLATE_ORDERING
+#define BOOST_NO_USING_TEMPLATE
+#define BOOST_NO_SWPRINTF
+#define BOOST_NO_TEMPLATE_TEMPLATES
+#define BOOST_NO_SFINAE
+#define BOOST_NO_POINTER_TO_MEMBER_TEMPLATE_PARAMETERS
+#define BOOST_NO_IS_ABSTRACT
+#define BOOST_NO_FUNCTION_TYPE_SPECIALIZATIONS
// TODO: what version is meant here? Have there really been any fixes in cl 12.01 (as e.g. shipped with eVC4)?
-# if (_MSC_VER > 1200)
-# define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS
-# endif
+#if (_MSC_VER > 1200)
+#define BOOST_NO_MEMBER_FUNCTION_SPECIALIZATIONS
+#endif
#endif
-#if _MSC_VER < 1400
+#if _MSC_VER < 1400
// although a conforming signature for swprint exists in VC7.1
// it appears not to actually work:
-# define BOOST_NO_SWPRINTF
+#define BOOST_NO_SWPRINTF
#endif
#if defined(UNDER_CE)
// Windows CE does not have a conforming signature for swprintf
-# define BOOST_NO_SWPRINTF
+#define BOOST_NO_SWPRINTF
#endif
-#if _MSC_VER <= 1400 // 1400 == VC++ 8.0
-# define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
+#if _MSC_VER <= 1400 // 1400 == VC++ 8.0
+#define BOOST_NO_MEMBER_TEMPLATE_FRIENDS
#endif
-#if _MSC_VER <= 1600 // 1600 == VC++ 10.0
-# define BOOST_NO_TWO_PHASE_NAME_LOOKUP
+#if _MSC_VER <= 1600 // 1600 == VC++ 10.0
+#define BOOST_NO_TWO_PHASE_NAME_LOOKUP
#endif
-#if _MSC_VER == 1500 // 1500 == VC++ 9.0
- // A bug in VC9:
-# define BOOST_NO_ADL_BARRIER
+#if _MSC_VER == 1500 // 1500 == VC++ 9.0
+ // A bug in VC9:
+#define BOOST_NO_ADL_BARRIER
#endif
-#if _MSC_VER <= 1500 || !defined(BOOST_STRICT_CONFIG) // 1500 == VC++ 9.0
-# define BOOST_NO_INITIALIZER_LISTS
+#if _MSC_VER <= 1500 || !defined(BOOST_STRICT_CONFIG) // 1500 == VC++ 9.0
+#define BOOST_NO_INITIALIZER_LISTS
#endif
#ifndef _NATIVE_WCHAR_T_DEFINED
-# define BOOST_NO_INTRINSIC_WCHAR_T
+#define BOOST_NO_INTRINSIC_WCHAR_T
#endif
#if defined(_WIN32_WCE) || defined(UNDER_CE)
-# define BOOST_NO_THREADEX
-# define BOOST_NO_GETSYSTEMTIMEASFILETIME
-# define BOOST_NO_SWPRINTF
+#define BOOST_NO_THREADEX
+#define BOOST_NO_GETSYSTEMTIMEASFILETIME
+#define BOOST_NO_SWPRINTF
#endif
-//
-// check for exception handling support:
#ifndef _CPPUNWIND
-# define BOOST_NO_EXCEPTIONS
-#endif
+//
+// check for exception handling support:
#ifndef _CPPUNWIND
+#define BOOST_NO_EXCEPTIONS
+#endif
//
// __int64 support:
//
#if (_MSC_VER >= 1200)
-# define BOOST_HAS_MS_INT64
+#define BOOST_HAS_MS_INT64
#endif
#if (_MSC_VER >= 1310) && (defined(_MSC_EXTENSIONS) || (_MSC_VER >= 1500))
-# define BOOST_HAS_LONG_LONG
+#define BOOST_HAS_LONG_LONG
#else
-# define BOOST_NO_LONG_LONG
+#define BOOST_NO_LONG_LONG
#endif
#if (_MSC_VER >= 1400) && !defined(_DEBUG)
-# define BOOST_HAS_NRVO
+#define BOOST_HAS_NRVO
#endif
//
// disable Win32 API's if compiler extentions are
// turned off:
//
#if !defined(_MSC_EXTENSIONS) && !defined(BOOST_DISABLE_WIN32)
-# define BOOST_DISABLE_WIN32
+#define BOOST_DISABLE_WIN32
#endif
#if !defined(_CPPRTTI) && !defined(BOOST_NO_RTTI)
-# define BOOST_NO_RTTI
+#define BOOST_NO_RTTI
#endif
//
@@ -188,56 +188,56 @@
// prefix and suffix headers:
//
#ifndef BOOST_ABI_PREFIX
-# define BOOST_ABI_PREFIX "boost/config/abi/msvc_prefix.hpp"
+#define BOOST_ABI_PREFIX "boost/config/abi/msvc_prefix.hpp"
#endif
#ifndef BOOST_ABI_SUFFIX
-# define BOOST_ABI_SUFFIX "boost/config/abi/msvc_suffix.hpp"
+#define BOOST_ABI_SUFFIX "boost/config/abi/msvc_suffix.hpp"
#endif
// TODO:
-// these things are mostly bogus. 1200 means version 12.0 of the compiler. The
The +// these things are mostly bogus. 1200 means version 12.0 of the compiler. The // artificial versions assigned to them only refer to the versions of some IDE // these compilers have been shipped with, and even that is not all of it. Some // were shipped with freely downloadable SDKs, others as crosscompilers in eVC. // IOW, you can't use these 'versions' in any sensible way. Sorry. -# if defined(UNDER_CE) -# if _MSC_VER < 1200 - // Note: these are so far off, they are not really supported -# elif _MSC_VER < 1300 // eVC++ 4 comes with 1200-1202 -# define BOOST_COMPILER_VERSION evc4.0 -# elif _MSC_VER == 1400 -# define BOOST_COMPILER_VERSION evc8 -# elif _MSC_VER == 1500 -# define BOOST_COMPILER_VERSION evc9 -# elif _MSC_VER == 1600 -# define BOOST_COMPILER_VERSION evc10 -# else -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown EVC++ compiler version - please run the configure tests and report the results" -# else -# pragma message("Unknown EVC++ compiler version - please run the configure tests and report the results") -# endif -# endif -# else -# if _MSC_VER < 1200 - // Note: these are so far off, they are not really supported -# define BOOST_COMPILER_VERSION 5.0 -# elif _MSC_VER < 1300 -# define BOOST_COMPILER_VERSION 6.0 -# elif _MSC_VER == 1300 -# define BOOST_COMPILER_VERSION 7.0 -# elif _MSC_VER == 1310 -# define BOOST_COMPILER_VERSION 7.1 -# elif _MSC_VER == 1400 -# define BOOST_COMPILER_VERSION 8.0 -# elif _MSC_VER == 1500 -# define BOOST_COMPILER_VERSION 9.0 -# elif _MSC_VER == 1600 -# define BOOST_COMPILER_VERSION 10.0 -# else -# define BOOST_COMPILER_VERSION _MSC_VER -# endif -# endif +#if defined(UNDER_CE) +#if _MSC_VER < 1200 +// Note: these are so far off, they are not really supported +#elif _MSC_VER < 1300 // eVC++ 4 comes with 1200-1202 +#define BOOST_COMPILER_VERSION evc4.0 +#elif _MSC_VER == 1400 +#define BOOST_COMPILER_VERSION evc8 +#elif _MSC_VER == 1500 +#define BOOST_COMPILER_VERSION evc9 +#elif _MSC_VER == 1600 +#define BOOST_COMPILER_VERSION evc10 +#else +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown EVC++ compiler version - please run the configure tests and report the results" +#else +#pragma message("Unknown EVC++ compiler version - please run the configure tests and report the results") +#endif +#endif +#else +#if _MSC_VER < 1200 +// Note: these are so far off, they are not really supported +#define BOOST_COMPILER_VERSION 5.0 +#elif _MSC_VER < 1300 +#define BOOST_COMPILER_VERSION 6.0 +#elif _MSC_VER == 1300 +#define BOOST_COMPILER_VERSION 7.0 +#elif _MSC_VER == 1310 +#define BOOST_COMPILER_VERSION 7.1 +#elif _MSC_VER == 1400 +#define BOOST_COMPILER_VERSION 8.0 +#elif _MSC_VER == 1500 +#define BOOST_COMPILER_VERSION 9.0 +#elif _MSC_VER == 1600 +#define BOOST_COMPILER_VERSION 10.0 +#else +#define BOOST_COMPILER_VERSION _MSC_VER +#endif +#endif #define BOOST_COMPILER "Microsoft Visual C++ version " BOOST_STRINGIZE(BOOST_COMPILER_VERSION) @@ -250,9 +250,9 @@ // // last known and checked version is 1600 (VC10, aka 2010): #if (_MSC_VER > 1600) -# if defined(BOOST_ASSERT_CONFIG) -# error "Unknown compiler version - please run the configure tests and report the results" -# else -# pragma message("Unknown compiler version - please run the configure tests and report the results") -# endif +#if defined(BOOST_ASSERT_CONFIG) +#error "Unknown compiler version - please run the configure tests and report the results" +#else +#pragma message("Unknown compiler version - please run the configure tests and report the results") +#endif #endif diff --git 
a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/cmath.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/cmath.hpp index d8268d84..4d6a7589 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/cmath.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/cmath.hpp @@ -11,18 +11,18 @@ // #ifndef BOOST_CONFIG_CMATH -# define BOOST_CONFIG_CMATH +#define BOOST_CONFIG_CMATH -# ifndef BOOST_TR1_NO_RECURSION -# define BOOST_TR1_NO_RECURSION -# define BOOST_CONFIG_NO_CMATH_RECURSION -# endif +#ifndef BOOST_TR1_NO_RECURSION +#define BOOST_TR1_NO_RECURSION +#define BOOST_CONFIG_NO_CMATH_RECURSION +#endif -# include +#include -# ifdef BOOST_CONFIG_NO_CMATH_RECURSION -# undef BOOST_TR1_NO_RECURSION -# undef BOOST_CONFIG_NO_CMATH_RECURSION -# endif +#ifdef BOOST_CONFIG_NO_CMATH_RECURSION +#undef BOOST_TR1_NO_RECURSION +#undef BOOST_CONFIG_NO_CMATH_RECURSION +#endif #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/complex.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/complex.hpp index ca200922..4965433a 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/complex.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/complex.hpp @@ -11,18 +11,18 @@ // #ifndef BOOST_CONFIG_COMPLEX -# define BOOST_CONFIG_COMPLEX +#define BOOST_CONFIG_COMPLEX -# ifndef BOOST_TR1_NO_RECURSION -# define BOOST_TR1_NO_RECURSION -# define BOOST_CONFIG_NO_COMPLEX_RECURSION -# endif +#ifndef BOOST_TR1_NO_RECURSION +#define BOOST_TR1_NO_RECURSION +#define BOOST_CONFIG_NO_COMPLEX_RECURSION +#endif -# include +#include -# ifdef BOOST_CONFIG_NO_COMPLEX_RECURSION -# undef BOOST_TR1_NO_RECURSION -# undef BOOST_CONFIG_NO_COMPLEX_RECURSION -# endif +#ifdef BOOST_CONFIG_NO_COMPLEX_RECURSION +#undef BOOST_TR1_NO_RECURSION +#undef BOOST_CONFIG_NO_COMPLEX_RECURSION +#endif #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/functional.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/functional.hpp index e395efc1..89d87462 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/functional.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/functional.hpp @@ -11,18 +11,18 @@ // #ifndef BOOST_CONFIG_FUNCTIONAL -# define BOOST_CONFIG_FUNCTIONAL +#define BOOST_CONFIG_FUNCTIONAL -# ifndef BOOST_TR1_NO_RECURSION -# define BOOST_TR1_NO_RECURSION -# define BOOST_CONFIG_NO_FUNCTIONAL_RECURSION -# endif +#ifndef BOOST_TR1_NO_RECURSION +#define BOOST_TR1_NO_RECURSION +#define BOOST_CONFIG_NO_FUNCTIONAL_RECURSION +#endif -# include +#include -# ifdef BOOST_CONFIG_NO_FUNCTIONAL_RECURSION -# undef BOOST_TR1_NO_RECURSION -# undef BOOST_CONFIG_NO_FUNCTIONAL_RECURSION -# endif +#ifdef BOOST_CONFIG_NO_FUNCTIONAL_RECURSION +#undef BOOST_TR1_NO_RECURSION +#undef BOOST_CONFIG_NO_FUNCTIONAL_RECURSION +#endif #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/memory.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/memory.hpp index 2b5d2080..29b028d1 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/memory.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/memory.hpp @@ -11,18 +11,18 @@ // #ifndef BOOST_CONFIG_MEMORY -# define BOOST_CONFIG_MEMORY +#define BOOST_CONFIG_MEMORY -# ifndef BOOST_TR1_NO_RECURSION -# define BOOST_TR1_NO_RECURSION -# define BOOST_CONFIG_NO_MEMORY_RECURSION -# endif +#ifndef 
+#define BOOST_TR1_NO_RECURSION
+#define BOOST_CONFIG_NO_MEMORY_RECURSION
+#endif
-# include <memory>
+#include <memory>
-# ifdef BOOST_CONFIG_NO_MEMORY_RECURSION
-# undef BOOST_TR1_NO_RECURSION
-# undef BOOST_CONFIG_NO_MEMORY_RECURSION
-# endif
+#ifdef BOOST_CONFIG_NO_MEMORY_RECURSION
+#undef BOOST_TR1_NO_RECURSION
+#undef BOOST_CONFIG_NO_MEMORY_RECURSION
+#endif
#endif
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/utility.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/utility.hpp
index dea8f115..6ffcea8a 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/utility.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/no_tr1/utility.hpp
@@ -11,18 +11,18 @@
//
#ifndef BOOST_CONFIG_UTILITY
-# define BOOST_CONFIG_UTILITY
+#define BOOST_CONFIG_UTILITY
-# ifndef BOOST_TR1_NO_RECURSION
-# define BOOST_TR1_NO_RECURSION
-# define BOOST_CONFIG_NO_UTILITY_RECURSION
-# endif
+#ifndef BOOST_TR1_NO_RECURSION
+#define BOOST_TR1_NO_RECURSION
+#define BOOST_CONFIG_NO_UTILITY_RECURSION
+#endif
-# include <utility>
+#include <utility>
-# ifdef BOOST_CONFIG_NO_UTILITY_RECURSION
-# undef BOOST_TR1_NO_RECURSION
-# undef BOOST_CONFIG_NO_UTILITY_RECURSION
-# endif
+#ifdef BOOST_CONFIG_NO_UTILITY_RECURSION
+#undef BOOST_TR1_NO_RECURSION
+#undef BOOST_CONFIG_NO_UTILITY_RECURSION
+#endif
#endif
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/aix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/aix.hpp
index 894ef42c..c15fc809 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/aix.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/aix.hpp
@@ -1,6 +1,6 @@
-// (C) Copyright John Maddock 2001 - 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -23,11 +23,7 @@
#define BOOST_HAS_PTHREADS
#define BOOST_HAS_PTHREAD_DELAY_NP
#define BOOST_HAS_SCHED_YIELD
-//#define BOOST_HAS_PTHREAD_YIELD
+// #define BOOST_HAS_PTHREAD_YIELD
// boilerplate code:
#include <boost/config/posix_features.hpp>
-
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/amigaos.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/amigaos.hpp
index 34bcf412..512b4ad3 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/amigaos.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/amigaos.hpp
@@ -1,6 +1,6 @@
-// (C) Copyright John Maddock 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
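The no_tr1/*.hpp hunks above all follow one recursion-guard idiom: set a flag before including the real standard header, and undo the flag only if this inclusion was the one that set it. A minimal sketch of the same pattern with hypothetical DEMO_* names:

    #ifndef DEMO_CONFIG_CMATH
    #define DEMO_CONFIG_CMATH

    #ifndef DEMO_TR1_NO_RECURSION
    #define DEMO_TR1_NO_RECURSION
    #define DEMO_CONFIG_NO_CMATH_RECURSION // remember that we set the flag
    #endif

    #include <cmath>

    #ifdef DEMO_CONFIG_NO_CMATH_RECURSION // undo only our own flag
    #undef DEMO_TR1_NO_RECURSION
    #undef DEMO_CONFIG_NO_CMATH_RECURSION
    #endif

    #endif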
@@ -11,5 +11,3 @@
#define BOOST_NO_CWCHAR
#define BOOST_NO_STD_WSTRING
#define BOOST_NO_INTRINSIC_WCHAR_T
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/beos.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/beos.hpp
index 48c3d8dc..d4ff57d6 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/beos.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/beos.hpp
@@ -1,6 +1,6 @@
-// (C) Copyright John Maddock 2001.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -16,11 +16,8 @@
#define BOOST_HAS_BETHREADS
#ifndef BOOST_DISABLE_THREADS
-# define BOOST_HAS_THREADS
+#define BOOST_HAS_THREADS
#endif
// boilerplate code:
#include <boost/config/posix_features.hpp>
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/bsd.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/bsd.hpp
index f02b0e26..160ed7c2 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/bsd.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/bsd.hpp
@@ -1,8 +1,8 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Darin Adler 2001.
-// (C) Copyright Douglas Gregor 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Darin Adler 2001.
+// (C) Copyright Douglas Gregor 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
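The platform headers in this part of the patch (aix.hpp, beos.hpp, and the bsd.hpp file that follows) mostly just advertise capabilities through BOOST_HAS_*-style macros. A hedged sketch of how client code consumes such a flag, with DEMO_HAS_THREADS standing in for a real config macro:

    #if defined(DEMO_HAS_THREADS)
    #include <sched.h>
    void demo_yield() { sched_yield(); } // threaded platform: yield the CPU
    #else
    void demo_yield() {} // single-threaded platform: nothing to yield to
    #endif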
@@ -29,25 +29,22 @@
// advertise the fact in :
//
#if (defined(__FreeBSD__) && (__FreeBSD__ >= 3)) || defined(__DragonFly__)
-# define BOOST_HAS_NL_TYPES_H
+#define BOOST_HAS_NL_TYPES_H
#endif
//
// FreeBSD 3.x has pthreads support, but defines _POSIX_THREADS in <pthread.h>
// and not in <unistd.h>
//
-#if (defined(__FreeBSD__) && (__FreeBSD__ <= 3))\
- || defined(__OpenBSD__) || defined(__DragonFly__)
-# define BOOST_HAS_PTHREADS
+#if (defined(__FreeBSD__) && (__FreeBSD__ <= 3)) || defined(__OpenBSD__) || defined(__DragonFly__)
+#define BOOST_HAS_PTHREADS
#endif
//
// No wide character support in the BSD header files:
//
#if defined(__NetBSD__)
-#define __NetBSD_GCC__ (__GNUC__ * 1000000 \
- + __GNUC_MINOR__ * 1000 \
- + __GNUC_PATCHLEVEL__)
+#define __NetBSD_GCC__ (__GNUC__ * 1000000 + __GNUC_MINOR__ * 1000 + __GNUC_PATCHLEVEL__)
// XXX - the following is required until c++config.h
// defines _GLIBCXX_HAVE_SWPRINTF and friends
// or the preprocessor conditionals are removed
@@ -55,15 +52,14 @@
#define _GLIBCXX_HAVE_SWPRINTF 1
#endif
-#if !((defined(__FreeBSD__) && (__FreeBSD__ >= 5)) \
- || (__NetBSD_GCC__ >= 2095003) || defined(__DragonFly__))
-# define BOOST_NO_CWCHAR
+#if !((defined(__FreeBSD__) && (__FreeBSD__ >= 5)) || (__NetBSD_GCC__ >= 2095003) || defined(__DragonFly__))
+#define BOOST_NO_CWCHAR
#endif
//
// The BSD has macros only, no functions:
//
#if !defined(__OpenBSD__) || defined(__DragonFly__)
-# define BOOST_NO_CTYPE_FUNCTIONS
+#define BOOST_NO_CTYPE_FUNCTIONS
#endif
//
@@ -78,9 +74,3 @@
// boilerplate code:
#define BOOST_HAS_UNISTD_H
#include <boost/config/posix_features.hpp>
-
-
-
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/cygwin.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/cygwin.hpp
index 41fcaa10..e8a53779 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/cygwin.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/cygwin.hpp
@@ -1,6 +1,6 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -21,17 +21,17 @@
// revert to native Win threads.
#define BOOST_HAS_UNISTD_H
#include <unistd.h>
-#if defined(_POSIX_THREADS) && (_POSIX_THREADS+0 >= 0) && !defined(BOOST_HAS_WINTHREADS)
-# define BOOST_HAS_PTHREADS
-# define BOOST_HAS_SCHED_YIELD
-# define BOOST_HAS_GETTIMEOFDAY
-# define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE
-# define BOOST_HAS_SIGACTION
+#if defined(_POSIX_THREADS) && (_POSIX_THREADS + 0 >= 0) && !defined(BOOST_HAS_WINTHREADS)
+#define BOOST_HAS_PTHREADS
+#define BOOST_HAS_SCHED_YIELD
+#define BOOST_HAS_GETTIMEOFDAY
+#define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE
+#define BOOST_HAS_SIGACTION
#else
-# if !defined(BOOST_HAS_WINTHREADS)
-# define BOOST_HAS_WINTHREADS
-# endif
-# define BOOST_HAS_FTIME
+#if !defined(BOOST_HAS_WINTHREADS)
+#define BOOST_HAS_WINTHREADS
+#endif
+#define BOOST_HAS_FTIME
#endif
//
@@ -44,8 +44,3 @@
// boilerplate code:
#include <boost/config/posix_features.hpp>
-
-
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/hpux.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/hpux.hpp
index 19ce68e5..64724792 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/hpux.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/hpux.hpp
@@ -1,10 +1,10 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Jens Maurer 2001 - 2003.
-// (C) Copyright David Abrahams 2002.
-// (C) Copyright Toon Knapen 2003.
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Jens Maurer 2001 - 2003.
+// (C) Copyright David Abrahams 2002.
+// (C) Copyright Toon Knapen 2003.
// (C) Copyright Boris Gubenko 2006 - 2007.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
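The cygwin.hpp hunk above tests "(_POSIX_THREADS + 0 >= 0)". The "+ 0" is a classic guard: it keeps the #if syntactically valid even when the macro is defined to an empty token sequence. A small sketch, with DEMO_POSIX_THREADS as a hypothetical stand-in:

    #define DEMO_POSIX_THREADS /* defined, but expands to nothing */

    // "DEMO_POSIX_THREADS + 0" expands to "+ 0", i.e. the value 0, so the
    // condition below is well-formed instead of a preprocessor syntax error.
    #if defined(DEMO_POSIX_THREADS) && (DEMO_POSIX_THREADS + 0 >= 0)
    static const bool demo_threads = true;
    #else
    static const bool demo_threads = false;
    #endif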
@@ -18,27 +18,27 @@
// Use of UINT32_C(0) results in "0u l" for the preprocessed source
// (verifyable with gcc 2.95.3)
#if (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__HP_aCC)
-# define BOOST_HAS_STDINT_H
+#define BOOST_HAS_STDINT_H
#endif
#if !(defined(__HP_aCC) || !defined(_INCLUDE__STDC_A1_SOURCE))
-# define BOOST_NO_SWPRINTF
+#define BOOST_NO_SWPRINTF
#endif
#if defined(__HP_aCC) && !defined(_INCLUDE__STDC_A1_SOURCE)
-# define BOOST_NO_CWCTYPE
+#define BOOST_NO_CWCTYPE
#endif
#if defined(__GNUC__)
-# if (__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3))
- // GNU C on HP-UX does not support threads (checked up to gcc 3.3)
-# define BOOST_DISABLE_THREADS
-# elif !defined(BOOST_DISABLE_THREADS)
- // threads supported from gcc-3.3 onwards:
-# define BOOST_HAS_THREADS
-# define BOOST_HAS_PTHREADS
-# endif
+#if (__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3))
+// GNU C on HP-UX does not support threads (checked up to gcc 3.3)
+#define BOOST_DISABLE_THREADS
+#elif !defined(BOOST_DISABLE_THREADS)
+// threads supported from gcc-3.3 onwards:
+#define BOOST_HAS_THREADS
+#define BOOST_HAS_PTHREADS
+#endif
#elif defined(__HP_aCC) && !defined(BOOST_DISABLE_THREADS)
-# define BOOST_HAS_PTHREADS
+#define BOOST_HAS_PTHREADS
#endif
// boilerplate code:
@@ -47,41 +47,40 @@
// the following are always available:
#ifndef BOOST_HAS_GETTIMEOFDAY
-# define BOOST_HAS_GETTIMEOFDAY
+#define BOOST_HAS_GETTIMEOFDAY
#endif
#ifndef BOOST_HAS_SCHED_YIELD
-# define BOOST_HAS_SCHED_YIELD
+#define BOOST_HAS_SCHED_YIELD
#endif
#ifndef BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE
-# define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE
+#define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE
#endif
#ifndef BOOST_HAS_NL_TYPES_H
-# define BOOST_HAS_NL_TYPES_H
+#define BOOST_HAS_NL_TYPES_H
#endif
#ifndef BOOST_HAS_NANOSLEEP
-# define BOOST_HAS_NANOSLEEP
+#define BOOST_HAS_NANOSLEEP
#endif
#ifndef BOOST_HAS_GETTIMEOFDAY
-# define BOOST_HAS_GETTIMEOFDAY
+#define BOOST_HAS_GETTIMEOFDAY
#endif
#ifndef BOOST_HAS_DIRENT_H
-# define BOOST_HAS_DIRENT_H
+#define BOOST_HAS_DIRENT_H
#endif
#ifndef BOOST_HAS_CLOCK_GETTIME
-# define BOOST_HAS_CLOCK_GETTIME
+#define BOOST_HAS_CLOCK_GETTIME
#endif
#ifndef BOOST_HAS_SIGACTION
-# define BOOST_HAS_SIGACTION
+#define BOOST_HAS_SIGACTION
#endif
-#ifndef BOOST_HAS_NRVO
-# ifndef __parisc
-# define BOOST_HAS_NRVO
-# endif
+#ifndef BOOST_HAS_NRVO
+#ifndef __parisc
+#define BOOST_HAS_NRVO
#endif
-#ifndef BOOST_HAS_LOG1P
-# define BOOST_HAS_LOG1P
+#endif
+#ifndef BOOST_HAS_LOG1P
+#define BOOST_HAS_LOG1P
#endif
#ifndef BOOST_HAS_EXPM1
-# define BOOST_HAS_EXPM1
+#define BOOST_HAS_EXPM1
#endif
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/irix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/irix.hpp
index aeae49c8..d250dea8 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/irix.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/irix.hpp
@@ -1,7 +1,7 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Jens Maurer 2003.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Jens Maurer 2003.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,7 +11,7 @@
#define BOOST_PLATFORM "SGI Irix"
-#define BOOST_NO_SWPRINTF
+#define BOOST_NO_SWPRINTF
//
// these are not auto detected by POSIX feature tests:
//
@@ -19,13 +19,10 @@
#define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE
#ifdef __GNUC__
- // GNU C on IRIX does not support threads (checked up to gcc 3.3)
-# define BOOST_DISABLE_THREADS
+// GNU C on IRIX does not support threads (checked up to gcc 3.3)
+#define BOOST_DISABLE_THREADS
#endif
// boilerplate code:
#define BOOST_HAS_UNISTD_H
#include <boost/config/posix_features.hpp>
-
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/linux.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/linux.hpp
index 51ae1334..3116ac7d 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/linux.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/linux.hpp
@@ -1,7 +1,7 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Jens Maurer 2001 - 2003.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Jens Maurer 2001 - 2003.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -18,27 +18,27 @@
// We can only test for 2.1 though:
//
#if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 1)))
- // <stdint.h> defines int64_t unconditionally, but <sys/types.h> defines
- // int64_t only if __GNUC__. Thus, assume a fully usable <stdint.h>
- // only when using GCC.
-# if defined __GNUC__
-# define BOOST_HAS_STDINT_H
-# endif
+// <stdint.h> defines int64_t unconditionally, but <sys/types.h> defines
+// int64_t only if __GNUC__. Thus, assume a fully usable <stdint.h>
+// only when using GCC.
+#if defined __GNUC__
+#define BOOST_HAS_STDINT_H
+#endif
#endif
#if defined(__LIBCOMO__)
- //
- // como on linux doesn't have std:: c functions:
- // NOTE: versions of libcomo prior to beta28 have octal version numbering,
- // e.g. version 25 is 21 (dec)
- //
-# if __LIBCOMO_VERSION__ <= 20
-# define BOOST_NO_STDC_NAMESPACE
-# endif
+//
+// como on linux doesn't have std:: c functions:
+// NOTE: versions of libcomo prior to beta28 have octal version numbering,
+// e.g. version 25 is 21 (dec)
+//
+#if __LIBCOMO_VERSION__ <= 20
+#define BOOST_NO_STDC_NAMESPACE
+#endif
-# if __LIBCOMO_VERSION__ <= 21
-# define BOOST_NO_SWPRINTF
-# endif
+#if __LIBCOMO_VERSION__ <= 21
+#define BOOST_NO_SWPRINTF
+#endif
#endif
@@ -47,22 +47,22 @@
// gettimeofday, earlier versions may or may not have it:
//
#if defined(__GLIBC__) && (__GLIBC__ >= 2)
-# define BOOST_HAS_GETTIMEOFDAY
+#define BOOST_HAS_GETTIMEOFDAY
#endif
#ifdef __USE_POSIX199309
-# define BOOST_HAS_NANOSLEEP
+#define BOOST_HAS_NANOSLEEP
#endif
#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
// __GLIBC_PREREQ is available since 2.1.2
- // swprintf is available since glibc 2.2.0
-# if !__GLIBC_PREREQ(2,2) || (!defined(__USE_ISOC99) && !defined(__USE_UNIX98))
-# define BOOST_NO_SWPRINTF
-# endif
+// swprintf is available since glibc 2.2.0
+#if !__GLIBC_PREREQ(2, 2) || (!defined(__USE_ISOC99) && !defined(__USE_UNIX98))
+#define BOOST_NO_SWPRINTF
+#endif
#else
-# define BOOST_NO_SWPRINTF
+#define BOOST_NO_SWPRINTF
#endif
// boilerplate code:
@@ -75,24 +75,22 @@
// the GNU system headers, some of which (mainly )
// use GNU specific extensions:
//
-# ifndef __extension__
-# define __extension__
-# endif
-# ifndef __const__
-# define __const__ const
-# endif
-# ifndef __volatile__
-# define __volatile__ volatile
-# endif
-# ifndef __signed__
-# define __signed__ signed
-# endif
-# ifndef __typeof__
-# define __typeof__ typeof
-# endif
-# ifndef __inline__
-# define __inline__ inline
-# endif
+#ifndef __extension__
+#define __extension__
+#endif
+#ifndef __const__
+#define __const__ const
+#endif
+#ifndef __volatile__
+#define __volatile__ volatile
+#endif
+#ifndef __signed__
+#define __signed__ signed
+#endif
+#ifndef __typeof__
+#define __typeof__ typeof
+#endif
+#ifndef __inline__
+#define __inline__ inline
+#endif
#endif
-
-
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/macos.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/macos.hpp
index 2780ef99..43584f76 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/macos.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/macos.hpp
@@ -1,8 +1,8 @@
-// (C) Copyright John Maddock 2001 - 2003.
-// (C) Copyright Darin Adler 2001 - 2002.
-// (C) Copyright Bill Kempf 2002.
-// Use, modification and distribution are subject to the
-// Boost Software License, Version 1.0. (See accompanying file
+// (C) Copyright John Maddock 2001 - 2003.
+// (C) Copyright Darin Adler 2001 - 2002.
+// (C) Copyright Bill Kempf 2002.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org for most recent version.
@@ -15,9 +15,9 @@
// Using the Mac OS X system BSD-style C library.
-# ifndef BOOST_HAS_UNISTD_H
-# define BOOST_HAS_UNISTD_H
-# endif
+#ifndef BOOST_HAS_UNISTD_H
+#define BOOST_HAS_UNISTD_H
+#endif
//
// Begin by including our boilerplate code for POSIX
// feature detection, this is safe even when using
@@ -25,35 +25,35 @@
// to replace the platform-native BSD one. G++ users
// should also always be able to do this on MaxOS X.
// -# include -# ifndef BOOST_HAS_STDINT_H -# define BOOST_HAS_STDINT_H -# endif +#include +#ifndef BOOST_HAS_STDINT_H +#define BOOST_HAS_STDINT_H +#endif // // BSD runtime has pthreads, sigaction, sched_yield and gettimeofday, -// of these only pthreads are advertised in , so set the +// of these only pthreads are advertised in , so set the // other options explicitly: // -# define BOOST_HAS_SCHED_YIELD -# define BOOST_HAS_GETTIMEOFDAY -# define BOOST_HAS_SIGACTION +#define BOOST_HAS_SCHED_YIELD +#define BOOST_HAS_GETTIMEOFDAY +#define BOOST_HAS_SIGACTION -# if (__GNUC__ < 3) && !defined( __APPLE_CC__) +#if (__GNUC__ < 3) && !defined(__APPLE_CC__) // GCC strange "ignore std" mode works better if you pretend everything // is in the std namespace, for the most part. -# define BOOST_NO_STDC_NAMESPACE -# endif +#define BOOST_NO_STDC_NAMESPACE +#endif -# if (__GNUC__ == 4) +#if (__GNUC__ == 4) -// Both gcc and intel require these. -# define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE -# define BOOST_HAS_NANOSLEEP +// Both gcc and intel require these. +#define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE +#define BOOST_HAS_NANOSLEEP -# endif +#endif #else @@ -61,26 +61,23 @@ // We will eventually support threads in non-Carbon builds, but we do // not support this yet. -# if ( defined(TARGET_API_MAC_CARBON) && TARGET_API_MAC_CARBON ) || ( defined(TARGET_CARBON) && TARGET_CARBON ) +#if (defined(TARGET_API_MAC_CARBON) && TARGET_API_MAC_CARBON) || (defined(TARGET_CARBON) && TARGET_CARBON) -# if !defined(BOOST_HAS_PTHREADS) -# define BOOST_HAS_MPTASKS -# elif ( __dest_os == __mac_os_x ) +#if !defined(BOOST_HAS_PTHREADS) +#define BOOST_HAS_MPTASKS +#elif (__dest_os == __mac_os_x) // We are doing a Carbon/Mach-O/MSL build which has pthreads, but only the // gettimeofday and no posix. -# define BOOST_HAS_GETTIMEOFDAY -# endif +#define BOOST_HAS_GETTIMEOFDAY +#endif // The MP task implementation of Boost Threads aims to replace MP-unsafe // parts of the MSL, so we turn on threads unconditionally. -# define BOOST_HAS_THREADS +#define BOOST_HAS_THREADS // The remote call manager depends on this. -# define BOOST_BIND_ENABLE_PASCAL - -# endif +#define BOOST_BIND_ENABLE_PASCAL #endif - - +#endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/qnxnto.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/qnxnto.hpp index b1377c8d..7e68bfe5 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/qnxnto.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/qnxnto.hpp @@ -1,6 +1,6 @@ -// (C) Copyright Jim Douglas 2005. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright Jim Douglas 2005. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. 
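The one transformation running through all of these platform headers is the preprocessor indentation style. Boost's traditional layout indents the directive keyword after the '#' to show nesting depth; clang-format, presumably configured with IndentPPDirectives: None (its default), puts the '#' and the keyword flush left at every depth. A minimal before/after sketch on a hypothetical fragment, not taken from the patch:

// Before: nesting depth shown by indenting the keyword after '#'.
#ifdef __GNUC__
#  ifndef BOOST_HAS_UNISTD_H
#    define BOOST_HAS_UNISTD_H
#  endif
#endif

// After: directives flush left at every depth; nesting must now be read
// from the #if/#endif pairing alone, which is why the hunks keep (and
// occasionally re-wrap) the explanatory comments.
#ifdef __GNUC__
#ifndef BOOST_HAS_UNISTD_H
#define BOOST_HAS_UNISTD_H
#endif
#endif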
@@ -14,9 +14,9 @@ // QNX claims XOpen version 5 compatibility, but doesn't have an nl_types.h // or log1p and expm1: -#undef BOOST_HAS_NL_TYPES_H -#undef BOOST_HAS_LOG1P -#undef BOOST_HAS_EXPM1 +#undef BOOST_HAS_NL_TYPES_H +#undef BOOST_HAS_LOG1P +#undef BOOST_HAS_EXPM1 #define BOOST_HAS_PTHREADS #define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE @@ -24,8 +24,3 @@ #define BOOST_HAS_GETTIMEOFDAY #define BOOST_HAS_CLOCK_GETTIME #define BOOST_HAS_NANOSLEEP - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/solaris.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/solaris.hpp index 9f925666..9a2e9655 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/solaris.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/solaris.hpp @@ -1,7 +1,7 @@ -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Jens Maurer 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Jens Maurer 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -20,9 +20,5 @@ // pthreads don't actually work with gcc unless _PTHREADS is defined: // #if defined(__GNUC__) && defined(_POSIX_THREADS) && !defined(_PTHREADS) -# undef BOOST_HAS_PTHREADS +#undef BOOST_HAS_PTHREADS #endif - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/vxworks.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/vxworks.hpp index 6ec5171e..1f7e035e 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/vxworks.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/vxworks.hpp @@ -1,6 +1,6 @@ -// (C) Copyright Dustin Spicuzza 2009. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright Dustin Spicuzza 2009. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -20,7 +20,7 @@ // these allow posix_features to work, since vxWorks doesn't // define them itself -#define _POSIX_TIMERS 1 +#define _POSIX_TIMERS 1 #define _POSIX_THREADS 1 // vxworks doesn't work with asio serial ports @@ -28,4 +28,3 @@ // boilerplate code: #include - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/win32.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/win32.hpp index 9344818f..3826e92f 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/win32.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/platform/win32.hpp @@ -1,9 +1,9 @@ -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Bill Kempf 2001. -// (C) Copyright Aleksey Gurtovoy 2003. +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Bill Kempf 2001. +// (C) Copyright Aleksey Gurtovoy 2003. // (C) Copyright Rene Rivera 2005. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. 
(See accompanying file +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -14,28 +14,29 @@ // Get the information about the MinGW runtime, i.e. __MINGW32_*VERSION. #if defined(__MINGW32__) -# include <_mingw.h> +#include <_mingw.h> #endif #if defined(__GNUC__) && !defined(BOOST_NO_SWPRINTF) -# define BOOST_NO_SWPRINTF +#define BOOST_NO_SWPRINTF #endif #if !defined(__GNUC__) && !defined(BOOST_HAS_DECLSPEC) -# define BOOST_HAS_DECLSPEC +#define BOOST_HAS_DECLSPEC #endif -#if defined(__MINGW32__) && ((__MINGW32_MAJOR_VERSION > 2) || ((__MINGW32_MAJOR_VERSION == 2) && (__MINGW32_MINOR_VERSION >= 0))) -# define BOOST_HAS_STDINT_H -# define __STDC_LIMIT_MACROS -# define BOOST_HAS_DIRENT_H -# define BOOST_HAS_UNISTD_H +#if defined(__MINGW32__) \ + && ((__MINGW32_MAJOR_VERSION > 2) || ((__MINGW32_MAJOR_VERSION == 2) && (__MINGW32_MINOR_VERSION >= 0))) +#define BOOST_HAS_STDINT_H +#define __STDC_LIMIT_MACROS +#define BOOST_HAS_DIRENT_H +#define BOOST_HAS_UNISTD_H #endif // // Win32 will normally be using native Win32 threads, // but there is a pthread library avaliable as an option, -// we used to disable this when BOOST_DISABLE_WIN32 was +// we used to disable this when BOOST_DISABLE_WIN32 was // defined but no longer - this should allow some // files to be compiled in strict mode - while maintaining // a consistent setting of BOOST_HAS_THREADS across @@ -43,11 +44,11 @@ // #ifdef _WIN32_WCE -# define BOOST_NO_ANSI_APIS +#define BOOST_NO_ANSI_APIS #endif #ifndef BOOST_HAS_PTHREADS -# define BOOST_HAS_WINTHREADS +#define BOOST_HAS_WINTHREADS #endif #ifndef BOOST_DISABLE_WIN32 diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/posix_features.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/posix_features.hpp index d1295479..b3204563 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/posix_features.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/posix_features.hpp @@ -1,6 +1,6 @@ -// (C) Copyright John Maddock 2001 - 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -14,82 +14,78 @@ // to the user to do this *before* including any header, although // in most cases the compiler will do this for you). -# if defined(BOOST_HAS_UNISTD_H) -# include +#if defined(BOOST_HAS_UNISTD_H) +#include - // XOpen has , but is this the correct version check? -# if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 3) -# define BOOST_HAS_NL_TYPES_H -# endif +// XOpen has , but is this the correct version check? 
+#if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 3) +#define BOOST_HAS_NL_TYPES_H +#endif - // POSIX version 6 requires -# if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200100) -# define BOOST_HAS_STDINT_H -# endif +// POSIX version 6 requires +#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200100) +#define BOOST_HAS_STDINT_H +#endif - // POSIX version 2 requires -# if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 199009L) -# define BOOST_HAS_DIRENT_H -# endif +// POSIX version 2 requires +#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 199009L) +#define BOOST_HAS_DIRENT_H +#endif - // POSIX version 3 requires to have sigaction: -# if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 199506L) -# define BOOST_HAS_SIGACTION -# endif - // POSIX defines _POSIX_THREADS > 0 for pthread support, - // however some platforms define _POSIX_THREADS without - // a value, hence the (_POSIX_THREADS+0 >= 0) check. - // Strictly speaking this may catch platforms with a - // non-functioning stub , but such occurrences should - // occur very rarely if at all. -# if defined(_POSIX_THREADS) && (_POSIX_THREADS+0 >= 0) && !defined(BOOST_HAS_WINTHREADS) && !defined(BOOST_HAS_MPTASKS) -# define BOOST_HAS_PTHREADS -# endif +// POSIX version 3 requires to have sigaction: +#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 199506L) +#define BOOST_HAS_SIGACTION +#endif +// POSIX defines _POSIX_THREADS > 0 for pthread support, +// however some platforms define _POSIX_THREADS without +// a value, hence the (_POSIX_THREADS+0 >= 0) check. +// Strictly speaking this may catch platforms with a +// non-functioning stub , but such occurrences should +// occur very rarely if at all. +#if defined(_POSIX_THREADS) && (_POSIX_THREADS + 0 >= 0) && !defined(BOOST_HAS_WINTHREADS) \ + && !defined(BOOST_HAS_MPTASKS) +#define BOOST_HAS_PTHREADS +#endif - // BOOST_HAS_NANOSLEEP: - // This is predicated on _POSIX_TIMERS or _XOPEN_REALTIME: -# if (defined(_POSIX_TIMERS) && (_POSIX_TIMERS+0 >= 0)) \ - || (defined(_XOPEN_REALTIME) && (_XOPEN_REALTIME+0 >= 0)) -# define BOOST_HAS_NANOSLEEP -# endif - - // BOOST_HAS_CLOCK_GETTIME: - // This is predicated on _POSIX_TIMERS (also on _XOPEN_REALTIME - // but at least one platform - linux - defines that flag without - // defining clock_gettime): -# if (defined(_POSIX_TIMERS) && (_POSIX_TIMERS+0 >= 0)) -# define BOOST_HAS_CLOCK_GETTIME -# endif - - // BOOST_HAS_SCHED_YIELD: - // This is predicated on _POSIX_PRIORITY_SCHEDULING or - // on _POSIX_THREAD_PRIORITY_SCHEDULING or on _XOPEN_REALTIME. -# if defined(_POSIX_PRIORITY_SCHEDULING) && (_POSIX_PRIORITY_SCHEDULING+0 > 0)\ - || (defined(_POSIX_THREAD_PRIORITY_SCHEDULING) && (_POSIX_THREAD_PRIORITY_SCHEDULING+0 > 0))\ - || (defined(_XOPEN_REALTIME) && (_XOPEN_REALTIME+0 >= 0)) -# define BOOST_HAS_SCHED_YIELD -# endif - - // BOOST_HAS_GETTIMEOFDAY: - // BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE: - // These are predicated on _XOPEN_VERSION, and appears to be first released - // in issue 4, version 2 (_XOPEN_VERSION > 500). - // Likewise for the functions log1p and expm1. 
-# if defined(_XOPEN_VERSION) && (_XOPEN_VERSION+0 >= 500) -# define BOOST_HAS_GETTIMEOFDAY -# if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE+0 >= 500) -# define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE -# endif -# ifndef BOOST_HAS_LOG1P -# define BOOST_HAS_LOG1P -# endif -# ifndef BOOST_HAS_EXPM1 -# define BOOST_HAS_EXPM1 -# endif -# endif - -# endif +// BOOST_HAS_NANOSLEEP: +// This is predicated on _POSIX_TIMERS or _XOPEN_REALTIME: +#if (defined(_POSIX_TIMERS) && (_POSIX_TIMERS + 0 >= 0)) || (defined(_XOPEN_REALTIME) && (_XOPEN_REALTIME + 0 >= 0)) +#define BOOST_HAS_NANOSLEEP +#endif +// BOOST_HAS_CLOCK_GETTIME: +// This is predicated on _POSIX_TIMERS (also on _XOPEN_REALTIME +// but at least one platform - linux - defines that flag without +// defining clock_gettime): +#if (defined(_POSIX_TIMERS) && (_POSIX_TIMERS + 0 >= 0)) +#define BOOST_HAS_CLOCK_GETTIME +#endif +// BOOST_HAS_SCHED_YIELD: +// This is predicated on _POSIX_PRIORITY_SCHEDULING or +// on _POSIX_THREAD_PRIORITY_SCHEDULING or on _XOPEN_REALTIME. +#if defined(_POSIX_PRIORITY_SCHEDULING) && (_POSIX_PRIORITY_SCHEDULING + 0 > 0) \ + || (defined(_POSIX_THREAD_PRIORITY_SCHEDULING) && (_POSIX_THREAD_PRIORITY_SCHEDULING + 0 > 0)) \ + || (defined(_XOPEN_REALTIME) && (_XOPEN_REALTIME + 0 >= 0)) +#define BOOST_HAS_SCHED_YIELD +#endif +// BOOST_HAS_GETTIMEOFDAY: +// BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE: +// These are predicated on _XOPEN_VERSION, and appears to be first released +// in issue 4, version 2 (_XOPEN_VERSION > 500). +// Likewise for the functions log1p and expm1. +#if defined(_XOPEN_VERSION) && (_XOPEN_VERSION + 0 >= 500) +#define BOOST_HAS_GETTIMEOFDAY +#if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE + 0 >= 500) +#define BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE +#endif +#ifndef BOOST_HAS_LOG1P +#define BOOST_HAS_LOG1P +#endif +#ifndef BOOST_HAS_EXPM1 +#define BOOST_HAS_EXPM1 +#endif +#endif +#endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/requires_threads.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/requires_threads.hpp index cfaff230..1735f5f4 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/requires_threads.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/requires_threads.hpp @@ -1,6 +1,6 @@ -// (C) Copyright John Maddock 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -8,7 +8,7 @@ #define BOOST_CONFIG_REQUIRES_THREADS_HPP #ifndef BOOST_CONFIG_HPP -# include +#include #endif #if defined(BOOST_DISABLE_THREADS) @@ -21,69 +21,77 @@ // this is checked up to gcc 3.3: // #if defined(__sgi) || defined(__hpux) -# error "Multi-threaded programs are not supported by gcc on HPUX or Irix (last checked with gcc 3.3)" +#error "Multi-threaded programs are not supported by gcc on HPUX or Irix (last checked with gcc 3.3)" #endif #endif -# error "Threading support unavaliable: it has been explicitly disabled with BOOST_DISABLE_THREADS" +#error "Threading support unavaliable: it has been explicitly disabled with BOOST_DISABLE_THREADS" #elif !defined(BOOST_HAS_THREADS) -# if defined __COMO__ +#if defined __COMO__ // Comeau C++ -# error "Compiler threading support is not turned on. 
Please set the correct command line options for threading: -D_MT (Windows) or -D_REENTRANT (Unix)" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: -D_MT (Windows) or -D_REENTRANT (Unix)" #elif defined(__INTEL_COMPILER) || defined(__ICL) || defined(__ICC) || defined(__ECC) // Intel #ifdef _WIN32 -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: either /MT /MTd /MD or /MDd" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: either /MT /MTd /MD or /MDd" #else -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -openmp" +#error "Compiler threading support is not turned on. Please set the correct command line options for threading: -openmp" #endif -# elif defined __GNUC__ +#elif defined __GNUC__ // GNU C++: -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)" #elif defined __sgi // SGI MIPSpro C++ -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -D_SGI_MP_SOURCE" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: -D_SGI_MP_SOURCE" #elif defined __DECCXX // Compaq Tru64 Unix cxx -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -pthread" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: -pthread" #elif defined __BORLANDC__ // Borland -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -tWM" +#error "Compiler threading support is not turned on. Please set the correct command line options for threading: -tWM" -#elif defined __MWERKS__ +#elif defined __MWERKS__ // Metrowerks CodeWarrior -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: either -runtime sm, -runtime smd, -runtime dm, or -runtime dmd" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: either -runtime sm, -runtime smd, -runtime dm, or -runtime dmd" -#elif defined __SUNPRO_CC +#elif defined __SUNPRO_CC // Sun Workshop Compiler C++ -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -mt" +#error "Compiler threading support is not turned on. Please set the correct command line options for threading: -mt" #elif defined __HP_aCC // HP aCC -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: -mt" +#error "Compiler threading support is not turned on. Please set the correct command line options for threading: -mt" #elif defined(__IBMCPP__) // IBM Visual Age -# error "Compiler threading support is not turned on. Please compile the code with the xlC_r compiler" +#error "Compiler threading support is not turned on. 
Please compile the code with the xlC_r compiler" #elif defined _MSC_VER // Microsoft Visual C++ // // Must remain the last #elif since some other vendors (Metrowerks, for // example) also #define _MSC_VER -# error "Compiler threading support is not turned on. Please set the correct command line options for threading: either /MT /MTd /MD or /MDd" +#error \ + "Compiler threading support is not turned on. Please set the correct command line options for threading: either /MT /MTd /MD or /MDd" #else -# error "Compiler threading support is not turned on. Please consult your compiler's documentation for the appropriate options to use" +#error \ + "Compiler threading support is not turned on. Please consult your compiler's documentation for the appropriate options to use" #endif // compilers diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/select_compiler_config.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/select_compiler_config.hpp index 9141cd63..b95204e5 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/select_compiler_config.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/select_compiler_config.hpp @@ -1,6 +1,6 @@ // Boost compiler configuration selection header file -// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright John Maddock 2001 - 2003. // (C) Copyright Martin Wille 2003. // (C) Copyright Guillaume Melquiond 2003. // @@ -14,106 +14,107 @@ // one identification macro for each of the // compilers we support: -# define BOOST_CXX_GCCXML 0 -# define BOOST_CXX_COMO 0 -# define BOOST_CXX_DMC 0 -# define BOOST_CXX_INTEL 0 -# define BOOST_CXX_GNUC 0 -# define BOOST_CXX_KCC 0 -# define BOOST_CXX_SGI 0 -# define BOOST_CXX_TRU64 0 -# define BOOST_CXX_GHS 0 -# define BOOST_CXX_BORLAND 0 -# define BOOST_CXX_CW 0 -# define BOOST_CXX_SUNPRO 0 -# define BOOST_CXX_HPACC 0 -# define BOOST_CXX_MPW 0 -# define BOOST_CXX_IBMCPP 0 -# define BOOST_CXX_MSVC 0 -# define BOOST_CXX_PGI 0 +#define BOOST_CXX_GCCXML 0 +#define BOOST_CXX_COMO 0 +#define BOOST_CXX_DMC 0 +#define BOOST_CXX_INTEL 0 +#define BOOST_CXX_GNUC 0 +#define BOOST_CXX_KCC 0 +#define BOOST_CXX_SGI 0 +#define BOOST_CXX_TRU64 0 +#define BOOST_CXX_GHS 0 +#define BOOST_CXX_BORLAND 0 +#define BOOST_CXX_CW 0 +#define BOOST_CXX_SUNPRO 0 +#define BOOST_CXX_HPACC 0 +#define BOOST_CXX_MPW 0 +#define BOOST_CXX_IBMCPP 0 +#define BOOST_CXX_MSVC 0 +#define BOOST_CXX_PGI 0 // locate which compiler we are using and define -// BOOST_COMPILER_CONFIG as needed: +// BOOST_COMPILER_CONFIG as needed: #if defined(__GCCXML__) // GCC-XML emulates other compilers, it has to appear first here! 
-# define BOOST_COMPILER_CONFIG "boost/config/compiler/gcc_xml.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/gcc_xml.hpp" #elif defined __COMO__ // Comeau C++ -# define BOOST_COMPILER_CONFIG "boost/config/compiler/comeau.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/comeau.hpp" #elif defined __DMC__ // Digital Mars C++ -# define BOOST_COMPILER_CONFIG "boost/config/compiler/digitalmars.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/digitalmars.hpp" #elif defined(__INTEL_COMPILER) || defined(__ICL) || defined(__ICC) || defined(__ECC) // Intel -# define BOOST_COMPILER_CONFIG "boost/config/compiler/intel.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/intel.hpp" -# elif defined __GNUC__ +#elif defined __GNUC__ // GNU C++: -# define BOOST_COMPILER_CONFIG "boost/config/compiler/gcc.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/gcc.hpp" #elif defined __KCC // Kai C++ -# define BOOST_COMPILER_CONFIG "boost/config/compiler/kai.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/kai.hpp" #elif defined __sgi // SGI MIPSpro C++ -# define BOOST_COMPILER_CONFIG "boost/config/compiler/sgi_mipspro.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/sgi_mipspro.hpp" #elif defined __DECCXX // Compaq Tru64 Unix cxx -# define BOOST_COMPILER_CONFIG "boost/config/compiler/compaq_cxx.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/compaq_cxx.hpp" #elif defined __ghs // Greenhills C++ -# define BOOST_COMPILER_CONFIG "boost/config/compiler/greenhills.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/greenhills.hpp" #elif defined __CODEGEARC__ // CodeGear - must be checked for before Borland -# define BOOST_COMPILER_CONFIG "boost/config/compiler/codegear.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/codegear.hpp" #elif defined __BORLANDC__ // Borland -# define BOOST_COMPILER_CONFIG "boost/config/compiler/borland.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/borland.hpp" -#elif defined __MWERKS__ +#elif defined __MWERKS__ // Metrowerks CodeWarrior -# define BOOST_COMPILER_CONFIG "boost/config/compiler/metrowerks.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/metrowerks.hpp" -#elif defined __SUNPRO_CC +#elif defined __SUNPRO_CC // Sun Workshop Compiler C++ -# define BOOST_COMPILER_CONFIG "boost/config/compiler/sunpro_cc.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/sunpro_cc.hpp" #elif defined __HP_aCC // HP aCC -# define BOOST_COMPILER_CONFIG "boost/config/compiler/hp_acc.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/hp_acc.hpp" #elif defined(__MRC__) || defined(__SC__) // MPW MrCpp or SCpp -# define BOOST_COMPILER_CONFIG "boost/config/compiler/mpw.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/mpw.hpp" #elif defined(__IBMCPP__) // IBM Visual Age -# define BOOST_COMPILER_CONFIG "boost/config/compiler/vacpp.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/vacpp.hpp" #elif defined(__PGI) // Portland Group Inc. 
-# define BOOST_COMPILER_CONFIG "boost/config/compiler/pgi.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/pgi.hpp" #elif defined _MSC_VER // Microsoft Visual C++ // // Must remain the last #elif since some other vendors (Metrowerks, for // example) also #define _MSC_VER -# define BOOST_COMPILER_CONFIG "boost/config/compiler/visualc.hpp" +#define BOOST_COMPILER_CONFIG "boost/config/compiler/visualc.hpp" -#elif defined (BOOST_ASSERT_CONFIG) +#elif defined(BOOST_ASSERT_CONFIG) // this must come last - generate an error if we don't // recognise the compiler: -# error "Unknown compiler - please configure (http://www.boost.org/libs/config/config.htm#configuring) and report the results to the main boost mailing list (http://www.boost.org/more/mailing_lists.htm#main)" +#error \ + "Unknown compiler - please configure (http://www.boost.org/libs/config/config.htm#configuring) and report the results to the main boost mailing list (http://www.boost.org/more/mailing_lists.htm#main)" #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/select_platform_config.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/select_platform_config.hpp index 615bb064..3d7bad40 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/select_platform_config.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/select_platform_config.hpp @@ -1,9 +1,9 @@ // Boost compiler configuration selection header file -// (C) Copyright John Maddock 2001 - 2002. -// (C) Copyright Jens Maurer 2001. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2002. +// (C) Copyright Jens Maurer 2001. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -13,82 +13,76 @@ // in order to prevent macro expansion within the header // name (for example "linux" is a macro on linux systems). -#if defined(linux) || defined(__linux) || defined(__linux__) || defined(__GNU__) || defined(__GLIBC__) +#if defined(linux) || defined(__linux) || defined(__linux__) || defined(__GNU__) || defined(__GLIBC__) // linux, also other platforms (Hurd etc) that use GLIBC, should these really have their own config headers though? 
-# define BOOST_PLATFORM_CONFIG "boost/config/platform/linux.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/linux.hpp" #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) // BSD: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/bsd.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/bsd.hpp" #elif defined(sun) || defined(__sun) // solaris: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/solaris.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/solaris.hpp" #elif defined(__sgi) // SGI Irix: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/irix.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/irix.hpp" #elif defined(__hpux) // hp unix: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/hpux.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/hpux.hpp" #elif defined(__CYGWIN__) // cygwin is not win32: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/cygwin.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/cygwin.hpp" #elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32) // win32: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/win32.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/win32.hpp" #elif defined(__BEOS__) // BeOS -# define BOOST_PLATFORM_CONFIG "boost/config/platform/beos.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/beos.hpp" #elif defined(macintosh) || defined(__APPLE__) || defined(__APPLE_CC__) // MacOS -# define BOOST_PLATFORM_CONFIG "boost/config/platform/macos.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/macos.hpp" #elif defined(__IBMCPP__) || defined(_AIX) // IBM -# define BOOST_PLATFORM_CONFIG "boost/config/platform/aix.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/aix.hpp" #elif defined(__amigaos__) // AmigaOS -# define BOOST_PLATFORM_CONFIG "boost/config/platform/amigaos.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/amigaos.hpp" #elif defined(__QNXNTO__) // QNX: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/qnxnto.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/qnxnto.hpp" #elif defined(__VXWORKS__) // vxWorks: -# define BOOST_PLATFORM_CONFIG "boost/config/platform/vxworks.hpp" +#define BOOST_PLATFORM_CONFIG "boost/config/platform/vxworks.hpp" #else -# if defined(unix) \ - || defined(__unix) \ - || defined(_XOPEN_SOURCE) \ - || defined(_POSIX_SOURCE) +#if defined(unix) || defined(__unix) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE) - // generic unix platform: +// generic unix platform: -# ifndef BOOST_HAS_UNISTD_H -# define BOOST_HAS_UNISTD_H -# endif +#ifndef BOOST_HAS_UNISTD_H +#define BOOST_HAS_UNISTD_H +#endif -# include - -# endif - -# if defined (BOOST_ASSERT_CONFIG) - // this must come last - generate an error if we don't - // recognise the platform: -# error "Unknown platform - please configure and report the results to boost.org" -# endif +#include #endif +#if defined(BOOST_ASSERT_CONFIG) +// this must come last - generate an error if we don't +// recognise the platform: +#error "Unknown platform - please configure and report the results to boost.org" +#endif - +#endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/select_stdlib_config.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/select_stdlib_config.hpp index 2a1430ae..1f735355 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/select_stdlib_config.hpp +++ 
b/Samples/2_Concepts_and_Techniques/interval/boost/config/select_stdlib_config.hpp @@ -1,9 +1,9 @@ // Boost compiler configuration selection header file -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Jens Maurer 2001 - 2002. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Jens Maurer 2001 - 2002. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -12,7 +12,7 @@ // locate which std lib we are using and define BOOST_STDLIB_CONFIG as needed: // First include to determine if some version of STLport is in use as the std lib -// (do not rely on this header being included since users can short-circuit this header +// (do not rely on this header being included since users can short-circuit this header // if they know whose std lib they are using.) #include @@ -20,15 +20,15 @@ // STLPort library; this _must_ come first, otherwise since // STLport typically sits on top of some other library, we // can end up detecting that first rather than STLport: -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/stlport.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/stlport.hpp" #else -// If our std lib was not some version of STLport, then include as it is about +// If our std lib was not some version of STLport, then include as it is about // the smallest of the std lib headers that includes real C++ stuff. (Some std libs do not // include their C++-related macros in so this additional include makes sure // we get those definitions) -// (again do not rely on this header being included since users can short-circuit this +// (again do not rely on this header being included since users can short-circuit this // header if they know whose std lib they are using.) 
#include @@ -38,40 +38,37 @@ #elif defined(__STD_RWCOMPILER_H__) || defined(_RWSTD_VER) // Rogue Wave library: -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/roguewave.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/roguewave.hpp" #elif defined(__GLIBCPP__) || defined(__GLIBCXX__) // GNU libstdc++ 3 -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/libstdcpp3.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/libstdcpp3.hpp" #elif defined(__STL_CONFIG_H) // generic SGI STL -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/sgi.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/sgi.hpp" #elif defined(__MSL_CPP__) // MSL standard lib: -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/msl.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/msl.hpp" #elif defined(__IBMCPP__) // take the default VACPP std lib -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/vacpp.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/vacpp.hpp" #elif defined(MSIPL_COMPILE_H) // Modena C++ standard library -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/modena.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/modena.hpp" #elif (defined(_YVALS) && !defined(__IBMCPP__)) || defined(_CPPLIB_VER) // Dinkumware Library (this has to appear after any possible replacement libraries): -# define BOOST_STDLIB_CONFIG "boost/config/stdlib/dinkumware.hpp" +#define BOOST_STDLIB_CONFIG "boost/config/stdlib/dinkumware.hpp" -#elif defined (BOOST_ASSERT_CONFIG) +#elif defined(BOOST_ASSERT_CONFIG) // this must come last - generate an error if we don't // recognise the library: -# error "Unknown standard library - please configure and report the results to boost.org" +#error "Unknown standard library - please configure and report the results to boost.org" #endif #endif - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/dinkumware.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/dinkumware.hpp index ab770599..f06d7899 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/dinkumware.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/dinkumware.hpp @@ -20,119 +20,112 @@ #if defined(_CPPLIB_VER) && (_CPPLIB_VER >= 306) - // full dinkumware 3.06 and above - // fully conforming provided the compiler supports it: -# if !(defined(_GLOBAL_USING) && (_GLOBAL_USING+0 > 0)) && !defined(__BORLANDC__) && !defined(_STD) && !(defined(__ICC) && (__ICC >= 700)) // can be defined in yvals.h -# define BOOST_NO_STDC_NAMESPACE -# endif -# if !(defined(_HAS_MEMBER_TEMPLATES_REBIND) && (_HAS_MEMBER_TEMPLATES_REBIND+0 > 0)) && !(defined(_MSC_VER) && (_MSC_VER > 1300)) && defined(BOOST_MSVC) -# define BOOST_NO_STD_ALLOCATOR -# endif -# define BOOST_HAS_PARTIAL_STD_ALLOCATOR -# if defined(BOOST_MSVC) && (BOOST_MSVC < 1300) - // if this lib version is set up for vc6 then there is no std::use_facet: -# define BOOST_NO_STD_USE_FACET -# define BOOST_HAS_TWO_ARG_USE_FACET - // C lib functions aren't in namespace std either: -# define BOOST_NO_STDC_NAMESPACE - // and nor is -# define BOOST_NO_EXCEPTION_STD_NAMESPACE -# endif +// full dinkumware 3.06 and above +// fully conforming provided the compiler supports it: +#if !(defined(_GLOBAL_USING) && (_GLOBAL_USING + 0 > 0)) && !defined(__BORLANDC__) && !defined(_STD) \ + && !(defined(__ICC) && (__ICC >= 700)) // can be defined in yvals.h +#define BOOST_NO_STDC_NAMESPACE +#endif +#if !(defined(_HAS_MEMBER_TEMPLATES_REBIND) && (_HAS_MEMBER_TEMPLATES_REBIND + 0 > 0)) \ + && !(defined(_MSC_VER) 
&& (_MSC_VER > 1300)) && defined(BOOST_MSVC) +#define BOOST_NO_STD_ALLOCATOR +#endif +#define BOOST_HAS_PARTIAL_STD_ALLOCATOR +#if defined(BOOST_MSVC) && (BOOST_MSVC < 1300) +// if this lib version is set up for vc6 then there is no std::use_facet: +#define BOOST_NO_STD_USE_FACET +#define BOOST_HAS_TWO_ARG_USE_FACET +// C lib functions aren't in namespace std either: +#define BOOST_NO_STDC_NAMESPACE +// and nor is +#define BOOST_NO_EXCEPTION_STD_NAMESPACE +#endif // There's no numeric_limits support unless _LONGLONG is defined: -# if !defined(_LONGLONG) && (_CPPLIB_VER <= 310) -# define BOOST_NO_MS_INT64_NUMERIC_LIMITS -# endif +#if !defined(_LONGLONG) && (_CPPLIB_VER <= 310) +#define BOOST_NO_MS_INT64_NUMERIC_LIMITS +#endif // 3.06 appears to have (non-sgi versions of) & , // and no at all #else -# define BOOST_MSVC_STD_ITERATOR 1 -# define BOOST_NO_STD_ITERATOR -# define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS -# define BOOST_NO_STD_ALLOCATOR -# define BOOST_NO_STDC_NAMESPACE -# define BOOST_NO_STD_USE_FACET -# define BOOST_NO_STD_OUTPUT_ITERATOR_ASSIGN -# define BOOST_HAS_MACRO_USE_FACET -# ifndef _CPPLIB_VER - // Updated Dinkum library defines this, and provides - // its own min and max definitions, as does MTA version. -# ifndef __MTA__ -# define BOOST_NO_STD_MIN_MAX -# endif -# define BOOST_NO_MS_INT64_NUMERIC_LIMITS -# endif +#define BOOST_MSVC_STD_ITERATOR 1 +#define BOOST_NO_STD_ITERATOR +#define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS +#define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_STDC_NAMESPACE +#define BOOST_NO_STD_USE_FACET +#define BOOST_NO_STD_OUTPUT_ITERATOR_ASSIGN +#define BOOST_HAS_MACRO_USE_FACET +#ifndef _CPPLIB_VER +// Updated Dinkum library defines this, and provides +// its own min and max definitions, as does MTA version. 
+#ifndef __MTA__ +#define BOOST_NO_STD_MIN_MAX +#endif +#define BOOST_NO_MS_INT64_NUMERIC_LIMITS +#endif #endif // -// std extension namespace is stdext for vc7.1 and later, +// std extension namespace is stdext for vc7.1 and later, // the same applies to other compilers that sit on top // of vc7.1 (Intel and Comeau): // #if defined(_MSC_VER) && (_MSC_VER >= 1310) && !defined(__BORLANDC__) -# define BOOST_STD_EXTENSION_NAMESPACE stdext +#define BOOST_STD_EXTENSION_NAMESPACE stdext #endif #if (defined(_MSC_VER) && (_MSC_VER <= 1300) && !defined(__BORLANDC__)) || !defined(_CPPLIB_VER) || (_CPPLIB_VER < 306) - // if we're using a dinkum lib that's - // been configured for VC6/7 then there is - // no iterator traits (true even for icl) -# define BOOST_NO_STD_ITERATOR_TRAITS +// if we're using a dinkum lib that's +// been configured for VC6/7 then there is +// no iterator traits (true even for icl) +#define BOOST_NO_STD_ITERATOR_TRAITS #endif #if defined(__ICL) && (__ICL < 800) && defined(_CPPLIB_VER) && (_CPPLIB_VER <= 310) // Intel C++ chokes over any non-trivial use of // this may be an overly restrictive define, but regex fails without it: -# define BOOST_NO_STD_LOCALE +#define BOOST_NO_STD_LOCALE #endif // C++0x headers implemented in 520 (as shipped by Microsoft) // #if !defined(_CPPLIB_VER) || _CPPLIB_VER < 520 -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #endif // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE #ifdef _CPPLIB_VER -# define BOOST_DINKUMWARE_STDLIB _CPPLIB_VER +#define BOOST_DINKUMWARE_STDLIB _CPPLIB_VER #else -# define BOOST_DINKUMWARE_STDLIB 1 +#define BOOST_DINKUMWARE_STDLIB 1 #endif #ifdef _CPPLIB_VER -# define BOOST_STDLIB "Dinkumware standard library version " BOOST_STRINGIZE(_CPPLIB_VER) +#define BOOST_STDLIB "Dinkumware standard library version " BOOST_STRINGIZE(_CPPLIB_VER) #else -# define BOOST_STDLIB "Dinkumware standard library version 1.x" +#define BOOST_STDLIB "Dinkumware standard library 
version 1.x" #endif - - - - - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libcomo.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libcomo.hpp index 06731e32..0fefdc33 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libcomo.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libcomo.hpp @@ -1,8 +1,8 @@ -// (C) Copyright John Maddock 2002 - 2003. -// (C) Copyright Jens Maurer 2002 - 2003. -// (C) Copyright Beman Dawes 2002 - 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2002 - 2003. +// (C) Copyright Jens Maurer 2002 - 2003. +// (C) Copyright Beman Dawes 2002 - 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -10,10 +10,10 @@ // Comeau STL: #if !defined(__LIBCOMO__) -# include -# if !defined(__LIBCOMO__) -# error "This is not the Comeau STL!" -# endif +#include +#if !defined(__LIBCOMO__) +#error "This is not the Comeau STL!" +#endif #endif // @@ -21,7 +21,7 @@ // NOTE: versions of libcomo prior to beta28 have octal version numbering, // e.g. version 25 is 21 (dec) #if __LIBCOMO_VERSION__ <= 22 -# define BOOST_NO_STD_WSTREAMBUF +#define BOOST_NO_STD_WSTREAMBUF #endif #if (__LIBCOMO_VERSION__ <= 31) && defined(_WIN32) @@ -29,34 +29,34 @@ #endif #if __LIBCOMO_VERSION__ >= 31 -# define BOOST_HAS_HASH -# define BOOST_HAS_SLIST +#define BOOST_HAS_HASH +#define BOOST_HAS_SLIST #endif // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET // // Intrinsic type_traits support. 
@@ -67,5 +67,3 @@ #define BOOST_HAS_SGI_TYPE_TRAITS #define BOOST_STDLIB "Comeau standard library " BOOST_STRINGIZE(__LIBCOMO_VERSION__) - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libstdcpp3.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libstdcpp3.hpp index 6a57319f..ca3e2d35 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libstdcpp3.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/libstdcpp3.hpp @@ -16,65 +16,61 @@ #endif #if !defined(_GLIBCPP_USE_WCHAR_T) && !defined(_GLIBCXX_USE_WCHAR_T) -# define BOOST_NO_CWCHAR -# define BOOST_NO_CWCTYPE -# define BOOST_NO_STD_WSTRING -# define BOOST_NO_STD_WSTREAMBUF +#define BOOST_NO_CWCHAR +#define BOOST_NO_CWCTYPE +#define BOOST_NO_STD_WSTRING +#define BOOST_NO_STD_WSTREAMBUF #endif #if defined(__osf__) && !defined(_REENTRANT) \ - && ( defined(_GLIBCXX_HAVE_GTHR_DEFAULT) || defined(_GLIBCPP_HAVE_GTHR_DEFAULT) ) + && (defined(_GLIBCXX_HAVE_GTHR_DEFAULT) || defined(_GLIBCPP_HAVE_GTHR_DEFAULT)) // GCC 3 on Tru64 forces the definition of _REENTRANT when any std lib header // file is included, therefore for consistency we define it here as well. -# define _REENTRANT +#define _REENTRANT #endif #ifdef __GLIBCXX__ // gcc 3.4 and greater: -# if defined(_GLIBCXX_HAVE_GTHR_DEFAULT) \ - || defined(_GLIBCXX__PTHREADS) - // - // If the std lib has thread support turned on, then turn it on in Boost - // as well. We do this because some gcc-3.4 std lib headers define _REENTANT - // while others do not... - // -# define BOOST_HAS_THREADS -# else -# define BOOST_DISABLE_THREADS -# endif -#elif defined(__GLIBCPP__) \ - && !defined(_GLIBCPP_HAVE_GTHR_DEFAULT) \ - && !defined(_GLIBCPP__PTHREADS) - // disable thread support if the std lib was built single threaded: -# define BOOST_DISABLE_THREADS +#if defined(_GLIBCXX_HAVE_GTHR_DEFAULT) || defined(_GLIBCXX__PTHREADS) +// +// If the std lib has thread support turned on, then turn it on in Boost +// as well. We do this because some gcc-3.4 std lib headers define _REENTANT +// while others do not... +// +#define BOOST_HAS_THREADS +#else +#define BOOST_DISABLE_THREADS +#endif +#elif defined(__GLIBCPP__) && !defined(_GLIBCPP_HAVE_GTHR_DEFAULT) && !defined(_GLIBCPP__PTHREADS) +// disable thread support if the std lib was built single threaded: +#define BOOST_DISABLE_THREADS #endif -#if (defined(linux) || defined(__linux) || defined(__linux__)) && defined(__arm__) && defined(_GLIBCPP_HAVE_GTHR_DEFAULT) +#if (defined(linux) || defined(__linux) || defined(__linux__)) && defined(__arm__) \ + && defined(_GLIBCPP_HAVE_GTHR_DEFAULT) // linux on arm apparently doesn't define _REENTRANT // so just turn on threading support whenever the std lib is thread safe: -# define BOOST_HAS_THREADS +#define BOOST_HAS_THREADS #endif -#if !defined(_GLIBCPP_USE_LONG_LONG) \ - && !defined(_GLIBCXX_USE_LONG_LONG)\ - && defined(BOOST_HAS_LONG_LONG) +#if !defined(_GLIBCPP_USE_LONG_LONG) && !defined(_GLIBCXX_USE_LONG_LONG) && defined(BOOST_HAS_LONG_LONG) // May have been set by compiler/*.hpp, but "long long" without library // support is useless. 
-# undef BOOST_HAS_LONG_LONG +#undef BOOST_HAS_LONG_LONG #endif -#if defined(__GLIBCXX__) || (defined(__GLIBCPP__) && __GLIBCPP__>=20020514) // GCC >= 3.1.0 -# define BOOST_STD_EXTENSION_NAMESPACE __gnu_cxx -# define BOOST_HAS_SLIST -# define BOOST_HAS_HASH -# define BOOST_SLIST_HEADER -# if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) -# define BOOST_HASH_SET_HEADER -# define BOOST_HASH_MAP_HEADER -# else -# define BOOST_HASH_SET_HEADER -# define BOOST_HASH_MAP_HEADER -# endif +#if defined(__GLIBCXX__) || (defined(__GLIBCPP__) && __GLIBCPP__ >= 20020514) // GCC >= 3.1.0 +#define BOOST_STD_EXTENSION_NAMESPACE __gnu_cxx +#define BOOST_HAS_SLIST +#define BOOST_HAS_HASH +#define BOOST_SLIST_HEADER +#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) +#define BOOST_HASH_SET_HEADER +#define BOOST_HASH_MAP_HEADER +#else +#define BOOST_HASH_SET_HEADER +#define BOOST_HASH_MAP_HEADER +#endif #endif // stdlibc++ C++0x support is detected via __GNUC__, __GNUC_MINOR__, and possibly @@ -92,36 +88,36 @@ // C++0x headers in GCC 4.3.0 and later // #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || !defined(__GXX_EXPERIMENTAL_CXX0X__) -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #endif // C++0x headers in GCC 4.4.0 and later // #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4) || !defined(__GXX_EXPERIMENTAL_CXX0X__) -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD #endif // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS // --- end --- diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/modena.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/modena.hpp index 7bd50cec..d7763454 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/modena.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/modena.hpp @@ -1,6 +1,6 @@ -// (C) Copyright Jens Maurer 2001. 
-// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright Jens Maurer 2001. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -8,10 +8,10 @@ // Modena C++ standard library (comes with KAI C++) #if !defined(MSIPL_COMPILE_H) -# include -# if !defined(__MSIPL_COMPILE_H) -# error "This is not the Modena C++ library!" -# endif +#include +#if !defined(__MSIPL_COMPILE_H) +#error "This is not the Modena C++ library!" +#endif #endif #ifndef MSIPL_NL_TYPES @@ -24,32 +24,27 @@ // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #define BOOST_STDLIB "Modena C++ standard library" - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/msl.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/msl.hpp index 6bcd232a..59aba774 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/msl.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/msl.hpp @@ -1,7 +1,7 @@ -// (C) Copyright John Maddock 2001. -// (C) Copyright Darin Adler 2001. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001. +// (C) Copyright Darin Adler 2001. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -9,75 +9,66 @@ // Metrowerks standard library: #ifndef __MSL_CPP__ -# include -# ifndef __MSL_CPP__ -# error This is not the MSL standard library! 
-# endif +#include +#ifndef __MSL_CPP__ +#error This is not the MSL standard library! +#endif #endif -#if __MSL_CPP__ >= 0x6000 // Pro 6 -# define BOOST_HAS_HASH -# define BOOST_STD_EXTENSION_NAMESPACE Metrowerks +#if __MSL_CPP__ >= 0x6000 // Pro 6 +#define BOOST_HAS_HASH +#define BOOST_STD_EXTENSION_NAMESPACE Metrowerks #endif #define BOOST_HAS_SLIST #if __MSL_CPP__ < 0x6209 -# define BOOST_NO_STD_MESSAGES +#define BOOST_NO_STD_MESSAGES #endif // check C lib version for #include #if defined(__MSL__) && (__MSL__ >= 0x5000) -# define BOOST_HAS_STDINT_H -# if !defined(__PALMOS_TRAPS__) -# define BOOST_HAS_UNISTD_H -# endif - // boilerplate code: -# include +#define BOOST_HAS_STDINT_H +#if !defined(__PALMOS_TRAPS__) +#define BOOST_HAS_UNISTD_H +#endif +// boilerplate code: +#include #endif #if defined(_MWMT) || _MSL_THREADSAFE -# define BOOST_HAS_THREADS +#define BOOST_HAS_THREADS #endif #ifdef _MSL_NO_EXPLICIT_FUNC_TEMPLATE_ARG -# define BOOST_NO_STD_USE_FACET -# define BOOST_HAS_TWO_ARG_USE_FACET +#define BOOST_NO_STD_USE_FACET +#define BOOST_HAS_TWO_ARG_USE_FACET #endif // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #define BOOST_STDLIB "Metrowerks Standard Library version " BOOST_STRINGIZE(__MSL_CPP__) - - - - - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/roguewave.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/roguewave.hpp index cba2f54a..b30254a4 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/roguewave.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/roguewave.hpp @@ -1,9 +1,9 @@ -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Jens Maurer 2001. -// (C) Copyright David Abrahams 2003. -// (C) Copyright Boris Gubenko 2007. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. 
(See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Jens Maurer 2001. +// (C) Copyright David Abrahams 2003. +// (C) Copyright Boris Gubenko 2007. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -11,32 +11,32 @@ // Rogue Wave std lib: #if !defined(__STD_RWCOMPILER_H__) && !defined(_RWSTD_VER) -# include -# if !defined(__STD_RWCOMPILER_H__) && !defined(_RWSTD_VER) -# error This is not the Rogue Wave standard library -# endif +#include +#if !defined(__STD_RWCOMPILER_H__) && !defined(_RWSTD_VER) +#error This is not the Rogue Wave standard library +#endif #endif // // figure out a consistent version number: // #ifndef _RWSTD_VER -# define BOOST_RWSTD_VER 0x010000 +#define BOOST_RWSTD_VER 0x010000 #elif _RWSTD_VER < 0x010000 -# define BOOST_RWSTD_VER (_RWSTD_VER << 8) +#define BOOST_RWSTD_VER (_RWSTD_VER << 8) #else -# define BOOST_RWSTD_VER _RWSTD_VER +#define BOOST_RWSTD_VER _RWSTD_VER #endif #ifndef _RWSTD_VER -# define BOOST_STDLIB "Rogue Wave standard library version (Unknown version)" +#define BOOST_STDLIB "Rogue Wave standard library version (Unknown version)" #elif _RWSTD_VER < 0x04010200 - # define BOOST_STDLIB "Rogue Wave standard library version " BOOST_STRINGIZE(_RWSTD_VER) +#define BOOST_STDLIB "Rogue Wave standard library version " BOOST_STRINGIZE(_RWSTD_VER) #else -# ifdef _RWSTD_VER_STR -# define BOOST_STDLIB "Apache STDCXX standard library version " _RWSTD_VER_STR -# else -# define BOOST_STDLIB "Apache STDCXX standard library version " BOOST_STRINGIZE(_RWSTD_VER) -# endif +#ifdef _RWSTD_VER_STR +#define BOOST_STDLIB "Apache STDCXX standard library version " _RWSTD_VER_STR +#else +#define BOOST_STDLIB "Apache STDCXX standard library version " BOOST_STRINGIZE(_RWSTD_VER) +#endif #endif // @@ -45,34 +45,35 @@ // template do: // #if BOOST_RWSTD_VER < 0x020200 -# define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS +#define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS #endif // Sun CC 5.5 patch 113817-07 adds long long specialization, but does not change the // library version number (http://sunsolve6.sun.com/search/document.do?assetkey=1-21-113817): #if BOOST_RWSTD_VER <= 0x020101 && (!defined(__SUNPRO_CC) || (__SUNPRO_CC < 0x550)) -# define BOOST_NO_LONG_LONG_NUMERIC_LIMITS -# endif +#define BOOST_NO_LONG_LONG_NUMERIC_LIMITS +#endif // // Borland version of numeric_limits lacks __int64 specialisation: // #ifdef __BORLANDC__ -# define BOOST_NO_MS_INT64_NUMERIC_LIMITS +#define BOOST_NO_MS_INT64_NUMERIC_LIMITS #endif // // No std::iterator if it can't figure out default template args: // -#if defined(_RWSTD_NO_SIMPLE_DEFAULT_TEMPLATES) || defined(RWSTD_NO_SIMPLE_DEFAULT_TEMPLATES) || (BOOST_RWSTD_VER < 0x020000) -# define BOOST_NO_STD_ITERATOR +#if defined(_RWSTD_NO_SIMPLE_DEFAULT_TEMPLATES) || defined(RWSTD_NO_SIMPLE_DEFAULT_TEMPLATES) \ + || (BOOST_RWSTD_VER < 0x020000) +#define BOOST_NO_STD_ITERATOR #endif // // No iterator traits without partial specialization: // #if defined(_RWSTD_NO_CLASS_PARTIAL_SPEC) || defined(RWSTD_NO_CLASS_PARTIAL_SPEC) -# define BOOST_NO_STD_ITERATOR_TRAITS +#define BOOST_NO_STD_ITERATOR_TRAITS #endif // @@ -80,17 +81,17 @@ // new-style iostreams, and no conformant std::allocator: // #if (BOOST_RWSTD_VER < 0x020000) -# define BOOST_NO_AUTO_PTR -# define BOOST_NO_STRINGSTREAM -# define BOOST_NO_STD_ALLOCATOR -# define BOOST_NO_STD_LOCALE 
+#define BOOST_NO_AUTO_PTR +#define BOOST_NO_STRINGSTREAM +#define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_STD_LOCALE #endif // // No template iterator constructors without member template support: // #if defined(RWSTD_NO_MEMBER_TEMPLATES) || defined(_RWSTD_NO_MEMBER_TEMPLATES) -# define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS +#define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS #endif // @@ -99,15 +100,15 @@ // on HP aCC systems even though the allocator is in fact broken): // #if !defined(_RWSTD_ALLOCATOR) || (defined(__HP_aCC) && __HP_aCC <= 33100) -# define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_STD_ALLOCATOR #endif // // If we have a std::locale, we still may not have std::use_facet: // #if defined(_RWSTD_NO_TEMPLATE_ON_RETURN_TYPE) && !defined(BOOST_NO_STD_LOCALE) -# define BOOST_NO_STD_USE_FACET -# define BOOST_HAS_TWO_ARG_USE_FACET +#define BOOST_NO_STD_USE_FACET +#define BOOST_HAS_TWO_ARG_USE_FACET #endif // @@ -115,7 +116,7 @@ // partial specialization support: // #if (BOOST_RWSTD_VER < 0x020000) || defined(_RWSTD_NO_CLASS_PARTIAL_SPEC) - #define BOOST_NO_STD_DISTANCE +#define BOOST_NO_STD_DISTANCE #endif // @@ -123,57 +124,56 @@ // OutputIterators: // #if BOOST_RWSTD_VER < 0x020100 -# define BOOST_NO_STD_OUTPUT_ITERATOR_ASSIGN +#define BOOST_NO_STD_OUTPUT_ITERATOR_ASSIGN #endif // // Disable BOOST_HAS_LONG_LONG when the library has no support for it. // #if !defined(_RWSTD_LONG_LONG) && defined(BOOST_HAS_LONG_LONG) -# undef BOOST_HAS_LONG_LONG +#undef BOOST_HAS_LONG_LONG #endif // // check that on HP-UX, the proper RW library is used // #if defined(__HP_aCC) && !defined(_HP_NAMESPACE_STD) -# error "Boost requires Standard RW library. Please compile and link with -AA" +#error "Boost requires Standard RW library. Please compile and link with -AA" #endif // // Define macros specific to RW V2.2 on HP-UX // #if defined(__HP_aCC) && (BOOST_RWSTD_VER == 0x02020100) -# ifndef __HP_TC1_MAKE_PAIR -# define __HP_TC1_MAKE_PAIR -# endif -# ifndef _HP_INSTANTIATE_STD2_VL -# define _HP_INSTANTIATE_STD2_VL -# endif +#ifndef __HP_TC1_MAKE_PAIR +#define __HP_TC1_MAKE_PAIR +#endif +#ifndef _HP_INSTANTIATE_STD2_VL +#define _HP_INSTANTIATE_STD2_VL +#endif #endif // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET - +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM 
+#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/sgi.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/sgi.hpp index c505008b..e5928524 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/sgi.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/sgi.hpp @@ -1,8 +1,8 @@ -// (C) Copyright John Maddock 2001 - 2003. -// (C) Copyright Darin Adler 2001. -// (C) Copyright Jens Maurer 2001 - 2003. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2003. +// (C) Copyright Darin Adler 2001. +// (C) Copyright Jens Maurer 2001 - 2003. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -10,34 +10,33 @@ // generic SGI STL: #if !defined(__STL_CONFIG_H) -# include -# if !defined(__STL_CONFIG_H) -# error "This is not the SGI STL!" -# endif +#include +#if !defined(__STL_CONFIG_H) +#error "This is not the SGI STL!" +#endif #endif // // No std::iterator traits without partial specialisation: // #if !defined(__STL_CLASS_PARTIAL_SPECIALIZATION) -# define BOOST_NO_STD_ITERATOR_TRAITS +#define BOOST_NO_STD_ITERATOR_TRAITS #endif // // No std::stringstream with gcc < 3 // -#if defined(__GNUC__) && (__GNUC__ < 3) && \ - ((__GNUC_MINOR__ < 95) || (__GNUC_MINOR__ == 96)) && \ - !defined(__STL_USE_NEW_IOSTREAMS) || \ - defined(__APPLE_CC__) - // Note that we only set this for GNU C++ prior to 2.95 since the - // latest patches for that release do contain a minimal - // If you are running a 2.95 release prior to 2.95.3 then this will need - // setting, but there is no way to detect that automatically (other - // than by running the configure script). - // Also, the unofficial GNU C++ 2.96 included in RedHat 7.1 doesn't - // have . -# define BOOST_NO_STRINGSTREAM +#if defined(__GNUC__) && (__GNUC__ < 3) && ((__GNUC_MINOR__ < 95) || (__GNUC_MINOR__ == 96)) \ + && !defined(__STL_USE_NEW_IOSTREAMS) \ + || defined(__APPLE_CC__) +// Note that we only set this for GNU C++ prior to 2.95 since the +// latest patches for that release do contain a minimal +// If you are running a 2.95 release prior to 2.95.3 then this will need +// setting, but there is no way to detect that automatically (other +// than by running the configure script). +// Also, the unofficial GNU C++ 2.96 included in RedHat 7.1 doesn't +// have . 
+#define BOOST_NO_STRINGSTREAM #endif // @@ -45,21 +44,21 @@ // incorrect assumption in some cases): // #if !defined(__SGI_STL_OWN_IOSTREAMS) && !defined(__STL_USE_NEW_IOSTREAMS) -# define BOOST_NO_STD_LOCALE +#define BOOST_NO_STD_LOCALE #endif // // Original native SGI streams have non-standard std::messages facet: // #if defined(__sgi) && (_COMPILER_VERSION <= 650) && !defined(__SGI_STL_OWN_IOSTREAMS) -# define BOOST_NO_STD_LOCALE +#define BOOST_NO_STD_LOCALE #endif // // SGI's new iostreams have missing "const" in messages<>::open // #if defined(__sgi) && (_COMPILER_VERSION <= 740) && defined(__STL_USE_NEW_IOSTREAMS) -# define BOOST_NO_STD_MESSAGES +#define BOOST_NO_STD_MESSAGES #endif // @@ -67,8 +66,8 @@ // without member templates: // #if !defined(__STL_MEMBER_TEMPLATES) -# define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS -# define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS +#define BOOST_NO_STD_ALLOCATOR #endif // @@ -81,20 +80,20 @@ // If this is GNU libstdc++2, then no and no std::wstring: // #if (defined(__GNUC__) && (__GNUC__ < 3)) -# include -# if defined(__BASTRING__) -# define BOOST_NO_LIMITS +#include +#if defined(__BASTRING__) +#define BOOST_NO_LIMITS // Note: will provide compile-time constants -# undef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS -# define BOOST_NO_STD_WSTRING -# endif +#undef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS +#define BOOST_NO_STD_WSTRING +#endif #endif // // There is no standard iterator unless we have namespace support: // #if !defined(__STL_USE_NAMESPACES) -# define BOOST_NO_STD_ITERATOR +#define BOOST_NO_STD_ITERATOR #endif // @@ -107,30 +106,27 @@ // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #define BOOST_STDLIB "SGI standard library" - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/stlport.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/stlport.hpp 
index 3dfd529e..7200d5bc 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/stlport.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/stlport.hpp @@ -1,8 +1,8 @@ -// (C) Copyright John Maddock 2001 - 2002. -// (C) Copyright Darin Adler 2001. -// (C) Copyright Jens Maurer 2001. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2002. +// (C) Copyright Darin Adler 2001. +// (C) Copyright Jens Maurer 2001. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. @@ -10,10 +10,10 @@ // STLPort standard library config: #if !defined(__SGI_STL_PORT) && !defined(_STLPORT_VERSION) -# include -# if !defined(__SGI_STL_PORT) && !defined(_STLPORT_VERSION) -# error "This is not STLPort!" -# endif +#include +#if !defined(__SGI_STL_PORT) && !defined(_STLPORT_VERSION) +#error "This is not STLPort!" +#endif #endif // @@ -21,7 +21,7 @@ // for versions prior to 4.1(beta) // #if (defined(__STL_STATIC_CONST_INIT_BUG) || defined(_STLP_STATIC_CONST_INIT_BUG)) && (__SGI_STL_PORT <= 0x400) -# define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS +#define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS #endif // @@ -29,22 +29,23 @@ // std::iterator traits: // #if !(defined(_STLP_CLASS_PARTIAL_SPECIALIZATION) || defined(__STL_CLASS_PARTIAL_SPECIALIZATION)) -# define BOOST_NO_STD_ITERATOR_TRAITS +#define BOOST_NO_STD_ITERATOR_TRAITS #endif // // No new style iostreams on GCC without STLport's iostreams enabled: // #if (defined(__GNUC__) && (__GNUC__ < 3)) && !(defined(__SGI_STL_OWN_IOSTREAMS) || defined(_STLP_OWN_IOSTREAMS)) -# define BOOST_NO_STRINGSTREAM +#define BOOST_NO_STRINGSTREAM #endif // // No new iostreams implies no std::locale, and no std::stringstream: // -#if defined(__STL_NO_IOSTREAMS) || defined(__STL_NO_NEW_IOSTREAMS) || defined(_STLP_NO_IOSTREAMS) || defined(_STLP_NO_NEW_IOSTREAMS) -# define BOOST_NO_STD_LOCALE -# define BOOST_NO_STRINGSTREAM +#if defined(__STL_NO_IOSTREAMS) || defined(__STL_NO_NEW_IOSTREAMS) || defined(_STLP_NO_IOSTREAMS) \ + || defined(_STLP_NO_NEW_IOSTREAMS) +#define BOOST_NO_STD_LOCALE +#define BOOST_NO_STRINGSTREAM #endif // @@ -52,30 +53,32 @@ // then the io stream facets are not available in namespace std:: // #ifdef _STLPORT_VERSION -# if !(_STLPORT_VERSION >= 0x500) && !defined(_STLP_OWN_IOSTREAMS) && defined(_STLP_USE_NAMESPACES) && defined(BOOST_NO_USING_TEMPLATE) && !defined(__BORLANDC__) -# define BOOST_NO_STD_LOCALE -# endif +#if !(_STLPORT_VERSION >= 0x500) && !defined(_STLP_OWN_IOSTREAMS) && defined(_STLP_USE_NAMESPACES) \ + && defined(BOOST_NO_USING_TEMPLATE) && !defined(__BORLANDC__) +#define BOOST_NO_STD_LOCALE +#endif #else -# if !defined(__SGI_STL_OWN_IOSTREAMS) && defined(__STL_USE_NAMESPACES) && defined(BOOST_NO_USING_TEMPLATE) && !defined(__BORLANDC__) -# define BOOST_NO_STD_LOCALE -# endif +#if !defined(__SGI_STL_OWN_IOSTREAMS) && defined(__STL_USE_NAMESPACES) && defined(BOOST_NO_USING_TEMPLATE) \ + && !defined(__BORLANDC__) +#define BOOST_NO_STD_LOCALE +#endif #endif #if defined(_STLPORT_VERSION) && ((_STLPORT_VERSION < 0x500) || (_STLPORT_VERSION >= 0x520)) -# define BOOST_NO_STD_UNORDERED +#define BOOST_NO_STD_UNORDERED #endif #if defined(_STLPORT_VERSION) && (_STLPORT_VERSION >= 0x520) -# define BOOST_HAS_TR1_UNORDERED_SET -# define 
BOOST_HAS_TR1_UNORDERED_MAP +#define BOOST_HAS_TR1_UNORDERED_SET +#define BOOST_HAS_TR1_UNORDERED_MAP #endif // // Without member template support enabled, their are no template // iterate constructors, and no std::allocator: // #if !(defined(__STL_MEMBER_TEMPLATES) || defined(_STLP_MEMBER_TEMPLATES)) -# define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS -# define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS +#define BOOST_NO_STD_ALLOCATOR #endif // // however we always have at least a partial allocator: @@ -83,11 +86,11 @@ #define BOOST_HAS_PARTIAL_STD_ALLOCATOR #if !defined(_STLP_MEMBER_TEMPLATE_CLASSES) || defined(_STLP_DONT_SUPPORT_REBIND_MEMBER_TEMPLATE) -# define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_STD_ALLOCATOR #endif #if defined(_STLP_NO_MEMBER_TEMPLATE_KEYWORD) && defined(BOOST_MSVC) && (BOOST_MSVC <= 1300) -# define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_STD_ALLOCATOR #endif // @@ -95,12 +98,12 @@ // the support for the relevant specilazations of std:: templates. // #if !defined(_STLP_HAS_WCHAR_T) && !defined(_STLP_WCHAR_T_IS_USHORT) -# ifndef BOOST_NO_STD_WSTRING -# define BOOST_NO_STD_WSTRING -# endif -# ifndef BOOST_NO_STD_WSTREAMBUF -# define BOOST_NO_STD_WSTREAMBUF -# endif +#ifndef BOOST_NO_STD_WSTRING +#define BOOST_NO_STD_WSTRING +#endif +#ifndef BOOST_NO_STD_WSTREAMBUF +#define BOOST_NO_STD_WSTREAMBUF +#endif #endif // @@ -119,7 +122,7 @@ // // Harold Howe says: // Borland switched to STLport in BCB6. Defining BOOST_NO_STDC_NAMESPACE with -// BCB6 does cause problems. If we detect C++ Builder, then don't define +// BCB6 does cause problems. If we detect C++ Builder, then don't define // BOOST_NO_STDC_NAMESPACE // #if !defined(__BORLANDC__) && !defined(__DMC__) @@ -128,29 +131,32 @@ // the global namespace, then we duplicate STLport's using declarations // (by defining BOOST_NO_STDC_NAMESPACE), we do this because STLport doesn't // necessarily import all the names we need into namespace std:: -// -# if (defined(__STL_IMPORT_VENDOR_CSTD) \ - || defined(__STL_USE_OWN_NAMESPACE) \ - || defined(_STLP_IMPORT_VENDOR_CSTD) \ - || defined(_STLP_USE_OWN_NAMESPACE)) \ - && (defined(__STL_VENDOR_GLOBAL_CSTD) || defined (_STLP_VENDOR_GLOBAL_CSTD)) -# define BOOST_NO_STDC_NAMESPACE -# define BOOST_NO_EXCEPTION_STD_NAMESPACE -# endif +// +#if (defined(__STL_IMPORT_VENDOR_CSTD) || defined(__STL_USE_OWN_NAMESPACE) || defined(_STLP_IMPORT_VENDOR_CSTD) \ + || defined(_STLP_USE_OWN_NAMESPACE)) \ + && (defined(__STL_VENDOR_GLOBAL_CSTD) || defined(_STLP_VENDOR_GLOBAL_CSTD)) +#define BOOST_NO_STDC_NAMESPACE +#define BOOST_NO_EXCEPTION_STD_NAMESPACE +#endif #elif defined(__BORLANDC__) && __BORLANDC__ < 0x560 // STLport doesn't import std::abs correctly: #include -namespace std { using ::abs; } +namespace std { +using ::abs; +} // and strcmp/strcpy don't get imported either ('cos they are macros) #include #ifdef strcpy -# undef strcpy +#undef strcpy #endif #ifdef strcmp -# undef strcmp +#undef strcmp #endif #ifdef _STLP_VENDOR_CSTD -namespace std{ using _STLP_VENDOR_CSTD::strcmp; using _STLP_VENDOR_CSTD::strcpy; } +namespace std { +using _STLP_VENDOR_CSTD::strcmp; +using _STLP_VENDOR_CSTD::strcpy; +} // namespace std #endif #endif @@ -158,18 +164,18 @@ namespace std{ using _STLP_VENDOR_CSTD::strcmp; using _STLP_VENDOR_CSTD::strcpy; // std::use_facet may be non-standard, uses a class instead: // #if defined(__STL_NO_EXPLICIT_FUNCTION_TMPL_ARGS) || defined(_STLP_NO_EXPLICIT_FUNCTION_TMPL_ARGS) -# define BOOST_NO_STD_USE_FACET -# define BOOST_HAS_STLP_USE_FACET 
+#define BOOST_NO_STD_USE_FACET +#define BOOST_HAS_STLP_USE_FACET #endif // // If STLport thinks there are no wide functions, etc. is not working; but -// only if BOOST_NO_STDC_NAMESPACE is not defined (if it is then we do the import +// only if BOOST_NO_STDC_NAMESPACE is not defined (if it is then we do the import // into std:: ourselves). // #if defined(_STLP_NO_NATIVE_WIDE_FUNCTIONS) && !defined(BOOST_NO_STDC_NAMESPACE) -# define BOOST_NO_CWCHAR -# define BOOST_NO_CWCTYPE +#define BOOST_NO_CWCHAR +#define BOOST_NO_CWCTYPE #endif // @@ -177,9 +183,9 @@ namespace std{ using _STLP_VENDOR_CSTD::strcmp; using _STLP_VENDOR_CSTD::strcpy; // is not an intrinsic type, then we have to disable the support for it as // well (we would be missing required specializations otherwise). // -#if !defined( _STLP_HAS_WCHAR_T) || defined(_STLP_WCHAR_T_IS_USHORT) -# undef BOOST_NO_INTRINSIC_WCHAR_T -# define BOOST_NO_INTRINSIC_WCHAR_T +#if !defined(_STLP_HAS_WCHAR_T) || defined(_STLP_WCHAR_T_IS_USHORT) +#undef BOOST_NO_INTRINSIC_WCHAR_T +#define BOOST_NO_INTRINSIC_WCHAR_T #endif // @@ -187,50 +193,45 @@ namespace std{ using _STLP_VENDOR_CSTD::strcmp; using _STLP_VENDOR_CSTD::strcpy; // hashtables and the like: // #if defined(__BORLANDC__) && (__BORLANDC__ == 0x560) -# undef BOOST_HAS_HASH +#undef BOOST_HAS_HASH #endif // // gcc-2.95.3/STLPort does not like the using declarations we use to get ADL with std::min/max // #if defined(__GNUC__) && (__GNUC__ < 3) -# include // for std::min and std::max -# define BOOST_USING_STD_MIN() ((void)0) -# define BOOST_USING_STD_MAX() ((void)0) -namespace boost { using std::min; using std::max; } +#include // for std::min and std::max +#define BOOST_USING_STD_MIN() ((void)0) +#define BOOST_USING_STD_MAX() ((void)0) +namespace boost { +using std::max; +using std::min; +} // namespace boost #endif // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #define 
BOOST_STDLIB "STLPort standard library version " BOOST_STRINGIZE(__SGI_STL_PORT) - - - - - - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/vacpp.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/vacpp.hpp index c8d6d5ad..cd4528a0 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/vacpp.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/stdlib/vacpp.hpp @@ -1,12 +1,12 @@ -// (C) Copyright John Maddock 2001 - 2002. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001 - 2002. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org for most recent version. #if __IBMCPP__ <= 501 -# define BOOST_NO_STD_ALLOCATOR +#define BOOST_NO_STD_ALLOCATOR #endif #define BOOST_HAS_MACRO_USE_FACET @@ -14,30 +14,27 @@ // C++0x headers not yet implemented // -# define BOOST_NO_0X_HDR_ARRAY -# define BOOST_NO_0X_HDR_CHRONO -# define BOOST_NO_0X_HDR_CODECVT -# define BOOST_NO_0X_HDR_CONCEPTS -# define BOOST_NO_0X_HDR_CONDITION_VARIABLE -# define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS -# define BOOST_NO_0X_HDR_FORWARD_LIST -# define BOOST_NO_0X_HDR_FUTURE -# define BOOST_NO_0X_HDR_INITIALIZER_LIST -# define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS -# define BOOST_NO_0X_HDR_MEMORY_CONCEPTS -# define BOOST_NO_0X_HDR_MUTEX -# define BOOST_NO_0X_HDR_RANDOM -# define BOOST_NO_0X_HDR_RATIO -# define BOOST_NO_0X_HDR_REGEX -# define BOOST_NO_0X_HDR_SYSTEM_ERROR -# define BOOST_NO_0X_HDR_THREAD -# define BOOST_NO_0X_HDR_TUPLE -# define BOOST_NO_0X_HDR_TYPE_TRAITS -# define BOOST_NO_STD_UNORDERED // deprecated; see following -# define BOOST_NO_0X_HDR_UNORDERED_MAP -# define BOOST_NO_0X_HDR_UNORDERED_SET +#define BOOST_NO_0X_HDR_ARRAY +#define BOOST_NO_0X_HDR_CHRONO +#define BOOST_NO_0X_HDR_CODECVT +#define BOOST_NO_0X_HDR_CONCEPTS +#define BOOST_NO_0X_HDR_CONDITION_VARIABLE +#define BOOST_NO_0X_HDR_CONTAINER_CONCEPTS +#define BOOST_NO_0X_HDR_FORWARD_LIST +#define BOOST_NO_0X_HDR_FUTURE +#define BOOST_NO_0X_HDR_INITIALIZER_LIST +#define BOOST_NO_0X_HDR_ITERATOR_CONCEPTS +#define BOOST_NO_0X_HDR_MEMORY_CONCEPTS +#define BOOST_NO_0X_HDR_MUTEX +#define BOOST_NO_0X_HDR_RANDOM +#define BOOST_NO_0X_HDR_RATIO +#define BOOST_NO_0X_HDR_REGEX +#define BOOST_NO_0X_HDR_SYSTEM_ERROR +#define BOOST_NO_0X_HDR_THREAD +#define BOOST_NO_0X_HDR_TUPLE +#define BOOST_NO_0X_HDR_TYPE_TRAITS +#define BOOST_NO_STD_UNORDERED // deprecated; see following +#define BOOST_NO_0X_HDR_UNORDERED_MAP +#define BOOST_NO_0X_HDR_UNORDERED_SET #define BOOST_STDLIB "Visual Age default standard library" - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/suffix.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/suffix.hpp index e9b40a19..71134158 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/suffix.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/suffix.hpp @@ -3,7 +3,7 @@ // Copyright (c) 2001-2003 John Maddock // Copyright (c) 2001 Darin Adler // Copyright (c) 2001 Peter Dimov -// Copyright (c) 2002 Bill Kempf +// Copyright (c) 2002 Bill Kempf // Copyright (c) 2002 Jens Maurer // Copyright (c) 2002-2003 David Abrahams // Copyright (c) 2003 Gennaro Prota @@ -31,55 +31,52 @@ // remember that since these just declare a bunch of macros, there should be 
// no namespace issues from this. // -#if !defined(BOOST_HAS_LONG_LONG) && !defined(BOOST_NO_LONG_LONG) \ - && !defined(BOOST_MSVC) && !defined(__BORLANDC__) -# include -# if (defined(ULLONG_MAX) || defined(ULONG_LONG_MAX) || defined(ULONGLONG_MAX)) -# define BOOST_HAS_LONG_LONG -# else -# define BOOST_NO_LONG_LONG -# endif +#if !defined(BOOST_HAS_LONG_LONG) && !defined(BOOST_NO_LONG_LONG) && !defined(BOOST_MSVC) && !defined(__BORLANDC__) +#include +#if (defined(ULLONG_MAX) || defined(ULONG_LONG_MAX) || defined(ULONGLONG_MAX)) +#define BOOST_HAS_LONG_LONG +#else +#define BOOST_NO_LONG_LONG +#endif #endif // GCC 3.x will clean up all of those nasty macro definitions that // BOOST_NO_CTYPE_FUNCTIONS is intended to help work around, so undefine // it under GCC 3.x. #if defined(__GNUC__) && (__GNUC__ >= 3) && defined(BOOST_NO_CTYPE_FUNCTIONS) -# undef BOOST_NO_CTYPE_FUNCTIONS +#undef BOOST_NO_CTYPE_FUNCTIONS #endif // // Assume any extensions are in namespace std:: unless stated otherwise: // -# ifndef BOOST_STD_EXTENSION_NAMESPACE -# define BOOST_STD_EXTENSION_NAMESPACE std -# endif +#ifndef BOOST_STD_EXTENSION_NAMESPACE +#define BOOST_STD_EXTENSION_NAMESPACE std +#endif // // If cv-qualified specializations are not allowed, then neither are cv-void ones: // -# if defined(BOOST_NO_CV_SPECIALIZATIONS) \ - && !defined(BOOST_NO_CV_VOID_SPECIALIZATIONS) -# define BOOST_NO_CV_VOID_SPECIALIZATIONS -# endif +#if defined(BOOST_NO_CV_SPECIALIZATIONS) && !defined(BOOST_NO_CV_VOID_SPECIALIZATIONS) +#define BOOST_NO_CV_VOID_SPECIALIZATIONS +#endif // // If there is no numeric_limits template, then it can't have any compile time // constants either! // -# if defined(BOOST_NO_LIMITS) \ - && !defined(BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS) -# define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS -# define BOOST_NO_MS_INT64_NUMERIC_LIMITS -# define BOOST_NO_LONG_LONG_NUMERIC_LIMITS -# endif +#if defined(BOOST_NO_LIMITS) && !defined(BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS) +#define BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS +#define BOOST_NO_MS_INT64_NUMERIC_LIMITS +#define BOOST_NO_LONG_LONG_NUMERIC_LIMITS +#endif // // if there is no long long then there is no specialisation // for numeric_limits either: // #if !defined(BOOST_HAS_LONG_LONG) && !defined(BOOST_NO_LONG_LONG_NUMERIC_LIMITS) -# define BOOST_NO_LONG_LONG_NUMERIC_LIMITS +#define BOOST_NO_LONG_LONG_NUMERIC_LIMITS #endif // @@ -87,135 +84,126 @@ // for numeric_limits<__int64> either: // #if !defined(BOOST_HAS_MS_INT64) && !defined(BOOST_NO_MS_INT64_NUMERIC_LIMITS) -# define BOOST_NO_MS_INT64_NUMERIC_LIMITS +#define BOOST_NO_MS_INT64_NUMERIC_LIMITS #endif // // if member templates are supported then so is the // VC6 subset of member templates: // -# if !defined(BOOST_NO_MEMBER_TEMPLATES) \ - && !defined(BOOST_MSVC6_MEMBER_TEMPLATES) -# define BOOST_MSVC6_MEMBER_TEMPLATES -# endif +#if !defined(BOOST_NO_MEMBER_TEMPLATES) && !defined(BOOST_MSVC6_MEMBER_TEMPLATES) +#define BOOST_MSVC6_MEMBER_TEMPLATES +#endif // // Without partial specialization, can't test for partial specialisation bugs: // -# if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) \ - && !defined(BOOST_BCB_PARTIAL_SPECIALIZATION_BUG) -# define BOOST_BCB_PARTIAL_SPECIALIZATION_BUG -# endif +#if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) && !defined(BOOST_BCB_PARTIAL_SPECIALIZATION_BUG) +#define BOOST_BCB_PARTIAL_SPECIALIZATION_BUG +#endif // // Without partial specialization, we can't have array-type partial specialisations: // -# if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) \ - && 
!defined(BOOST_NO_ARRAY_TYPE_SPECIALIZATIONS) -# define BOOST_NO_ARRAY_TYPE_SPECIALIZATIONS -# endif +#if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) && !defined(BOOST_NO_ARRAY_TYPE_SPECIALIZATIONS) +#define BOOST_NO_ARRAY_TYPE_SPECIALIZATIONS +#endif // // Without partial specialization, std::iterator_traits can't work: // -# if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) \ - && !defined(BOOST_NO_STD_ITERATOR_TRAITS) -# define BOOST_NO_STD_ITERATOR_TRAITS -# endif +#if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) && !defined(BOOST_NO_STD_ITERATOR_TRAITS) +#define BOOST_NO_STD_ITERATOR_TRAITS +#endif // -// Without partial specialization, partial +// Without partial specialization, partial // specialization with default args won't work either: // -# if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) \ - && !defined(BOOST_NO_PARTIAL_SPECIALIZATION_IMPLICIT_DEFAULT_ARGS) -# define BOOST_NO_PARTIAL_SPECIALIZATION_IMPLICIT_DEFAULT_ARGS -# endif +#if defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) && !defined(BOOST_NO_PARTIAL_SPECIALIZATION_IMPLICIT_DEFAULT_ARGS) +#define BOOST_NO_PARTIAL_SPECIALIZATION_IMPLICIT_DEFAULT_ARGS +#endif // // Without member template support, we can't have template constructors // in the standard library either: // -# if defined(BOOST_NO_MEMBER_TEMPLATES) \ - && !defined(BOOST_MSVC6_MEMBER_TEMPLATES) \ - && !defined(BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS) -# define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS -# endif +#if defined(BOOST_NO_MEMBER_TEMPLATES) && !defined(BOOST_MSVC6_MEMBER_TEMPLATES) \ + && !defined(BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS) +#define BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS +#endif // // Without member template support, we can't have a conforming // std::allocator template either: // -# if defined(BOOST_NO_MEMBER_TEMPLATES) \ - && !defined(BOOST_MSVC6_MEMBER_TEMPLATES) \ - && !defined(BOOST_NO_STD_ALLOCATOR) -# define BOOST_NO_STD_ALLOCATOR -# endif +#if defined(BOOST_NO_MEMBER_TEMPLATES) && !defined(BOOST_MSVC6_MEMBER_TEMPLATES) && !defined(BOOST_NO_STD_ALLOCATOR) +#define BOOST_NO_STD_ALLOCATOR +#endif // // without ADL support then using declarations will break ADL as well: // #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) && !defined(BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL) -# define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL +#define BOOST_FUNCTION_SCOPE_USING_DECLARATION_BREAKS_ADL #endif // // Without typeid support we have no dynamic RTTI either: // #if defined(BOOST_NO_TYPEID) && !defined(BOOST_NO_RTTI) -# define BOOST_NO_RTTI +#define BOOST_NO_RTTI #endif // // If we have a standard allocator, then we have a partial one as well: // #if !defined(BOOST_NO_STD_ALLOCATOR) -# define BOOST_HAS_PARTIAL_STD_ALLOCATOR +#define BOOST_HAS_PARTIAL_STD_ALLOCATOR #endif // // We can't have a working std::use_facet if there is no std::locale: // -# if defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_USE_FACET) -# define BOOST_NO_STD_USE_FACET -# endif +#if defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_USE_FACET) +#define BOOST_NO_STD_USE_FACET +#endif // // We can't have a std::messages facet if there is no std::locale: // -# if defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_MESSAGES) -# define BOOST_NO_STD_MESSAGES -# endif +#if defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_MESSAGES) +#define BOOST_NO_STD_MESSAGES +#endif // // We can't have a working std::wstreambuf if there is no std::locale: // -# if defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_WSTREAMBUF) -# 
define BOOST_NO_STD_WSTREAMBUF -# endif +#if defined(BOOST_NO_STD_LOCALE) && !defined(BOOST_NO_STD_WSTREAMBUF) +#define BOOST_NO_STD_WSTREAMBUF +#endif // // We can't have a if there is no : // -# if defined(BOOST_NO_CWCHAR) && !defined(BOOST_NO_CWCTYPE) -# define BOOST_NO_CWCTYPE -# endif +#if defined(BOOST_NO_CWCHAR) && !defined(BOOST_NO_CWCTYPE) +#define BOOST_NO_CWCTYPE +#endif // // We can't have a swprintf if there is no : // -# if defined(BOOST_NO_CWCHAR) && !defined(BOOST_NO_SWPRINTF) -# define BOOST_NO_SWPRINTF -# endif +#if defined(BOOST_NO_CWCHAR) && !defined(BOOST_NO_SWPRINTF) +#define BOOST_NO_SWPRINTF +#endif // // If Win32 support is turned off, then we must turn off // threading support also, unless there is some other // thread API enabled: // -#if defined(BOOST_DISABLE_WIN32) && defined(_WIN32) \ - && !defined(BOOST_DISABLE_THREADS) && !defined(BOOST_HAS_PTHREADS) -# define BOOST_DISABLE_THREADS +#if defined(BOOST_DISABLE_WIN32) && defined(_WIN32) && !defined(BOOST_DISABLE_THREADS) && !defined(BOOST_HAS_PTHREADS) +#define BOOST_DISABLE_THREADS #endif // @@ -224,86 +212,85 @@ // limited number of macros that identify this (if there's any missing // from here then add to the appropriate compiler section): // -#if (defined(__MT__) || defined(_MT) || defined(_REENTRANT) \ - || defined(_PTHREADS) || defined(__APPLE__) || defined(__DragonFly__)) \ +#if (defined(__MT__) || defined(_MT) || defined(_REENTRANT) || defined(_PTHREADS) || defined(__APPLE__) \ + || defined(__DragonFly__)) \ && !defined(BOOST_HAS_THREADS) -# define BOOST_HAS_THREADS +#define BOOST_HAS_THREADS #endif // // Turn threading support off if BOOST_DISABLE_THREADS is defined: // #if defined(BOOST_DISABLE_THREADS) && defined(BOOST_HAS_THREADS) -# undef BOOST_HAS_THREADS +#undef BOOST_HAS_THREADS #endif // // Turn threading support off if we don't recognise the threading API: // -#if defined(BOOST_HAS_THREADS) && !defined(BOOST_HAS_PTHREADS)\ - && !defined(BOOST_HAS_WINTHREADS) && !defined(BOOST_HAS_BETHREADS)\ - && !defined(BOOST_HAS_MPTASKS) -# undef BOOST_HAS_THREADS +#if defined(BOOST_HAS_THREADS) && !defined(BOOST_HAS_PTHREADS) && !defined(BOOST_HAS_WINTHREADS) \ + && !defined(BOOST_HAS_BETHREADS) && !defined(BOOST_HAS_MPTASKS) +#undef BOOST_HAS_THREADS #endif // // Turn threading detail macros off if we don't (want to) use threading // #ifndef BOOST_HAS_THREADS -# undef BOOST_HAS_PTHREADS -# undef BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE -# undef BOOST_HAS_PTHREAD_YIELD -# undef BOOST_HAS_PTHREAD_DELAY_NP -# undef BOOST_HAS_WINTHREADS -# undef BOOST_HAS_BETHREADS -# undef BOOST_HAS_MPTASKS +#undef BOOST_HAS_PTHREADS +#undef BOOST_HAS_PTHREAD_MUTEXATTR_SETTYPE +#undef BOOST_HAS_PTHREAD_YIELD +#undef BOOST_HAS_PTHREAD_DELAY_NP +#undef BOOST_HAS_WINTHREADS +#undef BOOST_HAS_BETHREADS +#undef BOOST_HAS_MPTASKS #endif // // If the compiler claims to be C99 conformant, then it had better // have a : // -# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901) -# define BOOST_HAS_STDINT_H -# ifndef BOOST_HAS_LOG1P -# define BOOST_HAS_LOG1P -# endif -# ifndef BOOST_HAS_EXPM1 -# define BOOST_HAS_EXPM1 -# endif -# endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901) +#define BOOST_HAS_STDINT_H +#ifndef BOOST_HAS_LOG1P +#define BOOST_HAS_LOG1P +#endif +#ifndef BOOST_HAS_EXPM1 +#define BOOST_HAS_EXPM1 +#endif +#endif // // Define BOOST_NO_SLIST and BOOST_NO_HASH if required. // Note that this is for backwards compatibility only. 
// -# if !defined(BOOST_HAS_SLIST) && !defined(BOOST_NO_SLIST) -# define BOOST_NO_SLIST -# endif +#if !defined(BOOST_HAS_SLIST) && !defined(BOOST_NO_SLIST) +#define BOOST_NO_SLIST +#endif -# if !defined(BOOST_HAS_HASH) && !defined(BOOST_NO_HASH) -# define BOOST_NO_HASH -# endif +#if !defined(BOOST_HAS_HASH) && !defined(BOOST_NO_HASH) +#define BOOST_NO_HASH +#endif // // Set BOOST_SLIST_HEADER if not set already: // #if defined(BOOST_HAS_SLIST) && !defined(BOOST_SLIST_HEADER) -# define BOOST_SLIST_HEADER +#define BOOST_SLIST_HEADER #endif // // Set BOOST_HASH_SET_HEADER if not set already: // #if defined(BOOST_HAS_HASH) && !defined(BOOST_HASH_SET_HEADER) -# define BOOST_HASH_SET_HEADER +#define BOOST_HASH_SET_HEADER #endif // // Set BOOST_HASH_MAP_HEADER if not set already: // #if defined(BOOST_HAS_HASH) && !defined(BOOST_HASH_MAP_HEADER) -# define BOOST_HASH_MAP_HEADER +#define BOOST_HASH_MAP_HEADER #endif // @@ -311,18 +298,18 @@ // #if defined(BOOST_NO_0X_HDR_INITIALIZER_LIST) && !defined(BOOST_NO_INITIALIZER_LISTS) -# define BOOST_NO_INITIALIZER_LISTS +#define BOOST_NO_INITIALIZER_LISTS #endif // BOOST_HAS_ABI_HEADERS // This macro gets set if we have headers that fix the ABI, // and prevent ODR violations when linking to external libraries: #if defined(BOOST_ABI_PREFIX) && defined(BOOST_ABI_SUFFIX) && !defined(BOOST_HAS_ABI_HEADERS) -# define BOOST_HAS_ABI_HEADERS +#define BOOST_HAS_ABI_HEADERS #endif #if defined(BOOST_HAS_ABI_HEADERS) && defined(BOOST_DISABLE_ABI_HEADERS) -# undef BOOST_HAS_ABI_HEADERS +#undef BOOST_HAS_ABI_HEADERS #endif // BOOST_NO_STDC_NAMESPACE workaround --------------------------------------// @@ -334,39 +321,42 @@ // works as expected with standard conforming compilers. The resulting // double inclusion of is harmless. -# ifdef BOOST_NO_STDC_NAMESPACE -# include - namespace std { using ::ptrdiff_t; using ::size_t; } -# endif +#ifdef BOOST_NO_STDC_NAMESPACE +#include +namespace std { +using ::ptrdiff_t; +using ::size_t; +} // namespace std +#endif // Workaround for the unfortunate min/max macros defined by some platform headers #define BOOST_PREVENT_MACRO_SUBSTITUTION #ifndef BOOST_USING_STD_MIN -# define BOOST_USING_STD_MIN() using std::min +#define BOOST_USING_STD_MIN() using std::min #endif #ifndef BOOST_USING_STD_MAX -# define BOOST_USING_STD_MAX() using std::max +#define BOOST_USING_STD_MAX() using std::max #endif // BOOST_NO_STD_MIN_MAX workaround -----------------------------------------// -# ifdef BOOST_NO_STD_MIN_MAX +#ifdef BOOST_NO_STD_MIN_MAX namespace std_ns { - template - inline const _Tp& min BOOST_PREVENT_MACRO_SUBSTITUTION (const _Tp& __a, const _Tp& __b) { +template inline const _Tp &min BOOST_PREVENT_MACRO_SUBSTITUTION(const _Tp &__a, const _Tp &__b) +{ return __b < __a ? __b : __a; - } - template - inline const _Tp& max BOOST_PREVENT_MACRO_SUBSTITUTION (const _Tp& __a, const _Tp& __b) { - return __a < __b ? __b : __a; - } } +template inline const _Tp &max BOOST_PREVENT_MACRO_SUBSTITUTION(const _Tp &__a, const _Tp &__b) +{ + return __a < __b ? __b : __a; +} +} // namespace std_ns -# endif +#endif // BOOST_STATIC_CONSTANT workaround --------------------------------------- // // On compilers which don't allow in-class initialization of static integral @@ -374,11 +364,11 @@ namespace std_ns { // to be available at compile-time. This macro gives us a convenient way to // declare such constants. 
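As a hedged illustration of the workaround described above (the two expansions appear in the hunk that follows): BOOST_STATIC_CONSTANT lets user code declare a compile-time integral constant without caring whether the compiler supports in-class initialization. Everything here beyond the macro itself — the bits_of template and the array bound — is invented for the sketch.

#include <boost/config.hpp>

template <typename T>
struct bits_of
{
    // Expands to `enum { value = ... }` on compilers lacking in-class
    // initialization of static integral constants, and to
    // `static const unsigned value = ...` everywhere else; either way
    // the result is usable in constant expressions.
    BOOST_STATIC_CONSTANT(unsigned, value = sizeof(T) * 8);
};

char scratch[bits_of<int>::value]; // compile-time use as an array bound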
-# ifdef BOOST_NO_INCLASS_MEMBER_INITIALIZATION -# define BOOST_STATIC_CONSTANT(type, assignment) enum { assignment } -# else -# define BOOST_STATIC_CONSTANT(type, assignment) static const type assignment -# endif +#ifdef BOOST_NO_INCLASS_MEMBER_INITIALIZATION +#define BOOST_STATIC_CONSTANT(type, assignment) enum { assignment } +#else +#define BOOST_STATIC_CONSTANT(type, assignment) static const type assignment +#endif // BOOST_USE_FACET / HAS_FACET workaround ----------------------------------// // When the standard library does not have a conforming std::use_facet there @@ -394,19 +384,19 @@ namespace std_ns { // Use for BOOST_HAS_FACET is analogous. #if defined(BOOST_NO_STD_USE_FACET) -# ifdef BOOST_HAS_TWO_ARG_USE_FACET -# define BOOST_USE_FACET(Type, loc) std::use_facet(loc, static_cast(0)) -# define BOOST_HAS_FACET(Type, loc) std::has_facet(loc, static_cast(0)) -# elif defined(BOOST_HAS_MACRO_USE_FACET) -# define BOOST_USE_FACET(Type, loc) std::_USE(loc, Type) -# define BOOST_HAS_FACET(Type, loc) std::_HAS(loc, Type) -# elif defined(BOOST_HAS_STLP_USE_FACET) -# define BOOST_USE_FACET(Type, loc) (*std::_Use_facet(loc)) -# define BOOST_HAS_FACET(Type, loc) std::has_facet< Type >(loc) -# endif +#ifdef BOOST_HAS_TWO_ARG_USE_FACET +#define BOOST_USE_FACET(Type, loc) std::use_facet(loc, static_cast(0)) +#define BOOST_HAS_FACET(Type, loc) std::has_facet(loc, static_cast(0)) +#elif defined(BOOST_HAS_MACRO_USE_FACET) +#define BOOST_USE_FACET(Type, loc) std::_USE(loc, Type) +#define BOOST_HAS_FACET(Type, loc) std::_HAS(loc, Type) +#elif defined(BOOST_HAS_STLP_USE_FACET) +#define BOOST_USE_FACET(Type, loc) (*std::_Use_facet(loc)) +#define BOOST_HAS_FACET(Type, loc) std::has_facet(loc) +#endif #else -# define BOOST_USE_FACET(Type, loc) std::use_facet< Type >(loc) -# define BOOST_HAS_FACET(Type, loc) std::has_facet< Type >(loc) +#define BOOST_USE_FACET(Type, loc) std::use_facet(loc) +#define BOOST_HAS_FACET(Type, loc) std::has_facet(loc) #endif // BOOST_NESTED_TEMPLATE workaround ------------------------------------------// @@ -420,9 +410,9 @@ namespace std_ns { // typedef typename A::BOOST_NESTED_TEMPLATE rebind binder; #ifndef BOOST_NO_MEMBER_TEMPLATE_KEYWORD -# define BOOST_NESTED_TEMPLATE template +#define BOOST_NESTED_TEMPLATE template #else -# define BOOST_NESTED_TEMPLATE +#define BOOST_NESTED_TEMPLATE #endif // BOOST_UNREACHABLE_RETURN(x) workaround -------------------------------------// @@ -431,9 +421,9 @@ namespace std_ns { // statement that can never be reached. 
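A minimal sketch of the idiom this macro supports, assuming only the definitions immediately below (safe_divide is a made-up example): on compilers that define BOOST_NO_UNREACHABLE_RETURN_DETECTION the macro expands to `return x;`, silencing the spurious missing-return warning; elsewhere it expands to nothing.

#include <boost/config.hpp>
#include <stdexcept>

int safe_divide(int a, int b)
{
    if (b != 0)
        return a / b;
    throw std::invalid_argument("division by zero");
    // Unreachable; expands to `return 0;` only where the compiler cannot
    // see that the throw above already terminates the function.
    BOOST_UNREACHABLE_RETURN(0)
}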
#ifdef BOOST_NO_UNREACHABLE_RETURN_DETECTION -# define BOOST_UNREACHABLE_RETURN(x) return x; +#define BOOST_UNREACHABLE_RETURN(x) return x; #else -# define BOOST_UNREACHABLE_RETURN(x) +#define BOOST_UNREACHABLE_RETURN(x) #endif // BOOST_DEDUCED_TYPENAME workaround ------------------------------------------// @@ -448,15 +438,15 @@ namespace std_ns { // template void f(T, BOOST_DEDUCED_TYPENAME T::type); #ifndef BOOST_NO_DEDUCED_TYPENAME -# define BOOST_DEDUCED_TYPENAME typename +#define BOOST_DEDUCED_TYPENAME typename #else -# define BOOST_DEDUCED_TYPENAME +#define BOOST_DEDUCED_TYPENAME #endif #ifndef BOOST_NO_TYPENAME_WITH_CTOR -# define BOOST_CTOR_TYPENAME typename +#define BOOST_CTOR_TYPENAME typename #else -# define BOOST_CTOR_TYPENAME +#define BOOST_CTOR_TYPENAME #endif // long long workaround ------------------------------------------// @@ -465,15 +455,15 @@ namespace std_ns { // (with -pedantic -ansi) unless it's use is prefixed by __extension__ // #if defined(BOOST_HAS_LONG_LONG) -namespace boost{ -# ifdef __GNUC__ - __extension__ typedef long long long_long_type; - __extension__ typedef unsigned long long ulong_long_type; -# else - typedef long long long_long_type; - typedef unsigned long long ulong_long_type; -# endif -} +namespace boost { +#ifdef __GNUC__ +__extension__ typedef long long long_long_type; +__extension__ typedef unsigned long long ulong_long_type; +#else +typedef long long long_long_type; +typedef unsigned long long ulong_long_type; +#endif +} // namespace boost #endif // BOOST_[APPEND_]EXPLICIT_TEMPLATE_[NON_]TYPE macros --------------------------// @@ -520,36 +510,32 @@ namespace boost{ #if defined BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS -# include "boost/type.hpp" -# include "boost/non_type.hpp" +#include "boost/non_type.hpp" +#include "boost/type.hpp" -# define BOOST_EXPLICIT_TEMPLATE_TYPE(t) boost::type* = 0 -# define BOOST_EXPLICIT_TEMPLATE_TYPE_SPEC(t) boost::type* -# define BOOST_EXPLICIT_TEMPLATE_NON_TYPE(t, v) boost::non_type* = 0 -# define BOOST_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) boost::non_type* +#define BOOST_EXPLICIT_TEMPLATE_TYPE(t) boost::type * = 0 +#define BOOST_EXPLICIT_TEMPLATE_TYPE_SPEC(t) boost::type * +#define BOOST_EXPLICIT_TEMPLATE_NON_TYPE(t, v) boost::non_type * = 0 +#define BOOST_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) boost::non_type * -# define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE(t) \ - , BOOST_EXPLICIT_TEMPLATE_TYPE(t) -# define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(t) \ - , BOOST_EXPLICIT_TEMPLATE_TYPE_SPEC(t) -# define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(t, v) \ - , BOOST_EXPLICIT_TEMPLATE_NON_TYPE(t, v) -# define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) \ - , BOOST_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE(t) , BOOST_EXPLICIT_TEMPLATE_TYPE(t) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(t) , BOOST_EXPLICIT_TEMPLATE_TYPE_SPEC(t) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(t, v) , BOOST_EXPLICIT_TEMPLATE_NON_TYPE(t, v) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) , BOOST_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) #else // no workaround needed: expand to nothing -# define BOOST_EXPLICIT_TEMPLATE_TYPE(t) -# define BOOST_EXPLICIT_TEMPLATE_TYPE_SPEC(t) -# define BOOST_EXPLICIT_TEMPLATE_NON_TYPE(t, v) -# define BOOST_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) +#define BOOST_EXPLICIT_TEMPLATE_TYPE(t) +#define BOOST_EXPLICIT_TEMPLATE_TYPE_SPEC(t) +#define BOOST_EXPLICIT_TEMPLATE_NON_TYPE(t, v) +#define BOOST_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) -# define 
BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE(t) -# define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(t) -# define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(t, v) -# define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE(t) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(t) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(t, v) +#define BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE_SPEC(t, v) #endif // defined BOOST_NO_EXPLICIT_FUNCTION_TEMPLATE_ARGUMENTS @@ -562,7 +548,7 @@ namespace boost{ // Converts the parameter X to a string after macro replacement // on X has been performed. // -#define BOOST_STRINGIZE(X) BOOST_DO_STRINGIZE(X) +#define BOOST_STRINGIZE(X) BOOST_DO_STRINGIZE(X) #define BOOST_DO_STRINGIZE(X) #X // @@ -573,29 +559,26 @@ namespace boost{ // is that macro expansion of macro arguments does not // occur in BOOST_DO_JOIN2 but does in BOOST_DO_JOIN. // -#define BOOST_JOIN( X, Y ) BOOST_DO_JOIN( X, Y ) -#define BOOST_DO_JOIN( X, Y ) BOOST_DO_JOIN2(X,Y) -#define BOOST_DO_JOIN2( X, Y ) X##Y +#define BOOST_JOIN(X, Y) BOOST_DO_JOIN(X, Y) +#define BOOST_DO_JOIN(X, Y) BOOST_DO_JOIN2(X, Y) +#define BOOST_DO_JOIN2(X, Y) X##Y // // Set some default values for compiler/library/platform names. // These are for debugging config setup only: // -# ifndef BOOST_COMPILER -# define BOOST_COMPILER "Unknown ISO C++ Compiler" -# endif -# ifndef BOOST_STDLIB -# define BOOST_STDLIB "Unknown ISO standard library" -# endif -# ifndef BOOST_PLATFORM -# if defined(unix) || defined(__unix) || defined(_XOPEN_SOURCE) \ - || defined(_POSIX_SOURCE) -# define BOOST_PLATFORM "Generic Unix" -# else -# define BOOST_PLATFORM "Unknown" -# endif -# endif - +#ifndef BOOST_COMPILER +#define BOOST_COMPILER "Unknown ISO C++ Compiler" +#endif +#ifndef BOOST_STDLIB +#define BOOST_STDLIB "Unknown ISO standard library" +#endif +#ifndef BOOST_PLATFORM +#if defined(unix) || defined(__unix) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE) +#define BOOST_PLATFORM "Generic Unix" +#else +#define BOOST_PLATFORM "Unknown" +#endif #endif - +#endif diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/user.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/user.hpp index 5a4a9d47..b4d645ae 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/user.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/user.hpp @@ -1,8 +1,8 @@ // boost/config/user.hpp ---------------------------------------------------// -// (C) Copyright John Maddock 2001. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file +// (C) Copyright John Maddock 2001. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // Do not check in modified versions of this file, @@ -65,60 +65,57 @@ // even when available: // #define BOOST_DISABLE_WIN32 -// BOOST_DISABLE_ABI_HEADERS: Stops boost headers from including any -// prefix/suffix headers that normally control things like struct -// packing and alignment. +// BOOST_DISABLE_ABI_HEADERS: Stops boost headers from including any +// prefix/suffix headers that normally control things like struct +// packing and alignment. 
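An aside on the BOOST_STRINGIZE / BOOST_JOIN definitions reformatted in suffix.hpp above — a sketch (MY_TAG and handler_ are invented names) of why the extra BOOST_DO_JOIN level exists: it forces macro arguments to expand before token pasting or stringizing.

#include <boost/config.hpp>
#include <cstdio>

#define MY_TAG 42

int BOOST_JOIN(handler_, MY_TAG); // pastes after expansion: `int handler_42;`

int main()
{
    // Two-level expansion stringizes the *expanded* argument: "42", not "MY_TAG".
    std::puts(BOOST_STRINGIZE(MY_TAG));
    return 0;
}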
// #define BOOST_DISABLE_ABI_HEADERS // BOOST_ABI_PREFIX: A prefix header to include in place of whatever -// boost.config would normally select, any replacement should set up -// struct packing and alignment options as required. +// boost.config would normally select, any replacement should set up +// struct packing and alignment options as required. // #define BOOST_ABI_PREFIX my-header-name -// BOOST_ABI_SUFFIX: A suffix header to include in place of whatever -// boost.config would normally select, any replacement should undo -// the effects of the prefix header. +// BOOST_ABI_SUFFIX: A suffix header to include in place of whatever +// boost.config would normally select, any replacement should undo +// the effects of the prefix header. // #define BOOST_ABI_SUFFIX my-header-name -// BOOST_ALL_DYN_LINK: Forces all libraries that have separate source, -// to be linked as dll's rather than static libraries on Microsoft Windows -// (this macro is used to turn on __declspec(dllimport) modifiers, so that -// the compiler knows which symbols to look for in a dll rather than in a -// static library). Note that there may be some libraries that can only -// be statically linked (Boost.Test for example) and others which may only -// be dynamically linked (Boost.Threads for example), in these cases this +// BOOST_ALL_DYN_LINK: Forces all libraries that have separate source, +// to be linked as dll's rather than static libraries on Microsoft Windows +// (this macro is used to turn on __declspec(dllimport) modifiers, so that +// the compiler knows which symbols to look for in a dll rather than in a +// static library). Note that there may be some libraries that can only +// be statically linked (Boost.Test for example) and others which may only +// be dynamically linked (Boost.Threads for example), in these cases this // macro has no effect. // #define BOOST_ALL_DYN_LINK - -// BOOST_WHATEVER_DYN_LINK: Forces library "whatever" to be linked as a dll -// rather than a static library on Microsoft Windows: replace the WHATEVER -// part of the macro name with the name of the library that you want to -// dynamically link to, for example use BOOST_DATE_TIME_DYN_LINK or -// BOOST_REGEX_DYN_LINK etc (this macro is used to turn on __declspec(dllimport) -// modifiers, so that the compiler knows which symbols to look for in a dll -// rather than in a static library). -// Note that there may be some libraries that can only be statically linked -// (Boost.Test for example) and others which may only be dynamically linked + +// BOOST_WHATEVER_DYN_LINK: Forces library "whatever" to be linked as a dll +// rather than a static library on Microsoft Windows: replace the WHATEVER +// part of the macro name with the name of the library that you want to +// dynamically link to, for example use BOOST_DATE_TIME_DYN_LINK or +// BOOST_REGEX_DYN_LINK etc (this macro is used to turn on __declspec(dllimport) +// modifiers, so that the compiler knows which symbols to look for in a dll +// rather than in a static library). +// Note that there may be some libraries that can only be statically linked +// (Boost.Test for example) and others which may only be dynamically linked // (Boost.Threads for example), in these cases this macro is unsupported. // #define BOOST_WHATEVER_DYN_LINK - -// BOOST_ALL_NO_LIB: Tells the config system not to automatically select -// which libraries to link against. 
-// Normally if a compiler supports #pragma lib, then the correct library -// build variant will be automatically selected and linked against, -// simply by the act of including one of that library's headers. + +// BOOST_ALL_NO_LIB: Tells the config system not to automatically select +// which libraries to link against. +// Normally if a compiler supports #pragma lib, then the correct library +// build variant will be automatically selected and linked against, +// simply by the act of including one of that library's headers. // This macro turns that feature off. // #define BOOST_ALL_NO_LIB - -// BOOST_WHATEVER_NO_LIB: Tells the config system not to automatically -// select which library to link against for library "whatever", -// replace WHATEVER in the macro name with the name of the library; -// for example BOOST_DATE_TIME_NO_LIB or BOOST_REGEX_NO_LIB. -// Normally if a compiler supports #pragma lib, then the correct library -// build variant will be automatically selected and linked against, simply -// by the act of including one of that library's headers. This macro turns + +// BOOST_WHATEVER_NO_LIB: Tells the config system not to automatically +// select which library to link against for library "whatever", +// replace WHATEVER in the macro name with the name of the library; +// for example BOOST_DATE_TIME_NO_LIB or BOOST_REGEX_NO_LIB. +// Normally if a compiler supports #pragma lib, then the correct library +// build variant will be automatically selected and linked against, simply +// by the act of including one of that library's headers. This macro turns // that feature off. // #define BOOST_WHATEVER_NO_LIB - - - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/config/warning_disable.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/config/warning_disable.hpp index 26ff1323..2f1d15ad 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/config/warning_disable.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/config/warning_disable.hpp @@ -17,7 +17,7 @@ // // * Quite unreasonably pedantic. // * Generally only emitted by a single compiler. -// * Can't easily be fixed: for example if the vendors own std lib +// * Can't easily be fixed: for example if the vendors own std lib // code emits these warnings! // // Note that THIS HEADER MUST NOT INCLUDE ANY OTHER HEADERS: @@ -30,18 +30,18 @@ #ifndef BOOST_CONFIG_WARNING_DISABLE_HPP #define BOOST_CONFIG_WARNING_DISABLE_HPP -#if defined(_MSC_VER) && (_MSC_VER >= 1400) - // Error 'function': was declared deprecated - // http://msdn2.microsoft.com/en-us/library/ttcz0bys(VS.80).aspx - // This error is emitted when you use some perfectly conforming - // std lib functions in a perfectly correct way, and also by - // some of Microsoft's own std lib code ! -# pragma warning(disable:4996) +#if defined(_MSC_VER) && (_MSC_VER >= 1400) +// Error 'function': was declared deprecated +// http://msdn2.microsoft.com/en-us/library/ttcz0bys(VS.80).aspx +// This error is emitted when you use some perfectly conforming +// std lib functions in a perfectly correct way, and also by +// some of Microsoft's own std lib code ! +#pragma warning(disable : 4996) #endif #if defined(__INTEL_COMPILER) || defined(__ICL) - // As above: gives warning when a "deprecated" - // std library function is encountered. -# pragma warning(disable:1786) +// As above: gives warning when a "deprecated" +// std library function is encountered. 
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/limits.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/limits.hpp
index d3747a1a..a40cdcad 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/limits.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/limits.hpp
@@ -1,5 +1,5 @@
-// (C) Copyright John maddock 1999.
+// (C) Copyright John maddock 1999.
 // (C) David Abrahams 2002. Distributed under the Boost
 // Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -14,133 +14,125 @@
 #include <boost/config.hpp>

 #ifdef BOOST_NO_LIMITS
-#  include <boost/detail/limits.hpp>
+#include <boost/detail/limits.hpp>
 #else
-#  include <limits>
+#include <limits>
 #endif

 #if (defined(BOOST_HAS_LONG_LONG) && defined(BOOST_NO_LONG_LONG_NUMERIC_LIMITS)) \
-   || (defined(BOOST_HAS_MS_INT64) && defined(BOOST_NO_MS_INT64_NUMERIC_LIMITS))
+    || (defined(BOOST_HAS_MS_INT64) && defined(BOOST_NO_MS_INT64_NUMERIC_LIMITS))
 // Add missing specializations for numeric_limits:
 #ifdef BOOST_HAS_MS_INT64
-#  define BOOST_LLT __int64
-#  define BOOST_ULLT unsigned __int64
+#define BOOST_LLT  __int64
+#define BOOST_ULLT unsigned __int64
 #else
-#  define BOOST_LLT ::boost::long_long_type
-#  define BOOST_ULLT ::boost::ulong_long_type
+#define BOOST_LLT  ::boost::long_long_type
+#define BOOST_ULLT ::boost::ulong_long_type
 #endif

-#include <climits> // for CHAR_BIT
+#include <climits> // for CHAR_BIT

-namespace std
+namespace std {
+template <> class numeric_limits<BOOST_LLT>
 {
-  template<>
-  class numeric_limits<BOOST_LLT>
-  {
-   public:
-
-      BOOST_STATIC_CONSTANT(bool, is_specialized = true);
+public:
+    BOOST_STATIC_CONSTANT(bool, is_specialized = true);
 #ifdef BOOST_HAS_MS_INT64
-      static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return 0x8000000000000000i64; }
-      static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return 0x7FFFFFFFFFFFFFFFi64; }
+    static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return 0x8000000000000000i64; }
+    static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return 0x7FFFFFFFFFFFFFFFi64; }
 #elif defined(LLONG_MAX)
-      static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return LLONG_MIN; }
-      static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return LLONG_MAX; }
+    static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return LLONG_MIN; }
+    static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return LLONG_MAX; }
 #elif defined(LONGLONG_MAX)
-      static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return LONGLONG_MIN; }
-      static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return LONGLONG_MAX; }
+    static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return LONGLONG_MIN; }
+    static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return LONGLONG_MAX; }
 #else
-      static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return 1LL << (sizeof(BOOST_LLT) * CHAR_BIT - 1); }
-      static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return ~(min)(); }
+    static BOOST_LLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return 1LL << (sizeof(BOOST_LLT) * CHAR_BIT - 1); }
+    static BOOST_LLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return ~(min)(); }
 #endif
-      BOOST_STATIC_CONSTANT(int, digits = sizeof(BOOST_LLT) * CHAR_BIT -1);
-      BOOST_STATIC_CONSTANT(int, digits10 = (CHAR_BIT * sizeof (BOOST_LLT) - 1) * 301L / 1000);
-      BOOST_STATIC_CONSTANT(bool, is_signed = true);
-      BOOST_STATIC_CONSTANT(bool, is_integer = true);
-      BOOST_STATIC_CONSTANT(bool, is_exact = true);
-      BOOST_STATIC_CONSTANT(int, radix = 2);
-      static BOOST_LLT epsilon() throw() { return 0; };
-      static BOOST_LLT round_error() throw() { return 0; };
+    BOOST_STATIC_CONSTANT(int, digits = sizeof(BOOST_LLT) * CHAR_BIT - 1);
+    BOOST_STATIC_CONSTANT(int, digits10 = (CHAR_BIT * sizeof(BOOST_LLT) - 1) * 301L / 1000);
+    BOOST_STATIC_CONSTANT(bool, is_signed = true);
+    BOOST_STATIC_CONSTANT(bool, is_integer = true);
+    BOOST_STATIC_CONSTANT(bool, is_exact = true);
+    BOOST_STATIC_CONSTANT(int, radix = 2);
+    static BOOST_LLT epsilon() throw() { return 0; };
+    static BOOST_LLT round_error() throw() { return 0; };

-      BOOST_STATIC_CONSTANT(int, min_exponent = 0);
-      BOOST_STATIC_CONSTANT(int, min_exponent10 = 0);
-      BOOST_STATIC_CONSTANT(int, max_exponent = 0);
-      BOOST_STATIC_CONSTANT(int, max_exponent10 = 0);
+    BOOST_STATIC_CONSTANT(int, min_exponent = 0);
+    BOOST_STATIC_CONSTANT(int, min_exponent10 = 0);
+    BOOST_STATIC_CONSTANT(int, max_exponent = 0);
+    BOOST_STATIC_CONSTANT(int, max_exponent10 = 0);

-      BOOST_STATIC_CONSTANT(bool, has_infinity = false);
-      BOOST_STATIC_CONSTANT(bool, has_quiet_NaN = false);
-      BOOST_STATIC_CONSTANT(bool, has_signaling_NaN = false);
-      BOOST_STATIC_CONSTANT(bool, has_denorm = false);
-      BOOST_STATIC_CONSTANT(bool, has_denorm_loss = false);
-      static BOOST_LLT infinity() throw() { return 0; };
-      static BOOST_LLT quiet_NaN() throw() { return 0; };
-      static BOOST_LLT signaling_NaN() throw() { return 0; };
-      static BOOST_LLT denorm_min() throw() { return 0; };
+    BOOST_STATIC_CONSTANT(bool, has_infinity = false);
+    BOOST_STATIC_CONSTANT(bool, has_quiet_NaN = false);
+    BOOST_STATIC_CONSTANT(bool, has_signaling_NaN = false);
+    BOOST_STATIC_CONSTANT(bool, has_denorm = false);
+    BOOST_STATIC_CONSTANT(bool, has_denorm_loss = false);
+    static BOOST_LLT infinity() throw() { return 0; };
+    static BOOST_LLT quiet_NaN() throw() { return 0; };
+    static BOOST_LLT signaling_NaN() throw() { return 0; };
+    static BOOST_LLT denorm_min() throw() { return 0; };

-      BOOST_STATIC_CONSTANT(bool, is_iec559 = false);
-      BOOST_STATIC_CONSTANT(bool, is_bounded = true);
-      BOOST_STATIC_CONSTANT(bool, is_modulo = true);
+    BOOST_STATIC_CONSTANT(bool, is_iec559 = false);
+    BOOST_STATIC_CONSTANT(bool, is_bounded = true);
+    BOOST_STATIC_CONSTANT(bool, is_modulo = true);

-      BOOST_STATIC_CONSTANT(bool, traps = false);
-      BOOST_STATIC_CONSTANT(bool, tinyness_before = false);
-      BOOST_STATIC_CONSTANT(float_round_style, round_style = round_toward_zero);
-
-  };
+    BOOST_STATIC_CONSTANT(bool, traps = false);
+    BOOST_STATIC_CONSTANT(bool, tinyness_before = false);
+    BOOST_STATIC_CONSTANT(float_round_style, round_style = round_toward_zero);
+};

-  template<>
-  class numeric_limits<BOOST_ULLT>
-  {
-   public:
-
-      BOOST_STATIC_CONSTANT(bool, is_specialized = true);
+template <> class numeric_limits<BOOST_ULLT>
+{
+public:
+    BOOST_STATIC_CONSTANT(bool, is_specialized = true);
 #ifdef BOOST_HAS_MS_INT64
-      static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return 0ui64; }
-      static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return 0xFFFFFFFFFFFFFFFFui64; }
+    static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return 0ui64; }
+    static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return 0xFFFFFFFFFFFFFFFFui64; }
 #elif defined(ULLONG_MAX) && defined(ULLONG_MIN)
-      static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return ULLONG_MIN; }
-      static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return ULLONG_MAX; }
+    static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return ULLONG_MIN; }
+    static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return ULLONG_MAX; }
 #elif defined(ULONGLONG_MAX) && defined(ULONGLONG_MIN)
-      static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return ULONGLONG_MIN; }
-      static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return ULONGLONG_MAX; }
+    static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return ULONGLONG_MIN; }
+    static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return ULONGLONG_MAX; }
 #else
-      static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION (){ return 0uLL; }
-      static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION (){ return ~0uLL; }
+    static BOOST_ULLT min BOOST_PREVENT_MACRO_SUBSTITUTION() { return 0uLL; }
+    static BOOST_ULLT max BOOST_PREVENT_MACRO_SUBSTITUTION() { return ~0uLL; }
 #endif
-      BOOST_STATIC_CONSTANT(int, digits = sizeof(BOOST_LLT) * CHAR_BIT);
-      BOOST_STATIC_CONSTANT(int, digits10 = (CHAR_BIT * sizeof (BOOST_LLT)) * 301L / 1000);
-      BOOST_STATIC_CONSTANT(bool, is_signed = false);
-      BOOST_STATIC_CONSTANT(bool, is_integer = true);
-      BOOST_STATIC_CONSTANT(bool, is_exact = true);
-      BOOST_STATIC_CONSTANT(int, radix = 2);
-      static BOOST_ULLT epsilon() throw() { return 0; };
-      static BOOST_ULLT round_error() throw() { return 0; };
+    BOOST_STATIC_CONSTANT(int, digits = sizeof(BOOST_LLT) * CHAR_BIT);
+    BOOST_STATIC_CONSTANT(int, digits10 = (CHAR_BIT * sizeof(BOOST_LLT)) * 301L / 1000);
+    BOOST_STATIC_CONSTANT(bool, is_signed = false);
+    BOOST_STATIC_CONSTANT(bool, is_integer = true);
+    BOOST_STATIC_CONSTANT(bool, is_exact = true);
+    BOOST_STATIC_CONSTANT(int, radix = 2);
+    static BOOST_ULLT epsilon() throw() { return 0; };
+    static BOOST_ULLT round_error() throw() { return 0; };

-      BOOST_STATIC_CONSTANT(int, min_exponent = 0);
-      BOOST_STATIC_CONSTANT(int, min_exponent10 = 0);
-      BOOST_STATIC_CONSTANT(int, max_exponent = 0);
-      BOOST_STATIC_CONSTANT(int, max_exponent10 = 0);
+    BOOST_STATIC_CONSTANT(int, min_exponent = 0);
+    BOOST_STATIC_CONSTANT(int, min_exponent10 = 0);
+    BOOST_STATIC_CONSTANT(int, max_exponent = 0);
+    BOOST_STATIC_CONSTANT(int, max_exponent10 = 0);

-      BOOST_STATIC_CONSTANT(bool, has_infinity = false);
-      BOOST_STATIC_CONSTANT(bool, has_quiet_NaN = false);
-      BOOST_STATIC_CONSTANT(bool, has_signaling_NaN = false);
-      BOOST_STATIC_CONSTANT(bool, has_denorm = false);
-      BOOST_STATIC_CONSTANT(bool, has_denorm_loss = false);
-      static BOOST_ULLT infinity() throw() { return 0; };
-      static BOOST_ULLT quiet_NaN() throw() { return 0; };
-      static BOOST_ULLT signaling_NaN() throw() { return 0; };
-      static BOOST_ULLT denorm_min() throw() { return 0; };
+    BOOST_STATIC_CONSTANT(bool, has_infinity = false);
+    BOOST_STATIC_CONSTANT(bool, has_quiet_NaN = false);
+    BOOST_STATIC_CONSTANT(bool, has_signaling_NaN = false);
+    BOOST_STATIC_CONSTANT(bool, has_denorm = false);
+    BOOST_STATIC_CONSTANT(bool, has_denorm_loss = false);
+    static BOOST_ULLT infinity() throw() { return 0; };
+    static BOOST_ULLT quiet_NaN() throw() { return 0; };
+    static BOOST_ULLT signaling_NaN() throw() { return 0; };
+    static BOOST_ULLT denorm_min() throw() { return 0; };

-      BOOST_STATIC_CONSTANT(bool, is_iec559 = false);
-      BOOST_STATIC_CONSTANT(bool, is_bounded = true);
-      BOOST_STATIC_CONSTANT(bool, is_modulo = true);
-
-      BOOST_STATIC_CONSTANT(bool, traps = false);
-      BOOST_STATIC_CONSTANT(bool, tinyness_before = false);
-      BOOST_STATIC_CONSTANT(float_round_style, round_style = round_toward_zero);
-
-  };
-}
-#endif
+    BOOST_STATIC_CONSTANT(bool, is_iec559 = false);
+    BOOST_STATIC_CONSTANT(bool, is_bounded = true);
+    BOOST_STATIC_CONSTANT(bool, is_modulo = true);
+    BOOST_STATIC_CONSTANT(bool, traps = false);
+    BOOST_STATIC_CONSTANT(bool, tinyness_before = false);
+    BOOST_STATIC_CONSTANT(float_round_style, round_style = round_toward_zero);
+};
+} // namespace std
 #endif
+#endif
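These specializations exist so generic code can interrogate the 64-bit integer types even on standard libraries that predate long long support. A sketch of the queries they enable, written against plain long long (whether the workaround path above is compiled in depends on the BOOST_NO_*_NUMERIC_LIMITS configuration):

    #include <boost/limits.hpp>
    #include <iostream>

    int main()
    {
        // The parentheses around max mirror BOOST_PREVENT_MACRO_SUBSTITUTION above:
        // they keep a windows.h-style max() macro from expanding.
        std::cout << std::numeric_limits<long long>::digits << '\n';  // 63
        std::cout << (std::numeric_limits<long long>::max)() << '\n'; // 9223372036854775807
        return 0;
    }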
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval.hpp
index e6f976df..9df31842 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval.hpp
@@ -12,21 +12,18 @@
 #define BOOST_NUMERIC_INTERVAL_HPP

 #include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
+#include

 #endif // BOOST_NUMERIC_INTERVAL_HPP
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith.hpp
index c0320c45..8dc1d3a5 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith.hpp
@@ -11,12 +11,12 @@
 #ifndef BOOST_NUMERIC_INTERVAL_ARITH_HPP
 #define BOOST_NUMERIC_INTERVAL_ARITH_HPP

-#include
-#include
-#include
-#include
-#include
 #include
+#include
+#include
+#include
+#include
+#include

 namespace boost {
 namespace numeric {
@@ -25,278 +25,253 @@
  * Basic arithmetic operators
  */

-template<class T, class Policies> inline
-const interval<T, Policies>& operator+(const interval<T, Policies>& x)
+template <class T, class Policies> inline const interval<T, Policies> &operator+(const interval<T, Policies> &x)
 {
-  return x;
+    return x;
 }

-template<class T, class Policies> inline
-interval<T, Policies> operator-(const interval<T, Policies>& x)
+template <class T, class Policies> inline interval<T, Policies> operator-(const interval<T, Policies> &x)
 {
-  if (interval_lib::detail::test_input(x))
-    return interval<T, Policies>::empty();
-  return interval<T, Policies>(-x.upper(), -x.lower(), true);
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator+=(const interval<T, Policies>& r)
-{
-  if (interval_lib::detail::test_input(*this, r))
-    set_empty();
-  else {
-    typename Policies::rounding rnd;
-    set(rnd.add_down(low, r.low), rnd.add_up(up, r.up));
-  }
-  return *this;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator+=(const T& r)
-{
-  if (interval_lib::detail::test_input(*this, r))
-    set_empty();
-  else {
-    typename Policies::rounding rnd;
-    set(rnd.add_down(low, r), rnd.add_up(up, r));
-  }
-  return *this;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator-=(const interval<T, Policies>& r)
-{
-  if (interval_lib::detail::test_input(*this, r))
-    set_empty();
-  else {
-    typename Policies::rounding rnd;
-    set(rnd.sub_down(low, r.up), rnd.sub_up(up, r.low));
-  }
-  return *this;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator-=(const T& r)
-{
-  if (interval_lib::detail::test_input(*this, r))
-    set_empty();
-  else {
-    typename Policies::rounding rnd;
-    set(rnd.sub_down(low, r), rnd.sub_up(up, r));
-  }
-  return *this;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator*=(const interval<T, Policies>& r)
-{
-  return *this = *this * r;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator*=(const T& r)
-{
-  return *this = r * *this;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator/=(const interval<T, Policies>& r)
-{
-  return *this = *this / r;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies>& interval<T, Policies>::operator/=(const T& r)
-{
-  return *this = *this / r;
-}
-
-template<class T, class Policies> inline
-interval<T, Policies> operator+(const interval<T, Policies>& x,
-                                const interval<T, Policies>& y)
-{
-  if (interval_lib::detail::test_input(x, y))
-    return interval<T, Policies>::empty();
-  typename Policies::rounding rnd;
-  return interval<T, Policies>(rnd.add_down(x.lower(),
y.lower()), - rnd.add_up (x.upper(), y.upper()), true); -} - -template inline -interval operator+(const T& x, const interval& y) -{ - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - return interval(rnd.add_down(x, y.lower()), - rnd.add_up (x, y.upper()), true); -} - -template inline -interval operator+(const interval& x, const T& y) -{ return y + x; } - -template inline -interval operator-(const interval& x, - const interval& y) -{ - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - return interval(rnd.sub_down(x.lower(), y.upper()), - rnd.sub_up (x.upper(), y.lower()), true); -} - -template inline -interval operator-(const T& x, const interval& y) -{ - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - return interval(rnd.sub_down(x, y.upper()), - rnd.sub_up (x, y.lower()), true); -} - -template inline -interval operator-(const interval& x, const T& y) -{ - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - return interval(rnd.sub_down(x.lower(), y), - rnd.sub_up (x.upper(), y), true); -} - -template inline -interval operator*(const interval& x, - const interval& y) -{ - BOOST_USING_STD_MIN(); - BOOST_USING_STD_MAX(); - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - typename Policies::rounding rnd; - const T& xl = x.lower(); - const T& xu = x.upper(); - const T& yl = y.lower(); - const T& yu = y.upper(); - - if (interval_lib::user::is_neg(xl)) - if (interval_lib::user::is_pos(xu)) - if (interval_lib::user::is_neg(yl)) - if (interval_lib::user::is_pos(yu)) // M * M - return I(min BOOST_PREVENT_MACRO_SUBSTITUTION(rnd.mul_down(xl, yu), rnd.mul_down(xu, yl)), - max BOOST_PREVENT_MACRO_SUBSTITUTION(rnd.mul_up (xl, yl), rnd.mul_up (xu, yu)), true); - else // M * N - return I(rnd.mul_down(xu, yl), rnd.mul_up(xl, yl), true); - else - if (interval_lib::user::is_pos(yu)) // M * P - return I(rnd.mul_down(xl, yu), rnd.mul_up(xu, yu), true); - else // M * Z - return I(static_cast(0), static_cast(0), true); - else - if (interval_lib::user::is_neg(yl)) - if (interval_lib::user::is_pos(yu)) // N * M - return I(rnd.mul_down(xl, yu), rnd.mul_up(xl, yl), true); - else // N * N - return I(rnd.mul_down(xu, yu), rnd.mul_up(xl, yl), true); - else - if (interval_lib::user::is_pos(yu)) // N * P - return I(rnd.mul_down(xl, yu), rnd.mul_up(xu, yl), true); - else // N * Z - return I(static_cast(0), static_cast(0), true); - else - if (interval_lib::user::is_pos(xu)) - if (interval_lib::user::is_neg(yl)) - if (interval_lib::user::is_pos(yu)) // P * M - return I(rnd.mul_down(xu, yl), rnd.mul_up(xu, yu), true); - else // P * N - return I(rnd.mul_down(xu, yl), rnd.mul_up(xl, yu), true); - else - if (interval_lib::user::is_pos(yu)) // P * P - return I(rnd.mul_down(xl, yl), rnd.mul_up(xu, yu), true); - else // P * Z - return I(static_cast(0), static_cast(0), true); - else // Z * ? 
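The rnd.add_down/rnd.add_up pairs in the operators above are what preserve the enclosure property: lower bounds are rounded toward minus infinity and upper bounds toward plus infinity, so the exact real result always lies inside the returned interval. A self-contained sketch of the idea, substituting a one-ULP outward nudge (std::nextafter) for the library's rounding-mode policy:

    #include <cmath>   // std::nextafter
    #include <cstdio>
    #include <limits>

    static double add_down(double a, double b) // approximate round-toward -inf
    {
        return std::nextafter(a + b, -std::numeric_limits<double>::infinity());
    }
    static double add_up(double a, double b)   // approximate round-toward +inf
    {
        return std::nextafter(a + b, std::numeric_limits<double>::infinity());
    }

    int main()
    {
        // [0.1, 0.2] + [0.3, 0.4]: neither endpoint sum is exact in binary,
        // so each bound is widened outward to keep the true sum enclosed.
        std::printf("[%.17g, %.17g]\n", add_down(0.1, 0.3), add_up(0.2, 0.4));
        return 0;
    }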
- return I(static_cast(0), static_cast(0), true); -} - -template inline -interval operator*(const T& x, const interval& y) -{ - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - typename Policies::rounding rnd; - const T& yl = y.lower(); - const T& yu = y.upper(); - // x is supposed not to be infinite - if (interval_lib::user::is_neg(x)) - return I(rnd.mul_down(x, yu), rnd.mul_up(x, yl), true); - else if (interval_lib::user::is_zero(x)) - return I(static_cast(0), static_cast(0), true); - else - return I(rnd.mul_down(x, yl), rnd.mul_up(x, yu), true); -} - -template inline -interval operator*(const interval& x, const T& y) -{ return y * x; } - -template inline -interval operator/(const interval& x, - const interval& y) -{ - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - if (zero_in(y)) - if (!interval_lib::user::is_zero(y.lower())) - if (!interval_lib::user::is_zero(y.upper())) - return interval_lib::detail::div_zero(x); - else - return interval_lib::detail::div_negative(x, y.lower()); - else - if (!interval_lib::user::is_zero(y.upper())) - return interval_lib::detail::div_positive(x, y.upper()); - else + if (interval_lib::detail::test_input(x)) return interval::empty(); - else - return interval_lib::detail::div_non_zero(x, y); + return interval(-x.upper(), -x.lower(), true); } -template inline -interval operator/(const T& x, const interval& y) +template +inline interval &interval::operator+=(const interval &r) { - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - if (zero_in(y)) - if (!interval_lib::user::is_zero(y.lower())) - if (!interval_lib::user::is_zero(y.upper())) - return interval_lib::detail::div_zero(x); - else - return interval_lib::detail::div_negative(x, y.lower()); - else - if (!interval_lib::user::is_zero(y.upper())) - return interval_lib::detail::div_positive(x, y.upper()); - else + if (interval_lib::detail::test_input(*this, r)) + set_empty(); + else { + typename Policies::rounding rnd; + set(rnd.add_down(low, r.low), rnd.add_up(up, r.up)); + } + return *this; +} + +template inline interval &interval::operator+=(const T &r) +{ + if (interval_lib::detail::test_input(*this, r)) + set_empty(); + else { + typename Policies::rounding rnd; + set(rnd.add_down(low, r), rnd.add_up(up, r)); + } + return *this; +} + +template +inline interval &interval::operator-=(const interval &r) +{ + if (interval_lib::detail::test_input(*this, r)) + set_empty(); + else { + typename Policies::rounding rnd; + set(rnd.sub_down(low, r.up), rnd.sub_up(up, r.low)); + } + return *this; +} + +template inline interval &interval::operator-=(const T &r) +{ + if (interval_lib::detail::test_input(*this, r)) + set_empty(); + else { + typename Policies::rounding rnd; + set(rnd.sub_down(low, r), rnd.sub_up(up, r)); + } + return *this; +} + +template +inline interval &interval::operator*=(const interval &r) +{ + return *this = *this * r; +} + +template inline interval &interval::operator*=(const T &r) +{ + return *this = r * *this; +} + +template +inline interval &interval::operator/=(const interval &r) +{ + return *this = *this / r; +} + +template inline interval &interval::operator/=(const T &r) +{ + return *this = *this / r; +} + +template +inline interval operator+(const interval &x, const interval &y) +{ + if (interval_lib::detail::test_input(x, y)) return interval::empty(); - else - return interval_lib::detail::div_non_zero(x, y); + typename Policies::rounding rnd; + return interval(rnd.add_down(x.lower(), y.lower()), 
rnd.add_up(x.upper(), y.upper()), true); } -template inline -interval operator/(const interval& x, const T& y) +template inline interval operator+(const T &x, const interval &y) { - if (interval_lib::detail::test_input(x, y) || interval_lib::user::is_zero(y)) - return interval::empty(); - typename Policies::rounding rnd; - const T& xl = x.lower(); - const T& xu = x.upper(); - if (interval_lib::user::is_neg(y)) - return interval(rnd.div_down(xu, y), rnd.div_up(xl, y), true); - else - return interval(rnd.div_down(xl, y), rnd.div_up(xu, y), true); + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + return interval(rnd.add_down(x, y.lower()), rnd.add_up(x, y.upper()), true); +} + +template inline interval operator+(const interval &x, const T &y) +{ + return y + x; +} + +template +inline interval operator-(const interval &x, const interval &y) +{ + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + return interval(rnd.sub_down(x.lower(), y.upper()), rnd.sub_up(x.upper(), y.lower()), true); +} + +template inline interval operator-(const T &x, const interval &y) +{ + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + return interval(rnd.sub_down(x, y.upper()), rnd.sub_up(x, y.lower()), true); +} + +template inline interval operator-(const interval &x, const T &y) +{ + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + return interval(rnd.sub_down(x.lower(), y), rnd.sub_up(x.upper(), y), true); +} + +template +inline interval operator*(const interval &x, const interval &y) +{ + BOOST_USING_STD_MIN(); + BOOST_USING_STD_MAX(); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + typename Policies::rounding rnd; + const T &xl = x.lower(); + const T &xu = x.upper(); + const T &yl = y.lower(); + const T &yu = y.upper(); + + if (interval_lib::user::is_neg(xl)) + if (interval_lib::user::is_pos(xu)) + if (interval_lib::user::is_neg(yl)) + if (interval_lib::user::is_pos(yu)) // M * M + return I(min BOOST_PREVENT_MACRO_SUBSTITUTION(rnd.mul_down(xl, yu), rnd.mul_down(xu, yl)), + max BOOST_PREVENT_MACRO_SUBSTITUTION(rnd.mul_up(xl, yl), rnd.mul_up(xu, yu)), + true); + else // M * N + return I(rnd.mul_down(xu, yl), rnd.mul_up(xl, yl), true); + else if (interval_lib::user::is_pos(yu)) // M * P + return I(rnd.mul_down(xl, yu), rnd.mul_up(xu, yu), true); + else // M * Z + return I(static_cast(0), static_cast(0), true); + else if (interval_lib::user::is_neg(yl)) + if (interval_lib::user::is_pos(yu)) // N * M + return I(rnd.mul_down(xl, yu), rnd.mul_up(xl, yl), true); + else // N * N + return I(rnd.mul_down(xu, yu), rnd.mul_up(xl, yl), true); + else if (interval_lib::user::is_pos(yu)) // N * P + return I(rnd.mul_down(xl, yu), rnd.mul_up(xu, yl), true); + else // N * Z + return I(static_cast(0), static_cast(0), true); + else if (interval_lib::user::is_pos(xu)) + if (interval_lib::user::is_neg(yl)) + if (interval_lib::user::is_pos(yu)) // P * M + return I(rnd.mul_down(xu, yl), rnd.mul_up(xu, yu), true); + else // P * N + return I(rnd.mul_down(xu, yl), rnd.mul_up(xl, yu), true); + else if (interval_lib::user::is_pos(yu)) // P * P + return I(rnd.mul_down(xl, yl), rnd.mul_up(xu, yu), true); + else // P * Z + return I(static_cast(0), static_cast(0), true); + else // Z * ? 
+ return I(static_cast(0), static_cast(0), true); +} + +template inline interval operator*(const T &x, const interval &y) +{ + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + typename Policies::rounding rnd; + const T &yl = y.lower(); + const T &yu = y.upper(); + // x is supposed not to be infinite + if (interval_lib::user::is_neg(x)) + return I(rnd.mul_down(x, yu), rnd.mul_up(x, yl), true); + else if (interval_lib::user::is_zero(x)) + return I(static_cast(0), static_cast(0), true); + else + return I(rnd.mul_down(x, yl), rnd.mul_up(x, yu), true); +} + +template inline interval operator*(const interval &x, const T &y) +{ + return y * x; +} + +template +inline interval operator/(const interval &x, const interval &y) +{ + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + if (zero_in(y)) + if (!interval_lib::user::is_zero(y.lower())) + if (!interval_lib::user::is_zero(y.upper())) + return interval_lib::detail::div_zero(x); + else + return interval_lib::detail::div_negative(x, y.lower()); + else if (!interval_lib::user::is_zero(y.upper())) + return interval_lib::detail::div_positive(x, y.upper()); + else + return interval::empty(); + else + return interval_lib::detail::div_non_zero(x, y); +} + +template inline interval operator/(const T &x, const interval &y) +{ + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + if (zero_in(y)) + if (!interval_lib::user::is_zero(y.lower())) + if (!interval_lib::user::is_zero(y.upper())) + return interval_lib::detail::div_zero(x); + else + return interval_lib::detail::div_negative(x, y.lower()); + else if (!interval_lib::user::is_zero(y.upper())) + return interval_lib::detail::div_positive(x, y.upper()); + else + return interval::empty(); + else + return interval_lib::detail::div_non_zero(x, y); +} + +template inline interval operator/(const interval &x, const T &y) +{ + if (interval_lib::detail::test_input(x, y) || interval_lib::user::is_zero(y)) + return interval::empty(); + typename Policies::rounding rnd; + const T &xl = x.lower(); + const T &xu = x.upper(); + if (interval_lib::user::is_neg(y)) + return interval(rnd.div_down(xu, y), rnd.div_up(xl, y), true); + else + return interval(rnd.div_down(xl, y), rnd.div_up(xu, y), true); } } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith2.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith2.hpp index ba7ffbd7..a1cee7de 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith2.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith2.hpp @@ -14,289 +14,296 @@ #ifndef BOOST_NUMERIC_INTERVAL_ARITH2_HPP #define BOOST_NUMERIC_INTERVAL_ARITH2_HPP +#include #include -#include -#include +#include +#include #include #include -#include +#include +#include #include -#include #include -#include namespace boost { namespace numeric { -template inline -interval fmod(const interval& x, - const interval& y) +template +inline interval fmod(const interval &x, const interval &y) { - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - typedef typename interval_lib::unprotect >::type I; - T const &yb = interval_lib::user::is_neg(x.lower()) ? 
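The sign case analysis in operator* above (M = mixed, N = negative, P = positive, Z = zero, per its comments) picks only the corner products that matter for each sign combination. The textbook alternative below computes all four corners and takes the extremes; it yields the same bounds (directed rounding omitted here) and makes the cases easy to cross-check:

    #include <algorithm> // std::min_element, std::max_element
    #include <cstdio>

    struct ival { double lo, hi; };

    static ival mul_naive(ival x, ival y)
    {
        double p[4] = {x.lo * y.lo, x.lo * y.hi, x.hi * y.lo, x.hi * y.hi};
        return ival{*std::min_element(p, p + 4), *std::max_element(p, p + 4)};
    }

    int main()
    {
        ival r = mul_naive(ival{-2, 3}, ival{-5, 4}); // the "M * M" case above
        std::printf("[%g, %g]\n", r.lo, r.hi);        // [-15, 12]
        return 0;
    }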
y.lower() : y.upper(); - T n = rnd.int_down(rnd.div_down(x.lower(), yb)); - return (const I&)x - n * (const I&)y; + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + typedef typename interval_lib::unprotect>::type I; + T const &yb = interval_lib::user::is_neg(x.lower()) ? y.lower() : y.upper(); + T n = rnd.int_down(rnd.div_down(x.lower(), yb)); + return (const I &)x - n * (const I &)y; } -template inline -interval fmod(const interval& x, const T& y) +template inline interval fmod(const interval &x, const T &y) { - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - typedef typename interval_lib::unprotect >::type I; - T n = rnd.int_down(rnd.div_down(x.lower(), y)); - return (const I&)x - n * I(y); + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + typedef typename interval_lib::unprotect>::type I; + T n = rnd.int_down(rnd.div_down(x.lower(), y)); + return (const I &)x - n * I(y); } -template inline -interval fmod(const T& x, const interval& y) +template inline interval fmod(const T &x, const interval &y) { - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - typename Policies::rounding rnd; - typedef typename interval_lib::unprotect >::type I; - T const &yb = interval_lib::user::is_neg(x) ? y.lower() : y.upper(); - T n = rnd.int_down(rnd.div_down(x, yb)); - return x - n * (const I&)y; + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + typename Policies::rounding rnd; + typedef typename interval_lib::unprotect>::type I; + T const &yb = interval_lib::user::is_neg(x) ? y.lower() : y.upper(); + T n = rnd.int_down(rnd.div_down(x, yb)); + return x - n * (const I &)y; } namespace interval_lib { -template inline -interval division_part1(const interval& x, - const interval& y, bool& b) +template +inline interval division_part1(const interval &x, const interval &y, bool &b) { - typedef interval I; - b = false; - if (detail::test_input(x, y)) - return I::empty(); - if (zero_in(y)) - if (!user::is_zero(y.lower())) - if (!user::is_zero(y.upper())) - return detail::div_zero_part1(x, y, b); - else - return detail::div_negative(x, y.lower()); - else - if (!user::is_zero(y.upper())) - return detail::div_positive(x, y.upper()); - else + typedef interval I; + b = false; + if (detail::test_input(x, y)) return I::empty(); - else - return detail::div_non_zero(x, y); + if (zero_in(y)) + if (!user::is_zero(y.lower())) + if (!user::is_zero(y.upper())) + return detail::div_zero_part1(x, y, b); + else + return detail::div_negative(x, y.lower()); + else if (!user::is_zero(y.upper())) + return detail::div_positive(x, y.upper()); + else + return I::empty(); + else + return detail::div_non_zero(x, y); } -template inline -interval division_part2(const interval& x, - const interval& y, bool b = true) +template +inline interval +division_part2(const interval &x, const interval &y, bool b = true) { - if (!b) return interval::empty(); - return detail::div_zero_part2(x, y); + if (!b) + return interval::empty(); + return detail::div_zero_part2(x, y); } -template inline -interval multiplicative_inverse(const interval& x) +template inline interval multiplicative_inverse(const interval &x) { - typedef interval I; - if (detail::test_input(x)) - return I::empty(); - T one = static_cast(1); - typename Policies::rounding rnd; - if (zero_in(x)) { - typedef typename Policies::checking checking; - if 
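division_part1/division_part2 above exist because dividing by an interval that straddles zero does not produce a single interval: the quotient set is two disjoint half-lines, and the caller must keep the result as a union. Worked numbers, ignoring directed rounding:

    #include <cstdio>

    int main()
    {
        // x = [1,2] divided by y = [-1,2], where y contains zero:
        double part1_hi = 1.0 / -1.0; // sup of x/y for y in [-1,0): piece (-inf, -1]
        double part2_lo = 1.0 / 2.0;  // inf of x/y for y in (0,2]:  piece [0.5, +inf)
        std::printf("(-inf, %g] U [%g, +inf)\n", part1_hi, part2_lo);
        return 0;
    }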
(!user::is_zero(x.lower())) - if (!user::is_zero(x.upper())) - return I::whole(); - else - return I(checking::neg_inf(), rnd.div_up(one, x.lower()), true); - else - if (!user::is_zero(x.upper())) - return I(rnd.div_down(one, x.upper()), checking::pos_inf(), true); - else + typedef interval I; + if (detail::test_input(x)) return I::empty(); - } else - return I(rnd.div_down(one, x.upper()), rnd.div_up(one, x.lower()), true); + T one = static_cast(1); + typename Policies::rounding rnd; + if (zero_in(x)) { + typedef typename Policies::checking checking; + if (!user::is_zero(x.lower())) + if (!user::is_zero(x.upper())) + return I::whole(); + else + return I(checking::neg_inf(), rnd.div_up(one, x.lower()), true); + else if (!user::is_zero(x.upper())) + return I(rnd.div_down(one, x.upper()), checking::pos_inf(), true); + else + return I::empty(); + } + else + return I(rnd.div_down(one, x.upper()), rnd.div_up(one, x.lower()), true); } namespace detail { -template inline -T pow_dn(const T& x_, int pwr, Rounding& rnd) // x and pwr are positive +template inline T pow_dn(const T &x_, int pwr, Rounding &rnd) // x and pwr are positive { - T x = x_; - T y = (pwr & 1) ? x_ : static_cast(1); - pwr >>= 1; - while (pwr > 0) { - x = rnd.mul_down(x, x); - if (pwr & 1) y = rnd.mul_down(x, y); + T x = x_; + T y = (pwr & 1) ? x_ : static_cast(1); pwr >>= 1; - } - return y; + while (pwr > 0) { + x = rnd.mul_down(x, x); + if (pwr & 1) + y = rnd.mul_down(x, y); + pwr >>= 1; + } + return y; } -template inline -T pow_up(const T& x_, int pwr, Rounding& rnd) // x and pwr are positive +template inline T pow_up(const T &x_, int pwr, Rounding &rnd) // x and pwr are positive { - T x = x_; - T y = (pwr & 1) ? x_ : static_cast(1); - pwr >>= 1; - while (pwr > 0) { - x = rnd.mul_up(x, x); - if (pwr & 1) y = rnd.mul_up(x, y); + T x = x_; + T y = (pwr & 1) ? 
x_ : static_cast(1); pwr >>= 1; - } - return y; + while (pwr > 0) { + x = rnd.mul_up(x, x); + if (pwr & 1) + y = rnd.mul_up(x, y); + pwr >>= 1; + } + return y; } } // namespace detail } // namespace interval_lib -template inline -interval pow(const interval& x, int pwr) +template inline interval pow(const interval &x, int pwr) { - BOOST_USING_STD_MAX(); - using interval_lib::detail::pow_dn; - using interval_lib::detail::pow_up; - typedef interval I; + BOOST_USING_STD_MAX(); + using interval_lib::detail::pow_dn; + using interval_lib::detail::pow_up; + typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); + if (interval_lib::detail::test_input(x)) + return I::empty(); - if (pwr == 0) - if (interval_lib::user::is_zero(x.lower()) - && interval_lib::user::is_zero(x.upper())) - return I::empty(); - else - return I(static_cast(1)); - else if (pwr < 0) - return interval_lib::multiplicative_inverse(pow(x, -pwr)); + if (pwr == 0) + if (interval_lib::user::is_zero(x.lower()) && interval_lib::user::is_zero(x.upper())) + return I::empty(); + else + return I(static_cast(1)); + else if (pwr < 0) + return interval_lib::multiplicative_inverse(pow(x, -pwr)); - typename Policies::rounding rnd; - - if (interval_lib::user::is_neg(x.upper())) { // [-2,-1] - T yl = pow_dn(static_cast(-x.upper()), pwr, rnd); - T yu = pow_up(static_cast(-x.lower()), pwr, rnd); - if (pwr & 1) // [-2,-1]^1 - return I(-yu, -yl, true); - else // [-2,-1]^2 - return I(yl, yu, true); - } else if (interval_lib::user::is_neg(x.lower())) { // [-1,1] - if (pwr & 1) { // [-1,1]^1 - return I(-pow_up(-x.lower(), pwr, rnd), pow_up(x.upper(), pwr, rnd), true); - } else { // [-1,1]^2 - return I(static_cast(0), pow_up(max BOOST_PREVENT_MACRO_SUBSTITUTION(static_cast(-x.lower()), x.upper()), pwr, rnd), true); + typename Policies::rounding rnd; + + if (interval_lib::user::is_neg(x.upper())) { // [-2,-1] + T yl = pow_dn(static_cast(-x.upper()), pwr, rnd); + T yu = pow_up(static_cast(-x.lower()), pwr, rnd); + if (pwr & 1) // [-2,-1]^1 + return I(-yu, -yl, true); + else // [-2,-1]^2 + return I(yl, yu, true); + } + else if (interval_lib::user::is_neg(x.lower())) { // [-1,1] + if (pwr & 1) { // [-1,1]^1 + return I(-pow_up(-x.lower(), pwr, rnd), pow_up(x.upper(), pwr, rnd), true); + } + else { // [-1,1]^2 + return I(static_cast(0), + pow_up(max BOOST_PREVENT_MACRO_SUBSTITUTION(static_cast(-x.lower()), x.upper()), pwr, rnd), + true); + } + } + else { // [1,2] + return I(pow_dn(x.lower(), pwr, rnd), pow_up(x.upper(), pwr, rnd), true); } - } else { // [1,2] - return I(pow_dn(x.lower(), pwr, rnd), pow_up(x.upper(), pwr, rnd), true); - } } -template inline -interval sqrt(const interval& x) +template inline interval sqrt(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x) || interval_lib::user::is_neg(x.upper())) - return I::empty(); - typename Policies::rounding rnd; - T l = !interval_lib::user::is_pos(x.lower()) ? static_cast(0) : rnd.sqrt_down(x.lower()); - return I(l, rnd.sqrt_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x) || interval_lib::user::is_neg(x.upper())) + return I::empty(); + typename Policies::rounding rnd; + T l = !interval_lib::user::is_pos(x.lower()) ? 
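pow_dn and pow_up above are ordinary square-and-multiply (binary exponentiation), run once with downward and once with upward rounding so the two results bracket the true power. The same loop in plain round-to-nearest arithmetic:

    #include <cstdio>

    static double pow_pos(double x, int pwr) // x > 0, pwr >= 1
    {
        double y = (pwr & 1) ? x : 1.0; // consume the lowest exponent bit
        pwr >>= 1;
        while (pwr > 0) {
            x *= x;     // square for the next bit
            if (pwr & 1)
                y *= x; // multiply in bits that are set
            pwr >>= 1;
        }
        return y;
    }

    int main()
    {
        std::printf("%g\n", pow_pos(3.0, 5)); // 243
        return 0;
    }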
static_cast(0) : rnd.sqrt_down(x.lower()); + return I(l, rnd.sqrt_up(x.upper()), true); } -template inline -interval square(const interval& x) +template inline interval square(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - const T& xl = x.lower(); - const T& xu = x.upper(); - if (interval_lib::user::is_neg(xu)) - return I(rnd.mul_down(xu, xu), rnd.mul_up(xl, xl), true); - else if (interval_lib::user::is_pos(x.lower())) - return I(rnd.mul_down(xl, xl), rnd.mul_up(xu, xu), true); - else - return I(static_cast(0), (-xl > xu ? rnd.mul_up(xl, xl) : rnd.mul_up(xu, xu)), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + const T &xl = x.lower(); + const T &xu = x.upper(); + if (interval_lib::user::is_neg(xu)) + return I(rnd.mul_down(xu, xu), rnd.mul_up(xl, xl), true); + else if (interval_lib::user::is_pos(x.lower())) + return I(rnd.mul_down(xl, xl), rnd.mul_up(xu, xu), true); + else + return I(static_cast(0), (-xl > xu ? rnd.mul_up(xl, xl) : rnd.mul_up(xu, xu)), true); } namespace interval_lib { namespace detail { -template< class I > inline -I root_aux(typename I::base_type const &x, int k) // x and k are bigger than one +template inline I root_aux(typename I::base_type const &x, int k) // x and k are bigger than one { - typedef typename I::base_type T; - T tk(k); - I y(static_cast(1), x, true); - for(;;) { - T y0 = median(y); - I yy = intersect(y, y0 - (pow(I(y0, y0, true), k) - x) / (tk * pow(y, k - 1))); - if (equal(y, yy)) return y; - y = yy; - } + typedef typename I::base_type T; + T tk(k); + I y(static_cast(1), x, true); + for (;;) { + T y0 = median(y); + I yy = intersect(y, y0 - (pow(I(y0, y0, true), k) - x) / (tk * pow(y, k - 1))); + if (equal(y, yy)) + return y; + y = yy; + } } -template< class I > inline // x is positive and k bigger than one -typename I::base_type root_aux_dn(typename I::base_type const &x, int k) +template +inline // x is positive and k bigger than one + typename I::base_type + root_aux_dn(typename I::base_type const &x, int k) { - typedef typename I::base_type T; - typedef typename I::traits_type Policies; - typename Policies::rounding rnd; - T one(1); - if (x > one) return root_aux(x, k).lower(); - if (x == one) return one; - return rnd.div_down(one, root_aux(rnd.div_up(one, x), k).upper()); + typedef typename I::base_type T; + typedef typename I::traits_type Policies; + typename Policies::rounding rnd; + T one(1); + if (x > one) + return root_aux(x, k).lower(); + if (x == one) + return one; + return rnd.div_down(one, root_aux(rnd.div_up(one, x), k).upper()); } -template< class I > inline // x is positive and k bigger than one -typename I::base_type root_aux_up(typename I::base_type const &x, int k) +template +inline // x is positive and k bigger than one + typename I::base_type + root_aux_up(typename I::base_type const &x, int k) { - typedef typename I::base_type T; - typedef typename I::traits_type Policies; - typename Policies::rounding rnd; - T one(1); - if (x > one) return root_aux(x, k).upper(); - if (x == one) return one; - return rnd.div_up(one, root_aux(rnd.div_down(one, x), k).lower()); + typedef typename I::base_type T; + typedef typename I::traits_type Policies; + typename Policies::rounding rnd; + T one(1); + if (x > one) + return root_aux(x, k).upper(); + if (x == one) + return one; + return rnd.div_up(one, root_aux(rnd.div_down(one, x), k).lower()); } } // namespace detail } // 
namespace interval_lib -template< class T, class Policies > inline -interval nth_root(interval const &x, int k) +template inline interval nth_root(interval const &x, int k) { - typedef interval I; - if (interval_lib::detail::test_input(x)) return I::empty(); - assert(k > 0); - if (k == 1) return x; - typename Policies::rounding rnd; - typedef typename interval_lib::unprotect::type R; - if (!interval_lib::user::is_pos(x.upper())) { - if (interval_lib::user::is_zero(x.upper())) { - T zero(0); - if (!(k & 1) || interval_lib::user::is_zero(x.lower())) // [-1,0]^/2 or [0,0] - return I(zero, zero, true); - else // [-1,0]^/3 - return I(-interval_lib::detail::root_aux_up(-x.lower(), k), zero, true); - } else if (!(k & 1)) // [-2,-1]^/2 - return I::empty(); - else { // [-2,-1]^/3 - return I(-interval_lib::detail::root_aux_up(-x.lower(), k), - -interval_lib::detail::root_aux_dn(-x.upper(), k), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + assert(k > 0); + if (k == 1) + return x; + typename Policies::rounding rnd; + typedef typename interval_lib::unprotect::type R; + if (!interval_lib::user::is_pos(x.upper())) { + if (interval_lib::user::is_zero(x.upper())) { + T zero(0); + if (!(k & 1) || interval_lib::user::is_zero(x.lower())) // [-1,0]^/2 or [0,0] + return I(zero, zero, true); + else // [-1,0]^/3 + return I(-interval_lib::detail::root_aux_up(-x.lower(), k), zero, true); + } + else if (!(k & 1)) // [-2,-1]^/2 + return I::empty(); + else { // [-2,-1]^/3 + return I(-interval_lib::detail::root_aux_up(-x.lower(), k), + -interval_lib::detail::root_aux_dn(-x.upper(), k), + true); + } } - } - T u = interval_lib::detail::root_aux_up(x.upper(), k); - if (!interval_lib::user::is_pos(x.lower())) - if (!(k & 1) || interval_lib::user::is_zero(x.lower())) // [-1,1]^/2 or [0,1] - return I(static_cast(0), u, true); - else // [-1,1]^/3 - return I(-interval_lib::detail::root_aux_up(-x.lower(), k), u, true); - else // [1,2] - return I(interval_lib::detail::root_aux_dn(x.lower(), k), u, true); + T u = interval_lib::detail::root_aux_up(x.upper(), k); + if (!interval_lib::user::is_pos(x.lower())) + if (!(k & 1) || interval_lib::user::is_zero(x.lower())) // [-1,1]^/2 or [0,1] + return I(static_cast(0), u, true); + else // [-1,1]^/3 + return I(-interval_lib::detail::root_aux_up(-x.lower(), k), u, true); + else // [1,2] + return I(interval_lib::detail::root_aux_dn(x.lower(), k), u, true); } } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith3.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith3.hpp index 518e6182..bf4a6b0b 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith3.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/arith3.hpp @@ -22,44 +22,40 @@ namespace boost { namespace numeric { namespace interval_lib { -template inline -I add(const typename I::base_type& x, const typename I::base_type& y) +template inline I add(const typename I::base_type &x, const typename I::base_type &y) { - typedef typename I::traits_type Policies; - if (detail::test_input(x, y)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.add_down(x, y), rnd.add_up(x, y), true); + typedef typename I::traits_type Policies; + if (detail::test_input(x, y)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.add_down(x, y), rnd.add_up(x, y), true); } -template inline -I sub(const typename I::base_type& x, const typename 
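root_aux above is an interval Newton iteration for y^k = x: each pass intersects the current enclosure with y0 - (y0^k - x) / (k * y^(k-1)) until it stops shrinking. The scalar analogue of that step, for reference:

    #include <cmath>
    #include <cstdio>

    static double kth_root(double x, int k) // x >= 1, k > 1, as in root_aux
    {
        double y = x; // root_aux starts from the bracket [1, x]
        for (int i = 0; i < 200; ++i) {
            double step = (std::pow(y, k) - x) / (k * std::pow(y, k - 1));
            y -= step; // Newton step for f(y) = y^k - x
            if (std::fabs(step) <= 1e-15 * y)
                break;
        }
        return y;
    }

    int main()
    {
        std::printf("%.15g\n", kth_root(32.0, 5)); // 2
        return 0;
    }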
I::base_type& y) +template inline I sub(const typename I::base_type &x, const typename I::base_type &y) { - typedef typename I::traits_type Policies; - if (detail::test_input(x, y)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.sub_down(x, y), rnd.sub_up(x, y), true); + typedef typename I::traits_type Policies; + if (detail::test_input(x, y)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.sub_down(x, y), rnd.sub_up(x, y), true); } -template inline -I mul(const typename I::base_type& x, const typename I::base_type& y) +template inline I mul(const typename I::base_type &x, const typename I::base_type &y) { - typedef typename I::traits_type Policies; - if (detail::test_input(x, y)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.mul_down(x, y), rnd.mul_up(x, y), true); + typedef typename I::traits_type Policies; + if (detail::test_input(x, y)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.mul_down(x, y), rnd.mul_up(x, y), true); } -template inline -I div(const typename I::base_type& x, const typename I::base_type& y) +template inline I div(const typename I::base_type &x, const typename I::base_type &y) { - typedef typename I::traits_type Policies; - if (detail::test_input(x, y) || user::is_zero(y)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.div_down(x, y), rnd.div_up(x, y), true); + typedef typename I::traits_type Policies; + if (detail::test_input(x, y) || user::is_zero(y)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.div_down(x, y), rnd.div_up(x, y), true); } } // namespace interval_lib diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/checking.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/checking.hpp index 2db486a0..a81cdae3 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/checking.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/checking.hpp @@ -10,10 +10,10 @@ #ifndef BOOST_NUMERIC_INTERVAL_CHECKING_HPP #define BOOST_NUMERIC_INTERVAL_CHECKING_HPP +#include +#include #include #include -#include -#include namespace boost { namespace numeric { @@ -21,107 +21,86 @@ namespace interval_lib { struct exception_create_empty { - void operator()() - { - throw std::runtime_error("boost::interval: empty interval created"); - } + void operator()() { throw std::runtime_error("boost::interval: empty interval created"); } }; struct exception_invalid_number { - void operator()() - { - throw std::invalid_argument("boost::interval: invalid number"); - } + void operator()() { throw std::invalid_argument("boost::interval: invalid number"); } }; -template -struct checking_base +template struct checking_base { - static T pos_inf() - { - assert(std::numeric_limits::has_infinity); - return std::numeric_limits::infinity(); - } - static T neg_inf() - { - assert(std::numeric_limits::has_infinity); - return -std::numeric_limits::infinity(); - } - static T nan() - { - assert(std::numeric_limits::has_quiet_NaN); - return std::numeric_limits::quiet_NaN(); - } - static bool is_nan(const T& x) - { - return std::numeric_limits::has_quiet_NaN && (x != x); - } - static T empty_lower() - { - return (std::numeric_limits::has_quiet_NaN ? - std::numeric_limits::quiet_NaN() : static_cast(1)); - } - static T empty_upper() - { - return (std::numeric_limits::has_quiet_NaN ? 
- std::numeric_limits::quiet_NaN() : static_cast(0)); - } - static bool is_empty(const T& l, const T& u) - { - return !(l <= u); // safety for partial orders - } + static T pos_inf() + { + assert(std::numeric_limits::has_infinity); + return std::numeric_limits::infinity(); + } + static T neg_inf() + { + assert(std::numeric_limits::has_infinity); + return -std::numeric_limits::infinity(); + } + static T nan() + { + assert(std::numeric_limits::has_quiet_NaN); + return std::numeric_limits::quiet_NaN(); + } + static bool is_nan(const T &x) { return std::numeric_limits::has_quiet_NaN && (x != x); } + static T empty_lower() + { + return (std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : static_cast(1)); + } + static T empty_upper() + { + return (std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : static_cast(0)); + } + static bool is_empty(const T &l, const T &u) + { + return !(l <= u); // safety for partial orders + } }; -template, - class Exception = exception_create_empty> -struct checking_no_empty: Checking +template , class Exception = exception_create_empty> +struct checking_no_empty : Checking { - static T nan() - { - assert(false); - return Checking::nan(); - } - static T empty_lower() - { - Exception()(); - return Checking::empty_lower(); - } - static T empty_upper() - { - Exception()(); - return Checking::empty_upper(); - } - static bool is_empty(const T&, const T&) - { - return false; - } + static T nan() + { + assert(false); + return Checking::nan(); + } + static T empty_lower() + { + Exception()(); + return Checking::empty_lower(); + } + static T empty_upper() + { + Exception()(); + return Checking::empty_upper(); + } + static bool is_empty(const T &, const T &) { return false; } }; -template > -struct checking_no_nan: Checking +template > struct checking_no_nan : Checking { - static bool is_nan(const T&) - { - return false; - } + static bool is_nan(const T &) { return false; } }; -template, - class Exception = exception_invalid_number> -struct checking_catch_nan: Checking +template , class Exception = exception_invalid_number> +struct checking_catch_nan : Checking { - static bool is_nan(const T& x) - { - if (Checking::is_nan(x)) Exception()(); - return false; - } + static bool is_nan(const T &x) + { + if (Checking::is_nan(x)) + Exception()(); + return false; + } }; -template -struct checking_strict: - checking_no_nan > -{}; +template struct checking_strict : checking_no_nan> +{ +}; } // namespace interval_lib } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare.hpp index f21753e7..bb505872 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare.hpp @@ -11,9 +11,9 @@ #define BOOST_NUMERIC_INTERVAL_COMPARE_HPP #include -#include #include #include +#include #include #endif // BOOST_NUMERIC_INTERVAL_COMPARE_HPP diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/certain.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/certain.hpp index 9232d5cd..53ff1fae 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/certain.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/certain.hpp @@ -19,88 +19,94 @@ namespace interval_lib { namespace compare { namespace certain { 
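The operators defined in this namespace give comparisons "certain" semantics: x < y holds only when every value of x is below every value of y, and invalid inputs (empty intervals, NaN bounds) throw comparison_error. A usage sketch, calling the operators explicitly to avoid any ambiguity with the default comparison operators:

    #include <boost/numeric/interval.hpp>
    #include <cstdio>

    namespace certain = boost::numeric::interval_lib::compare::certain;

    int main()
    {
        boost::numeric::interval<double> x(1.0, 2.0), y(3.0, 4.0), z(1.5, 3.5);
        std::printf("%d\n", certain::operator<(x, y)); // 1: 2.0 < 3.0, certainly less
        std::printf("%d\n", certain::operator<(x, z)); // 0: x and z overlap
        return 0;
    }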
-template inline -bool operator<(const interval& x, const interval& y) +template +inline bool operator<(const interval &x, const interval &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() < y.lower(); + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() < y.lower(); } -template inline -bool operator<(const interval& x, const T& y) +template inline bool operator<(const interval &x, const T &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() < y; + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() < y; } -template inline -bool operator<=(const interval& x, const interval& y) +template +inline bool operator<=(const interval &x, const interval &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() <= y.lower(); + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() <= y.lower(); } -template inline -bool operator<=(const interval& x, const T& y) +template inline bool operator<=(const interval &x, const T &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() <= y; + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() <= y; } -template inline -bool operator>(const interval& x, const interval& y) +template +inline bool operator>(const interval &x, const interval &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.lower() > y.upper(); + if (detail::test_input(x, y)) + throw comparison_error(); + return x.lower() > y.upper(); } -template inline -bool operator>(const interval& x, const T& y) +template inline bool operator>(const interval &x, const T &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.lower() > y; + if (detail::test_input(x, y)) + throw comparison_error(); + return x.lower() > y; } -template inline -bool operator>=(const interval& x, const interval& y) +template +inline bool operator>=(const interval &x, const interval &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.lower() >= y.upper(); + if (detail::test_input(x, y)) + throw comparison_error(); + return x.lower() >= y.upper(); } -template inline -bool operator>=(const interval& x, const T& y) +template inline bool operator>=(const interval &x, const T &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.lower() >= y; + if (detail::test_input(x, y)) + throw comparison_error(); + return x.lower() >= y; } -template inline -bool operator==(const interval& x, const interval& y) +template +inline bool operator==(const interval &x, const interval &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() == y.lower() && x.lower() == y.upper(); + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() == y.lower() && x.lower() == y.upper(); } -template inline -bool operator==(const interval& x, const T& y) +template inline bool operator==(const interval &x, const T &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() == y && x.lower() == y; + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() == y && x.lower() == y; } -template inline -bool operator!=(const interval& x, const interval& y) +template +inline bool operator!=(const interval &x, const interval &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() < y.lower() || x.lower() > y.upper(); + if (detail::test_input(x, y)) + throw 
comparison_error(); + return x.upper() < y.lower() || x.lower() > y.upper(); } -template inline -bool operator!=(const interval& x, const T& y) +template inline bool operator!=(const interval &x, const T &y) { - if (detail::test_input(x, y)) throw comparison_error(); - return x.upper() < y || x.lower() > y; + if (detail::test_input(x, y)) + throw comparison_error(); + return x.upper() < y || x.lower() > y; } } // namespace certain diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp index 8c68be89..43a65943 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp @@ -21,228 +21,204 @@ namespace interval_lib { * Certainly... operations */ -template inline -bool cerlt(const interval& x, const interval& y) +template +inline bool cerlt(const interval &x, const interval &y) { - return x.upper() < y.lower(); + return x.upper() < y.lower(); } -template inline -bool cerlt(const interval& x, const T& y) +template inline bool cerlt(const interval &x, const T &y) { - return x.upper() < y; + return x.upper() < y; } -template inline -bool cerlt(const T& x, const interval& y) +template inline bool cerlt(const T &x, const interval &y) { - return x < y.lower(); + return x < y.lower(); } -template inline -bool cerle(const interval& x, const interval& y) +template +inline bool cerle(const interval &x, const interval &y) { - return x.upper() <= y.lower(); + return x.upper() <= y.lower(); } -template inline -bool cerle(const interval& x, const T& y) +template inline bool cerle(const interval &x, const T &y) { - return x.upper() <= y; + return x.upper() <= y; } -template inline -bool cerle(const T& x, const interval& y) +template inline bool cerle(const T &x, const interval &y) { - return x <= y.lower(); + return x <= y.lower(); } -template inline -bool cergt(const interval& x, const interval& y) +template +inline bool cergt(const interval &x, const interval &y) { - return x.lower() > y.upper(); + return x.lower() > y.upper(); } -template inline -bool cergt(const interval& x, const T& y) +template inline bool cergt(const interval &x, const T &y) { - return x.lower() > y; + return x.lower() > y; } -template inline -bool cergt(const T& x, const interval& y) +template inline bool cergt(const T &x, const interval &y) { - return x > y.upper(); + return x > y.upper(); } -template inline -bool cerge(const interval& x, const interval& y) +template +inline bool cerge(const interval &x, const interval &y) { - return x.lower() >= y.upper(); + return x.lower() >= y.upper(); } -template inline -bool cerge(const interval& x, const T& y) +template inline bool cerge(const interval &x, const T &y) { - return x.lower() >= y; + return x.lower() >= y; } -template inline -bool cerge(const T& x, const interval& y) +template inline bool cerge(const T &x, const interval &y) { - return x >= y.upper(); + return x >= y.upper(); } -template inline -bool cereq(const interval& x, const interval& y) +template +inline bool cereq(const interval &x, const interval &y) { - return x.lower() == y.upper() && y.lower() == x.upper(); + return x.lower() == y.upper() && y.lower() == x.upper(); } -template inline -bool cereq(const interval& x, const T& y) +template inline bool cereq(const interval &x, const T &y) { - return x.lower() == y && x.upper() == y; + return x.lower() == y && 
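The explicit predicates sidestep operator-overloading choices entirely: cerlt is "certainly less than" and poslt "possibly less than", and overlapping intervals are exactly where the two disagree:

    #include <boost/numeric/interval.hpp>
    #include <cstdio>

    int main()
    {
        namespace lib = boost::numeric::interval_lib;
        boost::numeric::interval<double> x(1.0, 2.0), z(1.5, 3.5); // overlapping
        std::printf("cerlt: %d\n", lib::cerlt(x, z)); // 0: 2.0 < 1.5 is false
        std::printf("poslt: %d\n", lib::poslt(x, z)); // 1: 1.0 < 3.5 is true
        return 0;
    }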
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp
index 8c68be89..43a65943 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/explicit.hpp
@@ -21,228 +21,204 @@ namespace interval_lib {
  * Certainly... operations
  */
-template<class T, class Policies1, class Policies2> inline
-bool cerlt(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool cerlt(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() < y.lower();
+    return x.upper() < y.lower();
 }
-template<class T, class Policies> inline
-bool cerlt(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool cerlt(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() < y;
+    return x.upper() < y;
 }
-template<class T, class Policies> inline
-bool cerlt(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool cerlt(const T &x, const interval<T, Policies> &y)
 {
-  return x < y.lower();
+    return x < y.lower();
 }
-template<class T, class Policies1, class Policies2> inline
-bool cerle(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool cerle(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() <= y.lower();
+    return x.upper() <= y.lower();
 }
-template<class T, class Policies> inline
-bool cerle(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool cerle(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() <= y;
+    return x.upper() <= y;
 }
-template<class T, class Policies> inline
-bool cerle(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool cerle(const T &x, const interval<T, Policies> &y)
 {
-  return x <= y.lower();
+    return x <= y.lower();
 }
-template<class T, class Policies1, class Policies2> inline
-bool cergt(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool cergt(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.lower() > y.upper();
+    return x.lower() > y.upper();
 }
-template<class T, class Policies> inline
-bool cergt(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool cergt(const interval<T, Policies> &x, const T &y)
 {
-  return x.lower() > y;
+    return x.lower() > y;
 }
-template<class T, class Policies> inline
-bool cergt(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool cergt(const T &x, const interval<T, Policies> &y)
 {
-  return x > y.upper();
+    return x > y.upper();
 }
-template<class T, class Policies1, class Policies2> inline
-bool cerge(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool cerge(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.lower() >= y.upper();
+    return x.lower() >= y.upper();
 }
-template<class T, class Policies> inline
-bool cerge(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool cerge(const interval<T, Policies> &x, const T &y)
 {
-  return x.lower() >= y;
+    return x.lower() >= y;
 }
-template<class T, class Policies> inline
-bool cerge(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool cerge(const T &x, const interval<T, Policies> &y)
 {
-  return x >= y.upper();
+    return x >= y.upper();
 }
-template<class T, class Policies1, class Policies2> inline
-bool cereq(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool cereq(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.lower() == y.upper() && y.lower() == x.upper();
+    return x.lower() == y.upper() && y.lower() == x.upper();
 }
-template<class T, class Policies> inline
-bool cereq(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool cereq(const interval<T, Policies> &x, const T &y)
 {
-  return x.lower() == y && x.upper() == y;
+    return x.lower() == y && x.upper() == y;
 }
-template<class T, class Policies> inline
-bool cereq(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool cereq(const T &x, const interval<T, Policies> &y)
 {
-  return x == y.lower() && x == y.upper();
+    return x == y.lower() && x == y.upper();
 }
-template<class T, class Policies1, class Policies2> inline
-bool cerne(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool cerne(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() < y.lower() || y.upper() < x.lower();
+    return x.upper() < y.lower() || y.upper() < x.lower();
 }
-template<class T, class Policies> inline
-bool cerne(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool cerne(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() < y || y < x.lower();
+    return x.upper() < y || y < x.lower();
 }
-template<class T, class Policies> inline
-bool cerne(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool cerne(const T &x, const interval<T, Policies> &y)
 {
-  return x < y.lower() || y.upper() < x;
+    return x < y.lower() || y.upper() < x;
 }
 /*
  * Possibly... comparisons
  */
-template<class T, class Policies1, class Policies2> inline
-bool poslt(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool poslt(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.lower() < y.upper();
+    return x.lower() < y.upper();
 }
-template<class T, class Policies> inline
-bool poslt(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool poslt(const interval<T, Policies> &x, const T &y)
 {
-  return x.lower() < y;
+    return x.lower() < y;
 }
-template<class T, class Policies> inline
-bool poslt(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool poslt(const T &x, const interval<T, Policies> &y)
 {
-  return x < y.upper();
+    return x < y.upper();
 }
-template<class T, class Policies1, class Policies2> inline
-bool posle(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool posle(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.lower() <= y.upper();
+    return x.lower() <= y.upper();
 }
-template<class T, class Policies> inline
-bool posle(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool posle(const interval<T, Policies> &x, const T &y)
 {
-  return x.lower() <= y;
+    return x.lower() <= y;
 }
-template<class T, class Policies> inline
-bool posle(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool posle(const T &x, const interval<T, Policies> &y)
 {
-  return x <= y.upper();
+    return x <= y.upper();
 }
-template<class T, class Policies1, class Policies2> inline
-bool posgt(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool posgt(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() > y.lower();
+    return x.upper() > y.lower();
 }
-template<class T, class Policies> inline
-bool posgt(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool posgt(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() > y;
+    return x.upper() > y;
 }
-template<class T, class Policies> inline
-bool posgt(const T& x, const interval<T, Policies> & y)
+template <class T, class Policies> inline bool posgt(const T &x, const interval<T, Policies> &y)
 {
-  return x > y.lower();
+    return x > y.lower();
 }
-template<class T, class Policies1, class Policies2> inline
-bool posge(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool posge(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() >= y.lower();
+    return x.upper() >= y.lower();
 }
-template<class T, class Policies> inline
-bool posge(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool posge(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() >= y;
+    return x.upper() >= y;
 }
-template<class T, class Policies> inline
-bool posge(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool posge(const T &x, const interval<T, Policies> &y)
 {
-  return x >= y.lower();
+    return x >= y.lower();
 }
-template<class T, class Policies1, class Policies2> inline
-bool poseq(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool poseq(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() >= y.lower() && y.upper() >= x.lower();
+    return x.upper() >= y.lower() && y.upper() >= x.lower();
 }
-template<class T, class Policies> inline
-bool poseq(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool poseq(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() >= y && y >= x.lower();
+    return x.upper() >= y && y >= x.lower();
 }
-template<class T, class Policies> inline
-bool poseq(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool poseq(const T &x, const interval<T, Policies> &y)
 {
-  return x >= y.lower() && y.upper() >= x;
+    return x >= y.lower() && y.upper() >= x;
 }
-template<class T, class Policies1, class Policies2> inline
-bool posne(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool posne(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return x.upper() != y.lower() || y.upper() != x.lower();
+    return x.upper() != y.lower() || y.upper() != x.lower();
 }
-template<class T, class Policies> inline
-bool posne(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool posne(const interval<T, Policies> &x, const T &y)
 {
-  return x.upper() != y || y != x.lower();
+    return x.upper() != y || y != x.lower();
 }
-template<class T, class Policies> inline
-bool posne(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline bool posne(const T &x, const interval<T, Policies> &y)
 {
-  return x != y.lower() || y.upper() != x;
+    return x != y.lower() || y.upper() != x;
 }
 } // namespace interval_lib
 } // namespace numeric
-} //namespace boost
+} // namespace boost
 #endif // BOOST_NUMERIC_INTERVAL_COMPARE_EXPLICIT_HPP
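The cer*/pos* pairs above differ only in which bounds they compare: cerlt asks whether x is below y for every choice of points, poslt for at least one choice. A hedged sketch with a simplified interval struct (the one-line bodies are the same bound tests as in the header):

    #include <cassert>

    struct Ival {
        double lo, hi;
        double lower() const { return lo; }
        double upper() const { return hi; }
    };

    bool cerlt(const Ival &x, const Ival &y) { return x.upper() < y.lower(); } // for all points
    bool poslt(const Ival &x, const Ival &y) { return x.lower() < y.upper(); } // for some points

    int main()
    {
        Ival a{1, 3}, b{2, 4};
        assert(!cerlt(a, b)); // overlap: not certainly less
        assert(poslt(a, b));  // overlap: possibly less
    }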
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/lexicographic.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/lexicographic.hpp
index 03f6036d..e98f77cd 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/lexicographic.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/lexicographic.hpp
@@ -19,98 +19,104 @@ namespace interval_lib {
 namespace compare {
 namespace lexicographic {
-template<class T, class Policies1, class Policies2> inline
-bool operator<(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator<(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  const T& xl = x.lower();
-  const T& yl = y.lower();
-  return xl < yl || (xl == yl && x.upper() < y.upper());
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    const T &xl = x.lower();
+    const T &yl = y.lower();
+    return xl < yl || (xl == yl && x.upper() < y.upper());
 }
-template<class T, class Policies> inline
-bool operator<(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator<(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() < y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() < y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator<=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator<=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  const T& xl = x.lower();
-  const T& yl = y.lower();
-  return xl < yl || (xl == yl && x.upper() <= y.upper());
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    const T &xl = x.lower();
+    const T &yl = y.lower();
+    return xl < yl || (xl == yl && x.upper() <= y.upper());
 }
-template<class T, class Policies> inline
-bool operator<=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator<=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  const T& xl = x.lower();
-  return xl < y || (xl == y && x.upper() <= y);
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    const T &xl = x.lower();
+    return xl < y || (xl == y && x.upper() <= y);
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator>(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator>(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  const T& xl = x.lower();
-  const T& yl = y.lower();
-  return xl > yl || (xl == yl && x.upper() > y.upper());
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    const T &xl = x.lower();
+    const T &yl = y.lower();
+    return xl > yl || (xl == yl && x.upper() > y.upper());
 }
-template<class T, class Policies> inline
-bool operator>(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator>(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  const T& xl = x.lower();
-  return xl > y || (xl == y && x.upper() > y);
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    const T &xl = x.lower();
+    return xl > y || (xl == y && x.upper() > y);
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator>=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator>=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  const T& xl = x.lower();
-  const T& yl = y.lower();
-  return xl > yl || (xl == yl && x.upper() >= y.upper());
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    const T &xl = x.lower();
+    const T &yl = y.lower();
+    return xl > yl || (xl == yl && x.upper() >= y.upper());
 }
-template<class T, class Policies> inline
-bool operator>=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator>=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() >= y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() >= y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator==(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator==(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() == y.lower() && x.upper() == y.upper();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() == y.lower() && x.upper() == y.upper();
 }
-template<class T, class Policies> inline
-bool operator==(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator==(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() == y && x.upper() == y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() == y && x.upper() == y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator!=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator!=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() != y.lower() || x.upper() != y.upper();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() != y.lower() || x.upper() != y.upper();
 }
-template<class T, class Policies> inline
-bool operator!=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator!=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() != y || x.upper() != y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() != y || x.upper() != y;
 }
 } // namespace lexicographic
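Lexicographic comparison orders intervals like pairs: lower bound first, upper bound as the tie-break. That makes it a strict weak ordering suitable for sorted containers, matching std::pair ordering; a sketch under simplified assumptions:

    #include <cassert>
    #include <utility>

    struct Ival {
        double lo, hi;
    };

    // Same decision rule as the operator< above: lower bounds first, then upper.
    bool lex_less(const Ival &x, const Ival &y) { return x.lo < y.lo || (x.lo == y.lo && x.hi < y.hi); }

    int main()
    {
        assert(lex_less({1, 5}, {2, 3})); // decided by lower bounds
        assert(lex_less({1, 2}, {1, 3})); // tie on lower, decided by upper
        assert((std::pair<double, double>(1, 2) < std::pair<double, double>(1, 3))); // same ordering
    }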
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/possible.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/possible.hpp
index 59bec31b..3486c7e1 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/possible.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/possible.hpp
@@ -19,88 +19,94 @@ namespace interval_lib {
 namespace compare {
 namespace possible {
-template<class T, class Policies1, class Policies2> inline
-bool operator<(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator<(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() < y.upper();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() < y.upper();
 }
-template<class T, class Policies> inline
-bool operator<(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator<(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() < y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() < y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator<=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator<=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() <= y.upper();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() <= y.upper();
 }
-template<class T, class Policies> inline
-bool operator<=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator<=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() <= y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() <= y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator>(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator>(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.upper() > y.lower();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.upper() > y.lower();
 }
-template<class T, class Policies> inline
-bool operator>(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator>(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.upper() > y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.upper() > y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator>=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator>=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.upper() >= y.lower();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.upper() >= y.lower();
 }
-template<class T, class Policies> inline
-bool operator>=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator>=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.upper() >= y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.upper() >= y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator==(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator==(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() <= y.upper() && x.upper() >= y.lower();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() <= y.upper() && x.upper() >= y.lower();
 }
-template<class T, class Policies> inline
-bool operator==(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator==(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() <= y && x.upper() >= y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() <= y && x.upper() >= y;
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator!=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator!=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() != y.upper() || x.upper() != y.lower();
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() != y.upper() || x.upper() != y.lower();
 }
-template<class T, class Policies> inline
-bool operator!=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator!=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  return x.lower() != y || x.upper() != y;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    return x.lower() != y || x.upper() != y;
 }
 } // namespace possible
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/set.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/set.hpp
index aa4f1716..fb48e8cc 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/set.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/set.hpp
@@ -20,76 +20,70 @@ namespace interval_lib {
 namespace compare {
 namespace set {
-template<class T, class Policies1, class Policies2> inline
-bool operator<(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator<(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return proper_subset(x, y);
+    return proper_subset(x, y);
 }
-template<class T, class Policies> inline
-bool operator<(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator<(const interval<T, Policies> &x, const T &y)
 {
-  throw comparison_error();
+    throw comparison_error();
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator<=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator<=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return subset(x, y);
+    return subset(x, y);
 }
-template<class T, class Policies> inline
-bool operator<=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator<=(const interval<T, Policies> &x, const T &y)
 {
-  throw comparison_error();
+    throw comparison_error();
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator>(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator>(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return proper_subset(y, x);
+    return proper_subset(y, x);
 }
-template<class T, class Policies> inline
-bool operator>(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator>(const interval<T, Policies> &x, const T &y)
 {
-  throw comparison_error();
+    throw comparison_error();
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator>=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator>=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return subset(y, x);
+    return subset(y, x);
 }
-template<class T, class Policies> inline
-bool operator>=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator>=(const interval<T, Policies> &x, const T &y)
 {
-  throw comparison_error();
+    throw comparison_error();
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator==(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator==(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return equal(y, x);
+    return equal(y, x);
 }
-template<class T, class Policies> inline
-bool operator==(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator==(const interval<T, Policies> &x, const T &y)
 {
-  throw comparison_error();
+    throw comparison_error();
 }
-template<class T, class Policies1, class Policies2> inline
-bool operator!=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline bool operator!=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  return !equal(y, x);
+    return !equal(y, x);
 }
-template<class T, class Policies> inline
-bool operator!=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline bool operator!=(const interval<T, Policies> &x, const T &y)
 {
-  throw comparison_error();
+    throw comparison_error();
 }
 } // namespace set
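In the set-based scheme, operator< means proper subset and operator<= means subset, and mixed interval/number comparisons are rejected outright. A sketch of the two predicates the operators forward to (bodies are plausible definitions, not copied from the library):

    #include <cassert>

    struct Ival {
        double lo, hi;
    };

    // subset: x lies inside y (possibly equal); proper_subset: inside and not equal.
    bool subset(const Ival &x, const Ival &y) { return y.lo <= x.lo && x.hi <= y.hi; }
    bool proper_subset(const Ival &x, const Ival &y) { return subset(x, y) && (y.lo < x.lo || x.hi < y.hi); }

    int main()
    {
        assert(subset({1, 2}, {0, 3}));         // [1,2] inside [0,3]
        assert(proper_subset({1, 2}, {0, 3}));  // strictly inside
        assert(!proper_subset({1, 2}, {1, 2})); // equal sets: subset but not proper
    }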
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/tribool.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/tribool.hpp
index 6e4a83e2..3f001f23 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/tribool.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/compare/tribool.hpp
@@ -10,9 +10,9 @@
 #ifndef BOOST_NUMERIC_INTERVAL_COMPARE_TRIBOOL_HPP
 #define BOOST_NUMERIC_INTERVAL_COMPARE_TRIBOOL_HPP
+#include <boost/logic/tribool.hpp>
 #include <boost/numeric/interval/detail/interval_prototype.hpp>
 #include <boost/numeric/interval/detail/test_input.hpp>
-#include <boost/logic/tribool.hpp>
 namespace boost {
 namespace numeric {
@@ -20,112 +20,142 @@ namespace interval_lib {
 namespace compare {
 namespace tribool {
-template<class T, class Policies1, class Policies2> inline
-logic::tribool operator<(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline logic::tribool operator<(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() < y.lower()) return true;
-  if (x.lower() >= y.upper()) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() < y.lower())
+        return true;
+    if (x.lower() >= y.upper())
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies> inline
-logic::tribool operator<(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline logic::tribool operator<(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() < y) return true;
-  if (x.lower() >= y) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() < y)
+        return true;
+    if (x.lower() >= y)
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies1, class Policies2> inline
-logic::tribool operator<=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline logic::tribool operator<=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() <= y.lower()) return true;
-  if (x.lower() > y.upper()) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() <= y.lower())
+        return true;
+    if (x.lower() > y.upper())
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies> inline
-logic::tribool operator<=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline logic::tribool operator<=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() <= y) return true;
-  if (x.lower() > y) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() <= y)
+        return true;
+    if (x.lower() > y)
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies1, class Policies2> inline
-logic::tribool operator>(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline logic::tribool operator>(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.lower() > y.upper()) return true;
-  if (x.upper() <= y.lower()) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.lower() > y.upper())
+        return true;
+    if (x.upper() <= y.lower())
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies> inline
-logic::tribool operator>(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline logic::tribool operator>(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.lower() > y) return true;
-  if (x.upper() <= y) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.lower() > y)
+        return true;
+    if (x.upper() <= y)
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies1, class Policies2> inline
-logic::tribool operator>=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline logic::tribool operator>=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.lower() >= y.upper()) return true;
-  if (x.upper() < y.lower()) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.lower() >= y.upper())
+        return true;
+    if (x.upper() < y.lower())
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies> inline
-logic::tribool operator>=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline logic::tribool operator>=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.lower() >= y) return true;
-  if (x.upper() < y) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.lower() >= y)
+        return true;
+    if (x.upper() < y)
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies1, class Policies2> inline
-logic::tribool operator==(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline logic::tribool operator==(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() == y.lower() && x.lower() == y.upper()) return true;
-  if (x.upper() < y.lower() || x.lower() > y.upper()) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() == y.lower() && x.lower() == y.upper())
+        return true;
+    if (x.upper() < y.lower() || x.lower() > y.upper())
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies> inline
-logic::tribool operator==(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline logic::tribool operator==(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() == y && x.lower() == y) return true;
-  if (x.upper() < y || x.lower() > y) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() == y && x.lower() == y)
+        return true;
+    if (x.upper() < y || x.lower() > y)
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies1, class Policies2> inline
-logic::tribool operator!=(const interval<T, Policies1>& x, const interval<T, Policies2>& y)
+template <class T, class Policies1, class Policies2>
+inline logic::tribool operator!=(const interval<T, Policies1> &x, const interval<T, Policies2> &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() < y.lower() || x.lower() > y.upper()) return true;
-  if (x.upper() == y.lower() && x.lower() == y.upper()) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() < y.lower() || x.lower() > y.upper())
+        return true;
+    if (x.upper() == y.lower() && x.lower() == y.upper())
+        return false;
+    return logic::indeterminate;
 }
-template<class T, class Policies> inline
-logic::tribool operator!=(const interval<T, Policies>& x, const T& y)
+template <class T, class Policies> inline logic::tribool operator!=(const interval<T, Policies> &x, const T &y)
 {
-  if (detail::test_input(x, y)) throw comparison_error();
-  if (x.upper() < y || x.lower() > y) return true;
-  if (x.upper() == y && x.lower() == y) return false;
-  return logic::indeterminate;
+    if (detail::test_input(x, y))
+        throw comparison_error();
+    if (x.upper() < y || x.lower() > y)
+        return true;
+    if (x.upper() == y && x.lower() == y)
+        return false;
+    return logic::indeterminate;
 }
 } // namespace tribool
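The tribool scheme is the only one above that can answer "don't know": when the intervals overlap, neither true nor false is justified. A self-contained sketch using a small three-valued enum in place of boost::logic::tribool:

    #include <iostream>

    enum class Tri { False, True, Maybe };

    struct Ival {
        double lo, hi;
    };

    // Mirrors the tribool operator<: certain yes, certain no, otherwise indeterminate.
    Tri less(const Ival &x, const Ival &y)
    {
        if (x.hi < y.lo)
            return Tri::True; // entirely below
        if (x.lo >= y.hi)
            return Tri::False; // entirely at or above
        return Tri::Maybe;     // overlapping
    }

    int main()
    {
        std::cout << (less({1, 2}, {3, 4}) == Tri::True) << '\n';  // 1
        std::cout << (less({1, 3}, {2, 4}) == Tri::Maybe) << '\n'; // 1
    }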
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/constants.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/constants.hpp
index a3a42efe..be5b8c52 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/constants.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/constants.hpp
@@ -19,63 +19,57 @@ namespace constants {
 // Decimal representations wouldn't do it since the standard doesn't
 // specify the rounding (even nearest) that should be used.
-static const float pi_f_l = 13176794.0f/(1<<22);
-static const float pi_f_u = 13176795.0f/(1<<22);
-static const double pi_d_l = (3373259426.0 + 273688.0 / (1<<21)) / (1<<30);
-static const double pi_d_u = (3373259426.0 + 273689.0 / (1<<21)) / (1<<30);
+static const float pi_f_l = 13176794.0f / (1 << 22);
+static const float pi_f_u = 13176795.0f / (1 << 22);
+static const double pi_d_l = (3373259426.0 + 273688.0 / (1 << 21)) / (1 << 30);
+static const double pi_d_u = (3373259426.0 + 273689.0 / (1 << 21)) / (1 << 30);
-template<class T> inline T pi_lower() { return 3; }
-template<class T> inline T pi_upper() { return 4; }
-template<class T> inline T pi_half_lower() { return 1; }
-template<class T> inline T pi_half_upper() { return 2; }
-template<class T> inline T pi_twice_lower() { return 6; }
-template<class T> inline T pi_twice_upper() { return 7; }
+template <class T> inline T pi_lower() { return 3; }
+template <class T> inline T pi_upper() { return 4; }
+template <class T> inline T pi_half_lower() { return 1; }
+template <class T> inline T pi_half_upper() { return 2; }
+template <class T> inline T pi_twice_lower() { return 6; }
+template <class T> inline T pi_twice_upper() { return 7; }
-template<> inline float pi_lower<float>() { return pi_f_l; }
-template<> inline float pi_upper<float>() { return pi_f_u; }
-template<> inline float pi_half_lower<float>() { return pi_f_l / 2; }
-template<> inline float pi_half_upper<float>() { return pi_f_u / 2; }
-template<> inline float pi_twice_lower<float>() { return pi_f_l * 2; }
-template<> inline float pi_twice_upper<float>() { return pi_f_u * 2; }
+template <> inline float pi_lower<float>() { return pi_f_l; }
+template <> inline float pi_upper<float>() { return pi_f_u; }
+template <> inline float pi_half_lower<float>() { return pi_f_l / 2; }
+template <> inline float pi_half_upper<float>() { return pi_f_u / 2; }
+template <> inline float pi_twice_lower<float>() { return pi_f_l * 2; }
+template <> inline float pi_twice_upper<float>() { return pi_f_u * 2; }
-template<> inline double pi_lower<double>() { return pi_d_l; }
-template<> inline double pi_upper<double>() { return pi_d_u; }
-template<> inline double pi_half_lower<double>() { return pi_d_l / 2; }
-template<> inline double pi_half_upper<double>() { return pi_d_u / 2; }
-template<> inline double pi_twice_lower<double>() { return pi_d_l * 2; }
-template<> inline double pi_twice_upper<double>() { return pi_d_u * 2; }
+template <> inline double pi_lower<double>() { return pi_d_l; }
+template <> inline double pi_upper<double>() { return pi_d_u; }
+template <> inline double pi_half_lower<double>() { return pi_d_l / 2; }
+template <> inline double pi_half_upper<double>() { return pi_d_u / 2; }
+template <> inline double pi_twice_lower<double>() { return pi_d_l * 2; }
+template <> inline double pi_twice_upper<double>() { return pi_d_u * 2; }
-template<> inline long double pi_lower<long double>() { return pi_d_l; }
-template<> inline long double pi_upper<long double>() { return pi_d_u; }
-template<> inline long double pi_half_lower<long double>() { return pi_d_l / 2; }
-template<> inline long double pi_half_upper<long double>() { return pi_d_u / 2; }
-template<> inline long double pi_twice_lower<long double>() { return pi_d_l * 2; }
-template<> inline long double pi_twice_upper<long double>() { return pi_d_u * 2; }
+template <> inline long double pi_lower<long double>() { return pi_d_l; }
+template <> inline long double pi_upper<long double>() { return pi_d_u; }
+template <> inline long double pi_half_lower<long double>() { return pi_d_l / 2; }
+template <> inline long double pi_half_upper<long double>() { return pi_d_u / 2; }
+template <> inline long double pi_twice_lower<long double>() { return pi_d_l * 2; }
+template <> inline long double pi_twice_upper<long double>() { return pi_d_u * 2; }
 } // namespace constants
-template<class I> inline
-I pi()
+template <class I> inline I pi()
 {
-  typedef typename I::base_type T;
-  return I(constants::pi_lower<T>(),
-           constants::pi_upper<T>(), true);
+    typedef typename I::base_type T;
+    return I(constants::pi_lower<T>(), constants::pi_upper<T>(), true);
 }
-template<class I> inline
-I pi_half()
+template <class I> inline I pi_half()
 {
-  typedef typename I::base_type T;
-  return I(constants::pi_half_lower<T>(),
-           constants::pi_half_upper<T>(), true);
+    typedef typename I::base_type T;
+    return I(constants::pi_half_lower<T>(), constants::pi_half_upper<T>(), true);
 }
-template<class I> inline
-I pi_twice()
+template <class I> inline I pi_twice()
 {
-  typedef typename I::base_type T;
-  return I(constants::pi_twice_lower<T>(),
-           constants::pi_twice_upper<T>(), true);
+    typedef typename I::base_type T;
+    return I(constants::pi_twice_lower<T>(), constants::pi_twice_upper<T>(), true);
 }
 } // namespace interval_lib
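The scaled-integer spellings above are deliberate: 13176794 and 13176795 are below 2^24, and dividing by a power of two is exact in binary floating point, so pi_f_l and pi_f_u are two adjacent floats that provably bracket pi. A quick standalone check (not part of the header):

    #include <cstdio>

    int main()
    {
        const float  pi_f_l = 13176794.0f / (1 << 22); // ~3.14159250
        const float  pi_f_u = 13176795.0f / (1 << 22); // ~3.14159274
        const double pi     = 3.141592653589793;

        std::printf("%d %d\n", pi_f_l < pi, pi < pi_f_u);  // expect: 1 1
        std::printf("%.10g\n", (double)(pi_f_u - pi_f_l)); // one float ulp here: 2^-22, ~2.4e-7
    }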
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/alpha_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/alpha_rounding_control.hpp
index d9a6079e..6490b82d 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/alpha_rounding_control.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/alpha_rounding_control.hpp
@@ -23,84 +23,81 @@ namespace numeric {
 namespace interval_lib {
 namespace detail {
-#if defined(__GNUC__ )
-  typedef union {
+#if defined(__GNUC__)
+typedef union
+{
    ::boost::long_long_type imode;
-    double dmode;
-  } rounding_mode_struct;
+    double dmode;
+} rounding_mode_struct;
-  // set bits 59-58 (DYN),
-  // clear all exception bits and disable overflow (51) and inexact exceptions (62)
-  static const rounding_mode_struct mode_upward = { 0x4C08000000000000LL };
-  static const rounding_mode_struct mode_downward = { 0x4408000000000000LL };
-  static const rounding_mode_struct mode_to_nearest = { 0x4808000000000000LL };
-  static const rounding_mode_struct mode_toward_zero = { 0x4008000000000000LL };
+// set bits 59-58 (DYN),
+// clear all exception bits and disable overflow (51) and inexact exceptions (62)
+static const rounding_mode_struct mode_upward = {0x4C08000000000000LL};
+static const rounding_mode_struct mode_downward = {0x4408000000000000LL};
+static const rounding_mode_struct mode_to_nearest = {0x4808000000000000LL};
+static const rounding_mode_struct mode_toward_zero = {0x4008000000000000LL};
-  struct alpha_rounding_control
-  {
+struct alpha_rounding_control
+{
    typedef double rounding_mode;
-    static void set_rounding_mode(const rounding_mode mode)
-    { __asm__ __volatile__ ("mt_fpcr %0" : : "f"(mode)); }
+    static void set_rounding_mode(const rounding_mode mode) { __asm__ __volatile__("mt_fpcr %0" : : "f"(mode)); }
-    static void get_rounding_mode(rounding_mode& mode)
-    { __asm__ __volatile__ ("mf_fpcr %0" : "=f"(mode)); }
+    static void get_rounding_mode(rounding_mode &mode) { __asm__ __volatile__("mf_fpcr %0" : "=f"(mode)); }
-    static void downward() { set_rounding_mode(mode_downward.dmode); }
-    static void upward() { set_rounding_mode(mode_upward.dmode); }
-    static void to_nearest() { set_rounding_mode(mode_to_nearest.dmode); }
+    static void downward() { set_rounding_mode(mode_downward.dmode); }
+    static void upward() { set_rounding_mode(mode_upward.dmode); }
+    static void to_nearest() { set_rounding_mode(mode_to_nearest.dmode); }
    static void toward_zero() { set_rounding_mode(mode_toward_zero.dmode); }
-  };
+};
 #elif defined(__digital__) || defined(__DECCXX)
 #if defined(__DECCXX) && !(defined(__FLT_ROUNDS) && __FLT_ROUNDS == -1)
 #error Dynamic rounding mode not enabled. See cxx man page for details.
 #endif
-  struct alpha_rounding_control
-  {
+struct alpha_rounding_control
+{
    typedef unsigned int rounding_mode;
-    static void set_rounding_mode(const rounding_mode& mode) { write_rnd(mode); }
-    static void get_rounding_mode(rounding_mode& mode) { mode = read_rnd(); }
+    static void set_rounding_mode(const rounding_mode &mode) { write_rnd(mode); }
+    static void get_rounding_mode(rounding_mode &mode) { mode = read_rnd(); }
-    static void downward() { set_rounding_mode(FP_RND_RM); }
-    static void upward() { set_rounding_mode(FP_RND_RP); }
-    static void to_nearest() { set_rounding_mode(FP_RND_RN); }
+    static void downward() { set_rounding_mode(FP_RND_RM); }
+    static void upward() { set_rounding_mode(FP_RND_RP); }
+    static void to_nearest() { set_rounding_mode(FP_RND_RN); }
    static void toward_zero() { set_rounding_mode(FP_RND_RZ); }
-  };
+};
 #endif
 } // namespace detail
-extern "C" {
-  float rintf(float);
-  double rint(double);
-  long double rintl(long double);
+extern "C"
+{
+    float rintf(float);
+    double rint(double);
+    long double rintl(long double);
 }
-template<>
-struct rounding_control<float>:
-  detail::alpha_rounding_control
+template <> struct rounding_control<float> : detail::alpha_rounding_control
 {
-  static float force_rounding(const float r)
-  { volatile float _r = r; return _r; }
-  static float to_int(const float& x) { return rintf(x); }
+    static float force_rounding(const float r)
+    {
+        volatile float _r = r;
+        return _r;
+    }
+    static float to_int(const float &x) { return rintf(x); }
 };
-template<>
-struct rounding_control<double>:
-  detail::alpha_rounding_control
+template <> struct rounding_control<double> : detail::alpha_rounding_control
 {
-  static const double & force_rounding(const double& r) { return r; }
-  static double to_int(const double& r) { return rint(r); }
+    static const double &force_rounding(const double &r) { return r; }
+    static double to_int(const double &r) { return rint(r); }
 };
-template<>
-struct rounding_control<long double>:
-  detail::alpha_rounding_control
+template <> struct rounding_control<long double> : detail::alpha_rounding_control
 {
-  static const long double & force_rounding(const long double& r) { return r; }
-  static long double to_int(const long double& r) { return rintl(r); }
+    static const long double &force_rounding(const long double &r) { return r; }
+    static long double to_int(const long double &r) { return rintl(r); }
 };
 } // namespace interval_lib
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bcc_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bcc_rounding_control.hpp
index e3aaf046..48244c32 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bcc_rounding_control.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bcc_rounding_control.hpp
@@ -12,14 +12,14 @@
 #define BOOST_NUMERIC_INTERVAL_DETAIL_BCC_ROUNDING_CONTROL_HPP
 #ifndef __BORLANDC__
-# error This header is only intended for Borland C++.
+#error This header is only intended for Borland C++.
 #endif
 #ifndef _M_IX86
-# error This header only works on x86 CPUs.
+#error This header only works on x86 CPUs.
 #endif
-#include <float.h>  // Borland C++ rounding control
+#include <float.h> // Borland C++ rounding control
 namespace boost {
 namespace numeric {
@@ -27,26 +27,31 @@ namespace interval_lib {
 namespace detail {
 #ifndef BOOST_NUMERIC_INTERVAL_KEEP_EXCEPTIONS_FOR_BCC
-extern "C" { unsigned int _RTLENTRY _fm_init(void); }
+extern "C"
+{
+    unsigned int _RTLENTRY _fm_init(void);
+}
-struct borland_workaround {
-  borland_workaround() { _fm_init(); }
+struct borland_workaround
+{
+    borland_workaround() { _fm_init(); }
 };
 static borland_workaround borland_workaround_exec;
 #endif // BOOST_NUMERIC_INTERVAL_KEEP_EXCEPTIONS_FOR_BCC
 __inline double rint(double)
-{ __emit__(0xD9); __emit__(0xFC); /* asm FRNDINT */ }
+{
+    __emit__(0xD9);
+    __emit__(0xFC); /* asm FRNDINT */
+}
 struct x86_rounding
 {
-  typedef unsigned int rounding_mode;
-  static void get_rounding_mode(rounding_mode& mode)
-  { mode = _control87(0, 0); }
-  static void set_rounding_mode(const rounding_mode mode)
-  { _control87(mode, 0xffff); }
-  static double to_int(const double& x) { return rint(x); }
+    typedef unsigned int rounding_mode;
+    static void get_rounding_mode(rounding_mode &mode) { mode = _control87(0, 0); }
+    static void set_rounding_mode(const rounding_mode mode) { _control87(mode, 0xffff); }
+    static double to_int(const double &x) { return rint(x); }
 };
 } // namespace detail
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bugs.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bugs.hpp
index cc37988d..29f2d424 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bugs.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/bugs.hpp
@@ -13,36 +13,37 @@
 #include <boost/config.hpp>
-#if defined(__GLIBC__) && (defined(__USE_MISC) || defined(__USE_XOPEN_EXTENDED) || defined(__USE_ISOC99)) && !defined(__ICC)
-# define BOOST_HAS_INV_HYPERBOLIC
+#if defined(__GLIBC__) && (defined(__USE_MISC) || defined(__USE_XOPEN_EXTENDED) || defined(__USE_ISOC99)) \
+    && !defined(__ICC)
+#define BOOST_HAS_INV_HYPERBOLIC
 #endif
 #ifdef BOOST_NO_STDC_NAMESPACE
-# define BOOST_NUMERIC_INTERVAL_using_math(a) using ::a
-# ifdef BOOST_HAS_INV_HYPERBOLIC
-# define BOOST_NUMERIC_INTERVAL_using_ahyp(a) using ::a
-# endif
+#define BOOST_NUMERIC_INTERVAL_using_math(a) using ::a
+#ifdef BOOST_HAS_INV_HYPERBOLIC
+#define BOOST_NUMERIC_INTERVAL_using_ahyp(a) using ::a
+#endif
 #else
-# define BOOST_NUMERIC_INTERVAL_using_math(a) using std::a
-# if defined(BOOST_HAS_INV_HYPERBOLIC)
-# if defined(__GLIBCPP__) || defined(__GLIBCXX__)
-# define BOOST_NUMERIC_INTERVAL_using_ahyp(a) using ::a
-# else
-# define BOOST_NUMERIC_INTERVAL_using_ahyp(a) using std::a
-# endif
-# endif
+#define BOOST_NUMERIC_INTERVAL_using_math(a) using std::a
+#if defined(BOOST_HAS_INV_HYPERBOLIC)
+#if defined(__GLIBCPP__) || defined(__GLIBCXX__)
+#define BOOST_NUMERIC_INTERVAL_using_ahyp(a) using ::a
+#else
+#define BOOST_NUMERIC_INTERVAL_using_ahyp(a) using std::a
+#endif
+#endif
 #endif
 #if defined(__COMO__) || defined(BOOST_INTEL)
-# define BOOST_NUMERIC_INTERVAL_using_max(a) using std::a
+#define BOOST_NUMERIC_INTERVAL_using_max(a) using std::a
 #elif defined(BOOST_NO_STDC_NAMESPACE)
-# define BOOST_NUMERIC_INTERVAL_using_max(a) using ::a
+#define BOOST_NUMERIC_INTERVAL_using_max(a) using ::a
 #else
-# define BOOST_NUMERIC_INTERVAL_using_max(a) using std::a
+#define BOOST_NUMERIC_INTERVAL_using_max(a) using std::a
 #endif
 #ifndef BOOST_NUMERIC_INTERVAL_using_ahyp
-# define BOOST_NUMERIC_INTERVAL_using_ahyp(a)
+#define BOOST_NUMERIC_INTERVAL_using_ahyp(a)
 #endif
 #if defined(__GNUC__) && (__GNUC__ <= 2)
@@ -51,27 +52,27 @@
 #include
 namespace boost {
 namespace numeric {
-  using std::min;
-  using std::max;
-  using std::sqrt;
-  using std::exp;
-  using std::log;
-  using std::cos;
-  using std::tan;
-  using std::asin;
-  using std::acos;
-  using std::atan;
-  using std::ceil;
-  using std::floor;
-  using std::sinh;
-  using std::cosh;
-  using std::tanh;
-# undef BOOST_NUMERIC_INTERVAL_using_max
-# undef BOOST_NUMERIC_INTERVAL_using_math
-# define BOOST_NUMERIC_INTERVAL_using_max(a)
-# define BOOST_NUMERIC_INTERVAL_using_math(a)
-# undef BOOST_NUMERIC_INTERVAL_using_ahyp
-# define BOOST_NUMERIC_INTERVAL_using_ahyp(a)
+using std::acos;
+using std::asin;
+using std::atan;
+using std::ceil;
+using std::cos;
+using std::cosh;
+using std::exp;
+using std::floor;
+using std::log;
+using std::max;
+using std::min;
+using std::sinh;
+using std::sqrt;
+using std::tan;
+using std::tanh;
+#undef BOOST_NUMERIC_INTERVAL_using_max
+#undef BOOST_NUMERIC_INTERVAL_using_math
+#define BOOST_NUMERIC_INTERVAL_using_max(a)
+#define BOOST_NUMERIC_INTERVAL_using_math(a)
+#undef BOOST_NUMERIC_INTERVAL_using_ahyp
+#define BOOST_NUMERIC_INTERVAL_using_ahyp(a)
 } // namespace numeric
 } // namespace boost
 #endif
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99_rounding_control.hpp
index 181d2866..dfcf1726 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99_rounding_control.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99_rounding_control.hpp
@@ -18,25 +18,28 @@ namespace numeric {
 namespace interval_lib {
 namespace detail {
-struct c99_rounding_control: c99_rounding
+struct c99_rounding_control : c99_rounding
 {
-  template<class T>
-  static T force_rounding(const T& r) { volatile T r_ = r; return r_; }
+    template <class T> static T force_rounding(const T &r)
+    {
+        volatile T r_ = r;
+        return r_;
+    }
 };
 } // namespace detail
-template<>
-struct rounding_control<float>:
-  detail::c99_rounding_control { };
+template <> struct rounding_control<float> : detail::c99_rounding_control
+{
+};
-template<>
-struct rounding_control<double>:
-  detail::c99_rounding_control { };
+template <> struct rounding_control<double> : detail::c99_rounding_control
+{
+};
-template<>
-struct rounding_control<long double>:
-  detail::c99_rounding_control { };
+template <> struct rounding_control<long double> : detail::c99_rounding_control
+{
+};
 } // namespace interval_lib
 } // namespace numeric
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99sub_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99sub_rounding_control.hpp
index 571c51fc..53faf0d1 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99sub_rounding_control.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/c99sub_rounding_control.hpp
@@ -11,28 +11,30 @@
 #ifndef BOOST_NUMERIC_INTERVAL_DETAIL_C99SUB_ROUNDING_CONTROL_HPP
 #define BOOST_NUMERIC_INTERVAL_DETAIL_C99SUB_ROUNDING_CONTROL_HPP
-#include <fenv.h>  // ISO C 99 rounding mode control
+#include <fenv.h> // ISO C 99 rounding mode control
 namespace boost {
 namespace numeric {
 namespace interval_lib {
 namespace detail {
-extern "C" { double rint(double); }
+extern "C"
+{
+    double rint(double);
+}
 struct c99_rounding
 {
-  typedef int rounding_mode;
+    typedef int rounding_mode;
-  static void set_rounding_mode(const rounding_mode mode) { fesetround(mode); }
-  static void get_rounding_mode(rounding_mode &mode) { mode = fegetround(); }
-  static void downward() { set_rounding_mode(FE_DOWNWARD); }
-  static void upward() { set_rounding_mode(FE_UPWARD); }
-  static void to_nearest() { set_rounding_mode(FE_TONEAREST); }
-  static void toward_zero() { set_rounding_mode(FE_TOWARDZERO); }
+    static void set_rounding_mode(const rounding_mode mode) { fesetround(mode); }
+    static void get_rounding_mode(rounding_mode &mode) { mode = fegetround(); }
+    static void downward() { set_rounding_mode(FE_DOWNWARD); }
+    static void upward() { set_rounding_mode(FE_UPWARD); }
+    static void to_nearest() { set_rounding_mode(FE_TONEAREST); }
+    static void toward_zero() { set_rounding_mode(FE_TOWARDZERO); }
-  template<class T>
-  static T to_int(const T& r) { return rint(r); }
+    template <class T> static T to_int(const T &r) { return rint(r); }
 };
 } // namespace detail
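On the C99 path, the whole rounding policy reduces to fegetround/fesetround. A standalone sketch of the save/switch/restore pattern the interval arithmetic relies on (volatile blocks constant folding; a strictly conforming build also wants #pragma STDC FENV_ACCESS ON):

    #include <cfenv>
    #include <cstdio>

    int main()
    {
        const int saved = std::fegetround();
        volatile double one = 1.0, three = 3.0;

        std::fesetround(FE_UPWARD);
        double up = one / three; // rounded toward +infinity

        std::fesetround(FE_DOWNWARD);
        double down = one / three; // rounded toward -infinity

        std::fesetround(saved);         // always restore the caller's mode
        std::printf("%d\n", down < up); // the two results bracket the exact 1/3
    }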
diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/division.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/division.hpp
index 24fb025a..704f3bdb 100644
--- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/division.hpp
+++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/division.hpp
@@ -10,180 +10,184 @@
 #ifndef BOOST_NUMERIC_INTERVAL_DETAIL_DIVISION_HPP
 #define BOOST_NUMERIC_INTERVAL_DETAIL_DIVISION_HPP
-#include
+#include
 #include
+#include
 #include
 #include
-#include
 namespace boost {
 namespace numeric {
 namespace interval_lib {
 namespace detail {
-template<class T, class Policies> inline
-interval<T, Policies> div_non_zero(const interval<T, Policies>& x,
-                                   const interval<T, Policies>& y)
+template <class T, class Policies>
+inline interval<T, Policies> div_non_zero(const interval<T, Policies> &x, const interval<T, Policies> &y)
 {
-  // assert(!in_zero(y));
-  typename Policies::rounding rnd;
-  typedef interval<T, Policies> I;
-  const T& xl = x.lower();
-  const T& xu = x.upper();
-  const T& yl = y.lower();
-  const T& yu = y.upper();
-  if (::boost::numeric::interval_lib::user::is_neg(xu))
-    if (::boost::numeric::interval_lib::user::is_neg(yu))
-      return I(rnd.div_down(xu, yl), rnd.div_up(xl, yu), true);
+    // assert(!in_zero(y));
+    typename Policies::rounding rnd;
+    typedef interval<T, Policies> I;
+    const T &xl = x.lower();
+    const T &xu = x.upper();
+    const T &yl = y.lower();
+    const T &yu = y.upper();
+    if (::boost::numeric::interval_lib::user::is_neg(xu))
+        if (::boost::numeric::interval_lib::user::is_neg(yu))
+            return I(rnd.div_down(xu, yl), rnd.div_up(xl, yu), true);
+        else
+            return I(rnd.div_down(xl, yl), rnd.div_up(xu, yu), true);
+    else if (::boost::numeric::interval_lib::user::is_neg(xl))
+        if (::boost::numeric::interval_lib::user::is_neg(yu))
+            return I(rnd.div_down(xu, yu), rnd.div_up(xl, yu), true);
+        else
+            return I(rnd.div_down(xl, yl), rnd.div_up(xu, yl), true);
+    else if (::boost::numeric::interval_lib::user::is_neg(yu))
+        return I(rnd.div_down(xu, yu), rnd.div_up(xl, yl), true);
    else
-      return I(rnd.div_down(xl, yl), rnd.div_up(xu, yu), true);
-  else if (::boost::numeric::interval_lib::user::is_neg(xl))
-    if (::boost::numeric::interval_lib::user::is_neg(yu))
-      return I(rnd.div_down(xu, yu), rnd.div_up(xl, yu), true);
+        return I(rnd.div_down(xl, yu), rnd.div_up(xu, yl), true);
+}
+
+template <class T, class Policies> inline interval<T, Policies> div_non_zero(const T &x, const interval<T, Policies> &y)
+{
+    // assert(!in_zero(y));
+    typename Policies::rounding rnd;
+    typedef interval<T, Policies> I;
+    const T &yl = y.lower();
+    const T &yu = y.upper();
+    if (::boost::numeric::interval_lib::user::is_neg(x))
+        return I(rnd.div_down(x, yl), rnd.div_up(x, yu), true);
    else
-      return I(rnd.div_down(xl, yl), rnd.div_up(xu, yl), true);
-  else
-    if (::boost::numeric::interval_lib::user::is_neg(yu))
-      return I(rnd.div_down(xu, yu), rnd.div_up(xl, yl), true);
+        return I(rnd.div_down(x, yu), rnd.div_up(x, yl), true);
+}
+
+template <class T, class Policies>
+inline interval<T, Policies> div_positive(const interval<T, Policies> &x, const T &yu)
+{
+    // assert(::boost::numeric::interval_lib::user::is_pos(yu));
+    if (::boost::numeric::interval_lib::user::is_zero(x.lower())
+        && ::boost::numeric::interval_lib::user::is_zero(x.upper()))
+        return x;
+    typename Policies::rounding rnd;
+    typedef interval<T, Policies> I;
+    const T &xl = x.lower();
+    const T &xu = x.upper();
+    typedef typename Policies::checking checking;
+    if (::boost::numeric::interval_lib::user::is_neg(xu))
+        return I(checking::neg_inf(), rnd.div_up(xu, yu), true);
+    else if (::boost::numeric::interval_lib::user::is_neg(xl))
+        return I(checking::neg_inf(), checking::pos_inf(), true);
    else
-      return I(rnd.div_down(xl, yu), rnd.div_up(xu, yl), true);
+        return I(rnd.div_down(xl, yu), checking::pos_inf(), true);
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_non_zero(const T& x, const interval<T, Policies>& y)
+template <class T, class Policies> inline interval<T, Policies> div_positive(const T &x, const T &yu)
 {
-  // assert(!in_zero(y));
-  typename Policies::rounding rnd;
-  typedef interval<T, Policies> I;
-  const T& yl = y.lower();
-  const T& yu = y.upper();
-  if (::boost::numeric::interval_lib::user::is_neg(x))
-    return I(rnd.div_down(x, yl), rnd.div_up(x, yu), true);
-  else
-    return I(rnd.div_down(x, yu), rnd.div_up(x, yl), true);
+    // assert(::boost::numeric::interval_lib::user::is_pos(yu));
+    typedef interval<T, Policies> I;
+    if (::boost::numeric::interval_lib::user::is_zero(x))
+        return I(static_cast<T>(0), static_cast<T>(0), true);
+    typename Policies::rounding rnd;
+    typedef typename Policies::checking checking;
+    if (::boost::numeric::interval_lib::user::is_neg(x))
+        return I(checking::neg_inf(), rnd.div_up(x, yu), true);
+    else
+        return I(rnd.div_down(x, yu), checking::pos_inf(), true);
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_positive(const interval<T, Policies>& x, const T& yu)
+template <class T, class Policies>
+inline interval<T, Policies> div_negative(const interval<T, Policies> &x, const T &yl)
 {
-  // assert(::boost::numeric::interval_lib::user::is_pos(yu));
-  if (::boost::numeric::interval_lib::user::is_zero(x.lower()) &&
-      ::boost::numeric::interval_lib::user::is_zero(x.upper()))
-    return x;
-  typename Policies::rounding rnd;
-  typedef interval<T, Policies> I;
-  const T& xl = x.lower();
-  const T& xu = x.upper();
-  typedef typename Policies::checking checking;
-  if (::boost::numeric::interval_lib::user::is_neg(xu))
-    return I(checking::neg_inf(), rnd.div_up(xu, yu), true);
-  else if (::boost::numeric::interval_lib::user::is_neg(xl))
-    return I(checking::neg_inf(), checking::pos_inf(), true);
-  else
-    return I(rnd.div_down(xl, yu), checking::pos_inf(), true);
+    // assert(::boost::numeric::interval_lib::user::is_neg(yl));
+    if (::boost::numeric::interval_lib::user::is_zero(x.lower())
+        && ::boost::numeric::interval_lib::user::is_zero(x.upper()))
+        return x;
+    typename Policies::rounding rnd;
+    typedef interval<T, Policies> I;
+    const T &xl = x.lower();
+    const T &xu = x.upper();
+    typedef typename Policies::checking checking;
+    if (::boost::numeric::interval_lib::user::is_neg(xu))
+        return I(rnd.div_down(xu, yl), checking::pos_inf(), true);
+    else if (::boost::numeric::interval_lib::user::is_neg(xl))
+        return I(checking::neg_inf(), checking::pos_inf(), true);
+    else
+        return I(checking::neg_inf(), rnd.div_up(xl, yl), true);
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_positive(const T& x, const T& yu)
+template <class T, class Policies> inline interval<T, Policies> div_negative(const T &x, const T &yl)
 {
-  // assert(::boost::numeric::interval_lib::user::is_pos(yu));
-  typedef interval<T, Policies> I;
-  if (::boost::numeric::interval_lib::user::is_zero(x))
-    return I(static_cast<T>(0), static_cast<T>(0), true);
-  typename Policies::rounding rnd;
-  typedef typename Policies::checking checking;
-  if (::boost::numeric::interval_lib::user::is_neg(x))
-    return I(checking::neg_inf(), rnd.div_up(x, yu), true);
-  else
-    return I(rnd.div_down(x, yu), checking::pos_inf(), true);
+    // assert(::boost::numeric::interval_lib::user::is_neg(yl));
+    typedef interval<T, Policies> I;
+    if (::boost::numeric::interval_lib::user::is_zero(x))
+        return I(static_cast<T>(0), static_cast<T>(0), true);
+    typename Policies::rounding rnd;
+    typedef typename Policies::checking checking;
+    if (::boost::numeric::interval_lib::user::is_neg(x))
+        return I(rnd.div_down(x, yl), checking::pos_inf(), true);
+    else
+        return I(checking::neg_inf(), rnd.div_up(x, yl), true);
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_negative(const interval<T, Policies>& x, const T& yl)
+template <class T, class Policies> inline interval<T, Policies> div_zero(const interval<T, Policies> &x)
 {
-  // assert(::boost::numeric::interval_lib::user::is_neg(yl));
-  if (::boost::numeric::interval_lib::user::is_zero(x.lower()) &&
-      ::boost::numeric::interval_lib::user::is_zero(x.upper()))
-    return x;
-  typename Policies::rounding rnd;
-  typedef interval<T, Policies> I;
-  const T& xl = x.lower();
-  const T& xu = x.upper();
-  typedef typename Policies::checking checking;
-  if (::boost::numeric::interval_lib::user::is_neg(xu))
-    return I(rnd.div_down(xu, yl), checking::pos_inf(), true);
-  else if (::boost::numeric::interval_lib::user::is_neg(xl))
-    return I(checking::neg_inf(), checking::pos_inf(), true);
-  else
-    return I(checking::neg_inf(), rnd.div_up(xl, yl), true);
+    if (::boost::numeric::interval_lib::user::is_zero(x.lower())
+        && ::boost::numeric::interval_lib::user::is_zero(x.upper()))
+        return x;
+    else
+        return interval<T, Policies>::whole();
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_negative(const T& x, const T& yl)
+template <class T, class Policies> inline interval<T, Policies> div_zero(const T &x)
 {
-  // assert(::boost::numeric::interval_lib::user::is_neg(yl));
-  typedef interval<T, Policies> I;
-  if (::boost::numeric::interval_lib::user::is_zero(x))
-    return I(static_cast<T>(0), static_cast<T>(0), true);
-  typename Policies::rounding rnd;
-  typedef typename Policies::checking checking;
-  if (::boost::numeric::interval_lib::user::is_neg(x))
-    return I(rnd.div_down(x, yl), checking::pos_inf(), true);
-  else
-    return I(checking::neg_inf(), rnd.div_up(x, yl), true);
+    if (::boost::numeric::interval_lib::user::is_zero(x))
+        return interval<T, Policies>(static_cast<T>(0), static_cast<T>(0), true);
+    else
+        return interval<T, Policies>::whole();
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_zero(const interval<T, Policies>& x)
+template <class T, class Policies>
+inline interval<T, Policies> div_zero_part1(const interval<T, Policies> &x, const interval<T, Policies> &y, bool &b)
 {
-  if (::boost::numeric::interval_lib::user::is_zero(x.lower()) &&
-      ::boost::numeric::interval_lib::user::is_zero(x.upper()))
-    return x;
-  else return interval<T, Policies>::whole();
+    // assert(::boost::numeric::interval_lib::user::is_neg(y.lower()) &&
+    //        ::boost::numeric::interval_lib::user::is_pos(y.upper()));
+    if (::boost::numeric::interval_lib::user::is_zero(x.lower())
+        && ::boost::numeric::interval_lib::user::is_zero(x.upper())) {
+        b = false;
+        return x;
+    }
+    typename Policies::rounding rnd;
+    typedef interval<T, Policies> I;
+    const T &xl = x.lower();
+    const T &xu = x.upper();
+    const T &yl = y.lower();
+    const T &yu = y.upper();
+    typedef typename Policies::checking checking;
+    if (::boost::numeric::interval_lib::user::is_neg(xu)) {
+        b = true;
+        return I(checking::neg_inf(), rnd.div_up(xu, yu), true);
+    }
+    else if (::boost::numeric::interval_lib::user::is_neg(xl)) {
+        b = false;
+        return I(checking::neg_inf(), checking::pos_inf(), true);
+    }
+    else {
+        b = true;
+        return I(checking::neg_inf(), rnd.div_up(xl, yl), true);
+    }
 }
-template<class T, class Policies> inline
-interval<T, Policies> div_zero(const T& x)
+template <class T, class Policies>
+inline interval<T, Policies> div_zero_part2(const interval<T, Policies> &x, const interval<T, Policies> &y)
 {
-  if (::boost::numeric::interval_lib::user::is_zero(x))
-    return interval<T, Policies>(static_cast<T>(0), static_cast<T>(0), true);
-  else return interval<T, Policies>::whole();
-}
-
-template<class T, class Policies> inline
-interval<T, Policies> div_zero_part1(const interval<T, Policies>& x,
-                                     const interval<T, Policies>& y, bool& b)
-{
-  // assert(::boost::numeric::interval_lib::user::is_neg(y.lower()) && ::boost::numeric::interval_lib::user::is_pos(y.upper()));
-  if (::boost::numeric::interval_lib::user::is_zero(x.lower()) && ::boost::numeric::interval_lib::user::is_zero(x.upper()))
-  { b = false; return x; }
-  typename Policies::rounding rnd;
-  typedef interval<T, Policies> I;
-  const T& xl = x.lower();
-  const T& xu = x.upper();
-  const T& yl = y.lower();
-  const T& yu = y.upper();
-  typedef typename Policies::checking checking;
-  if (::boost::numeric::interval_lib::user::is_neg(xu))
-  { b = true; return I(checking::neg_inf(), rnd.div_up(xu, yu), true); }
-  else if (::boost::numeric::interval_lib::user::is_neg(xl))
-  { b = false; return I(checking::neg_inf(), checking::pos_inf(), true); }
-  else
-  { b = true; return I(checking::neg_inf(), rnd.div_up(xl, yl), true); }
-}
-
-template<class T, class Policies> inline
-interval<T, Policies> div_zero_part2(const interval<T, Policies>& x,
-                                     const interval<T, Policies>& y)
-{
-  // assert(::boost::numeric::interval_lib::user::is_neg(y.lower()) && ::boost::numeric::interval_lib::user::is_pos(y.upper()) && (div_zero_part1(x, y, b), b));
-  typename Policies::rounding rnd;
-  typedef interval<T, Policies> I;
-  typedef typename Policies::checking checking;
-  if (::boost::numeric::interval_lib::user::is_neg(x.upper()))
-    return I(rnd.div_down(x.upper(), y.lower()), checking::pos_inf(), true);
-  else
-    return I(rnd.div_down(x.lower(), y.upper()), checking::pos_inf(), true);
+    // assert(::boost::numeric::interval_lib::user::is_neg(y.lower()) &&
+    //        ::boost::numeric::interval_lib::user::is_pos(y.upper()) && (div_zero_part1(x, y, b), b));
+    typename Policies::rounding rnd;
+    typedef interval<T, Policies> I;
+    typedef typename Policies::checking checking;
+    if (::boost::numeric::interval_lib::user::is_neg(x.upper()))
+        return I(rnd.div_down(x.upper(), y.lower()), checking::pos_inf(), true);
+    else
+        return I(rnd.div_down(x.lower(), y.upper()), checking::pos_inf(), true);
 }
 } // namespace detail
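For orientation, what div_zero_part1/div_zero_part2 compute: dividing an interval that excludes zero by one that straddles zero yields two unbounded rays, delivered one per call, with b reporting whether a second part exists. Worked in plain doubles (illustrative only, no directed rounding):

    #include <cstdio>

    int main()
    {
        // x = [1, 2], y = [-1, 1]: the true quotient set is (-inf, -1] U [1, +inf).
        double xl = 1.0, yl = -1.0, yu = 1.0;
        std::printf("(-inf, %g]\n", xl / yl); // part 1 upper bound: -1
        std::printf("[%g, +inf)\n", xl / yu); // part 2 lower bound:  1
    }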
void get_rounding_mode(rounding_mode &mode) { mode = fegetround(); } - static void downward() { set_rounding_mode(FE_DOWNWARD); } - static void upward() { set_rounding_mode(FE_UPWARD); } - static void to_nearest() { set_rounding_mode(FE_TONEAREST); } + static void downward() { set_rounding_mode(FE_DOWNWARD); } + static void upward() { set_rounding_mode(FE_UPWARD); } + static void to_nearest() { set_rounding_mode(FE_TONEAREST); } static void toward_zero() { set_rounding_mode(FE_TOWARDZERO); } - }; +}; } // namespace detail -extern "C" { - float rintf(float); - double rint(double); - long double rintl(long double); +extern "C" +{ + float rintf(float); + double rint(double); + long double rintl(long double); } -template<> -struct rounding_control: - detail::ia64_rounding_control +template <> struct rounding_control : detail::ia64_rounding_control { - static float force_rounding(const float r) - { volatile float _r = r; return _r; } - static float to_int(const float& x) { return rintf(x); } + static float force_rounding(const float r) + { + volatile float _r = r; + return _r; + } + static float to_int(const float &x) { return rintf(x); } }; -template<> -struct rounding_control: - detail::ia64_rounding_control +template <> struct rounding_control : detail::ia64_rounding_control { - static const double & force_rounding(const double& r) { return r; } - static double to_int(const double& r) { return rint(r); } + static const double &force_rounding(const double &r) { return r; } + static double to_int(const double &r) { return rint(r); } }; -template<> -struct rounding_control: - detail::ia64_rounding_control +template <> struct rounding_control : detail::ia64_rounding_control { - static const long double & force_rounding(const long double& r) { return r; } - static long double to_int(const long double& r) { return rintl(r); } + static const long double &force_rounding(const long double &r) { return r; } + static long double to_int(const long double &r) { return rintl(r); } }; } // namespace interval_lib @@ -80,4 +77,3 @@ struct rounding_control: #endif /* __hpux */ #endif /* BOOST_NUMERIC_INTERVAL_DETAIL_IA64_ROUNDING_CONTROL_HPP */ - diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/interval_prototype.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/interval_prototype.hpp index ac9029b7..350364b8 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/interval_prototype.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/interval_prototype.hpp @@ -15,25 +15,23 @@ namespace numeric { namespace interval_lib { -template struct rounded_math; -template struct checking_strict; +template struct rounded_math; +template struct checking_strict; class comparison_error; -template struct policies; +template struct policies; /* * default policies class */ -template -struct default_policies +template struct default_policies { - typedef policies, checking_strict > type; + typedef policies, checking_strict> type; }; - + } // namespace interval_lib -template::type > -class interval; +template ::type> class interval; } // namespace numeric } // namespace boost diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/msvc_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/msvc_rounding_control.hpp index 53f307b8..e9a1118f 100644 --- 
a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/msvc_rounding_control.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/msvc_rounding_control.hpp @@ -12,10 +12,10 @@ #define BOOST_NUMERIC_INTERVAL_DETAIL_MSVC_ROUNDING_CONTROL_HPP #ifndef _MSC_VER -# error This header is only intended for MSVC, but might work for Borland as well +#error This header is only intended for MSVC, but might work for Borland as well #endif -#include // MSVC rounding control +#include // MSVC rounding control // Although the function is called _control87, it seems to work for // other FPUs too, so it does not have to be changed to _controlfp. @@ -25,74 +25,123 @@ namespace numeric { namespace interval_lib { namespace detail { -extern "C" { double rint(double); } +extern "C" +{ + double rint(double); +} struct x86_rounding { - static unsigned int hard2msvc(unsigned short m) { - unsigned int n = 0; - if (m & 0x01) n |= _EM_INVALID; - if (m & 0x02) n |= _EM_DENORMAL; - if (m & 0x04) n |= _EM_ZERODIVIDE; - if (m & 0x08) n |= _EM_OVERFLOW; - if (m & 0x10) n |= _EM_UNDERFLOW; - if (m & 0x20) n |= _EM_INEXACT; - switch (m & 0x300) { - case 0x000: n |= _PC_24; break; - case 0x200: n |= _PC_53; break; - case 0x300: n |= _PC_64; break; + static unsigned int hard2msvc(unsigned short m) + { + unsigned int n = 0; + if (m & 0x01) + n |= _EM_INVALID; + if (m & 0x02) + n |= _EM_DENORMAL; + if (m & 0x04) + n |= _EM_ZERODIVIDE; + if (m & 0x08) + n |= _EM_OVERFLOW; + if (m & 0x10) + n |= _EM_UNDERFLOW; + if (m & 0x20) + n |= _EM_INEXACT; + switch (m & 0x300) { + case 0x000: + n |= _PC_24; + break; + case 0x200: + n |= _PC_53; + break; + case 0x300: + n |= _PC_64; + break; + } + switch (m & 0xC00) { + case 0x000: + n |= _RC_NEAR; + break; + case 0x400: + n |= _RC_DOWN; + break; + case 0x800: + n |= _RC_UP; + break; + case 0xC00: + n |= _RC_CHOP; + break; + } + if (m & 0x1000) + n |= _IC_AFFINE; // only useful on 287 + return n; } - switch (m & 0xC00) { - case 0x000: n |= _RC_NEAR; break; - case 0x400: n |= _RC_DOWN; break; - case 0x800: n |= _RC_UP; break; - case 0xC00: n |= _RC_CHOP; break; - } - if (m & 0x1000) n |= _IC_AFFINE; // only useful on 287 - return n; - } - static unsigned short msvc2hard(unsigned int n) { - unsigned short m = 0; - if (n & _EM_INVALID) m |= 0x01; - if (n & _EM_DENORMAL) m |= 0x02; - if (n & _EM_ZERODIVIDE) m |= 0x04; - if (n & _EM_OVERFLOW) m |= 0x08; - if (n & _EM_UNDERFLOW) m |= 0x10; - if (n & _EM_INEXACT) m |= 0x20; - switch (n & _MCW_RC) { - case _RC_NEAR: m |= 0x000; break; - case _RC_DOWN: m |= 0x400; break; - case _RC_UP: m |= 0x800; break; - case _RC_CHOP: m |= 0xC00; break; + static unsigned short msvc2hard(unsigned int n) + { + unsigned short m = 0; + if (n & _EM_INVALID) + m |= 0x01; + if (n & _EM_DENORMAL) + m |= 0x02; + if (n & _EM_ZERODIVIDE) + m |= 0x04; + if (n & _EM_OVERFLOW) + m |= 0x08; + if (n & _EM_UNDERFLOW) + m |= 0x10; + if (n & _EM_INEXACT) + m |= 0x20; + switch (n & _MCW_RC) { + case _RC_NEAR: + m |= 0x000; + break; + case _RC_DOWN: + m |= 0x400; + break; + case _RC_UP: + m |= 0x800; + break; + case _RC_CHOP: + m |= 0xC00; + break; + } + switch (n & _MCW_PC) { + case _PC_24: + m |= 0x000; + break; + case _PC_53: + m |= 0x200; + break; + case _PC_64: + m |= 0x300; + break; + } + if ((n & _MCW_IC) == _IC_AFFINE) + m |= 0x1000; + return m; } - switch (n & _MCW_PC) { - case _PC_24: m |= 0x000; break; - case _PC_53: m |= 0x200; break; - case _PC_64: m |= 0x300; break; - } - if ((n & _MCW_IC) == _IC_AFFINE) m |= 0x1000; - 
return m; - } - typedef unsigned short rounding_mode; - static void get_rounding_mode(rounding_mode& mode) - { mode = msvc2hard(_control87(0, 0)); } - static void set_rounding_mode(const rounding_mode mode) - { _control87(hard2msvc(mode), - _MCW_EM | _MCW_RC + typedef unsigned short rounding_mode; + static void get_rounding_mode(rounding_mode &mode) { mode = msvc2hard(_control87(0, 0)); } + static void set_rounding_mode(const rounding_mode mode) + { + _control87(hard2msvc(mode), + _MCW_EM + | _MCW_RC // This is updated as per fix in boost 1.58 #if !defined(_M_AMD64) && !defined(_M_ARM) - // x64 ignores _MCW_PC and _MCW_IC, and the Debug CRT library actually - // asserts when these are passed to _control87. - // MSDN says on '_control87' that changing precision (_MCW_PC) or - // infinity (_MCW_IC) handling is not supported on the ARM and x64 - // architectures and that _control87 raises an assertion - // and the invalid parameter handler is invoked. - | _MCW_PC | _MCW_IC + // x64 ignores _MCW_PC and _MCW_IC, and the Debug CRT library actually + // asserts when these are passed to _control87. + // MSDN says on '_control87' that changing precision (_MCW_PC) or + // infinity (_MCW_IC) handling is not supported on the ARM and x64 + // architectures and that _control87 raises an assertion + // and the invalid parameter handler is invoked. + | _MCW_PC | _MCW_IC #endif - ); } - static double to_int(const double& x) { return rint(x); } + ); + } + static double to_int(const double &x) { return rint(x); } }; } // namespace detail diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/ppc_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/ppc_rounding_control.hpp index 8b80a09f..0a194596 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/ppc_rounding_control.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/ppc_rounding_control.hpp @@ -16,37 +16,36 @@ #error This header only works on PPC CPUs. 
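The hard2msvc/msvc2hard helpers reformatted above translate between the raw x87 control-word bits and the _control87 flag constants; the rounding direction lives in the two bits under mask 0xC00. A small, platform-independent decoder for just that field, reusing the constants from the switch statements (x87_rounding_name is a made-up name for illustration):

#include <cstdio>

static const char *x87_rounding_name(unsigned cw)
{
    switch (cw & 0xC00) {              // the two rounding-control bits
    case 0x000: return "to nearest";
    case 0x400: return "downward";
    case 0x800: return "upward";
    default:    return "toward zero";  // 0xC00
    }
}

int main()
{
    // 0x137f..0x1f7f are the "exceptions masked, extended precision" control
    // words used by rnd_mode in x86_rounding_control.hpp further down.
    static const unsigned words[] = {0x137f, 0x177f, 0x1b7f, 0x1f7f};
    for (unsigned i = 0; i < 4; ++i)
        std::printf("0x%04x -> %s\n", words[i], x87_rounding_name(words[i]));
    return 0;
}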
#endif -#if defined(__GNUC__ ) || (__IBMCPP__ >= 700) +#if defined(__GNUC__) || (__IBMCPP__ >= 700) namespace boost { namespace numeric { namespace interval_lib { namespace detail { -typedef union { - ::boost::long_long_type imode; - double dmode; +typedef union +{ + ::boost::long_long_type imode; + double dmode; } rounding_mode_struct; -static const rounding_mode_struct mode_upward = { static_cast(0xFFF8000000000002LL) }; -static const rounding_mode_struct mode_downward = { static_cast(0xFFF8000000000003LL) }; -static const rounding_mode_struct mode_to_nearest = { static_cast(0xFFF8000000000000LL) }; -static const rounding_mode_struct mode_toward_zero = { static_cast(0xFFF8000000000001LL) }; +static const rounding_mode_struct mode_upward = {static_cast(0xFFF8000000000002LL)}; +static const rounding_mode_struct mode_downward = {static_cast(0xFFF8000000000003LL)}; +static const rounding_mode_struct mode_to_nearest = {static_cast(0xFFF8000000000000LL)}; +static const rounding_mode_struct mode_toward_zero = {static_cast(0xFFF8000000000001LL)}; struct ppc_rounding_control { - typedef double rounding_mode; + typedef double rounding_mode; - static void set_rounding_mode(const rounding_mode mode) - { __asm__ __volatile__ ("mtfsf 255,%0" : : "f"(mode)); } + static void set_rounding_mode(const rounding_mode mode) { __asm__ __volatile__("mtfsf 255,%0" : : "f"(mode)); } - static void get_rounding_mode(rounding_mode& mode) - { __asm__ __volatile__ ("mffs %0" : "=f"(mode)); } + static void get_rounding_mode(rounding_mode &mode) { __asm__ __volatile__("mffs %0" : "=f"(mode)); } - static void downward() { set_rounding_mode(mode_downward.dmode); } - static void upward() { set_rounding_mode(mode_upward.dmode); } - static void to_nearest() { set_rounding_mode(mode_to_nearest.dmode); } - static void toward_zero() { set_rounding_mode(mode_toward_zero.dmode); } + static void downward() { set_rounding_mode(mode_downward.dmode); } + static void upward() { set_rounding_mode(mode_upward.dmode); } + static void to_nearest() { set_rounding_mode(mode_to_nearest.dmode); } + static void toward_zero() { set_rounding_mode(mode_toward_zero.dmode); } }; } // namespace detail @@ -54,39 +53,34 @@ struct ppc_rounding_control // Do not declare the following C99 symbols if provides them. // Otherwise, conflicts may occur, due to differences between prototypes. 
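Every per-architecture struct in these headers (ia64, MSVC, PPC, SPARC, x86) exposes the same minimal interface: set/get the FPU rounding mode plus the four named direction setters. A portable sketch of that interface in terms of C99 <cfenv>, roughly what the c99_rounding fallback used by x86_rounding_control.hpp provides (c99_rounding_control_sketch is an illustrative name, not the Boost one):

#include <cfenv>
#include <cstdio>

struct c99_rounding_control_sketch
{
    typedef int rounding_mode;

    static void set_rounding_mode(rounding_mode m) { std::fesetround(m); }
    static void get_rounding_mode(rounding_mode &m) { m = std::fegetround(); }

    static void downward()    { set_rounding_mode(FE_DOWNWARD); }
    static void upward()      { set_rounding_mode(FE_UPWARD); }
    static void to_nearest()  { set_rounding_mode(FE_TONEAREST); }
    static void toward_zero() { set_rounding_mode(FE_TOWARDZERO); }
};

int main()
{
    c99_rounding_control_sketch::downward();
    volatile double lo = 1.0 / 3.0;   // rounded toward -inf
    c99_rounding_control_sketch::upward();
    volatile double hi = 1.0 / 3.0;   // rounded toward +inf
    c99_rounding_control_sketch::to_nearest();
    std::printf("1/3 in [%.17g, %.17g], width %g\n", lo, hi, hi - lo);
    return 0;
}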
#if !defined(_ISOC99_SOURCE) && !defined(__USE_ISOC99) -extern "C" { - float rintf(float); - double rint(double); +extern "C" +{ + float rintf(float); + double rint(double); } #endif -template<> -struct rounding_control: - detail::ppc_rounding_control +template <> struct rounding_control : detail::ppc_rounding_control { - static float force_rounding(const float r) - { - float tmp; - __asm__ __volatile__ ("frsp %0, %1" : "=f" (tmp) : "f" (r)); - return tmp; - } - static float to_int(const float& x) { return rintf(x); } + static float force_rounding(const float r) + { + float tmp; + __asm__ __volatile__("frsp %0, %1" : "=f"(tmp) : "f"(r)); + return tmp; + } + static float to_int(const float &x) { return rintf(x); } }; -template<> -struct rounding_control: - detail::ppc_rounding_control +template <> struct rounding_control : detail::ppc_rounding_control { - static const double & force_rounding(const double& r) { return r; } - static double to_int(const double& r) { return rint(r); } + static const double &force_rounding(const double &r) { return r; } + static double to_int(const double &r) { return rint(r); } }; -template<> -struct rounding_control: - detail::ppc_rounding_control +template <> struct rounding_control : detail::ppc_rounding_control { - static const long double & force_rounding(const long double& r) { return r; } - static long double to_int(const long double& r) { return rint(static_cast(r)); } + static const long double &force_rounding(const long double &r) { return r; } + static long double to_int(const long double &r) { return rint(static_cast(r)); } }; } // namespace interval_lib diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/sparc_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/sparc_rounding_control.hpp index 6ba5baf0..752a36c3 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/sparc_rounding_control.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/sparc_rounding_control.hpp @@ -14,11 +14,11 @@ #define BOOST_NUMERIC_INTERVAL_DETAIL_SPARC_ROUNDING_CONTROL_HPP #if !defined(sparc) && !defined(__sparc__) -# error This header is only intended for SPARC CPUs. +#error This header is only intended for SPARC CPUs. #endif #ifdef __SUNPRO_CC -# include +#include #endif @@ -29,78 +29,73 @@ namespace detail { struct sparc_rounding_control { - typedef unsigned int rounding_mode; + typedef unsigned int rounding_mode; - static void set_rounding_mode(const rounding_mode& mode) - { -# if defined(__GNUC__) - __asm__ __volatile__("ld %0, %%fsr" : : "m"(mode)); -# elif defined (__SUNPRO_CC) - fpsetround(fp_rnd(mode)); -# elif defined(__KCC) - asm("sethi %hi(mode), %o1"); - asm("ld [%o1+%lo(mode)], %fsr"); -# else -# error Unsupported compiler for Sparc rounding control. -# endif - } + static void set_rounding_mode(const rounding_mode &mode) + { +#if defined(__GNUC__) + __asm__ __volatile__("ld %0, %%fsr" : : "m"(mode)); +#elif defined(__SUNPRO_CC) + fpsetround(fp_rnd(mode)); +#elif defined(__KCC) + asm("sethi %hi(mode), %o1"); + asm("ld [%o1+%lo(mode)], %fsr"); +#else +#error Unsupported compiler for Sparc rounding control. 
+#endif + } - static void get_rounding_mode(rounding_mode& mode) - { -# if defined(__GNUC__) - __asm__ __volatile__("st %%fsr, %0" : "=m"(mode)); -# elif defined (__SUNPRO_CC) - mode = fpgetround(); -# elif defined(__KCC) -# error KCC on Sun SPARC get_round_mode: please fix me - asm("st %fsr, [mode]"); -# else -# error Unsupported compiler for Sparc rounding control. -# endif - } + static void get_rounding_mode(rounding_mode &mode) + { +#if defined(__GNUC__) + __asm__ __volatile__("st %%fsr, %0" : "=m"(mode)); +#elif defined(__SUNPRO_CC) + mode = fpgetround(); +#elif defined(__KCC) +#error KCC on Sun SPARC get_round_mode: please fix me + asm("st %fsr, [mode]"); +#else +#error Unsupported compiler for Sparc rounding control. +#endif + } #if defined(__SUNPRO_CC) - static void downward() { set_rounding_mode(FP_RM); } - static void upward() { set_rounding_mode(FP_RP); } - static void to_nearest() { set_rounding_mode(FP_RN); } - static void toward_zero() { set_rounding_mode(FP_RZ); } + static void downward() { set_rounding_mode(FP_RM); } + static void upward() { set_rounding_mode(FP_RP); } + static void to_nearest() { set_rounding_mode(FP_RN); } + static void toward_zero() { set_rounding_mode(FP_RZ); } #else - static void downward() { set_rounding_mode(0xc0000000); } - static void upward() { set_rounding_mode(0x80000000); } - static void to_nearest() { set_rounding_mode(0x00000000); } - static void toward_zero() { set_rounding_mode(0x40000000); } + static void downward() { set_rounding_mode(0xc0000000); } + static void upward() { set_rounding_mode(0x80000000); } + static void to_nearest() { set_rounding_mode(0x00000000); } + static void toward_zero() { set_rounding_mode(0x40000000); } #endif }; } // namespace detail -extern "C" { - float rintf(float); - double rint(double); +extern "C" +{ + float rintf(float); + double rint(double); } -template<> -struct rounding_control: - detail::sparc_rounding_control +template <> struct rounding_control : detail::sparc_rounding_control { - static const float& force_rounding(const float& x) { return x; } - static float to_int(const float& x) { return rintf(x); } + static const float &force_rounding(const float &x) { return x; } + static float to_int(const float &x) { return rintf(x); } }; -template<> -struct rounding_control: - detail::sparc_rounding_control +template <> struct rounding_control : detail::sparc_rounding_control { - static const double& force_rounding(const double& x) { return x; } - static double to_int(const double& x) { return rint(x); } + static const double &force_rounding(const double &x) { return x; } + static double to_int(const double &x) { return rint(x); } }; -template<> -struct rounding_control: - detail::sparc_rounding_control +template <> struct rounding_control : detail::sparc_rounding_control { - static const long double& force_rounding(const long double& x) { return x; } - static long double to_int(const long double& x) { return rint(x); } + static const long double &force_rounding(const long double &x) { return x; } + static long double to_int(const long double &x) { return rint(x); } }; } // namespace interval_lib diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/test_input.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/test_input.hpp index 58695fec..7b5a2440 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/test_input.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/test_input.hpp @@ 
-17,55 +17,52 @@ namespace numeric { namespace interval_lib { namespace user { -template inline -bool is_zero(T const &v) { return v == static_cast(0); } +template inline bool is_zero(T const &v) { return v == static_cast(0); } -template inline -bool is_neg (T const &v) { return v < static_cast(0); } +template inline bool is_neg(T const &v) { return v < static_cast(0); } -template inline -bool is_pos (T const &v) { return v > static_cast(0); } +template inline bool is_pos(T const &v) { return v > static_cast(0); } } // namespace user namespace detail { -template inline -bool test_input(const interval& x) { - typedef typename Policies::checking checking; - return checking::is_empty(x.lower(), x.upper()); +template inline bool test_input(const interval &x) +{ + typedef typename Policies::checking checking; + return checking::is_empty(x.lower(), x.upper()); } -template inline -bool test_input(const interval& x, const interval& y) { - typedef typename Policies1::checking checking1; - typedef typename Policies2::checking checking2; - return checking1::is_empty(x.lower(), x.upper()) || - checking2::is_empty(y.lower(), y.upper()); +template +inline bool test_input(const interval &x, const interval &y) +{ + typedef typename Policies1::checking checking1; + typedef typename Policies2::checking checking2; + return checking1::is_empty(x.lower(), x.upper()) || checking2::is_empty(y.lower(), y.upper()); } -template inline -bool test_input(const T& x, const interval& y) { - typedef typename Policies::checking checking; - return checking::is_nan(x) || checking::is_empty(y.lower(), y.upper()); +template inline bool test_input(const T &x, const interval &y) +{ + typedef typename Policies::checking checking; + return checking::is_nan(x) || checking::is_empty(y.lower(), y.upper()); } -template inline -bool test_input(const interval& x, const T& y) { - typedef typename Policies::checking checking; - return checking::is_empty(x.lower(), x.upper()) || checking::is_nan(y); +template inline bool test_input(const interval &x, const T &y) +{ + typedef typename Policies::checking checking; + return checking::is_empty(x.lower(), x.upper()) || checking::is_nan(y); } -template inline -bool test_input(const T& x) { - typedef typename Policies::checking checking; - return checking::is_nan(x); +template inline bool test_input(const T &x) +{ + typedef typename Policies::checking checking; + return checking::is_nan(x); } -template inline -bool test_input(const T& x, const T& y) { - typedef typename Policies::checking checking; - return checking::is_nan(x) || checking::is_nan(y); +template inline bool test_input(const T &x, const T &y) +{ + typedef typename Policies::checking checking; + return checking::is_nan(x) || checking::is_nan(y); } } // namespace detail diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86_rounding_control.hpp index 3eebdbac..bd4c8a8e 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86_rounding_control.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86_rounding_control.hpp @@ -12,16 +12,16 @@ #define BOOST_NUMERIC_INTERVAL_DETAIL_X86_ROUNDING_CONTROL_HPP #ifdef __GNUC__ -# include +#include #elif defined(__BORLANDC__) -# include +#include #elif defined(_MSC_VER) -# include +#include #elif defined(__MWERKS__) || defined(__ICC) -# define BOOST_NUMERIC_INTERVAL_USE_C99_SUBSYSTEM -# include 
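The user::is_zero/is_neg/is_pos predicates and the detail::test_input overloads reformatted above are the guard rails run before every interval operation: NaN scalars and empty intervals are rejected before any rounding work happens. A self-contained sketch of that contract on plain doubles (the real code dispatches through Policies::checking; this assumes IEEE NaN semantics):

#include <cmath>
#include <cstdio>

template <class T> inline bool is_zero(T const &v) { return v == static_cast<T>(0); }
template <class T> inline bool is_neg(T const &v)  { return v < static_cast<T>(0); }
template <class T> inline bool is_pos(T const &v)  { return v > static_cast<T>(0); }

// test_input analogue for one scalar and one [lo, hi] interval
inline bool test_input(double x, double lo, double hi)
{
    return std::isnan(x) || !(lo <= hi);   // !(lo <= hi) catches empty and NaN bounds
}

int main()
{
    std::printf("is_neg(-2.5) = %d\n", is_neg(-2.5));
    std::printf("bad input?   = %d\n", test_input(std::nan(""), 0.0, 1.0)); // 1: NaN scalar
    std::printf("bad input?   = %d\n", test_input(1.0, 2.0, 1.0));          // 1: empty interval
    std::printf("bad input?   = %d\n", test_input(1.0, 0.0, 1.0));          // 0: fine
    return 0;
}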
+#define BOOST_NUMERIC_INTERVAL_USE_C99_SUBSYSTEM +#include #else -# error Unsupported C++ compiler. +#error Unsupported C++ compiler. #endif namespace boost { @@ -36,68 +36,70 @@ typedef c99_rounding x86_rounding_control; #else struct fpu_rounding_modes { - unsigned short to_nearest; - unsigned short downward; - unsigned short upward; - unsigned short toward_zero; + unsigned short to_nearest; + unsigned short downward; + unsigned short upward; + unsigned short toward_zero; }; // exceptions masked, extended precision // hardware default is 0x037f (0x1000 only has a meaning on 287) -static const fpu_rounding_modes rnd_mode = { 0x137f, 0x177f, 0x1b7f, 0x1f7f }; +static const fpu_rounding_modes rnd_mode = {0x137f, 0x177f, 0x1b7f, 0x1f7f}; -struct x86_rounding_control: x86_rounding +struct x86_rounding_control : x86_rounding { - static void to_nearest() { set_rounding_mode(rnd_mode.to_nearest); } - static void downward() { set_rounding_mode(rnd_mode.downward); } - static void upward() { set_rounding_mode(rnd_mode.upward); } - static void toward_zero() { set_rounding_mode(rnd_mode.toward_zero); } + static void to_nearest() { set_rounding_mode(rnd_mode.to_nearest); } + static void downward() { set_rounding_mode(rnd_mode.downward); } + static void upward() { set_rounding_mode(rnd_mode.upward); } + static void toward_zero() { set_rounding_mode(rnd_mode.toward_zero); } }; #endif // BOOST_NUMERIC_INTERVAL_USE_C99_SUBSYSTEM } // namespace detail -template<> -struct rounding_control: detail::x86_rounding_control +template <> struct rounding_control : detail::x86_rounding_control { - static float force_rounding(const float& r) - { volatile float r_ = r; return r_; } + static float force_rounding(const float &r) + { + volatile float r_ = r; + return r_; + } }; -template<> -struct rounding_control: detail::x86_rounding_control +template <> struct rounding_control : detail::x86_rounding_control { - /*static double force_rounding(double r) - { asm volatile ("" : "+m"(r) : ); return r; }*/ - static double force_rounding(const double& r) - { volatile double r_ = r; return r_; } + /*static double force_rounding(double r) + { asm volatile ("" : "+m"(r) : ); return r; }*/ + static double force_rounding(const double &r) + { + volatile double r_ = r; + return r_; + } }; namespace detail { -template -struct x86_rounding_control_long_double; +template struct x86_rounding_control_long_double; -template<> -struct x86_rounding_control_long_double: x86_rounding_control +template <> struct x86_rounding_control_long_double : x86_rounding_control { - static long double force_rounding(long double const &r) - { volatile long double r_ = r; return r_; } + static long double force_rounding(long double const &r) + { + volatile long double r_ = r; + return r_; + } }; -template<> -struct x86_rounding_control_long_double: x86_rounding_control +template <> struct x86_rounding_control_long_double : x86_rounding_control { - static long double const &force_rounding(long double const &r) - { return r; } + static long double const &force_rounding(long double const &r) { return r; } }; } // namespace detail -template<> -struct rounding_control: - detail::x86_rounding_control_long_double< (sizeof(long double) >= 10) > -{}; +template <> struct rounding_control : detail::x86_rounding_control_long_double<(sizeof(long double) >= 10)> +{ +}; } // namespace interval_lib } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86gcc_rounding_control.hpp 
b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86gcc_rounding_control.hpp index 079d681d..7490ca36 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86gcc_rounding_control.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/detail/x86gcc_rounding_control.hpp @@ -12,11 +12,11 @@ #define BOOST_NUMERIC_INTERVAL_DETAIL_X86GCC_ROUNDING_CONTROL_HPP #ifndef __GNUC__ -# error This header only works with GNU CC. +#error This header only works with GNU CC. #endif #ifndef __i386__ -# error This header only works on x86 CPUs. +#error This header only works on x86 CPUs. #endif namespace boost { @@ -26,21 +26,18 @@ namespace detail { struct x86_rounding { - typedef unsigned short rounding_mode; + typedef unsigned short rounding_mode; - static void set_rounding_mode(const rounding_mode& mode) - { __asm__ __volatile__ ("fldcw %0" : : "m"(mode)); } + static void set_rounding_mode(const rounding_mode &mode) { __asm__ __volatile__("fldcw %0" : : "m"(mode)); } - static void get_rounding_mode(rounding_mode& mode) - { __asm__ __volatile__ ("fnstcw %0" : "=m"(mode)); } + static void get_rounding_mode(rounding_mode &mode) { __asm__ __volatile__("fnstcw %0" : "=m"(mode)); } - template - static T to_int(T r) - { - T r_; - __asm__ ("frndint" : "=&t"(r_) : "0"(r)); - return r_; - } + template static T to_int(T r) + { + T r_; + __asm__("frndint" : "=&t"(r_) : "0"(r)); + return r_; + } }; } // namespace detail diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/integer.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/integer.hpp index 628a343a..c3070192 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/integer.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/integer.hpp @@ -16,52 +16,44 @@ namespace boost { namespace numeric { -template inline -interval operator+ (const interval& x, int y) +template inline interval operator+(const interval &x, int y) { - return x + static_cast(y); + return x + static_cast(y); } -template inline -interval operator+ (int x, const interval& y) +template inline interval operator+(int x, const interval &y) { - return static_cast(x) + y; + return static_cast(x) + y; } -template inline -interval operator- (const interval& x, int y) +template inline interval operator-(const interval &x, int y) { - return x - static_cast(y); + return x - static_cast(y); } -template inline -interval operator- (int x, const interval& y) +template inline interval operator-(int x, const interval &y) { - return static_cast(x) - y; + return static_cast(x) - y; } -template inline -interval operator* (const interval& x, int y) +template inline interval operator*(const interval &x, int y) { - return x * static_cast(y); + return x * static_cast(y); } -template inline -interval operator* (int x, const interval& y) +template inline interval operator*(int x, const interval &y) { - return static_cast(x) * y; + return static_cast(x) * y; } -template inline -interval operator/ (const interval& x, int y) +template inline interval operator/(const interval &x, int y) { - return x / static_cast(y); + return x / static_cast(y); } -template inline -interval operator/ (int x, const interval& y) +template inline interval operator/(int x, const interval &y) { - return static_cast(x) / y; + return static_cast(x) / y; } } // namespace numeric diff --git 
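ext/integer.hpp, reformatted above, is pure forwarding sugar: each (interval op int) overload widens the int to the interval's base type T and reuses the existing (interval op T) operator, so integer literals never reach the rounding code through an unintended conversion path. The same dispatch pattern on a toy type, with directed rounding deliberately omitted (Iv is a stand-in, not the real boost::numeric::interval):

#include <cstdio>

struct Iv {
    double lo, up;
};

// the real operator: interval + base type (rounding omitted in this sketch)
inline Iv operator+(Iv const &x, double y) { return Iv{x.lo + y, x.up + y}; }

// the forwarding layer: widen int to double, then reuse the overload above
inline Iv operator+(Iv const &x, int y) { return x + static_cast<double>(y); }
inline Iv operator+(int x, Iv const &y) { return y + static_cast<double>(x); }

int main()
{
    Iv a{1.0, 2.0};
    Iv b = a + 3;      // resolves through the int overload
    std::printf("[%g, %g]\n", b.lo, b.up);
    return 0;
}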
a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/x86_fast_rounding_control.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/x86_fast_rounding_control.hpp index 7f89a4e5..20b3316d 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/x86_fast_rounding_control.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/ext/x86_fast_rounding_control.hpp @@ -24,43 +24,39 @@ namespace interval_lib { namespace detail { // exceptions masked, expected precision (the mask is 0x0300) -static const fpu_rounding_modes rnd_mode_f = { 0x107f, 0x147f, 0x187f, 0x1c7f }; -static const fpu_rounding_modes rnd_mode_d = { 0x127f, 0x167f, 0x1a7f, 0x1e7f }; -static const fpu_rounding_modes rnd_mode_l = { 0x137f, 0x177f, 0x1b7f, 0x1f7f }; +static const fpu_rounding_modes rnd_mode_f = {0x107f, 0x147f, 0x187f, 0x1c7f}; +static const fpu_rounding_modes rnd_mode_d = {0x127f, 0x167f, 0x1a7f, 0x1e7f}; +static const fpu_rounding_modes rnd_mode_l = {0x137f, 0x177f, 0x1b7f, 0x1f7f}; } // namespace detail -template -struct x86_fast_rounding_control; +template struct x86_fast_rounding_control; -template<> -struct x86_fast_rounding_control: detail::x86_rounding +template <> struct x86_fast_rounding_control : detail::x86_rounding { - static void to_nearest() { set_rounding_mode(detail::rnd_mode_f.to_nearest); } - static void downward() { set_rounding_mode(detail::rnd_mode_f.downward); } - static void upward() { set_rounding_mode(detail::rnd_mode_f.upward); } - static void toward_zero() { set_rounding_mode(detail::rnd_mode_f.toward_zero); } - static const float& force_rounding(const float& r) { return r; } + static void to_nearest() { set_rounding_mode(detail::rnd_mode_f.to_nearest); } + static void downward() { set_rounding_mode(detail::rnd_mode_f.downward); } + static void upward() { set_rounding_mode(detail::rnd_mode_f.upward); } + static void toward_zero() { set_rounding_mode(detail::rnd_mode_f.toward_zero); } + static const float &force_rounding(const float &r) { return r; } }; -template<> -struct x86_fast_rounding_control: detail::x86_rounding +template <> struct x86_fast_rounding_control : detail::x86_rounding { - static void to_nearest() { set_rounding_mode(detail::rnd_mode_d.to_nearest); } - static void downward() { set_rounding_mode(detail::rnd_mode_d.downward); } - static void upward() { set_rounding_mode(detail::rnd_mode_d.upward); } - static void toward_zero() { set_rounding_mode(detail::rnd_mode_d.toward_zero); } - static const double& force_rounding(const double& r) { return r; } + static void to_nearest() { set_rounding_mode(detail::rnd_mode_d.to_nearest); } + static void downward() { set_rounding_mode(detail::rnd_mode_d.downward); } + static void upward() { set_rounding_mode(detail::rnd_mode_d.upward); } + static void toward_zero() { set_rounding_mode(detail::rnd_mode_d.toward_zero); } + static const double &force_rounding(const double &r) { return r; } }; -template<> -struct x86_fast_rounding_control: detail::x86_rounding +template <> struct x86_fast_rounding_control : detail::x86_rounding { - static void to_nearest() { set_rounding_mode(detail::rnd_mode_l.to_nearest); } - static void downward() { set_rounding_mode(detail::rnd_mode_l.downward); } - static void upward() { set_rounding_mode(detail::rnd_mode_l.upward); } - static void toward_zero() { set_rounding_mode(detail::rnd_mode_l.toward_zero); } - static const long double& force_rounding(const long double& r) { return r; } + static void to_nearest() { 
set_rounding_mode(detail::rnd_mode_l.to_nearest); } + static void downward() { set_rounding_mode(detail::rnd_mode_l.downward); } + static void upward() { set_rounding_mode(detail::rnd_mode_l.upward); } + static void toward_zero() { set_rounding_mode(detail::rnd_mode_l.toward_zero); } + static const long double &force_rounding(const long double &r) { return r; } }; } // namespace interval_lib diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/hw_rounding.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/hw_rounding.hpp index fb8c2c39..7025955d 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/hw_rounding.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/hw_rounding.hpp @@ -11,33 +11,33 @@ #ifndef BOOST_NUMERIC_INTERVAL_HW_ROUNDING_HPP #define BOOST_NUMERIC_INTERVAL_HW_ROUNDING_HPP -#include #include +#include #define BOOST_NUMERIC_INTERVAL_NO_HARDWARE // define appropriate specialization of rounding_control for built-in types #if defined(__x86_64__) && defined(__USE_ISOC99) -# include +#include #elif defined(__i386__) || defined(_M_IX86) || defined(__BORLANDC__) || defined(_M_X64) -# include +#include #elif defined(powerpc) || defined(__powerpc__) || defined(__ppc__) -# include +#include #elif defined(sparc) || defined(__sparc__) -# include +#include #elif defined(alpha) || defined(__alpha__) -# include +#include #elif defined(ia64) || defined(__ia64) || defined(__ia64__) -# include +#include #endif #if defined(BOOST_NUMERIC_INTERVAL_NO_HARDWARE) && (defined(__USE_ISOC99) || defined(__MSL__)) -# include +#include #endif #if defined(BOOST_NUMERIC_INTERVAL_NO_HARDWARE) -# undef BOOST_NUMERIC_INTERVAL_NO_HARDWARE -# error Boost.Numeric.Interval: Please specify rounding control mechanism. +#undef BOOST_NUMERIC_INTERVAL_NO_HARDWARE +#error Boost.Numeric.Interval: Please specify rounding control mechanism. 
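The rounded_math specializations just below wrap rounded_arith_opp in save_state, a guard that snapshots the caller's FPU rounding mode, lets the arithmetic policy set its own, and restores the snapshot on scope exit. A <cfenv> sketch of that save/restore discipline (rounding_mode_guard_sketch is an illustrative name, not the Boost class):

#include <cfenv>
#include <cstdio>

class rounding_mode_guard_sketch {
    int saved_;
public:
    rounding_mode_guard_sketch() : saved_(std::fegetround())
    {
        std::fesetround(FE_UPWARD);   // rounded_arith_opp's init() also goes upward
    }
    ~rounding_mode_guard_sketch() { std::fesetround(saved_); }
};

int main()
{
    std::fesetround(FE_TONEAREST);
    {
        rounding_mode_guard_sketch guard;   // enter the protected region
        volatile double hi = 1.0 / 3.0;     // computed rounded up
        std::printf("upper bound of 1/3: %.17g\n", hi);
    }                                       // caller's mode restored here
    std::printf("mode restored: %d\n", std::fegetround() == FE_TONEAREST);
    return 0;
}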
#endif namespace boost { @@ -48,20 +48,17 @@ namespace interval_lib { * Three specializations of rounded_math */ -template<> -struct rounded_math - : save_state > -{}; +template <> struct rounded_math : save_state> +{ +}; -template<> -struct rounded_math - : save_state > -{}; +template <> struct rounded_math : save_state> +{ +}; -template<> -struct rounded_math - : save_state > -{}; +template <> struct rounded_math : save_state> +{ +}; } // namespace interval_lib } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/interval.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/interval.hpp index 0b2d0eec..c9952913 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/interval.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/interval.hpp @@ -10,22 +10,22 @@ #ifndef BOOST_NUMERIC_INTERVAL_INTERVAL_HPP #define BOOST_NUMERIC_INTERVAL_INTERVAL_HPP +#include #include #include -#include namespace boost { namespace numeric { namespace interval_lib { - -class comparison_error - : public std::runtime_error + +class comparison_error : public std::runtime_error { public: - comparison_error() - : std::runtime_error("boost::interval: uncertain comparison") - { } + comparison_error() + : std::runtime_error("boost::interval: uncertain comparison") + { + } }; } // namespace interval_lib @@ -34,414 +34,448 @@ public: * interval class */ -template -class interval +template class interval { private: - struct interval_holder; - struct number_holder; + struct interval_holder; + struct number_holder; + public: - typedef T base_type; - typedef Policies traits_type; + typedef T base_type; + typedef Policies traits_type; - T const &lower() const; - T const &upper() const; + T const &lower() const; + T const &upper() const; - interval(); - interval(T const &v); - template interval(T1 const &v); - interval(T const &l, T const &u); - template interval(T1 const &l, T2 const &u); - interval(interval const &r); - template interval(interval const &r); - template interval(interval const &r); + interval(); + interval(T const &v); + template interval(T1 const &v); + interval(T const &l, T const &u); + template interval(T1 const &l, T2 const &u); + interval(interval const &r); + template interval(interval const &r); + template interval(interval const &r); - interval &operator=(T const &v); - template interval &operator=(T1 const &v); - interval &operator=(interval const &r); - template interval &operator=(interval const &r); - template interval &operator=(interval const &r); - - void assign(const T& l, const T& u); + interval &operator=(T const &v); + template interval &operator=(T1 const &v); + interval &operator=(interval const &r); + template interval &operator=(interval const &r); + template interval &operator=(interval const &r); - static interval empty(); - static interval whole(); - static interval hull(const T& x, const T& y); + void assign(const T &l, const T &u); - interval& operator+= (const T& r); - interval& operator+= (const interval& r); - interval& operator-= (const T& r); - interval& operator-= (const interval& r); - interval& operator*= (const T& r); - interval& operator*= (const interval& r); - interval& operator/= (const T& r); - interval& operator/= (const interval& r); + static interval empty(); + static interval whole(); + static interval hull(const T &x, const T &y); - bool operator< (const interval_holder& r) const; - bool operator> (const interval_holder& r) const; - bool operator<= 
(const interval_holder& r) const; - bool operator>= (const interval_holder& r) const; - bool operator== (const interval_holder& r) const; - bool operator!= (const interval_holder& r) const; + interval &operator+=(const T &r); + interval &operator+=(const interval &r); + interval &operator-=(const T &r); + interval &operator-=(const interval &r); + interval &operator*=(const T &r); + interval &operator*=(const interval &r); + interval &operator/=(const T &r); + interval &operator/=(const interval &r); - bool operator< (const number_holder& r) const; - bool operator> (const number_holder& r) const; - bool operator<= (const number_holder& r) const; - bool operator>= (const number_holder& r) const; - bool operator== (const number_holder& r) const; - bool operator!= (const number_holder& r) const; + bool operator<(const interval_holder &r) const; + bool operator>(const interval_holder &r) const; + bool operator<=(const interval_holder &r) const; + bool operator>=(const interval_holder &r) const; + bool operator==(const interval_holder &r) const; + bool operator!=(const interval_holder &r) const; - // the following is for internal use only, it is not a published interface - // nevertheless, it's public because friends don't always work correctly. - interval(const T& l, const T& u, bool): low(l), up(u) {} - void set_empty(); - void set_whole(); - void set(const T& l, const T& u); + bool operator<(const number_holder &r) const; + bool operator>(const number_holder &r) const; + bool operator<=(const number_holder &r) const; + bool operator>=(const number_holder &r) const; + bool operator==(const number_holder &r) const; + bool operator!=(const number_holder &r) const; + + // the following is for internal use only, it is not a published interface + // nevertheless, it's public because friends don't always work correctly. 
+ interval(const T &l, const T &u, bool) + : low(l) + , up(u) + { + } + void set_empty(); + void set_whole(); + void set(const T &l, const T &u); private: - struct interval_holder { - template - interval_holder(const interval& r) - : low(r.lower()), up(r.upper()) + struct interval_holder { - typedef typename Policies2::checking checking2; - if (checking2::is_empty(low, up)) - throw interval_lib::comparison_error(); - } + template + interval_holder(const interval &r) + : low(r.lower()) + , up(r.upper()) + { + typedef typename Policies2::checking checking2; + if (checking2::is_empty(low, up)) + throw interval_lib::comparison_error(); + } - const T& low; - const T& up; - }; + const T &low; + const T &up; + }; - struct number_holder { - number_holder(const T& r) : val(r) + struct number_holder { - typedef typename Policies::checking checking; - if (checking::is_nan(r)) - throw interval_lib::comparison_error(); - } - - const T& val; - }; + number_holder(const T &r) + : val(r) + { + typedef typename Policies::checking checking; + if (checking::is_nan(r)) + throw interval_lib::comparison_error(); + } - typedef typename Policies::checking checking; - typedef typename Policies::rounding rounding; + const T &val; + }; - T low; - T up; + typedef typename Policies::checking checking; + typedef typename Policies::rounding rounding; + + T low; + T up; }; -template inline -interval::interval(): - low(static_cast(0)), up(static_cast(0)) -{} - -template inline -interval::interval(T const &v): low(v), up(v) +template +inline interval::interval() + : low(static_cast(0)) + , up(static_cast(0)) { - if (checking::is_nan(v)) set_empty(); } -template template inline -interval::interval(T1 const &v) +template +inline interval::interval(T const &v) + : low(v) + , up(v) { - if (checking::is_nan(v)) set_empty(); - else { - rounding rnd; - low = rnd.conv_down(v); - up = rnd.conv_up (v); - } + if (checking::is_nan(v)) + set_empty(); } -template template inline -interval::interval(T1 const &l, T2 const &u) +template template inline interval::interval(T1 const &v) { - if (checking::is_nan(l) || checking::is_nan(u) || !(l <= u)) set_empty(); - else { - rounding rnd; - low = rnd.conv_down(l); - up = rnd.conv_up (u); - } + if (checking::is_nan(v)) + set_empty(); + else { + rounding rnd; + low = rnd.conv_down(v); + up = rnd.conv_up(v); + } } -template inline -interval::interval(T const &l, T const &u): low(l), up(u) +template +template +inline interval::interval(T1 const &l, T2 const &u) { - if (checking::is_nan(l) || checking::is_nan(u) || !(l <= u)) - set_empty(); + if (checking::is_nan(l) || checking::is_nan(u) || !(l <= u)) + set_empty(); + else { + rounding rnd; + low = rnd.conv_down(l); + up = rnd.conv_up(u); + } +} + +template +inline interval::interval(T const &l, T const &u) + : low(l) + , up(u) +{ + if (checking::is_nan(l) || checking::is_nan(u) || !(l <= u)) + set_empty(); } -template inline -interval::interval(interval const &r): low(r.lower()), up(r.upper()) -{} - -template template inline -interval::interval(interval const &r): low(r.lower()), up(r.upper()) +template +inline interval::interval(interval const &r) + : low(r.lower()) + , up(r.upper()) { - typedef typename Policies1::checking checking1; - if (checking1::is_empty(r.lower(), r.upper())) set_empty(); } -template template inline -interval::interval(interval const &r) +template +template +inline interval::interval(interval const &r) + : low(r.lower()) + , up(r.upper()) { - typedef typename Policies1::checking checking1; - if 
(checking1::is_empty(r.lower(), r.upper())) set_empty(); - else { - rounding rnd; - low = rnd.conv_down(r.lower()); - up = rnd.conv_up (r.upper()); - } + typedef typename Policies1::checking checking1; + if (checking1::is_empty(r.lower(), r.upper())) + set_empty(); } -template inline -interval &interval::operator=(T const &v) +template +template +inline interval::interval(interval const &r) { - if (checking::is_nan(v)) set_empty(); - else low = up = v; - return *this; + typedef typename Policies1::checking checking1; + if (checking1::is_empty(r.lower(), r.upper())) + set_empty(); + else { + rounding rnd; + low = rnd.conv_down(r.lower()); + up = rnd.conv_up(r.upper()); + } } -template template inline -interval &interval::operator=(T1 const &v) +template inline interval &interval::operator=(T const &v) { - if (checking::is_nan(v)) set_empty(); - else { - rounding rnd; - low = rnd.conv_down(v); - up = rnd.conv_up (v); - } - return *this; + if (checking::is_nan(v)) + set_empty(); + else + low = up = v; + return *this; } -template inline -interval &interval::operator=(interval const &r) +template +template +inline interval &interval::operator=(T1 const &v) { - low = r.lower(); - up = r.upper(); - return *this; + if (checking::is_nan(v)) + set_empty(); + else { + rounding rnd; + low = rnd.conv_down(v); + up = rnd.conv_up(v); + } + return *this; } -template template inline -interval &interval::operator=(interval const &r) +template +inline interval &interval::operator=(interval const &r) { - typedef typename Policies1::checking checking1; - if (checking1::is_empty(r.lower(), r.upper())) set_empty(); - else { low = r.lower(); up = r.upper(); - } - return *this; + return *this; } -template template inline -interval &interval::operator=(interval const &r) +template +template +inline interval &interval::operator=(interval const &r) { - typedef typename Policies1::checking checking1; - if (checking1::is_empty(r.lower(), r.upper())) set_empty(); - else { - rounding rnd; - low = rnd.conv_down(r.lower()); - up = rnd.conv_up (r.upper()); - } - return *this; + typedef typename Policies1::checking checking1; + if (checking1::is_empty(r.lower(), r.upper())) + set_empty(); + else { + low = r.lower(); + up = r.upper(); + } + return *this; } -template inline -void interval::assign(const T& l, const T& u) +template +template +inline interval &interval::operator=(interval const &r) { - if (checking::is_nan(l) || checking::is_nan(u) || !(l <= u)) - set_empty(); - else set(l, u); + typedef typename Policies1::checking checking1; + if (checking1::is_empty(r.lower(), r.upper())) + set_empty(); + else { + rounding rnd; + low = rnd.conv_down(r.lower()); + up = rnd.conv_up(r.upper()); + } + return *this; } -template inline -void interval::set(const T& l, const T& u) +template inline void interval::assign(const T &l, const T &u) { - low = l; - up = u; + if (checking::is_nan(l) || checking::is_nan(u) || !(l <= u)) + set_empty(); + else + set(l, u); } -template inline -void interval::set_empty() +template inline void interval::set(const T &l, const T &u) { - low = checking::empty_lower(); - up = checking::empty_upper(); + low = l; + up = u; } -template inline -void interval::set_whole() +template inline void interval::set_empty() { - low = checking::neg_inf(); - up = checking::pos_inf(); + low = checking::empty_lower(); + up = checking::empty_upper(); } -template inline -interval interval::hull(const T& x, const T& y) +template inline void interval::set_whole() { - bool bad_x = checking::is_nan(x); - bool bad_y = 
checking::is_nan(y); - if (bad_x) - if (bad_y) return interval::empty(); - else return interval(y, y, true); - else - if (bad_y) return interval(x, x, true); - if (x <= y) return interval(x, y, true); - else return interval(y, x, true); + low = checking::neg_inf(); + up = checking::pos_inf(); } -template inline -interval interval::empty() +template inline interval interval::hull(const T &x, const T &y) { - return interval(checking::empty_lower(), - checking::empty_upper(), true); + bool bad_x = checking::is_nan(x); + bool bad_y = checking::is_nan(y); + if (bad_x) + if (bad_y) + return interval::empty(); + else + return interval(y, y, true); + else if (bad_y) + return interval(x, x, true); + if (x <= y) + return interval(x, y, true); + else + return interval(y, x, true); } -template inline -interval interval::whole() +template inline interval interval::empty() { - return interval(checking::neg_inf(), checking::pos_inf(), true); + return interval(checking::empty_lower(), checking::empty_upper(), true); } -template inline -const T& interval::lower() const +template inline interval interval::whole() { - return low; + return interval(checking::neg_inf(), checking::pos_inf(), true); } -template inline -const T& interval::upper() const -{ - return up; -} +template inline const T &interval::lower() const { return low; } + +template inline const T &interval::upper() const { return up; } /* * interval/interval comparisons */ -template inline -bool interval::operator< (const interval_holder& r) const +template inline bool interval::operator<(const interval_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up < r.low) return true; - else if (low >= r.up) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up < r.low) + return true; + else if (low >= r.up) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator> (const interval_holder& r) const +template inline bool interval::operator>(const interval_holder &r) const { - if (!checking::is_empty(low, up)) { - if (low > r.up) return true; - else if (up <= r.low) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (low > r.up) + return true; + else if (up <= r.low) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator<= (const interval_holder& r) const +template inline bool interval::operator<=(const interval_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up <= r.low) return true; - else if (low > r.up) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up <= r.low) + return true; + else if (low > r.up) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator>= (const interval_holder& r) const +template inline bool interval::operator>=(const interval_holder &r) const { - if (!checking::is_empty(low, up)) { - if (low >= r.up) return true; - else if (up < r.low) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (low >= r.up) + return true; + else if (up < r.low) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator== (const interval_holder& r) const +template inline bool interval::operator==(const interval_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up == r.low && low == r.up) return true; - else if (up < 
r.low || low > r.up) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up == r.low && low == r.up) + return true; + else if (up < r.low || low > r.up) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator!= (const interval_holder& r) const +template inline bool interval::operator!=(const interval_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up < r.low || low > r.up) return true; - else if (up == r.low && low == r.up) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up < r.low || low > r.up) + return true; + else if (up == r.low && low == r.up) + return false; + } + throw interval_lib::comparison_error(); } /* * interval/number comparisons */ -template inline -bool interval::operator< (const number_holder& r) const +template inline bool interval::operator<(const number_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up < r.val) return true; - else if (low >= r.val) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up < r.val) + return true; + else if (low >= r.val) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator> (const number_holder& r) const +template inline bool interval::operator>(const number_holder &r) const { - if (!checking::is_empty(low, up)) { - if (low > r.val) return true; - else if (up <= r.val) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (low > r.val) + return true; + else if (up <= r.val) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator<= (const number_holder& r) const +template inline bool interval::operator<=(const number_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up <= r.val) return true; - else if (low > r.val) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up <= r.val) + return true; + else if (low > r.val) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator>= (const number_holder& r) const +template inline bool interval::operator>=(const number_holder &r) const { - if (!checking::is_empty(low, up)) { - if (low >= r.val) return true; - else if (up < r.val) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (low >= r.val) + return true; + else if (up < r.val) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator== (const number_holder& r) const +template inline bool interval::operator==(const number_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up == r.val && low == r.val) return true; - else if (up < r.val || low > r.val) return false; - } - throw interval_lib::comparison_error(); + if (!checking::is_empty(low, up)) { + if (up == r.val && low == r.val) + return true; + else if (up < r.val || low > r.val) + return false; + } + throw interval_lib::comparison_error(); } -template inline -bool interval::operator!= (const number_holder& r) const +template inline bool interval::operator!=(const number_holder &r) const { - if (!checking::is_empty(low, up)) { - if (up < r.val || low > r.val) return true; - else if (up == r.val && low == r.val) return false; - } - throw interval_lib::comparison_error(); + if 
(!checking::is_empty(low, up)) { + if (up < r.val || low > r.val) + return true; + else if (up == r.val && low == r.val) + return false; + } + throw interval_lib::comparison_error(); } } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/io.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/io.hpp index dc4179e5..640848b4 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/io.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/io.hpp @@ -24,15 +24,14 @@ namespace boost { namespace numeric { -template -std::basic_ostream &operator<< - (std::basic_ostream &stream, - interval const &value) +template +std::basic_ostream &operator<<(std::basic_ostream &stream, + interval const &value) { - if (empty(value)) - return stream << "[]"; - else - return stream << '[' << lower(value) << ',' << upper(value) << ']'; + if (empty(value)) + return stream << "[]"; + else + return stream << '[' << lower(value) << ',' << upper(value) << ']'; } } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/limits.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/limits.hpp index d691ccee..260cbdf9 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/limits.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/limits.hpp @@ -19,29 +19,28 @@ namespace std { -template -class numeric_limits > - : public numeric_limits +template +class numeric_limits> : public numeric_limits { private: - typedef boost::numeric::interval I; - typedef numeric_limits bl; + typedef boost::numeric::interval I; + typedef numeric_limits bl; + public: - static I min BOOST_PREVENT_MACRO_SUBSTITUTION () throw() { return I((bl::min)(), (bl::min)()); } - static I max BOOST_PREVENT_MACRO_SUBSTITUTION () throw() { return I((bl::max)(), (bl::max)()); } - static I epsilon() throw() { return I(bl::epsilon(), bl::epsilon()); } + static I min BOOST_PREVENT_MACRO_SUBSTITUTION() throw() { return I((bl::min)(), (bl::min)()); } + static I max BOOST_PREVENT_MACRO_SUBSTITUTION() throw() { return I((bl::max)(), (bl::max)()); } + static I epsilon() throw() { return I(bl::epsilon(), bl::epsilon()); } - BOOST_STATIC_CONSTANT(float_round_style, round_style = round_indeterminate); - BOOST_STATIC_CONSTANT(bool, is_iec559 = false); + BOOST_STATIC_CONSTANT(float_round_style, round_style = round_indeterminate); + BOOST_STATIC_CONSTANT(bool, is_iec559 = false); + + static I infinity() throw() { return I::whole(); } + static I quiet_NaN() throw() { return I::empty(); } + static I signaling_NaN() throw() { return I(bl::signaling_NaN(), bl::signaling_Nan()); } + static I denorm_min() throw() { return I(bl::denorm_min(), bl::denorm_min()); } - static I infinity () throw() { return I::whole(); } - static I quiet_NaN() throw() { return I::empty(); } - static I signaling_NaN() throw() - { return I(bl::signaling_NaN(), bl::signaling_Nan()); } - static I denorm_min() throw() - { return I(bl::denorm_min(), bl::denorm_min()); } private: - static I round_error(); // hide this on purpose, not yet implemented + static I round_error(); // hide this on purpose, not yet implemented }; } // namespace std diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/policies.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/policies.hpp index 70ad220d..fb1de08c 100644 --- 
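The comparison operators reformatted above give interval a deliberately partial order: a test returns true only when it holds for every pair of points, false only when it fails for every pair, and otherwise throws interval_lib::comparison_error ("uncertain comparison"). A sketch of the operator< rule together with the "[l,u]" printing convention from io.hpp (Iv and less_than are stand-ins; std::runtime_error stands in for comparison_error):

#include <cstdio>
#include <stdexcept>

struct Iv {
    double lo, up;
};

bool less_than(Iv const &a, Iv const &b)
{
    if (a.up < b.lo)                  // certainly less, for every point pair
        return true;
    if (a.lo >= b.up)                 // certainly not less
        return false;
    throw std::runtime_error("boost::interval: uncertain comparison");
}

void print(Iv const &v) { std::printf("[%g,%g]", v.lo, v.up); }

int main()
{
    Iv a{1.0, 2.0}, b{3.0, 4.0}, c{1.5, 3.5};
    std::printf("a < b -> %d\n", less_than(a, b));  // 1: disjoint, ordered
    try {
        less_than(a, c);                            // overlap: no certain answer
    }
    catch (std::exception const &e) {
        print(a); std::printf(" vs "); print(c);
        std::printf(": %s\n", e.what());
    }
    return 0;
}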
a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/policies.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/policies.hpp @@ -20,35 +20,34 @@ namespace interval_lib { * policies class */ -template -struct policies +template struct policies { - typedef Rounding rounding; - typedef Checking checking; + typedef Rounding rounding; + typedef Checking checking; }; /* * policies switching classes */ -template -class change_rounding +template class change_rounding { - typedef typename OldInterval::base_type T; - typedef typename OldInterval::traits_type p; - typedef typename p::checking checking; + typedef typename OldInterval::base_type T; + typedef typename OldInterval::traits_type p; + typedef typename p::checking checking; + public: - typedef interval > type; + typedef interval> type; }; -template -class change_checking +template class change_checking { - typedef typename OldInterval::base_type T; - typedef typename OldInterval::traits_type p; - typedef typename p::rounding rounding; + typedef typename OldInterval::base_type T; + typedef typename OldInterval::traits_type p; + typedef typename p::rounding rounding; + public: - typedef interval > type; + typedef interval> type; }; /* @@ -56,15 +55,15 @@ public: * at each operation, rather than once and for all. */ -template -class unprotect +template class unprotect { - typedef typename OldInterval::base_type T; - typedef typename OldInterval::traits_type p; - typedef typename p::rounding r; - typedef typename r::unprotected_rounding newRounding; + typedef typename OldInterval::base_type T; + typedef typename OldInterval::traits_type p; + typedef typename p::rounding r; + typedef typename r::unprotected_rounding newRounding; + public: - typedef typename change_rounding::type type; + typedef typename change_rounding::type type; }; } // namespace interval_lib diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_arith.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_arith.hpp index 8b2d9a71..7c496c16 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_arith.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_arith.hpp @@ -10,9 +10,9 @@ #ifndef BOOST_NUMERIC_INTERVAL_ROUNDED_ARITH_HPP #define BOOST_NUMERIC_INTERVAL_ROUNDED_ARITH_HPP -#include -#include #include +#include +#include namespace boost { namespace numeric { @@ -23,94 +23,132 @@ namespace interval_lib { * See documentation for details. 
*/ -template -struct rounded_arith_exact: Rounding { - void init() { } - template T conv_down(U const &v) { return v; } - template T conv_up (U const &v) { return v; } - T add_down (const T& x, const T& y) { return x + y; } - T add_up (const T& x, const T& y) { return x + y; } - T sub_down (const T& x, const T& y) { return x - y; } - T sub_up (const T& x, const T& y) { return x - y; } - T mul_down (const T& x, const T& y) { return x * y; } - T mul_up (const T& x, const T& y) { return x * y; } - T div_down (const T& x, const T& y) { return x / y; } - T div_up (const T& x, const T& y) { return x / y; } - T median (const T& x, const T& y) { return (x + y) / 2; } - T sqrt_down(const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(sqrt); return sqrt(x); } - T sqrt_up (const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(sqrt); return sqrt(x); } - T int_down (const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(floor); return floor(x); } - T int_up (const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(ceil); return ceil(x); } +template struct rounded_arith_exact : Rounding +{ + void init() {} + template T conv_down(U const &v) { return v; } + template T conv_up(U const &v) { return v; } + T add_down(const T &x, const T &y) { return x + y; } + T add_up(const T &x, const T &y) { return x + y; } + T sub_down(const T &x, const T &y) { return x - y; } + T sub_up(const T &x, const T &y) { return x - y; } + T mul_down(const T &x, const T &y) { return x * y; } + T mul_up(const T &x, const T &y) { return x * y; } + T div_down(const T &x, const T &y) { return x / y; } + T div_up(const T &x, const T &y) { return x / y; } + T median(const T &x, const T &y) { return (x + y) / 2; } + T sqrt_down(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(sqrt); + return sqrt(x); + } + T sqrt_up(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(sqrt); + return sqrt(x); + } + T int_down(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(floor); + return floor(x); + } + T int_up(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(ceil); + return ceil(x); + } }; -template -struct rounded_arith_std: Rounding { -# define BOOST_DN(EXPR) this->downward(); return this->force_rounding(EXPR) -# define BOOST_NR(EXPR) this->to_nearest(); return this->force_rounding(EXPR) -# define BOOST_UP(EXPR) this->upward(); return this->force_rounding(EXPR) - void init() { } - template T conv_down(U const &v) { BOOST_DN(v); } - template T conv_up (U const &v) { BOOST_UP(v); } - T add_down(const T& x, const T& y) { BOOST_DN(x + y); } - T sub_down(const T& x, const T& y) { BOOST_DN(x - y); } - T mul_down(const T& x, const T& y) { BOOST_DN(x * y); } - T div_down(const T& x, const T& y) { BOOST_DN(x / y); } - T add_up (const T& x, const T& y) { BOOST_UP(x + y); } - T sub_up (const T& x, const T& y) { BOOST_UP(x - y); } - T mul_up (const T& x, const T& y) { BOOST_UP(x * y); } - T div_up (const T& x, const T& y) { BOOST_UP(x / y); } - T median(const T& x, const T& y) { BOOST_NR((x + y) / 2); } - T sqrt_down(const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(sqrt); BOOST_DN(sqrt(x)); } - T sqrt_up (const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(sqrt); BOOST_UP(sqrt(x)); } - T int_down(const T& x) { this->downward(); return to_int(x); } - T int_up (const T& x) { this->upward(); return to_int(x); } -# undef BOOST_DN -# undef BOOST_NR -# undef BOOST_UP -}; - -template -struct rounded_arith_opp: Rounding { - void init() { this->upward(); } -# define BOOST_DN(EXPR) \ - this->downward(); \ - T r = this->force_rounding(EXPR); \ - this->upward(); \ - return r -# 
define BOOST_NR(EXPR) \ +template struct rounded_arith_std : Rounding +{ +#define BOOST_DN(EXPR) \ + this->downward(); \ + return this->force_rounding(EXPR) +#define BOOST_NR(EXPR) \ this->to_nearest(); \ + return this->force_rounding(EXPR) +#define BOOST_UP(EXPR) \ + this->upward(); \ + return this->force_rounding(EXPR) + void init() {} + template T conv_down(U const &v) { BOOST_DN(v); } + template T conv_up(U const &v) { BOOST_UP(v); } + T add_down(const T &x, const T &y) { BOOST_DN(x + y); } + T sub_down(const T &x, const T &y) { BOOST_DN(x - y); } + T mul_down(const T &x, const T &y) { BOOST_DN(x * y); } + T div_down(const T &x, const T &y) { BOOST_DN(x / y); } + T add_up(const T &x, const T &y) { BOOST_UP(x + y); } + T sub_up(const T &x, const T &y) { BOOST_UP(x - y); } + T mul_up(const T &x, const T &y) { BOOST_UP(x * y); } + T div_up(const T &x, const T &y) { BOOST_UP(x / y); } + T median(const T &x, const T &y) { BOOST_NR((x + y) / 2); } + T sqrt_down(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(sqrt); + BOOST_DN(sqrt(x)); + } + T sqrt_up(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(sqrt); + BOOST_UP(sqrt(x)); + } + T int_down(const T &x) + { + this->downward(); + return to_int(x); + } + T int_up(const T &x) + { + this->upward(); + return to_int(x); + } +#undef BOOST_DN +#undef BOOST_NR +#undef BOOST_UP +}; + +template struct rounded_arith_opp : Rounding +{ + void init() { this->upward(); } +#define BOOST_DN(EXPR) \ + this->downward(); \ T r = this->force_rounding(EXPR); \ - this->upward(); \ + this->upward(); \ return r -# define BOOST_UP(EXPR) return this->force_rounding(EXPR) -# define BOOST_UP_NEG(EXPR) return -this->force_rounding(EXPR) - template T conv_down(U const &v) { BOOST_UP_NEG(-v); } - template T conv_up (U const &v) { BOOST_UP(v); } - T add_down(const T& x, const T& y) { BOOST_UP_NEG((-x) - y); } - T sub_down(const T& x, const T& y) { BOOST_UP_NEG(y - x); } - T mul_down(const T& x, const T& y) { BOOST_UP_NEG(x * (-y)); } - T div_down(const T& x, const T& y) { BOOST_UP_NEG(x / (-y)); } - T add_up (const T& x, const T& y) { BOOST_UP(x + y); } - T sub_up (const T& x, const T& y) { BOOST_UP(x - y); } - T mul_up (const T& x, const T& y) { BOOST_UP(x * y); } - T div_up (const T& x, const T& y) { BOOST_UP(x / y); } - T median (const T& x, const T& y) { BOOST_NR((x + y) / 2); } - T sqrt_down(const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(sqrt); BOOST_DN(sqrt(x)); } - T sqrt_up (const T& x) - { BOOST_NUMERIC_INTERVAL_using_math(sqrt); BOOST_UP(sqrt(x)); } - T int_down(const T& x) { return -to_int(-x); } - T int_up (const T& x) { return to_int(x); } -# undef BOOST_DN -# undef BOOST_NR -# undef BOOST_UP -# undef BOOST_UP_NEG +#define BOOST_NR(EXPR) \ + this->to_nearest(); \ + T r = this->force_rounding(EXPR); \ + this->upward(); \ + return r +#define BOOST_UP(EXPR) return this->force_rounding(EXPR) +#define BOOST_UP_NEG(EXPR) return -this->force_rounding(EXPR) + template T conv_down(U const &v) { BOOST_UP_NEG(-v); } + template T conv_up(U const &v) { BOOST_UP(v); } + T add_down(const T &x, const T &y) { BOOST_UP_NEG((-x) - y); } + T sub_down(const T &x, const T &y) { BOOST_UP_NEG(y - x); } + T mul_down(const T &x, const T &y) { BOOST_UP_NEG(x * (-y)); } + T div_down(const T &x, const T &y) { BOOST_UP_NEG(x / (-y)); } + T add_up(const T &x, const T &y) { BOOST_UP(x + y); } + T sub_up(const T &x, const T &y) { BOOST_UP(x - y); } + T mul_up(const T &x, const T &y) { BOOST_UP(x * y); } + T div_up(const T &x, const T &y) { BOOST_UP(x / y); } + T median(const T &x, 
const T &y) { BOOST_NR((x + y) / 2); } + T sqrt_down(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(sqrt); + BOOST_DN(sqrt(x)); + } + T sqrt_up(const T &x) + { + BOOST_NUMERIC_INTERVAL_using_math(sqrt); + BOOST_UP(sqrt(x)); + } + T int_down(const T &x) { return -to_int(-x); } + T int_up(const T &x) { return to_int(x); } +#undef BOOST_DN +#undef BOOST_NR +#undef BOOST_UP +#undef BOOST_UP_NEG }; } // namespace interval_lib diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_transc.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_transc.hpp index ac4982e3..fa3bc4fc 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_transc.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounded_transc.hpp @@ -10,129 +10,153 @@ #ifndef BOOST_NUMERIC_INTERVAL_ROUNDED_TRANSC_HPP #define BOOST_NUMERIC_INTERVAL_ROUNDED_TRANSC_HPP -#include -#include #include +#include +#include namespace boost { namespace numeric { namespace interval_lib { -template -struct rounded_transc_exact: Rounding +template struct rounded_transc_exact : Rounding { -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) { BOOST_NUMERIC_INTERVAL_using_math(f); return f(x); } \ - T f##_up (const T& x) { BOOST_NUMERIC_INTERVAL_using_math(f); return f(x); } - BOOST_NUMERIC_INTERVAL_new_func(exp) - BOOST_NUMERIC_INTERVAL_new_func(log) - BOOST_NUMERIC_INTERVAL_new_func(sin) - BOOST_NUMERIC_INTERVAL_new_func(cos) - BOOST_NUMERIC_INTERVAL_new_func(tan) - BOOST_NUMERIC_INTERVAL_new_func(asin) - BOOST_NUMERIC_INTERVAL_new_func(acos) - BOOST_NUMERIC_INTERVAL_new_func(atan) - BOOST_NUMERIC_INTERVAL_new_func(sinh) - BOOST_NUMERIC_INTERVAL_new_func(cosh) - BOOST_NUMERIC_INTERVAL_new_func(tanh) -# undef BOOST_NUMERIC_INTERVAL_new_func -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) { BOOST_NUMERIC_INTERVAL_using_ahyp(f); return f(x); } \ - T f##_up (const T& x) { BOOST_NUMERIC_INTERVAL_using_ahyp(f); return f(x); } - BOOST_NUMERIC_INTERVAL_new_func(asinh) - BOOST_NUMERIC_INTERVAL_new_func(acosh) - BOOST_NUMERIC_INTERVAL_new_func(atanh) -# undef BOOST_NUMERIC_INTERVAL_new_func -}; - -template -struct rounded_transc_std: Rounding -{ -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_math(f); \ - this->downward(); return this->force_rounding(f(x)); } \ - T f##_up (const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_math(f); \ - this->upward(); return this->force_rounding(f(x)); } - BOOST_NUMERIC_INTERVAL_new_func(exp) - BOOST_NUMERIC_INTERVAL_new_func(log) - BOOST_NUMERIC_INTERVAL_new_func(sin) - BOOST_NUMERIC_INTERVAL_new_func(cos) - BOOST_NUMERIC_INTERVAL_new_func(tan) - BOOST_NUMERIC_INTERVAL_new_func(asin) - BOOST_NUMERIC_INTERVAL_new_func(acos) - BOOST_NUMERIC_INTERVAL_new_func(atan) - BOOST_NUMERIC_INTERVAL_new_func(sinh) - BOOST_NUMERIC_INTERVAL_new_func(cosh) - BOOST_NUMERIC_INTERVAL_new_func(tanh) -# undef BOOST_NUMERIC_INTERVAL_new_func -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ - this->downward(); return this->force_rounding(f(x)); } \ - T f##_up (const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ - this->upward(); return this->force_rounding(f(x)); } - BOOST_NUMERIC_INTERVAL_new_func(asinh) - BOOST_NUMERIC_INTERVAL_new_func(acosh) - BOOST_NUMERIC_INTERVAL_new_func(atanh) -# undef BOOST_NUMERIC_INTERVAL_new_func +#define 
BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + return f(x); \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + return f(x); \ + } + BOOST_NUMERIC_INTERVAL_new_func(exp) BOOST_NUMERIC_INTERVAL_new_func(log) BOOST_NUMERIC_INTERVAL_new_func(sin) + BOOST_NUMERIC_INTERVAL_new_func(cos) BOOST_NUMERIC_INTERVAL_new_func(tan) BOOST_NUMERIC_INTERVAL_new_func(asin) + BOOST_NUMERIC_INTERVAL_new_func(acos) BOOST_NUMERIC_INTERVAL_new_func(atan) + BOOST_NUMERIC_INTERVAL_new_func(sinh) BOOST_NUMERIC_INTERVAL_new_func(cosh) + BOOST_NUMERIC_INTERVAL_new_func(tanh) +#undef BOOST_NUMERIC_INTERVAL_new_func +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + return f(x); \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + return f(x); \ + } + BOOST_NUMERIC_INTERVAL_new_func(asinh) BOOST_NUMERIC_INTERVAL_new_func(acosh) + BOOST_NUMERIC_INTERVAL_new_func(atanh) +#undef BOOST_NUMERIC_INTERVAL_new_func }; -template -struct rounded_transc_opp: Rounding +template struct rounded_transc_std : Rounding { -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_math(f); \ - this->downward(); T y = this->force_rounding(f(x)); \ - this->upward(); return y; } \ - T f##_up (const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_math(f); \ - return this->force_rounding(f(x)); } - BOOST_NUMERIC_INTERVAL_new_func(exp) - BOOST_NUMERIC_INTERVAL_new_func(log) - BOOST_NUMERIC_INTERVAL_new_func(cos) - BOOST_NUMERIC_INTERVAL_new_func(acos) - BOOST_NUMERIC_INTERVAL_new_func(cosh) -# undef BOOST_NUMERIC_INTERVAL_new_func -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_math(f); \ - return -this->force_rounding(-f(x)); } \ - T f##_up (const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_math(f); \ - return this->force_rounding(f(x)); } - BOOST_NUMERIC_INTERVAL_new_func(sin) - BOOST_NUMERIC_INTERVAL_new_func(tan) - BOOST_NUMERIC_INTERVAL_new_func(asin) - BOOST_NUMERIC_INTERVAL_new_func(atan) - BOOST_NUMERIC_INTERVAL_new_func(sinh) - BOOST_NUMERIC_INTERVAL_new_func(tanh) -# undef BOOST_NUMERIC_INTERVAL_new_func -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ - this->downward(); T y = this->force_rounding(f(x)); \ - this->upward(); return y; } \ - T f##_up (const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ - return this->force_rounding(f(x)); } - BOOST_NUMERIC_INTERVAL_new_func(asinh) - BOOST_NUMERIC_INTERVAL_new_func(atanh) -# undef BOOST_NUMERIC_INTERVAL_new_func -# define BOOST_NUMERIC_INTERVAL_new_func(f) \ - T f##_down(const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ - return -this->force_rounding(-f(x)); } \ - T f##_up (const T& x) \ - { BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ - return this->force_rounding(f(x)); } - BOOST_NUMERIC_INTERVAL_new_func(acosh) -# undef BOOST_NUMERIC_INTERVAL_new_func +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + this->downward(); \ + return this->force_rounding(f(x)); \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + this->upward(); \ + return this->force_rounding(f(x)); \ + } + BOOST_NUMERIC_INTERVAL_new_func(exp) BOOST_NUMERIC_INTERVAL_new_func(log) BOOST_NUMERIC_INTERVAL_new_func(sin) + 
BOOST_NUMERIC_INTERVAL_new_func(cos) BOOST_NUMERIC_INTERVAL_new_func(tan) BOOST_NUMERIC_INTERVAL_new_func(asin) + BOOST_NUMERIC_INTERVAL_new_func(acos) BOOST_NUMERIC_INTERVAL_new_func(atan) + BOOST_NUMERIC_INTERVAL_new_func(sinh) BOOST_NUMERIC_INTERVAL_new_func(cosh) + BOOST_NUMERIC_INTERVAL_new_func(tanh) +#undef BOOST_NUMERIC_INTERVAL_new_func +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + this->downward(); \ + return this->force_rounding(f(x)); \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + this->upward(); \ + return this->force_rounding(f(x)); \ + } + BOOST_NUMERIC_INTERVAL_new_func(asinh) BOOST_NUMERIC_INTERVAL_new_func(acosh) + BOOST_NUMERIC_INTERVAL_new_func(atanh) +#undef BOOST_NUMERIC_INTERVAL_new_func }; - + +template struct rounded_transc_opp : Rounding +{ +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + this->downward(); \ + T y = this->force_rounding(f(x)); \ + this->upward(); \ + return y; \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + return this->force_rounding(f(x)); \ + } + BOOST_NUMERIC_INTERVAL_new_func(exp) BOOST_NUMERIC_INTERVAL_new_func(log) BOOST_NUMERIC_INTERVAL_new_func(cos) + BOOST_NUMERIC_INTERVAL_new_func(acos) BOOST_NUMERIC_INTERVAL_new_func(cosh) +#undef BOOST_NUMERIC_INTERVAL_new_func +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + return -this->force_rounding(-f(x)); \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_math(f); \ + return this->force_rounding(f(x)); \ + } + BOOST_NUMERIC_INTERVAL_new_func(sin) BOOST_NUMERIC_INTERVAL_new_func(tan) + BOOST_NUMERIC_INTERVAL_new_func(asin) BOOST_NUMERIC_INTERVAL_new_func(atan) + BOOST_NUMERIC_INTERVAL_new_func(sinh) BOOST_NUMERIC_INTERVAL_new_func(tanh) +#undef BOOST_NUMERIC_INTERVAL_new_func +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + this->downward(); \ + T y = this->force_rounding(f(x)); \ + this->upward(); \ + return y; \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + return this->force_rounding(f(x)); \ + } + BOOST_NUMERIC_INTERVAL_new_func(asinh) BOOST_NUMERIC_INTERVAL_new_func(atanh) +#undef BOOST_NUMERIC_INTERVAL_new_func +#define BOOST_NUMERIC_INTERVAL_new_func(f) \ + T f##_down(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + return -this->force_rounding(-f(x)); \ + } \ + T f##_up(const T &x) \ + { \ + BOOST_NUMERIC_INTERVAL_using_ahyp(f); \ + return this->force_rounding(f(x)); \ + } + BOOST_NUMERIC_INTERVAL_new_func(acosh) +#undef BOOST_NUMERIC_INTERVAL_new_func +}; + } // namespace interval_lib } // namespace numeric } // namespace boost diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounding.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounding.hpp index f69e2e4b..d5897836 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounding.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/rounding.hpp @@ -18,17 +18,16 @@ namespace interval_lib { * Default rounding_control class (does nothing) */ -template -struct rounding_control +template struct rounding_control { - typedef int rounding_mode; - static void get_rounding_mode(rounding_mode&) {} - static void 
set_rounding_mode(rounding_mode) {} - static void upward() {} - static void downward() {} - static void to_nearest() {} - static const T& to_int(const T& x) { return x; } - static const T& force_rounding(const T& x) { return x; } + typedef int rounding_mode; + static void get_rounding_mode(rounding_mode &) {} + static void set_rounding_mode(rounding_mode) {} + static void upward() {} + static void downward() {} + static void to_nearest() {} + static const T &to_int(const T &x) { return x; } + static const T &force_rounding(const T &x) { return x; } }; /* @@ -37,26 +36,19 @@ struct rounding_control * rounded_transc_* control the rounding of the transcendental functions */ -template > -struct rounded_arith_exact; +template > struct rounded_arith_exact; -template > -struct rounded_arith_std; +template > struct rounded_arith_std; -template > -struct rounded_arith_opp; +template > struct rounded_arith_opp; -template -struct rounded_transc_dummy; +template struct rounded_transc_dummy; -template > -struct rounded_transc_exact; +template > struct rounded_transc_exact; -template > -struct rounded_transc_std; +template > struct rounded_transc_std; -template > -struct rounded_transc_opp; +template > struct rounded_transc_opp; /* * State-saving classes: allow to set and reset rounding control @@ -64,35 +56,33 @@ struct rounded_transc_opp; namespace detail { -template -struct save_state_unprotected: Rounding +template struct save_state_unprotected : Rounding { - typedef save_state_unprotected unprotected_rounding; + typedef save_state_unprotected unprotected_rounding; }; } // namespace detail -template -struct save_state: Rounding +template struct save_state : Rounding { - typename Rounding::rounding_mode mode; - save_state() { - this->get_rounding_mode(mode); - this->init(); - } - ~save_state() { this->set_rounding_mode(mode); } - typedef detail::save_state_unprotected unprotected_rounding; + typename Rounding::rounding_mode mode; + save_state() + { + this->get_rounding_mode(mode); + this->init(); + } + ~save_state() { this->set_rounding_mode(mode); } + typedef detail::save_state_unprotected unprotected_rounding; }; - -template -struct save_state_nothing: Rounding + +template struct save_state_nothing : Rounding +{ + typedef save_state_nothing unprotected_rounding; +}; + +template struct rounded_math : save_state_nothing> { - typedef save_state_nothing unprotected_rounding; }; - -template -struct rounded_math: save_state_nothing > -{}; } // namespace interval_lib } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/transc.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/transc.hpp index 88aebd6b..081b9029 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/transc.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/transc.hpp @@ -11,219 +11,192 @@ #ifndef BOOST_NUMERIC_INTERVAL_TRANSC_HPP #define BOOST_NUMERIC_INTERVAL_TRANSC_HPP +#include #include -#include -#include -#include -#include -#include #include #include -#include +#include +#include +#include +#include +#include namespace boost { namespace numeric { -template inline -interval exp(const interval& x) +template inline interval exp(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.exp_down(x.lower()), rnd.exp_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + 
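    // Editorial note (illustrative, not part of the patch): exp is monotone
    // increasing, so the tightest enclosure of exp([a,b]) is simply
    // [exp_down(a), exp_up(b)] -- round the image of the lower bound down and
    // the image of the upper bound up. The same pattern recurs for every
    // monotone function in this file (atan, sinh, tanh, asinh, ...), while
    // periodic functions such as cos first reduce the argument modulo 2*pi.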
typename Policies::rounding rnd; + return I(rnd.exp_down(x.lower()), rnd.exp_up(x.upper()), true); } -template inline -interval log(const interval& x) +template inline interval log(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x) || - !interval_lib::user::is_pos(x.upper())) - return I::empty(); - typename Policies::rounding rnd; - typedef typename Policies::checking checking; - T l = !interval_lib::user::is_pos(x.lower()) - ? checking::neg_inf() : rnd.log_down(x.lower()); - return I(l, rnd.log_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x) || !interval_lib::user::is_pos(x.upper())) + return I::empty(); + typename Policies::rounding rnd; + typedef typename Policies::checking checking; + T l = !interval_lib::user::is_pos(x.lower()) ? checking::neg_inf() : rnd.log_down(x.lower()); + return I(l, rnd.log_up(x.upper()), true); } -template inline -interval cos(const interval& x) +template inline interval cos(const interval &x) { - if (interval_lib::detail::test_input(x)) - return interval::empty(); - typename Policies::rounding rnd; - typedef interval I; - typedef typename interval_lib::unprotect::type R; + if (interval_lib::detail::test_input(x)) + return interval::empty(); + typename Policies::rounding rnd; + typedef interval I; + typedef typename interval_lib::unprotect::type R; - // get lower bound within [0, pi] - const R pi2 = interval_lib::pi_twice(); - R tmp = fmod((const R&)x, pi2); - if (width(tmp) >= pi2.lower()) - return I(static_cast(-1), static_cast(1), true); // we are covering a full period - if (tmp.lower() >= interval_lib::constants::pi_upper()) - return -cos(tmp - interval_lib::pi()); - T l = tmp.lower(); - T u = tmp.upper(); + // get lower bound within [0, pi] + const R pi2 = interval_lib::pi_twice(); + R tmp = fmod((const R &)x, pi2); + if (width(tmp) >= pi2.lower()) + return I(static_cast(-1), static_cast(1), true); // we are covering a full period + if (tmp.lower() >= interval_lib::constants::pi_upper()) + return -cos(tmp - interval_lib::pi()); + T l = tmp.lower(); + T u = tmp.upper(); - BOOST_USING_STD_MIN(); - // separate into monotone subintervals - if (u <= interval_lib::constants::pi_lower()) - return I(rnd.cos_down(u), rnd.cos_up(l), true); - else if (u <= pi2.lower()) - return I(static_cast(-1), rnd.cos_up(min BOOST_PREVENT_MACRO_SUBSTITUTION(rnd.sub_down(pi2.lower(), u), l)), true); - else - return I(static_cast(-1), static_cast(1), true); + BOOST_USING_STD_MIN(); + // separate into monotone subintervals + if (u <= interval_lib::constants::pi_lower()) + return I(rnd.cos_down(u), rnd.cos_up(l), true); + else if (u <= pi2.lower()) + return I(static_cast(-1), + rnd.cos_up(min BOOST_PREVENT_MACRO_SUBSTITUTION(rnd.sub_down(pi2.lower(), u), l)), + true); + else + return I(static_cast(-1), static_cast(1), true); } -template inline -interval sin(const interval& x) +template inline interval sin(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - typedef typename interval_lib::unprotect::type R; - I r = cos((const R&)x - interval_lib::pi_half()); - (void)&rnd; - return r; + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + typedef typename interval_lib::unprotect::type R; + I r = cos((const R &)x - interval_lib::pi_half()); + (void)&rnd; + return r; } -template inline -interval tan(const interval& x) +template inline interval tan(const 
interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - typedef typename interval_lib::unprotect::type R; + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + typedef typename interval_lib::unprotect::type R; - // get lower bound within [-pi/2, pi/2] - const R pi = interval_lib::pi(); - R tmp = fmod((const R&)x, pi); - const T pi_half_d = interval_lib::constants::pi_half_lower(); - if (tmp.lower() >= pi_half_d) - tmp -= pi; - if (tmp.lower() <= -pi_half_d || tmp.upper() >= pi_half_d) - return I::whole(); - return I(rnd.tan_down(tmp.lower()), rnd.tan_up(tmp.upper()), true); + // get lower bound within [-pi/2, pi/2] + const R pi = interval_lib::pi(); + R tmp = fmod((const R &)x, pi); + const T pi_half_d = interval_lib::constants::pi_half_lower(); + if (tmp.lower() >= pi_half_d) + tmp -= pi; + if (tmp.lower() <= -pi_half_d || tmp.upper() >= pi_half_d) + return I::whole(); + return I(rnd.tan_down(tmp.lower()), rnd.tan_up(tmp.upper()), true); } -template inline -interval asin(const interval& x) +template inline interval asin(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x) - || x.upper() < static_cast(-1) || x.lower() > static_cast(1)) - return I::empty(); - typename Policies::rounding rnd; - T l = (x.lower() <= static_cast(-1)) - ? -interval_lib::constants::pi_half_upper() - : rnd.asin_down(x.lower()); - T u = (x.upper() >= static_cast(1) ) - ? interval_lib::constants::pi_half_upper() - : rnd.asin_up (x.upper()); - return I(l, u, true); + typedef interval I; + if (interval_lib::detail::test_input(x) || x.upper() < static_cast(-1) || x.lower() > static_cast(1)) + return I::empty(); + typename Policies::rounding rnd; + T l = (x.lower() <= static_cast(-1)) ? -interval_lib::constants::pi_half_upper() : rnd.asin_down(x.lower()); + T u = (x.upper() >= static_cast(1)) ? interval_lib::constants::pi_half_upper() : rnd.asin_up(x.upper()); + return I(l, u, true); } -template inline -interval acos(const interval& x) +template inline interval acos(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x) - || x.upper() < static_cast(-1) || x.lower() > static_cast(1)) - return I::empty(); - typename Policies::rounding rnd; - T l = (x.upper() >= static_cast(1) ) - ? static_cast(0) - : rnd.acos_down(x.upper()); - T u = (x.lower() <= static_cast(-1)) - ? interval_lib::constants::pi_upper() - : rnd.acos_up (x.lower()); - return I(l, u, true); + typedef interval I; + if (interval_lib::detail::test_input(x) || x.upper() < static_cast(-1) || x.lower() > static_cast(1)) + return I::empty(); + typename Policies::rounding rnd; + T l = (x.upper() >= static_cast(1)) ? static_cast(0) : rnd.acos_down(x.upper()); + T u = (x.lower() <= static_cast(-1)) ? 
interval_lib::constants::pi_upper() : rnd.acos_up(x.lower()); + return I(l, u, true); } -template inline -interval atan(const interval& x) +template inline interval atan(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.atan_down(x.lower()), rnd.atan_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.atan_down(x.lower()), rnd.atan_up(x.upper()), true); } -template inline -interval sinh(const interval& x) +template inline interval sinh(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.sinh_down(x.lower()), rnd.sinh_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.sinh_down(x.lower()), rnd.sinh_up(x.upper()), true); } -template inline -interval cosh(const interval& x) +template inline interval cosh(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - if (interval_lib::user::is_neg(x.upper())) - return I(rnd.cosh_down(x.upper()), rnd.cosh_up(x.lower()), true); - else if (!interval_lib::user::is_neg(x.lower())) - return I(rnd.cosh_down(x.lower()), rnd.cosh_up(x.upper()), true); - else - return I(static_cast(0), rnd.cosh_up(-x.lower() > x.upper() ? x.lower() : x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + if (interval_lib::user::is_neg(x.upper())) + return I(rnd.cosh_down(x.upper()), rnd.cosh_up(x.lower()), true); + else if (!interval_lib::user::is_neg(x.lower())) + return I(rnd.cosh_down(x.lower()), rnd.cosh_up(x.upper()), true); + else + return I(static_cast(0), rnd.cosh_up(-x.lower() > x.upper() ? x.lower() : x.upper()), true); } -template inline -interval tanh(const interval& x) +template inline interval tanh(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.tanh_down(x.lower()), rnd.tanh_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.tanh_down(x.lower()), rnd.tanh_up(x.upper()), true); } -template inline -interval asinh(const interval& x) +template inline interval asinh(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - typename Policies::rounding rnd; - return I(rnd.asinh_down(x.lower()), rnd.asinh_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + typename Policies::rounding rnd; + return I(rnd.asinh_down(x.lower()), rnd.asinh_up(x.upper()), true); } -template inline -interval acosh(const interval& x) +template inline interval acosh(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x) || x.upper() < static_cast(1)) - return I::empty(); - typename Policies::rounding rnd; - T l = x.lower() <= static_cast(1) ? 
static_cast(0) : rnd.acosh_down(x.lower()); - return I(l, rnd.acosh_up(x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x) || x.upper() < static_cast(1)) + return I::empty(); + typename Policies::rounding rnd; + T l = x.lower() <= static_cast(1) ? static_cast(0) : rnd.acosh_down(x.lower()); + return I(l, rnd.acosh_up(x.upper()), true); } -template inline -interval atanh(const interval& x) +template inline interval atanh(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x) - || x.upper() < static_cast(-1) || x.lower() > static_cast(1)) - return I::empty(); - typename Policies::rounding rnd; - typedef typename Policies::checking checking; - T l = (x.lower() <= static_cast(-1)) - ? checking::neg_inf() : rnd.atanh_down(x.lower()); - T u = (x.upper() >= static_cast(1) ) - ? checking::pos_inf() : rnd.atanh_up (x.upper()); - return I(l, u, true); + typedef interval I; + if (interval_lib::detail::test_input(x) || x.upper() < static_cast(-1) || x.lower() > static_cast(1)) + return I::empty(); + typename Policies::rounding rnd; + typedef typename Policies::checking checking; + T l = (x.lower() <= static_cast(-1)) ? checking::neg_inf() : rnd.atanh_down(x.lower()); + T u = (x.upper() >= static_cast(1)) ? checking::pos_inf() : rnd.atanh_up(x.upper()); + return I(l, u, true); } } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/utility.hpp b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/utility.hpp index b1052b19..8108a9a6 100644 --- a/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/utility.hpp +++ b/Samples/2_Concepts_and_Techniques/interval/boost/numeric/interval/utility.hpp @@ -11,11 +11,11 @@ #ifndef BOOST_NUMERIC_INTERVAL_UTILITY_HPP #define BOOST_NUMERIC_INTERVAL_UTILITY_HPP +#include #include +#include #include #include -#include -#include #include /* @@ -29,306 +29,296 @@ namespace numeric { * Utility Functions */ -template inline -const T& lower(const interval& x) +template inline const T &lower(const interval &x) { return x.lower(); } + +template inline const T &upper(const interval &x) { return x.upper(); } + +template inline T checked_lower(const interval &x) { - return x.lower(); + if (empty(x)) { + typedef typename Policies::checking checking; + return checking::nan(); + } + return x.lower(); } -template inline -const T& upper(const interval& x) +template inline T checked_upper(const interval &x) { - return x.upper(); + if (empty(x)) { + typedef typename Policies::checking checking; + return checking::nan(); + } + return x.upper(); } -template inline -T checked_lower(const interval& x) +template inline T width(const interval &x) { - if (empty(x)) { - typedef typename Policies::checking checking; - return checking::nan(); - } - return x.lower(); + if (interval_lib::detail::test_input(x)) + return static_cast(0); + typename Policies::rounding rnd; + return rnd.sub_up(x.upper(), x.lower()); } -template inline -T checked_upper(const interval& x) +template inline T median(const interval &x) { - if (empty(x)) { - typedef typename Policies::checking checking; - return checking::nan(); - } - return x.upper(); + if (interval_lib::detail::test_input(x)) { + typedef typename Policies::checking checking; + return checking::nan(); + } + typename Policies::rounding rnd; + return rnd.median(x.lower(), x.upper()); } -template inline -T width(const interval& x) +template inline interval widen(const interval &x, const T &v) { - if 
(interval_lib::detail::test_input(x)) return static_cast(0); - typename Policies::rounding rnd; - return rnd.sub_up(x.upper(), x.lower()); -} - -template inline -T median(const interval& x) -{ - if (interval_lib::detail::test_input(x)) { - typedef typename Policies::checking checking; - return checking::nan(); - } - typename Policies::rounding rnd; - return rnd.median(x.lower(), x.upper()); -} - -template inline -interval widen(const interval& x, const T& v) -{ - if (interval_lib::detail::test_input(x)) - return interval::empty(); - typename Policies::rounding rnd; - return interval(rnd.sub_down(x.lower(), v), - rnd.add_up (x.upper(), v), true); + if (interval_lib::detail::test_input(x)) + return interval::empty(); + typename Policies::rounding rnd; + return interval(rnd.sub_down(x.lower(), v), rnd.add_up(x.upper(), v), true); } /* * Set-like operations */ -template inline -bool empty(const interval& x) +template inline bool empty(const interval &x) { - return interval_lib::detail::test_input(x); + return interval_lib::detail::test_input(x); } -template inline -bool zero_in(const interval& x) +template inline bool zero_in(const interval &x) { - if (interval_lib::detail::test_input(x)) return false; - return (!interval_lib::user::is_pos(x.lower())) && - (!interval_lib::user::is_neg(x.upper())); + if (interval_lib::detail::test_input(x)) + return false; + return (!interval_lib::user::is_pos(x.lower())) && (!interval_lib::user::is_neg(x.upper())); } -template inline -bool in_zero(const interval& x) // DEPRECATED +template inline bool in_zero(const interval &x) // DEPRECATED { - return zero_in(x); + return zero_in(x); } -template inline -bool in(const T& x, const interval& y) +template inline bool in(const T &x, const interval &y) { - if (interval_lib::detail::test_input(x, y)) return false; - return y.lower() <= x && x <= y.upper(); + if (interval_lib::detail::test_input(x, y)) + return false; + return y.lower() <= x && x <= y.upper(); } -template inline -bool subset(const interval& x, - const interval& y) +template inline bool subset(const interval &x, const interval &y) { - if (empty(x)) return true; - return !empty(y) && y.lower() <= x.lower() && x.upper() <= y.upper(); + if (empty(x)) + return true; + return !empty(y) && y.lower() <= x.lower() && x.upper() <= y.upper(); } -template inline -bool proper_subset(const interval& x, - const interval& y) +template +inline bool proper_subset(const interval &x, const interval &y) { - if (empty(y)) return false; - if (empty(x)) return true; - return y.lower() <= x.lower() && x.upper() <= y.upper() && - (y.lower() != x.lower() || x.upper() != y.upper()); + if (empty(y)) + return false; + if (empty(x)) + return true; + return y.lower() <= x.lower() && x.upper() <= y.upper() && (y.lower() != x.lower() || x.upper() != y.upper()); } -template inline -bool overlap(const interval& x, - const interval& y) +template +inline bool overlap(const interval &x, const interval &y) { - if (interval_lib::detail::test_input(x, y)) return false; - return x.lower() <= y.lower() && y.lower() <= x.upper() || - y.lower() <= x.lower() && x.lower() <= y.upper(); + if (interval_lib::detail::test_input(x, y)) + return false; + return x.lower() <= y.lower() && y.lower() <= x.upper() || y.lower() <= x.lower() && x.lower() <= y.upper(); } -template inline -bool singleton(const interval& x) +template inline bool singleton(const interval &x) { - return !empty(x) && x.lower() == x.upper(); + return !empty(x) && x.lower() == x.upper(); } -template inline -bool equal(const interval& 
x, const interval& y) +template +inline bool equal(const interval &x, const interval &y) { - if (empty(x)) return empty(y); - return !empty(y) && x.lower() == y.lower() && x.upper() == y.upper(); + if (empty(x)) + return empty(y); + return !empty(y) && x.lower() == y.lower() && x.upper() == y.upper(); } -template inline -interval intersect(const interval& x, - const interval& y) +template +inline interval intersect(const interval &x, const interval &y) { - BOOST_USING_STD_MIN(); - BOOST_USING_STD_MAX(); - if (interval_lib::detail::test_input(x, y)) - return interval::empty(); - const T& l = max BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()); - const T& u = min BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()); - if (l <= u) return interval(l, u, true); - else return interval::empty(); + BOOST_USING_STD_MIN(); + BOOST_USING_STD_MAX(); + if (interval_lib::detail::test_input(x, y)) + return interval::empty(); + const T &l = max BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()); + const T &u = min BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()); + if (l <= u) + return interval(l, u, true); + else + return interval::empty(); } -template inline -interval hull(const interval& x, - const interval& y) +template +inline interval hull(const interval &x, const interval &y) { - BOOST_USING_STD_MIN(); - BOOST_USING_STD_MAX(); - bool bad_x = interval_lib::detail::test_input(x); - bool bad_y = interval_lib::detail::test_input(y); - if (bad_x) - if (bad_y) return interval::empty(); - else return y; - else - if (bad_y) return x; - return interval(min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()), - max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()), true); + BOOST_USING_STD_MIN(); + BOOST_USING_STD_MAX(); + bool bad_x = interval_lib::detail::test_input(x); + bool bad_y = interval_lib::detail::test_input(y); + if (bad_x) + if (bad_y) + return interval::empty(); + else + return y; + else if (bad_y) + return x; + return interval(min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()), + max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()), + true); } -template inline -interval hull(const interval& x, const T& y) +template inline interval hull(const interval &x, const T &y) { - BOOST_USING_STD_MIN(); - BOOST_USING_STD_MAX(); - bool bad_x = interval_lib::detail::test_input(x); - bool bad_y = interval_lib::detail::test_input(y); - if (bad_y) - if (bad_x) return interval::empty(); - else return x; - else - if (bad_x) return interval(y, y, true); - return interval(min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y), - max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y), true); + BOOST_USING_STD_MIN(); + BOOST_USING_STD_MAX(); + bool bad_x = interval_lib::detail::test_input(x); + bool bad_y = interval_lib::detail::test_input(y); + if (bad_y) + if (bad_x) + return interval::empty(); + else + return x; + else if (bad_x) + return interval(y, y, true); + return interval( + min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y), max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y), true); } -template inline -interval hull(const T& x, const interval& y) +template inline interval hull(const T &x, const interval &y) { - BOOST_USING_STD_MIN(); - BOOST_USING_STD_MAX(); - bool bad_x = interval_lib::detail::test_input(x); - bool bad_y = interval_lib::detail::test_input(y); - if (bad_x) - if (bad_y) return interval::empty(); - else return y; - else - if (bad_y) return interval(x, x, true); - return interval(min BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.lower()), - max 
BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.upper()), true); + BOOST_USING_STD_MIN(); + BOOST_USING_STD_MAX(); + bool bad_x = interval_lib::detail::test_input(x); + bool bad_y = interval_lib::detail::test_input(y); + if (bad_x) + if (bad_y) + return interval::empty(); + else + return y; + else if (bad_y) + return interval(x, x, true); + return interval( + min BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.lower()), max BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.upper()), true); } -template inline -interval hull(const T& x, const T& y) -{ - return interval::hull(x, y); -} +template inline interval hull(const T &x, const T &y) { return interval::hull(x, y); } -template inline -std::pair, interval > -bisect(const interval& x) +template +inline std::pair, interval> bisect(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return std::pair(I::empty(), I::empty()); - const T m = median(x); - return std::pair(I(x.lower(), m, true), I(m, x.upper(), true)); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return std::pair(I::empty(), I::empty()); + const T m = median(x); + return std::pair(I(x.lower(), m, true), I(m, x.upper(), true)); } /* * Elementary functions */ -template inline -T norm(const interval& x) +template inline T norm(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) { - typedef typename Policies::checking checking; - return checking::nan(); - } - BOOST_USING_STD_MAX(); - return max BOOST_PREVENT_MACRO_SUBSTITUTION(static_cast(-x.lower()), x.upper()); + typedef interval I; + if (interval_lib::detail::test_input(x)) { + typedef typename Policies::checking checking; + return checking::nan(); + } + BOOST_USING_STD_MAX(); + return max BOOST_PREVENT_MACRO_SUBSTITUTION(static_cast(-x.lower()), x.upper()); } -template inline -interval abs(const interval& x) +template inline interval abs(const interval &x) { - typedef interval I; - if (interval_lib::detail::test_input(x)) - return I::empty(); - if (!interval_lib::user::is_neg(x.lower())) return x; - if (!interval_lib::user::is_pos(x.upper())) return -x; - BOOST_USING_STD_MAX(); - return I(static_cast(0), max BOOST_PREVENT_MACRO_SUBSTITUTION(static_cast(-x.lower()), x.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x)) + return I::empty(); + if (!interval_lib::user::is_neg(x.lower())) + return x; + if (!interval_lib::user::is_pos(x.upper())) + return -x; + BOOST_USING_STD_MAX(); + return I(static_cast(0), max BOOST_PREVENT_MACRO_SUBSTITUTION(static_cast(-x.lower()), x.upper()), true); } -template inline -interval max BOOST_PREVENT_MACRO_SUBSTITUTION (const interval& x, - const interval& y) +template +inline interval max BOOST_PREVENT_MACRO_SUBSTITUTION(const interval &x, + const interval &y) { - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - BOOST_USING_STD_MAX(); - return I(max BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()), max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + BOOST_USING_STD_MAX(); + return I(max BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()), + max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()), + true); } -template inline -interval max BOOST_PREVENT_MACRO_SUBSTITUTION (const interval& x, const T& y) +template +inline interval max BOOST_PREVENT_MACRO_SUBSTITUTION(const interval &x, const T &y) { - typedef interval I; - if 
(interval_lib::detail::test_input(x, y)) - return I::empty(); - BOOST_USING_STD_MAX(); - return I(max BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y), max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y), true); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + BOOST_USING_STD_MAX(); + return I( + max BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y), max BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y), true); } -template inline -interval max BOOST_PREVENT_MACRO_SUBSTITUTION (const T& x, const interval& y) +template +inline interval max BOOST_PREVENT_MACRO_SUBSTITUTION(const T &x, const interval &y) { - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - BOOST_USING_STD_MAX(); - return I(max BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.lower()), max BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + BOOST_USING_STD_MAX(); + return I( + max BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.lower()), max BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.upper()), true); } -template inline -interval min BOOST_PREVENT_MACRO_SUBSTITUTION (const interval& x, - const interval& y) +template +inline interval min BOOST_PREVENT_MACRO_SUBSTITUTION(const interval &x, + const interval &y) { - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - BOOST_USING_STD_MIN(); - return I(min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()), min BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + BOOST_USING_STD_MIN(); + return I(min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y.lower()), + min BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y.upper()), + true); } -template inline -interval min BOOST_PREVENT_MACRO_SUBSTITUTION (const interval& x, const T& y) +template +inline interval min BOOST_PREVENT_MACRO_SUBSTITUTION(const interval &x, const T &y) { - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - BOOST_USING_STD_MIN(); - return I(min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y), min BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y), true); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + BOOST_USING_STD_MIN(); + return I( + min BOOST_PREVENT_MACRO_SUBSTITUTION(x.lower(), y), min BOOST_PREVENT_MACRO_SUBSTITUTION(x.upper(), y), true); } -template inline -interval min BOOST_PREVENT_MACRO_SUBSTITUTION (const T& x, const interval& y) +template +inline interval min BOOST_PREVENT_MACRO_SUBSTITUTION(const T &x, const interval &y) { - typedef interval I; - if (interval_lib::detail::test_input(x, y)) - return I::empty(); - BOOST_USING_STD_MIN(); - return I(min BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.lower()), min BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.upper()), true); + typedef interval I; + if (interval_lib::detail::test_input(x, y)) + return I::empty(); + BOOST_USING_STD_MIN(); + return I( + min BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.lower()), min BOOST_PREVENT_MACRO_SUBSTITUTION(x, y.upper()), true); } } // namespace numeric diff --git a/Samples/2_Concepts_and_Techniques/interval/cpu_interval.h b/Samples/2_Concepts_and_Techniques/interval/cpu_interval.h index 560bbaf3..c56a7cc4 100644 --- a/Samples/2_Concepts_and_Techniques/interval/cpu_interval.h +++ b/Samples/2_Concepts_and_Techniques/interval/cpu_interval.h @@ -26,8 +26,8 @@ */ /* Simple CPU implementation -* 
Depends on Boost.Interval -*/ + * Depends on Boost.Interval + */ #ifndef CPU_INTERVAL_H #define CPU_INTERVAL_H @@ -36,76 +36,82 @@ #define __USE_ISOC99 #endif +#include #include #include -#include -//#include -#define UNPROTECTED 0 +#include "cuda_interval.h" +// #include + +#define UNPROTECTED 0 #define USE_RECURSION_CPU 1 using boost::numeric::interval; using namespace boost::numeric; -template -class global_stack_cpu { - private: - T *buf; - int free_index; +template class global_stack_cpu +{ +private: + T *buf; + int free_index; - public: - // buf should point to an allocated global buffer of size N * THREADS * - // sizeof(T) - global_stack_cpu(T *buf, int thread_id) : buf(buf), free_index(thread_id) {} +public: + // buf should point to an allocated global buffer of size N * THREADS * + // sizeof(T) + global_stack_cpu(T *buf, int thread_id) + : buf(buf) + , free_index(thread_id) + { + } - void push(T const &v) { - buf[free_index] = v; - free_index += THREADS; - } - T pop() { - free_index -= THREADS; - return buf[free_index]; - } - bool full() { return free_index >= N * THREADS; } - bool empty() { return free_index < THREADS; } - int size() { return free_index / THREADS; } + void push(T const &v) + { + buf[free_index] = v; + free_index += THREADS; + } + T pop() + { + free_index -= THREADS; + return buf[free_index]; + } + bool full() { return free_index >= N * THREADS; } + bool empty() { return free_index < THREADS; } + int size() { return free_index / THREADS; } }; // The function F of which we want to find roots, defined on intervals // Should typically depend on thread_id (indexing an array of coefficients...) -template -I f_cpu(I const &x, int thread_id) { - typedef typename I::base_type T; - T alpha = -T(thread_id) / T(THREADS); - return square(x - I(1)) + I(alpha) * x; +template I f_cpu(I const &x, int thread_id) +{ + typedef typename I::base_type T; + T alpha = -T(thread_id) / T(THREADS); + return square(x - I(1)) + I(alpha) * x; } // First derivative of F, also defined on intervals -template -I fd_cpu(I const &x, int thread_id) { - typedef typename I::base_type T; - T alpha = -T(thread_id) / T(THREADS); - return I(2) * x + I(alpha - 2); +template I fd_cpu(I const &x, int thread_id) +{ + typedef typename I::base_type T; + T alpha = -T(thread_id) / T(THREADS); + return I(2) * x + I(alpha - 2); } // Is this interval small enough to stop iterating? -template -bool is_minimal_cpu(I const &x, int thread_id) { - typedef typename I::base_type T; - T const epsilon_x = 1e-6f; - T const epsilon_y = 1e-6f; - return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) || - width(f_cpu(x, thread_id)) <= epsilon_y); +template bool is_minimal_cpu(I const &x, int thread_id) +{ + typedef typename I::base_type T; + T const epsilon_x = 1e-6f; + T const epsilon_y = 1e-6f; + return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) || width(f_cpu(x, thread_id)) <= epsilon_y); } // In some cases, Newton iterations converge slowly. // Bisecting the interval accelerates convergence. 
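// Editorial sketch (not part of the patch): should_bisect_cpu() below reports
// that a Newton step was "too weak" when neither resulting piece shrank below
// alpha * width(x) (alpha = 0.99 in this sample, i.e. less than 1% progress);
// the caller then splits at the median instead. A standalone illustration of
// that fallback, assuming an interval type I with lower()/upper() accessors,
// an I(lo, hi) constructor, and a median() function:
template <class I>
void bisect_at_median(I const &x, I &left, I &right)
{
    typename I::base_type m = median(x); // midpoint, rounded to nearest
    left  = I(x.lower(), m);             // lower half [lo, m]
    right = I(m, x.upper());             // upper half [m, hi]
}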
-template -bool should_bisect_cpu(I const &x, I const &x1, I const &x2, - typename I::base_type alpha) { - typedef typename I::base_type T; - T wmax = alpha * width(x); - return width(x1) > wmax || width(x2) > wmax; +template bool should_bisect_cpu(I const &x, I const &x1, I const &x2, typename I::base_type alpha) +{ + typedef typename I::base_type T; + T wmax = alpha * width(x); + return width(x1) > wmax || width(x2) > wmax; } int const DEPTH_WORK = 128; @@ -115,193 +121,190 @@ int const DEPTH_WORK = 128; // Always keep the next interval to work on in registers (avoids excessive // spilling to local mem) template -void newton_interval_cpu(global_stack_cpu &result, - I const &ix0, int thread_id) { - typedef typename I::base_type T; +void newton_interval_cpu(global_stack_cpu &result, I const &ix0, int thread_id) +{ + typedef typename I::base_type T; - T const alpha = .99f; // Threshold before switching to bisection + T const alpha = .99f; // Threshold before switching to bisection - // Intervals to be processed - I local_buffer[DEPTH_WORK]; - global_stack_cpu work(local_buffer, 0); + // Intervals to be processed + I local_buffer[DEPTH_WORK]; + global_stack_cpu work(local_buffer, 0); - // We start with the whole domain - I ix = ix0; + // We start with the whole domain + I ix = ix0; + + while (true) { + // Compute (x - F({x})/F'(ix)) inter ix + // -> may yield 0, 1 or 2 intervals + T x = median(ix); + I iq = f_cpu(I(x), thread_id); + I id = fd_cpu(ix, thread_id); + + bool has_part2; + I part1, part2; + part1 = division_part1(iq, id, has_part2); + part1 = intersect(I(x) - part1, ix); + + if (has_part2) { + part2 = division_part2(iq, id); + part2 = intersect(I(x) - part2, ix); + } + + // Do we have small-enough intervals? + if (is_minimal_cpu(part1, thread_id)) { + result.push(part1); + part1 = I::empty(); + } + + if (has_part2 && is_minimal_cpu(part2, thread_id)) { + result.push(part2); + part2 = I::empty(); + } + + if (should_bisect_cpu(ix, part1, part2, alpha)) { + // Not so good improvement + // Switch to bisection method for this step + part1 = I(ix.lower(), x); + part2 = I(x, ix.upper()); + has_part2 = true; + } + + if ((part1.lower() <= part1.upper()) && !empty(part1)) { + // At least 1 solution + // We will compute part1 next + ix = part1; + + if (has_part2 && !empty(part2)) { + // 2 solutions + // Save the second solution for later + work.push(part2); + } + } + else if (has_part2 && !empty(part2)) { + // 1 solution + // Work on that next + ix = part2; + } + else { + // No solution + // Do we still have work to do in the stack? + if (work.empty()) // If not, we are done + break; + else + ix = work.pop(); // Otherwise, pick an interval to work on + } + } +} + +template +void newton_interval_rec_cpu(global_stack_cpu &result, I const &ix, int thread_id) +{ + typedef typename I::base_type T; + T const alpha = .99f; // Threshold before switching to bisection + + if (is_minimal_cpu(ix, thread_id)) { + result.push(ix); + return; + } - while (true) { // Compute (x - F({x})/F'(ix)) inter ix // -> may yield 0, 1 or 2 intervals - T x = median(ix); + T x = median(ix); I iq = f_cpu(I(x), thread_id); I id = fd_cpu(ix, thread_id); bool has_part2; - I part1, part2; + I part1, part2; part1 = division_part1(iq, id, has_part2); part1 = intersect(I(x) - part1, ix); if (has_part2) { - part2 = division_part2(iq, id); - part2 = intersect(I(x) - part2, ix); - } - - // Do we have small-enough intervals? 
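// Editorial note (illustrative, not part of the patch): division_part1/2
// implement extended interval division. When the denominator F'(ix) contains
// zero, the quotient f(m)/F'(ix) is a union of two half-infinite intervals:
// part1 is always produced, and has_part2 flags whether a second piece
// exists. Intersecting x - f(m)/F'(ix) with ix therefore yields 0, 1 or 2
// candidate subintervals, which is why the loop may push extra work items.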
- if (is_minimal_cpu(part1, thread_id)) { - result.push(part1); - part1 = I::empty(); - } - - if (has_part2 && is_minimal_cpu(part2, thread_id)) { - result.push(part2); - part2 = I::empty(); + part2 = division_part2(iq, id); + part2 = intersect(I(x) - part2, ix); } if (should_bisect_cpu(ix, part1, part2, alpha)) { - // Not so good improvement - // Switch to bisection method for this step - part1 = I(ix.lower(), x); - part2 = I(x, ix.upper()); - has_part2 = true; + // Not so good improvement + // Switch to bisection method for this step + part1 = I(ix.lower(), x); + part2 = I(x, ix.upper()); + has_part2 = true; } - if ((part1.lower() <= part1.upper()) && !empty(part1)) { - // At least 1 solution - // We will compute part1 next - ix = part1; - - if (has_part2 && !empty(part2)) { - // 2 solutions - // Save the second solution for later - work.push(part2); - } - } else if (has_part2 && !empty(part2)) { - // 1 solution - // Work on that next - ix = part2; - } else { - // No solution - // Do we still have work to do in the stack? - if (work.empty()) // If not, we are done - break; - else - ix = work.pop(); // Otherwise, pick an interval to work on + if ((part1.lower() <= part1.upper()) && (!empty(part1))) { + newton_interval_rec_cpu(result, part1, thread_id); + } + + if (has_part2 && !empty(part2)) { + newton_interval_rec_cpu(result, part2, thread_id); } - } } -template -void newton_interval_rec_cpu(global_stack_cpu &result, - I const &ix, int thread_id) { - typedef typename I::base_type T; - T const alpha = .99f; // Threshold before switching to bisection +template void test_interval_newton_cpu(I *buffer, int *nresults, I i) +{ + typedef typename I::base_type T; - if (is_minimal_cpu(ix, thread_id)) { - result.push(ix); - return; - } - - // Compute (x - F({x})/F'(ix)) inter ix - // -> may yield 0, 1 or 2 intervals - T x = median(ix); - I iq = f_cpu(I(x), thread_id); - I id = fd_cpu(ix, thread_id); - - bool has_part2; - I part1, part2; - part1 = division_part1(iq, id, has_part2); - part1 = intersect(I(x) - part1, ix); - - if (has_part2) { - part2 = division_part2(iq, id); - part2 = intersect(I(x) - part2, ix); - } - - if (should_bisect_cpu(ix, part1, part2, alpha)) { - // Not so good improvement - // Switch to bisection method for this step - part1 = I(ix.lower(), x); - part2 = I(x, ix.upper()); - has_part2 = true; - } - - if ((part1.lower() <= part1.upper()) && (!empty(part1))) { - newton_interval_rec_cpu(result, part1, thread_id); - } - - if (has_part2 && !empty(part2)) { - newton_interval_rec_cpu(result, part2, thread_id); - } -} - -template -void test_interval_newton_cpu(I *buffer, int *nresults, I i) { - typedef typename I::base_type T; - - // Intervals to return - // std::vector local_buffer(BLOCK_SIZE * GRID_SIZE * DEPTH_WORK); - for (int thread_id = 0; thread_id != BLOCK_SIZE * GRID_SIZE; ++thread_id) { - global_stack_cpu result(buffer, thread_id); + // Intervals to return + // std::vector local_buffer(BLOCK_SIZE * GRID_SIZE * DEPTH_WORK); + for (int thread_id = 0; thread_id != BLOCK_SIZE * GRID_SIZE; ++thread_id) { + global_stack_cpu result(buffer, thread_id); #if USE_RECURSION_CPU - newton_interval_rec_cpu(result, i, thread_id); + newton_interval_rec_cpu(result, i, thread_id); #else - newton_interval_cpu(result, i, thread_id); + newton_interval_cpu(result, i, thread_id); #endif - nresults[thread_id] = result.size(); - } + nresults[thread_id] = result.size(); + } } -typedef interval, - interval_lib::checking_base > > - Ibase; +typedef interval, interval_lib::checking_base>> Ibase; #if 
UNPROTECTED typedef interval_lib::unprotect::type I_CPU; -Ibase::traits_type::rounding rnd; +Ibase::traits_type::rounding rnd; #else typedef Ibase I_CPU; #endif -bool checkAgainstHost(int *h_nresults, int *h_nresults_cpu, I_CPU *h_result, - I_CPU *h_result_cpu) { - std::cout << "\nCheck against Host computation...\n\n"; - int success = 1; - int success1 = 1; - int success2 = 1; +bool checkAgainstHost(int *h_nresults, int *h_nresults_cpu, I_CPU *h_result, I_CPU *h_result_cpu) +{ + std::cout << "\nCheck against Host computation...\n\n"; + int success = 1; + int success1 = 1; + int success2 = 1; - if (h_nresults_cpu[0] == h_nresults[0]) { - for (int i = 0; i != h_nresults[0]; ++i) { - TYPE diff1 = abs(h_result[THREADS * i + 0].lower() - - h_result_cpu[THREADS * i + 0].lower()); - TYPE diff2 = abs(h_result[THREADS * i + 0].upper() - - h_result_cpu[THREADS * i + 0].upper()); + if (h_nresults_cpu[0] == h_nresults[0]) { + for (int i = 0; i != h_nresults[0]; ++i) { + TYPE diff1 = abs(h_result[THREADS * i + 0].lower() - h_result_cpu[THREADS * i + 0].lower()); + TYPE diff2 = abs(h_result[THREADS * i + 0].upper() - h_result_cpu[THREADS * i + 0].upper()); - if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) { - success1 = 0; - break; - } + if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) { + success1 = 0; + break; + } + } + + // in case the two intervals are reversed + for (int i = 0; i != h_nresults[0]; ++i) { + TYPE diff1 = + abs(h_result[THREADS * i + 0].lower() - h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].lower()); + TYPE diff2 = + abs(h_result[THREADS * i + 0].upper() - h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].upper()); + + if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) { + success2 = 0; + break; + } + } + + success = success1 || success2; } + else + success = 0; - // in case the two intervals are reversed - for (int i = 0; i != h_nresults[0]; ++i) { - TYPE diff1 = - abs(h_result[THREADS * i + 0].lower() - - h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].lower()); - TYPE diff2 = - abs(h_result[THREADS * i + 0].upper() - - h_result_cpu[THREADS * (h_nresults[0] - i - 1) + 0].upper()); - - if ((diff1 > 1.0e-6f) || (diff2 > 1.0e-6f)) { - success2 = 0; - break; - } - } - - success = success1 || success2; - } else - success = 0; - - return (bool)success; + return (bool)success; } #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/cuda_interval.h b/Samples/2_Concepts_and_Techniques/interval/cuda_interval.h index 10ad3e90..62f6b6e9 100644 --- a/Samples/2_Concepts_and_Techniques/interval/cuda_interval.h +++ b/Samples/2_Concepts_and_Techniques/interval/cuda_interval.h @@ -28,89 +28,95 @@ #ifndef CUDA_INTERVAL_H #define CUDA_INTERVAL_H -#include "interval.h" #include "cuda_interval_lib.h" +#include "interval.h" // Stack in local memory. Managed independently for each thread. 
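// Editorial sketch (not part of the patch): typical depth-first use of the
// per-thread stack defined just below. The int payload, capacity 64, and the
// function name are hypothetical; the sample itself stores intervals. Assumes
// the class template is local_stack<T, N>, as suggested by the buf[N] member:
__device__ void drain_worklist(local_stack<int, 64> &work)
{
    work.push(0); // seed with a root work item
    while (!work.empty()) {
        int item = work.pop(); // LIFO order gives depth-first traversal
        if (item < 7) {        // expand two children while capacity allows
            if (!work.full())
                work.push(2 * item + 1);
            if (!work.full())
                work.push(2 * item + 2);
        }
    }
}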
-template -class local_stack { - private: - T buf[N]; - int tos; +template class local_stack +{ +private: + T buf[N]; + int tos; - public: - __device__ local_stack() : tos(-1) {} - __device__ T const &top() const { return buf[tos]; } - __device__ T &top() { return buf[tos]; } - __device__ void push(T const &v) { buf[++tos] = v; } - __device__ T pop() { return buf[tos--]; } - __device__ bool full() { return tos == (N - 1); } - __device__ bool empty() { return tos == -1; } +public: + __device__ local_stack() + : tos(-1) + { + } + __device__ T const &top() const { return buf[tos]; } + __device__ T &top() { return buf[tos]; } + __device__ void push(T const &v) { buf[++tos] = v; } + __device__ T pop() { return buf[tos--]; } + __device__ bool full() { return tos == (N - 1); } + __device__ bool empty() { return tos == -1; } }; // Stacks in global memory. // Same function as local_stack, but accessible from the host. // Interleaved between threads by blocks of THREADS elements. // Independent stack for each thread, no sharing of data between threads. -template -class global_stack { - private: - T *buf; - int free_index; +template class global_stack +{ +private: + T *buf; + int free_index; - public: - // buf should point to an allocated global buffer of - // size N * THREADS * sizeof(T) - __device__ global_stack(T *buf, int thread_id) - : buf(buf), free_index(thread_id) {} +public: + // buf should point to an allocated global buffer of + // size N * THREADS * sizeof(T) + __device__ global_stack(T *buf, int thread_id) + : buf(buf) + , free_index(thread_id) + { + } - __device__ void push(T const &v) { - buf[free_index] = v; - free_index += THREADS; - } - __device__ T pop() { - free_index -= THREADS; - return buf[free_index]; - } - __device__ bool full() { return free_index >= N * THREADS; } - __device__ bool empty() { return free_index < THREADS; } - __device__ int size() { return free_index / THREADS; } + __device__ void push(T const &v) + { + buf[free_index] = v; + free_index += THREADS; + } + __device__ T pop() + { + free_index -= THREADS; + return buf[free_index]; + } + __device__ bool full() { return free_index >= N * THREADS; } + __device__ bool empty() { return free_index < THREADS; } + __device__ int size() { return free_index / THREADS; } }; // The function F of which we want to find roots, defined on intervals // Should typically depend on thread_id (indexing an array of coefficients...) -template -__device__ interval_gpu f(interval_gpu const &x, int thread_id) { - typedef interval_gpu I; - T alpha = -T(thread_id) / T(THREADS); - return square(x - I(1)) + I(alpha) * x; +template __device__ interval_gpu f(interval_gpu const &x, int thread_id) +{ + typedef interval_gpu I; + T alpha = -T(thread_id) / T(THREADS); + return square(x - I(1)) + I(alpha) * x; } // First derivative of F, also defined on intervals -template -__device__ interval_gpu fd(interval_gpu const &x, int thread_id) { - typedef interval_gpu I; - T alpha = -T(thread_id) / T(THREADS); - return I(2) * x + I(alpha - 2); +template __device__ interval_gpu fd(interval_gpu const &x, int thread_id) +{ + typedef interval_gpu I; + T alpha = -T(thread_id) / T(THREADS); + return I(2) * x + I(alpha - 2); } // Is this interval small enough to stop iterating? 
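// The acceptance test that follows is two-sided: an interval counts as
// "minimal" once width(x) <= epsilon_x * |median(x)| (tight enough in x) or
// width(f(x)) <= epsilon_y (the function is uniformly tiny over x), with
// epsilon_x = epsilon_y = 1e-6f. Either way, further Newton or bisection
// steps cannot usefully shrink the enclosure.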
-template -__device__ bool is_minimal(interval_gpu const &x, int thread_id) { - T const epsilon_x = 1e-6f; - T const epsilon_y = 1e-6f; - return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) || - width(f(x, thread_id)) <= epsilon_y); +template __device__ bool is_minimal(interval_gpu const &x, int thread_id) +{ + T const epsilon_x = 1e-6f; + T const epsilon_y = 1e-6f; + return !empty(x) && (width(x) <= epsilon_x * abs(median(x)) || width(f(x, thread_id)) <= epsilon_y); } // In some cases, Newton iterations converge slowly. // Bisecting the interval accelerates convergence. template -__device__ bool should_bisect(interval_gpu const &x, - interval_gpu const &x1, - interval_gpu const &x2, T alpha) { - T wmax = alpha * width(x); - return (!empty(x1) && width(x1) > wmax) || (!empty(x2) && width(x2) > wmax); +__device__ bool should_bisect(interval_gpu const &x, interval_gpu const &x1, interval_gpu const &x2, T alpha) +{ + T wmax = alpha * width(x); + return (!empty(x1) && width(x1) > wmax) || (!empty(x2) && width(x2) > wmax); } // Main interval Newton loop. @@ -118,214 +124,219 @@ __device__ bool should_bisect(interval_gpu const &x, // Always keep the next interval to work on in registers // (avoids excessive spilling to local mem) template -__device__ void newton_interval( - global_stack, DEPTH_RESULT, THREADS> &result, - interval_gpu const &ix0, int thread_id) { - typedef interval_gpu I; - int const DEPTH_WORK = 128; +__device__ void +newton_interval(global_stack, DEPTH_RESULT, THREADS> &result, interval_gpu const &ix0, int thread_id) +{ + typedef interval_gpu I; + int const DEPTH_WORK = 128; - T const alpha = .99f; // Threshold before switching to bisection + T const alpha = .99f; // Threshold before switching to bisection - // Intervals to be processed - local_stack work; + // Intervals to be processed + local_stack work; - // We start with the whole domain - I ix = ix0; + // We start with the whole domain + I ix = ix0; - while (true) { - // Compute (x - F({x})/F'(ix)) inter ix - // -> may yield 0, 1 or 2 intervals - T x = median(ix); - I iq = f(I(x), thread_id); - I id = fd(ix, thread_id); + while (true) { + // Compute (x - F({x})/F'(ix)) inter ix + // -> may yield 0, 1 or 2 intervals + T x = median(ix); + I iq = f(I(x), thread_id); + I id = fd(ix, thread_id); - bool has_part2; - I part1, part2 = I::empty(); - part1 = division_part1(iq, id, has_part2); - part1 = intersect(I(x) - part1, ix); + bool has_part2; + I part1, part2 = I::empty(); + part1 = division_part1(iq, id, has_part2); + part1 = intersect(I(x) - part1, ix); - if (has_part2) { - part2 = division_part2(iq, id); - part2 = intersect(I(x) - part2, ix); + if (has_part2) { + part2 = division_part2(iq, id); + part2 = intersect(I(x) - part2, ix); + } + + // Do we have small-enough intervals? 
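// Converged parts are pushed onto the result stack immediately and then reset
// to I::empty(), so the bisection test and the work-scheduling logic below
// only ever see parts that still need refinement.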
+ if (is_minimal(part1, thread_id)) { + result.push(part1); + part1 = I::empty(); + } + + if (has_part2 && is_minimal(part2, thread_id)) { + result.push(part2); + part2 = I::empty(); + } + + if (should_bisect(ix, part1, part2, alpha)) { + // Not so good improvement + // Switch to bisection method for this step + part1 = I(ix.lower(), x); + part2 = I(x, ix.upper()); + has_part2 = true; + } + + if (!empty(part1)) { + // At least 1 solution + // We will compute part1 next + ix = part1; + + if (has_part2 && !empty(part2)) { + // 2 solutions + // Save the second solution for later + work.push(part2); + } + } + else if (has_part2 && !empty(part2)) { + // 1 solution + // Work on that next + ix = part2; + } + else { + // No solution + // Do we still have work to do in the stack? + if (work.empty()) // If not, we are done + break; + else + ix = work.pop(); // Otherwise, pick an interval to work on + } } - - // Do we have small-enough intervals? - if (is_minimal(part1, thread_id)) { - result.push(part1); - part1 = I::empty(); - } - - if (has_part2 && is_minimal(part2, thread_id)) { - result.push(part2); - part2 = I::empty(); - } - - if (should_bisect(ix, part1, part2, alpha)) { - // Not so good improvement - // Switch to bisection method for this step - part1 = I(ix.lower(), x); - part2 = I(x, ix.upper()); - has_part2 = true; - } - - if (!empty(part1)) { - // At least 1 solution - // We will compute part1 next - ix = part1; - - if (has_part2 && !empty(part2)) { - // 2 solutions - // Save the second solution for later - work.push(part2); - } - } else if (has_part2 && !empty(part2)) { - // 1 solution - // Work on that next - ix = part2; - } else { - // No solution - // Do we still have work to do in the stack? - if (work.empty()) // If not, we are done - break; - else - ix = work.pop(); // Otherwise, pick an interval to work on - } - } } // Recursive implementation template -__device__ void newton_interval_rec( - global_stack, DEPTH_RESULT, THREADS> &result, - interval_gpu const &ix, int thread_id) { - typedef interval_gpu I; - T const alpha = .99f; // Threshold before switching to bisection +__device__ void newton_interval_rec(global_stack, DEPTH_RESULT, THREADS> &result, + interval_gpu const &ix, + int thread_id) +{ + typedef interval_gpu I; + T const alpha = .99f; // Threshold before switching to bisection - if (is_minimal(ix, thread_id)) { - result.push(ix); - return; - } + if (is_minimal(ix, thread_id)) { + result.push(ix); + return; + } - // Compute (x - F({x})/F'(ix)) inter ix - // -> may yield 0, 1 or 2 intervals - T x = median(ix); - I iq = f(I(x), thread_id); - I id = fd(ix, thread_id); + // Compute (x - F({x})/F'(ix)) inter ix + // -> may yield 0, 1 or 2 intervals + T x = median(ix); + I iq = f(I(x), thread_id); + I id = fd(ix, thread_id); - bool has_part2; - I part1, part2 = I::empty(); - part1 = division_part1(iq, id, has_part2); - part1 = intersect(I(x) - part1, ix); + bool has_part2; + I part1, part2 = I::empty(); + part1 = division_part1(iq, id, has_part2); + part1 = intersect(I(x) - part1, ix); - if (has_part2) { - part2 = division_part2(iq, id); - part2 = intersect(I(x) - part2, ix); - } + if (has_part2) { + part2 = division_part2(iq, id); + part2 = intersect(I(x) - part2, ix); + } - if (should_bisect(ix, part1, part2, alpha)) { - // Not so good improvement - // Switch to bisection method for this step - part1 = I(ix.lower(), x); - part2 = I(x, ix.upper()); - has_part2 = true; - } + if (should_bisect(ix, part1, part2, alpha)) { + // Not so good improvement + // Switch to bisection 
method for this step + part1 = I(ix.lower(), x); + part2 = I(x, ix.upper()); + has_part2 = true; + } - if (has_part2 && !empty(part2)) { - newton_interval_rec(result, part2, thread_id); - } + if (has_part2 && !empty(part2)) { + newton_interval_rec(result, part2, thread_id); + } - if (!empty(part1)) { - newton_interval_rec(result, part1, thread_id); - } + if (!empty(part1)) { + newton_interval_rec(result, part1, thread_id); + } } // Naive implementation, no attempt to keep the top of the stack in registers template -__device__ void newton_interval_naive( - global_stack, DEPTH_RESULT, THREADS> &result, - interval_gpu const &ix0, int thread_id) { - typedef interval_gpu I; - int const DEPTH_WORK = 128; - T const alpha = .99f; // Threshold before switching to bisection +__device__ void newton_interval_naive(global_stack, DEPTH_RESULT, THREADS> &result, + interval_gpu const &ix0, + int thread_id) +{ + typedef interval_gpu I; + int const DEPTH_WORK = 128; + T const alpha = .99f; // Threshold before switching to bisection - // Intervals to be processed - local_stack work; + // Intervals to be processed + local_stack work; - // We start with the whole domain - work.push(ix0); + // We start with the whole domain + work.push(ix0); - while (!work.empty()) { - I ix = work.pop(); + while (!work.empty()) { + I ix = work.pop(); - if (is_minimal(ix, thread_id)) { - result.push(ix); - } else { - // Compute (x - F({x})/F'(ix)) inter ix - // -> may yield 0, 1 or 2 intervals - T x = median(ix); - I iq = f(I(x), thread_id); - I id = fd(ix, thread_id); + if (is_minimal(ix, thread_id)) { + result.push(ix); + } + else { + // Compute (x - F({x})/F'(ix)) inter ix + // -> may yield 0, 1 or 2 intervals + T x = median(ix); + I iq = f(I(x), thread_id); + I id = fd(ix, thread_id); - bool has_part2; - I part1, part2 = I::empty(); - part1 = division_part1(iq, id, has_part2); - part1 = intersect(I(x) - part1, ix); + bool has_part2; + I part1, part2 = I::empty(); + part1 = division_part1(iq, id, has_part2); + part1 = intersect(I(x) - part1, ix); - if (has_part2) { - part2 = division_part2(iq, id); - part2 = intersect(I(x) - part2, ix); - } + if (has_part2) { + part2 = division_part2(iq, id); + part2 = intersect(I(x) - part2, ix); + } - if (should_bisect(ix, part1, part2, alpha)) { - // Not so good improvement - // Switch to bisection method for this step - part1 = I(ix.lower(), x); - part2 = I(x, ix.upper()); - has_part2 = true; - } + if (should_bisect(ix, part1, part2, alpha)) { + // Not so good improvement + // Switch to bisection method for this step + part1 = I(ix.lower(), x); + part2 = I(x, ix.upper()); + has_part2 = true; + } - if (!empty(part1)) { - work.push(part1); - } + if (!empty(part1)) { + work.push(part1); + } - if (has_part2 && !empty(part2)) { - work.push(part2); - } + if (has_part2 && !empty(part2)) { + work.push(part2); + } + } } - } } template -__global__ void test_interval_newton(interval_gpu *buffer, int *nresults, - interval_gpu i, - int implementation_choice) { - int thread_id = blockIdx.x * BLOCK_SIZE + threadIdx.x; - typedef interval_gpu I; +__global__ void +test_interval_newton(interval_gpu *buffer, int *nresults, interval_gpu i, int implementation_choice) +{ + int thread_id = blockIdx.x * BLOCK_SIZE + threadIdx.x; + typedef interval_gpu I; - // Intervals to return - global_stack result(buffer, thread_id); + // Intervals to return + global_stack result(buffer, thread_id); - switch (implementation_choice) { + switch (implementation_choice) { case 0: - newton_interval_naive(result, i, thread_id); - 
break; + newton_interval_naive(result, i, thread_id); + break; case 1: - newton_interval(result, i, thread_id); - break; + newton_interval(result, i, thread_id); + break; #if (__CUDA_ARCH__ >= 200) case 2: - newton_interval_rec(result, i, thread_id); - break; + newton_interval_rec(result, i, thread_id); + break; #endif default: - newton_interval_naive(result, i, thread_id); - } + newton_interval_naive(result, i, thread_id); + } - nresults[thread_id] = result.size(); + nresults[thread_id] = result.size(); } #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/cuda_interval_lib.h b/Samples/2_Concepts_and_Techniques/interval/cuda_interval_lib.h index 5d4cea9f..b8fbdb7e 100644 --- a/Samples/2_Concepts_and_Techniques/interval/cuda_interval_lib.h +++ b/Samples/2_Concepts_and_Techniques/interval/cuda_interval_lib.h @@ -33,308 +33,293 @@ // Interval template class and basic operations // Interface inspired from the Boost Interval library (www.boost.org) -template -class interval_gpu { - public: - __device__ __host__ interval_gpu(); - __device__ __host__ interval_gpu(T const &v); - __device__ __host__ interval_gpu(T const &l, T const &u); +template class interval_gpu +{ +public: + __device__ __host__ interval_gpu(); + __device__ __host__ interval_gpu(T const &v); + __device__ __host__ interval_gpu(T const &l, T const &u); - __device__ __host__ T const &lower() const; - __device__ __host__ T const &upper() const; + __device__ __host__ T const &lower() const; + __device__ __host__ T const &upper() const; - static __device__ __host__ interval_gpu empty(); + static __device__ __host__ interval_gpu empty(); - private: - T low; - T up; +private: + T low; + T up; }; // Constructors -template -inline __device__ __host__ interval_gpu::interval_gpu() {} +template inline __device__ __host__ interval_gpu::interval_gpu() {} template inline __device__ __host__ interval_gpu::interval_gpu(T const &l, T const &u) - : low(l), up(u) {} + : low(l) + , up(u) +{ +} template inline __device__ __host__ interval_gpu::interval_gpu(T const &v) - : low(v), up(v) {} - -template -inline __device__ __host__ T const &interval_gpu::lower() const { - return low; + : low(v) + , up(v) +{ } -template -inline __device__ __host__ T const &interval_gpu::upper() const { - return up; +template inline __device__ __host__ T const &interval_gpu::lower() const { return low; } + +template inline __device__ __host__ T const &interval_gpu::upper() const { return up; } + +template inline __device__ __host__ interval_gpu interval_gpu::empty() +{ + rounded_arith rnd; + return interval_gpu(rnd.nan(), rnd.nan()); } -template -inline __device__ __host__ interval_gpu interval_gpu::empty() { - rounded_arith rnd; - return interval_gpu(rnd.nan(), rnd.nan()); +template inline __device__ __host__ bool empty(interval_gpu x) +{ + T hash = x.lower() + x.upper(); + return (hash != hash); } -template -inline __device__ __host__ bool empty(interval_gpu x) { - T hash = x.lower() + x.upper(); - return (hash != hash); -} +template inline __device__ T width(interval_gpu x) +{ + if (empty(x)) + return 0; -template -inline __device__ T width(interval_gpu x) { - if (empty(x)) return 0; - - rounded_arith rnd; - return rnd.sub_up(x.upper(), x.lower()); + rounded_arith rnd; + return rnd.sub_up(x.upper(), x.lower()); } // Arithmetic operations // Unary operators -template -inline __device__ interval_gpu const &operator+(interval_gpu const &x) { - return x; -} +template inline __device__ interval_gpu const &operator+(interval_gpu const &x) { return x; } -template 
-inline __device__ interval_gpu operator-(interval_gpu const &x) { - return interval_gpu(-x.upper(), -x.lower()); +template inline __device__ interval_gpu operator-(interval_gpu const &x) +{ + return interval_gpu(-x.upper(), -x.lower()); } // Binary operators -template -inline __device__ interval_gpu operator+(interval_gpu const &x, - interval_gpu const &y) { - rounded_arith rnd; - return interval_gpu(rnd.add_down(x.lower(), y.lower()), - rnd.add_up(x.upper(), y.upper())); +template inline __device__ interval_gpu operator+(interval_gpu const &x, interval_gpu const &y) +{ + rounded_arith rnd; + return interval_gpu(rnd.add_down(x.lower(), y.lower()), rnd.add_up(x.upper(), y.upper())); } -template -inline __device__ interval_gpu operator-(interval_gpu const &x, - interval_gpu const &y) { - rounded_arith rnd; - return interval_gpu(rnd.sub_down(x.lower(), y.upper()), - rnd.sub_up(x.upper(), y.lower())); +template inline __device__ interval_gpu operator-(interval_gpu const &x, interval_gpu const &y) +{ + rounded_arith rnd; + return interval_gpu(rnd.sub_down(x.lower(), y.upper()), rnd.sub_up(x.upper(), y.lower())); } -inline __device__ float min4(float a, float b, float c, float d) { - return fminf(fminf(a, b), fminf(c, d)); -} +inline __device__ float min4(float a, float b, float c, float d) { return fminf(fminf(a, b), fminf(c, d)); } -inline __device__ float max4(float a, float b, float c, float d) { - return fmaxf(fmaxf(a, b), fmaxf(c, d)); -} +inline __device__ float max4(float a, float b, float c, float d) { return fmaxf(fmaxf(a, b), fmaxf(c, d)); } -inline __device__ double min4(double a, double b, double c, double d) { - return fmin(fmin(a, b), fmin(c, d)); -} +inline __device__ double min4(double a, double b, double c, double d) { return fmin(fmin(a, b), fmin(c, d)); } -inline __device__ double max4(double a, double b, double c, double d) { - return fmax(fmax(a, b), fmax(c, d)); -} +inline __device__ double max4(double a, double b, double c, double d) { return fmax(fmax(a, b), fmax(c, d)); } -template -inline __device__ interval_gpu operator*(interval_gpu const &x, - interval_gpu const &y) { - // Textbook implementation: 14 flops, but no branch. - rounded_arith rnd; - return interval_gpu( - min4(rnd.mul_down(x.lower(), y.lower()), - rnd.mul_down(x.lower(), y.upper()), - rnd.mul_down(x.upper(), y.lower()), - rnd.mul_down(x.upper(), y.upper())), - max4(rnd.mul_up(x.lower(), y.lower()), rnd.mul_up(x.lower(), y.upper()), - rnd.mul_up(x.upper(), y.lower()), rnd.mul_up(x.upper(), y.upper()))); +template inline __device__ interval_gpu operator*(interval_gpu const &x, interval_gpu const &y) +{ + // Textbook implementation: 14 flops, but no branch. 
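// [xl, xu] * [yl, yu] = [min of the four endpoint products rounded down,
//                        max of the four endpoint products rounded up].
// Example: [-1, 2] * [3, 4] -> endpoint products {-3, -4, 6, 8} -> [-4, 8].
// That is 8 directed-rounding multiplies plus 3 + 3 min/max reductions (the
// 14 flops quoted above), with no data-dependent branching -- a better fit
// for SIMT execution than the classical sign-case implementation.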
+ rounded_arith rnd; + return interval_gpu(min4(rnd.mul_down(x.lower(), y.lower()), + rnd.mul_down(x.lower(), y.upper()), + rnd.mul_down(x.upper(), y.lower()), + rnd.mul_down(x.upper(), y.upper())), + max4(rnd.mul_up(x.lower(), y.lower()), + rnd.mul_up(x.lower(), y.upper()), + rnd.mul_up(x.upper(), y.lower()), + rnd.mul_up(x.upper(), y.upper()))); } // Center of an interval // Typically used for bisection -template -inline __device__ T median(interval_gpu const &x) { - rounded_arith rnd; - return rnd.median(x.lower(), x.upper()); +template inline __device__ T median(interval_gpu const &x) +{ + rounded_arith rnd; + return rnd.median(x.lower(), x.upper()); } // Intersection between two intervals (can be empty) -template -inline __device__ interval_gpu intersect(interval_gpu const &x, - interval_gpu const &y) { - rounded_arith rnd; - T const &l = rnd.max(x.lower(), y.lower()); - T const &u = rnd.min(x.upper(), y.upper()); +template inline __device__ interval_gpu intersect(interval_gpu const &x, interval_gpu const &y) +{ + rounded_arith rnd; + T const &l = rnd.max(x.lower(), y.lower()); + T const &u = rnd.min(x.upper(), y.upper()); - if (l <= u) - return interval_gpu(l, u); - else - return interval_gpu::empty(); + if (l <= u) + return interval_gpu(l, u); + else + return interval_gpu::empty(); } // Division by an interval which does not contain 0. // GPU-optimized implementation assuming division is expensive -template -inline __device__ interval_gpu div_non_zero(interval_gpu const &x, - interval_gpu const &y) { - rounded_arith rnd; - typedef interval_gpu I; - T xl, yl, xu, yu; +template inline __device__ interval_gpu div_non_zero(interval_gpu const &x, interval_gpu const &y) +{ + rounded_arith rnd; + typedef interval_gpu I; + T xl, yl, xu, yu; - if (y.upper() < 0) { - xl = x.upper(); - xu = x.lower(); - } else { - xl = x.lower(); - xu = x.upper(); - } - - if (x.upper() < 0) { - yl = y.lower(); - yu = y.upper(); - } else if (x.lower() < 0) { if (y.upper() < 0) { - yl = y.upper(); - yu = y.upper(); - } else { - yl = y.lower(); - yu = y.lower(); + xl = x.upper(); + xu = x.lower(); + } + else { + xl = x.lower(); + xu = x.upper(); } - } else { - yl = y.upper(); - yu = y.lower(); - } - return I(rnd.div_down(xl, yl), rnd.div_up(xu, yu)); + if (x.upper() < 0) { + yl = y.lower(); + yu = y.upper(); + } + else if (x.lower() < 0) { + if (y.upper() < 0) { + yl = y.upper(); + yu = y.upper(); + } + else { + yl = y.lower(); + yu = y.lower(); + } + } + else { + yl = y.upper(); + yu = y.lower(); + } + + return I(rnd.div_down(xl, yl), rnd.div_up(xu, yu)); } -template -inline __device__ interval_gpu div_positive(interval_gpu const &x, - T const &yu) { - // assert(yu > 0); - if (x.lower() == 0 && x.upper() == 0) return x; +template inline __device__ interval_gpu div_positive(interval_gpu const &x, T const &yu) +{ + // assert(yu > 0); + if (x.lower() == 0 && x.upper() == 0) + return x; - rounded_arith rnd; - typedef interval_gpu I; - const T &xl = x.lower(); - const T &xu = x.upper(); + rounded_arith rnd; + typedef interval_gpu I; + const T &xl = x.lower(); + const T &xu = x.upper(); - if (xu < 0) - return I(rnd.neg_inf(), rnd.div_up(xu, yu)); - else if (xl < 0) - return I(rnd.neg_inf(), rnd.pos_inf()); - else - return I(rnd.div_down(xl, yu), rnd.pos_inf()); -} - -template -inline __device__ interval_gpu div_negative(interval_gpu const &x, - T const &yl) { - // assert(yu > 0); - if (x.lower() == 0 && x.upper() == 0) return x; - - rounded_arith rnd; - typedef interval_gpu I; - const T &xl = x.lower(); - const T &xu 
= x.upper(); - - if (xu < 0) - return I(rnd.div_down(xu, yl), rnd.pos_inf()); - else if (xl < 0) - return I(rnd.neg_inf(), rnd.pos_inf()); - else - return I(rnd.neg_inf(), rnd.div_up(xl, yl)); -} - -template -inline __device__ interval_gpu div_zero_part1(interval_gpu const &x, - interval_gpu const &y, - bool &b) { - if (x.lower() == 0 && x.upper() == 0) { - b = false; - return x; - } - - rounded_arith rnd; - typedef interval_gpu I; - const T &xl = x.lower(); - const T &xu = x.upper(); - const T &yl = y.lower(); - const T &yu = y.upper(); - - if (xu < 0) { - b = true; - return I(rnd.neg_inf(), rnd.div_up(xu, yu)); - } else if (xl < 0) { - b = false; - return I(rnd.neg_inf(), rnd.pos_inf()); - } else { - b = true; - return I(rnd.neg_inf(), rnd.div_up(xl, yl)); - } -} - -template -inline __device__ interval_gpu div_zero_part2(interval_gpu const &x, - interval_gpu const &y) { - rounded_arith rnd; - typedef interval_gpu I; - const T &xl = x.lower(); - const T &xu = x.upper(); - const T &yl = y.lower(); - const T &yu = y.upper(); - - if (xu < 0) - return I(rnd.div_down(xu, yl), rnd.pos_inf()); - else - return I(rnd.div_down(xl, yu), rnd.pos_inf()); -} - -template -inline __device__ interval_gpu division_part1(interval_gpu const &x, - interval_gpu const &y, - bool &b) { - b = false; - - if (y.lower() <= 0 && y.upper() >= 0) - if (y.lower() != 0) - if (y.upper() != 0) - return div_zero_part1(x, y, b); - else - return div_negative(x, y.lower()); - else if (y.upper() != 0) - return div_positive(x, y.upper()); + if (xu < 0) + return I(rnd.neg_inf(), rnd.div_up(xu, yu)); + else if (xl < 0) + return I(rnd.neg_inf(), rnd.pos_inf()); else - return interval_gpu::empty(); - else - return div_non_zero(x, y); + return I(rnd.div_down(xl, yu), rnd.pos_inf()); +} + +template inline __device__ interval_gpu div_negative(interval_gpu const &x, T const &yl) +{ + // assert(yu > 0); + if (x.lower() == 0 && x.upper() == 0) + return x; + + rounded_arith rnd; + typedef interval_gpu I; + const T &xl = x.lower(); + const T &xu = x.upper(); + + if (xu < 0) + return I(rnd.div_down(xu, yl), rnd.pos_inf()); + else if (xl < 0) + return I(rnd.neg_inf(), rnd.pos_inf()); + else + return I(rnd.neg_inf(), rnd.div_up(xl, yl)); } template -inline __device__ interval_gpu division_part2(interval_gpu const &x, - interval_gpu const &y, - bool b = true) { - if (!b) return interval_gpu::empty(); +inline __device__ interval_gpu div_zero_part1(interval_gpu const &x, interval_gpu const &y, bool &b) +{ + if (x.lower() == 0 && x.upper() == 0) { + b = false; + return x; + } - return div_zero_part2(x, y); + rounded_arith rnd; + typedef interval_gpu I; + const T &xl = x.lower(); + const T &xu = x.upper(); + const T &yl = y.lower(); + const T &yu = y.upper(); + + if (xu < 0) { + b = true; + return I(rnd.neg_inf(), rnd.div_up(xu, yu)); + } + else if (xl < 0) { + b = false; + return I(rnd.neg_inf(), rnd.pos_inf()); + } + else { + b = true; + return I(rnd.neg_inf(), rnd.div_up(xl, yl)); + } +} + +template inline __device__ interval_gpu div_zero_part2(interval_gpu const &x, interval_gpu const &y) +{ + rounded_arith rnd; + typedef interval_gpu I; + const T &xl = x.lower(); + const T &xu = x.upper(); + const T &yl = y.lower(); + const T &yu = y.upper(); + + if (xu < 0) + return I(rnd.div_down(xu, yl), rnd.pos_inf()); + else + return I(rnd.div_down(xl, yu), rnd.pos_inf()); } template -inline __device__ interval_gpu square(interval_gpu const &x) { - typedef interval_gpu I; - rounded_arith rnd; - const T &xl = x.lower(); - const T &xu = x.upper(); +inline 
__device__ interval_gpu division_part1(interval_gpu const &x, interval_gpu const &y, bool &b) +{ + b = false; - if (xl >= 0) - return I(rnd.mul_down(xl, xl), rnd.mul_up(xu, xu)); - else if (xu <= 0) - return I(rnd.mul_down(xu, xu), rnd.mul_up(xl, xl)); - else - return I(static_cast(0), - rnd.max(rnd.mul_up(xl, xl), rnd.mul_up(xu, xu))); + if (y.lower() <= 0 && y.upper() >= 0) + if (y.lower() != 0) + if (y.upper() != 0) + return div_zero_part1(x, y, b); + else + return div_negative(x, y.lower()); + else if (y.upper() != 0) + return div_positive(x, y.upper()); + else + return interval_gpu::empty(); + else + return div_non_zero(x, y); +} + +template +inline __device__ interval_gpu division_part2(interval_gpu const &x, interval_gpu const &y, bool b = true) +{ + if (!b) + return interval_gpu::empty(); + + return div_zero_part2(x, y); +} + +template inline __device__ interval_gpu square(interval_gpu const &x) +{ + typedef interval_gpu I; + rounded_arith rnd; + const T &xl = x.lower(); + const T &xu = x.upper(); + + if (xl >= 0) + return I(rnd.mul_down(xl, xl), rnd.mul_up(xu, xu)); + else if (xu <= 0) + return I(rnd.mul_down(xu, xu), rnd.mul_up(xl, xl)); + else + return I(static_cast(0), rnd.max(rnd.mul_up(xl, xl), rnd.mul_up(xu, xu))); } #endif diff --git a/Samples/2_Concepts_and_Techniques/interval/cuda_interval_rounded_arith.h b/Samples/2_Concepts_and_Techniques/interval/cuda_interval_rounded_arith.h index c6e0b1f4..0ee911a1 100644 --- a/Samples/2_Concepts_and_Techniques/interval/cuda_interval_rounded_arith.h +++ b/Samples/2_Concepts_and_Techniques/interval/cuda_interval_rounded_arith.h @@ -32,145 +32,105 @@ #define CUDA_INTERVAL_ROUNDED_ARITH_H // Generic class, no actual implementation yet -template -struct rounded_arith { - __device__ T add_down(const T &x, const T &y); - __device__ T add_up(const T &x, const T &y); - __device__ T sub_down(const T &x, const T &y); - __device__ T sub_up(const T &x, const T &y); - __device__ T mul_down(const T &x, const T &y); - __device__ T mul_up(const T &x, const T &y); - __device__ T div_down(const T &x, const T &y); - __device__ T div_up(const T &x, const T &y); - __device__ T median(const T &x, const T &y); - __device__ T sqrt_down(const T &x); - __device__ T sqrt_up(const T &x); - __device__ T int_down(const T &x); - __device__ T int_up(const T &x); +template struct rounded_arith +{ + __device__ T add_down(const T &x, const T &y); + __device__ T add_up(const T &x, const T &y); + __device__ T sub_down(const T &x, const T &y); + __device__ T sub_up(const T &x, const T &y); + __device__ T mul_down(const T &x, const T &y); + __device__ T mul_up(const T &x, const T &y); + __device__ T div_down(const T &x, const T &y); + __device__ T div_up(const T &x, const T &y); + __device__ T median(const T &x, const T &y); + __device__ T sqrt_down(const T &x); + __device__ T sqrt_up(const T &x); + __device__ T int_down(const T &x); + __device__ T int_up(const T &x); - __device__ T pos_inf(); - __device__ T neg_inf(); - __device__ __host__ T nan(); - __device__ T min(T const &x, T const &y); - __device__ T max(T const &x, T const &y); + __device__ T pos_inf(); + __device__ T neg_inf(); + __device__ __host__ T nan(); + __device__ T min(T const &x, T const &y); + __device__ T max(T const &x, T const &y); }; // Specialization for float -template <> -struct rounded_arith { - __device__ float add_down(const float &x, const float &y) { - return __fadd_rd(x, y); - } +template <> struct rounded_arith +{ + __device__ float add_down(const float &x, const float &y) { return 
__fadd_rd(x, y); } - __device__ float add_up(const float &x, const float &y) { - return __fadd_ru(x, y); - } + __device__ float add_up(const float &x, const float &y) { return __fadd_ru(x, y); } - __device__ float sub_down(const float &x, const float &y) { - return __fadd_rd(x, -y); - } + __device__ float sub_down(const float &x, const float &y) { return __fadd_rd(x, -y); } - __device__ float sub_up(const float &x, const float &y) { - return __fadd_ru(x, -y); - } + __device__ float sub_up(const float &x, const float &y) { return __fadd_ru(x, -y); } - __device__ float mul_down(const float &x, const float &y) { - return __fmul_rd(x, y); - } + __device__ float mul_down(const float &x, const float &y) { return __fmul_rd(x, y); } - __device__ float mul_up(const float &x, const float &y) { - return __fmul_ru(x, y); - } + __device__ float mul_up(const float &x, const float &y) { return __fmul_ru(x, y); } - __device__ float div_down(const float &x, const float &y) { - return __fdiv_rd(x, y); - } + __device__ float div_down(const float &x, const float &y) { return __fdiv_rd(x, y); } - __device__ float div_up(const float &x, const float &y) { - return __fdiv_ru(x, y); - } + __device__ float div_up(const float &x, const float &y) { return __fdiv_ru(x, y); } - __device__ float median(const float &x, const float &y) { - return (x + y) * .5f; - } + __device__ float median(const float &x, const float &y) { return (x + y) * .5f; } - __device__ float sqrt_down(const float &x) { return __fsqrt_rd(x); } + __device__ float sqrt_down(const float &x) { return __fsqrt_rd(x); } - __device__ float sqrt_up(const float &x) { return __fsqrt_ru(x); } + __device__ float sqrt_up(const float &x) { return __fsqrt_ru(x); } - __device__ float int_down(const float &x) { return floorf(x); } + __device__ float int_down(const float &x) { return floorf(x); } - __device__ float int_up(const float &x) { return ceilf(x); } + __device__ float int_up(const float &x) { return ceilf(x); } - __device__ float neg_inf() { return __int_as_float(0xff800000); } + __device__ float neg_inf() { return __int_as_float(0xff800000); } - __device__ float pos_inf() { return __int_as_float(0x7f800000); } + __device__ float pos_inf() { return __int_as_float(0x7f800000); } - __device__ __host__ float nan() { return nanf(""); } + __device__ __host__ float nan() { return nanf(""); } - __device__ float min(float const &x, float const &y) { return fminf(x, y); } + __device__ float min(float const &x, float const &y) { return fminf(x, y); } - __device__ float max(float const &x, float const &y) { return fmaxf(x, y); } + __device__ float max(float const &x, float const &y) { return fmaxf(x, y); } }; // Specialization for double -template <> -struct rounded_arith { - __device__ double add_down(const double &x, const double &y) { - return __dadd_rd(x, y); - } +template <> struct rounded_arith +{ + __device__ double add_down(const double &x, const double &y) { return __dadd_rd(x, y); } - __device__ double add_up(const double &x, const double &y) { - return __dadd_ru(x, y); - } + __device__ double add_up(const double &x, const double &y) { return __dadd_ru(x, y); } - __device__ double sub_down(const double &x, const double &y) { - return __dadd_rd(x, -y); - } + __device__ double sub_down(const double &x, const double &y) { return __dadd_rd(x, -y); } - __device__ double sub_up(const double &x, const double &y) { - return __dadd_ru(x, -y); - } + __device__ double sub_up(const double &x, const double &y) { return __dadd_ru(x, -y); } - __device__ double 
mul_down(const double &x, const double &y) {
-    return __dmul_rd(x, y);
-  }
+    __device__ double mul_down(const double &x, const double &y) { return __dmul_rd(x, y); }
-  __device__ double mul_up(const double &x, const double &y) {
-    return __dmul_ru(x, y);
-  }
+    __device__ double mul_up(const double &x, const double &y) { return __dmul_ru(x, y); }
-  __device__ double div_down(const double &x, const double &y) {
-    return __ddiv_rd(x, y);
-  }
+    __device__ double div_down(const double &x, const double &y) { return __ddiv_rd(x, y); }
-  __device__ double div_up(const double &x, const double &y) {
-    return __ddiv_ru(x, y);
-  }
-  __device__ double median(const double &x, const double &y) {
-    return (x + y) * .5;
-  }
+    __device__ double div_up(const double &x, const double &y) { return __ddiv_ru(x, y); }
+    __device__ double median(const double &x, const double &y) { return (x + y) * .5; }
-  __device__ double sqrt_down(const double &x) { return __dsqrt_rd(x); }
+    __device__ double sqrt_down(const double &x) { return __dsqrt_rd(x); }
-  __device__ double sqrt_up(const double &x) { return __dsqrt_ru(x); }
+    __device__ double sqrt_up(const double &x) { return __dsqrt_ru(x); }
-  __device__ double int_down(const double &x) { return floor(x); }
+    __device__ double int_down(const double &x) { return floor(x); }
-  __device__ double int_up(const double &x) { return ceil(x); }
+    __device__ double int_up(const double &x) { return ceil(x); }
-  __device__ double neg_inf() {
-    return __longlong_as_double(0xfff0000000000000ull);
-  }
+    __device__ double neg_inf() { return __longlong_as_double(0xfff0000000000000ull); }
-  __device__ double pos_inf() {
-    return __longlong_as_double(0x7ff0000000000000ull);
-  }
-  __device__ __host__ double nan() { return ::nan(""); }
+    __device__ double pos_inf() { return __longlong_as_double(0x7ff0000000000000ull); }
+    __device__ __host__ double nan() { return ::nan(""); }
-  __device__ double min(double const &x, double const &y) { return fmin(x, y); }
+    __device__ double min(double const &x, double const &y) { return fmin(x, y); }
-  __device__ double max(double const &x, double const &y) { return fmax(x, y); }
+    __device__ double max(double const &x, double const &y) { return fmax(x, y); }
 };

 #endif
diff --git a/Samples/2_Concepts_and_Techniques/interval/interval.cu b/Samples/2_Concepts_and_Techniques/interval/interval.cu
index 6800437f..24b80e5f 100644
--- a/Samples/2_Concepts_and_Techniques/interval/interval.cu
+++ b/Samples/2_Concepts_and_Techniques/interval/interval.cu
@@ -27,7 +27,7 @@
 /* Example of program using the interval_gpu<T> template class and operators:
  * Search for roots of a function using an interval Newton method.
- * 
+ *
  * Use the command-line argument "--n=" to select which GPU implementation to
  * use,
  * otherwise the naive implementation will be used by default.
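For readers new to interval Newton, here is a minimal, self-contained host-side
sketch of the same root-finding scheme in plain C++. It deliberately uses
ordinary doubles (no directed rounding) and falls back to bisection whenever
the derivative interval straddles zero, instead of the two-branch extended
division that division_part1/division_part2 perform in the CUDA code. The
Interval struct and the f/fprime functions are hypothetical stand-ins for the
sample's interval_gpu, f and fd; as in the GPU version, each reported interval
is only guaranteed to possibly contain a root, and a few adjacent enclosures of
the same root may appear.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Interval {
    double lo, hi;
    bool   empty() const { return !(lo <= hi); }
    double mid() const { return 0.5 * (lo + hi); }
    double width() const { return hi - lo; }
};

static Interval intersect(Interval a, Interval b)
{
    return {std::max(a.lo, b.lo), std::min(a.hi, b.hi)};
}

// Example function: f(x) = (x - 1)^2 - 0.25, with roots at 0.5 and 1.5.
static double   f(double x) { return (x - 1.0) * (x - 1.0) - 0.25; }
static Interval fprime(Interval x) { return {2.0 * (x.lo - 1.0), 2.0 * (x.hi - 1.0)}; }

int main()
{
    std::vector<Interval> work{{0.0, 4.0}}, roots;
    while (!work.empty()) {
        Interval x = work.back();
        work.pop_back();
        if (x.empty())
            continue;
        if (x.width() < 1e-9) { // "minimal" interval: accept as a root enclosure
            roots.push_back(x);
            continue;
        }
        double   m = x.mid();
        Interval d = fprime(x);
        if (d.lo <= 0.0 && d.hi >= 0.0) { // derivative straddles 0: bisect
            work.push_back({x.lo, m});
            work.push_back({m, x.hi});
            continue;
        }
        // Newton step N(X) = m - f(m)/F'(X), then keep only N(X) inter X.
        double   q = f(m);
        double   a = q / d.lo, b = q / d.hi;
        Interval n    = {m - std::max(a, b), m - std::min(a, b)};
        Interval next = intersect(n, x);
        if (!next.empty()) // an empty intersection proves X holds no root
            work.push_back(next);
    }
    for (const Interval &r : roots)
        printf("root in [%.12f, %.12f]\n", r.lo, r.hi);
    return 0;
}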
@@ -41,123 +41,112 @@ const static char *sSDKsample = "Interval Computing"; #include #include + +#include "cpu_interval.h" +#include "cuda_interval.h" #include "helper_cuda.h" #include "interval.h" -#include "cuda_interval.h" -#include "cpu_interval.h" -int main(int argc, char *argv[]) { - int implementation_choice = 0; +int main(int argc, char *argv[]) +{ + int implementation_choice = 0; - printf("[%s] starting ...\n\n", sSDKsample); + printf("[%s] starting ...\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "n")) { - implementation_choice = - getCmdLineArgumentInt(argc, (const char **)argv, "n"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + implementation_choice = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + } - // Pick the best GPU available, or if the developer selects one at the command - // line - int devID = findCudaDevice(argc, (const char **)argv); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, devID); - printf("> GPU Device has Compute Capabilities SM %d.%d\n\n", deviceProp.major, - deviceProp.minor); + // Pick the best GPU available, or if the developer selects one at the command + // line + int devID = findCudaDevice(argc, (const char **)argv); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, devID); + printf("> GPU Device has Compute Capabilities SM %d.%d\n\n", deviceProp.major, deviceProp.minor); - switch (implementation_choice) { + switch (implementation_choice) { case 0: - printf("GPU naive implementation\n"); - break; + printf("GPU naive implementation\n"); + break; case 1: - printf("GPU optimized implementation\n"); - break; + printf("GPU optimized implementation\n"); + break; case 2: - printf("GPU recursive implementation (requires Compute SM 2.0+)\n"); - break; + printf("GPU recursive implementation (requires Compute SM 2.0+)\n"); + break; default: - printf("GPU naive implementation\n"); - } + printf("GPU naive implementation\n"); + } - interval_gpu *d_result; - int *d_nresults; - int *h_nresults = new int[THREADS]; - cudaEvent_t start, stop; + interval_gpu *d_result; + int *d_nresults; + int *h_nresults = new int[THREADS]; + cudaEvent_t start, stop; - CHECKED_CALL(cudaSetDevice(devID)); - CHECKED_CALL(cudaMalloc((void **)&d_result, - THREADS * DEPTH_RESULT * sizeof(*d_result))); - CHECKED_CALL(cudaMalloc((void **)&d_nresults, THREADS * sizeof(*d_nresults))); - CHECKED_CALL(cudaEventCreate(&start)); - CHECKED_CALL(cudaEventCreate(&stop)); + CHECKED_CALL(cudaSetDevice(devID)); + CHECKED_CALL(cudaMalloc((void **)&d_result, THREADS * DEPTH_RESULT * sizeof(*d_result))); + CHECKED_CALL(cudaMalloc((void **)&d_nresults, THREADS * sizeof(*d_nresults))); + CHECKED_CALL(cudaEventCreate(&start)); + CHECKED_CALL(cudaEventCreate(&stop)); - // We need L1 cache to store the stack (only applicable to sm_20 and higher) - CHECKED_CALL( - cudaFuncSetCacheConfig(test_interval_newton, cudaFuncCachePreferL1)); + // We need L1 cache to store the stack (only applicable to sm_20 and higher) + CHECKED_CALL(cudaFuncSetCacheConfig(test_interval_newton, cudaFuncCachePreferL1)); - // Increase the stack size large enough for the non-inlined and recursive - // function calls (only applicable to sm_20 and higher) - CHECKED_CALL(cudaDeviceSetLimit(cudaLimitStackSize, 8192)); + // Increase the stack size large enough for the non-inlined and recursive + // function calls (only applicable to sm_20 and higher) + CHECKED_CALL(cudaDeviceSetLimit(cudaLimitStackSize, 8192)); - interval_gpu i(0.01f, 4.0f); - std::cout << 
"Searching for roots in [" << i.lower() << ", " << i.upper() - << "]...\n"; + interval_gpu i(0.01f, 4.0f); + std::cout << "Searching for roots in [" << i.lower() << ", " << i.upper() << "]...\n"; - CHECKED_CALL(cudaEventRecord(start, 0)); + CHECKED_CALL(cudaEventRecord(start, 0)); - for (int it = 0; it < NUM_RUNS; ++it) { - test_interval_newton<<>>(d_result, d_nresults, i, - implementation_choice); - CHECKED_CALL(cudaGetLastError()); - } + for (int it = 0; it < NUM_RUNS; ++it) { + test_interval_newton<<>>(d_result, d_nresults, i, implementation_choice); + CHECKED_CALL(cudaGetLastError()); + } - CHECKED_CALL(cudaEventRecord(stop, 0)); - CHECKED_CALL(cudaDeviceSynchronize()); + CHECKED_CALL(cudaEventRecord(stop, 0)); + CHECKED_CALL(cudaDeviceSynchronize()); - I_CPU *h_result = new I_CPU[THREADS * DEPTH_RESULT]; - CHECKED_CALL(cudaMemcpy(h_result, d_result, - THREADS * DEPTH_RESULT * sizeof(*d_result), - cudaMemcpyDeviceToHost)); - CHECKED_CALL(cudaMemcpy(h_nresults, d_nresults, THREADS * sizeof(*d_nresults), - cudaMemcpyDeviceToHost)); + I_CPU *h_result = new I_CPU[THREADS * DEPTH_RESULT]; + CHECKED_CALL(cudaMemcpy(h_result, d_result, THREADS * DEPTH_RESULT * sizeof(*d_result), cudaMemcpyDeviceToHost)); + CHECKED_CALL(cudaMemcpy(h_nresults, d_nresults, THREADS * sizeof(*d_nresults), cudaMemcpyDeviceToHost)); - std::cout << "Found " << h_nresults[0] - << " intervals that may contain the root(s)\n"; - std::cout.precision(15); + std::cout << "Found " << h_nresults[0] << " intervals that may contain the root(s)\n"; + std::cout.precision(15); - for (int i = 0; i != h_nresults[0]; ++i) { - std::cout << " i[" << i << "] =" - << " [" << h_result[THREADS * i + 0].lower() << ", " - << h_result[THREADS * i + 0].upper() << "]\n"; - } + for (int i = 0; i != h_nresults[0]; ++i) { + std::cout << " i[" << i << "] =" + << " [" << h_result[THREADS * i + 0].lower() << ", " << h_result[THREADS * i + 0].upper() << "]\n"; + } - float time; - CHECKED_CALL(cudaEventElapsedTime(&time, start, stop)); - std::cout << "Number of equations solved: " << THREADS << "\n"; - std::cout << "Time per equation: " - << 1000000.0f * (time / (float)(THREADS)) / NUM_RUNS << " us\n"; + float time; + CHECKED_CALL(cudaEventElapsedTime(&time, start, stop)); + std::cout << "Number of equations solved: " << THREADS << "\n"; + std::cout << "Time per equation: " << 1000000.0f * (time / (float)(THREADS)) / NUM_RUNS << " us\n"; - CHECKED_CALL(cudaEventDestroy(start)); - CHECKED_CALL(cudaEventDestroy(stop)); - CHECKED_CALL(cudaFree(d_result)); - CHECKED_CALL(cudaFree(d_nresults)); + CHECKED_CALL(cudaEventDestroy(start)); + CHECKED_CALL(cudaEventDestroy(stop)); + CHECKED_CALL(cudaFree(d_result)); + CHECKED_CALL(cudaFree(d_nresults)); - // Compute the results using a CPU implementation based on the Boost library - I_CPU i_cpu(0.01f, 4.0f); - I_CPU *h_result_cpu = new I_CPU[THREADS * DEPTH_RESULT]; - int *h_nresults_cpu = new int[THREADS]; - test_interval_newton_cpu(h_result_cpu, h_nresults_cpu, i_cpu); + // Compute the results using a CPU implementation based on the Boost library + I_CPU i_cpu(0.01f, 4.0f); + I_CPU *h_result_cpu = new I_CPU[THREADS * DEPTH_RESULT]; + int *h_nresults_cpu = new int[THREADS]; + test_interval_newton_cpu(h_result_cpu, h_nresults_cpu, i_cpu); - // Compare the CPU and GPU results - bool bTestResult = - checkAgainstHost(h_nresults, h_nresults_cpu, h_result, h_result_cpu); + // Compare the CPU and GPU results + bool bTestResult = checkAgainstHost(h_nresults, h_nresults_cpu, h_result, h_result_cpu); - delete[] 
h_result_cpu; - delete[] h_nresults_cpu; - delete[] h_result; - delete[] h_nresults; + delete[] h_result_cpu; + delete[] h_nresults_cpu; + delete[] h_result; + delete[] h_nresults; - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/2_Concepts_and_Techniques/interval/interval.h b/Samples/2_Concepts_and_Techniques/interval/interval.h index bd9ba563..e8656e7c 100644 --- a/Samples/2_Concepts_and_Techniques/interval/interval.h +++ b/Samples/2_Concepts_and_Techniques/interval/interval.h @@ -28,25 +28,25 @@ #ifndef INTERVAL_H #define INTERVAL_H -#define DEVICE 0 -#define TYPE double +#define DEVICE 0 +#define TYPE double #define NUM_RUNS (100) typedef TYPE T; -int const BLOCK_SIZE = 64; -int const GRID_SIZE = 1024; -int const THREADS = GRID_SIZE * BLOCK_SIZE; +int const BLOCK_SIZE = 64; +int const GRID_SIZE = 1024; +int const THREADS = GRID_SIZE * BLOCK_SIZE; int const DEPTH_RESULT = 128; -#define CHECKED_CALL(func) \ - do { \ - cudaError_t err = (func); \ - if (err != cudaSuccess) { \ - printf("%s(%d): ERROR: %s returned %s (err#%d)\n", __FILE__, __LINE__, \ - #func, cudaGetErrorString(err), err); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define CHECKED_CALL(func) \ + do { \ + cudaError_t err = (func); \ + if (err != cudaSuccess) { \ + printf( \ + "%s(%d): ERROR: %s returned %s (err#%d)\n", __FILE__, __LINE__, #func, cudaGetErrorString(err), err); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) #endif diff --git a/Samples/2_Concepts_and_Techniques/particles/particleSystem.cpp b/Samples/2_Concepts_and_Techniques/particles/particleSystem.cpp index d7aa2e5c..6d88ab35 100644 --- a/Samples/2_Concepts_and_Techniques/particles/particleSystem.cpp +++ b/Samples/2_Concepts_and_Techniques/particles/particleSystem.cpp @@ -11,66 +11,65 @@ // OpenGL Graphics includes #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION -#include - #include "particleSystem.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "particleSystem.cuh" #include "particles_kernel.cuh" -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - #ifndef CUDART_PI_F -#define CUDART_PI_F 3.141592654f +#define CUDART_PI_F 3.141592654f #endif -ParticleSystem::ParticleSystem(uint numParticles, uint3 gridSize, bool bUseOpenGL) : - m_bInitialized(false), - m_bUseOpenGL(bUseOpenGL), - m_numParticles(numParticles), - m_hPos(0), - m_hVel(0), - m_dPos(0), - m_dVel(0), - m_gridSize(gridSize), - m_timer(NULL), - m_solverIterations(1) +ParticleSystem::ParticleSystem(uint numParticles, uint3 gridSize, bool bUseOpenGL) + : m_bInitialized(false) + , m_bUseOpenGL(bUseOpenGL) + , m_numParticles(numParticles) + , m_hPos(0) + , m_hVel(0) + , m_dPos(0) + , m_dVel(0) + , m_gridSize(gridSize) + , m_timer(NULL) + , m_solverIterations(1) { - m_numGridCells = m_gridSize.x*m_gridSize.y*m_gridSize.z; + m_numGridCells = m_gridSize.x * m_gridSize.y * m_gridSize.z; // float3 worldSize = make_float3(2.0f, 2.0f, 2.0f); - m_gridSortBits = 18; // increase this for larger grids + m_gridSortBits = 18; // increase this for larger grids // set simulation parameters - m_params.gridSize = m_gridSize; - m_params.numCells = m_numGridCells; + m_params.gridSize = m_gridSize; + m_params.numCells = m_numGridCells; m_params.numBodies = m_numParticles; m_params.particleRadius = 1.0f / 64.0f; - m_params.colliderPos = make_float3(-1.2f, -0.8f, 0.8f); + m_params.colliderPos = make_float3(-1.2f, -0.8f, 0.8f); 
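// colliderPos / colliderRadius (just below) describe a single spherical
// collider that the demo repositions at runtime via setColliderPos(); the
// collision kernels test every particle against this sphere in addition to
// particle-particle contacts.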
m_params.colliderRadius = 0.2f; m_params.worldOrigin = make_float3(-1.0f, -1.0f, -1.0f); - // m_params.cellSize = make_float3(worldSize.x / m_gridSize.x, worldSize.y / m_gridSize.y, worldSize.z / m_gridSize.z); - float cellSize = m_params.particleRadius * 2.0f; // cell size equal to particle diameter + // m_params.cellSize = make_float3(worldSize.x / m_gridSize.x, worldSize.y / m_gridSize.y, worldSize.z / + // m_gridSize.z); + float cellSize = m_params.particleRadius * 2.0f; // cell size equal to particle diameter m_params.cellSize = make_float3(cellSize, cellSize, cellSize); - m_params.spring = 0.5f; - m_params.damping = 0.02f; - m_params.shear = 0.1f; - m_params.attraction = 0.0f; + m_params.spring = 0.5f; + m_params.damping = 0.02f; + m_params.shear = 0.1f; + m_params.attraction = 0.0f; m_params.boundaryDamping = -0.5f; - m_params.gravity = make_float3(0.0f, -0.0003f, 0.0f); + m_params.gravity = make_float3(0.0f, -0.0003f, 0.0f); m_params.globalDamping = 1.0f; _initialize(numParticles); @@ -82,8 +81,7 @@ ParticleSystem::~ParticleSystem() m_numParticles = 0; } -uint -ParticleSystem::createVBO(uint size) +uint ParticleSystem::createVBO(uint size) { GLuint vbo; glGenBuffers(1, &vbo); @@ -93,63 +91,84 @@ ParticleSystem::createVBO(uint size) return vbo; } -inline float lerp(float a, float b, float t) -{ - return a + t*(b-a); -} +inline float lerp(float a, float b, float t) { return a + t * (b - a); } // create a color ramp void colorRamp(float t, float *r) { - const int ncolors = 7; - float c[ncolors][3] = - { - { 1.0, 0.0, 0.0, }, - { 1.0, 0.5, 0.0, }, - { 1.0, 1.0, 0.0, }, - { 0.0, 1.0, 0.0, }, - { 0.0, 1.0, 1.0, }, - { 0.0, 0.0, 1.0, }, - { 1.0, 0.0, 1.0, }, + const int ncolors = 7; + float c[ncolors][3] = { + { + 1.0, + 0.0, + 0.0, + }, + { + 1.0, + 0.5, + 0.0, + }, + { + 1.0, + 1.0, + 0.0, + }, + { + 0.0, + 1.0, + 0.0, + }, + { + 0.0, + 1.0, + 1.0, + }, + { + 0.0, + 0.0, + 1.0, + }, + { + 1.0, + 0.0, + 1.0, + }, }; - t = t * (ncolors-1); - int i = (int) t; + t = t * (ncolors - 1); + int i = (int)t; float u = t - floorf(t); - r[0] = lerp(c[i][0], c[i+1][0], u); - r[1] = lerp(c[i][1], c[i+1][1], u); - r[2] = lerp(c[i][2], c[i+1][2], u); + r[0] = lerp(c[i][0], c[i + 1][0], u); + r[1] = lerp(c[i][1], c[i + 1][1], u); + r[2] = lerp(c[i][2], c[i + 1][2], u); } -void -ParticleSystem::_initialize(int numParticles) +void ParticleSystem::_initialize(int numParticles) { assert(!m_bInitialized); m_numParticles = numParticles; // allocate host storage - m_hPos = new float[m_numParticles*4]; - m_hVel = new float[m_numParticles*4]; - memset(m_hPos, 0, m_numParticles*4*sizeof(float)); - memset(m_hVel, 0, m_numParticles*4*sizeof(float)); + m_hPos = new float[m_numParticles * 4]; + m_hVel = new float[m_numParticles * 4]; + memset(m_hPos, 0, m_numParticles * 4 * sizeof(float)); + memset(m_hVel, 0, m_numParticles * 4 * sizeof(float)); m_hCellStart = new uint[m_numGridCells]; - memset(m_hCellStart, 0, m_numGridCells*sizeof(uint)); + memset(m_hCellStart, 0, m_numGridCells * sizeof(uint)); m_hCellEnd = new uint[m_numGridCells]; - memset(m_hCellEnd, 0, m_numGridCells*sizeof(uint)); + memset(m_hCellEnd, 0, m_numGridCells * sizeof(uint)); // allocate GPU data unsigned int memSize = sizeof(float) * 4 * m_numParticles; - if (m_bUseOpenGL) - { + if (m_bUseOpenGL) { m_posVbo = createVBO(memSize); registerGLBufferObject(m_posVbo, &m_cuda_posvbo_resource); } - else - { - checkCudaErrors(cudaMalloc((void **)&m_cudaPosVBO, memSize)) ; + else { + checkCudaErrors(cudaMalloc((void **)&m_cudaPosVBO, memSize)); } 
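// With OpenGL enabled, particle positions live in a GL VBO registered with
// CUDA; without a GL context the same storage is a plain cudaMalloc'd buffer
// (m_cudaPosVBO), so the solver can run headless. Whenever a kernel needs the
// positions, the usual interop pattern (using this sample's own helpers) is
// roughly:
//
//     float *dPos = (float *) mapGLBufferObject(&m_cuda_posvbo_resource);
//     // ... launch kernels that read/write dPos ...
//     unmapGLBufferObject(m_cuda_posvbo_resource);
//
// The buffer must be unmapped again before OpenGL renders from it.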
allocateArray((void **)&m_dVel, memSize); @@ -157,41 +176,38 @@ ParticleSystem::_initialize(int numParticles) allocateArray((void **)&m_dSortedPos, memSize); allocateArray((void **)&m_dSortedVel, memSize); - allocateArray((void **)&m_dGridParticleHash, m_numParticles*sizeof(uint)); - allocateArray((void **)&m_dGridParticleIndex, m_numParticles*sizeof(uint)); + allocateArray((void **)&m_dGridParticleHash, m_numParticles * sizeof(uint)); + allocateArray((void **)&m_dGridParticleIndex, m_numParticles * sizeof(uint)); - allocateArray((void **)&m_dCellStart, m_numGridCells*sizeof(uint)); - allocateArray((void **)&m_dCellEnd, m_numGridCells*sizeof(uint)); + allocateArray((void **)&m_dCellStart, m_numGridCells * sizeof(uint)); + allocateArray((void **)&m_dCellEnd, m_numGridCells * sizeof(uint)); - if (m_bUseOpenGL) - { - m_colorVBO = createVBO(m_numParticles*4*sizeof(float)); + if (m_bUseOpenGL) { + m_colorVBO = createVBO(m_numParticles * 4 * sizeof(float)); registerGLBufferObject(m_colorVBO, &m_cuda_colorvbo_resource); // fill color buffer glBindBuffer(GL_ARRAY_BUFFER, m_colorVBO); - float *data = (float *) glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - float *ptr = data; + float *data = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + float *ptr = data; - for (uint i=0; i maxCellSize) - { + if (cellSize > maxCellSize) { maxCellSize = cellSize; } } @@ -328,186 +323,166 @@ ParticleSystem::dumpGrid() printf("maximum particles per cell = %d\n", maxCellSize); } -void -ParticleSystem::dumpParticles(uint start, uint count) +void ParticleSystem::dumpParticles(uint start, uint count) { // debug - copyArrayFromDevice(m_hPos, 0, &m_cuda_posvbo_resource, sizeof(float)*4*count); - copyArrayFromDevice(m_hVel, m_dVel, 0, sizeof(float)*4*count); + copyArrayFromDevice(m_hPos, 0, &m_cuda_posvbo_resource, sizeof(float) * 4 * count); + copyArrayFromDevice(m_hVel, m_dVel, 0, sizeof(float) * 4 * count); - for (uint i=start; i + #include "particles_kernel.cuh" #include "vector_functions.h" // Particle system class -class ParticleSystem { - public: - ParticleSystem(uint numParticles, uint3 gridSize, bool bUseOpenGL); - ~ParticleSystem(); +class ParticleSystem +{ +public: + ParticleSystem(uint numParticles, uint3 gridSize, bool bUseOpenGL); + ~ParticleSystem(); - enum ParticleConfig { CONFIG_RANDOM, CONFIG_GRID, _NUM_CONFIGS }; + enum ParticleConfig { CONFIG_RANDOM, CONFIG_GRID, _NUM_CONFIGS }; - enum ParticleArray { - POSITION, - VELOCITY, - }; + enum ParticleArray { + POSITION, + VELOCITY, + }; - void update(float deltaTime); - void reset(ParticleConfig config); + void update(float deltaTime); + void reset(ParticleConfig config); - float *getArray(ParticleArray array); - void setArray(ParticleArray array, const float *data, int start, int count); + float *getArray(ParticleArray array); + void setArray(ParticleArray array, const float *data, int start, int count); - int getNumParticles() const { return m_numParticles; } + int getNumParticles() const { return m_numParticles; } - unsigned int getCurrentReadBuffer() const { return m_posVbo; } - unsigned int getColorBuffer() const { return m_colorVBO; } + unsigned int getCurrentReadBuffer() const { return m_posVbo; } + unsigned int getColorBuffer() const { return m_colorVBO; } - void *getCudaPosVBO() const { return (void *)m_cudaPosVBO; } - void *getCudaColorVBO() const { return (void *)m_cudaColorVBO; } + void *getCudaPosVBO() const { return (void *)m_cudaPosVBO; } + void *getCudaColorVBO() const { return (void *)m_cudaColorVBO; } - void dumpGrid(); - void 
dumpParticles(uint start, uint count); + void dumpGrid(); + void dumpParticles(uint start, uint count); - void setIterations(int i) { m_solverIterations = i; } + void setIterations(int i) { m_solverIterations = i; } - void setDamping(float x) { m_params.globalDamping = x; } - void setGravity(float x) { m_params.gravity = make_float3(0.0f, x, 0.0f); } + void setDamping(float x) { m_params.globalDamping = x; } + void setGravity(float x) { m_params.gravity = make_float3(0.0f, x, 0.0f); } - void setCollideSpring(float x) { m_params.spring = x; } - void setCollideDamping(float x) { m_params.damping = x; } - void setCollideShear(float x) { m_params.shear = x; } - void setCollideAttraction(float x) { m_params.attraction = x; } + void setCollideSpring(float x) { m_params.spring = x; } + void setCollideDamping(float x) { m_params.damping = x; } + void setCollideShear(float x) { m_params.shear = x; } + void setCollideAttraction(float x) { m_params.attraction = x; } - void setColliderPos(float3 x) { m_params.colliderPos = x; } + void setColliderPos(float3 x) { m_params.colliderPos = x; } - float getParticleRadius() { return m_params.particleRadius; } - float3 getColliderPos() { return m_params.colliderPos; } - float getColliderRadius() { return m_params.colliderRadius; } - uint3 getGridSize() { return m_params.gridSize; } - float3 getWorldOrigin() { return m_params.worldOrigin; } - float3 getCellSize() { return m_params.cellSize; } + float getParticleRadius() { return m_params.particleRadius; } + float3 getColliderPos() { return m_params.colliderPos; } + float getColliderRadius() { return m_params.colliderRadius; } + uint3 getGridSize() { return m_params.gridSize; } + float3 getWorldOrigin() { return m_params.worldOrigin; } + float3 getCellSize() { return m_params.cellSize; } - void addSphere(int index, float *pos, float *vel, int r, float spacing); + void addSphere(int index, float *pos, float *vel, int r, float spacing); - protected: // methods - ParticleSystem() {} - uint createVBO(uint size); +protected: // methods + ParticleSystem() {} + uint createVBO(uint size); - void _initialize(int numParticles); - void _finalize(); + void _initialize(int numParticles); + void _finalize(); - void initGrid(uint *size, float spacing, float jitter, uint numParticles); + void initGrid(uint *size, float spacing, float jitter, uint numParticles); - protected: // data - bool m_bInitialized, m_bUseOpenGL; - uint m_numParticles; +protected: // data + bool m_bInitialized, m_bUseOpenGL; + uint m_numParticles; - // CPU data - float *m_hPos; // particle positions - float *m_hVel; // particle velocities + // CPU data + float *m_hPos; // particle positions + float *m_hVel; // particle velocities - uint *m_hParticleHash; - uint *m_hCellStart; - uint *m_hCellEnd; + uint *m_hParticleHash; + uint *m_hCellStart; + uint *m_hCellEnd; - // GPU data - float *m_dPos; - float *m_dVel; + // GPU data + float *m_dPos; + float *m_dVel; - float *m_dSortedPos; - float *m_dSortedVel; + float *m_dSortedPos; + float *m_dSortedVel; - // grid data for sorting method - uint *m_dGridParticleHash; // grid hash value for each particle - uint *m_dGridParticleIndex; // particle index for each particle - uint *m_dCellStart; // index of start of each cell in sorted list - uint *m_dCellEnd; // index of end of cell + // grid data for sorting method + uint *m_dGridParticleHash; // grid hash value for each particle + uint *m_dGridParticleIndex; // particle index for each particle + uint *m_dCellStart; // index of start of each cell in sorted list + uint 
*m_dCellEnd; // index of end of cell - uint m_gridSortBits; + uint m_gridSortBits; - uint m_posVbo; // vertex buffer object for particle positions - uint m_colorVBO; // vertex buffer object for colors + uint m_posVbo; // vertex buffer object for particle positions + uint m_colorVBO; // vertex buffer object for colors - float *m_cudaPosVBO; // these are the CUDA deviceMem Pos - float *m_cudaColorVBO; // these are the CUDA deviceMem Color + float *m_cudaPosVBO; // these are the CUDA deviceMem Pos + float *m_cudaColorVBO; // these are the CUDA deviceMem Color - struct cudaGraphicsResource - *m_cuda_posvbo_resource; // handles OpenGL-CUDA exchange - struct cudaGraphicsResource - *m_cuda_colorvbo_resource; // handles OpenGL-CUDA exchange + struct cudaGraphicsResource *m_cuda_posvbo_resource; // handles OpenGL-CUDA exchange + struct cudaGraphicsResource *m_cuda_colorvbo_resource; // handles OpenGL-CUDA exchange - // params - SimParams m_params; - uint3 m_gridSize; - uint m_numGridCells; + // params + SimParams m_params; + uint3 m_gridSize; + uint m_numGridCells; - StopWatchInterface *m_timer; + StopWatchInterface *m_timer; - uint m_solverIterations; + uint m_solverIterations; }; -#endif // __PARTICLESYSTEM_H__ +#endif // __PARTICLESYSTEM_H__ diff --git a/Samples/2_Concepts_and_Techniques/particles/particleSystem_cuda.cu b/Samples/2_Concepts_and_Techniques/particles/particleSystem_cuda.cu index f1c97348..ce9e82a8 100644 --- a/Samples/2_Concepts_and_Techniques/particles/particleSystem_cuda.cu +++ b/Samples/2_Concepts_and_Techniques/particles/particleSystem_cuda.cu @@ -35,189 +35,184 @@ #include #endif -#include #include +#include +#include +#include +#include +#include #include -#include -#include - -#include - -#include +#include "particles_kernel_impl.cuh" #include "thrust/device_ptr.h" #include "thrust/for_each.h" #include "thrust/iterator/zip_iterator.h" #include "thrust/sort.h" -#include "particles_kernel_impl.cuh" - extern "C" { - void cudaInit(int argc, char **argv) - { - int devID; - - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - devID = findCudaDevice(argc, (const char **)argv); - - if (devID < 0) + void cudaInit(int argc, char **argv) { - printf("No CUDA Capable devices found, exiting...\n"); - exit(EXIT_SUCCESS); - } - } + int devID; - void allocateArray(void **devPtr, size_t size) - { - checkCudaErrors(cudaMalloc(devPtr, size)); - } + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + devID = findCudaDevice(argc, (const char **)argv); - void freeArray(void *devPtr) { checkCudaErrors(cudaFree(devPtr)); } - - void threadSync() { checkCudaErrors(cudaDeviceSynchronize()); } - - void copyArrayToDevice(void *device, const void *host, int offset, int size) - { - checkCudaErrors( - cudaMemcpy((char *)device + offset, host, size, cudaMemcpyHostToDevice)); - } - - void registerGLBufferObject(uint vbo, - struct cudaGraphicsResource **cuda_vbo_resource) - { - checkCudaErrors(cudaGraphicsGLRegisterBuffer(cuda_vbo_resource, vbo, - cudaGraphicsMapFlagsNone)); - } - - void unregisterGLBufferObject(struct cudaGraphicsResource *cuda_vbo_resource) - { - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); - } - - void *mapGLBufferObject(struct cudaGraphicsResource **cuda_vbo_resource) - { - void *ptr; - checkCudaErrors(cudaGraphicsMapResources(1, cuda_vbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&ptr, &num_bytes, *cuda_vbo_resource)); - 
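// mapGLBufferObject()/unmapGLBufferObject() above wrap the standard CUDA-GL
// interop sequence: map the registered resource, fetch a device pointer that
// is valid only while mapped, use it, then unmap so OpenGL may touch the
// buffer again. A condensed sketch of one round trip; runOnMappedVBO and the
// kernel are hypothetical names, not part of the sample:

#include <cuda_gl_interop.h>
#include <cuda_runtime.h>

// Hypothetical placeholder kernel operating on the mapped buffer.
__global__ void touch(float4 *pos, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        pos[i].w = 1.0f; // placeholder work
    }
}

void runOnMappedVBO(struct cudaGraphicsResource *res, unsigned int n)
{
    float4 *dPos     = nullptr;
    size_t  numBytes = 0;

    cudaGraphicsMapResources(1, &res, 0); // GL must not use the VBO while mapped
    cudaGraphicsResourceGetMappedPointer((void **)&dPos, &numBytes, res);

    touch<<<(n + 255) / 256, 256>>>(dPos, n);

    cudaGraphicsUnmapResources(1, &res, 0); // hand the buffer back to OpenGL
}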
return ptr; - } - - void unmapGLBufferObject(struct cudaGraphicsResource *cuda_vbo_resource) - { - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); - } - - void copyArrayFromDevice(void *host, const void *device, - struct cudaGraphicsResource **cuda_vbo_resource, - int size) - { - if (cuda_vbo_resource) - { - device = mapGLBufferObject(cuda_vbo_resource); + if (devID < 0) { + printf("No CUDA Capable devices found, exiting...\n"); + exit(EXIT_SUCCESS); + } } - checkCudaErrors(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost)); + void allocateArray(void **devPtr, size_t size) { checkCudaErrors(cudaMalloc(devPtr, size)); } - if (cuda_vbo_resource) + void freeArray(void *devPtr) { checkCudaErrors(cudaFree(devPtr)); } + + void threadSync() { checkCudaErrors(cudaDeviceSynchronize()); } + + void copyArrayToDevice(void *device, const void *host, int offset, int size) { - unmapGLBufferObject(*cuda_vbo_resource); + checkCudaErrors(cudaMemcpy((char *)device + offset, host, size, cudaMemcpyHostToDevice)); } - } - void setParameters(SimParams *hostParams) - { - // copy parameters to constant memory - checkCudaErrors(cudaMemcpyToSymbol(cudaParams, hostParams, sizeof(SimParams))); - } + void registerGLBufferObject(uint vbo, struct cudaGraphicsResource **cuda_vbo_resource) + { + checkCudaErrors(cudaGraphicsGLRegisterBuffer(cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); + } - // Round a / b to nearest higher integer value - uint iDivUp(uint a, uint b) { return (a % b != 0) ? (a / b + 1) : (a / b); } + void unregisterGLBufferObject(struct cudaGraphicsResource *cuda_vbo_resource) + { + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); + } - // compute grid and thread block size for a given number of elements - void computeGridSize(uint n, uint blockSize, uint &numBlocks, - uint &numThreads) - { - numThreads = min(blockSize, n); - numBlocks = iDivUp(n, numThreads); - } + void *mapGLBufferObject(struct cudaGraphicsResource **cuda_vbo_resource) + { + void *ptr; + checkCudaErrors(cudaGraphicsMapResources(1, cuda_vbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&ptr, &num_bytes, *cuda_vbo_resource)); + return ptr; + } - void integrateSystem(float *pos, float *vel, float deltaTime, - uint numParticles) - { - thrust::device_ptr d_pos4((float4 *)pos); - thrust::device_ptr d_vel4((float4 *)vel); + void unmapGLBufferObject(struct cudaGraphicsResource *cuda_vbo_resource) + { + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); + } - thrust::for_each( - thrust::make_zip_iterator(thrust::make_tuple(d_pos4, d_vel4)), - thrust::make_zip_iterator( - thrust::make_tuple(d_pos4 + numParticles, d_vel4 + numParticles)), - integrate_functor(deltaTime)); - } + void copyArrayFromDevice(void *host, const void *device, struct cudaGraphicsResource **cuda_vbo_resource, int size) + { + if (cuda_vbo_resource) { + device = mapGLBufferObject(cuda_vbo_resource); + } - void calcHash(uint *gridParticleHash, uint *gridParticleIndex, float *pos, - int numParticles) - { - uint numThreads, numBlocks; - computeGridSize(numParticles, 256, numBlocks, numThreads); + checkCudaErrors(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost)); - // execute the kernel - calcHashD<<>>(gridParticleHash, gridParticleIndex, - (float4 *)pos, numParticles); + if (cuda_vbo_resource) { + unmapGLBufferObject(*cuda_vbo_resource); + } + } - // check if kernel invocation generated an error - getLastCudaError("Kernel execution failed"); - } + void 
setParameters(SimParams *hostParams)
+    {
+        // copy parameters to constant memory
+        checkCudaErrors(cudaMemcpyToSymbol(cudaParams, hostParams, sizeof(SimParams)));
+    }
-  void reorderDataAndFindCellStart(uint *cellStart, uint *cellEnd,
-                                   float *sortedPos, float *sortedVel,
-                                   uint *gridParticleHash,
-                                   uint *gridParticleIndex, float *oldPos,
-                                   float *oldVel, uint numParticles,
-                                   uint numCells)
-  {
-    uint numThreads, numBlocks;
-    computeGridSize(numParticles, 256, numBlocks, numThreads);
+    // Round a / b to nearest higher integer value
+    uint iDivUp(uint a, uint b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
-    // set all cells to empty
-    checkCudaErrors(cudaMemset(cellStart, 0xffffffff, numCells * sizeof(uint)));
+    // compute grid and thread block size for a given number of elements
+    void computeGridSize(uint n, uint blockSize, uint &numBlocks, uint &numThreads)
+    {
+        numThreads = min(blockSize, n);
+        numBlocks  = iDivUp(n, numThreads);
+    }
-    uint smemSize = sizeof(uint) * (numThreads + 1);
-    reorderDataAndFindCellStartD<<<numBlocks, numThreads, smemSize>>>(
-        cellStart, cellEnd, (float4 *)sortedPos, (float4 *)sortedVel,
-        gridParticleHash, gridParticleIndex, (float4 *)oldPos, (float4 *)oldVel,
-        numParticles);
-    getLastCudaError("Kernel execution failed: reorderDataAndFindCellStartD");
-  }
+    void integrateSystem(float *pos, float *vel, float deltaTime, uint numParticles)
+    {
+        thrust::device_ptr<float4> d_pos4((float4 *)pos);
+        thrust::device_ptr<float4> d_vel4((float4 *)vel);
-  void collide(float *newVel, float *sortedPos, float *sortedVel,
-               uint *gridParticleIndex, uint *cellStart, uint *cellEnd,
-               uint numParticles, uint numCells)
-  {
-    // thread per particle
-    uint numThreads, numBlocks;
-    computeGridSize(numParticles, 64, numBlocks, numThreads);
+        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(d_pos4, d_vel4)),
+                         thrust::make_zip_iterator(thrust::make_tuple(d_pos4 + numParticles, d_vel4 + numParticles)),
+                         integrate_functor(deltaTime));
+    }
-    // execute the kernel
-    collideD<<<numBlocks, numThreads>>>((float4 *)newVel, (float4 *)sortedPos,
-                                        (float4 *)sortedVel, gridParticleIndex,
-                                        cellStart, cellEnd, numParticles);
+    void calcHash(uint *gridParticleHash, uint *gridParticleIndex, float *pos, int numParticles)
+    {
+        uint numThreads, numBlocks;
+        computeGridSize(numParticles, 256, numBlocks, numThreads);
-    // check if kernel invocation generated an error
-    getLastCudaError("Kernel execution failed");
-  }
+        // execute the kernel
+        calcHashD<<<numBlocks, numThreads>>>(gridParticleHash, gridParticleIndex, (float4 *)pos, numParticles);
-  void sortParticles(uint *dGridParticleHash, uint *dGridParticleIndex,
-                     uint numParticles)
-  {
-    thrust::sort_by_key(
-        thrust::device_ptr<uint>(dGridParticleHash),
-        thrust::device_ptr<uint>(dGridParticleHash + numParticles),
-        thrust::device_ptr<uint>(dGridParticleIndex));
-  }
+        // check if kernel invocation generated an error
+        getLastCudaError("Kernel execution failed");
+    }
+
+    void reorderDataAndFindCellStart(uint *cellStart,
+                                     uint *cellEnd,
+                                     float *sortedPos,
+                                     float *sortedVel,
+                                     uint *gridParticleHash,
+                                     uint *gridParticleIndex,
+                                     float *oldPos,
+                                     float *oldVel,
+                                     uint numParticles,
+                                     uint numCells)
+    {
+        uint numThreads, numBlocks;
+        computeGridSize(numParticles, 256, numBlocks, numThreads);
+
+        // set all cells to empty
+        checkCudaErrors(cudaMemset(cellStart, 0xffffffff, numCells * sizeof(uint)));
+
+        uint smemSize = sizeof(uint) * (numThreads + 1);
+        reorderDataAndFindCellStartD<<<numBlocks, numThreads, smemSize>>>(cellStart,
+                                                                          cellEnd,
+                                                                          (float4 *)sortedPos,
+                                                                          (float4 *)sortedVel,
+                                                                          gridParticleHash,
+                                                                          gridParticleIndex,
+                                                                          (float4 *)oldPos,
+                                                                          (float4 *)oldVel,
+
numParticles); + getLastCudaError("Kernel execution failed: reorderDataAndFindCellStartD"); + } + + void collide(float *newVel, + float *sortedPos, + float *sortedVel, + uint *gridParticleIndex, + uint *cellStart, + uint *cellEnd, + uint numParticles, + uint numCells) + { + // thread per particle + uint numThreads, numBlocks; + computeGridSize(numParticles, 64, numBlocks, numThreads); + + // execute the kernel + collideD<<>>((float4 *)newVel, + (float4 *)sortedPos, + (float4 *)sortedVel, + gridParticleIndex, + cellStart, + cellEnd, + numParticles); + + // check if kernel invocation generated an error + getLastCudaError("Kernel execution failed"); + } + + void sortParticles(uint *dGridParticleHash, uint *dGridParticleIndex, uint numParticles) + { + thrust::sort_by_key(thrust::device_ptr(dGridParticleHash), + thrust::device_ptr(dGridParticleHash + numParticles), + thrust::device_ptr(dGridParticleIndex)); + } } // extern "C" diff --git a/Samples/2_Concepts_and_Techniques/particles/particles.cpp b/Samples/2_Concepts_and_Techniques/particles/particles.cpp index f557b771..d148f676 100644 --- a/Samples/2_Concepts_and_Techniques/particles/particles.cpp +++ b/Samples/2_Concepts_and_Techniques/particles/particles.cpp @@ -55,73 +55,73 @@ #include // CUDA utilities and system includes +#include // includes cuda.h and cuda_runtime_api.h #include -#include // includes cuda.h and cuda_runtime_api.h // Includes -#include -#include -#include #include +#include +#include +#include +#include "paramgl.h" #include "particleSystem.h" #include "render_particles.h" -#include "paramgl.h" #define MAX_EPSILON_ERROR 5.00f -#define THRESHOLD 0.30f +#define THRESHOLD 0.30f -#define GRID_SIZE 64 +#define GRID_SIZE 64 #define NUM_PARTICLES 16384 const uint width = 640, height = 480; // view params -int ox, oy; -int buttonState = 0; -float camera_trans[] = {0, 0, -3}; -float camera_rot[] = {0, 0, 0}; -float camera_trans_lag[] = {0, 0, -3}; -float camera_rot_lag[] = {0, 0, 0}; -const float inertia = 0.1f; -ParticleRenderer::DisplayMode displayMode = ParticleRenderer::PARTICLE_SPHERES; +int ox, oy; +int buttonState = 0; +float camera_trans[] = {0, 0, -3}; +float camera_rot[] = {0, 0, 0}; +float camera_trans_lag[] = {0, 0, -3}; +float camera_rot_lag[] = {0, 0, 0}; +const float inertia = 0.1f; +ParticleRenderer::DisplayMode displayMode = ParticleRenderer::PARTICLE_SPHERES; -int mode = 0; -bool displayEnabled = true; -bool bPause = false; -bool displaySliders = false; -bool wireframe = false; -bool demoMode = false; -int idleCounter = 0; -int demoCounter = 0; -const int idleDelay = 2000; +int mode = 0; +bool displayEnabled = true; +bool bPause = false; +bool displaySliders = false; +bool wireframe = false; +bool demoMode = false; +int idleCounter = 0; +int demoCounter = 0; +const int idleDelay = 2000; enum { M_VIEW = 0, M_MOVE }; -uint numParticles = 0; +uint numParticles = 0; uint3 gridSize; -int numIterations = 0; // run until exit +int numIterations = 0; // run until exit // simulation parameters -float timestep = 0.5f; -float damping = 1.0f; -float gravity = 0.0003f; -int iterations = 1; -int ballr = 10; +float timestep = 0.5f; +float damping = 1.0f; +float gravity = 0.0003f; +int iterations = 1; +int ballr = 10; float collideSpring = 0.5f; ; float collideDamping = 0.02f; ; -float collideShear = 0.1f; +float collideShear = 0.1f; float collideAttraction = 0.0f; ParticleSystem *psystem = 0; // fps -static int fpsCount = 0; -static int fpsLimit = 1; -StopWatchInterface *timer = NULL; +static int fpsCount = 0; +static int 
fpsLimit = 1; +StopWatchInterface *timer = NULL; ParticleRenderer *renderer = 0; @@ -130,616 +130,624 @@ float modelView[16]; ParamListGL *params; // Auto-Verification Code -const int frameCheckNumber = 4; -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; -char *g_refFile = NULL; +const int frameCheckNumber = 4; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; +char *g_refFile = NULL; const char *sSDKsample = "CUDA Particles Simulation"; extern "C" void cudaInit(int argc, char **argv); extern "C" void cudaGLInit(int argc, char **argv); -extern "C" void copyArrayFromDevice(void *host, const void *device, - unsigned int vbo, int size); +extern "C" void copyArrayFromDevice(void *host, const void *device, unsigned int vbo, int size); // initialize particle system -void initParticleSystem(int numParticles, uint3 gridSize, bool bUseOpenGL) { - psystem = new ParticleSystem(numParticles, gridSize, bUseOpenGL); - psystem->reset(ParticleSystem::CONFIG_GRID); +void initParticleSystem(int numParticles, uint3 gridSize, bool bUseOpenGL) +{ + psystem = new ParticleSystem(numParticles, gridSize, bUseOpenGL); + psystem->reset(ParticleSystem::CONFIG_GRID); - if (bUseOpenGL) { - renderer = new ParticleRenderer; - renderer->setParticleRadius(psystem->getParticleRadius()); - renderer->setColorBuffer(psystem->getColorBuffer()); - } + if (bUseOpenGL) { + renderer = new ParticleRenderer; + renderer->setParticleRadius(psystem->getParticleRadius()); + renderer->setColorBuffer(psystem->getColorBuffer()); + } - sdkCreateTimer(&timer); + sdkCreateTimer(&timer); } -void cleanup() { - sdkDeleteTimer(&timer); +void cleanup() +{ + sdkDeleteTimer(&timer); - if (psystem) { - delete psystem; - } - return; + if (psystem) { + delete psystem; + } + return; } // initialize OpenGL -void initGL(int *argc, char **argv) { - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); - glutInitWindowSize(width, height); - glutCreateWindow("CUDA Particles"); +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow("CUDA Particles"); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported( - "GL_ARB_multitexture GL_ARB_vertex_buffer_object")) { - fprintf(stderr, "Required OpenGL extensions missing."); - exit(EXIT_FAILURE); - } + if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_multitexture GL_ARB_vertex_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(EXIT_FAILURE); + } #if defined(WIN32) - if (wglewIsSupported("WGL_EXT_swap_control")) { - // disable vertical sync - wglSwapIntervalEXT(0); - } + if (wglewIsSupported("WGL_EXT_swap_control")) { + // disable vertical sync + wglSwapIntervalEXT(0); + } #endif - glEnable(GL_DEPTH_TEST); - glClearColor(0.25, 0.25, 0.25, 1.0); - - glutReportErrors(); -} - -void runBenchmark(int iterations, char *exec_path) { - printf("Run %u particles simulation for %d iterations...\n\n", numParticles, - iterations); - cudaDeviceSynchronize(); - sdkStartTimer(&timer); - - for (int i = 0; i < iterations; ++i) { - psystem->update(timestep); - } - - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - float fAvgSeconds = - ((float)1.0e-3 * (float)sdkGetTimerValue(&timer) / (float)iterations); - - printf( - "particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u " - "particles, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-3 * numParticles) / fAvgSeconds, fAvgSeconds, 
numParticles, 1, 0); - - if (g_refFile) { - printf("\nChecking result...\n\n"); - float *hPos = - (float *)malloc(sizeof(float) * 4 * psystem->getNumParticles()); - copyArrayFromDevice(hPos, psystem->getCudaPosVBO(), 0, - sizeof(float) * 4 * psystem->getNumParticles()); - - sdkDumpBin((void *)hPos, sizeof(float) * 4 * psystem->getNumParticles(), - "particles.bin"); - - if (!sdkCompareBin2BinFloat("particles.bin", g_refFile, - 4 * psystem->getNumParticles(), - MAX_EPSILON_ERROR, THRESHOLD, exec_path)) { - g_TotalErrors++; - } - } -} - -void computeFPS() { - frameCount++; - fpsCount++; - - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "CUDA Particles (%d particles): %3.1f fps", numParticles, - ifps); - - glutSetWindowTitle(fps); - fpsCount = 0; - - fpsLimit = (int)MAX(ifps, 1.f); - sdkResetTimer(&timer); - } -} - -void display() { - sdkStartTimer(&timer); - - // update the simulation - if (!bPause) { - psystem->setIterations(iterations); - psystem->setDamping(damping); - psystem->setGravity(-gravity); - psystem->setCollideSpring(collideSpring); - psystem->setCollideDamping(collideDamping); - psystem->setCollideShear(collideShear); - psystem->setCollideAttraction(collideAttraction); - - psystem->update(timestep); - - if (renderer) { - renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), - psystem->getNumParticles()); - } - } - - // render - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - - // view transform - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - for (int c = 0; c < 3; ++c) { - camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; - camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; - } - - glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); - glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); - glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); - - glGetFloatv(GL_MODELVIEW_MATRIX, modelView); - - // cube - glColor3f(1.0, 1.0, 1.0); - glutWireCube(2.0); - - // collider - glPushMatrix(); - float3 p = psystem->getColliderPos(); - glTranslatef(p.x, p.y, p.z); - glColor3f(1.0, 0.0, 0.0); - glutSolidSphere(psystem->getColliderRadius(), 20, 10); - glPopMatrix(); - - if (renderer && displayEnabled) { - renderer->display(displayMode); - } - - if (displaySliders) { - glDisable(GL_DEPTH_TEST); - glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color - glEnable(GL_BLEND); - params->Render(0, 0); - glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); - } + glClearColor(0.25, 0.25, 0.25, 1.0); - sdkStopTimer(&timer); + glutReportErrors(); +} - glutSwapBuffers(); - glutReportErrors(); +void runBenchmark(int iterations, char *exec_path) +{ + printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations); + cudaDeviceSynchronize(); + sdkStartTimer(&timer); - computeFPS(); + for (int i = 0; i < iterations; ++i) { + psystem->update(timestep); + } + + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer) / (float)iterations); + + printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u " + "particles, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-3 * numParticles) / fAvgSeconds, + fAvgSeconds, + numParticles, + 1, + 0); + + if (g_refFile) { + printf("\nChecking result...\n\n"); + float *hPos = (float *)malloc(sizeof(float) * 4 * psystem->getNumParticles()); + copyArrayFromDevice(hPos, psystem->getCudaPosVBO(), 0, sizeof(float) * 4 * 
psystem->getNumParticles()); + + sdkDumpBin((void *)hPos, sizeof(float) * 4 * psystem->getNumParticles(), "particles.bin"); + + if (!sdkCompareBin2BinFloat( + "particles.bin", g_refFile, 4 * psystem->getNumParticles(), MAX_EPSILON_ERROR, THRESHOLD, exec_path)) { + g_TotalErrors++; + } + } +} + +void computeFPS() +{ + frameCount++; + fpsCount++; + + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "CUDA Particles (%d particles): %3.1f fps", numParticles, ifps); + + glutSetWindowTitle(fps); + fpsCount = 0; + + fpsLimit = (int)MAX(ifps, 1.f); + sdkResetTimer(&timer); + } +} + +void display() +{ + sdkStartTimer(&timer); + + // update the simulation + if (!bPause) { + psystem->setIterations(iterations); + psystem->setDamping(damping); + psystem->setGravity(-gravity); + psystem->setCollideSpring(collideSpring); + psystem->setCollideDamping(collideDamping); + psystem->setCollideShear(collideShear); + psystem->setCollideAttraction(collideAttraction); + + psystem->update(timestep); + + if (renderer) { + renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); + } + } + + // render + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + // view transform + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + + for (int c = 0; c < 3; ++c) { + camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; + camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; + } + + glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); + glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); + glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); + + glGetFloatv(GL_MODELVIEW_MATRIX, modelView); + + // cube + glColor3f(1.0, 1.0, 1.0); + glutWireCube(2.0); + + // collider + glPushMatrix(); + float3 p = psystem->getColliderPos(); + glTranslatef(p.x, p.y, p.z); + glColor3f(1.0, 0.0, 0.0); + glutSolidSphere(psystem->getColliderRadius(), 20, 10); + glPopMatrix(); + + if (renderer && displayEnabled) { + renderer->display(displayMode); + } + + if (displaySliders) { + glDisable(GL_DEPTH_TEST); + glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color + glEnable(GL_BLEND); + params->Render(0, 0); + glDisable(GL_BLEND); + glEnable(GL_DEPTH_TEST); + } + + sdkStopTimer(&timer); + + glutSwapBuffers(); + glutReportErrors(); + + computeFPS(); } inline float frand() { return rand() / (float)RAND_MAX; } -void addSphere() { - // inject a sphere of particles - float pr = psystem->getParticleRadius(); - float tr = pr + (pr * 2.0f) * ballr; - float pos[4], vel[4]; - pos[0] = -1.0f + tr + frand() * (2.0f - tr * 2.0f); - pos[1] = 1.0f - tr; - pos[2] = -1.0f + tr + frand() * (2.0f - tr * 2.0f); - pos[3] = 0.0f; - vel[0] = vel[1] = vel[2] = vel[3] = 0.0f; - psystem->addSphere(0, pos, vel, ballr, pr * 2.0f); +void addSphere() +{ + // inject a sphere of particles + float pr = psystem->getParticleRadius(); + float tr = pr + (pr * 2.0f) * ballr; + float pos[4], vel[4]; + pos[0] = -1.0f + tr + frand() * (2.0f - tr * 2.0f); + pos[1] = 1.0f - tr; + pos[2] = -1.0f + tr + frand() * (2.0f - tr * 2.0f); + pos[3] = 0.0f; + vel[0] = vel[1] = vel[2] = vel[3] = 0.0f; + psystem->addSphere(0, pos, vel, ballr, pr * 2.0f); } -void reshape(int w, int h) { - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (float)w / (float)h, 0.1, 100.0); +void reshape(int w, int h) +{ + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (float)w / (float)h, 0.1, 100.0); - 
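// display() above eases the camera toward its target with a first-order
// low-pass filter: each frame the lagged value moves a fixed fraction
// ("inertia", 0.1 here) of the remaining distance. A minimal standalone
// sketch of that smoothing, with hypothetical names not part of the sample:

#include <cstdio>

// lag += (target - lag) * k converges geometrically toward target;
// with k = 0.1f, each frame closes 10% of the remaining gap.
static void smoothToward(const float target[3], float lag[3], float k)
{
    for (int c = 0; c < 3; ++c) {
        lag[c] += (target[c] - lag[c]) * k;
    }
}

int main()
{
    const float target[3] = {0.0f, 0.0f, -3.0f};
    float       lag[3]    = {0.0f, 0.0f, 0.0f};

    for (int frame = 0; frame < 5; ++frame) {
        smoothToward(target, lag, 0.1f);
        printf("frame %d: z = %.3f\n", frame, lag[2]);
    }
    return 0;
}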
glMatrixMode(GL_MODELVIEW); - glViewport(0, 0, w, h); + glMatrixMode(GL_MODELVIEW); + glViewport(0, 0, w, h); - if (renderer) { - renderer->setWindowSize(w, h); - renderer->setFOV(60.0); - } -} - -void mouse(int button, int state, int x, int y) { - int mods; - - if (state == GLUT_DOWN) { - buttonState |= 1 << button; - } else if (state == GLUT_UP) { - buttonState = 0; - } - - mods = glutGetModifiers(); - - if (mods & GLUT_ACTIVE_SHIFT) { - buttonState = 2; - } else if (mods & GLUT_ACTIVE_CTRL) { - buttonState = 3; - } - - ox = x; - oy = y; - - demoMode = false; - idleCounter = 0; - - if (displaySliders) { - if (params->Mouse(x, y, button, state)) { - glutPostRedisplay(); - return; + if (renderer) { + renderer->setWindowSize(w, h); + renderer->setFOV(60.0); } - } +} - glutPostRedisplay(); +void mouse(int button, int state, int x, int y) +{ + int mods; + + if (state == GLUT_DOWN) { + buttonState |= 1 << button; + } + else if (state == GLUT_UP) { + buttonState = 0; + } + + mods = glutGetModifiers(); + + if (mods & GLUT_ACTIVE_SHIFT) { + buttonState = 2; + } + else if (mods & GLUT_ACTIVE_CTRL) { + buttonState = 3; + } + + ox = x; + oy = y; + + demoMode = false; + idleCounter = 0; + + if (displaySliders) { + if (params->Mouse(x, y, button, state)) { + glutPostRedisplay(); + return; + } + } + + glutPostRedisplay(); } // transform vector by matrix -void xform(float *v, float *r, GLfloat *m) { - r[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; - r[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13]; - r[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14]; +void xform(float *v, float *r, GLfloat *m) +{ + r[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; + r[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13]; + r[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14]; } // transform vector by transpose of matrix -void ixform(float *v, float *r, GLfloat *m) { - r[0] = v[0] * m[0] + v[1] * m[1] + v[2] * m[2]; - r[1] = v[0] * m[4] + v[1] * m[5] + v[2] * m[6]; - r[2] = v[0] * m[8] + v[1] * m[9] + v[2] * m[10]; +void ixform(float *v, float *r, GLfloat *m) +{ + r[0] = v[0] * m[0] + v[1] * m[1] + v[2] * m[2]; + r[1] = v[0] * m[4] + v[1] * m[5] + v[2] * m[6]; + r[2] = v[0] * m[8] + v[1] * m[9] + v[2] * m[10]; } -void ixformPoint(float *v, float *r, GLfloat *m) { - float x[4]; - x[0] = v[0] - m[12]; - x[1] = v[1] - m[13]; - x[2] = v[2] - m[14]; - x[3] = 1.0f; - ixform(x, r, m); +void ixformPoint(float *v, float *r, GLfloat *m) +{ + float x[4]; + x[0] = v[0] - m[12]; + x[1] = v[1] - m[13]; + x[2] = v[2] - m[14]; + x[3] = 1.0f; + ixform(x, r, m); } -void motion(int x, int y) { - float dx, dy; - dx = (float)(x - ox); - dy = (float)(y - oy); +void motion(int x, int y) +{ + float dx, dy; + dx = (float)(x - ox); + dy = (float)(y - oy); - if (displaySliders) { - if (params->Motion(x, y)) { - ox = x; - oy = y; - glutPostRedisplay(); - return; + if (displaySliders) { + if (params->Motion(x, y)) { + ox = x; + oy = y; + glutPostRedisplay(); + return; + } } - } - switch (mode) { + switch (mode) { case M_VIEW: - if (buttonState == 3) { - // left+middle = zoom - camera_trans[2] += (dy / 100.0f) * 0.5f * fabs(camera_trans[2]); - } else if (buttonState & 2) { - // middle = translate - camera_trans[0] += dx / 100.0f; - camera_trans[1] -= dy / 100.0f; - } else if (buttonState & 1) { - // left = rotate - camera_rot[0] += dy / 5.0f; - camera_rot[1] += dx / 5.0f; - } + if (buttonState == 3) { + // left+middle = zoom + camera_trans[2] += (dy / 100.0f) * 0.5f * fabs(camera_trans[2]); + } + else if (buttonState & 
2) { + // middle = translate + camera_trans[0] += dx / 100.0f; + camera_trans[1] -= dy / 100.0f; + } + else if (buttonState & 1) { + // left = rotate + camera_rot[0] += dy / 5.0f; + camera_rot[1] += dx / 5.0f; + } - break; + break; case M_MOVE: { - float translateSpeed = 0.003f; - float3 p = psystem->getColliderPos(); + float translateSpeed = 0.003f; + float3 p = psystem->getColliderPos(); - if (buttonState == 1) { - float v[3], r[3]; - v[0] = dx * translateSpeed; - v[1] = -dy * translateSpeed; - v[2] = 0.0f; - ixform(v, r, modelView); - p.x += r[0]; - p.y += r[1]; - p.z += r[2]; - } else if (buttonState == 2) { - float v[3], r[3]; - v[0] = 0.0f; - v[1] = 0.0f; - v[2] = dy * translateSpeed; - ixform(v, r, modelView); - p.x += r[0]; - p.y += r[1]; - p.z += r[2]; - } + if (buttonState == 1) { + float v[3], r[3]; + v[0] = dx * translateSpeed; + v[1] = -dy * translateSpeed; + v[2] = 0.0f; + ixform(v, r, modelView); + p.x += r[0]; + p.y += r[1]; + p.z += r[2]; + } + else if (buttonState == 2) { + float v[3], r[3]; + v[0] = 0.0f; + v[1] = 0.0f; + v[2] = dy * translateSpeed; + ixform(v, r, modelView); + p.x += r[0]; + p.y += r[1]; + p.z += r[2]; + } - psystem->setColliderPos(p); + psystem->setColliderPos(p); } break; - } + } - ox = x; - oy = y; + ox = x; + oy = y; - demoMode = false; - idleCounter = 0; + demoMode = false; + idleCounter = 0; - glutPostRedisplay(); + glutPostRedisplay(); } // commented out to remove unused parameter warnings in Linux -void key(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void key(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case ' ': - bPause = !bPause; - break; + bPause = !bPause; + break; case 13: - psystem->update(timestep); + psystem->update(timestep); - if (renderer) { - renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), - psystem->getNumParticles()); - } + if (renderer) { + renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); + } - break; + break; case '\033': case 'q': #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif case 'v': - mode = M_VIEW; - break; + mode = M_VIEW; + break; case 'm': - mode = M_MOVE; - break; + mode = M_MOVE; + break; case 'p': - displayMode = (ParticleRenderer::DisplayMode)( - (displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES); - break; + displayMode = (ParticleRenderer::DisplayMode)((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES); + break; case 'd': - psystem->dumpGrid(); - break; + psystem->dumpGrid(); + break; case 'u': - psystem->dumpParticles(0, numParticles - 1); - break; + psystem->dumpParticles(0, numParticles - 1); + break; case 'r': - displayEnabled = !displayEnabled; - break; + displayEnabled = !displayEnabled; + break; case '1': - psystem->reset(ParticleSystem::CONFIG_GRID); - break; + psystem->reset(ParticleSystem::CONFIG_GRID); + break; case '2': - psystem->reset(ParticleSystem::CONFIG_RANDOM); - break; + psystem->reset(ParticleSystem::CONFIG_RANDOM); + break; case '3': - addSphere(); - break; + addSphere(); + break; case '4': { - // shoot ball from camera - float pr = psystem->getParticleRadius(); - float vel[4], velw[4], pos[4], posw[4]; - vel[0] = 0.0f; - vel[1] = 0.0f; - vel[2] = -0.05f; - vel[3] = 0.0f; - ixform(vel, velw, modelView); + // shoot ball from camera + float pr = psystem->getParticleRadius(); + float vel[4], velw[4], pos[4], posw[4]; + vel[0] = 0.0f; + vel[1] = 0.0f; + 
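// ixform() used by motion() and the '4' key handler exploits that the
// rotation block of the modelview matrix is orthonormal, so its transpose
// equals its inverse: multiplying by the transposed upper-left 3x3 carries a
// screen-space drag or shot direction back into world space without a full
// matrix inversion. A minimal sketch with hypothetical names, using OpenGL's
// column-major 4x4 layout as glGetFloatv returns it:

#include <cstdio>

// Apply the transpose of m's upper-left 3x3 to v, i.e. the inverse rotation.
static void invRotate(const float v[3], float r[3], const float m[16])
{
    r[0] = v[0] * m[0] + v[1] * m[1] + v[2] * m[2];
    r[1] = v[0] * m[4] + v[1] * m[5] + v[2] * m[6];
    r[2] = v[0] * m[8] + v[1] * m[9] + v[2] * m[10];
}

int main()
{
    // 90-degree rotation about +Z, column-major.
    const float m[16]   = {0, 1, 0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
    const float drag[3] = {1.0f, 0.0f, 0.0f}; // screen-space x drag
    float       world[3];

    invRotate(drag, world, m);
    printf("world-space drag: (%g, %g, %g)\n", world[0], world[1], world[2]); // (0, -1, 0)
    return 0;
}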
vel[2] = -0.05f; + vel[3] = 0.0f; + ixform(vel, velw, modelView); - pos[0] = 0.0f; - pos[1] = 0.0f; - pos[2] = -2.5f; - pos[3] = 1.0; - ixformPoint(pos, posw, modelView); - posw[3] = 0.0f; + pos[0] = 0.0f; + pos[1] = 0.0f; + pos[2] = -2.5f; + pos[3] = 1.0; + ixformPoint(pos, posw, modelView); + posw[3] = 0.0f; - psystem->addSphere(0, posw, velw, ballr, pr * 2.0f); + psystem->addSphere(0, posw, velw, ballr, pr * 2.0f); } break; case 'w': - wireframe = !wireframe; - break; + wireframe = !wireframe; + break; case 'h': - displaySliders = !displaySliders; - break; - } - - demoMode = false; - idleCounter = 0; - glutPostRedisplay(); -} - -void special(int k, int x, int y) { - if (displaySliders) { - params->Special(k, x, y); - } - - demoMode = false; - idleCounter = 0; -} - -void idle(void) { - if ((idleCounter++ > idleDelay) && (demoMode == false)) { - demoMode = true; - printf("Entering demo mode\n"); - } - - if (demoMode) { - camera_rot[1] += 0.1f; - - if (demoCounter++ > 1000) { - ballr = 10 + (rand() % 10); - addSphere(); - demoCounter = 0; + displaySliders = !displaySliders; + break; } - } - glutPostRedisplay(); + demoMode = false; + idleCounter = 0; + glutPostRedisplay(); } -void initParams() { - if (g_refFile) { - timestep = 0.0f; - damping = 0.0f; - gravity = 0.0f; - ballr = 1; - collideSpring = 0.0f; - collideDamping = 0.0f; - collideShear = 0.0f; - collideAttraction = 0.0f; - } else { - // create a new parameter list - params = new ParamListGL("misc"); - params->AddParam( - new Param("time step", timestep, 0.0f, 1.0f, 0.01f, ×tep)); - params->AddParam( - new Param("damping", damping, 0.0f, 1.0f, 0.001f, &damping)); - params->AddParam( - new Param("gravity", gravity, 0.0f, 0.001f, 0.0001f, &gravity)); - params->AddParam(new Param("ball radius", ballr, 1, 20, 1, &ballr)); +void special(int k, int x, int y) +{ + if (displaySliders) { + params->Special(k, x, y); + } - params->AddParam(new Param("collide spring", collideSpring, 0.0f, - 1.0f, 0.001f, &collideSpring)); - params->AddParam(new Param("collide damping", collideDamping, 0.0f, - 0.1f, 0.001f, &collideDamping)); - params->AddParam(new Param("collide shear", collideShear, 0.0f, 0.1f, - 0.001f, &collideShear)); - params->AddParam(new Param("collide attract", collideAttraction, - 0.0f, 0.1f, 0.001f, &collideAttraction)); - } + demoMode = false; + idleCounter = 0; +} + +void idle(void) +{ + if ((idleCounter++ > idleDelay) && (demoMode == false)) { + demoMode = true; + printf("Entering demo mode\n"); + } + + if (demoMode) { + camera_rot[1] += 0.1f; + + if (demoCounter++ > 1000) { + ballr = 10 + (rand() % 10); + addSphere(); + demoCounter = 0; + } + } + + glutPostRedisplay(); +} + +void initParams() +{ + if (g_refFile) { + timestep = 0.0f; + damping = 0.0f; + gravity = 0.0f; + ballr = 1; + collideSpring = 0.0f; + collideDamping = 0.0f; + collideShear = 0.0f; + collideAttraction = 0.0f; + } + else { + // create a new parameter list + params = new ParamListGL("misc"); + params->AddParam(new Param("time step", timestep, 0.0f, 1.0f, 0.01f, ×tep)); + params->AddParam(new Param("damping", damping, 0.0f, 1.0f, 0.001f, &damping)); + params->AddParam(new Param("gravity", gravity, 0.0f, 0.001f, 0.0001f, &gravity)); + params->AddParam(new Param("ball radius", ballr, 1, 20, 1, &ballr)); + + params->AddParam(new Param("collide spring", collideSpring, 0.0f, 1.0f, 0.001f, &collideSpring)); + params->AddParam(new Param("collide damping", collideDamping, 0.0f, 0.1f, 0.001f, &collideDamping)); + params->AddParam(new Param("collide shear", collideShear, 
0.0f, 0.1f, 0.001f, &collideShear)); + params->AddParam( + new Param("collide attract", collideAttraction, 0.0f, 0.1f, 0.001f, &collideAttraction)); + } } void mainMenu(int i) { key((unsigned char)i, 0, 0); } -void initMenus() { - glutCreateMenu(mainMenu); - glutAddMenuEntry("Reset block [1]", '1'); - glutAddMenuEntry("Reset random [2]", '2'); - glutAddMenuEntry("Add sphere [3]", '3'); - glutAddMenuEntry("View mode [v]", 'v'); - glutAddMenuEntry("Move cursor mode [m]", 'm'); - glutAddMenuEntry("Toggle point rendering [p]", 'p'); - glutAddMenuEntry("Toggle animation [ ]", ' '); - glutAddMenuEntry("Step animation [ret]", 13); - glutAddMenuEntry("Toggle sliders [h]", 'h'); - glutAddMenuEntry("Quit (esc)", '\033'); - glutAttachMenu(GLUT_RIGHT_BUTTON); +void initMenus() +{ + glutCreateMenu(mainMenu); + glutAddMenuEntry("Reset block [1]", '1'); + glutAddMenuEntry("Reset random [2]", '2'); + glutAddMenuEntry("Add sphere [3]", '3'); + glutAddMenuEntry("View mode [v]", 'v'); + glutAddMenuEntry("Move cursor mode [m]", 'm'); + glutAddMenuEntry("Toggle point rendering [p]", 'p'); + glutAddMenuEntry("Toggle animation [ ]", ' '); + glutAddMenuEntry("Step animation [ret]", 13); + glutAddMenuEntry("Toggle sliders [h]", 'h'); + glutAddMenuEntry("Quit (esc)", '\033'); + glutAttachMenu(GLUT_RIGHT_BUTTON); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - printf( - "NOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); + printf("NOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); - numParticles = NUM_PARTICLES; - uint gridDim = GRID_SIZE; - numIterations = 0; + numParticles = NUM_PARTICLES; + uint gridDim = GRID_SIZE; + numIterations = 0; - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "n")) { - numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "grid")) { + gridDim = getCmdLineArgumentInt(argc, (const char **)argv, "grid"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); + fpsLimit = frameCheckNumber; + numIterations = 1; + } } - if (checkCmdLineFlag(argc, (const char **)argv, "grid")) { - gridDim = getCmdLineArgumentInt(argc, (const char **)argv, "grid"); + gridSize.x = gridSize.y = gridSize.z = gridDim; + printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x * gridSize.y * gridSize.z); + printf("particles: %d\n", numParticles); + + bool benchmark = checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0; + + if (checkCmdLineFlag(argc, (const char **)argv, "i")) { + numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); } - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", &g_refFile); - fpsLimit = frameCheckNumber; - numIterations = 1; + if (benchmark || g_refFile) { + cudaInit(argc, argv); } - } + else { + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n in OpenGL mode\n"); + printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); + printf(" > %s -device=n -file=<*.bin>\n", argv[0]); + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } - gridSize.x = gridSize.y = gridSize.z = gridDim; - printf("grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, - gridSize.x * gridSize.y * gridSize.z); - printf("particles: %d\n", numParticles); - - bool benchmark = - checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0; - - if (checkCmdLineFlag(argc, (const char **)argv, "i")) { - numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); - } - - if (benchmark || g_refFile) { - cudaInit(argc, argv); - } else { - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n in OpenGL mode\n"); - printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); - printf(" > %s -device=n -file=<*.bin>\n", argv[0]); - printf("exiting...\n"); - exit(EXIT_SUCCESS); + initGL(&argc, argv); + cudaInit(argc, argv); } - initGL(&argc, argv); - cudaInit(argc, argv); - } + initParticleSystem(numParticles, gridSize, !benchmark && g_refFile == NULL); + initParams(); - initParticleSystem(numParticles, gridSize, !benchmark && g_refFile == NULL); - initParams(); + if (benchmark || g_refFile) { + if (numIterations <= 0) { + numIterations = 300; + } - if (benchmark || g_refFile) { - if (numIterations <= 0) { - numIterations = 300; + runBenchmark(numIterations, argv[0]); + } + else { + if (!g_refFile) { + initMenus(); + } + + glutDisplayFunc(display); + glutReshapeFunc(reshape); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutKeyboardFunc(key); + 
glutSpecialFunc(special); + glutIdleFunc(idle); + + glutCloseFunc(cleanup); + + glutMainLoop(); } - runBenchmark(numIterations, argv[0]); - } else { - if (!g_refFile) { - initMenus(); + if (psystem) { + delete psystem; } - glutDisplayFunc(display); - glutReshapeFunc(reshape); - glutMouseFunc(mouse); - glutMotionFunc(motion); - glutKeyboardFunc(key); - glutSpecialFunc(special); - glutIdleFunc(idle); - - glutCloseFunc(cleanup); - - glutMainLoop(); - } - - if (psystem) { - delete psystem; - } - - exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); + exit(g_TotalErrors > 0 ? EXIT_FAILURE : EXIT_SUCCESS); } diff --git a/Samples/2_Concepts_and_Techniques/particles/particles_kernel.cuh b/Samples/2_Concepts_and_Techniques/particles/particles_kernel.cuh index a89349b9..79f4d240 100644 --- a/Samples/2_Concepts_and_Techniques/particles/particles_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/particles/particles_kernel.cuh @@ -32,27 +32,28 @@ typedef unsigned int uint; // simulation parameters -struct SimParams { - float3 colliderPos; - float colliderRadius; +struct SimParams +{ + float3 colliderPos; + float colliderRadius; - float3 gravity; - float globalDamping; - float particleRadius; + float3 gravity; + float globalDamping; + float particleRadius; - uint3 gridSize; - uint numCells; - float3 worldOrigin; - float3 cellSize; + uint3 gridSize; + uint numCells; + float3 worldOrigin; + float3 cellSize; - uint numBodies; - uint maxParticlesPerCell; + uint numBodies; + uint maxParticlesPerCell; - float spring; - float damping; - float shear; - float attraction; - float boundaryDamping; + float spring; + float damping; + float shear; + float attraction; + float boundaryDamping; }; #endif diff --git a/Samples/2_Concepts_and_Techniques/particles/particles_kernel_impl.cuh b/Samples/2_Concepts_and_Techniques/particles/particles_kernel_impl.cuh index 5047239c..b855b7c4 100644 --- a/Samples/2_Concepts_and_Techniques/particles/particles_kernel_impl.cuh +++ b/Samples/2_Concepts_and_Techniques/particles/particles_kernel_impl.cuh @@ -32,9 +32,14 @@ #ifndef _PARTICLES_KERNEL_H_ #define _PARTICLES_KERNEL_H_ -#include -#include #include +#include +#include + +#include "thrust/device_ptr.h" +#include "thrust/for_each.h" +#include "thrust/iterator/zip_iterator.h" +#include "thrust/sort.h" namespace cg = cooperative_groups; #include "helper_math.h" @@ -46,298 +51,289 @@ __constant__ SimParams cudaParams; struct integrate_functor { - float deltaTime; + float deltaTime; - __host__ __device__ integrate_functor(float delta_time) - : deltaTime(delta_time) {} + __host__ __device__ integrate_functor(float delta_time) + : deltaTime(delta_time) + { + } - template - __device__ void operator()(Tuple t) - { - volatile float4 posData = thrust::get<0>(t); - volatile float4 velData = thrust::get<1>(t); - float3 pos = make_float3(posData.x, posData.y, posData.z); - float3 vel = make_float3(velData.x, velData.y, velData.z); + template __device__ void operator()(Tuple t) + { + volatile float4 posData = thrust::get<0>(t); + volatile float4 velData = thrust::get<1>(t); + float3 pos = make_float3(posData.x, posData.y, posData.z); + float3 vel = make_float3(velData.x, velData.y, velData.z); - vel += cudaParams.gravity * deltaTime; - vel *= cudaParams.globalDamping; + vel += cudaParams.gravity * deltaTime; + vel *= cudaParams.globalDamping; - // new position = old position + velocity * deltaTime - pos += vel * deltaTime; + // new position = old position + velocity * deltaTime + pos += vel * deltaTime; // set this to zero to disable 
collisions with cube sides #if 1 - if (pos.x > 1.0f - cudaParams.particleRadius) - { - pos.x = 1.0f - cudaParams.particleRadius; - vel.x *= cudaParams.boundaryDamping; - } + if (pos.x > 1.0f - cudaParams.particleRadius) { + pos.x = 1.0f - cudaParams.particleRadius; + vel.x *= cudaParams.boundaryDamping; + } - if (pos.x < -1.0f + cudaParams.particleRadius) - { - pos.x = -1.0f + cudaParams.particleRadius; - vel.x *= cudaParams.boundaryDamping; - } + if (pos.x < -1.0f + cudaParams.particleRadius) { + pos.x = -1.0f + cudaParams.particleRadius; + vel.x *= cudaParams.boundaryDamping; + } - if (pos.y > 1.0f - cudaParams.particleRadius) - { - pos.y = 1.0f - cudaParams.particleRadius; - vel.y *= cudaParams.boundaryDamping; - } + if (pos.y > 1.0f - cudaParams.particleRadius) { + pos.y = 1.0f - cudaParams.particleRadius; + vel.y *= cudaParams.boundaryDamping; + } - if (pos.z > 1.0f - cudaParams.particleRadius) - { - pos.z = 1.0f - cudaParams.particleRadius; - vel.z *= cudaParams.boundaryDamping; - } + if (pos.z > 1.0f - cudaParams.particleRadius) { + pos.z = 1.0f - cudaParams.particleRadius; + vel.z *= cudaParams.boundaryDamping; + } - if (pos.z < -1.0f + cudaParams.particleRadius) - { - pos.z = -1.0f + cudaParams.particleRadius; - vel.z *= cudaParams.boundaryDamping; - } + if (pos.z < -1.0f + cudaParams.particleRadius) { + pos.z = -1.0f + cudaParams.particleRadius; + vel.z *= cudaParams.boundaryDamping; + } #endif - if (pos.y < -1.0f + cudaParams.particleRadius) - { - pos.y = -1.0f + cudaParams.particleRadius; - vel.y *= cudaParams.boundaryDamping; - } + if (pos.y < -1.0f + cudaParams.particleRadius) { + pos.y = -1.0f + cudaParams.particleRadius; + vel.y *= cudaParams.boundaryDamping; + } - // store new position and velocity - thrust::get<0>(t) = make_float4(pos, posData.w); - thrust::get<1>(t) = make_float4(vel, velData.w); - } + // store new position and velocity + thrust::get<0>(t) = make_float4(pos, posData.w); + thrust::get<1>(t) = make_float4(vel, velData.w); + } }; // calculate position in uniform grid __device__ int3 calcGridPos(float3 p) { - int3 gridPos; - gridPos.x = floorf((p.x - cudaParams.worldOrigin.x) / cudaParams.cellSize.x); - gridPos.y = floorf((p.y - cudaParams.worldOrigin.y) / cudaParams.cellSize.y); - gridPos.z = floorf((p.z - cudaParams.worldOrigin.z) / cudaParams.cellSize.z); - return gridPos; + int3 gridPos; + gridPos.x = floorf((p.x - cudaParams.worldOrigin.x) / cudaParams.cellSize.x); + gridPos.y = floorf((p.y - cudaParams.worldOrigin.y) / cudaParams.cellSize.y); + gridPos.z = floorf((p.z - cudaParams.worldOrigin.z) / cudaParams.cellSize.z); + return gridPos; } // calculate address in grid from position (clamping to edges) __device__ uint calcGridHash(int3 gridPos) { - gridPos.x = gridPos.x & - (cudaParams.gridSize.x - 1); // wrap grid, assumes size is power of 2 - gridPos.y = gridPos.y & (cudaParams.gridSize.y - 1); - gridPos.z = gridPos.z & (cudaParams.gridSize.z - 1); - return __umul24(__umul24(gridPos.z, cudaParams.gridSize.y), cudaParams.gridSize.x) + - __umul24(gridPos.y, cudaParams.gridSize.x) + gridPos.x; + gridPos.x = gridPos.x & (cudaParams.gridSize.x - 1); // wrap grid, assumes size is power of 2 + gridPos.y = gridPos.y & (cudaParams.gridSize.y - 1); + gridPos.z = gridPos.z & (cudaParams.gridSize.z - 1); + return __umul24(__umul24(gridPos.z, cudaParams.gridSize.y), cudaParams.gridSize.x) + + __umul24(gridPos.y, cudaParams.gridSize.x) + gridPos.x; } // calculate grid hash value for each particle -__global__ void calcHashD(uint *gridParticleHash, // output - uint 
*gridParticleIndex, // output - float4 *pos, // input: positions - uint numParticles) +__global__ void calcHashD(uint *gridParticleHash, // output + uint *gridParticleIndex, // output + float4 *pos, // input: positions + uint numParticles) { - uint index = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; + uint index = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - if (index >= numParticles) - return; + if (index >= numParticles) + return; - volatile float4 p = pos[index]; + volatile float4 p = pos[index]; - // get address in grid - int3 gridPos = calcGridPos(make_float3(p.x, p.y, p.z)); - uint hash = calcGridHash(gridPos); + // get address in grid + int3 gridPos = calcGridPos(make_float3(p.x, p.y, p.z)); + uint hash = calcGridHash(gridPos); - // store grid hash and particle index - gridParticleHash[index] = hash; - gridParticleIndex[index] = index; + // store grid hash and particle index + gridParticleHash[index] = hash; + gridParticleIndex[index] = index; } // rearrange particle data into sorted order, and find the start of each cell // in the sorted hash array -__global__ void reorderDataAndFindCellStartD( - uint *cellStart, // output: cell start index - uint *cellEnd, // output: cell end index - float4 *sortedPos, // output: sorted positions - float4 *sortedVel, // output: sorted velocities - uint *gridParticleHash, // input: sorted grid hashes - uint *gridParticleIndex, // input: sorted particle indices - float4 *oldPos, // input: sorted position array - float4 *oldVel, // input: sorted velocity array - uint numParticles) +__global__ void reorderDataAndFindCellStartD(uint *cellStart, // output: cell start index + uint *cellEnd, // output: cell end index + float4 *sortedPos, // output: sorted positions + float4 *sortedVel, // output: sorted velocities + uint *gridParticleHash, // input: sorted grid hashes + uint *gridParticleIndex, // input: sorted particle indices + float4 *oldPos, // input: sorted position array + float4 *oldVel, // input: sorted velocity array + uint numParticles) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - extern __shared__ uint sharedHash[]; // blockSize + 1 elements - uint index = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ uint sharedHash[]; // blockSize + 1 elements + uint index = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - uint hash; + uint hash; - // handle case when no. of particles not multiple of block size - if (index < numParticles) - { - hash = gridParticleHash[index]; + // handle case when no. of particles not multiple of block size + if (index < numParticles) { + hash = gridParticleHash[index]; - // Load hash data into shared memory so that we can look - // at neighboring particle's hash value without loading - // two hash values per thread - sharedHash[threadIdx.x + 1] = hash; + // Load hash data into shared memory so that we can look + // at neighboring particle's hash value without loading + // two hash values per thread + sharedHash[threadIdx.x + 1] = hash; - if (index > 0 && threadIdx.x == 0) - { - // first thread in block must load neighbor particle hash - sharedHash[0] = gridParticleHash[index - 1]; - } - } - - cg::sync(cta); - - if (index < numParticles) - { - // If this particle has a different cell index to the previous - // particle then it must be the first particle in the cell, - // so store the index of this particle in the cell. 
- // As it isn't the first particle, it must also be the cell end of - // the previous particle's cell - - if (index == 0 || hash != sharedHash[threadIdx.x]) - { - cellStart[hash] = index; - - if (index > 0) - cellEnd[sharedHash[threadIdx.x]] = index; + if (index > 0 && threadIdx.x == 0) { + // first thread in block must load neighbor particle hash + sharedHash[0] = gridParticleHash[index - 1]; + } } - if (index == numParticles - 1) - { - cellEnd[hash] = index + 1; + cg::sync(cta); + + if (index < numParticles) { + // If this particle has a different cell index to the previous + // particle then it must be the first particle in the cell, + // so store the index of this particle in the cell. + // As it isn't the first particle, it must also be the cell end of + // the previous particle's cell + + if (index == 0 || hash != sharedHash[threadIdx.x]) { + cellStart[hash] = index; + + if (index > 0) + cellEnd[sharedHash[threadIdx.x]] = index; + } + + if (index == numParticles - 1) { + cellEnd[hash] = index + 1; + } + + // Now use the sorted index to reorder the pos and vel data + uint sortedIndex = gridParticleIndex[index]; + float4 pos = oldPos[sortedIndex]; + float4 vel = oldVel[sortedIndex]; + + sortedPos[index] = pos; + sortedVel[index] = vel; } - - // Now use the sorted index to reorder the pos and vel data - uint sortedIndex = gridParticleIndex[index]; - float4 pos = oldPos[sortedIndex]; - float4 vel = oldVel[sortedIndex]; - - sortedPos[index] = pos; - sortedVel[index] = vel; - } } // collide two spheres using DEM method -__device__ float3 collideSpheres(float3 posA, float3 posB, float3 velA, - float3 velB, float radiusA, float radiusB, - float attraction) +__device__ float3 +collideSpheres(float3 posA, float3 posB, float3 velA, float3 velB, float radiusA, float radiusB, float attraction) { - // calculate relative position - float3 relPos = posB - posA; + // calculate relative position + float3 relPos = posB - posA; - float dist = length(relPos); - float collideDist = radiusA + radiusB; + float dist = length(relPos); + float collideDist = radiusA + radiusB; - float3 force = make_float3(0.0f); + float3 force = make_float3(0.0f); - if (dist < collideDist) - { - float3 norm = relPos / dist; + if (dist < collideDist) { + float3 norm = relPos / dist; - // relative velocity - float3 relVel = velB - velA; + // relative velocity + float3 relVel = velB - velA; - // relative tangential velocity - float3 tanVel = relVel - (dot(relVel, norm) * norm); + // relative tangential velocity + float3 tanVel = relVel - (dot(relVel, norm) * norm); - // spring force - force = -cudaParams.spring * (collideDist - dist) * norm; - // dashpot (damping) force - force += cudaParams.damping * relVel; - // tangential shear force - force += cudaParams.shear * tanVel; - // attraction - force += attraction * relPos; - } + // spring force + force = -cudaParams.spring * (collideDist - dist) * norm; + // dashpot (damping) force + force += cudaParams.damping * relVel; + // tangential shear force + force += cudaParams.shear * tanVel; + // attraction + force += attraction * relPos; + } - return force; + return force; } // collide a particle against all other particles in a given cell -__device__ float3 collideCell(int3 gridPos, uint index, float3 pos, float3 vel, - float4 *oldPos, float4 *oldVel, uint *cellStart, - uint *cellEnd) +__device__ float3 collideCell(int3 gridPos, + uint index, + float3 pos, + float3 vel, + float4 *oldPos, + float4 *oldVel, + uint *cellStart, + uint *cellEnd) { - uint gridHash = calcGridHash(gridPos); + 
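// collideCell() below relies on the invariant built by
// reorderDataAndFindCellStartD: after sort_by_key, the particles of each cell
// occupy one contiguous run of the sorted arrays, delimited by
// cellStart[hash] and cellEnd[hash], with 0xffffffff in cellStart marking an
// empty cell. A serial CPU sketch of the same boundary scan, for reference
// only (hypothetical, not part of the sample):

#include <cstdio>

typedef unsigned int uint;

int main()
{
    const uint kEmptyCell    = 0xffffffff;
    const uint sortedHash[8] = {0, 0, 2, 2, 2, 5, 5, 7}; // hashes after sorting
    uint       cellStart[8], cellEnd[8];

    for (int c = 0; c < 8; ++c) {
        cellStart[c] = kEmptyCell; // mark every cell empty first
    }

    for (uint i = 0; i < 8; ++i) {
        uint h = sortedHash[i];
        if (i == 0 || h != sortedHash[i - 1]) {
            cellStart[h] = i; // first particle of this cell
        }
        if (i == 7 || h != sortedHash[i + 1]) {
            cellEnd[h] = i + 1; // one past the last particle
        }
    }

    printf("cell 2 spans [%u, %u)\n", cellStart[2], cellEnd[2]); // [2, 5)
    return 0;
}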
uint gridHash = calcGridHash(gridPos); - // get start of bucket for this cell - uint startIndex = cellStart[gridHash]; + // get start of bucket for this cell + uint startIndex = cellStart[gridHash]; - float3 force = make_float3(0.0f); + float3 force = make_float3(0.0f); - if (startIndex != 0xffffffff) // cell is not empty - { - // iterate over particles in this cell - uint endIndex = cellEnd[gridHash]; - - for (uint j = startIndex; j < endIndex; j++) + if (startIndex != 0xffffffff) // cell is not empty { - if (j != index) // check not colliding with self - { - float3 pos2 = make_float3(oldPos[j]); - float3 vel2 = make_float3(oldVel[j]); + // iterate over particles in this cell + uint endIndex = cellEnd[gridHash]; - // collide two spheres - force += collideSpheres(pos, pos2, vel, vel2, cudaParams.particleRadius, - cudaParams.particleRadius, cudaParams.attraction); - } + for (uint j = startIndex; j < endIndex; j++) { + if (j != index) // check not colliding with self + { + float3 pos2 = make_float3(oldPos[j]); + float3 vel2 = make_float3(oldVel[j]); + + // collide two spheres + force += collideSpheres( + pos, pos2, vel, vel2, cudaParams.particleRadius, cudaParams.particleRadius, cudaParams.attraction); + } + } } - } - return force; + return force; } -__global__ void collideD( - float4 *newVel, // output: new velocity - float4 *oldPos, // input: sorted positions - float4 *oldVel, // input: sorted velocities - uint *gridParticleIndex, // input: sorted particle indices - uint *cellStart, uint *cellEnd, uint numParticles) +__global__ void collideD(float4 *newVel, // output: new velocity + float4 *oldPos, // input: sorted positions + float4 *oldVel, // input: sorted velocities + uint *gridParticleIndex, // input: sorted particle indices + uint *cellStart, + uint *cellEnd, + uint numParticles) { - uint index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + uint index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; - if (index >= numParticles) - return; + if (index >= numParticles) + return; - // read particle data from sorted arrays - float3 pos = make_float3(oldPos[index]); - float3 vel = make_float3(oldVel[index]); + // read particle data from sorted arrays + float3 pos = make_float3(oldPos[index]); + float3 vel = make_float3(oldVel[index]); - // get address in grid - int3 gridPos = calcGridPos(pos); + // get address in grid + int3 gridPos = calcGridPos(pos); - // examine neighbouring cells - float3 force = make_float3(0.0f); + // examine neighbouring cells + float3 force = make_float3(0.0f); - for (int z = -1; z <= 1; z++) - { - for (int y = -1; y <= 1; y++) - { - for (int x = -1; x <= 1; x++) - { - int3 neighbourPos = gridPos + make_int3(x, y, z); - force += collideCell(neighbourPos, index, pos, vel, oldPos, oldVel, - cellStart, cellEnd); - } + for (int z = -1; z <= 1; z++) { + for (int y = -1; y <= 1; y++) { + for (int x = -1; x <= 1; x++) { + int3 neighbourPos = gridPos + make_int3(x, y, z); + force += collideCell(neighbourPos, index, pos, vel, oldPos, oldVel, cellStart, cellEnd); + } + } } - } - // collide with cursor sphere - force += collideSpheres(pos, cudaParams.colliderPos, vel, - make_float3(0.0f, 0.0f, 0.0f), cudaParams.particleRadius, - cudaParams.colliderRadius, 0.0f); + // collide with cursor sphere + force += collideSpheres(pos, + cudaParams.colliderPos, + vel, + make_float3(0.0f, 0.0f, 0.0f), + cudaParams.particleRadius, + cudaParams.colliderRadius, + 0.0f); - // write new velocity back to original unsorted location - uint originalIndex = gridParticleIndex[index]; - 
newVel[originalIndex] = make_float4(vel + force, 0.0f); + // write new velocity back to original unsorted location + uint originalIndex = gridParticleIndex[index]; + newVel[originalIndex] = make_float4(vel + force, 0.0f); } #endif diff --git a/Samples/2_Concepts_and_Techniques/particles/render_particles.cpp b/Samples/2_Concepts_and_Techniques/particles/render_particles.cpp index 0351e499..f5282e07 100644 --- a/Samples/2_Concepts_and_Techniques/particles/render_particles.cpp +++ b/Samples/2_Concepts_and_Techniques/particles/render_particles.cpp @@ -25,8 +25,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include +#include #include // OpenGL Graphics includes @@ -41,128 +41,134 @@ #endif ParticleRenderer::ParticleRenderer() - : m_pos(0), - m_numParticles(0), - m_pointSize(1.0f), - m_particleRadius(0.125f * 0.5f), - m_program(0), - m_vbo(0), - m_colorVBO(0) { - _initGL(); + : m_pos(0) + , m_numParticles(0) + , m_pointSize(1.0f) + , m_particleRadius(0.125f * 0.5f) + , m_program(0) + , m_vbo(0) + , m_colorVBO(0) +{ + _initGL(); } ParticleRenderer::~ParticleRenderer() { m_pos = 0; } -void ParticleRenderer::setPositions(float *pos, int numParticles) { - m_pos = pos; - m_numParticles = numParticles; +void ParticleRenderer::setPositions(float *pos, int numParticles) +{ + m_pos = pos; + m_numParticles = numParticles; } -void ParticleRenderer::setVertexBuffer(unsigned int vbo, int numParticles) { - m_vbo = vbo; - m_numParticles = numParticles; +void ParticleRenderer::setVertexBuffer(unsigned int vbo, int numParticles) +{ + m_vbo = vbo; + m_numParticles = numParticles; } -void ParticleRenderer::_drawPoints() { - if (!m_vbo) { - glBegin(GL_POINTS); - { - int k = 0; +void ParticleRenderer::_drawPoints() +{ + if (!m_vbo) { + glBegin(GL_POINTS); + { + int k = 0; - for (int i = 0; i < m_numParticles; ++i) { - glVertex3fv(&m_pos[k]); - k += 4; - } + for (int i = 0; i < m_numParticles; ++i) { + glVertex3fv(&m_pos[k]); + k += 4; + } + } + glEnd(); } - glEnd(); - } else { - glBindBuffer(GL_ARRAY_BUFFER, m_vbo); - glVertexPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_VERTEX_ARRAY); + else { + glBindBuffer(GL_ARRAY_BUFFER, m_vbo); + glVertexPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_VERTEX_ARRAY); - if (m_colorVBO) { - glBindBuffer(GL_ARRAY_BUFFER, m_colorVBO); - glColorPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_COLOR_ARRAY); + if (m_colorVBO) { + glBindBuffer(GL_ARRAY_BUFFER, m_colorVBO); + glColorPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_COLOR_ARRAY); + } + + glDrawArrays(GL_POINTS, 0, m_numParticles); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDisableClientState(GL_VERTEX_ARRAY); + glDisableClientState(GL_COLOR_ARRAY); } - - glDrawArrays(GL_POINTS, 0, m_numParticles); - - glBindBuffer(GL_ARRAY_BUFFER, 0); - glDisableClientState(GL_VERTEX_ARRAY); - glDisableClientState(GL_COLOR_ARRAY); - } } -void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */) { - switch (mode) { +void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */) +{ + switch (mode) { case PARTICLE_POINTS: - glColor3f(1, 1, 1); - glPointSize(m_pointSize); - _drawPoints(); - break; + glColor3f(1, 1, 1); + glPointSize(m_pointSize); + _drawPoints(); + break; default: case PARTICLE_SPHERES: - glEnable(GL_POINT_SPRITE_ARB); - glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE); - glEnable(GL_VERTEX_PROGRAM_POINT_SIZE); - glDepthMask(GL_TRUE); - glEnable(GL_DEPTH_TEST); + glEnable(GL_POINT_SPRITE_ARB); + glTexEnvi(GL_POINT_SPRITE_ARB, 
GL_COORD_REPLACE_ARB, GL_TRUE); + glEnable(GL_VERTEX_PROGRAM_POINT_SIZE); + glDepthMask(GL_TRUE); + glEnable(GL_DEPTH_TEST); - glUseProgram(m_program); - glUniform1f(glGetUniformLocation(m_program, "pointScale"), - m_window_h / tanf(m_fov * 0.5f * (float)M_PI / 180.0f)); - glUniform1f(glGetUniformLocation(m_program, "pointRadius"), - m_particleRadius); + glUseProgram(m_program); + glUniform1f(glGetUniformLocation(m_program, "pointScale"), + m_window_h / tanf(m_fov * 0.5f * (float)M_PI / 180.0f)); + glUniform1f(glGetUniformLocation(m_program, "pointRadius"), m_particleRadius); - glColor3f(1, 1, 1); - _drawPoints(); + glColor3f(1, 1, 1); + _drawPoints(); - glUseProgram(0); - glDisable(GL_POINT_SPRITE_ARB); - break; - } + glUseProgram(0); + glDisable(GL_POINT_SPRITE_ARB); + break; + } } -GLuint ParticleRenderer::_compileProgram(const char *vsource, - const char *fsource) { - GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER); - GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); +GLuint ParticleRenderer::_compileProgram(const char *vsource, const char *fsource) +{ + GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER); + GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); - glShaderSource(vertexShader, 1, &vsource, 0); - glShaderSource(fragmentShader, 1, &fsource, 0); + glShaderSource(vertexShader, 1, &vsource, 0); + glShaderSource(fragmentShader, 1, &fsource, 0); - glCompileShader(vertexShader); - glCompileShader(fragmentShader); + glCompileShader(vertexShader); + glCompileShader(fragmentShader); - GLuint program = glCreateProgram(); + GLuint program = glCreateProgram(); - glAttachShader(program, vertexShader); - glAttachShader(program, fragmentShader); + glAttachShader(program, vertexShader); + glAttachShader(program, fragmentShader); - glLinkProgram(program); + glLinkProgram(program); - // check if program linked - GLint success = 0; - glGetProgramiv(program, GL_LINK_STATUS, &success); + // check if program linked + GLint success = 0; + glGetProgramiv(program, GL_LINK_STATUS, &success); - if (!success) { - char temp[256]; - glGetProgramInfoLog(program, 256, 0, temp); - printf("Failed to link program:\n%s\n", temp); - glDeleteProgram(program); - program = 0; - } + if (!success) { + char temp[256]; + glGetProgramInfoLog(program, 256, 0, temp); + printf("Failed to link program:\n%s\n", temp); + glDeleteProgram(program); + program = 0; + } - return program; + return program; } -void ParticleRenderer::_initGL() { - m_program = _compileProgram(vertexShader, spherePixelShader); +void ParticleRenderer::_initGL() +{ + m_program = _compileProgram(vertexShader, spherePixelShader); #if !defined(__APPLE__) && !defined(MACOSX) - glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, GL_FALSE); - glClampColorARB(GL_CLAMP_FRAGMENT_COLOR_ARB, GL_FALSE); + glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, GL_FALSE); + glClampColorARB(GL_CLAMP_FRAGMENT_COLOR_ARB, GL_FALSE); #endif } diff --git a/Samples/2_Concepts_and_Techniques/particles/render_particles.h b/Samples/2_Concepts_and_Techniques/particles/render_particles.h index 2eab40e5..e4ca1e55 100644 --- a/Samples/2_Concepts_and_Techniques/particles/render_particles.h +++ b/Samples/2_Concepts_and_Techniques/particles/render_particles.h @@ -28,46 +28,48 @@ #ifndef __RENDER_PARTICLES__ #define __RENDER_PARTICLES__ -class ParticleRenderer { - public: - ParticleRenderer(); - ~ParticleRenderer(); +class ParticleRenderer +{ +public: + ParticleRenderer(); + ~ParticleRenderer(); - void setPositions(float *pos, int numParticles); - void setVertexBuffer(unsigned 
int vbo, int numParticles); - void setColorBuffer(unsigned int vbo) { m_colorVBO = vbo; } + void setPositions(float *pos, int numParticles); + void setVertexBuffer(unsigned int vbo, int numParticles); + void setColorBuffer(unsigned int vbo) { m_colorVBO = vbo; } - enum DisplayMode { PARTICLE_POINTS, PARTICLE_SPHERES, PARTICLE_NUM_MODES }; + enum DisplayMode { PARTICLE_POINTS, PARTICLE_SPHERES, PARTICLE_NUM_MODES }; - void display(DisplayMode mode = PARTICLE_POINTS); - void displayGrid(); + void display(DisplayMode mode = PARTICLE_POINTS); + void displayGrid(); - void setPointSize(float size) { m_pointSize = size; } - void setParticleRadius(float r) { m_particleRadius = r; } - void setFOV(float fov) { m_fov = fov; } - void setWindowSize(int w, int h) { - m_window_w = w; - m_window_h = h; - } + void setPointSize(float size) { m_pointSize = size; } + void setParticleRadius(float r) { m_particleRadius = r; } + void setFOV(float fov) { m_fov = fov; } + void setWindowSize(int w, int h) + { + m_window_w = w; + m_window_h = h; + } - protected: // methods - void _initGL(); - void _drawPoints(); - GLuint _compileProgram(const char *vsource, const char *fsource); +protected: // methods + void _initGL(); + void _drawPoints(); + GLuint _compileProgram(const char *vsource, const char *fsource); - protected: // data - float *m_pos; - int m_numParticles; +protected: // data + float *m_pos; + int m_numParticles; - float m_pointSize; - float m_particleRadius; - float m_fov; - int m_window_w, m_window_h; + float m_pointSize; + float m_particleRadius; + float m_fov; + int m_window_w, m_window_h; - GLuint m_program; + GLuint m_program; - GLuint m_vbo; - GLuint m_colorVBO; + GLuint m_vbo; + GLuint m_colorVBO; }; -#endif //__ RENDER_PARTICLES__ +#endif //__ RENDER_PARTICLES__ diff --git a/Samples/2_Concepts_and_Techniques/particles/shaders.cpp b/Samples/2_Concepts_and_Techniques/particles/shaders.cpp index 057d0c30..4503fd62 100644 --- a/Samples/2_Concepts_and_Techniques/particles/shaders.cpp +++ b/Samples/2_Concepts_and_Techniques/particles/shaders.cpp @@ -28,41 +28,38 @@ #define STRINGIFY(A) #A // vertex shader -const char *vertexShader = STRINGIFY( - uniform float pointRadius; // point size in world space - uniform float pointScale; // scale to calculate size in pixels - uniform float densityScale; - uniform float densityOffset; - void main() - { - // calculate window-space point size - vec3 posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0)); - float dist = length(posEye); - gl_PointSize = pointRadius * (pointScale / dist); +const char *vertexShader = STRINGIFY(uniform float pointRadius; // point size in world space + uniform float pointScale; // scale to calculate size in pixels + uniform float densityScale; + uniform float densityOffset; + void main() { + // calculate window-space point size + vec3 posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0)); + float dist = length(posEye); + gl_PointSize = pointRadius * (pointScale / dist); - gl_TexCoord[0] = gl_MultiTexCoord0; - gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0); + gl_TexCoord[0] = gl_MultiTexCoord0; + gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0); - gl_FrontColor = gl_Color; - }); + gl_FrontColor = gl_Color; + }); // pixel shader for rendering points as shaded spheres -const char *spherePixelShader = STRINGIFY( - void main() - { +const char *spherePixelShader = STRINGIFY(void main() { const vec3 lightDir = vec3(0.577, 0.577, 0.577); // calculate normal from texture coordinates vec3 N; - 
N.xy = gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0); + N.xy = gl_TexCoord[0].xy * vec2(2.0, -2.0) + vec2(-1.0, 1.0); float mag = dot(N.xy, N.xy); - if (mag > 1.0) discard; // kill pixels outside circle + if (mag > 1.0) + discard; // kill pixels outside circle - N.z = sqrt(1.0-mag); + N.z = sqrt(1.0 - mag); // calculate lighting float diffuse = max(0.0, dot(lightDir, N)); gl_FragColor = gl_Color * diffuse; - }); +}); diff --git a/Samples/2_Concepts_and_Techniques/radixSortThrust/doc/readme.txt b/Samples/2_Concepts_and_Techniques/radixSortThrust/doc/readme.txt index f458fdf5..e17362f0 100644 --- a/Samples/2_Concepts_and_Techniques/radixSortThrust/doc/readme.txt +++ b/Samples/2_Concepts_and_Techniques/radixSortThrust/doc/readme.txt @@ -31,7 +31,7 @@ The RadixSort class can also be used within your application by building the rad CITATION -------- -Satish, N., Harris, M., and Garland, M. "Designing Efficient Sorting +Satish, N., Harris, M., and Garland, M. "Designing Efficient Sorting Algorithms for Manycore GPUs". In Proceedings of IEEE International Parallel & Distributed Processing Symposium 2009 (IPDPS 2009). diff --git a/Samples/2_Concepts_and_Techniques/radixSortThrust/radixSortThrust.cu b/Samples/2_Concepts_and_Techniques/radixSortThrust/radixSortThrust.cu index 50ce828e..0ea66be7 100644 --- a/Samples/2_Concepts_and_Techniques/radixSortThrust/radixSortThrust.cu +++ b/Samples/2_Concepts_and_Techniques/radixSortThrust/radixSortThrust.cu @@ -25,191 +25,193 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - #include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include -template -bool testSort(int argc, char **argv) { - int cmdVal; - int keybits = 32; +template bool testSort(int argc, char **argv) +{ + int cmdVal; + int keybits = 32; - unsigned int numElements = 1048576; - bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly"); - bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet"); + unsigned int numElements = 1048576; + bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly"); + bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet"); - if (checkCmdLineFlag(argc, (const char **)argv, "n")) { - cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n"); - numElements = cmdVal; + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + numElements = cmdVal; - if (cmdVal < 0) { - printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal); - exit(EXIT_SUCCESS); + if (cmdVal < 0) { + printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal); + exit(EXIT_SUCCESS); + } } - } - if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) { - cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits"); - keybits = cmdVal; + if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits"); + keybits = cmdVal; - if (keybits <= 0) { - printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits); - exit(EXIT_SUCCESS); + if (keybits <= 0) { + printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits); + exit(EXIT_SUCCESS); + } } - } - unsigned int numIterations = (numElements >= 16777216) ? 10 : 100; + unsigned int numIterations = (numElements >= 16777216) ? 
10 : 100; - if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) { - cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations"); - numIterations = cmdVal; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Command line:\nradixSortThrust [-option]\n"); - printf("Valid options:\n"); - printf("-n= : number of elements to sort\n"); - printf("-keybits=bits : keybits must be > 0\n"); - printf( - "-keysonly : only sort an array of keys (default sorts key-value " - "pairs)\n"); - printf( - "-float : use 32-bit float keys (default is 32-bit unsigned " - "int)\n"); - printf( - "-quiet : Output only the number of elements and the time to " - "sort\n"); - printf("-help : Output a help message\n"); - exit(EXIT_SUCCESS); - } - - if (!quiet) - printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits, - floatKeys ? "float" : "unsigned int", - keysOnly ? "(only)" : "and values"); - - int deviceID = -1; - - if (cudaSuccess == cudaGetDevice(&deviceID)) { - cudaDeviceProp devprop; - cudaGetDeviceProperties(&devprop, deviceID); - unsigned int totalMem = (keysOnly ? 2 : 4) * numElements * sizeof(T); - - if (devprop.totalGlobalMem < totalMem) { - printf("Error: insufficient amount of memory to sort %d elements.\n", - numElements); - printf("%d bytes needed, %d bytes available\n", (int)totalMem, - (int)devprop.totalGlobalMem); - exit(EXIT_SUCCESS); + if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) { + cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations"); + numIterations = cmdVal; } - } - thrust::host_vector h_keys(numElements); - thrust::host_vector h_keysSorted(numElements); - thrust::host_vector h_values; + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Command line:\nradixSortThrust [-option]\n"); + printf("Valid options:\n"); + printf("-n= : number of elements to sort\n"); + printf("-keybits=bits : keybits must be > 0\n"); + printf("-keysonly : only sort an array of keys (default sorts key-value " + "pairs)\n"); + printf("-float : use 32-bit float keys (default is 32-bit unsigned " + "int)\n"); + printf("-quiet : Output only the number of elements and the time to " + "sort\n"); + printf("-help : Output a help message\n"); + exit(EXIT_SUCCESS); + } - if (!keysOnly) h_values = thrust::host_vector(numElements); + if (!quiet) + printf("\nSorting %d %d-bit %s keys %s\n\n", + numElements, + keybits, + floatKeys ? "float" : "unsigned int", + keysOnly ? "(only)" : "and values"); - // Fill up with some random data - thrust::default_random_engine rng(clock()); + int deviceID = -1; - if (floatKeys) { - thrust::uniform_real_distribution u01(0, 1); + if (cudaSuccess == cudaGetDevice(&deviceID)) { + cudaDeviceProp devprop; + cudaGetDeviceProperties(&devprop, deviceID); + unsigned int totalMem = (keysOnly ? 
2 : 4) * numElements * sizeof(T); - for (int i = 0; i < (int)numElements; i++) h_keys[i] = u01(rng); - } else { - thrust::uniform_int_distribution u(0, UINT_MAX); + if (devprop.totalGlobalMem < totalMem) { + printf("Error: insufficient amount of memory to sort %d elements.\n", numElements); + printf("%d bytes needed, %d bytes available\n", (int)totalMem, (int)devprop.totalGlobalMem); + exit(EXIT_SUCCESS); + } + } - for (int i = 0; i < (int)numElements; i++) h_keys[i] = u(rng); - } + thrust::host_vector h_keys(numElements); + thrust::host_vector h_keysSorted(numElements); + thrust::host_vector h_values; - if (!keysOnly) thrust::sequence(h_values.begin(), h_values.end()); + if (!keysOnly) + h_values = thrust::host_vector(numElements); - // Copy data onto the GPU - thrust::device_vector d_keys; - thrust::device_vector d_values; + // Fill up with some random data + thrust::default_random_engine rng(clock()); - // run multiple iterations to compute an average sort time - cudaEvent_t start_event, stop_event; - checkCudaErrors(cudaEventCreate(&start_event)); - checkCudaErrors(cudaEventCreate(&stop_event)); + if (floatKeys) { + thrust::uniform_real_distribution u01(0, 1); - float totalTime = 0; + for (int i = 0; i < (int)numElements; i++) + h_keys[i] = u01(rng); + } + else { + thrust::uniform_int_distribution u(0, UINT_MAX); - for (unsigned int i = 0; i < numIterations; i++) { - // reset data before sort - d_keys = h_keys; + for (int i = 0; i < (int)numElements; i++) + h_keys[i] = u(rng); + } - if (!keysOnly) d_values = h_values; + if (!keysOnly) + thrust::sequence(h_values.begin(), h_values.end()); - checkCudaErrors(cudaEventRecord(start_event, 0)); + // Copy data onto the GPU + thrust::device_vector d_keys; + thrust::device_vector d_values; - if (keysOnly) - thrust::sort(d_keys.begin(), d_keys.end()); + // run multiple iterations to compute an average sort time + cudaEvent_t start_event, stop_event; + checkCudaErrors(cudaEventCreate(&start_event)); + checkCudaErrors(cudaEventCreate(&stop_event)); + + float totalTime = 0; + + for (unsigned int i = 0; i < numIterations; i++) { + // reset data before sort + d_keys = h_keys; + + if (!keysOnly) + d_values = h_values; + + checkCudaErrors(cudaEventRecord(start_event, 0)); + + if (keysOnly) + thrust::sort(d_keys.begin(), d_keys.end()); + else + thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin()); + + checkCudaErrors(cudaEventRecord(stop_event, 0)); + checkCudaErrors(cudaEventSynchronize(stop_event)); + + float time = 0; + checkCudaErrors(cudaEventElapsedTime(&time, start_event, stop_event)); + totalTime += time; + } + + totalTime /= (1.0e3f * numIterations); + printf("radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = " + "%u elements\n", + 1.0e-6f * numElements / totalTime, + totalTime, + numElements); + + getLastCudaError("after radixsort"); + + // Get results back to host for correctness checking + thrust::copy(d_keys.begin(), d_keys.end(), h_keysSorted.begin()); + + if (!keysOnly) + thrust::copy(d_values.begin(), d_values.end(), h_values.begin()); + + getLastCudaError("copying results to host memory"); + + // Check results + bool bTestResult = thrust::is_sorted(h_keysSorted.begin(), h_keysSorted.end()); + + checkCudaErrors(cudaEventDestroy(start_event)); + checkCudaErrors(cudaEventDestroy(stop_event)); + + if (!bTestResult && !quiet) { + return false; + } + + return bTestResult; +} + +int main(int argc, char **argv) +{ + // Start logs + printf("%s Starting...\n\n", argv[0]); + + findCudaDevice(argc, (const char 
**)argv); + + bool bTestResult = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "float")) + bTestResult = testSort(argc, argv); else - thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin()); + bTestResult = testSort(argc, argv); - checkCudaErrors(cudaEventRecord(stop_event, 0)); - checkCudaErrors(cudaEventSynchronize(stop_event)); - - float time = 0; - checkCudaErrors(cudaEventElapsedTime(&time, start_event, stop_event)); - totalTime += time; - } - - totalTime /= (1.0e3f * numIterations); - printf( - "radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = " - "%u elements\n", - 1.0e-6f * numElements / totalTime, totalTime, numElements); - - getLastCudaError("after radixsort"); - - // Get results back to host for correctness checking - thrust::copy(d_keys.begin(), d_keys.end(), h_keysSorted.begin()); - - if (!keysOnly) - thrust::copy(d_values.begin(), d_values.end(), h_values.begin()); - - getLastCudaError("copying results to host memory"); - - // Check results - bool bTestResult = - thrust::is_sorted(h_keysSorted.begin(), h_keysSorted.end()); - - checkCudaErrors(cudaEventDestroy(start_event)); - checkCudaErrors(cudaEventDestroy(stop_event)); - - if (!bTestResult && !quiet) { - return false; - } - - return bTestResult; -} - -int main(int argc, char **argv) { - // Start logs - printf("%s Starting...\n\n", argv[0]); - - findCudaDevice(argc, (const char **)argv); - - bool bTestResult = false; - - if (checkCmdLineFlag(argc, (const char **)argv, "float")) - bTestResult = testSort(argc, argv); - else - bTestResult = testSort(argc, argv); - - printf(bTestResult ? "Test passed\n" : "Test failed!\n"); + printf(bTestResult ? "Test passed\n" : "Test failed!\n"); } diff --git a/Samples/2_Concepts_and_Techniques/reduction/reduction.cpp b/Samples/2_Concepts_and_Techniques/reduction/reduction.cpp index 25f8f51f..e0087af9 100644 --- a/Samples/2_Concepts_and_Techniques/reduction/reduction.cpp +++ b/Samples/2_Concepts_and_Techniques/reduction/reduction.cpp @@ -65,9 +65,9 @@ #include // Utilities and system includes +#include #include #include -#include // includes, project #include "reduction.h" @@ -76,8 +76,7 @@ enum ReduceType { REDUCE_INT, REDUCE_FLOAT, REDUCE_DOUBLE }; //////////////////////////////////////////////////////////////////////////////// // declaration, forward -template -bool runTest(int argc, char **argv, ReduceType datatype); +template bool runTest(int argc, char **argv, ReduceType datatype); #define MAX_BLOCK_DIM_SIZE 65535 @@ -87,71 +86,74 @@ bool runTest(int argc, char **argv, ReduceType datatype); extern "C" bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); } -const char *getReduceTypeString(const ReduceType type) { - switch (type) { +const char *getReduceTypeString(const ReduceType type) +{ + switch (type) { case REDUCE_INT: - return "int"; + return "int"; case REDUCE_FLOAT: - return "float"; + return "float"; case REDUCE_DOUBLE: - return "double"; + return "double"; default: - return "unknown"; - } + return "unknown"; + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + printf("%s Starting...\n\n", argv[0]); - char *typeInput = 0; - getCmdLineArgumentString(argc, (const char **)argv, "type", &typeInput); + char *typeInput = 0; + getCmdLineArgumentString(argc, (const char **)argv, "type", 
&typeInput); - ReduceType datatype = REDUCE_INT; + ReduceType datatype = REDUCE_INT; - if (0 != typeInput) { - if (!strcasecmp(typeInput, "float")) { - datatype = REDUCE_FLOAT; - } else if (!strcasecmp(typeInput, "double")) { - datatype = REDUCE_DOUBLE; - } else if (strcasecmp(typeInput, "int")) { - printf("Type %s is not recognized. Using default type int.\n\n", - typeInput); + if (0 != typeInput) { + if (!strcasecmp(typeInput, "float")) { + datatype = REDUCE_FLOAT; + } + else if (!strcasecmp(typeInput, "double")) { + datatype = REDUCE_DOUBLE; + } + else if (strcasecmp(typeInput, "int")) { + printf("Type %s is not recognized. Using default type int.\n\n", typeInput); + } } - } - cudaDeviceProp deviceProp; - int dev; + cudaDeviceProp deviceProp; + int dev; - dev = findCudaDevice(argc, (const char **)argv); + dev = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - printf("Using Device %d: %s\n\n", dev, deviceProp.name); - checkCudaErrors(cudaSetDevice(dev)); + printf("Using Device %d: %s\n\n", dev, deviceProp.name); + checkCudaErrors(cudaSetDevice(dev)); - printf("Reducing array of type %s\n\n", getReduceTypeString(datatype)); + printf("Reducing array of type %s\n\n", getReduceTypeString(datatype)); - bool bResult = false; + bool bResult = false; - switch (datatype) { + switch (datatype) { default: case REDUCE_INT: - bResult = runTest(argc, argv, datatype); - break; + bResult = runTest(argc, argv, datatype); + break; case REDUCE_FLOAT: - bResult = runTest(argc, argv, datatype); - break; + bResult = runTest(argc, argv, datatype); + break; case REDUCE_DOUBLE: - bResult = runTest(argc, argv, datatype); - break; - } + bResult = runTest(argc, argv, datatype); + break; + } - printf(bResult ? "Test passed\n" : "Test failed!\n"); + printf(bResult ? "Test passed\n" : "Test failed!\n"); } //////////////////////////////////////////////////////////////////////////////// @@ -162,29 +164,30 @@ int main(int argc, char **argv) { //! @param data pointer to input data //! @param size number of input data elements //////////////////////////////////////////////////////////////////////////////// -template -T reduceCPU(T *data, int size) { - T sum = data[0]; - T c = (T)0.0; +template T reduceCPU(T *data, int size) +{ + T sum = data[0]; + T c = (T)0.0; - for (int i = 1; i < size; i++) { - T y = data[i] - c; - T t = sum + y; - c = (t - sum) - y; - sum = t; - } + for (int i = 1; i < size; i++) { + T y = data[i] - c; + T t = sum + y; + c = (t - sum) - y; + sum = t; + } - return sum; + return sum; } -unsigned int nextPow2(unsigned int x) { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; +unsigned int nextPow2(unsigned int x) +{ + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; } #ifndef MIN @@ -198,40 +201,42 @@ unsigned int nextPow2(unsigned int x) { // n. For kernel 6, we observe the maximum specified number of blocks, because // each thread in that kernel can process a variable number of elements. 
//////////////////////////////////////////////////////////////////////////////// -void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, - int maxThreads, int &blocks, int &threads) { - // get device capability, to avoid block/grid size exceed the upper bound - cudaDeviceProp prop; - int device; - checkCudaErrors(cudaGetDevice(&device)); - checkCudaErrors(cudaGetDeviceProperties(&prop, device)); +void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads) +{ + // get device capability, to avoid block/grid size exceed the upper bound + cudaDeviceProp prop; + int device; + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); - if (whichKernel < 3) { - threads = (n < maxThreads) ? nextPow2(n) : maxThreads; - blocks = (n + threads - 1) / threads; - } else { - threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads; - blocks = (n + (threads * 2 - 1)) / (threads * 2); - } + if (whichKernel < 3) { + threads = (n < maxThreads) ? nextPow2(n) : maxThreads; + blocks = (n + threads - 1) / threads; + } + else { + threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads; + blocks = (n + (threads * 2 - 1)) / (threads * 2); + } - if ((float)threads * blocks > - (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock) { - printf("n is too large, please choose a smaller number!\n"); - } + if ((float)threads * blocks > (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock) { + printf("n is too large, please choose a smaller number!\n"); + } - if (blocks > prop.maxGridSize[0]) { - printf( - "Grid size <%d> exceeds the device capability <%d>, set block size as " - "%d (original %d)\n", - blocks, prop.maxGridSize[0], threads * 2, threads); + if (blocks > prop.maxGridSize[0]) { + printf("Grid size <%d> exceeds the device capability <%d>, set block size as " + "%d (original %d)\n", + blocks, + prop.maxGridSize[0], + threads * 2, + threads); - blocks /= 2; - threads *= 2; - } + blocks /= 2; + threads *= 2; + } - if (whichKernel >= 6) { - blocks = MIN(maxBlocks, blocks); - } + if (whichKernel >= 6) { + blocks = MIN(maxBlocks, blocks); + } } //////////////////////////////////////////////////////////////////////////////// @@ -239,85 +244,90 @@ void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, // measures the average reduction time. 
//////////////////////////////////////////////////////////////////////////////// template -T benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads, - int maxBlocks, int whichKernel, int testIterations, - bool cpuFinalReduction, int cpuFinalThreshold, - StopWatchInterface *timer, T *h_odata, T *d_idata, - T *d_odata) { - T gpu_result = 0; - bool needReadBack = true; +T benchmarkReduce(int n, + int numThreads, + int numBlocks, + int maxThreads, + int maxBlocks, + int whichKernel, + int testIterations, + bool cpuFinalReduction, + int cpuFinalThreshold, + StopWatchInterface *timer, + T *h_odata, + T *d_idata, + T *d_odata) +{ + T gpu_result = 0; + bool needReadBack = true; - T *d_intermediateSums; - checkCudaErrors( - cudaMalloc((void **)&d_intermediateSums, sizeof(T) * numBlocks)); + T *d_intermediateSums; + checkCudaErrors(cudaMalloc((void **)&d_intermediateSums, sizeof(T) * numBlocks)); - for (int i = 0; i < testIterations; ++i) { - gpu_result = 0; + for (int i = 0; i < testIterations; ++i) { + gpu_result = 0; - cudaDeviceSynchronize(); - sdkStartTimer(&timer); + cudaDeviceSynchronize(); + sdkStartTimer(&timer); - // execute the kernel - reduce(n, numThreads, numBlocks, whichKernel, d_idata, d_odata); + // execute the kernel + reduce(n, numThreads, numBlocks, whichKernel, d_idata, d_odata); - // check if kernel execution generated an error - getLastCudaError("Kernel execution failed"); + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); - if (cpuFinalReduction) { - // sum partial sums from each block on CPU - // copy result from device to host - checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(T), - cudaMemcpyDeviceToHost)); + if (cpuFinalReduction) { + // sum partial sums from each block on CPU + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(T), cudaMemcpyDeviceToHost)); - for (int i = 0; i < numBlocks; i++) { - gpu_result += h_odata[i]; - } + for (int i = 0; i < numBlocks; i++) { + gpu_result += h_odata[i]; + } - needReadBack = false; - } else { - // sum partial block sums on GPU - int s = numBlocks; - int kernel = whichKernel; - - while (s > cpuFinalThreshold) { - int threads = 0, blocks = 0; - getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, - threads); - checkCudaErrors(cudaMemcpy(d_intermediateSums, d_odata, s * sizeof(T), - cudaMemcpyDeviceToDevice)); - reduce(s, threads, blocks, kernel, d_intermediateSums, d_odata); - - if (kernel < 3) { - s = (s + threads - 1) / threads; - } else { - s = (s + (threads * 2 - 1)) / (threads * 2); + needReadBack = false; } - } + else { + // sum partial block sums on GPU + int s = numBlocks; + int kernel = whichKernel; - if (s > 1) { - // copy result from device to host - checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), - cudaMemcpyDeviceToHost)); + while (s > cpuFinalThreshold) { + int threads = 0, blocks = 0; + getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); + checkCudaErrors(cudaMemcpy(d_intermediateSums, d_odata, s * sizeof(T), cudaMemcpyDeviceToDevice)); + reduce(s, threads, blocks, kernel, d_intermediateSums, d_odata); - for (int i = 0; i < s; i++) { - gpu_result += h_odata[i]; + if (kernel < 3) { + s = (s + threads - 1) / threads; + } + else { + s = (s + (threads * 2 - 1)) / (threads * 2); + } + } + + if (s > 1) { + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost)); + + for (int i = 0; i < s; 
i++) { + gpu_result += h_odata[i]; + } + + needReadBack = false; + } } - needReadBack = false; - } + cudaDeviceSynchronize(); + sdkStopTimer(&timer); } - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - } - - if (needReadBack) { - // copy final sum from device to host - checkCudaErrors( - cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost)); - } - checkCudaErrors(cudaFree(d_intermediateSums)); - return gpu_result; + if (needReadBack) { + // copy final sum from device to host + checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost)); + } + checkCudaErrors(cudaFree(d_intermediateSums)); + return gpu_result; } //////////////////////////////////////////////////////////////////////////////// @@ -326,222 +336,91 @@ T benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads, // for generating a "shmoo" plot showing the performance for each kernel // variation over a wide range of input sizes. //////////////////////////////////////////////////////////////////////////////// -template -void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, - ReduceType datatype) { - // create random input data on CPU - unsigned int bytes = maxN * sizeof(T); - - T *h_idata = (T *)malloc(bytes); - - for (int i = 0; i < maxN; i++) { - // Keep the numbers small so we don't get truncation error in the sum - if (datatype == REDUCE_INT) { - h_idata[i] = (T)(rand() & 0xFF); - } else { - h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; - } - } - - int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE); - - // allocate mem for the result on host side - T *h_odata = (T *)malloc(maxNumBlocks * sizeof(T)); - - // allocate device memory and data - T *d_idata = NULL; - T *d_odata = NULL; - - checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); - checkCudaErrors(cudaMalloc((void **)&d_odata, maxNumBlocks * sizeof(T))); - - // copy data directly to device memory - checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(T), - cudaMemcpyHostToDevice)); - - // warm-up - for (int kernel = 0; kernel < 8; kernel++) { - reduce(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); - } - - int testIterations = 100; - - StopWatchInterface *timer = 0; - sdkCreateTimer(&timer); - - // print headers - printf( - "Time in milliseconds for various numbers of elements for each " - "kernel\n\n\n"); - printf("Kernel"); - - for (int i = minN; i <= maxN; i *= 2) { - printf(", %d", i); - } - - for (int kernel = 0; kernel < 8; kernel++) { - printf("\n%d", kernel); - - for (int i = minN; i <= maxN; i *= 2) { - sdkResetTimer(&timer); - int numBlocks = 0; - int numThreads = 0; - getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, - numThreads); - - float reduceTime; - - if (numBlocks <= MAX_BLOCK_DIM_SIZE) { - benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, - testIterations, false, 1, timer, h_odata, d_idata, - d_odata); - reduceTime = sdkGetAverageTimerValue(&timer); - } else { - reduceTime = -1.0; - } - - printf(", %.5f", reduceTime); - } - } - - // cleanup - sdkDeleteTimer(&timer); - free(h_idata); - free(h_odata); - - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); -} - -//////////////////////////////////////////////////////////////////////////////// -// The main function which runs the reduction test. 
-//////////////////////////////////////////////////////////////////////////////// -template -bool runTest(int argc, char **argv, ReduceType datatype) { - int size = 1 << 24; // number of elements to reduce - int maxThreads = 256; // number of threads per block - int whichKernel = 7; - int maxBlocks = 64; - bool cpuFinalReduction = false; - int cpuFinalThreshold = 1; - - if (checkCmdLineFlag(argc, (const char **)argv, "n")) { - size = getCmdLineArgumentInt(argc, (const char **)argv, "n"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { - maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { - whichKernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) { - maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks"); - } - - printf("%d elements\n", size); - printf("%d threads (max)\n", maxThreads); - - cpuFinalReduction = checkCmdLineFlag(argc, (const char **)argv, "cpufinal"); - - if (checkCmdLineFlag(argc, (const char **)argv, "cputhresh")) { - cpuFinalThreshold = - getCmdLineArgumentInt(argc, (const char **)argv, "cputhresh"); - } - - bool runShmoo = checkCmdLineFlag(argc, (const char **)argv, "shmoo"); - - if (runShmoo) { - shmoo(1, 33554432, maxThreads, maxBlocks, datatype); - } else { +template void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) +{ // create random input data on CPU - unsigned int bytes = size * sizeof(T); + unsigned int bytes = maxN * sizeof(T); T *h_idata = (T *)malloc(bytes); - for (int i = 0; i < size; i++) { - // Keep the numbers small so we don't get truncation error in the sum - if (datatype == REDUCE_INT) { - h_idata[i] = (T)(rand() & 0xFF); - } else { - h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; - } + for (int i = 0; i < maxN; i++) { + // Keep the numbers small so we don't get truncation error in the sum + if (datatype == REDUCE_INT) { + h_idata[i] = (T)(rand() & 0xFF); + } + else { + h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; + } } - int numBlocks = 0; - int numThreads = 0; - getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, - numThreads); - - if (numBlocks == 1) { - cpuFinalThreshold = 1; - } + int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side - T *h_odata = (T *)malloc(numBlocks * sizeof(T)); - - printf("%d blocks\n\n", numBlocks); + T *h_odata = (T *)malloc(maxNumBlocks * sizeof(T)); // allocate device memory and data T *d_idata = NULL; T *d_odata = NULL; checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); - checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(T))); + checkCudaErrors(cudaMalloc((void **)&d_odata, maxNumBlocks * sizeof(T))); // copy data directly to device memory - checkCudaErrors( - cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(T), cudaMemcpyHostToDevice)); // warm-up - reduce(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); + for (int kernel = 0; kernel < 8; kernel++) { + reduce(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); + } int testIterations = 100; StopWatchInterface *timer = 0; sdkCreateTimer(&timer); - T gpu_result = 
0; + // print headers + printf("Time in milliseconds for various numbers of elements for each " + "kernel\n\n\n"); + printf("Kernel"); - gpu_result = - benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks, - whichKernel, testIterations, cpuFinalReduction, - cpuFinalThreshold, timer, h_odata, d_idata, d_odata); + for (int i = minN; i <= maxN; i *= 2) { + printf(", %d", i); + } - double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3; - printf( - "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, " - "NumDevsUsed = %d, Workgroup = %u\n", - 1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads); + for (int kernel = 0; kernel < 8; kernel++) { + printf("\n%d", kernel); - // compute reference solution - T cpu_result = reduceCPU(h_idata, size); + for (int i = minN; i <= maxN; i *= 2) { + sdkResetTimer(&timer); + int numBlocks = 0; + int numThreads = 0; + getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); - int precision = 0; - double threshold = 0; - double diff = 0; + float reduceTime; - if (datatype == REDUCE_INT) { - printf("\nGPU result = %d\n", (int)gpu_result); - printf("CPU result = %d\n\n", (int)cpu_result); - } else { - if (datatype == REDUCE_FLOAT) { - precision = 8; - threshold = 1e-8 * size; - } else { - precision = 12; - threshold = 1e-12 * size; - } + if (numBlocks <= MAX_BLOCK_DIM_SIZE) { + benchmarkReduce(i, + numThreads, + numBlocks, + maxThreads, + maxBlocks, + kernel, + testIterations, + false, + 1, + timer, + h_odata, + d_idata, + d_odata); + reduceTime = sdkGetAverageTimerValue(&timer); + } + else { + reduceTime = -1.0; + } - printf("\nGPU result = %.*f\n", precision, (double)gpu_result); - printf("CPU result = %.*f\n\n", precision, (double)cpu_result); - - diff = fabs((double)gpu_result - (double)cpu_result); + printf(", %.5f", reduceTime); + } } // cleanup @@ -551,13 +430,165 @@ bool runTest(int argc, char **argv, ReduceType datatype) { checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_odata)); - - if (datatype == REDUCE_INT) { - return (gpu_result == cpu_result); - } else { - return (diff < threshold); - } - } - - return true; +} + +//////////////////////////////////////////////////////////////////////////////// +// The main function which runs the reduction test. 
+//////////////////////////////////////////////////////////////////////////////// +template bool runTest(int argc, char **argv, ReduceType datatype) +{ + int size = 1 << 24; // number of elements to reduce + int maxThreads = 256; // number of threads per block + int whichKernel = 7; + int maxBlocks = 64; + bool cpuFinalReduction = false; + int cpuFinalThreshold = 1; + + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + size = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { + maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + whichKernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) { + maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks"); + } + + printf("%d elements\n", size); + printf("%d threads (max)\n", maxThreads); + + cpuFinalReduction = checkCmdLineFlag(argc, (const char **)argv, "cpufinal"); + + if (checkCmdLineFlag(argc, (const char **)argv, "cputhresh")) { + cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **)argv, "cputhresh"); + } + + bool runShmoo = checkCmdLineFlag(argc, (const char **)argv, "shmoo"); + + if (runShmoo) { + shmoo(1, 33554432, maxThreads, maxBlocks, datatype); + } + else { + // create random input data on CPU + unsigned int bytes = size * sizeof(T); + + T *h_idata = (T *)malloc(bytes); + + for (int i = 0; i < size; i++) { + // Keep the numbers small so we don't get truncation error in the sum + if (datatype == REDUCE_INT) { + h_idata[i] = (T)(rand() & 0xFF); + } + else { + h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; + } + } + + int numBlocks = 0; + int numThreads = 0; + getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); + + if (numBlocks == 1) { + cpuFinalThreshold = 1; + } + + // allocate mem for the result on host side + T *h_odata = (T *)malloc(numBlocks * sizeof(T)); + + printf("%d blocks\n\n", numBlocks); + + // allocate device memory and data + T *d_idata = NULL; + T *d_odata = NULL; + + checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); + checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(T))); + + // copy data directly to device memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), cudaMemcpyHostToDevice)); + + // warm-up + reduce(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); + + int testIterations = 100; + + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); + + T gpu_result = 0; + + gpu_result = benchmarkReduce(size, + numThreads, + numBlocks, + maxThreads, + maxBlocks, + whichKernel, + testIterations, + cpuFinalReduction, + cpuFinalThreshold, + timer, + h_odata, + d_idata, + d_odata); + + double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3; + printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, " + "NumDevsUsed = %d, Workgroup = %u\n", + 1.0e-9 * ((double)bytes) / reduceTime, + reduceTime, + size, + 1, + numThreads); + + // compute reference solution + T cpu_result = reduceCPU(h_idata, size); + + int precision = 0; + double threshold = 0; + double diff = 0; + + if (datatype == REDUCE_INT) { + printf("\nGPU result = %d\n", (int)gpu_result); + printf("CPU result = %d\n\n", (int)cpu_result); + } + else { + if (datatype == REDUCE_FLOAT) { + precision = 8; 
+ threshold = 1e-8 * size; + } + else { + precision = 12; + threshold = 1e-12 * size; + } + + printf("\nGPU result = %.*f\n", precision, (double)gpu_result); + printf("CPU result = %.*f\n\n", precision, (double)cpu_result); + + diff = fabs((double)gpu_result - (double)cpu_result); + } + + // cleanup + sdkDeleteTimer(&timer); + free(h_idata); + free(h_odata); + + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); + + if (datatype == REDUCE_INT) { + return (gpu_result == cpu_result); + } + else { + return (diff < threshold); + } + } + + return true; } diff --git a/Samples/2_Concepts_and_Techniques/reduction/reduction.h b/Samples/2_Concepts_and_Techniques/reduction/reduction.h index a4e94aac..439509b5 100644 --- a/Samples/2_Concepts_and_Techniques/reduction/reduction.h +++ b/Samples/2_Concepts_and_Techniques/reduction/reduction.h @@ -29,8 +29,6 @@ #ifndef __REDUCTION_H__ #define __REDUCTION_H__ -template -void reduce(int size, int threads, int blocks, - int whichKernel, T *d_idata, T *d_odata); +template void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, T *d_odata); #endif diff --git a/Samples/2_Concepts_and_Techniques/reduction/reduction_kernel.cu b/Samples/2_Concepts_and_Techniques/reduction/reduction_kernel.cu index 5be1676e..3fb28261 100644 --- a/Samples/2_Concepts_and_Techniques/reduction/reduction_kernel.cu +++ b/Samples/2_Concepts_and_Techniques/reduction/reduction_kernel.cu @@ -40,50 +40,53 @@ namespace cg = cooperative_groups; // Utility class used to avoid linker errors with extern // unsized shared memory arrays with templated type -template -struct SharedMemory { - __device__ inline operator T *() { - extern __shared__ int __smem[]; - return (T *)__smem; - } +template struct SharedMemory +{ + __device__ inline operator T *() + { + extern __shared__ int __smem[]; + return (T *)__smem; + } - __device__ inline operator const T *() const { - extern __shared__ int __smem[]; - return (T *)__smem; - } + __device__ inline operator const T *() const + { + extern __shared__ int __smem[]; + return (T *)__smem; + } }; // specialize for double to avoid unaligned memory // access compile errors -template <> -struct SharedMemory { - __device__ inline operator double *() { - extern __shared__ double __smem_d[]; - return (double *)__smem_d; - } +template <> struct SharedMemory +{ + __device__ inline operator double *() + { + extern __shared__ double __smem_d[]; + return (double *)__smem_d; + } - __device__ inline operator const double *() const { - extern __shared__ double __smem_d[]; - return (double *)__smem_d; - } + __device__ inline operator const double *() const + { + extern __shared__ double __smem_d[]; + return (double *)__smem_d; + } }; -template -__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) { - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(mask, mySum, offset); - } - return mySum; +template __device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) +{ + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(mask, mySum, offset); + } + return mySum; } #if __CUDA_ARCH__ >= 800 // Specialize warpReduceFunc for int inputs to use __reduce_add_sync intrinsic // when on SM 8.0 or higher -template <> -__device__ __forceinline__ int warpReduceSum(unsigned int mask, - int mySum) { - mySum = __reduce_add_sync(mask, mySum); - return mySum; +template <> __device__ __forceinline__ int warpReduceSum(unsigned int mask, int mySum) +{ + mySum = 
__reduce_add_sync(mask, mySum); + return mySum; } #endif @@ -98,129 +101,134 @@ __device__ __forceinline__ int warpReduceSum(unsigned int mask, operator. This operator is very expensive on GPUs, and the interleaved inactivity means that no whole warps are active, which is also very inefficient */ -template -__global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // load shared mem - unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - sdata[tid] = (i < n) ? g_idata[i] : 0; - - cg::sync(cta); - - // do reduction in shared mem - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - // modulo arithmetic is slow! - if ((tid % (2 * s)) == 0) { - sdata[tid] += sdata[tid + s]; - } + sdata[tid] = (i < n) ? g_idata[i] : 0; cg::sync(cta); - } - // write result for this block to global mem - if (tid == 0) g_odata[blockIdx.x] = sdata[0]; + // do reduction in shared mem + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + // modulo arithmetic is slow! + if ((tid % (2 * s)) == 0) { + sdata[tid] += sdata[tid + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) + g_odata[blockIdx.x] = sdata[0]; } /* This version uses contiguous threads, but its interleaved addressing results in many shared memory bank conflicts. */ -template -__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // load shared mem - unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - sdata[tid] = (i < n) ? g_idata[i] : 0; - - cg::sync(cta); - - // do reduction in shared mem - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - int index = 2 * s * tid; - - if (index < blockDim.x) { - sdata[index] += sdata[index + s]; - } + sdata[tid] = (i < n) ? g_idata[i] : 0; cg::sync(cta); - } - // write result for this block to global mem - if (tid == 0) g_odata[blockIdx.x] = sdata[0]; + // do reduction in shared mem + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + int index = 2 * s * tid; + + if (index < blockDim.x) { + sdata[index] += sdata[index + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) + g_odata[blockIdx.x] = sdata[0]; } /* This version uses sequential addressing -- no divergence or bank conflicts. 
*/ -template -__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // load shared mem - unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - sdata[tid] = (i < n) ? g_idata[i] : 0; - - cg::sync(cta); - - // do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - sdata[tid] += sdata[tid + s]; - } + sdata[tid] = (i < n) ? g_idata[i] : 0; cg::sync(cta); - } - // write result for this block to global mem - if (tid == 0) g_odata[blockIdx.x] = sdata[0]; + // do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) + g_odata[blockIdx.x] = sdata[0]; } /* This version uses n/2 threads -- it performs the first level of reduction when reading from global memory. */ -template -__global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // perform first level of reduction, - // reading from global memory, writing to shared memory - unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - T mySum = (i < n) ? g_idata[i] : 0; + T mySum = (i < n) ? g_idata[i] : 0; - if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x]; + if (i + blockDim.x < n) + mySum += g_idata[i + blockDim.x]; - sdata[tid] = mySum; - cg::sync(cta); + sdata[tid] = mySum; + cg::sync(cta); - // do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - sdata[tid] = mySum = mySum + sdata[tid + s]; + // do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] = mySum = mySum + sdata[tid + s]; + } + + cg::sync(cta); } - cg::sync(cta); - } - - // write result for this block to global mem - if (tid == 0) g_odata[blockIdx.x] = mySum; + // write result for this block to global mem + if (tid == 0) + g_odata[blockIdx.x] = mySum; } /* @@ -237,46 +245,49 @@ __global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) { In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. If blockSize > 32, allocate blockSize*sizeof(T) bytes. 
*/ -template -__global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // perform first level of reduction, - // reading from global memory, writing to shared memory - unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; - T mySum = (i < n) ? g_idata[i] : 0; + T mySum = (i < n) ? g_idata[i] : 0; - if (i + blockSize < n) mySum += g_idata[i + blockSize]; - - sdata[tid] = mySum; - cg::sync(cta); - - // do reduction in shared mem - for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) { - if (tid < s) { - sdata[tid] = mySum = mySum + sdata[tid + s]; - } + if (i + blockSize < n) + mySum += g_idata[i + blockSize]; + sdata[tid] = mySum; cg::sync(cta); - } - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + // do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) { + if (tid < s) { + sdata[tid] = mySum = mySum + sdata[tid + s]; + } - if (cta.thread_rank() < 32) { - // Fetch final intermediate sum from 2nd warp - if (blockSize >= 64) mySum += sdata[tid + 32]; - // Reduce final warp using shuffle - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - mySum += tile32.shfl_down(mySum, offset); + cg::sync(cta); } - } - // write result for this block to global mem - if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockSize >= 64) + mySum += sdata[tid + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + mySum += tile32.shfl_down(mySum, offset); + } + } + + // write result for this block to global mem + if (cta.thread_rank() == 0) + g_odata[blockIdx.x] = mySum; } /* @@ -291,56 +302,59 @@ __global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) { In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. If blockSize > 32, allocate blockSize*sizeof(T) bytes. */ -template -__global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // perform first level of reduction, - // reading from global memory, writing to shared memory - unsigned int tid = threadIdx.x; - unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x; + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x; - T mySum = (i < n) ? g_idata[i] : 0; + T mySum = (i < n) ? 
g_idata[i] : 0; - if (i + blockSize < n) mySum += g_idata[i + blockSize]; + if (i + blockSize < n) + mySum += g_idata[i + blockSize]; - sdata[tid] = mySum; - cg::sync(cta); + sdata[tid] = mySum; + cg::sync(cta); - // do reduction in shared mem - if ((blockSize >= 512) && (tid < 256)) { - sdata[tid] = mySum = mySum + sdata[tid + 256]; - } - - cg::sync(cta); - - if ((blockSize >= 256) && (tid < 128)) { - sdata[tid] = mySum = mySum + sdata[tid + 128]; - } - - cg::sync(cta); - - if ((blockSize >= 128) && (tid < 64)) { - sdata[tid] = mySum = mySum + sdata[tid + 64]; - } - - cg::sync(cta); - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - if (cta.thread_rank() < 32) { - // Fetch final intermediate sum from 2nd warp - if (blockSize >= 64) mySum += sdata[tid + 32]; - // Reduce final warp using shuffle - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - mySum += tile32.shfl_down(mySum, offset); + // do reduction in shared mem + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = mySum = mySum + sdata[tid + 256]; } - } - // write result for this block to global mem - if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = mySum = mySum + sdata[tid + 128]; + } + + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = mySum = mySum + sdata[tid + 64]; + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockSize >= 64) + mySum += sdata[tid + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + mySum += tile32.shfl_down(mySum, offset); + } + } + + // write result for this block to global mem + if (cta.thread_rank() == 0) + g_odata[blockIdx.x] = mySum; } /* @@ -352,257 +366,263 @@ __global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) { In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. If blockSize > 32, allocate blockSize*sizeof(T) bytes. */ -template -__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - T *sdata = SharedMemory(); +template __global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); - // perform first level of reduction, - // reading from global memory, writing to shared memory - unsigned int tid = threadIdx.x; - unsigned int gridSize = blockSize * gridDim.x; + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int gridSize = blockSize * gridDim.x; - T mySum = 0; + T mySum = 0; - // we reduce multiple elements per thread. The number is determined by the - // number of active thread blocks (via gridDim). More blocks will result - // in a larger gridSize and therefore fewer elements per thread - if (nIsPow2) { - unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; - gridSize = gridSize << 1; + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). 
More blocks will result + // in a larger gridSize and therefore fewer elements per thread + if (nIsPow2) { + unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; + gridSize = gridSize << 1; - while (i < n) { - mySum += g_idata[i]; - // ensure we don't read out of bounds -- this is optimized away for - // powerOf2 sized arrays - if ((i + blockSize) < n) { - mySum += g_idata[i + blockSize]; - } - i += gridSize; + while (i < n) { + mySum += g_idata[i]; + // ensure we don't read out of bounds -- this is optimized away for + // powerOf2 sized arrays + if ((i + blockSize) < n) { + mySum += g_idata[i + blockSize]; + } + i += gridSize; + } } - } else { - unsigned int i = blockIdx.x * blockSize + threadIdx.x; - while (i < n) { - mySum += g_idata[i]; - i += gridSize; + else { + unsigned int i = blockIdx.x * blockSize + threadIdx.x; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } } - } - // each thread puts its local sum into shared memory - sdata[tid] = mySum; - cg::sync(cta); + // each thread puts its local sum into shared memory + sdata[tid] = mySum; + cg::sync(cta); - // do reduction in shared mem - if ((blockSize >= 512) && (tid < 256)) { - sdata[tid] = mySum = mySum + sdata[tid + 256]; - } - - cg::sync(cta); - - if ((blockSize >= 256) && (tid < 128)) { - sdata[tid] = mySum = mySum + sdata[tid + 128]; - } - - cg::sync(cta); - - if ((blockSize >= 128) && (tid < 64)) { - sdata[tid] = mySum = mySum + sdata[tid + 64]; - } - - cg::sync(cta); - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - if (cta.thread_rank() < 32) { - // Fetch final intermediate sum from 2nd warp - if (blockSize >= 64) mySum += sdata[tid + 32]; - // Reduce final warp using shuffle - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - mySum += tile32.shfl_down(mySum, offset); + // do reduction in shared mem + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = mySum = mySum + sdata[tid + 256]; } - } - // write result for this block to global mem - if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = mySum = mySum + sdata[tid + 128]; + } + + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = mySum = mySum + sdata[tid + 64]; + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockSize >= 64) + mySum += sdata[tid + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + mySum += tile32.shfl_down(mySum, offset); + } + } + + // write result for this block to global mem + if (cta.thread_rank() == 0) + g_odata[blockIdx.x] = mySum; } template -__global__ void reduce7(const T *__restrict__ g_idata, T *__restrict__ g_odata, - unsigned int n) { - T *sdata = SharedMemory(); +__global__ void reduce7(const T *__restrict__ g_idata, T *__restrict__ g_odata, unsigned int n) +{ + T *sdata = SharedMemory(); - // perform first level of reduction, - // reading from global memory, writing to shared memory - unsigned int tid = threadIdx.x; - unsigned int gridSize = blockSize * gridDim.x; - unsigned int maskLength = (blockSize & 31); // 31 = warpSize-1 - maskLength = (maskLength > 0) ? 
(32 - maskLength) : maskLength; - const unsigned int mask = (0xffffffff) >> maskLength; + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int gridSize = blockSize * gridDim.x; + unsigned int maskLength = (blockSize & 31); // 31 = warpSize-1 + maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength; + const unsigned int mask = (0xffffffff) >> maskLength; - T mySum = 0; + T mySum = 0; - // we reduce multiple elements per thread. The number is determined by the - // number of active thread blocks (via gridDim). More blocks will result - // in a larger gridSize and therefore fewer elements per thread - if (nIsPow2) { - unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; - gridSize = gridSize << 1; + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + if (nIsPow2) { + unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; + gridSize = gridSize << 1; - while (i < n) { - mySum += g_idata[i]; - // ensure we don't read out of bounds -- this is optimized away for - // powerOf2 sized arrays - if ((i + blockSize) < n) { - mySum += g_idata[i + blockSize]; - } - i += gridSize; + while (i < n) { + mySum += g_idata[i]; + // ensure we don't read out of bounds -- this is optimized away for + // powerOf2 sized arrays + if ((i + blockSize) < n) { + mySum += g_idata[i + blockSize]; + } + i += gridSize; + } } - } else { - unsigned int i = blockIdx.x * blockSize + threadIdx.x; - while (i < n) { - mySum += g_idata[i]; - i += gridSize; + else { + unsigned int i = blockIdx.x * blockSize + threadIdx.x; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } } - } - // Reduce within warp using shuffle or reduce_add if T==int & CUDA_ARCH == - // SM 8.0 - mySum = warpReduceSum(mask, mySum); - - // each thread puts its local sum into shared memory - if ((tid % warpSize) == 0) { - sdata[tid / warpSize] = mySum; - } - - __syncthreads(); - - const unsigned int shmem_extent = - (blockSize / warpSize) > 0 ? (blockSize / warpSize) : 1; - const unsigned int ballot_result = __ballot_sync(mask, tid < shmem_extent); - if (tid < shmem_extent) { - mySum = sdata[tid]; - // Reduce final warp using shuffle or reduce_add if T==int & CUDA_ARCH == + // Reduce within warp using shuffle or reduce_add if T==int & CUDA_ARCH == // SM 8.0 - mySum = warpReduceSum(ballot_result, mySum); - } + mySum = warpReduceSum(mask, mySum); - // write result for this block to global mem - if (tid == 0) { - g_odata[blockIdx.x] = mySum; - } + // each thread puts its local sum into shared memory + if ((tid % warpSize) == 0) { + sdata[tid / warpSize] = mySum; + } + + __syncthreads(); + + const unsigned int shmem_extent = (blockSize / warpSize) > 0 ? 
(blockSize / warpSize) : 1; + const unsigned int ballot_result = __ballot_sync(mask, tid < shmem_extent); + if (tid < shmem_extent) { + mySum = sdata[tid]; + // Reduce final warp using shuffle or reduce_add if T==int & CUDA_ARCH == + // SM 8.0 + mySum = warpReduceSum(ballot_result, mySum); + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x] = mySum; + } } // Performs a reduction step and updates numTotal with how many are remaining -template -__device__ T cg_reduce_n(T in, Group &threads) { - return cg::reduce(threads, in, cg::plus()); +template __device__ T cg_reduce_n(T in, Group &threads) +{ + return cg::reduce(threads, in, cg::plus()); } -template -__global__ void cg_reduce(T *g_idata, T *g_odata, unsigned int n) { - // Shared memory for intermediate steps - T *sdata = SharedMemory(); - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Handle to tile in thread block - cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta); +template __global__ void cg_reduce(T *g_idata, T *g_odata, unsigned int n) +{ + // Shared memory for intermediate steps + T *sdata = SharedMemory(); + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Handle to tile in thread block + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta); - unsigned int ctaSize = cta.size(); - unsigned int numCtas = gridDim.x; - unsigned int threadRank = cta.thread_rank(); - unsigned int threadIndex = (blockIdx.x * ctaSize) + threadRank; + unsigned int ctaSize = cta.size(); + unsigned int numCtas = gridDim.x; + unsigned int threadRank = cta.thread_rank(); + unsigned int threadIndex = (blockIdx.x * ctaSize) + threadRank; - T threadVal = 0; - { - unsigned int i = threadIndex; - unsigned int indexStride = (numCtas * ctaSize); - while (i < n) { - threadVal += g_idata[i]; - i += indexStride; - } - sdata[threadRank] = threadVal; - } - - // Wait for all tiles to finish and reduce within CTA - { - unsigned int ctaSteps = tile.meta_group_size(); - unsigned int ctaIndex = ctaSize >> 1; - while (ctaIndex >= 32) { - cta.sync(); - if (threadRank < ctaIndex) { - threadVal += sdata[threadRank + ctaIndex]; + T threadVal = 0; + { + unsigned int i = threadIndex; + unsigned int indexStride = (numCtas * ctaSize); + while (i < n) { + threadVal += g_idata[i]; + i += indexStride; + } sdata[threadRank] = threadVal; - } - ctaSteps >>= 1; - ctaIndex >>= 1; } - } - // Shuffle redux instead of smem redux - { - cta.sync(); - if (tile.meta_group_rank() == 0) { - threadVal = cg_reduce_n(threadVal, tile); + // Wait for all tiles to finish and reduce within CTA + { + unsigned int ctaSteps = tile.meta_group_size(); + unsigned int ctaIndex = ctaSize >> 1; + while (ctaIndex >= 32) { + cta.sync(); + if (threadRank < ctaIndex) { + threadVal += sdata[threadRank + ctaIndex]; + sdata[threadRank] = threadVal; + } + ctaSteps >>= 1; + ctaIndex >>= 1; + } } - } - if (threadRank == 0) g_odata[blockIdx.x] = threadVal; + // Shuffle redux instead of smem redux + { + cta.sync(); + if (tile.meta_group_rank() == 0) { + threadVal = cg_reduce_n(threadVal, tile); + } + } + + if (threadRank == 0) + g_odata[blockIdx.x] = threadVal; } template -__global__ void multi_warp_cg_reduce(T *g_idata, T *g_odata, unsigned int n) { - // Shared memory for intermediate steps - T *sdata = SharedMemory(); - __shared__ cg::block_tile_memory scratch; +__global__ void multi_warp_cg_reduce(T *g_idata, T *g_odata, unsigned int n) +{ + // Shared memory for intermediate steps + T 
*sdata = SharedMemory(); + __shared__ cg::block_tile_memory scratch; - // Handle to thread block group - auto cta = cg::this_thread_block(scratch); - // Handle to multiWarpTile in thread block - auto multiWarpTile = cg::tiled_partition(cta); + // Handle to thread block group + auto cta = cg::this_thread_block(scratch); + // Handle to multiWarpTile in thread block + auto multiWarpTile = cg::tiled_partition(cta); - unsigned int gridSize = BlockSize * gridDim.x; - T threadVal = 0; + unsigned int gridSize = BlockSize * gridDim.x; + T threadVal = 0; - // we reduce multiple elements per thread. The number is determined by the - // number of active thread blocks (via gridDim). More blocks will result - // in a larger gridSize and therefore fewer elements per thread - int nIsPow2 = !(n & n-1); - if (nIsPow2) { - unsigned int i = blockIdx.x * BlockSize * 2 + threadIdx.x; - gridSize = gridSize << 1; + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + int nIsPow2 = !(n & n - 1); + if (nIsPow2) { + unsigned int i = blockIdx.x * BlockSize * 2 + threadIdx.x; + gridSize = gridSize << 1; - while (i < n) { - threadVal += g_idata[i]; - // ensure we don't read out of bounds -- this is optimized away for - // powerOf2 sized arrays - if ((i + BlockSize) < n) { - threadVal += g_idata[i + blockDim.x]; - } - i += gridSize; + while (i < n) { + threadVal += g_idata[i]; + // ensure we don't read out of bounds -- this is optimized away for + // powerOf2 sized arrays + if ((i + BlockSize) < n) { + threadVal += g_idata[i + blockDim.x]; + } + i += gridSize; + } } - } else { - unsigned int i = blockIdx.x * BlockSize + threadIdx.x; - while (i < n) { - threadVal += g_idata[i]; - i += gridSize; + else { + unsigned int i = blockIdx.x * BlockSize + threadIdx.x; + while (i < n) { + threadVal += g_idata[i]; + i += gridSize; + } } - } - threadVal = cg_reduce_n(threadVal, multiWarpTile); + threadVal = cg_reduce_n(threadVal, multiWarpTile); - if (multiWarpTile.thread_rank() == 0) { - sdata[multiWarpTile.meta_group_rank()] = threadVal; - } - cg::sync(cta); - - if (threadIdx.x == 0) { - threadVal = 0; - for (int i=0; i < multiWarpTile.meta_group_size(); i++) { - threadVal += sdata[i]; + if (multiWarpTile.thread_rank() == 0) { + sdata[multiWarpTile.meta_group_rank()] = threadVal; + } + cg::sync(cta); + + if (threadIdx.x == 0) { + threadVal = 0; + for (int i = 0; i < multiWarpTile.meta_group_size(); i++) { + threadVal += sdata[i]; + } + g_odata[blockIdx.x] = threadVal; } - g_odata[blockIdx.x] = threadVal; - } } extern "C" bool isPow2(unsigned int x); @@ -610,428 +630,362 @@ extern "C" bool isPow2(unsigned int x); //////////////////////////////////////////////////////////////////////////////// // Wrapper function for kernel launch //////////////////////////////////////////////////////////////////////////////// -template -void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, - T *d_odata) { - dim3 dimBlock(threads, 1, 1); - dim3 dimGrid(blocks, 1, 1); +template void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, T *d_odata) +{ + dim3 dimBlock(threads, 1, 1); + dim3 dimGrid(blocks, 1, 1); - // when there is only one warp per block, we need to allocate two warps - // worth of shared memory so that we don't index shared memory out of bounds - int smemSize = - (threads <= 32) ? 
2 * threads * sizeof(T) : threads * sizeof(T); + // when there is only one warp per block, we need to allocate two warps + // worth of shared memory so that we don't index shared memory out of bounds + int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); - // as kernel 9 - multi_warp_cg_reduce cannot work for more than 64 threads - // we choose to set kernel 7 for this purpose. - if (threads < 64 && whichKernel == 9) - { - whichKernel = 7; - } + // as kernel 9 - multi_warp_cg_reduce cannot work for more than 64 threads + // we choose to set kernel 7 for this purpose. + if (threads < 64 && whichKernel == 9) { + whichKernel = 7; + } - // choose which of the optimized versions of reduction to launch - switch (whichKernel) { + // choose which of the optimized versions of reduction to launch + switch (whichKernel) { case 0: - reduce0<<>>(d_idata, d_odata, size); - break; + reduce0<<>>(d_idata, d_odata, size); + break; case 1: - reduce1<<>>(d_idata, d_odata, size); - break; + reduce1<<>>(d_idata, d_odata, size); + break; case 2: - reduce2<<>>(d_idata, d_odata, size); - break; + reduce2<<>>(d_idata, d_odata, size); + break; case 3: - reduce3<<>>(d_idata, d_odata, size); - break; + reduce3<<>>(d_idata, d_odata, size); + break; case 4: - switch (threads) { + switch (threads) { case 512: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 256: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 128: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 64: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 32: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 16: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 8: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 4: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 2: - reduce4 - <<>>(d_idata, d_odata, size); - break; + reduce4<<>>(d_idata, d_odata, size); + break; case 1: - reduce4 - <<>>(d_idata, d_odata, size); - break; - } + reduce4<<>>(d_idata, d_odata, size); + break; + } - break; + break; case 5: - switch (threads) { + switch (threads) { case 512: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 256: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 128: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 64: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 32: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 16: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 8: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 4: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 2: - reduce5 - <<>>(d_idata, d_odata, size); - break; + reduce5<<>>(d_idata, d_odata, size); + break; case 1: - reduce5 - <<>>(d_idata, d_odata, size); - break; - } + reduce5<<>>(d_idata, d_odata, size); + break; 
+ } - break; + break; case 6: - if (isPow2(size)) { - switch (threads) { - case 512: - reduce6 - <<>>(d_idata, d_odata, size); - break; + if (isPow2(size)) { + switch (threads) { + case 512: + reduce6<<>>(d_idata, d_odata, size); + break; - case 256: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 256: + reduce6<<>>(d_idata, d_odata, size); + break; - case 128: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 128: + reduce6<<>>(d_idata, d_odata, size); + break; - case 64: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 64: + reduce6<<>>(d_idata, d_odata, size); + break; - case 32: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 32: + reduce6<<>>(d_idata, d_odata, size); + break; - case 16: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 16: + reduce6<<>>(d_idata, d_odata, size); + break; - case 8: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 8: + reduce6<<>>(d_idata, d_odata, size); + break; - case 4: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 4: + reduce6<<>>(d_idata, d_odata, size); + break; - case 2: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 2: + reduce6<<>>(d_idata, d_odata, size); + break; - case 1: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 1: + reduce6<<>>(d_idata, d_odata, size); + break; + } } - } else { - switch (threads) { - case 512: - reduce6 - <<>>(d_idata, d_odata, size); - break; + else { + switch (threads) { + case 512: + reduce6<<>>(d_idata, d_odata, size); + break; - case 256: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 256: + reduce6<<>>(d_idata, d_odata, size); + break; - case 128: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 128: + reduce6<<>>(d_idata, d_odata, size); + break; - case 64: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 64: + reduce6<<>>(d_idata, d_odata, size); + break; - case 32: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 32: + reduce6<<>>(d_idata, d_odata, size); + break; - case 16: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 16: + reduce6<<>>(d_idata, d_odata, size); + break; - case 8: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 8: + reduce6<<>>(d_idata, d_odata, size); + break; - case 4: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 4: + reduce6<<>>(d_idata, d_odata, size); + break; - case 2: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 2: + reduce6<<>>(d_idata, d_odata, size); + break; - case 1: - reduce6 - <<>>(d_idata, d_odata, size); - break; + case 1: + reduce6<<>>(d_idata, d_odata, size); + break; + } } - } - break; + break; case 7: - // For reduce7 kernel we require only blockSize/warpSize - // number of elements in shared memory - smemSize = ((threads / 32) + 1) * sizeof(T); - if (isPow2(size)) { - switch (threads) { - case 1024: - reduce7 - <<>>(d_idata, d_odata, size); - break; - case 512: - reduce7 - <<>>(d_idata, d_odata, size); - break; + // For reduce7 kernel we require only blockSize/warpSize + // number of elements in shared memory + smemSize = ((threads / 32) + 1) * sizeof(T); + if (isPow2(size)) { + switch (threads) { + case 1024: + reduce7<<>>(d_idata, d_odata, size); + break; + case 512: + reduce7<<>>(d_idata, d_odata, size); + break; - case 256: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 256: + reduce7<<>>(d_idata, d_odata, size); + break; - case 128: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 128: + reduce7<<>>(d_idata, d_odata, size); + break; - case 64: - reduce7 - <<>>(d_idata, 
d_odata, size); - break; + case 64: + reduce7<<>>(d_idata, d_odata, size); + break; - case 32: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 32: + reduce7<<>>(d_idata, d_odata, size); + break; - case 16: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 16: + reduce7<<>>(d_idata, d_odata, size); + break; - case 8: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 8: + reduce7<<>>(d_idata, d_odata, size); + break; - case 4: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 4: + reduce7<<>>(d_idata, d_odata, size); + break; - case 2: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 2: + reduce7<<>>(d_idata, d_odata, size); + break; - case 1: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 1: + reduce7<<>>(d_idata, d_odata, size); + break; + } } - } else { - switch (threads) { - case 1024: - reduce7 - <<>>(d_idata, d_odata, size); - break; - case 512: - reduce7 - <<>>(d_idata, d_odata, size); - break; + else { + switch (threads) { + case 1024: + reduce7<<>>(d_idata, d_odata, size); + break; + case 512: + reduce7<<>>(d_idata, d_odata, size); + break; - case 256: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 256: + reduce7<<>>(d_idata, d_odata, size); + break; - case 128: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 128: + reduce7<<>>(d_idata, d_odata, size); + break; - case 64: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 64: + reduce7<<>>(d_idata, d_odata, size); + break; - case 32: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 32: + reduce7<<>>(d_idata, d_odata, size); + break; - case 16: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 16: + reduce7<<>>(d_idata, d_odata, size); + break; - case 8: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 8: + reduce7<<>>(d_idata, d_odata, size); + break; - case 4: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 4: + reduce7<<>>(d_idata, d_odata, size); + break; - case 2: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 2: + reduce7<<>>(d_idata, d_odata, size); + break; - case 1: - reduce7 - <<>>(d_idata, d_odata, size); - break; + case 1: + reduce7<<>>(d_idata, d_odata, size); + break; + } } - } - break; + break; case 8: - cg_reduce<<>>(d_idata, d_odata, size); - break; + cg_reduce<<>>(d_idata, d_odata, size); + break; case 9: - constexpr int numOfMultiWarpGroups = 2; - smemSize = numOfMultiWarpGroups * sizeof(T); - switch (threads) { + constexpr int numOfMultiWarpGroups = 2; + smemSize = numOfMultiWarpGroups * sizeof(T); + switch (threads) { case 1024: - multi_warp_cg_reduce - <<>>(d_idata, d_odata, size); - break; + multi_warp_cg_reduce + <<>>(d_idata, d_odata, size); + break; case 512: - multi_warp_cg_reduce - <<>>(d_idata, d_odata, size); - break; + multi_warp_cg_reduce + <<>>(d_idata, d_odata, size); + break; case 256: - multi_warp_cg_reduce - <<>>(d_idata, d_odata, size); - break; + multi_warp_cg_reduce + <<>>(d_idata, d_odata, size); + break; case 128: - multi_warp_cg_reduce - <<>>(d_idata, d_odata, size); - break; + multi_warp_cg_reduce + <<>>(d_idata, d_odata, size); + break; case 64: - multi_warp_cg_reduce - <<>>(d_idata, d_odata, size); - break; + multi_warp_cg_reduce + <<>>(d_idata, d_odata, size); + break; default: - printf("thread block size of < 64 is not supported for this kernel\n"); - break; - } - break; - } + printf("thread block size of < 64 is not supported for this kernel\n"); + break; + } + break; + } } // Instantiate the reduction function for 3 types -template void reduce(int size, int 
threads, int blocks, int whichKernel, - int *d_idata, int *d_odata); +template void reduce(int size, int threads, int blocks, int whichKernel, int *d_idata, int *d_odata); -template void reduce(int size, int threads, int blocks, int whichKernel, - float *d_idata, float *d_odata); +template void reduce(int size, int threads, int blocks, int whichKernel, float *d_idata, float *d_odata); -template void reduce(int size, int threads, int blocks, int whichKernel, - double *d_idata, double *d_odata); +template void reduce(int size, int threads, int blocks, int whichKernel, double *d_idata, double *d_odata); -#endif // #ifndef _REDUCE_KERNEL_H_ +#endif // #ifndef _REDUCE_KERNEL_H_ diff --git a/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/reductionMultiBlockCG.cu b/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/reductionMultiBlockCG.cu index 52babd8c..87c93ec2 100644 --- a/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/reductionMultiBlockCG.cu +++ b/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/reductionMultiBlockCG.cu @@ -54,22 +54,21 @@ */ // includes, system -#include -#include -#include #include +#include +#include +#include // includes, project -#include -#include - #include +#include +#include const char *sSDKsample = "reductionMultiBlockCG"; -#include #include #include +#include namespace cg = cooperative_groups; @@ -87,22 +86,23 @@ namespace cg = cooperative_groups; See the CUDA SDK "reduction" sample for more information. */ -__device__ void reduceBlock(double *sdata, const cg::thread_block &cta) { - const unsigned int tid = cta.thread_rank(); - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); +__device__ void reduceBlock(double *sdata, const cg::thread_block &cta) +{ + const unsigned int tid = cta.thread_rank(); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); - cg::sync(cta); + sdata[tid] = cg::reduce(tile32, sdata[tid], cg::plus()); + cg::sync(cta); - double beta = 0.0; - if (cta.thread_rank() == 0) { - beta = 0; - for (int i = 0; i < blockDim.x; i += tile32.size()) { - beta += sdata[i]; + double beta = 0.0; + if (cta.thread_rank() == 0) { + beta = 0; + for (int i = 0; i < blockDim.x; i += tile32.size()) { + beta += sdata[i]; + } + sdata[0] = beta; } - sdata[0] = beta; - } - cg::sync(cta); + cg::sync(cta); } // This reduction kernel reduces an arbitrary size array in a single kernel @@ -110,56 +110,56 @@ __device__ void reduceBlock(double *sdata, const cg::thread_block &cta) { // // For more details on the reduction algorithm (notably the multi-pass // approach), see the "reduction" sample in the CUDA SDK. 
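// Because this kernel synchronizes across the whole grid (cg::sync(grid)),
// every block must be resident on the GPU at the same time, so it cannot be
// launched with the <<<...>>> syntax. A condensed sketch of the required
// launch pattern follows; the helper name is illustrative, and the real code
// lives in call_reduceSinglePassMultiBlockCG() and runTest() below:

void launchSinglePass(const cudaDeviceProp &prop, int numBlocks, int threads,
                      float *d_idata, float *d_odata, unsigned int n)
{
    // Cap the grid at what can actually be co-resident; grid.sync() requires it.
    int numBlocksPerSm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSm, reduceSinglePassMultiBlockCG, threads, threads * sizeof(double));
    numBlocks = min(numBlocks, numBlocksPerSm * prop.multiProcessorCount);

    // Cooperative kernels take their arguments as an array of pointers.
    void *kernelArgs[] = {(void *)&d_idata, (void *)&d_odata, (void *)&n};
    cudaLaunchCooperativeKernel((void *)reduceSinglePassMultiBlockCG,
                                dim3(numBlocks), dim3(threads), kernelArgs,
                                threads * sizeof(double), NULL);
}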
-extern "C" __global__ void reduceSinglePassMultiBlockCG(const float *g_idata, - float *g_odata, - unsigned int n) { - // Handle to thread block group - cg::thread_block block = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid(); +extern "C" __global__ void reduceSinglePassMultiBlockCG(const float *g_idata, float *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block block = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); - extern double __shared__ sdata[]; + extern double __shared__ sdata[]; - // Stride over grid and add the values to a shared memory buffer - sdata[block.thread_rank()] = 0; + // Stride over grid and add the values to a shared memory buffer + sdata[block.thread_rank()] = 0; - for (int i = grid.thread_rank(); i < n; i += grid.size()) { - sdata[block.thread_rank()] += g_idata[i]; - } - - cg::sync(block); - - // Reduce each block (called once per block) - reduceBlock(sdata, block); - // Write out the result to global memory - if (block.thread_rank() == 0) { - g_odata[blockIdx.x] = sdata[0]; - } - cg::sync(grid); - - if (grid.thread_rank() == 0) { - for (int block = 1; block < gridDim.x; block++) { - g_odata[0] += g_odata[block]; + for (int i = grid.thread_rank(); i < n; i += grid.size()) { + sdata[block.thread_rank()] += g_idata[i]; + } + + cg::sync(block); + + // Reduce each block (called once per block) + reduceBlock(sdata, block); + // Write out the result to global memory + if (block.thread_rank() == 0) { + g_odata[blockIdx.x] = sdata[0]; + } + cg::sync(grid); + + if (grid.thread_rank() == 0) { + for (int block = 1; block < gridDim.x; block++) { + g_odata[0] += g_odata[block]; + } } - } } //////////////////////////////////////////////////////////////////////////////// // Wrapper function for kernel launch //////////////////////////////////////////////////////////////////////////////// -void call_reduceSinglePassMultiBlockCG(int size, int threads, int numBlocks, - float *d_idata, float *d_odata) { - int smemSize = threads * sizeof(double); - void *kernelArgs[] = { - (void *)&d_idata, (void *)&d_odata, (void *)&size, - }; +void call_reduceSinglePassMultiBlockCG(int size, int threads, int numBlocks, float *d_idata, float *d_odata) +{ + int smemSize = threads * sizeof(double); + void *kernelArgs[] = { + (void *)&d_idata, + (void *)&d_odata, + (void *)&size, + }; - dim3 dimBlock(threads, 1, 1); - dim3 dimGrid(numBlocks, 1, 1); + dim3 dimBlock(threads, 1, 1); + dim3 dimGrid(numBlocks, 1, 1); - cudaLaunchCooperativeKernel((void *)reduceSinglePassMultiBlockCG, dimGrid, - dimBlock, kernelArgs, smemSize, NULL); - // check if kernel execution generated an error - getLastCudaError("Kernel execution failed"); + cudaLaunchCooperativeKernel((void *)reduceSinglePassMultiBlockCG, dimGrid, dimBlock, kernelArgs, smemSize, NULL); + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); } //////////////////////////////////////////////////////////////////////////////// @@ -169,26 +169,26 @@ bool runTest(int argc, char **argv, int device); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - cudaDeviceProp deviceProp = {0}; - int dev; +int main(int argc, char **argv) +{ + cudaDeviceProp deviceProp = {0}; + int dev; - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - dev = findCudaDevice(argc, (const 
char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - if (!deviceProp.cooperativeLaunch) { - printf( - "\nSelected GPU (%d) does not support Cooperative Kernel Launch, " - "Waiving the run\n", - dev); - exit(EXIT_WAIVED); - } + dev = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + if (!deviceProp.cooperativeLaunch) { + printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, " + "Waiving the run\n", + dev); + exit(EXIT_WAIVED); + } - bool bTestPassed = false; - bTestPassed = runTest(argc, argv, dev); + bool bTestPassed = false; + bTestPassed = runTest(argc, argv, dev); - exit(bTestPassed ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestPassed ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// @@ -199,188 +199,191 @@ int main(int argc, char **argv) { //! @param data pointer to input data //! @param size number of input data elements //////////////////////////////////////////////////////////////////////////////// -template -T reduceCPU(T *data, int size) { - T sum = data[0]; - T c = (T)0.0; +template T reduceCPU(T *data, int size) +{ + T sum = data[0]; + T c = (T)0.0; - for (int i = 1; i < size; i++) { - T y = data[i] - c; - T t = sum + y; - c = (t - sum) - y; - sum = t; - } + for (int i = 1; i < size; i++) { + T y = data[i] - c; + T t = sum + y; + c = (t - sum) - y; + sum = t; + } - return sum; + return sum; } -unsigned int nextPow2(unsigned int x) { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; +unsigned int nextPow2(unsigned int x) +{ + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; } //////////////////////////////////////////////////////////////////////////////// // Compute the number of threads and blocks to use for the reduction // We set threads / block to the minimum of maxThreads and n/2. //////////////////////////////////////////////////////////////////////////////// -void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, - int &threads) { - if (n == 1) { - threads = 1; - blocks = 1; - } else { - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize( - &blocks, &threads, reduceSinglePassMultiBlockCG)); - } +void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads) +{ + if (n == 1) { + threads = 1; + blocks = 1; + } + else { + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, reduceSinglePassMultiBlockCG)); + } - blocks = min(maxBlocks, blocks); + blocks = min(maxBlocks, blocks); } //////////////////////////////////////////////////////////////////////////////// // This function performs a reduction of the input data multiple times and // measures the average reduction time. 
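// Note: only the kernel launches inside the iteration loop are timed; each
// iteration synchronizes the device before stopping the timer, the final
// device-to-host copy of the result happens outside the timed region, and the
// caller averages over testIterations runs via sdkGetAverageTimerValue().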
//////////////////////////////////////////////////////////////////////////////// -float benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads, - int maxBlocks, int testIterations, - StopWatchInterface *timer, float *h_odata, float *d_idata, - float *d_odata) { - float gpu_result = 0; - cudaError_t error; +float benchmarkReduce(int n, + int numThreads, + int numBlocks, + int maxThreads, + int maxBlocks, + int testIterations, + StopWatchInterface *timer, + float *h_odata, + float *d_idata, + float *d_odata) +{ + float gpu_result = 0; + cudaError_t error; - printf("\nLaunching %s kernel\n", - "SinglePass Multi Block Cooperative Groups"); - for (int i = 0; i < testIterations; ++i) { - gpu_result = 0; - sdkStartTimer(&timer); - call_reduceSinglePassMultiBlockCG(n, numThreads, numBlocks, d_idata, - d_odata); - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - } + printf("\nLaunching %s kernel\n", "SinglePass Multi Block Cooperative Groups"); + for (int i = 0; i < testIterations; ++i) { + gpu_result = 0; + sdkStartTimer(&timer); + call_reduceSinglePassMultiBlockCG(n, numThreads, numBlocks, d_idata, d_odata); + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + } - // copy final sum from device to host - error = - cudaMemcpy(&gpu_result, d_odata, sizeof(float), cudaMemcpyDeviceToHost); - checkCudaErrors(error); + // copy final sum from device to host + error = cudaMemcpy(&gpu_result, d_odata, sizeof(float), cudaMemcpyDeviceToHost); + checkCudaErrors(error); - return gpu_result; + return gpu_result; } //////////////////////////////////////////////////////////////////////////////// // The main function which runs the reduction test. //////////////////////////////////////////////////////////////////////////////// -bool runTest(int argc, char **argv, int device) { - int size = 1 << 25; // number of elements to reduce - bool bTestPassed = false; +bool runTest(int argc, char **argv, int device) +{ + int size = 1 << 25; // number of elements to reduce + bool bTestPassed = false; - if (checkCmdLineFlag(argc, (const char **)argv, "n")) { - size = getCmdLineArgumentInt(argc, (const char **)argv, "n"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + size = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + } - printf("%d elements\n", size); + printf("%d elements\n", size); - // Set the device to be used - cudaDeviceProp prop = {0}; - checkCudaErrors(cudaSetDevice(device)); - checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + // Set the device to be used + cudaDeviceProp prop = {0}; + checkCudaErrors(cudaSetDevice(device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); - // create random input data on CPU - unsigned int bytes = size * sizeof(float); + // create random input data on CPU + unsigned int bytes = size * sizeof(float); - float *h_idata = (float *)malloc(bytes); + float *h_idata = (float *)malloc(bytes); - for (int i = 0; i < size; i++) { - // Keep the numbers small so we don't get truncation error in the sum - h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX; - } + for (int i = 0; i < size; i++) { + // Keep the numbers small so we don't get truncation error in the sum + h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX; + } - // Determine the launch configuration (threads, blocks) - int maxThreads = 0; - int maxBlocks = 0; + // Determine the launch configuration (threads, blocks) + int maxThreads = 0; + int maxBlocks = 0; - if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { - maxThreads = getCmdLineArgumentInt(argc, (const char 
**)argv, "threads"); - } else { - maxThreads = prop.maxThreadsPerBlock; - } + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { + maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + } + else { + maxThreads = prop.maxThreadsPerBlock; + } - if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) { - maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks"); - } else { - maxBlocks = prop.multiProcessorCount * - (prop.maxThreadsPerMultiProcessor / prop.maxThreadsPerBlock); - } + if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) { + maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks"); + } + else { + maxBlocks = prop.multiProcessorCount * (prop.maxThreadsPerMultiProcessor / prop.maxThreadsPerBlock); + } - int numBlocks = 0; - int numThreads = 0; - getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads); + int numBlocks = 0; + int numThreads = 0; + getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads); - // We calculate the occupancy to know how many block can actually fit on the - // GPU - int numBlocksPerSm = 0; - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, reduceSinglePassMultiBlockCG, numThreads, - numThreads * sizeof(double))); + // We calculate the occupancy to know how many block can actually fit on the + // GPU + int numBlocksPerSm = 0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, reduceSinglePassMultiBlockCG, numThreads, numThreads * sizeof(double))); - int numSms = prop.multiProcessorCount; - if (numBlocks > numBlocksPerSm * numSms) { - numBlocks = numBlocksPerSm * numSms; - } - printf("numThreads: %d\n", numThreads); - printf("numBlocks: %d\n", numBlocks); + int numSms = prop.multiProcessorCount; + if (numBlocks > numBlocksPerSm * numSms) { + numBlocks = numBlocksPerSm * numSms; + } + printf("numThreads: %d\n", numThreads); + printf("numBlocks: %d\n", numBlocks); - // allocate mem for the result on host side - float *h_odata = (float *)malloc(numBlocks * sizeof(float)); + // allocate mem for the result on host side + float *h_odata = (float *)malloc(numBlocks * sizeof(float)); - // allocate device memory and data - float *d_idata = NULL; - float *d_odata = NULL; + // allocate device memory and data + float *d_idata = NULL; + float *d_odata = NULL; - checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); - checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); + checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(float))); - // copy data directly to device memory - checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(float), - cudaMemcpyHostToDevice)); + // copy data directly to device memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(float), cudaMemcpyHostToDevice)); - int testIterations = 100; + int testIterations = 100; - StopWatchInterface *timer = 0; - sdkCreateTimer(&timer); + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); - float gpu_result = 0; + float gpu_result = 0; - gpu_result = - benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks, - testIterations, timer, h_odata, d_idata, d_odata); + gpu_result = benchmarkReduce( + size, numThreads, numBlocks, maxThreads, 
maxBlocks, testIterations, timer, h_odata, d_idata, d_odata); - float reduceTime = sdkGetAverageTimerValue(&timer); - printf("Average time: %f ms\n", reduceTime); - printf("Bandwidth: %f GB/s\n\n", - (size * sizeof(int)) / (reduceTime * 1.0e6)); + float reduceTime = sdkGetAverageTimerValue(&timer); + printf("Average time: %f ms\n", reduceTime); + printf("Bandwidth: %f GB/s\n\n", (size * sizeof(int)) / (reduceTime * 1.0e6)); - // compute reference solution - float cpu_result = reduceCPU(h_idata, size); - printf("GPU result = %0.12f\n", gpu_result); - printf("CPU result = %0.12f\n", cpu_result); + // compute reference solution + float cpu_result = reduceCPU(h_idata, size); + printf("GPU result = %0.12f\n", gpu_result); + printf("CPU result = %0.12f\n", cpu_result); - double threshold = 1e-8 * size; - double diff = abs((double)gpu_result - (double)cpu_result); - bTestPassed = (diff < threshold); + double threshold = 1e-8 * size; + double diff = abs((double)gpu_result - (double)cpu_result); + bTestPassed = (diff < threshold); - // cleanup - sdkDeleteTimer(&timer); + // cleanup + sdkDeleteTimer(&timer); - free(h_idata); - free(h_odata); - cudaFree(d_idata); - cudaFree(d_odata); + free(h_idata); + free(h_odata); + cudaFree(d_idata); + cudaFree(d_odata); - return bTestPassed; + return bTestPassed; } diff --git a/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd.cu b/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd.cu index 4138acf7..0e2ce438 100644 --- a/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd.cu +++ b/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd.cu @@ -30,19 +30,17 @@ * given set of input vector pairs */ +#include +#include #include #include -#include #include - -#include -#include +#include /////////////////////////////////////////////////////////////////////////////// // Calculate scalar products of VectorN vectors of ElementN elements on CPU /////////////////////////////////////////////////////////////////////////////// -extern "C" void scalarProdCPU(float *h_C, float *h_A, float *h_B, int vectorN, - int elementN); +extern "C" void scalarProdCPU(float *h_C, float *h_A, float *h_B, int vectorN, int elementN); /////////////////////////////////////////////////////////////////////////////// // Calculate scalar products of VectorN vectors of ElementN elements on GPU @@ -53,9 +51,10 @@ extern "C" void scalarProdCPU(float *h_C, float *h_A, float *h_B, int vectorN, // Helper function, returning uniformly distributed // random float in [low, high] range //////////////////////////////////////////////////////////////////////////////// -float RandFloat(float low, float high) { - float t = (float)rand() / (float)RAND_MAX; - return (1.0f - t) * low + t * high; +float RandFloat(float low, float high) +{ + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; } /////////////////////////////////////////////////////////////////////////////// @@ -71,98 +70,99 @@ const int ELEMENT_N = 4096; // Total number of data elements const int DATA_N = VECTOR_N * ELEMENT_N; -const int DATA_SZ = DATA_N * sizeof(float); +const int DATA_SZ = DATA_N * sizeof(float); const int RESULT_SZ = VECTOR_N * sizeof(float); /////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - float *h_A, *h_B, *h_C_CPU, *h_C_GPU; - float *d_A, *d_B, *d_C; - double delta, ref, sum_delta, sum_ref, L1norm; - StopWatchInterface *hTimer 
= NULL; - int i; +int main(int argc, char **argv) +{ + float *h_A, *h_B, *h_C_CPU, *h_C_GPU; + float *d_A, *d_B, *d_C; + double delta, ref, sum_delta, sum_ref, L1norm; + StopWatchInterface *hTimer = NULL; + int i; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Initializing data...\n"); - printf("...allocating CPU memory.\n"); - h_A = (float *)malloc(DATA_SZ); - h_B = (float *)malloc(DATA_SZ); - h_C_CPU = (float *)malloc(RESULT_SZ); - h_C_GPU = (float *)malloc(RESULT_SZ); + printf("Initializing data...\n"); + printf("...allocating CPU memory.\n"); + h_A = (float *)malloc(DATA_SZ); + h_B = (float *)malloc(DATA_SZ); + h_C_CPU = (float *)malloc(RESULT_SZ); + h_C_GPU = (float *)malloc(RESULT_SZ); - printf("...allocating GPU memory.\n"); - checkCudaErrors(cudaMalloc((void **)&d_A, DATA_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_B, DATA_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_C, RESULT_SZ)); + printf("...allocating GPU memory.\n"); + checkCudaErrors(cudaMalloc((void **)&d_A, DATA_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_B, DATA_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_C, RESULT_SZ)); - printf("...generating input data in CPU mem.\n"); - srand(123); + printf("...generating input data in CPU mem.\n"); + srand(123); - // Generating input data on CPU - for (i = 0; i < DATA_N; i++) { - h_A[i] = RandFloat(0.0f, 1.0f); - h_B[i] = RandFloat(0.0f, 1.0f); - } + // Generating input data on CPU + for (i = 0; i < DATA_N; i++) { + h_A[i] = RandFloat(0.0f, 1.0f); + h_B[i] = RandFloat(0.0f, 1.0f); + } - printf("...copying input data to GPU mem.\n"); - // Copy options data to GPU memory for further processing - checkCudaErrors(cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice)); - printf("Data init done.\n"); + printf("...copying input data to GPU mem.\n"); + // Copy options data to GPU memory for further processing + checkCudaErrors(cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice)); + printf("Data init done.\n"); - printf("Executing GPU kernel...\n"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, VECTOR_N, ELEMENT_N); - getLastCudaError("scalarProdGPU() execution failed\n"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - printf("GPU time: %f msecs.\n", sdkGetTimerValue(&hTimer)); + printf("Executing GPU kernel...\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, VECTOR_N, ELEMENT_N); + getLastCudaError("scalarProdGPU() execution failed\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + printf("GPU time: %f msecs.\n", sdkGetTimerValue(&hTimer)); - printf("Reading back GPU result...\n"); - // Read back GPU results to compare them to CPU results - checkCudaErrors(cudaMemcpy(h_C_GPU, d_C, RESULT_SZ, cudaMemcpyDeviceToHost)); + printf("Reading back GPU result...\n"); + // Read back GPU results to compare them to CPU results + 
checkCudaErrors(cudaMemcpy(h_C_GPU, d_C, RESULT_SZ, cudaMemcpyDeviceToHost)); - printf("Checking GPU results...\n"); - printf("..running CPU scalar product calculation\n"); - scalarProdCPU(h_C_CPU, h_A, h_B, VECTOR_N, ELEMENT_N); + printf("Checking GPU results...\n"); + printf("..running CPU scalar product calculation\n"); + scalarProdCPU(h_C_CPU, h_A, h_B, VECTOR_N, ELEMENT_N); - printf("...comparing the results\n"); - // Calculate max absolute difference and L1 distance - // between CPU and GPU results - sum_delta = 0; - sum_ref = 0; + printf("...comparing the results\n"); + // Calculate max absolute difference and L1 distance + // between CPU and GPU results + sum_delta = 0; + sum_ref = 0; - for (i = 0; i < VECTOR_N; i++) { - delta = fabs(h_C_GPU[i] - h_C_CPU[i]); - ref = h_C_CPU[i]; - sum_delta += delta; - sum_ref += ref; - } + for (i = 0; i < VECTOR_N; i++) { + delta = fabs(h_C_GPU[i] - h_C_CPU[i]); + ref = h_C_CPU[i]; + sum_delta += delta; + sum_ref += ref; + } - L1norm = sum_delta / sum_ref; + L1norm = sum_delta / sum_ref; - printf("Shutting down...\n"); - checkCudaErrors(cudaFree(d_C)); - checkCudaErrors(cudaFree(d_B)); - checkCudaErrors(cudaFree(d_A)); - free(h_C_GPU); - free(h_C_CPU); - free(h_B); - free(h_A); - sdkDeleteTimer(&hTimer); + printf("Shutting down...\n"); + checkCudaErrors(cudaFree(d_C)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_A)); + free(h_C_GPU); + free(h_C_CPU); + free(h_B); + free(h_A); + sdkDeleteTimer(&hTimer); - printf("L1 error: %E\n", L1norm); - printf((L1norm < 1e-6) ? "Test passed\n" : "Test failed!\n"); - exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); + printf("L1 error: %E\n", L1norm); + printf((L1norm < 1e-6) ? "Test passed\n" : "Test failed!\n"); + exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_cpu.cpp b/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_cpu.cpp index 612e18dc..7419072f 100644 --- a/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_cpu.cpp +++ b/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_cpu.cpp @@ -29,17 +29,17 @@ // Calculate scalar products of VectorN vectors of ElementN elements on CPU. // Straight accumulation in double precision. //////////////////////////////////////////////////////////////////////////// -extern "C" void scalarProdCPU(float *h_C, float *h_A, float *h_B, int vectorN, - int elementN) { - for (int vec = 0; vec < vectorN; vec++) { - int vectorBase = elementN * vec; - int vectorEnd = vectorBase + elementN; +extern "C" void scalarProdCPU(float *h_C, float *h_A, float *h_B, int vectorN, int elementN) +{ + for (int vec = 0; vec < vectorN; vec++) { + int vectorBase = elementN * vec; + int vectorEnd = vectorBase + elementN; - double sum = 0; + double sum = 0; - for (int pos = vectorBase; pos < vectorEnd; pos++) - sum += h_A[pos] * h_B[pos]; + for (int pos = vectorBase; pos < vectorEnd; pos++) + sum += h_A[pos] * h_B[pos]; - h_C[vec] = (float)sum; - } + h_C[vec] = (float)sum; + } } diff --git a/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_kernel.cuh b/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_kernel.cuh index 6432a288..80f27832 100644 --- a/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/scalarProd/scalarProd_kernel.cuh @@ -47,50 +47,51 @@ namespace cg = cooperative_groups; // 2) ACCUM_N must be a power of two. 
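// Requirement 2) can be enforced at compile time. For instance, a guard like
// the following could sit just after the #define below (an illustrative
// addition, not part of the original sample):

    static_assert((ACCUM_N & (ACCUM_N - 1)) == 0, "ACCUM_N must be a power of two");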
/////////////////////////////////////////////////////////////////////////////// #define ACCUM_N 1024 -__global__ void scalarProdGPU(float *d_C, float *d_A, float *d_B, int vectorN, - int elementN) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Accumulators cache - __shared__ float accumResult[ACCUM_N]; +__global__ void scalarProdGPU(float *d_C, float *d_A, float *d_B, int vectorN, int elementN) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Accumulators cache + __shared__ float accumResult[ACCUM_N]; - //////////////////////////////////////////////////////////////////////////// - // Cycle through every pair of vectors, - // taking into account that vector counts can be different - // from total number of thread blocks - //////////////////////////////////////////////////////////////////////////// - for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) { - int vectorBase = IMUL(elementN, vec); - int vectorEnd = vectorBase + elementN; + //////////////////////////////////////////////////////////////////////////// + // Cycle through every pair of vectors, + // taking into account that vector counts can be different + // from total number of thread blocks + //////////////////////////////////////////////////////////////////////////// + for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) { + int vectorBase = IMUL(elementN, vec); + int vectorEnd = vectorBase + elementN; - //////////////////////////////////////////////////////////////////////// - // Each accumulator cycles through vectors with - // stride equal to number of total number of accumulators ACCUM_N - // At this stage ACCUM_N is only preferred be a multiple of warp size - // to meet memory coalescing alignment constraints. - //////////////////////////////////////////////////////////////////////// - for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) { - float sum = 0; + //////////////////////////////////////////////////////////////////////// + // Each accumulator cycles through vectors with + // stride equal to number of total number of accumulators ACCUM_N + // At this stage ACCUM_N is only preferred be a multiple of warp size + // to meet memory coalescing alignment constraints. + //////////////////////////////////////////////////////////////////////// + for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) { + float sum = 0; - for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) - sum += d_A[pos] * d_B[pos]; + for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) + sum += d_A[pos] * d_B[pos]; - accumResult[iAccum] = sum; + accumResult[iAccum] = sum; + } + + //////////////////////////////////////////////////////////////////////// + // Perform tree-like reduction of accumulators' results. + // ACCUM_N has to be power of two at this stage + //////////////////////////////////////////////////////////////////////// + for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + + for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) + accumResult[iAccum] += accumResult[stride + iAccum]; + } + + cg::sync(cta); + + if (threadIdx.x == 0) + d_C[vec] = accumResult[0]; } - - //////////////////////////////////////////////////////////////////////// - // Perform tree-like reduction of accumulators' results. 
- // ACCUM_N has to be power of two at this stage - //////////////////////////////////////////////////////////////////////// - for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - - for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) - accumResult[iAccum] += accumResult[stride + iAccum]; - } - - cg::sync(cta); - - if (threadIdx.x == 0) d_C[vec] = accumResult[0]; - } } diff --git a/Samples/2_Concepts_and_Techniques/scan/main.cpp b/Samples/2_Concepts_and_Techniques/scan/main.cpp index 6694c86c..03e70804 100644 --- a/Samples/2_Concepts_and_Techniques/scan/main.cpp +++ b/Samples/2_Concepts_and_Techniques/scan/main.cpp @@ -31,162 +31,152 @@ #include "scan_common.h" -int main(int argc, char **argv) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + printf("%s Starting...\n\n", argv[0]); - // Use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // Use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - uint *d_Input, *d_Output; - uint *h_Input, *h_OutputCPU, *h_OutputGPU; - StopWatchInterface *hTimer = NULL; - const uint N = 13 * 1048576 / 2; + uint *d_Input, *d_Output; + uint *h_Input, *h_OutputCPU, *h_OutputGPU; + StopWatchInterface *hTimer = NULL; + const uint N = 13 * 1048576 / 2; - printf("Allocating and initializing host arrays...\n"); - sdkCreateTimer(&hTimer); - h_Input = (uint *)malloc(N * sizeof(uint)); - h_OutputCPU = (uint *)malloc(N * sizeof(uint)); - h_OutputGPU = (uint *)malloc(N * sizeof(uint)); - srand(2009); - - for (uint i = 0; i < N; i++) { - h_Input[i] = rand(); - } - - printf("Allocating and initializing CUDA arrays...\n"); - checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint))); - checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); - checkCudaErrors( - cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice)); - - printf("Initializing CUDA-C scan...\n\n"); - initScan(); - - int globalFlag = 1; - size_t szWorkgroup; - const int iCycles = 100; - printf( - "*** Running GPU scan for short arrays (%d identical iterations)...\n\n", - iCycles); - - for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; - arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1) { - printf("Running scan for %u elements (%u arrays)...\n", arrayLength, - N / arrayLength); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - - for (int i = 0; i < iCycles; i++) { - szWorkgroup = - scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); - } - - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; - - printf("Validating the results...\n"); - printf("...reading back GPU results\n"); - checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), - cudaMemcpyDeviceToHost)); - - printf(" ...scanExclusiveHost()\n"); - scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); - - // Compare GPU results with CPU results and accumulate error for this test - printf(" ...comparing the results\n"); - int localFlag = 1; + printf("Allocating and initializing host arrays...\n"); + sdkCreateTimer(&hTimer); + h_Input = (uint *)malloc(N * sizeof(uint)); + h_OutputCPU = (uint *)malloc(N * sizeof(uint)); + h_OutputGPU = (uint *)malloc(N * sizeof(uint)); + srand(2009); for (uint i = 0; i < N; i++) { - if 
(h_OutputCPU[i] != h_OutputGPU[i]) { - localFlag = 0; - break; - } + h_Input[i] = rand(); } - // Log message on individual test result, then accumulate to global flag - printf(" ...Results %s\n\n", - (localFlag == 1) ? "Match" : "DON'T Match !!!"); - globalFlag = globalFlag && localFlag; + printf("Allocating and initializing CUDA arrays...\n"); + checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); + checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice)); - // Data log - if (arrayLength == MAX_SHORT_ARRAY_SIZE) { - printf("\n"); - printf( - "scan, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u " - "Elements, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)arrayLength / timerValue), timerValue, - (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); - printf("\n"); - } - } + printf("Initializing CUDA-C scan...\n\n"); + initScan(); - printf( - "***Running GPU scan for large arrays (%u identical iterations)...\n\n", - iCycles); + int globalFlag = 1; + size_t szWorkgroup; + const int iCycles = 100; + printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); - for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; - arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1) { - printf("Running scan for %u elements (%u arrays)...\n", arrayLength, - N / arrayLength); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1) { + printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - for (int i = 0; i < iCycles; i++) { - szWorkgroup = - scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); + for (int i = 0; i < iCycles; i++) { + szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); + } + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; + + printf("Validating the results...\n"); + printf("...reading back GPU results\n"); + checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); + + printf(" ...scanExclusiveHost()\n"); + scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); + + // Compare GPU results with CPU results and accumulate error for this test + printf(" ...comparing the results\n"); + int localFlag = 1; + + for (uint i = 0; i < N; i++) { + if (h_OutputCPU[i] != h_OutputGPU[i]) { + localFlag = 0; + break; + } + } + + // Log message on individual test result, then accumulate to global flag + printf(" ...Results %s\n\n", (localFlag == 1) ? 
"Match" : "DON'T Match !!!"); + globalFlag = globalFlag && localFlag; + + // Data log + if (arrayLength == MAX_SHORT_ARRAY_SIZE) { + printf("\n"); + printf("scan, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u " + "Elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * (double)arrayLength / timerValue), + timerValue, + (unsigned int)arrayLength, + 1, + (unsigned int)szWorkgroup); + printf("\n"); + } } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; + printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); - printf("Validating the results...\n"); - printf("...reading back GPU results\n"); - checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), - cudaMemcpyDeviceToHost)); + for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1) { + printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - printf("...scanExclusiveHost()\n"); - scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); + for (int i = 0; i < iCycles; i++) { + szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); + } - // Compare GPU results with CPU results and accumulate error for this test - printf(" ...comparing the results\n"); - int localFlag = 1; + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; - for (uint i = 0; i < N; i++) { - if (h_OutputCPU[i] != h_OutputGPU[i]) { - localFlag = 0; - break; - } + printf("Validating the results...\n"); + printf("...reading back GPU results\n"); + checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); + + printf("...scanExclusiveHost()\n"); + scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); + + // Compare GPU results with CPU results and accumulate error for this test + printf(" ...comparing the results\n"); + int localFlag = 1; + + for (uint i = 0; i < N; i++) { + if (h_OutputCPU[i] != h_OutputGPU[i]) { + localFlag = 0; + break; + } + } + + // Log message on individual test result, then accumulate to global flag + printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); + globalFlag = globalFlag && localFlag; + + // Data log + if (arrayLength == MAX_LARGE_ARRAY_SIZE) { + printf("\n"); + printf("scan, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u " + "Elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * (double)arrayLength / timerValue), + timerValue, + (unsigned int)arrayLength, + 1, + (unsigned int)szWorkgroup); + printf("\n"); + } } - // Log message on individual test result, then accumulate to global flag - printf(" ...Results %s\n\n", - (localFlag == 1) ? 
"Match" : "DON'T Match !!!"); - globalFlag = globalFlag && localFlag; + printf("Shutting down...\n"); + closeScan(); + checkCudaErrors(cudaFree(d_Output)); + checkCudaErrors(cudaFree(d_Input)); - // Data log - if (arrayLength == MAX_LARGE_ARRAY_SIZE) { - printf("\n"); - printf( - "scan, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u " - "Elements, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)arrayLength / timerValue), timerValue, - (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); - printf("\n"); - } - } + sdkDeleteTimer(&hTimer); - printf("Shutting down...\n"); - closeScan(); - checkCudaErrors(cudaFree(d_Output)); - checkCudaErrors(cudaFree(d_Input)); - - sdkDeleteTimer(&hTimer); - - // pass or fail (cumulative... all tests in the loop) - exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE); + // pass or fail (cumulative... all tests in the loop) + exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/2_Concepts_and_Techniques/scan/scan.cu b/Samples/2_Concepts_and_Techniques/scan/scan.cu index bd83df1d..c6f435bb 100644 --- a/Samples/2_Concepts_and_Techniques/scan/scan.cu +++ b/Samples/2_Concepts_and_Techniques/scan/scan.cu @@ -30,6 +30,7 @@ namespace cg = cooperative_groups; #include + #include "scan_common.h" // All three kernels run 512 threads per workgroup @@ -43,125 +44,127 @@ namespace cg = cooperative_groups; // Allocate 2 * 'size' local memory, initialize the first half // with 'size' zeros avoiding if(pos >= offset) condition evaluation // and saving instructions -inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, - uint size, cg::thread_block cta) { - uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); - s_Data[pos] = 0; - pos += size; - s_Data[pos] = idata; +inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size, cg::thread_block cta) +{ + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + pos += size; + s_Data[pos] = idata; - for (uint offset = 1; offset < size; offset <<= 1) { - cg::sync(cta); - uint t = s_Data[pos] + s_Data[pos - offset]; - cg::sync(cta); - s_Data[pos] = t; - } + for (uint offset = 1; offset < size; offset <<= 1) { + cg::sync(cta); + uint t = s_Data[pos] + s_Data[pos - offset]; + cg::sync(cta); + s_Data[pos] = t; + } - return s_Data[pos]; + return s_Data[pos]; } -inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, - uint size, cg::thread_block cta) { - return scan1Inclusive(idata, s_Data, size, cta) - idata; +inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size, cg::thread_block cta) +{ + return scan1Inclusive(idata, s_Data, size, cta) - idata; } -inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, - uint size, cg::thread_block cta) { - // Level-0 inclusive scan - idata4.y += idata4.x; - idata4.z += idata4.y; - idata4.w += idata4.z; +inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size, cg::thread_block cta) +{ + // Level-0 inclusive scan + idata4.y += idata4.x; + idata4.z += idata4.y; + idata4.w += idata4.z; - // Level-1 exclusive scan - uint oval = scan1Exclusive(idata4.w, s_Data, size / 4, cta); + // Level-1 exclusive scan + uint oval = scan1Exclusive(idata4.w, s_Data, size / 4, cta); - idata4.x += oval; - idata4.y += oval; - idata4.z += oval; - idata4.w += oval; + idata4.x += oval; + idata4.y += oval; + idata4.z += oval; + idata4.w += oval; - return idata4; + return idata4; } // Exclusive vector scan: the array to be scanned is 
stored // in local thread memory scope as uint4 -inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, - uint size, cg::thread_block cta) { - uint4 odata4 = scan4Inclusive(idata4, s_Data, size, cta); - odata4.x -= idata4.x; - odata4.y -= idata4.y; - odata4.z -= idata4.z; - odata4.w -= idata4.w; - return odata4; +inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size, cg::thread_block cta) +{ + uint4 odata4 = scan4Inclusive(idata4, s_Data, size, cta); + odata4.x -= idata4.x; + odata4.y -= idata4.y; + odata4.z -= idata4.z; + odata4.w -= idata4.w; + return odata4; } //////////////////////////////////////////////////////////////////////////////// // Scan kernels //////////////////////////////////////////////////////////////////////////////// -__global__ void scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint size) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; +__global__ void scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint size) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; - uint pos = blockIdx.x * blockDim.x + threadIdx.x; + uint pos = blockIdx.x * blockDim.x + threadIdx.x; - // Load data - uint4 idata4 = d_Src[pos]; + // Load data + uint4 idata4 = d_Src[pos]; - // Calculate exclusive scan - uint4 odata4 = scan4Exclusive(idata4, s_Data, size, cta); + // Calculate exclusive scan + uint4 odata4 = scan4Exclusive(idata4, s_Data, size, cta); - // Write back - d_Dst[pos] = odata4; + // Write back + d_Dst[pos] = odata4; } // Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE) -__global__ void scanExclusiveShared2(uint *d_Buf, uint *d_Dst, uint *d_Src, - uint N, uint arrayLength) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; +__global__ void scanExclusiveShared2(uint *d_Buf, uint *d_Dst, uint *d_Src, uint N, uint arrayLength) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; - // Skip loads and stores for inactive threads of last threadblock (pos >= N) - uint pos = blockIdx.x * blockDim.x + threadIdx.x; + // Skip loads and stores for inactive threads of last threadblock (pos >= N) + uint pos = blockIdx.x * blockDim.x + threadIdx.x; - // Load top elements - // Convert results of bottom-level scan back to inclusive - uint idata = 0; + // Load top elements + // Convert results of bottom-level scan back to inclusive + uint idata = 0; - if (pos < N) - idata = d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos] + - d_Src[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos]; + if (pos < N) + idata = d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos] + + d_Src[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos]; - // Compute - uint odata = scan1Exclusive(idata, s_Data, arrayLength, cta); + // Compute + uint odata = scan1Exclusive(idata, s_Data, arrayLength, cta); - // Avoid out-of-bound access - if (pos < N) { - d_Buf[pos] = odata; - } + // Avoid out-of-bound access + if (pos < N) { + d_Buf[pos] = odata; + } } // Final step of large-array scan: combine basic inclusive scan with exclusive // scan of top elements of input arrays -__global__ void uniformUpdate(uint4 *d_Data, uint *d_Buffer) { - // Handle to thread block group - cg::thread_block 
cta = cg::this_thread_block(); - __shared__ uint buf; - uint pos = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void uniformUpdate(uint4 *d_Data, uint *d_Buffer) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ uint buf; + uint pos = blockIdx.x * blockDim.x + threadIdx.x; - if (threadIdx.x == 0) { - buf = d_Buffer[blockIdx.x]; - } + if (threadIdx.x == 0) { + buf = d_Buffer[blockIdx.x]; + } - cg::sync(cta); + cg::sync(cta); - uint4 data4 = d_Data[pos]; - data4.x += buf; - data4.y += buf; - data4.z += buf; - data4.w += buf; - d_Data[pos] = data4; + uint4 data4 = d_Data[pos]; + data4.x += buf; + data4.y += buf; + data4.z += buf; + data4.w += buf; + d_Data[pos] = data4; } //////////////////////////////////////////////////////////////////////////////// @@ -169,99 +172,95 @@ __global__ void uniformUpdate(uint4 *d_Data, uint *d_Buffer) { //////////////////////////////////////////////////////////////////////////////// // Derived as 32768 (max power-of-two gridDim.x) * 4 * THREADBLOCK_SIZE // Due to scanExclusiveShared<<<>>>() 1D block addressing -extern "C" const uint MAX_BATCH_ELEMENTS = 64 * 1048576; +extern "C" const uint MAX_BATCH_ELEMENTS = 64 * 1048576; extern "C" const uint MIN_SHORT_ARRAY_SIZE = 4; extern "C" const uint MAX_SHORT_ARRAY_SIZE = 4 * THREADBLOCK_SIZE; extern "C" const uint MIN_LARGE_ARRAY_SIZE = 8 * THREADBLOCK_SIZE; -extern "C" const uint MAX_LARGE_ARRAY_SIZE = - 4 * THREADBLOCK_SIZE * THREADBLOCK_SIZE; +extern "C" const uint MAX_LARGE_ARRAY_SIZE = 4 * THREADBLOCK_SIZE * THREADBLOCK_SIZE; // Internal exclusive scan buffer static uint *d_Buf; -extern "C" void initScan(void) { - checkCudaErrors( - cudaMalloc((void **)&d_Buf, - (MAX_BATCH_ELEMENTS / (4 * THREADBLOCK_SIZE)) * sizeof(uint))); +extern "C" void initScan(void) +{ + checkCudaErrors(cudaMalloc((void **)&d_Buf, (MAX_BATCH_ELEMENTS / (4 * THREADBLOCK_SIZE)) * sizeof(uint))); } extern "C" void closeScan(void) { checkCudaErrors(cudaFree(d_Buf)); } -static uint factorRadix2(uint &log2L, uint L) { - if (!L) { - log2L = 0; - return 0; - } else { - for (log2L = 0; (L & 1) == 0; L >>= 1, log2L++) - ; +static uint factorRadix2(uint &log2L, uint L) +{ + if (!L) { + log2L = 0; + return 0; + } + else { + for (log2L = 0; (L & 1) == 0; L >>= 1, log2L++) + ; - return L; - } + return L; + } } -static uint iDivUp(uint dividend, uint divisor) { - return ((dividend % divisor) == 0) ? (dividend / divisor) - : (dividend / divisor + 1); +static uint iDivUp(uint dividend, uint divisor) +{ + return ((dividend % divisor) == 0) ? 
(dividend / divisor) : (dividend / divisor + 1); } -extern "C" size_t scanExclusiveShort(uint *d_Dst, uint *d_Src, uint batchSize, - uint arrayLength) { - // Check power-of-two factorization - uint log2L; - uint factorizationRemainder = factorRadix2(log2L, arrayLength); - assert(factorizationRemainder == 1); +extern "C" size_t scanExclusiveShort(uint *d_Dst, uint *d_Src, uint batchSize, uint arrayLength) +{ + // Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); - // Check supported size range - assert((arrayLength >= MIN_SHORT_ARRAY_SIZE) && - (arrayLength <= MAX_SHORT_ARRAY_SIZE)); + // Check supported size range + assert((arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE)); - // Check total batch size limit - assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); + // Check total batch size limit + assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); - // Check all threadblocks to be fully packed with data - assert((batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0); + // Check all threadblocks to be fully packed with data + assert((batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0); - scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), - THREADBLOCK_SIZE>>>((uint4 *)d_Dst, (uint4 *)d_Src, - arrayLength); - getLastCudaError("scanExclusiveShared() execution FAILED\n"); + scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, (uint4 *)d_Src, arrayLength); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); - return THREADBLOCK_SIZE; + return THREADBLOCK_SIZE; } -extern "C" size_t scanExclusiveLarge(uint *d_Dst, uint *d_Src, uint batchSize, - uint arrayLength) { - // Check power-of-two factorization - uint log2L; - uint factorizationRemainder = factorRadix2(log2L, arrayLength); - assert(factorizationRemainder == 1); +extern "C" size_t scanExclusiveLarge(uint *d_Dst, uint *d_Src, uint batchSize, uint arrayLength) +{ + // Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); - // Check supported size range - assert((arrayLength >= MIN_LARGE_ARRAY_SIZE) && - (arrayLength <= MAX_LARGE_ARRAY_SIZE)); + // Check supported size range + assert((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE)); - // Check total batch size limit - assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); + // Check total batch size limit + assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); - scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), - THREADBLOCK_SIZE>>>((uint4 *)d_Dst, (uint4 *)d_Src, - 4 * THREADBLOCK_SIZE); - getLastCudaError("scanExclusiveShared() execution FAILED\n"); + scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, (uint4 *)d_Src, 4 * THREADBLOCK_SIZE); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); - // Not all threadblocks need to be packed with input data: - // inactive threads of highest threadblock just don't do global reads and - // writes - const uint blockCount2 = iDivUp( - (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE); - scanExclusiveShared2<<>>( - (uint *)d_Buf, (uint *)d_Dst, (uint *)d_Src, - (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), - arrayLength / (4 * THREADBLOCK_SIZE)); - 
getLastCudaError("scanExclusiveShared2() execution FAILED\n"); + // Not all threadblocks need to be packed with input data: + // inactive threads of highest threadblock just don't do global reads and + // writes + const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE); + scanExclusiveShared2<<>>((uint *)d_Buf, + (uint *)d_Dst, + (uint *)d_Src, + (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), + arrayLength / (4 * THREADBLOCK_SIZE)); + getLastCudaError("scanExclusiveShared2() execution FAILED\n"); - uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), - THREADBLOCK_SIZE>>>((uint4 *)d_Dst, (uint *)d_Buf); - getLastCudaError("uniformUpdate() execution FAILED\n"); + uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>((uint4 *)d_Dst, + (uint *)d_Buf); + getLastCudaError("uniformUpdate() execution FAILED\n"); - return THREADBLOCK_SIZE; + return THREADBLOCK_SIZE; } diff --git a/Samples/2_Concepts_and_Techniques/scan/scan_common.h b/Samples/2_Concepts_and_Techniques/scan/scan_common.h index 68c7d84d..5b0de360 100644 --- a/Samples/2_Concepts_and_Techniques/scan/scan_common.h +++ b/Samples/2_Concepts_and_Techniques/scan/scan_common.h @@ -50,16 +50,13 @@ extern "C" const uint MAX_LARGE_ARRAY_SIZE; extern "C" void initScan(void); extern "C" void closeScan(void); -extern "C" size_t scanExclusiveShort(uint *d_Dst, uint *d_Src, uint batchSize, - uint arrayLength); +extern "C" size_t scanExclusiveShort(uint *d_Dst, uint *d_Src, uint batchSize, uint arrayLength); -extern "C" size_t scanExclusiveLarge(uint *d_Dst, uint *d_Src, uint batchSize, - uint arrayLength); +extern "C" size_t scanExclusiveLarge(uint *d_Dst, uint *d_Src, uint batchSize, uint arrayLength); //////////////////////////////////////////////////////////////////////////////// // Reference CPU scan //////////////////////////////////////////////////////////////////////////////// -extern "C" void scanExclusiveHost(uint *dst, uint *src, uint batchSize, - uint arrayLength); +extern "C" void scanExclusiveHost(uint *dst, uint *src, uint batchSize, uint arrayLength); #endif diff --git a/Samples/2_Concepts_and_Techniques/scan/scan_gold.cpp b/Samples/2_Concepts_and_Techniques/scan/scan_gold.cpp index 4d079982..c04e3704 100644 --- a/Samples/2_Concepts_and_Techniques/scan/scan_gold.cpp +++ b/Samples/2_Concepts_and_Techniques/scan/scan_gold.cpp @@ -27,11 +27,12 @@ #include "scan_common.h" -extern "C" void scanExclusiveHost(uint *dst, uint *src, uint batchSize, - uint arrayLength) { - for (uint i = 0; i < batchSize; i++, src += arrayLength, dst += arrayLength) { - dst[0] = 0; +extern "C" void scanExclusiveHost(uint *dst, uint *src, uint batchSize, uint arrayLength) +{ + for (uint i = 0; i < batchSize; i++, src += arrayLength, dst += arrayLength) { + dst[0] = 0; - for (uint j = 1; j < arrayLength; j++) dst[j] = src[j - 1] + dst[j - 1]; - } + for (uint j = 1; j < arrayLength; j++) + dst[j] = src[j - 1] + dst[j - 1]; + } } diff --git a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/common.cuh b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/common.cuh index 63471c6a..378fd548 100644 --- a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/common.cuh +++ b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/common.cuh @@ -24,13 +24,12 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */
- 
+
 #ifndef _COMMON_CUH_
 #define _COMMON_CUH_
 
-typedef unsigned char uchar;
-typedef unsigned int uint;
+typedef unsigned char          uchar;
+typedef unsigned int           uint;
 typedef unsigned long long int ullint;
 
 #endif // #ifndef _COMMON_CUH_
-
diff --git a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/kernels.cuh b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/kernels.cuh
index 779c9958..d36bb243 100644
--- a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/kernels.cuh
+++ b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/kernels.cuh
@@ -38,17 +38,15 @@
 #include "common.cuh"
 
 // Functors used with thrust library.
-template <typename Input>
-struct IsGreaterEqualThan
+template <typename Input> struct IsGreaterEqualThan
 {
-    __host__ __device__ IsGreaterEqualThan(uint upperBound) :
-        upperBound_(upperBound) {}
-
-    __host__ __device__ bool operator()(const Input &value) const
+    __host__ __device__ IsGreaterEqualThan(uint upperBound)
+        : upperBound_(upperBound)
     {
-        return value >= upperBound_;
     }
 
+    __host__ __device__ bool operator()(const Input &value) const { return value >= upperBound_; }
+
     uint upperBound_;
 };
 
@@ -57,94 +55,77 @@ __global__ void addScalar(uint *array, int scalar, uint size)
 {
     uint tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (tid < size)
-    {
+    if (tid < size) {
         array[tid] += scalar;
     }
 }
 
-__global__ void markSegments(const uint *verticesOffsets,
-                             uint *flags,
-                             uint verticesCount)
+__global__ void markSegments(const uint *verticesOffsets, uint *flags, uint verticesCount)
 {
     uint tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (tid < verticesCount)
-    {
+    if (tid < verticesCount) {
         flags[verticesOffsets[tid]] = 1;
     }
 }
 
 __global__ void getVerticesMapping(const uint *clusteredVerticesIDs,
                                    const uint *newVerticesIDs,
-                                   uint *verticesMapping,
-                                   uint verticesCount)
+                                   uint       *verticesMapping,
+                                   uint        verticesCount)
 {
     uint tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (tid < verticesCount)
-    {
-        uint vertexID = clusteredVerticesIDs[tid];
+    if (tid < verticesCount) {
+        uint vertexID             = clusteredVerticesIDs[tid];
         verticesMapping[vertexID] = newVerticesIDs[tid];
     }
 }
 
 __global__ void getSuccessors(const uint *verticesOffsets,
                               const uint *minScannedEdges,
-                              uint *successors,
-                              uint verticesCount,
-                              uint edgesCount)
+                              uint       *successors,
+                              uint        verticesCount,
+                              uint        edgesCount)
 {
     uint tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (tid < verticesCount)
-    {
-        uint successorPos = (tid < verticesCount - 1) ?
-                            (verticesOffsets[tid + 1] - 1) :
-                            (edgesCount - 1);
+    if (tid < verticesCount) {
+        uint successorPos = (tid < verticesCount - 1) ?
(verticesOffsets[tid + 1] - 1) : (edgesCount - 1); successors[tid] = minScannedEdges[successorPos]; } } -__global__ void removeCycles(uint *successors, - uint verticesCount) +__global__ void removeCycles(uint *successors, uint verticesCount) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < verticesCount) - { - uint successor = successors[tid]; + if (tid < verticesCount) { + uint successor = successors[tid]; uint nextSuccessor = successors[successor]; - if (tid == nextSuccessor) - { - if (tid < successor) - { + if (tid == nextSuccessor) { + if (tid < successor) { successors[tid] = tid; } - else - { + else { successors[successor] = successor; } } } } -__global__ void getRepresentatives(const uint *successors, - uint *representatives, - uint verticesCount) +__global__ void getRepresentatives(const uint *successors, uint *representatives, uint verticesCount) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < verticesCount) - { - uint successor = successors[tid]; + if (tid < verticesCount) { + uint successor = successors[tid]; uint nextSuccessor = successors[successor]; - while (successor != nextSuccessor) - { - successor = nextSuccessor; + while (successor != nextSuccessor) { + successor = nextSuccessor; nextSuccessor = successors[nextSuccessor]; } @@ -152,70 +133,60 @@ __global__ void getRepresentatives(const uint *successors, } } -__global__ void invalidateLoops(const uint *startpoints, - const uint *verticesMapping, - uint *edges, - uint edgesCount) +__global__ void invalidateLoops(const uint *startpoints, const uint *verticesMapping, uint *edges, uint edgesCount) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < edgesCount) - { - uint startpoint = startpoints[tid]; - uint &endpoint = edges[tid]; + if (tid < edgesCount) { + uint startpoint = startpoints[tid]; + uint &endpoint = edges[tid]; uint newStartpoint = verticesMapping[startpoint]; - uint newEndpoint = verticesMapping[endpoint]; + uint newEndpoint = verticesMapping[endpoint]; - if (newStartpoint == newEndpoint) - { + if (newStartpoint == newEndpoint) { endpoint = UINT_MAX; } } } -__global__ void calculateEdgesInfo(const uint *startpoints, - const uint *verticesMapping, - const uint *edges, +__global__ void calculateEdgesInfo(const uint *startpoints, + const uint *verticesMapping, + const uint *edges, const float *weights, - uint *newStartpoints, - uint *survivedEdgesIDs, - uint edgesCount, - uint newVerticesCount) + uint *newStartpoints, + uint *survivedEdgesIDs, + uint edgesCount, + uint newVerticesCount) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < edgesCount) - { + if (tid < edgesCount) { uint startpoint = startpoints[tid]; - uint endpoint = edges[tid]; + uint endpoint = edges[tid]; - newStartpoints[tid] = endpoint < UINT_MAX ? - verticesMapping[startpoint] : - newVerticesCount + verticesMapping[startpoint]; + newStartpoints[tid] = + endpoint < UINT_MAX ? verticesMapping[startpoint] : newVerticesCount + verticesMapping[startpoint]; - survivedEdgesIDs[tid] = endpoint < UINT_MAX ? - tid : - UINT_MAX; + survivedEdgesIDs[tid] = endpoint < UINT_MAX ? 
tid : UINT_MAX; } } -__global__ void makeNewEdges(const uint *survivedEdgesIDs, - const uint *verticesMapping, - const uint *edges, +__global__ void makeNewEdges(const uint *survivedEdgesIDs, + const uint *verticesMapping, + const uint *edges, const float *weights, - uint *newEdges, - float *newWeights, - uint edgesCount) + uint *newEdges, + float *newWeights, + uint edgesCount) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < edgesCount) - { - uint edgeID = survivedEdgesIDs[tid]; + if (tid < edgesCount) { + uint edgeID = survivedEdgesIDs[tid]; uint oldEdge = edges[edgeID]; - newEdges[tid] = verticesMapping[oldEdge]; + newEdges[tid] = verticesMapping[oldEdge]; newWeights[tid] = weights[edgeID]; } } diff --git a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu index 75c47ddf..7d7c84df 100644 --- a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu +++ b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu @@ -37,42 +37,41 @@ */ // System includes. -#include -#include -#include #include +#include +#include +#include // STL includes. -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include // Thrust library includes. -#include +#include +#include +#include +#include +#include +#include +#include #include +#include #include #include -#include #include -#include #include -#include #include -#include #include -#include -#include - -#include -#include +#include // Sample framework includes. -#include #include +#include // Project includes. #include "common.cuh" @@ -82,106 +81,88 @@ using std::cin; using std::cout; -using std::endl; -using std::vector; -using std::list; using std::deque; +using std::endl; +using std::list; +using std::vector; // Very simple von Neumann middle-square prng. rand() is different across // various OS platforms, which makes testing and the output inconsistent. int myrand(void) { static int seed = 72191; - char sq[22]; + char sq[22]; seed *= seed; sprintf(sq, "%010d", seed); // pull the middle 5 digits out of sq sq[8] = 0; - seed = atoi(&sq[3]); + seed = atoi(&sq[3]); return seed; } // Simple memory pool class. It is nothing more than array of fixed-sized // arrays. -template -class DeviceMemoryPool +template class DeviceMemoryPool { - public: - // The parameters of the constructor are as follows: - // 1) uint chunkSize --- size of the particular array; - // 2) uint chunksCount --- number of fixed-sized arrays. - DeviceMemoryPool(uint chunkSize, uint chunksCount) : - chunkSize_(chunkSize) - { - chunkRawSize_ = (chunkSize * sizeof(T) + 511) & ~511; +public: + // The parameters of the constructor are as follows: + // 1) uint chunkSize --- size of the particular array; + // 2) uint chunksCount --- number of fixed-sized arrays. 
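// [Sketch, not from the patch] The constructor below rounds each chunk up
// to a 512-byte boundary with "(bytes + 511) & ~511". A standalone check:
#include <cstdio>

static size_t roundUp512(size_t bytes) { return (bytes + 511) & ~(size_t)511; }

int main()
{
    printf("%zu %zu %zu\n", roundUp512(1), roundUp512(512), roundUp512(513));
    // prints: 512 512 1024 -- every chunk starts on a 512-byte boundary
    return 0;
}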
+ DeviceMemoryPool(uint chunkSize, uint chunksCount) + : chunkSize_(chunkSize) + { + chunkRawSize_ = (chunkSize * sizeof(T) + 511) & ~511; - try - { - basePtr_ = - thrust::device_malloc(chunkRawSize_ * chunksCount); - } - catch (thrust::system_error &e) - { - cout << "Pool memory allocation failed (" << e.what() << ")" - << endl; - exit(EXIT_FAILURE); - } - - for (uint chunkIndex = 0; chunkIndex < chunksCount; ++chunkIndex) - { - chunks_.push_back( - thrust::device_ptr( - reinterpret_cast( - static_cast(basePtr_.get()) + - chunkRawSize_ * chunkIndex))); - } + try { + basePtr_ = thrust::device_malloc(chunkRawSize_ * chunksCount); + } + catch (thrust::system_error &e) { + cout << "Pool memory allocation failed (" << e.what() << ")" << endl; + exit(EXIT_FAILURE); } - ~DeviceMemoryPool() - { - try - { - thrust::device_free(basePtr_); - } - catch (thrust::system_error &e) - { - cout << "Pool memory allocation failed (" << e.what() << ")" - << endl; - exit(EXIT_FAILURE); - } + for (uint chunkIndex = 0; chunkIndex < chunksCount; ++chunkIndex) { + chunks_.push_back(thrust::device_ptr( + reinterpret_cast(static_cast(basePtr_.get()) + chunkRawSize_ * chunkIndex))); } + } - // Returns an address of the first available array - // in the memory pool. - thrust::device_ptr get() - { - thrust::device_ptr ptr(chunks_.back()); - chunks_.pop_back(); - - return ptr; + ~DeviceMemoryPool() + { + try { + thrust::device_free(basePtr_); } - - // Pushes an address stored in "ptr" to the list - // of available arrays of the memory pool. - // It should be noted that it is user who is responsible for returning - // the previously requested memory to the appropriate pool. - inline void put(const thrust::device_ptr &ptr) - { - chunks_.push_back(ptr); + catch (thrust::system_error &e) { + cout << "Pool memory allocation failed (" << e.what() << ")" << endl; + exit(EXIT_FAILURE); } + } - uint totalFreeChunks() const - { - return chunks_.size(); - } + // Returns an address of the first available array + // in the memory pool. + thrust::device_ptr get() + { + thrust::device_ptr ptr(chunks_.back()); + chunks_.pop_back(); - private: - uint chunkSize_, chunkRawSize_; - thrust::device_ptr basePtr_; + return ptr; + } - list< thrust::device_ptr > chunks_; + // Pushes an address stored in "ptr" to the list + // of available arrays of the memory pool. + // It should be noted that it is user who is responsible for returning + // the previously requested memory to the appropriate pool. + inline void put(const thrust::device_ptr &ptr) { chunks_.push_back(ptr); } + + uint totalFreeChunks() const { return chunks_.size(); } + +private: + uint chunkSize_, chunkRawSize_; + thrust::device_ptr basePtr_; + + list> chunks_; }; // Graph structure. @@ -189,11 +170,12 @@ struct Graph { Graph() {} - Graph(uint verticesCount, uint edgesCount) : - vertices(verticesCount), - edges(edgesCount), - weights(edgesCount) - {} + Graph(uint verticesCount, uint edgesCount) + : vertices(verticesCount) + , edges(edgesCount) + , weights(edgesCount) + { + } // This vector stores offsets for each vertex in "edges" and "weights" // vectors. For example: @@ -215,724 +197,578 @@ struct Graph // See "Level" class for the details. 
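// [Sketch, not from the patch] The Graph struct above is a CSR-style
// layout: vertex v owns edges[vertices[v]] .. edges[vertices[v + 1] - 1]
// (or up to edges.size() for the last vertex). A tiny made-up instance:
#include <vector>
typedef unsigned int uint;

int main()
{
    std::vector<uint>  vertices = {0, 2, 3};       // offsets, 3 vertices
    std::vector<uint>  edges    = {1, 2, 2, 0, 1}; // endpoints, 5 edges
    std::vector<float> weights  = {0.5f, 1.0f, 0.25f, 0.5f, 0.75f};
    // vertex 0 -> {1, 2}, vertex 1 -> {2}, vertex 2 -> {0, 1}
    return 0;
}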
class Pyramid { +public: + void addLevel(uint totalSuperNodes, + uint totalNodes, + thrust::device_ptr superVerticesOffsets, + thrust::device_ptr verticesIDs) + { + levels_.push_back(Level(totalSuperNodes, totalNodes)); + levels_.back().buildFromDeviceData(superVerticesOffsets, verticesIDs); + } + + uint levelsCount() const { return static_cast(levels_.size()); } + + void dump(uint width, uint height) const + { + char filename[256], format[256]; + uint levelIndex = 0; + + uint requiredDigitsCount = static_cast(log10(static_cast(levelsCount()))) + 1; + sprintf(format, "level_%%0%uu.ppm", requiredDigitsCount); + + for (LevelsIterator level = levels_.rbegin(); level != levels_.rend(); ++level, ++levelIndex) { + + sprintf(filename, format, levelIndex); + dumpLevel(level, width, height, filename); + } + } + +private: + // Level of the segmentation tree. + class Level + { public: - void addLevel(uint totalSuperNodes, - uint totalNodes, - thrust::device_ptr superVerticesOffsets, - thrust::device_ptr verticesIDs) + Level(uint totalSuperNodes, uint totalNodes) + : superNodesOffsets_(totalSuperNodes) + , nodes_(totalNodes) { - levels_.push_back(Level(totalSuperNodes, totalNodes)); - levels_.back().buildFromDeviceData(superVerticesOffsets, - verticesIDs); } - uint levelsCount() const + void buildFromDeviceData(thrust::device_ptr superVerticesOffsets, thrust::device_ptr verticesIDs) { - return static_cast(levels_.size()); - } + checkCudaErrors(cudaMemcpy(&(superNodesOffsets_[0]), + superVerticesOffsets.get(), + sizeof(uint) * superNodesOffsets_.size(), + cudaMemcpyDeviceToHost)); - void dump(uint width, uint height) const - { - char filename[256], format[256]; - uint levelIndex = 0; - - uint requiredDigitsCount = - static_cast(log10(static_cast(levelsCount()))) + - 1; - sprintf(format, "level_%%0%uu.ppm", requiredDigitsCount); - - for (LevelsIterator level = levels_.rbegin(); - level != levels_.rend(); - ++level, ++levelIndex) - { - - sprintf(filename, format, levelIndex); - dumpLevel(level, width, height, filename); - } + checkCudaErrors( + cudaMemcpy(&(nodes_[0]), verticesIDs.get(), sizeof(uint) * nodes_.size(), cudaMemcpyDeviceToHost)); } private: - // Level of the segmentation tree. - class Level + friend class Pyramid; + + // The pair of the following vectors describes the + // relation between the consecutive levels. + // Consider an example. Let the index of the current level be n. + // Then nodes of level #(n-1) with indices stored in + // "nodes[superNodesOffsets_[0]]", + // "nodes[superNodesOffsets_[0] + 1]", + // ..., + // "nodes[superNodesOffsets_[1] - 1]" + // correspond to vertex #0 of level #n. An so on. + vector superNodesOffsets_; + vector nodes_; + }; + + typedef list::const_reverse_iterator LevelsIterator; + + // Dumps level to the file "level_n.ppm" where n + // is index of the level. Segments are drawn in random colors. + void dumpLevel(LevelsIterator level, uint width, uint height, const char *filename) const + { + deque> nodesQueue; + + uint totalSegments; + { - public: - Level(uint totalSuperNodes, uint totalNodes) : - superNodesOffsets_(totalSuperNodes), nodes_(totalNodes) - { + const vector &superNodesOffsets = level->superNodesOffsets_; + const vector &nodes = level->nodes_; + + totalSegments = static_cast(superNodesOffsets.size()); + + for (uint superNodeIndex = 0, nodeIndex = 0; superNodeIndex < superNodesOffsets.size(); ++superNodeIndex) { + + uint superNodeEnd = superNodeIndex + 1 < superNodesOffsets.size() + ? 
superNodesOffsets[superNodeIndex + 1] + : static_cast(nodes.size()); + + for (; nodeIndex < superNodeEnd; ++nodeIndex) { + nodesQueue.push_back(std::make_pair(nodes[nodeIndex], superNodeIndex)); } + } + } - void buildFromDeviceData( - thrust::device_ptr superVerticesOffsets, - thrust::device_ptr verticesIDs) - { - checkCudaErrors( - cudaMemcpy(&(superNodesOffsets_[0]), - superVerticesOffsets.get(), - sizeof(uint) * superNodesOffsets_.size(), - cudaMemcpyDeviceToHost)); + ++level; - checkCudaErrors( - cudaMemcpy(&(nodes_[0]), - verticesIDs.get(), - sizeof(uint) * nodes_.size(), - cudaMemcpyDeviceToHost)); - } + while (level != levels_.rend()) { + uint superNodesCount = static_cast(nodesQueue.size()); - private: - friend class Pyramid; + const vector &superNodesOffsets = level->superNodesOffsets_; + const vector &nodes = level->nodes_; - // The pair of the following vectors describes the - // relation between the consecutive levels. - // Consider an example. Let the index of the current level be n. - // Then nodes of level #(n-1) with indices stored in - // "nodes[superNodesOffsets_[0]]", - // "nodes[superNodesOffsets_[0] + 1]", - // ..., - // "nodes[superNodesOffsets_[1] - 1]" - // correspond to vertex #0 of level #n. An so on. - vector superNodesOffsets_; - vector nodes_; - }; + while (superNodesCount--) { + std::pair currentNode = nodesQueue.front(); + nodesQueue.pop_front(); - typedef list::const_reverse_iterator LevelsIterator; + uint superNodeBegin = superNodesOffsets[currentNode.first]; - // Dumps level to the file "level_n.ppm" where n - // is index of the level. Segments are drawn in random colors. - void dumpLevel(LevelsIterator level, - uint width, - uint height, - const char *filename) const - { - deque< std::pair > nodesQueue; + uint superNodeEnd = currentNode.first + 1 < superNodesOffsets.size() + ? superNodesOffsets[currentNode.first + 1] + : static_cast(nodes.size()); - uint totalSegments; + for (uint nodeIndex = superNodeBegin; nodeIndex < superNodeEnd; ++nodeIndex) { - { - const vector &superNodesOffsets = - level->superNodesOffsets_; - const vector &nodes = - level->nodes_; - - totalSegments = static_cast(superNodesOffsets.size()); - - for (uint superNodeIndex = 0, nodeIndex = 0; - superNodeIndex < superNodesOffsets.size(); - ++superNodeIndex) - { - - uint superNodeEnd = - superNodeIndex + 1 < superNodesOffsets.size() ? - superNodesOffsets[superNodeIndex + 1] : - static_cast(nodes.size()); - - for (; nodeIndex < superNodeEnd; ++nodeIndex) - { - nodesQueue.push_back(std::make_pair(nodes[nodeIndex], - superNodeIndex)); - } + nodesQueue.push_back(std::make_pair(nodes[nodeIndex], currentNode.second)); } } ++level; - - while (level != levels_.rend()) - { - uint superNodesCount = static_cast(nodesQueue.size()); - - const vector &superNodesOffsets = - level->superNodesOffsets_; - const vector &nodes = - level->nodes_; - - while (superNodesCount--) - { - std::pair currentNode = nodesQueue.front(); - nodesQueue.pop_front(); - - uint superNodeBegin = superNodesOffsets[currentNode.first]; - - uint superNodeEnd = - currentNode.first + 1 < superNodesOffsets.size() ? 
- superNodesOffsets[currentNode.first + 1] : - static_cast(nodes.size()); - - for (uint nodeIndex = superNodeBegin; - nodeIndex < superNodeEnd; - ++nodeIndex) - { - - nodesQueue.push_back( - std::make_pair(nodes[nodeIndex], - currentNode.second)); - } - } - - ++level; - } - - vector colors(3 * totalSegments); - - for (uint colorIndex = 0; colorIndex < totalSegments; ++colorIndex) - { - colors[colorIndex * 3 ] = myrand() % 256; - colors[colorIndex * 3 + 1] = myrand() % 256; - colors[colorIndex * 3 + 2] = myrand() % 256; - } - - uchar *image = new uchar[width * height * 3]; - - while (!nodesQueue.empty()) - { - std::pair currentNode = nodesQueue.front(); - nodesQueue.pop_front(); - - uint pixelIndex = currentNode.first; - uint pixelSegment = currentNode.second; - - image[pixelIndex * 3 ] = colors[pixelSegment * 3 ]; - image[pixelIndex * 3 + 1] = colors[pixelSegment * 3 + 1]; - image[pixelIndex * 3 + 2] = colors[pixelSegment * 3 + 2]; - } - - __savePPM(filename, image, width, height, 3); - - delete[] image; } - list levels_; + vector colors(3 * totalSegments); + + for (uint colorIndex = 0; colorIndex < totalSegments; ++colorIndex) { + colors[colorIndex * 3] = myrand() % 256; + colors[colorIndex * 3 + 1] = myrand() % 256; + colors[colorIndex * 3 + 2] = myrand() % 256; + } + + uchar *image = new uchar[width * height * 3]; + + while (!nodesQueue.empty()) { + std::pair currentNode = nodesQueue.front(); + nodesQueue.pop_front(); + + uint pixelIndex = currentNode.first; + uint pixelSegment = currentNode.second; + + image[pixelIndex * 3] = colors[pixelSegment * 3]; + image[pixelIndex * 3 + 1] = colors[pixelSegment * 3 + 1]; + image[pixelIndex * 3 + 2] = colors[pixelSegment * 3 + 2]; + } + + __savePPM(filename, image, width, height, 3); + + delete[] image; + } + + list levels_; }; // The class that encapsulates the main algorithm. class SegmentationTreeBuilder { - public: - SegmentationTreeBuilder():verticesCount_(0),edgesCount_(0) {} +public: + SegmentationTreeBuilder() + : verticesCount_(0) + , edgesCount_(0) + { + } - ~SegmentationTreeBuilder() {} + ~SegmentationTreeBuilder() {} - // Repeatedly invokes the step of the algorithm - // until the limiting segmentation is found. - // Returns time (in ms) spent on building the tree. - float run(const Graph &graph, Pyramid &segmentations) - { - cudaEvent_t start, stop; + // Repeatedly invokes the step of the algorithm + // until the limiting segmentation is found. + // Returns time (in ms) spent on building the tree. + float run(const Graph &graph, Pyramid &segmentations) + { + cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + cudaEventCreate(&start); + cudaEventCreate(&stop); - cudaEventRecord(start, 0); + cudaEventRecord(start, 0); - // Allocate required memory pools. We need just 4 types of arrays. - MemoryPoolsCollection pools = - { - DeviceMemoryPool( - static_cast(graph.vertices.size()), - kUintVerticesPoolsRequired), - DeviceMemoryPool( - static_cast(graph.vertices.size()), - kFloatVerticesPoolsRequired), - DeviceMemoryPool( - static_cast(graph.edges.size()), - kUintEdgesPoolsRequired), - DeviceMemoryPool( - static_cast(graph.edges.size()), - kFloatEdgesPoolsRequired) - }; + // Allocate required memory pools. We need just 4 types of arrays. 
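// [Sketch, not from the patch] run() brackets the whole algorithm with
// CUDA events to report milliseconds; reduced to a standalone skeleton:
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    // ... launch the work being timed here ...
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop); // block until the stop event completes

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("elapsed: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}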
+ MemoryPoolsCollection pools = { + DeviceMemoryPool(static_cast(graph.vertices.size()), kUintVerticesPoolsRequired), + DeviceMemoryPool(static_cast(graph.vertices.size()), kFloatVerticesPoolsRequired), + DeviceMemoryPool(static_cast(graph.edges.size()), kUintEdgesPoolsRequired), + DeviceMemoryPool(static_cast(graph.edges.size()), kFloatEdgesPoolsRequired)}; - // Initialize internal variables - try - { - initalizeData(graph, pools); - } - catch (thrust::system_error &e) - { - cout << "Initialization failed (" << e.what() << ")" << endl; - exit(EXIT_FAILURE); - } - - // Run steps - AlgorithmStatus status; - - try - { - do - { - status = invokeStep(pools, segmentations); - } - while (status != ALGORITHM_FINISHED); - } - catch (thrust::system_error &e) - { - cout << "Algorithm failed (" << e.what() << ")" << endl; - exit(EXIT_FAILURE); - } - - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - float elapsedTime; - cudaEventElapsedTime(&elapsedTime, start, stop); - - return elapsedTime; + // Initialize internal variables + try { + initalizeData(graph, pools); + } + catch (thrust::system_error &e) { + cout << "Initialization failed (" << e.what() << ")" << endl; + exit(EXIT_FAILURE); } - private: - void printMemoryUsage() - { - size_t availableMemory, totalMemory, usedMemory; + // Run steps + AlgorithmStatus status; - cudaMemGetInfo(&availableMemory, &totalMemory); - usedMemory = totalMemory - availableMemory; - - cout << "Device memory: used " << usedMemory - << " available " << availableMemory - << " total " << totalMemory << endl; + try { + do { + status = invokeStep(pools, segmentations); + } while (status != ALGORITHM_FINISHED); + } + catch (thrust::system_error &e) { + cout << "Algorithm failed (" << e.what() << ")" << endl; + exit(EXIT_FAILURE); } - struct MemoryPoolsCollection - { - DeviceMemoryPool uintVertices; - DeviceMemoryPool floatVertices; - DeviceMemoryPool uintEdges; - DeviceMemoryPool floatEdges; - }; + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); - static const uint kUintVerticesPoolsRequired = 8; - static const uint kFloatVerticesPoolsRequired = 3; - static const uint kUintEdgesPoolsRequired = 8; - static const uint kFloatEdgesPoolsRequired = 4; + float elapsedTime; + cudaEventElapsedTime(&elapsedTime, start, stop); - void initalizeData(const Graph &graph, MemoryPoolsCollection &pools) - { - // Get memory for the internal variables - verticesCount_ = static_cast(graph.vertices.size()); - edgesCount_ = static_cast(graph.edges.size()); + return elapsedTime; + } - dVertices_ = pools.uintVertices.get(); - dEdges_ = pools.uintEdges.get(); - dWeights_ = pools.floatEdges.get(); +private: + void printMemoryUsage() + { + size_t availableMemory, totalMemory, usedMemory; - dOutputEdgesFlags_ = pools.uintEdges.get(); + cudaMemGetInfo(&availableMemory, &totalMemory); + usedMemory = totalMemory - availableMemory; - // Copy graph to the device memory - checkCudaErrors(cudaMemcpy(dVertices_.get(), - &(graph.vertices[0]), - sizeof(uint) * verticesCount_, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(dEdges_.get(), - &(graph.edges[0]), - sizeof(uint) * edgesCount_, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(dWeights_.get(), - &(graph.weights[0]), - sizeof(float) * edgesCount_, - cudaMemcpyHostToDevice)); + cout << "Device memory: used " << usedMemory << " available " << availableMemory << " total " << totalMemory + << endl; + } + + struct MemoryPoolsCollection + { + DeviceMemoryPool uintVertices; + DeviceMemoryPool floatVertices; + DeviceMemoryPool 
uintEdges; + DeviceMemoryPool floatEdges; + }; + + static const uint kUintVerticesPoolsRequired = 8; + static const uint kFloatVerticesPoolsRequired = 3; + static const uint kUintEdgesPoolsRequired = 8; + static const uint kFloatEdgesPoolsRequired = 4; + + void initalizeData(const Graph &graph, MemoryPoolsCollection &pools) + { + // Get memory for the internal variables + verticesCount_ = static_cast(graph.vertices.size()); + edgesCount_ = static_cast(graph.edges.size()); + + dVertices_ = pools.uintVertices.get(); + dEdges_ = pools.uintEdges.get(); + dWeights_ = pools.floatEdges.get(); + + dOutputEdgesFlags_ = pools.uintEdges.get(); + + // Copy graph to the device memory + checkCudaErrors( + cudaMemcpy(dVertices_.get(), &(graph.vertices[0]), sizeof(uint) * verticesCount_, cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(dEdges_.get(), &(graph.edges[0]), sizeof(uint) * edgesCount_, cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(dWeights_.get(), &(graph.weights[0]), sizeof(float) * edgesCount_, cudaMemcpyHostToDevice)); - thrust::fill(dOutputEdgesFlags_, - dOutputEdgesFlags_ + edgesCount_, - 0); + thrust::fill(dOutputEdgesFlags_, dOutputEdgesFlags_ + edgesCount_, 0); + } + + static const uint kMaxThreadsPerBlock = 256; + + // Calculates grid parameters of the consecutive kernel calls + // based on the number of elements in the array. + void calculateThreadsDistribution(uint totalElements, uint &blocksCount, uint &threadsPerBlockCount) + { + if (totalElements > kMaxThreadsPerBlock) { + blocksCount = (totalElements + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + + threadsPerBlockCount = kMaxThreadsPerBlock; + } + else { + blocksCount = 1; + threadsPerBlockCount = totalElements; + } + } + + enum AlgorithmStatus { ALGORITHM_NOT_FINISHED, ALGORITHM_FINISHED }; + + AlgorithmStatus invokeStep(MemoryPoolsCollection &pools, Pyramid &segmentations) + { + uint blocksCount, threadsPerBlockCount; + + calculateThreadsDistribution(edgesCount_, blocksCount, threadsPerBlockCount); + dim3 gridDimsForEdges(blocksCount, 1, 1); + dim3 blockDimsForEdges(threadsPerBlockCount, 1, 1); + + calculateThreadsDistribution(verticesCount_, blocksCount, threadsPerBlockCount); + dim3 gridDimsForVertices(blocksCount, 1, 1); + dim3 blockDimsForVertices(threadsPerBlockCount, 1, 1); + + thrust::device_ptr dEdgesFlags = pools.uintEdges.get(); + + thrust::fill(dEdgesFlags, dEdgesFlags + edgesCount_, 0); + + // Mark the first edge for each vertex in "dEdgesFlags" + markSegments<<>>( + dVertices_.get(), dEdgesFlags.get(), verticesCount_); + getLastCudaError("markSegments launch failed."); + + // Now find minimum edges for each vertex. + thrust::device_ptr dMinScannedEdges = pools.uintEdges.get(); + thrust::device_ptr dMinScannedWeights = pools.floatEdges.get(); + + thrust::inclusive_scan_by_key( + dEdgesFlags, + dEdgesFlags + edgesCount_, + thrust::make_zip_iterator(thrust::make_tuple(dWeights_, dEdges_)), + thrust::make_zip_iterator(thrust::make_tuple(dMinScannedWeights, dMinScannedEdges)), + thrust::greater_equal(), + thrust::minimum>()); + + // To make things clear. + // Let "dEdgesFlags" denote groups of edges that + // correspond to the same vertices. Then the last edge of each group + // (in "dMinScannedEdges" and "dMinScannedWeights") is now minimal. + + // Calculate a successor vertex for each vertex. A successor of the + // vertex v is a neighbouring vertex connected to v + // by the minimal edge. 
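// [Sketch, not from the patch] The min-scan above uses head flags plus
// thrust::greater_equal as the key predicate; the same effect, shown here
// with explicit segment IDs and made-up data, leaves each segment's
// minimum in the segment's last slot:
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <cstdio>

int main()
{
    int   keys[] = {0, 0, 0, 1, 1};           // segment ID per edge
    float w[]    = {3.f, 1.f, 2.f, 5.f, 4.f}; // edge weights
    thrust::device_vector<int>   k(keys, keys + 5);
    thrust::device_vector<float> v(w, w + 5);
    thrust::device_vector<float> out(5);

    thrust::inclusive_scan_by_key(k.begin(), k.end(), v.begin(), out.begin(),
                                  thrust::equal_to<int>(),
                                  thrust::minimum<float>());
    for (int i = 0; i < 5; i++)
        printf("%g ", (float)out[i]); // prints: 3 1 1 5 4
    return 0;
}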
+ thrust::device_ptr dSuccessors = pools.uintVertices.get(); + + getSuccessors<<>>( + dVertices_.get(), dMinScannedEdges.get(), dSuccessors.get(), verticesCount_, edgesCount_); + getLastCudaError("getSuccessors launch failed."); + + pools.uintEdges.put(dMinScannedEdges); + pools.floatEdges.put(dMinScannedWeights); + + // Remove cyclic successor dependencies. Note that there can be only + // two vertices in a cycle. See [1] for details. + removeCycles<<>>(dSuccessors.get(), verticesCount_); + getLastCudaError("removeCycles launch failed."); + + // Build up an array of startpoints for edges. As already stated, + // each group of edges denoted by "dEdgesFlags" + // has the same startpoint. + thrust::device_ptr dStartpoints = pools.uintEdges.get(); + + thrust::inclusive_scan(dEdgesFlags, dEdgesFlags + edgesCount_, dStartpoints); + + addScalar<<>>(dStartpoints.get(), -1, edgesCount_); + getLastCudaError("addScalar launch failed."); + + // Shrink the chains of successors. New successors will eventually + // represent superpixels of the new level. + thrust::device_ptr dRepresentatives = pools.uintVertices.get(); + + getRepresentatives<<>>( + dSuccessors.get(), dRepresentatives.get(), verticesCount_); + getLastCudaError("getRepresentatives launch failed."); + + swap(dSuccessors, dRepresentatives); + + pools.uintVertices.put(dRepresentatives); + + // Group vertices by successors' indices. + thrust::device_ptr dClusteredVerticesIDs = pools.uintVertices.get(); + + thrust::sequence(dClusteredVerticesIDs, dClusteredVerticesIDs + verticesCount_); + + thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(thrust::device_ptr(dSuccessors), + thrust::device_ptr(dClusteredVerticesIDs))), + thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_ptr(dSuccessors + verticesCount_), + thrust::device_ptr(dClusteredVerticesIDs + verticesCount_)))); + + // Mark those groups. + thrust::device_ptr dVerticesFlags_ = pools.uintVertices.get(); + + thrust::fill(dVerticesFlags_, dVerticesFlags_ + verticesCount_, 0); + + thrust::adjacent_difference( + dSuccessors, dSuccessors + verticesCount_, dVerticesFlags_, thrust::not_equal_to()); + + cudaMemset((void *)dVerticesFlags_.get(), 0, sizeof(uint)); + + // Assign new indices to the successors (the indices of vertices + // at the new level). + thrust::device_ptr dNewVerticesIDs_ = pools.uintVertices.get(); + + thrust::inclusive_scan(dVerticesFlags_, dVerticesFlags_ + verticesCount_, dNewVerticesIDs_); + + pools.uintVertices.put(dVerticesFlags_); + + // Now we can calculate number of resulting superpixels easily. + uint newVerticesCount; + cudaMemcpy( + &newVerticesCount, (dNewVerticesIDs_ + verticesCount_ - 1).get(), sizeof(uint), cudaMemcpyDeviceToHost); + ++newVerticesCount; + + // There are two special cases when we can stop our algorithm: + // 1) number of vertices in the graph remained unchanged; + // 2) only one vertex remains. 
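// [Sketch, not from the patch] How the count just computed above works:
// head flags (with flags[0] forced to 0) turn into 0-based group IDs
// under an inclusive scan, so the last ID plus one is the group count.
// Made-up flags:
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdio>

int main()
{
    int flags[] = {0, 0, 1, 0, 1, 1}; // groups: {0,1} {2,3} {4} {5}
    thrust::device_vector<int> f(flags, flags + 6), ids(6);

    thrust::inclusive_scan(f.begin(), f.end(), ids.begin());
    int count = ids.back() + 1; // ids = {0,0,1,1,2,3} -> 4 groups
    printf("groups: %d\n", count);
    return 0;
}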
+
+        // There are two special cases when we can stop our algorithm:
+        // 1) number of vertices in the graph remained unchanged;
+        // 2) only one vertex remains.
+        if (newVerticesCount == verticesCount_) {
+            return ALGORITHM_FINISHED;
+        }
+        else if (newVerticesCount == 1) {
+            thrust::device_ptr<uint> dDummyVerticesOffsets = pools.uintVertices.get();
+
+            cudaMemset((void *)dDummyVerticesOffsets.get(), 0, sizeof(uint));
+
+            thrust::device_ptr<uint> dDummyVerticesIDs = pools.uintVertices.get();
+
+            thrust::sequence(dDummyVerticesIDs, dDummyVerticesIDs + verticesCount_);
+
+            segmentations.addLevel(1, verticesCount_, dDummyVerticesOffsets, dDummyVerticesIDs);
+
+            return ALGORITHM_FINISHED;
+        }
 
-  static const uint kMaxThreadsPerBlock = 256;
-
-  // Calculates grid parameters of the consecutive kernel calls
-  // based on the number of elements in the array.
-  void calculateThreadsDistribution(uint totalElements,
-                                    uint &blocksCount,
-                                    uint &threadsPerBlockCount)
-  {
-    if (totalElements > kMaxThreadsPerBlock)
-    {
-      blocksCount =
-        (totalElements + kMaxThreadsPerBlock - 1) /
-        kMaxThreadsPerBlock;
-
-      threadsPerBlockCount = kMaxThreadsPerBlock;
-    }
-    else
-    {
-      blocksCount = 1;
-      threadsPerBlockCount = totalElements;
-    }
-  }
-
-  enum AlgorithmStatus { ALGORITHM_NOT_FINISHED, ALGORITHM_FINISHED };
-
-  AlgorithmStatus invokeStep(MemoryPoolsCollection &pools,
-                             Pyramid &segmentations)
-  {
-    uint blocksCount, threadsPerBlockCount;
-
-    calculateThreadsDistribution(edgesCount_,
-                                 blocksCount,
-                                 threadsPerBlockCount);
-    dim3 gridDimsForEdges(blocksCount, 1, 1);
-    dim3 blockDimsForEdges(threadsPerBlockCount, 1, 1);
-
-    calculateThreadsDistribution(verticesCount_,
-                                 blocksCount,
-                                 threadsPerBlockCount);
-    dim3 gridDimsForVertices(blocksCount, 1, 1);
-    dim3 blockDimsForVertices(threadsPerBlockCount, 1, 1);
-
-    thrust::device_ptr<uint> dEdgesFlags = pools.uintEdges.get();
-
-    thrust::fill(dEdgesFlags, dEdgesFlags + edgesCount_, 0);
-
-    // Mark the first edge for each vertex in "dEdgesFlags"
-    markSegments<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
-      (dVertices_.get(), dEdgesFlags.get(), verticesCount_);
-    getLastCudaError("markSegments launch failed.");
-
-    // Now find minimum edges for each vertex.
-    thrust::device_ptr<uint> dMinScannedEdges =
-      pools.uintEdges.get();
-    thrust::device_ptr<float> dMinScannedWeights =
-      pools.floatEdges.get();
-
-    thrust::inclusive_scan_by_key(
-      dEdgesFlags,
-      dEdgesFlags + edgesCount_,
-      thrust::make_zip_iterator(
-        thrust::make_tuple(dWeights_, dEdges_)),
-      thrust::make_zip_iterator(
-        thrust::make_tuple(dMinScannedWeights, dMinScannedEdges)),
-      thrust::greater_equal<uint>(),
-      thrust::minimum< thrust::tuple<float, uint> >());
-
-    // To make things clear.
-    // Let "dEdgesFlags" denote groups of edges that
-    // correspond to the same vertices. Then the last edge of each group
-    // (in "dMinScannedEdges" and "dMinScannedWeights") is now minimal.
-
-    // Calculate a successor vertex for each vertex. A successor of the
-    // vertex v is a neighbouring vertex connected to v
-    // by the minimal edge.
-    thrust::device_ptr<uint> dSuccessors = pools.uintVertices.get();
-
-    getSuccessors<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
-      (dVertices_.get(),
-       dMinScannedEdges.get(),
-       dSuccessors.get(),
-       verticesCount_,
-       edgesCount_);
-    getLastCudaError("getSuccessors launch failed.");
-
-    pools.uintEdges.put(dMinScannedEdges);
-    pools.floatEdges.put(dMinScannedWeights);
-
-    // Remove cyclic successor dependencies. Note that there can be only
-    // two vertices in a cycle. See [1] for details.
-    removeCycles<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
-      (dSuccessors.get(), verticesCount_);
-    getLastCudaError("removeCycles launch failed.");
-
-    // Build up an array of startpoints for edges. As already stated,
-    // each group of edges denoted by "dEdgesFlags"
-    // has the same startpoint.
-    thrust::device_ptr<uint> dStartpoints = pools.uintEdges.get();
-
-    thrust::inclusive_scan(dEdgesFlags,
-                           dEdgesFlags + edgesCount_,
-                           dStartpoints);
-
-    addScalar<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
-      (dStartpoints.get(), -1, edgesCount_);
-    getLastCudaError("addScalar launch failed.");
-
-    // Shrink the chains of successors. New successors will eventually
-    // represent superpixels of the new level.
-    thrust::device_ptr<uint> dRepresentatives =
-      pools.uintVertices.get();
-
-    getRepresentatives
-      <<< gridDimsForVertices, blockDimsForVertices, 0 >>>
-      (dSuccessors.get(),
-       dRepresentatives.get(),
-       verticesCount_);
-    getLastCudaError("getRepresentatives launch failed.");
-
-    swap(dSuccessors, dRepresentatives);
-
-    pools.uintVertices.put(dRepresentatives);
-
-    // Group vertices by successors' indices.
-    thrust::device_ptr<uint> dClusteredVerticesIDs =
-      pools.uintVertices.get();
-
-    thrust::sequence(dClusteredVerticesIDs,
-                     dClusteredVerticesIDs + verticesCount_);
-
-    thrust::sort(
-      thrust::make_zip_iterator(
-        thrust::make_tuple(
-          thrust::device_ptr<uint>(dSuccessors),
-          thrust::device_ptr<uint>(dClusteredVerticesIDs))),
-      thrust::make_zip_iterator(
-        thrust::make_tuple(
-          thrust::device_ptr<uint>(dSuccessors +
-                                   verticesCount_),
-          thrust::device_ptr<uint>(dClusteredVerticesIDs +
-                                   verticesCount_))));
-
-    // Mark those groups.
-    thrust::device_ptr<uint> dVerticesFlags_ = pools.uintVertices.get();
-
-    thrust::fill(dVerticesFlags_, dVerticesFlags_ + verticesCount_, 0);
-
-    thrust::adjacent_difference(dSuccessors,
-                                dSuccessors + verticesCount_,
-                                dVerticesFlags_,
-                                thrust::not_equal_to<uint>());
-
-    cudaMemset((void *) dVerticesFlags_.get(), 0, sizeof(uint));
-
-    // Assign new indices to the successors (the indices of vertices
-    // at the new level).
-    thrust::device_ptr<uint> dNewVerticesIDs_ =
-      pools.uintVertices.get();
-
-    thrust::inclusive_scan(dVerticesFlags_,
-                           dVerticesFlags_ + verticesCount_,
-                           dNewVerticesIDs_);
-
-    pools.uintVertices.put(dVerticesFlags_);
-
-    // Now we can calculate number of resulting superpixels easily.
-    uint newVerticesCount;
-    cudaMemcpy(&newVerticesCount,
-               (dNewVerticesIDs_ + verticesCount_ - 1).get(),
-               sizeof(uint),
-               cudaMemcpyDeviceToHost);
-    ++newVerticesCount;
-
-    // There are two special cases when we can stop our algorithm:
-    // 1) number of vertices in the graph remained unchanged;
-    // 2) only one vertex remains.
-    if (newVerticesCount == verticesCount_)
-    {
-      return ALGORITHM_FINISHED;
-    }
-    else if (newVerticesCount == 1)
-    {
-      thrust::device_ptr<uint> dDummyVerticesOffsets =
-        pools.uintVertices.get();
-
-      cudaMemset((void *) dDummyVerticesOffsets.get(),
-                 0,
-                 sizeof(uint));
-
-      thrust::device_ptr<uint> dDummyVerticesIDs =
-        pools.uintVertices.get();
-
-      thrust::sequence(dDummyVerticesIDs,
-                       dDummyVerticesIDs + verticesCount_);
-
-      segmentations.addLevel(1,
-                             verticesCount_,
-                             dDummyVerticesOffsets,
-                             dDummyVerticesIDs);
-
-      return ALGORITHM_FINISHED;
-    }
-
-    // Calculate how old vertices IDs map to new vertices IDs.
-    thrust::device_ptr<uint> dVerticesMapping =
-      pools.uintVertices.get();
-
-    getVerticesMapping
-      <<< gridDimsForVertices, blockDimsForVertices, 0 >>>
-      (dClusteredVerticesIDs.get(),
-       dNewVerticesIDs_.get(),
-       dVerticesMapping.get(),
-       verticesCount_);
-    getLastCudaError("getVerticesMapping launch failed.");
-
-    pools.uintVertices.put(dNewVerticesIDs_);
-    pools.uintVertices.put(dClusteredVerticesIDs);
-    pools.uintVertices.put(dSuccessors);
-
-    // Invalidate self-loops in the reduced graph (the graph
-    // produced by merging all old vertices that have
-    // the same successor).
-    invalidateLoops<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
-      (dStartpoints.get(),
-       dVerticesMapping.get(),
-       dEdges_.get(),
-       edgesCount_);
-    getLastCudaError("invalidateLoops launch failed.");
-
-    // Calculate various information about the surviving
-    // (new startpoints IDs and IDs of edges) and
-    // non-surviving/contracted edges (their weights).
-    thrust::device_ptr<uint> dNewStartpoints = pools.uintEdges.get();
-    thrust::device_ptr<uint> dSurvivedEdgesIDs = pools.uintEdges.get();
-
-    calculateEdgesInfo<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
-      (dStartpoints.get(),
-       dVerticesMapping.get(),
-       dEdges_.get(),
-       dWeights_.get(),
-       dNewStartpoints.get(),
-       dSurvivedEdgesIDs.get(),
-       edgesCount_,
-       newVerticesCount);
-    getLastCudaError("calculateEdgesInfo launch failed.");
-
-    pools.uintEdges.put(dStartpoints);
-
-    // Group that information by the new startpoints IDs.
-    // Keep in mind that we want to build new (reduced) graph and apply
-    // the step of the algorithm to that one. Hence we need to
-    // preserve the structure of the original graph: neighbours and
-    // weights should be grouped by vertex.
-    thrust::sort(
-      thrust::make_zip_iterator(
-        thrust::make_tuple(dNewStartpoints,
-                           dSurvivedEdgesIDs)),
-      thrust::make_zip_iterator(
-        thrust::make_tuple(dNewStartpoints + edgesCount_,
-                           dSurvivedEdgesIDs + edgesCount_)));
-
-    // Find the group of contracted edges.
-    uint *invalidEdgesPtr =
-      thrust::find_if(
-        dNewStartpoints,
-        dNewStartpoints + edgesCount_,
-        IsGreaterEqualThan<uint>(newVerticesCount)).get();
-
-    // Calculate how many edges there are in the reduced graph.
-    uint validEdgesCount =
-      static_cast<uint>(invalidEdgesPtr - dNewStartpoints.get());
-
-    // Mark groups of edges corresponding to the same vertex in the
-    // reduced graph.
-    thrust::adjacent_difference(dNewStartpoints,
-                                dNewStartpoints + edgesCount_,
-                                dEdgesFlags,
-                                thrust::not_equal_to<uint>());
-
-    cudaMemset((void *) dEdgesFlags.get(), 0, sizeof(uint));
-    cudaMemset((void *) dEdgesFlags.get(), 1, 1);
-
-    pools.uintEdges.put(dNewStartpoints);
-
-    // Now we are able to build the reduced graph. See "Graph"
-    // class for the details on the graph's internal structure.
-
-    // Calculate vertices' offsets for the reduced graph.
-    thrust::copy_if(thrust::make_counting_iterator(0U),
-                    thrust::make_counting_iterator(validEdgesCount),
-                    dEdgesFlags,
-                    dVertices_,
-                    thrust::identity<uint>()).get();
-
-    pools.uintEdges.put(dEdgesFlags);
-
-    // Build up a neighbourhood for each vertex in the reduced graph
-    // (this includes recalculating edges' weights).
-    calculateThreadsDistribution(validEdgesCount,
-                                 blocksCount,
-                                 threadsPerBlockCount);
-    dim3 newGridDimsForEdges(blocksCount, 1, 1);
-    dim3 newBlockDimsForEdges(threadsPerBlockCount, 1, 1);
-
-    thrust::device_ptr<uint> dNewEdges = pools.uintEdges.get();
-    thrust::device_ptr<float> dNewWeights = pools.floatEdges.get();
-
-    makeNewEdges<<< newGridDimsForEdges,
-                    newBlockDimsForEdges,
-                    0 >>>
-      (dSurvivedEdgesIDs.get(),
-       dVerticesMapping.get(),
-       dEdges_.get(),
-       dWeights_.get(),
-       dNewEdges.get(),
-       dNewWeights.get(),
-       validEdgesCount);
-    getLastCudaError("makeNewEdges launch failed.");
-
-    swap(dEdges_, dNewEdges);
-    swap(dWeights_, dNewWeights);
-
-    pools.uintEdges.put(dNewEdges);
-    pools.floatEdges.put(dNewWeights);
-
-    pools.uintEdges.put(dSurvivedEdgesIDs);
-
-    // The graph's reconstruction is now finished.
-
-    // Build new level of the segmentation tree. It is a trivial task
-    // as we already have "dVerticesMapping" that contains all
-    // sufficient information about the vertices' transformations.
-    thrust::device_ptr<uint> dVerticesIDs =
-      pools.uintVertices.get();
-    thrust::device_ptr<uint> dNewVerticesOffsets =
-      pools.uintVertices.get();
-
-    thrust::sequence(dVerticesIDs, dVerticesIDs + verticesCount_);
-
-    thrust::sort_by_key(dVerticesMapping,
-                        dVerticesMapping + verticesCount_,
-                        dVerticesIDs);
-
-    thrust::unique_by_key_copy(dVerticesMapping,
-                               dVerticesMapping + verticesCount_,
-                               thrust::make_counting_iterator(0),
-                               thrust::make_discard_iterator(),
-                               dNewVerticesOffsets);
-
-    segmentations.addLevel(newVerticesCount,
-                           verticesCount_,
-                           dNewVerticesOffsets,
-                           dVerticesIDs);
-
-    pools.uintVertices.put(dVerticesIDs);
-    pools.uintVertices.put(dNewVerticesOffsets);
-    pools.uintVertices.put(dVerticesMapping);
-
-    // We can now safely set new counts for vertices and edges.
-    verticesCount_ = newVerticesCount;
-    edgesCount_ = validEdgesCount;
-
-    return ALGORITHM_NOT_FINISHED;
-  }
-
-  uint verticesCount_;
-  uint edgesCount_;
-
-  thrust::device_ptr<uint> dVertices_;
-  thrust::device_ptr<uint> dEdges_;
-  thrust::device_ptr<float> dWeights_;
-
-  thrust::device_ptr<uint> dOutputEdgesFlags_;
+        // Calculate how old vertices IDs map to new vertices IDs.
+        thrust::device_ptr<uint> dVerticesMapping = pools.uintVertices.get();
+
+        getVerticesMapping<<<gridDimsForVertices, blockDimsForVertices, 0>>>(
+            dClusteredVerticesIDs.get(), dNewVerticesIDs_.get(), dVerticesMapping.get(), verticesCount_);
+        getLastCudaError("getVerticesMapping launch failed.");
+
+        pools.uintVertices.put(dNewVerticesIDs_);
+        pools.uintVertices.put(dClusteredVerticesIDs);
+        pools.uintVertices.put(dSuccessors);
+
+        // Invalidate self-loops in the reduced graph (the graph
+        // produced by merging all old vertices that have
+        // the same successor).
+        invalidateLoops<<<gridDimsForEdges, blockDimsForEdges, 0>>>(
+            dStartpoints.get(), dVerticesMapping.get(), dEdges_.get(), edgesCount_);
+        getLastCudaError("invalidateLoops launch failed.");
+
+        // Calculate various information about the surviving
+        // (new startpoints IDs and IDs of edges) and
+        // non-surviving/contracted edges (their weights).
+        thrust::device_ptr<uint> dNewStartpoints   = pools.uintEdges.get();
+        thrust::device_ptr<uint> dSurvivedEdgesIDs = pools.uintEdges.get();
+
+        calculateEdgesInfo<<<gridDimsForEdges, blockDimsForEdges, 0>>>(dStartpoints.get(),
+                                                                       dVerticesMapping.get(),
+                                                                       dEdges_.get(),
+                                                                       dWeights_.get(),
+                                                                       dNewStartpoints.get(),
+                                                                       dSurvivedEdgesIDs.get(),
+                                                                       edgesCount_,
+                                                                       newVerticesCount);
+        getLastCudaError("calculateEdgesInfo launch failed.");
+
+        pools.uintEdges.put(dStartpoints);
+
+        // Group that information by the new startpoints IDs.
+        // Keep in mind that we want to build new (reduced) graph and apply
+        // the step of the algorithm to that one. Hence we need to
+        // preserve the structure of the original graph: neighbours and
+        // weights should be grouped by vertex.
+        thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(dNewStartpoints, dSurvivedEdgesIDs)),
+                     thrust::make_zip_iterator(
+                         thrust::make_tuple(dNewStartpoints + edgesCount_, dSurvivedEdgesIDs + edgesCount_)));
+
+        // Find the group of contracted edges.
+        uint *invalidEdgesPtr =
+            thrust::find_if(dNewStartpoints, dNewStartpoints + edgesCount_, IsGreaterEqualThan<uint>(newVerticesCount))
+                .get();
+
+        // Calculate how many edges there are in the reduced graph.
+        uint validEdgesCount = static_cast<uint>(invalidEdgesPtr - dNewStartpoints.get());
+
+        // Mark groups of edges corresponding to the same vertex in the
+        // reduced graph.
+        thrust::adjacent_difference(
+            dNewStartpoints, dNewStartpoints + edgesCount_, dEdgesFlags, thrust::not_equal_to<uint>());
+
+        cudaMemset((void *)dEdgesFlags.get(), 0, sizeof(uint));
+        cudaMemset((void *)dEdgesFlags.get(), 1, 1);
+
+        pools.uintEdges.put(dNewStartpoints);
+
+        // Now we are able to build the reduced graph. See "Graph"
+        // class for the details on the graph's internal structure.
+
+        // Calculate vertices' offsets for the reduced graph.
+        thrust::copy_if(thrust::make_counting_iterator(0U),
+                        thrust::make_counting_iterator(validEdgesCount),
+                        dEdgesFlags,
+                        dVertices_,
+                        thrust::identity<uint>())
+            .get();
+
+        pools.uintEdges.put(dEdgesFlags);
+
+        // Build up a neighbourhood for each vertex in the reduced graph
+        // (this includes recalculating edges' weights).
+        calculateThreadsDistribution(validEdgesCount, blocksCount, threadsPerBlockCount);
+        dim3 newGridDimsForEdges(blocksCount, 1, 1);
+        dim3 newBlockDimsForEdges(threadsPerBlockCount, 1, 1);
+
+        thrust::device_ptr<uint>  dNewEdges   = pools.uintEdges.get();
+        thrust::device_ptr<float> dNewWeights = pools.floatEdges.get();
+
+        makeNewEdges<<<newGridDimsForEdges, newBlockDimsForEdges, 0>>>(dSurvivedEdgesIDs.get(),
+                                                                       dVerticesMapping.get(),
+                                                                       dEdges_.get(),
+                                                                       dWeights_.get(),
+                                                                       dNewEdges.get(),
+                                                                       dNewWeights.get(),
+                                                                       validEdgesCount);
+        getLastCudaError("makeNewEdges launch failed.");
+
+        swap(dEdges_, dNewEdges);
+        swap(dWeights_, dNewWeights);
+
+        pools.uintEdges.put(dNewEdges);
+        pools.floatEdges.put(dNewWeights);
+
+        pools.uintEdges.put(dSurvivedEdgesIDs);
+
+        // The graph's reconstruction is now finished.
+
+        // Build new level of the segmentation tree. It is a trivial task
+        // as we already have "dVerticesMapping" that contains all
+        // sufficient information about the vertices' transformations.
+        thrust::device_ptr<uint> dVerticesIDs        = pools.uintVertices.get();
+        thrust::device_ptr<uint> dNewVerticesOffsets = pools.uintVertices.get();
+
+        thrust::sequence(dVerticesIDs, dVerticesIDs + verticesCount_);
+
+        thrust::sort_by_key(dVerticesMapping, dVerticesMapping + verticesCount_, dVerticesIDs);
+
+        thrust::unique_by_key_copy(dVerticesMapping,
+                                   dVerticesMapping + verticesCount_,
+                                   thrust::make_counting_iterator(0),
+                                   thrust::make_discard_iterator(),
+                                   dNewVerticesOffsets);
+
+        segmentations.addLevel(newVerticesCount, verticesCount_, dNewVerticesOffsets, dVerticesIDs);
+
+        pools.uintVertices.put(dVerticesIDs);
+        pools.uintVertices.put(dNewVerticesOffsets);
+        pools.uintVertices.put(dVerticesMapping);
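
Aside: the offsets computation above leans on copy_if over a counting iterator, which is effectively stream compaction of flagged indices. A host-side sketch, illustrative only and not part of the patch:

// Illustrative sketch (not part of the patch): compacting the indices at
// which a flag is set, via copy_if over a counting iterator, as done above
// to gather per-vertex edge offsets for the reduced graph.
#include <cstdio>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>

int main()
{
    const unsigned int flags[] = {1, 0, 0, 1, 0, 1};  // 1 marks the first edge of a vertex
    thrust::host_vector<unsigned int> f(flags, flags + 6), offsets(3);

    // Copy index i iff flags[i] != 0: yields {0, 3, 5}, one offset per vertex.
    thrust::copy_if(thrust::make_counting_iterator(0u),
                    thrust::make_counting_iterator(6u),
                    f.begin(),
                    offsets.begin(),
                    thrust::identity<unsigned int>());

    for (int i = 0; i < 3; ++i)
        printf("%u ", (unsigned)offsets[i]);  // prints: 0 3 5
    printf("\n");
    return 0;
}
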
+
+        // We can now safely set new counts for vertices and edges.
+        verticesCount_ = newVerticesCount;
+        edgesCount_    = validEdgesCount;
+
+        return ALGORITHM_NOT_FINISHED;
+    }
+
+    uint verticesCount_;
+    uint edgesCount_;
+
+    thrust::device_ptr<uint>  dVertices_;
+    thrust::device_ptr<uint>  dEdges_;
+    thrust::device_ptr<float> dWeights_;
+
+    thrust::device_ptr<uint> dOutputEdgesFlags_;
 };
 
 // Loads PPM image.
-int loadImage(const char *filename,
-              const char *executablePath,
-              vector<uchar3> &data,
-              uint &width,
-              uint &height)
+int loadImage(const char *filename, const char *executablePath, vector<uchar3> &data, uint &width, uint &height)
 {
     const char *imagePath = sdkFindFilePath(filename, executablePath);
 
-    if (imagePath == NULL)
-    {
+    if (imagePath == NULL) {
         return -1;
     }
 
-    uchar *dataHandle = NULL;
+    uchar       *dataHandle = NULL;
     unsigned int channels;
 
-    if (!__loadPPM(imagePath, &dataHandle, &width, &height, &channels))
-    {
+    if (!__loadPPM(imagePath, &dataHandle, &width, &height, &channels)) {
         return -1;
     }
 
-    data.assign(reinterpret_cast<uchar3 *>(dataHandle),
-                reinterpret_cast<uchar3 *>(dataHandle) + width * height);
+    data.assign(reinterpret_cast<uchar3 *>(dataHandle), reinterpret_cast<uchar3 *>(dataHandle) + width * height);
 
     free(reinterpret_cast<void *>(dataHandle));
 
@@ -951,10 +787,7 @@ inline float distance(const uchar3 &first, const uchar3 &second)
 }
 
 // Builds a net-graph for the image with 4-connected pixels.
-void buildGraph(const vector<uchar3> &image,
-                uint width,
-                uint height,
-                Graph &graph)
+void buildGraph(const vector<uchar3> &image, uint width, uint height, Graph &graph)
 {
     uint totalNodes = static_cast<uint>(image.size());
 
@@ -964,19 +797,16 @@ void buildGraph(const vector<uchar3> &image,
 
     uint edgesProcessed = 0;
 
-    for (uint y = 0; y < height; ++y)
-    {
-        for (uint x = 0; x < width; ++x)
-        {
-            uint nodeIndex = y * width + x;
+    for (uint y = 0; y < height; ++y) {
+        for (uint x = 0; x < width; ++x) {
+            uint          nodeIndex   = y * width + x;
             const uchar3 &centerPixel = image[nodeIndex];
 
             graph.vertices[nodeIndex] = edgesProcessed;
 
-            if (y > 0)
-            {
-                uint lowerNodeIndex = (y - 1) * width + x;
-                const uchar3 &lowerPixel = image[lowerNodeIndex];
+            if (y > 0) {
+                uint          lowerNodeIndex = (y - 1) * width + x;
+                const uchar3 &lowerPixel     = image[lowerNodeIndex];
 
                 graph.edges.push_back(lowerNodeIndex);
                 graph.weights.push_back(distance(centerPixel, lowerPixel));
@@ -984,10 +814,9 @@ void buildGraph(const vector<uchar3> &image,
                 ++edgesProcessed;
             }
 
-            if (y + 1 < height)
-            {
-                uint upperNodeIndex = (y + 1) * width + x;
-                const uchar3 &upperPixel = image[upperNodeIndex];
+            if (y + 1 < height) {
+                uint          upperNodeIndex = (y + 1) * width + x;
+                const uchar3 &upperPixel     = image[upperNodeIndex];
 
                 graph.edges.push_back(upperNodeIndex);
                 graph.weights.push_back(distance(centerPixel, upperPixel));
@@ -995,10 +824,9 @@ void buildGraph(const vector<uchar3> &image,
                 ++edgesProcessed;
             }
 
-            if (x > 0)
-            {
-                uint leftNodeIndex = y * width + x - 1;
-                const uchar3 &leftPixel = image[leftNodeIndex];
+            if (x > 0) {
+                uint          leftNodeIndex = y * width + x - 1;
+                const uchar3 &leftPixel     = image[leftNodeIndex];
 
                 graph.edges.push_back(leftNodeIndex);
                 graph.weights.push_back(distance(centerPixel, leftPixel));
@@ -1006,10 +834,9 @@ void buildGraph(const vector<uchar3> &image,
                 ++edgesProcessed;
             }
 
-            if (x + 1 < width)
-            {
-                uint rightNodeIndex = y * width + x + 1;
-                const uchar3 &rightPixel = image[rightNodeIndex];
+            if (x + 1 < width) {
+                uint          rightNodeIndex = y * width + x + 1;
+                const uchar3 &rightPixel     = image[rightNodeIndex];
 
                 graph.edges.push_back(rightNodeIndex);
                 graph.weights.push_back(distance(centerPixel, rightPixel));
@@ -1020,28 +847,23 @@ void buildGraph(const vector<uchar3> &image,
     }
 }
 
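Aside: buildGraph emits two directed edges per neighbouring pixel pair, so a W x H 4-connected grid yields 4*W*H - 2*W - 2*H directed edges. A quick count check, illustrative only and not part of the patch:

// Illustrative count check (not part of the patch): number of directed
// edges emitted by a 4-connected W x H grid builder like buildGraph above.
#include <cassert>

int main()
{
    const int W = 3, H = 2;
    int edges = 0;
    for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x) {
            if (y > 0)     ++edges;  // edge to the row above
            if (y + 1 < H) ++edges;  // edge to the row below
            if (x > 0)     ++edges;  // edge to the left
            if (x + 1 < W) ++edges;  // edge to the right
        }
    assert(edges == 4 * W * H - 2 * W - 2 * H);  // 14 for a 3 x 2 grid
    return 0;
}
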
-static char *kDefaultImageName = (char*)"test.ppm";
+static char *kDefaultImageName = (char *)"test.ppm";
 
 int main(int argc, char **argv)
 {
     vector<uchar3> image;
-    uint imageWidth, imageHeight;
-    char *imageName;
+    uint           imageWidth, imageHeight;
+    char          *imageName;
 
     printf("%s Starting...\n\n", argv[0]);
 
     imageName = (char *)kDefaultImageName;
 
-    if (checkCmdLineFlag(argc, (const char **) argv, "file"))
-    {
-        getCmdLineArgumentString(argc,
-                                 (const char **) argv,
-                                 "file",
-                                 &imageName);
+    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
+        getCmdLineArgumentString(argc, (const char **)argv, "file", &imageName);
     }
 
-    if (loadImage(imageName, argv[0], image, imageWidth, imageHeight) != 0)
-    {
+    if (loadImage(imageName, argv[0], image, imageWidth, imageHeight) != 0) {
         printf("Failed to open <%s>, program exit...\n", imageName);
         exit(EXIT_FAILURE);
     }
@@ -1057,7 +879,7 @@ int main(int argc, char **argv)
     cout.flush();
 
     SegmentationTreeBuilder algo;
-    float elapsedTime = algo.run(graph, segmentations);
+    float                   elapsedTime = algo.run(graph, segmentations);
 
     cout << "done in " << elapsedTime << " (ms)" << endl;
 
@@ -1067,16 +889,8 @@ int main(int argc, char **argv)
 
     bool bResults[2];
 
-    bResults[0] = sdkComparePPM("level_00.ppm",
-                                sdkFindFilePath("ref_00.ppm", argv[0]),
-                                5.0f,
-                                0.15f,
-                                false);
-    bResults[1] = sdkComparePPM("level_09.ppm",
-                                sdkFindFilePath("ref_09.ppm", argv[0]),
-                                5.0f,
-                                0.15f,
-                                false);
+    bResults[0] = sdkComparePPM("level_00.ppm", sdkFindFilePath("ref_00.ppm", argv[0]), 5.0f, 0.15f, false);
+    bResults[1] = sdkComparePPM("level_09.ppm", sdkFindFilePath("ref_09.ppm", argv[0]), 5.0f, 0.15f, false);
 
     exit((bResults[0] && bResults[1]) ? EXIT_SUCCESS : EXIT_FAILURE);
 }
diff --git a/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_integral_image.cuh b/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_integral_image.cuh
index 90b22c6e..9f281216 100644
--- a/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_integral_image.cuh
+++ b/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_integral_image.cuh
@@ -24,133 +24,139 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-
+
 #include <cooperative_groups.h>
 
 namespace cg = cooperative_groups;
 
 // Utility function to extract unsigned chars from an
 // unsigned integer
-__device__ uchar4 uint_to_uchar4(const unsigned int in) {
-  return make_uchar4((in & 0x000000ff) >> 0, (in & 0x0000ff00) >> 8,
-                     (in & 0x00ff0000) >> 16, (in & 0xff000000) >> 24);
+__device__ uchar4 uint_to_uchar4(const unsigned int in)
+{
+    return make_uchar4(
+        (in & 0x000000ff) >> 0, (in & 0x0000ff00) >> 8, (in & 0x00ff0000) >> 16, (in & 0xff000000) >> 24);
 }
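
Aside: uint_to_uchar4 simply splits a 32-bit word into its four bytes. A tiny host-side check of the same mask-and-shift logic, illustrative only and not part of the patch:

// Illustrative host-side check (not part of the patch) of the mask-and-shift
// byte extraction performed by uint_to_uchar4 above.
#include <cassert>
#include <cstdint>

int main()
{
    uint32_t in = 0xAABBCCDDu;
    uint8_t  x  = (in & 0x000000ffu) >> 0;   // 0xDD, lowest byte
    uint8_t  y  = (in & 0x0000ff00u) >> 8;   // 0xCC
    uint8_t  z  = (in & 0x00ff0000u) >> 16;  // 0xBB
    uint8_t  w  = (in & 0xff000000u) >> 24;  // 0xAA, highest byte
    // Reassembling the bytes gives the original word back.
    assert(((uint32_t)w << 24 | (uint32_t)z << 16 | (uint32_t)y << 8 | x) == in);
    return 0;
}
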
 
 // Utility for dealing with vector data at different levels.
-struct packed_result {
-  uint4 x, y, z, w;
+struct packed_result
+{
+    uint4 x, y, z, w;
 };
 
-__device__ packed_result get_prefix_sum(const uint4 &data,
-                                        const cg::thread_block &cta) {
-  const auto tile = cg::tiled_partition<32>(cta);
+__device__ packed_result get_prefix_sum(const uint4 &data, const cg::thread_block &cta)
+{
+    const auto tile = cg::tiled_partition<32>(cta);
 
-  __shared__ unsigned int sums[128];
-  const unsigned int lane_id = tile.thread_rank();
-  const unsigned int warp_id = tile.meta_group_rank();
+    __shared__ unsigned int sums[128];
+    const unsigned int      lane_id = tile.thread_rank();
+    const unsigned int      warp_id = tile.meta_group_rank();
 
-  unsigned int result[16] = {};
-  {
-    const uchar4 a = uint_to_uchar4(data.x);
-    const uchar4 b = uint_to_uchar4(data.y);
-    const uchar4 c = uint_to_uchar4(data.z);
-    const uchar4 d = uint_to_uchar4(data.w);
+    unsigned int result[16] = {};
+    {
+        const uchar4 a = uint_to_uchar4(data.x);
+        const uchar4 b = uint_to_uchar4(data.y);
+        const uchar4 c = uint_to_uchar4(data.z);
+        const uchar4 d = uint_to_uchar4(data.w);
 
-    result[0] = a.x;
-    result[1] = a.x + a.y;
-    result[2] = a.x + a.y + a.z;
-    result[3] = a.x + a.y + a.z + a.w;
+        result[0] = a.x;
+        result[1] = a.x + a.y;
+        result[2] = a.x + a.y + a.z;
+        result[3] = a.x + a.y + a.z + a.w;
 
-    result[4] = b.x;
-    result[5] = b.x + b.y;
-    result[6] = b.x + b.y + b.z;
-    result[7] = b.x + b.y + b.z + b.w;
+        result[4] = b.x;
+        result[5] = b.x + b.y;
+        result[6] = b.x + b.y + b.z;
+        result[7] = b.x + b.y + b.z + b.w;
 
-    result[8] = c.x;
-    result[9] = c.x + c.y;
-    result[10] = c.x + c.y + c.z;
-    result[11] = c.x + c.y + c.z + c.w;
+        result[8]  = c.x;
+        result[9]  = c.x + c.y;
+        result[10] = c.x + c.y + c.z;
+        result[11] = c.x + c.y + c.z + c.w;
 
-    result[12] = d.x;
-    result[13] = d.x + d.y;
-    result[14] = d.x + d.y + d.z;
-    result[15] = d.x + d.y + d.z + d.w;
-  }
-
-#pragma unroll
-  for (unsigned int i = 4; i <= 7; i++) result[i] += result[3];
-
-#pragma unroll
-  for (unsigned int i = 8; i <= 11; i++) result[i] += result[7];
-
-#pragma unroll
-  for (unsigned int i = 12; i <= 15; i++) result[i] += result[11];
-
-  unsigned int sum = result[15];
-
-  // the prefix sum for each thread's 16 value is computed,
-  // now the final sums (result[15]) need to be shared
-  // with the other threads and add. To do this,
-  // the __shfl_up() instruction is used and a shuffle scan
-  // operation is performed to distribute the sums to the correct
-  // threads
-
-#pragma unroll
-  for (unsigned int i = 1; i < 32; i *= 2) {
-    const unsigned int n = tile.shfl_up(sum, i);
-
-    if (lane_id >= i) {
-#pragma unroll
-      for (unsigned int j = 0; j < 16; j++) {
-        result[j] += n;
-      }
-
-      sum += n;
-    }
-  }
-
-  // Now the final sum for the warp must be shared
-  // between warps. This is done by each warp
-  // having a thread store to shared memory, then
-  // having some other warp load the values and
-  // compute a prefix sum, again by using __shfl_up.
-  // The results are uniformly added back to the warps.
-  // last thread in the warp holding sum of the warp
-  // places that in shared
-  if (tile.thread_rank() == (tile.size() - 1)) {
-    sums[warp_id] = result[15];
-  }
-
-  __syncthreads();
-
-  if (warp_id == 0) {
-    unsigned int warp_sum = sums[lane_id];
-
-#pragma unroll
-    for (unsigned int i = 1; i <= 16; i *= 2) {
-      const unsigned int n = tile.shfl_up(warp_sum, i);
-
-      if (lane_id >= i) warp_sum += n;
+        result[12] = d.x;
+        result[13] = d.x + d.y;
+        result[14] = d.x + d.y + d.z;
+        result[15] = d.x + d.y + d.z + d.w;
     }
 
-    sums[lane_id] = warp_sum;
-  }
-
-  __syncthreads();
-
-  // fold in unused warp
-  if (warp_id > 0) {
-    const unsigned int blockSum = sums[warp_id - 1];
+#pragma unroll
+    for (unsigned int i = 4; i <= 7; i++)
+        result[i] += result[3];
 
 #pragma unroll
-    for (unsigned int i = 0; i < 16; i++) {
-      result[i] += blockSum;
-    }
-  }
+    for (unsigned int i = 8; i <= 11; i++)
+        result[i] += result[7];
 
-  packed_result out;
-  memcpy(&out, result, sizeof(out));
-  return out;
+#pragma unroll
+    for (unsigned int i = 12; i <= 15; i++)
+        result[i] += result[11];
+
+    unsigned int sum = result[15];
+
+    // the prefix sum for each thread's 16 value is computed,
+    // now the final sums (result[15]) need to be shared
+    // with the other threads and add. To do this,
+    // the __shfl_up() instruction is used and a shuffle scan
+    // operation is performed to distribute the sums to the correct
+    // threads
+
+#pragma unroll
+    for (unsigned int i = 1; i < 32; i *= 2) {
+        const unsigned int n = tile.shfl_up(sum, i);
+
+        if (lane_id >= i) {
+#pragma unroll
+            for (unsigned int j = 0; j < 16; j++) {
+                result[j] += n;
+            }
+
+            sum += n;
+        }
+    }
+
+    // Now the final sum for the warp must be shared
+    // between warps. This is done by each warp
+    // having a thread store to shared memory, then
+    // having some other warp load the values and
+    // compute a prefix sum, again by using __shfl_up.
+    // The results are uniformly added back to the warps.
+    // last thread in the warp holding sum of the warp
+    // places that in shared
+    if (tile.thread_rank() == (tile.size() - 1)) {
+        sums[warp_id] = result[15];
+    }
+
+    __syncthreads();
+
+    if (warp_id == 0) {
+        unsigned int warp_sum = sums[lane_id];
+
+#pragma unroll
+        for (unsigned int i = 1; i <= 16; i *= 2) {
+            const unsigned int n = tile.shfl_up(warp_sum, i);
+
+            if (lane_id >= i)
+                warp_sum += n;
+        }
+
+        sums[lane_id] = warp_sum;
+    }
+
+    __syncthreads();
+
+    // fold in unused warp
+    if (warp_id > 0) {
+        const unsigned int blockSum = sums[warp_id - 1];
+
+#pragma unroll
+        for (unsigned int i = 0; i < 16; i++) {
+            result[i] += blockSum;
+        }
+    }
+
+    packed_result out;
+    memcpy(&out, result, sizeof(out));
+    return out;
 }
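
Aside: the core warp-level inclusive scan that get_prefix_sum builds on can also be written standalone. A minimal sketch, illustrative only and not part of the patch, assuming a full 32-thread warp:

// Illustrative standalone warp-inclusive-scan kernel (not part of the patch),
// using the same log-step __shfl_up_sync pattern as get_prefix_sum above.
#include <cstdio>

__global__ void warpInclusiveScan(int *data)
{
    const unsigned int lane  = threadIdx.x & 31;
    int                value = data[threadIdx.x];

#pragma unroll
    for (unsigned int offset = 1; offset < 32; offset *= 2) {
        // Fetch the value from the lane `offset` positions below us.
        int n = __shfl_up_sync(0xffffffffu, value, offset);
        // Lanes with no neighbour that far down keep their value unchanged.
        if (lane >= offset)
            value += n;
    }

    data[threadIdx.x] = value;  // data[i] now holds data[0] + ... + data[i]
}

int main()
{
    int h[32], *d;
    for (int i = 0; i < 32; ++i)
        h[i] = 1;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    warpInclusiveScan<<<1, 32>>>(d);
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
    printf("last = %d\n", h[31]);  // expect 32 for all-ones input
    cudaFree(d);
    return 0;
}
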
 
 // This function demonstrates some uses of the shuffle instruction
@@ -159,93 +165,94 @@ __device__ packed_result get_prefix_sum(const uint4 &data,
 // The approach is two pass, a horizontal (scanline) then a vertical
 // (column) pass.
 // This is the horizontal pass kernel.
-__global__ void shfl_intimage_rows(const uint4 *img, uint4 *integral_image) {
-  const auto cta = cg::this_thread_block();
-  const auto tile = cg::tiled_partition<32>(cta);
+__global__ void shfl_intimage_rows(const uint4 *img, uint4 *integral_image)
+{
+    const auto cta  = cg::this_thread_block();
+    const auto tile = cg::tiled_partition<32>(cta);
 
-  const unsigned int id = threadIdx.x;
-  // pointer to head of current scanline
-  const uint4 *scanline = &img[blockIdx.x * 120];
-  packed_result result = get_prefix_sum(scanline[id], cta);
+    const unsigned int id = threadIdx.x;
+    // pointer to head of current scanline
+    const uint4  *scanline = &img[blockIdx.x * 120];
+    packed_result result   = get_prefix_sum(scanline[id], cta);
 
-  // This access helper allows packed_result to stay optimized as registers
-  // rather than spill to stack
-  auto idxToElem = [&result](unsigned int idx) -> const uint4 {
-    switch (idx) {
-      case 0:
-        return result.x;
-      case 1:
-        return result.y;
-      case 2:
-        return result.z;
-      case 3:
-        return result.w;
-    }
-    return {};
-  };
+    // This access helper allows packed_result to stay optimized as registers
+    // rather than spill to stack
+    auto idxToElem = [&result](unsigned int idx) -> const uint4 {
+        switch (idx) {
+        case 0:
+            return result.x;
+        case 1:
+            return result.y;
+        case 2:
+            return result.z;
+        case 3:
+            return result.w;
+        }
+        return {};
+    };
 
-  // assemble result
-  // Each thread has 16 values to write, which are
-  // now integer data (to avoid overflow). Instead of
-  // each thread writing consecutive uint4s, the
-  // approach shown here experiments using
-  // the shuffle command to reformat the data
-  // inside the registers so that each thread holds
-  // consecutive data to be written so larger contiguous
-  // segments can be assembled for writing.
-  /*
-    For example data that needs to be written as
+    // assemble result
+    // Each thread has 16 values to write, which are
+    // now integer data (to avoid overflow). Instead of
+    // each thread writing consecutive uint4s, the
+    // approach shown here experiments using
+    // the shuffle command to reformat the data
+    // inside the registers so that each thread holds
+    // consecutive data to be written so larger contiguous
+    // segments can be assembled for writing.
+    /*
+      For example data that needs to be written as
 
-    GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
-    but is stored in registers (r0..r3), in four threads (0..3) as:
+      GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
+      but is stored in registers (r0..r3), in four threads (0..3) as:
 
-    threadId   0  1  2  3
-      r0       x0 y0 z0 w0
-      r1       x1 y1 z1 w1
-      r2       x2 y2 z2 w2
-      r3       x3 y3 z3 w3
+      threadId   0  1  2  3
+        r0       x0 y0 z0 w0
+        r1       x1 y1 z1 w1
+        r2       x2 y2 z2 w2
+        r3       x3 y3 z3 w3
 
-    after apply __shfl_xor operations to move data between registers r1..r3:
+      after apply __shfl_xor operations to move data between registers r1..r3:
 
-    threadId  00 01 10 11
-              x0 y0 z0 w0
-     xor(01)->y1 x1 w1 z1
-     xor(10)->z2 w2 x2 y2
-     xor(11)->w3 z3 y3 x3
+      threadId  00 01 10 11
+                x0 y0 z0 w0
+       xor(01)->y1 x1 w1 z1
+       xor(10)->z2 w2 x2 y2
+       xor(11)->w3 z3 y3 x3
 
-    and now x0..x3, and z0..z3 can be written out in order by all threads.
+      and now x0..x3, and z0..z3 can be written out in order by all threads.
 
-    In the current code, each register above is actually representing
-    four integers to be written as uint4's to GMEM.
-  */
+      In the current code, each register above is actually representing
+      four integers to be written as uint4's to GMEM.
+    */
 
-  const unsigned int idMask = id & 3;
-  const unsigned int idSwizzle = (id + 2) & 3;
-  const unsigned int idShift = (id >> 2) << 4;
-  const unsigned int blockOffset = blockIdx.x * 480;
+    const unsigned int idMask      = id & 3;
+    const unsigned int idSwizzle   = (id + 2) & 3;
+    const unsigned int idShift     = (id >> 2) << 4;
+    const unsigned int blockOffset = blockIdx.x * 480;
 
-  // Use CG tile to warp shuffle vector types
-  result.y = tile.shfl_xor(result.y, 1);
-  result.z = tile.shfl_xor(result.z, 2);
-  result.w = tile.shfl_xor(result.w, 3);
+    // Use CG tile to warp shuffle vector types
+    result.y = tile.shfl_xor(result.y, 1);
+    result.z = tile.shfl_xor(result.z, 2);
+    result.w = tile.shfl_xor(result.w, 3);
 
-  // First batch
-  integral_image[blockOffset + idMask + idShift] = idxToElem(idMask);
-  // Second batch offset by 2
-  integral_image[blockOffset + idSwizzle + idShift + 8] = idxToElem(idSwizzle);
+    // First batch
+    integral_image[blockOffset + idMask + idShift] = idxToElem(idMask);
+    // Second batch offset by 2
+    integral_image[blockOffset + idSwizzle + idShift + 8] = idxToElem(idSwizzle);
 
-  // continuing from the above example,
-  // this use of __shfl_xor() places the y0..y3 and w0..w3 data
-  // in order.
-  result.x = tile.shfl_xor(result.x, 1);
-  result.y = tile.shfl_xor(result.y, 1);
-  result.z = tile.shfl_xor(result.z, 1);
-  result.w = tile.shfl_xor(result.w, 1);
+    // continuing from the above example,
+    // this use of __shfl_xor() places the y0..y3 and w0..w3 data
+    // in order.
+    result.x = tile.shfl_xor(result.x, 1);
+    result.y = tile.shfl_xor(result.y, 1);
+    result.z = tile.shfl_xor(result.z, 1);
+    result.w = tile.shfl_xor(result.w, 1);
 
-  // First batch
-  integral_image[blockOffset + idMask + idShift + 4] = idxToElem(idMask);
-  // Second batch offset by 2
-  integral_image[blockOffset + idSwizzle + idShift + 12] = idxToElem(idSwizzle);
+    // First batch
+    integral_image[blockOffset + idMask + idShift + 4] = idxToElem(idMask);
+    // Second batch offset by 2
+    integral_image[blockOffset + idSwizzle + idShift + 12] = idxToElem(idSwizzle);
 }
 
 // This kernel computes columnwise prefix sums. When the data input is
@@ -258,53 +265,55 @@ __global__ void shfl_intimage_rows(const uint4 *img, uint4 *integral_image) {
 // The final set of sums from the block is then propagated, with the block
 // computing "down" the image and adding the running sum to the local
 // block sums.
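
Aside: what this kernel computes is just a per-column running sum; the shared-memory and shuffle choreography is purely for speed. A plain CPU reference, illustrative only and not part of the patch:

// Illustrative CPU reference (not part of the patch) for the columnwise pass:
// after this loop, img[y * width + x] holds the sum of column x over rows 0..y.
#include <vector>

void columnPrefixSums(std::vector<unsigned int> &img, int width, int height)
{
    for (int y = 1; y < height; ++y)
        for (int x = 0; x < width; ++x)
            img[y * width + x] += img[(y - 1) * width + x];
}
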
-__global__ void shfl_vertical_shfl(unsigned int *img, int width, int height) {
-  __shared__ unsigned int sums[32][9];
-  int tidx = blockIdx.x * blockDim.x + threadIdx.x;
-  // int warp_id = threadIdx.x / warpSize ;
-  unsigned int lane_id = tidx % 8;
-  // int rows_per_thread = (height / blockDim. y) ;
-  // int start_row = rows_per_thread * threadIdx.y;
-  unsigned int stepSum = 0;
-  unsigned int mask = 0xffffffff;
+__global__ void shfl_vertical_shfl(unsigned int *img, int width, int height)
+{
+    __shared__ unsigned int sums[32][9];
+    int                     tidx = blockIdx.x * blockDim.x + threadIdx.x;
+    // int warp_id = threadIdx.x / warpSize ;
+    unsigned int lane_id = tidx % 8;
+    // int rows_per_thread = (height / blockDim. y) ;
+    // int start_row = rows_per_thread * threadIdx.y;
+    unsigned int stepSum = 0;
+    unsigned int mask    = 0xffffffff;
 
-  sums[threadIdx.x][threadIdx.y] = 0;
-  __syncthreads();
-
-  for (int step = 0; step < 135; step++) {
-    unsigned int sum = 0;
-    unsigned int *p = img + (threadIdx.y + step * 8) * width + tidx;
-
-    sum = *p;
-    sums[threadIdx.x][threadIdx.y] = sum;
+    sums[threadIdx.x][threadIdx.y] = 0;
+    __syncthreads();
 
+    for (int step = 0; step < 135; step++) {
+        unsigned int  sum = 0;
+        unsigned int *p   = img + (threadIdx.y + step * 8) * width + tidx;
+
+        sum = *p;
+        sums[threadIdx.x][threadIdx.y] = sum;
+        __syncthreads();
 
-    // place into SMEM
-    // shfl scan reduce the SMEM, reformating so the column
-    // sums are computed in a warp
-    // then read out properly
-    int partial_sum = 0;
-    int j = threadIdx.x % 8;
-    int k = threadIdx.x / 8 + threadIdx.y * 4;
+        // place into SMEM
+        // shfl scan reduce the SMEM, reformatting so the column
+        // sums are computed in a warp
+        // then read out properly
+        int partial_sum = 0;
+        int j           = threadIdx.x % 8;
+        int k           = threadIdx.x / 8 + threadIdx.y * 4;
 
-    partial_sum = sums[k][j];
+        partial_sum = sums[k][j];
 
-    for (int i = 1; i <= 8; i *= 2) {
-      int n = __shfl_up_sync(mask, partial_sum, i, 32);
+        for (int i = 1; i <= 8; i *= 2) {
+            int n = __shfl_up_sync(mask, partial_sum, i, 32);
 
-      if (lane_id >= i) partial_sum += n;
+            if (lane_id >= i)
+                partial_sum += n;
+        }
+
+        sums[k][j] = partial_sum;
+        __syncthreads();
+
+        if (threadIdx.y > 0) {
+            sum += sums[threadIdx.x][threadIdx.y - 1];
+        }
+
+        sum += stepSum;
+        stepSum += sums[threadIdx.x][blockDim.y - 1];
+        __syncthreads();
+        *p = sum;
     }
-
-    sums[k][j] = partial_sum;
-    __syncthreads();
-
-    if (threadIdx.y > 0) {
-      sum += sums[threadIdx.x][threadIdx.y - 1];
-    }
-
-    sum += stepSum;
-    stepSum += sums[threadIdx.x][blockDim.y - 1];
-    __syncthreads();
-    *p = sum;
-  }
 }
diff --git a/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu b/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
index 29e8e32b..acda9b0e 100644
--- a/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
+++ b/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
@@ -33,12 +33,11 @@
 // using the shuffle intrinsic is provided, where the shuffle
 // scan operation and shuffle xor operations are used
 
-#include <stdio.h>
-
 #include <cuda_runtime.h>
-
 #include <helper_cuda.h>
 #include <helper_functions.h>
+#include <stdio.h>
+
 #include "shfl_integral_image.cuh"
 
 // Scan using shfl - takes log2(n) steps
@@ -53,366 +52,368 @@
 // final sum in global memory and prefix summing that via another kernel call,
 // then uniformly adding across the input data via the uniform_add<<<>>> kernel.
 
-__global__ void shfl_scan_test(int *data, int width, int *partial_sums = NULL) {
-  extern __shared__ int sums[];
-  int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-  int lane_id = id % warpSize;
-  // determine a warp_id within a block
-  int warp_id = threadIdx.x / warpSize;
+__global__ void shfl_scan_test(int *data, int width, int *partial_sums = NULL)
+{
+    extern __shared__ int sums[];
+    int                   id      = ((blockIdx.x * blockDim.x) + threadIdx.x);
+    int                   lane_id = id % warpSize;
+    // determine a warp_id within a block
+    int warp_id = threadIdx.x / warpSize;
 
-  // Below is the basic structure of using a shfl instruction
-  // for a scan.
-  // Record "value" as a variable - we accumulate it along the way
-  int value = data[id];
+    // Below is the basic structure of using a shfl instruction
+    // for a scan.
+    // Record "value" as a variable - we accumulate it along the way
+    int value = data[id];
 
-  // Now accumulate in log steps up the chain
-  // compute sums, with another thread's value who is
-  // distance delta away (i). Note
-  // those threads where the thread 'i' away would have
-  // been out of bounds of the warp are unaffected. This
-  // creates the scan sum.
+    // Now accumulate in log steps up the chain
+    // compute sums, with another thread's value who is
+    // distance delta away (i). Note
+    // those threads where the thread 'i' away would have
+    // been out of bounds of the warp are unaffected. This
+    // creates the scan sum.
 #pragma unroll
-  for (int i = 1; i <= width; i *= 2) {
-    unsigned int mask = 0xffffffff;
-    int n = __shfl_up_sync(mask, value, i, width);
+    for (int i = 1; i <= width; i *= 2) {
+        unsigned int mask = 0xffffffff;
+        int          n    = __shfl_up_sync(mask, value, i, width);
 
-    if (lane_id >= i) value += n;
-  }
-
-  // value now holds the scan value for the individual thread
-  // next sum the largest values for each warp
-
-  // write the sum of the warp to smem
-  if (threadIdx.x % warpSize == warpSize - 1) {
-    sums[warp_id] = value;
-  }
-
-  __syncthreads();
-
-  //
-  // scan sum the warp sums
-  // the same shfl scan operation, but performed on warp sums
-  //
-  if (warp_id == 0 && lane_id < (blockDim.x / warpSize)) {
-    int warp_sum = sums[lane_id];
-
-    int mask = (1 << (blockDim.x / warpSize)) - 1;
-    for (int i = 1; i <= (blockDim.x / warpSize); i *= 2) {
-      int n = __shfl_up_sync(mask, warp_sum, i, (blockDim.x / warpSize));
-
-      if (lane_id >= i) warp_sum += n;
+        if (lane_id >= i)
+            value += n;
     }
 
-    sums[lane_id] = warp_sum;
-  }
+    // value now holds the scan value for the individual thread
+    // next sum the largest values for each warp
 
-  __syncthreads();
+    // write the sum of the warp to smem
+    if (threadIdx.x % warpSize == warpSize - 1) {
+        sums[warp_id] = value;
+    }
 
-  // perform a uniform add across warps in the block
-  // read neighbouring warp's sum and add it to threads value
-  int blockSum = 0;
+    __syncthreads();
 
-  if (warp_id > 0) {
-    blockSum = sums[warp_id - 1];
-  }
+    //
+    // scan sum the warp sums
+    // the same shfl scan operation, but performed on warp sums
+    //
+    if (warp_id == 0 && lane_id < (blockDim.x / warpSize)) {
+        int warp_sum = sums[lane_id];
 
-  value += blockSum;
+        int mask = (1 << (blockDim.x / warpSize)) - 1;
+        for (int i = 1; i <= (blockDim.x / warpSize); i *= 2) {
+            int n = __shfl_up_sync(mask, warp_sum, i, (blockDim.x / warpSize));
 
-  // Now write out our result
-  data[id] = value;
+            if (lane_id >= i)
+                warp_sum += n;
+        }
 
-  // last thread has sum, write write out the block's sum
-  if (partial_sums != NULL && threadIdx.x == blockDim.x - 1) {
-    partial_sums[blockIdx.x] = value;
-  }
+        sums[lane_id] = warp_sum;
+    }
+
+    __syncthreads();
+
+    // perform a uniform add across warps in the block
+    // read neighbouring warp's sum and add it to threads value
+    int blockSum = 0;
+
+    if (warp_id > 0) {
+        blockSum = sums[warp_id - 1];
+    }
+
+    value += blockSum;
+
+    // Now write out our result
+    data[id] = value;
+
+    // last thread has sum, write out the block's sum
+    if (partial_sums != NULL && threadIdx.x == blockDim.x - 1) {
+        partial_sums[blockIdx.x] = value;
+    }
 }
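
Aside: the three-launch scheme used below (scan each block, scan the per-block sums, then uniformly add each preceding block's total back) is easiest to see on the CPU. A model, illustrative only and not part of the patch:

// Illustrative CPU model (not part of the patch) of the three-phase scan:
// per-block inclusive scan, scan of the block sums, then uniform add
// (which is exactly what the uniform_add kernel below performs).
#include <cstdio>
#include <vector>

int main()
{
    const int        blockSize = 4;
    std::vector<int> data(16, 1), partial(16 / blockSize);

    for (size_t b = 0; b < partial.size(); ++b) {  // per-block inclusive scan
        for (int i = 1; i < blockSize; ++i)
            data[b * blockSize + i] += data[b * blockSize + i - 1];
        partial[b] = data[b * blockSize + blockSize - 1];
    }
    for (size_t b = 1; b < partial.size(); ++b)  // scan the block sums
        partial[b] += partial[b - 1];
    for (size_t b = 1; b < partial.size(); ++b)  // uniform add; block 0 needs none
        for (int i = 0; i < blockSize; ++i)
            data[b * blockSize + i] += partial[b - 1];

    printf("%d\n", data.back());  // expect 16 for all-ones input
    return 0;
}
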
 
 // Uniform add: add partial sums array
-__global__ void uniform_add(int *data, int *partial_sums, int len) {
-  __shared__ int buf;
-  int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+__global__ void uniform_add(int *data, int *partial_sums, int len)
+{
+    __shared__ int buf;
+    int            id = ((blockIdx.x * blockDim.x) + threadIdx.x);
 
-  if (id > len) return;
+    if (id > len)
+        return;
 
-  if (threadIdx.x == 0) {
-    buf = partial_sums[blockIdx.x];
-  }
+    if (threadIdx.x == 0) {
+        buf = partial_sums[blockIdx.x];
+    }
 
-  __syncthreads();
-  data[id] += buf;
+    __syncthreads();
+    data[id] += buf;
 }
 
-static unsigned int iDivUp(unsigned int dividend, unsigned int divisor) {
-  return ((dividend % divisor) == 0) ? (dividend / divisor)
-                                     : (dividend / divisor + 1);
+static unsigned int iDivUp(unsigned int dividend, unsigned int divisor)
+{
+    return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1);
 }
 
 // This function verifies the shuffle scan result, for the simple
 // prefix sum case.
-bool CPUverify(int *h_data, int *h_result, int n_elements) {
-  // cpu verify
-  for (int i = 0; i < n_elements - 1; i++) {
-    h_data[i + 1] = h_data[i] + h_data[i + 1];
-  }
-
-  int diff = 0;
-
-  for (int i = 0; i < n_elements; i++) {
-    diff += h_data[i] - h_result[i];
-  }
-
-  printf("CPU verify result diff (GPUvsCPU) = %d\n", diff);
-  bool bTestResult = false;
-
-  if (diff == 0) bTestResult = true;
-
-  StopWatchInterface *hTimer = NULL;
-  sdkCreateTimer(&hTimer);
-  sdkResetTimer(&hTimer);
-  sdkStartTimer(&hTimer);
-
-  for (int j = 0; j < 100; j++)
+bool CPUverify(int *h_data, int *h_result, int n_elements)
+{
+    // cpu verify
     for (int i = 0; i < n_elements - 1; i++) {
-      h_data[i + 1] = h_data[i] + h_data[i + 1];
+        h_data[i + 1] = h_data[i] + h_data[i + 1];
     }
 
-  sdkStopTimer(&hTimer);
-  double cput = sdkGetTimerValue(&hTimer);
-  printf("CPU sum (naive) took %f ms\n", cput / 100);
-  return bTestResult;
+    int diff = 0;
+
+    for (int i = 0; i < n_elements; i++) {
+        diff += h_data[i] - h_result[i];
+    }
+
+    printf("CPU verify result diff (GPUvsCPU) = %d\n", diff);
+    bool bTestResult = false;
+
+    if (diff == 0)
+        bTestResult = true;
+
+    StopWatchInterface *hTimer = NULL;
+    sdkCreateTimer(&hTimer);
+    sdkResetTimer(&hTimer);
+    sdkStartTimer(&hTimer);
+
+    for (int j = 0; j < 100; j++)
+        for (int i = 0; i < n_elements - 1; i++) {
+            h_data[i + 1] = h_data[i] + h_data[i + 1];
+        }
+
+    sdkStopTimer(&hTimer);
+    double cput = sdkGetTimerValue(&hTimer);
+    printf("CPU sum (naive) took %f ms\n", cput / 100);
+    return bTestResult;
 }
 
 // this verifies the row scan result for synthetic data of all 1's
-unsigned int verifyDataRowSums(unsigned int *h_image, int w, int h) {
-  unsigned int diff = 0;
+unsigned int verifyDataRowSums(unsigned int *h_image, int w, int h)
+{
+    unsigned int diff = 0;
 
-  for (int j = 0; j < h; j++) {
-    for (int i = 0; i < w; i++) {
-      int gold = i + 1;
-      diff +=
-          abs(static_cast<int>(gold) - static_cast<int>(h_image[j * w + i]));
+    for (int j = 0; j < h; j++) {
+        for (int i = 0; i < w; i++) {
+            int gold = i + 1;
+            diff += abs(static_cast<int>(gold) - static_cast<int>(h_image[j * w + i]));
+        }
     }
-  }
 
-  return diff;
+    return diff;
 }
 
-bool shuffle_simple_test(int argc, char **argv) {
-  int *h_data, *h_partial_sums, *h_result;
-  int *d_data, *d_partial_sums;
-  const int n_elements = 65536;
-  int sz = sizeof(int) * n_elements;
-  int cuda_device = 0;
+bool shuffle_simple_test(int argc, char **argv)
+{
+    int      *h_data, *h_partial_sums, *h_result;
+    int      *d_data, *d_partial_sums;
+    const int n_elements = 65536;
+    int       sz         = sizeof(int) * n_elements;
+    int       cuda_device = 0;
 
-  printf("Starting shfl_scan\n");
+    printf("Starting shfl_scan\n");
 
-  // use command-line specified CUDA device, otherwise use device with highest
-  // Gflops/s
-  cuda_device = findCudaDevice(argc, (const char **)argv);
+    // use command-line specified CUDA device, otherwise use device with highest
+    // Gflops/s
+    cuda_device = findCudaDevice(argc, (const char **)argv);
 
-  cudaDeviceProp deviceProp;
-  checkCudaErrors(cudaGetDevice(&cuda_device));
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDevice(&cuda_device));
 
-  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
 
-  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
-         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
+           deviceProp.major,
+           deviceProp.minor,
+           deviceProp.multiProcessorCount);
 
-  // __shfl intrinsic needs SM 3.0 or higher
-  if (deviceProp.major < 3) {
-    printf("> __shfl() intrinsic requires device SM 3.0+\n");
-    printf("> Waiving test.\n");
-    exit(EXIT_WAIVED);
-  }
+    // __shfl intrinsic needs SM 3.0 or higher
+    if (deviceProp.major < 3) {
+        printf("> __shfl() intrinsic requires device SM 3.0+\n");
+        printf("> Waiving test.\n");
+        exit(EXIT_WAIVED);
+    }
 
-  checkCudaErrors(cudaMallocHost(reinterpret_cast<int **>(&h_data),
-                                 sizeof(int) * n_elements));
-  checkCudaErrors(cudaMallocHost(reinterpret_cast<int **>(&h_result),
-                                 sizeof(int) * n_elements));
+    checkCudaErrors(cudaMallocHost(reinterpret_cast<int **>(&h_data), sizeof(int) * n_elements));
+    checkCudaErrors(cudaMallocHost(reinterpret_cast<int **>(&h_result), sizeof(int) * n_elements));
 
-  // initialize data:
-  printf("Computing Simple Sum test\n");
-  printf("---------------------------------------------------\n");
+    // initialize data:
+    printf("Computing Simple Sum test\n");
+    printf("---------------------------------------------------\n");
 
-  printf("Initialize test data [1, 1, 1...]\n");
+    printf("Initialize test data [1, 1, 1...]\n");
 
-  for (int i = 0; i < n_elements; i++) {
-    h_data[i] = 1;
-  }
+    for (int i = 0; i < n_elements; i++) {
+        h_data[i] = 1;
+    }
 
-  int blockSize = 256;
-  int gridSize = n_elements / blockSize;
-  int nWarps = blockSize / 32;
-  int shmem_sz = nWarps * sizeof(int);
-  int n_partialSums = n_elements / blockSize;
-  int partial_sz = n_partialSums * sizeof(int);
+    int blockSize     = 256;
+    int gridSize      = n_elements / blockSize;
+    int nWarps        = blockSize / 32;
+    int shmem_sz      = nWarps * sizeof(int);
+    int n_partialSums = n_elements / blockSize;
+    int partial_sz    = n_partialSums * sizeof(int);
 
-  printf("Scan summation for %d elements, %d partial sums\n", n_elements,
-         n_elements / blockSize);
+    printf("Scan summation for %d elements, %d partial sums\n", n_elements, n_elements / blockSize);
 
-  int p_blockSize = min(n_partialSums, blockSize);
-  int p_gridSize = iDivUp(n_partialSums, p_blockSize);
-  printf("Partial summing %d elements with %d blocks of size %d\n",
-         n_partialSums, p_gridSize, p_blockSize);
+    int p_blockSize = min(n_partialSums, blockSize);
+    int p_gridSize  = iDivUp(n_partialSums, p_blockSize);
+    printf("Partial summing %d elements with %d blocks of size %d\n", n_partialSums, p_gridSize, p_blockSize);
 
-  // initialize a timer
-  cudaEvent_t start, stop;
-  checkCudaErrors(cudaEventCreate(&start));
-  checkCudaErrors(cudaEventCreate(&stop));
-  float et = 0;
-  float inc = 0;
+    // initialize a timer
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+    float et  = 0;
+    float inc = 0;
 
-  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
-  checkCudaErrors(
-      cudaMalloc(reinterpret_cast<void **>(&d_partial_sums), partial_sz));
-  checkCudaErrors(cudaMemset(d_partial_sums, 0, partial_sz));
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_partial_sums), partial_sz));
+    checkCudaErrors(cudaMemset(d_partial_sums, 0, partial_sz));
 
-  checkCudaErrors(
-      cudaMallocHost(reinterpret_cast<int **>(&h_partial_sums), partial_sz));
-  checkCudaErrors(cudaMemcpy(d_data, h_data, sz, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMallocHost(reinterpret_cast<int **>(&h_partial_sums), partial_sz));
+    checkCudaErrors(cudaMemcpy(d_data, h_data, sz, cudaMemcpyHostToDevice));
 
-  checkCudaErrors(cudaEventRecord(start, 0));
-  shfl_scan_test<<<gridSize, blockSize, shmem_sz>>>(d_data, 32, d_partial_sums);
-  shfl_scan_test<<<p_gridSize, p_blockSize, shmem_sz>>>(d_partial_sums, 32);
-  uniform_add<<<gridSize - 1, blockSize>>>(d_data + blockSize, d_partial_sums,
-                                           n_elements);
-  checkCudaErrors(cudaEventRecord(stop, 0));
-  checkCudaErrors(cudaEventSynchronize(stop));
-  checkCudaErrors(cudaEventElapsedTime(&inc, start, stop));
-  et += inc;
+    checkCudaErrors(cudaEventRecord(start, 0));
+    shfl_scan_test<<<gridSize, blockSize, shmem_sz>>>(d_data, 32, d_partial_sums);
+    shfl_scan_test<<<p_gridSize, p_blockSize, shmem_sz>>>(d_partial_sums, 32);
+    uniform_add<<<gridSize - 1, blockSize>>>(d_data + blockSize, d_partial_sums, n_elements);
+    checkCudaErrors(cudaEventRecord(stop, 0));
+    checkCudaErrors(cudaEventSynchronize(stop));
+    checkCudaErrors(cudaEventElapsedTime(&inc, start, stop));
+    et += inc;
 
-  checkCudaErrors(cudaMemcpy(h_result, d_data, sz, cudaMemcpyDeviceToHost));
-  checkCudaErrors(cudaMemcpy(h_partial_sums, d_partial_sums, partial_sz,
-                             cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_result, d_data, sz, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_partial_sums, d_partial_sums, partial_sz, cudaMemcpyDeviceToHost));
 
-  printf("Test Sum: %d\n", h_partial_sums[n_partialSums - 1]);
-  printf("Time (ms): %f\n", et);
-  printf("%d elements scanned in %f ms -> %f MegaElements/s\n", n_elements, et,
-         n_elements / (et / 1000.0f) / 1000000.0f);
+    printf("Test Sum: %d\n", h_partial_sums[n_partialSums - 1]);
+    printf("Time (ms): %f\n", et);
+    printf("%d elements scanned in %f ms -> %f MegaElements/s\n",
+           n_elements,
+           et,
+           n_elements / (et / 1000.0f) / 1000000.0f);
 
-  bool bTestResult = CPUverify(h_data, h_result, n_elements);
+    bool bTestResult = CPUverify(h_data, h_result, n_elements);
 
-  checkCudaErrors(cudaFreeHost(h_data));
-  checkCudaErrors(cudaFreeHost(h_result));
-  checkCudaErrors(cudaFreeHost(h_partial_sums));
-  checkCudaErrors(cudaFree(d_data));
-  checkCudaErrors(cudaFree(d_partial_sums));
+    checkCudaErrors(cudaFreeHost(h_data));
+    checkCudaErrors(cudaFreeHost(h_result));
+    checkCudaErrors(cudaFreeHost(h_partial_sums));
+    checkCudaErrors(cudaFree(d_data));
+    checkCudaErrors(cudaFree(d_partial_sums));
 
-  return bTestResult;
+    return bTestResult;
 }
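
Aside: note that the uniform_add launch above starts at d_data + blockSize, because block 0 already holds correct scan results and needs no correction. A small index check, illustrative only and not part of the patch, assuming the shifted launch shape shown above:

// Illustrative index check (not part of the patch): with a launch of the form
// uniform_add<<<gridSize - 1, blockSize>>>(d_data + blockSize, partial, n),
// the thread with global id `id` touches element blockSize + id of the
// original array and adds partial_sums[blockIdx.x], i.e. the scanned total
// of all blocks that precede it.
#include <cassert>

int main()
{
    const int blockSize = 256;
    for (int blk = 0; blk < 3; ++blk)
        for (int t = 0; t < blockSize; ++t) {
            int id      = blk * blockSize + t;         // id inside the shifted launch
            int element = blockSize + id;              // element of the original array
            int donor   = blk;                         // partial_sums index added in
            assert(element / blockSize == donor + 1);  // element sits in block donor+1
        }
    return 0;
}
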
 
 // This function tests creation of an integral image using
 // synthetic data, of size 1920x1080 pixels greyscale.
-bool shuffle_integral_image_test() {
-  char *d_data;
-  unsigned int *h_image;
-  unsigned int *d_integral_image;
-  int w = 1920;
-  int h = 1080;
-  int n_elements = w * h;
-  int sz = sizeof(unsigned int) * n_elements;
+bool shuffle_integral_image_test()
+{
+    char         *d_data;
+    unsigned int *h_image;
+    unsigned int *d_integral_image;
+    int           w          = 1920;
+    int           h          = 1080;
+    int           n_elements = w * h;
+    int           sz         = sizeof(unsigned int) * n_elements;
 
-  printf("\nComputing Integral Image Test on size %d x %d synthetic data\n", w,
-         h);
-  printf("---------------------------------------------------\n");
-  checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_image), sz));
-  // fill test "image" with synthetic 1's data
-  memset(h_image, 0, sz);
+    printf("\nComputing Integral Image Test on size %d x %d synthetic data\n", w, h);
+    printf("---------------------------------------------------\n");
+    checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_image), sz));
+    // fill test "image" with synthetic 1's data
+    memset(h_image, 0, sz);
 
-  // each thread handles 16 values, use 1 block/row
-  int blockSize = iDivUp(w, 16);
-  // launch 1 block / row
-  int gridSize = h;
+    // each thread handles 16 values, use 1 block/row
+    int blockSize = iDivUp(w, 16);
+    // launch 1 block / row
+    int gridSize = h;
 
-  // Create a synthetic image for testing
-  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
-  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_integral_image),
-                             n_elements * sizeof(int) * 4));
-  checkCudaErrors(cudaMemset(d_data, 1, sz));
-  checkCudaErrors(cudaMemset(d_integral_image, 0, sz));
+    // Create a synthetic image for testing
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
+    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_integral_image), n_elements * sizeof(int) * 4));
+    checkCudaErrors(cudaMemset(d_data, 1, sz));
+    checkCudaErrors(cudaMemset(d_integral_image, 0, sz));
 
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  float et = 0;
-  unsigned int err;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    float        et = 0;
+    unsigned int err;
 
-  // Execute scan line prefix sum kernel, and time it
-  cudaEventRecord(start);
-  shfl_intimage_rows<<<gridSize, blockSize>>>(
-      reinterpret_cast<uint4 *>(d_data),
-      reinterpret_cast<uint4 *>(d_integral_image));
-  cudaEventRecord(stop);
-  checkCudaErrors(cudaEventSynchronize(stop));
-  checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
-  printf("Method: Fast  Time (GPU Timer): %f ms ", et);
+    // Execute scan line prefix sum kernel, and time it
+    cudaEventRecord(start);
+    shfl_intimage_rows<<<gridSize, blockSize>>>(reinterpret_cast<uint4 *>(d_data),
+                                                reinterpret_cast<uint4 *>(d_integral_image));
+    cudaEventRecord(stop);
+    checkCudaErrors(cudaEventSynchronize(stop));
+    checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
+    printf("Method: Fast  Time (GPU Timer): %f ms ", et);
 
-  // verify the scan line results
-  checkCudaErrors(
-      cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
-  err = verifyDataRowSums(h_image, w, h);
-  printf("Diff = %d\n", err);
+    // verify the scan line results
+    checkCudaErrors(cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
+    err = verifyDataRowSums(h_image, w, h);
+    printf("Diff = %d\n", err);
 
-  // Execute column prefix sum kernel and time it
-  dim3 blockSz(32, 8);
-  dim3 testGrid(w / blockSz.x, 1);
+    // Execute column prefix sum kernel and time it
+    dim3 blockSz(32, 8);
+    dim3 testGrid(w / blockSz.x, 1);
 
-  cudaEventRecord(start);
-  shfl_vertical_shfl<<<testGrid, blockSz>>>((unsigned int *)d_integral_image, w,
-                                            h);
-  cudaEventRecord(stop);
-  checkCudaErrors(cudaEventSynchronize(stop));
-  checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
-  printf("Method: Vertical Scan  Time (GPU Timer): %f ms ", et);
+    cudaEventRecord(start);
+    shfl_vertical_shfl<<<testGrid, blockSz>>>((unsigned int *)d_integral_image, w, h);
+    cudaEventRecord(stop);
+    checkCudaErrors(cudaEventSynchronize(stop));
+    checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
+    printf("Method: Vertical Scan  Time (GPU Timer): %f ms ", et);
 
-  // Verify the column results
-  checkCudaErrors(
-      cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
-  printf("\n");
+    // Verify the column results
+    checkCudaErrors(cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
+    printf("\n");
 
-  int finalSum = h_image[w * h - 1];
-  printf("CheckSum: %d, (expect %dx%d=%d)\n", finalSum, w, h, w * h);
+    int finalSum = h_image[w * h - 1];
+    printf("CheckSum: %d, (expect %dx%d=%d)\n", finalSum, w, h, w * h);
 
-  checkCudaErrors(cudaFree(d_data));
-  checkCudaErrors(cudaFree(d_integral_image));
-  checkCudaErrors(cudaFreeHost(h_image));
-  // verify final sum: if the final value in the corner is the same as the size
-  // of the buffer (all 1's) then the integral image was generated successfully
-  return (finalSum == w * h) ? true : false;
+    checkCudaErrors(cudaFree(d_data));
+    checkCudaErrors(cudaFree(d_integral_image));
+    checkCudaErrors(cudaFreeHost(h_image));
+    // verify final sum: if the final value in the corner is the same as the size
+    // of the buffer (all 1's) then the integral image was generated successfully
+    return (finalSum == w * h) ? true : false;
 }
 
-int main(int argc, char *argv[]) {
-  // Initialization. The shuffle intrinsic is not available on SM < 3.0
-  // so waive the test if the hardware is not present.
-  int cuda_device = 0;
+ int cuda_device = 0; - printf("Starting shfl_scan\n"); + printf("Starting shfl_scan\n"); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - cuda_device = findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + cuda_device = findCudaDevice(argc, (const char **)argv); - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDevice(&cuda_device)); + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDevice(&cuda_device)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); - printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n", - deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); + printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n", + deviceProp.major, + deviceProp.minor, + deviceProp.multiProcessorCount); - // __shfl intrinsic needs SM 3.0 or higher - if (deviceProp.major < 3) { - printf("> __shfl() intrinsic requires device SM 3.0+\n"); - printf("> Waiving test.\n"); - exit(EXIT_WAIVED); - } + // __shfl intrinsic needs SM 3.0 or higher + if (deviceProp.major < 3) { + printf("> __shfl() intrinsic requires device SM 3.0+\n"); + printf("> Waiving test.\n"); + exit(EXIT_WAIVED); + } - bool bTestResult = true; - bool simpleTest = shuffle_simple_test(argc, argv); - bool intTest = shuffle_integral_image_test(); + bool bTestResult = true; + bool simpleTest = shuffle_simple_test(argc, argv); + bool intTest = shuffle_integral_image_test(); - bTestResult = simpleTest & intTest; + bTestResult = simpleTest & intTest; - exit((bTestResult) ? EXIT_SUCCESS : EXIT_FAILURE); + exit((bTestResult) ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/2_Concepts_and_Techniques/shfl_scan/util.h b/Samples/2_Concepts_and_Techniques/shfl_scan/util.h index 1bea6969..13092ec7 100644 --- a/Samples/2_Concepts_and_Techniques/shfl_scan/util.h +++ b/Samples/2_Concepts_and_Techniques/shfl_scan/util.h @@ -29,33 +29,33 @@ #define SAMPLES_SHFL_SCAN_UTIL_H_ // Macro to catch CUDA errors in kernel launches -#define CHECK_LAUNCH_ERROR() \ - do { \ - /* Check synchronous errors, i.e. pre-launch */ \ - cudaError_t err = cudaGetLastError(); \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ - __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - /* Check asynchronous errors, i.e. kernel failed (ULF) */ \ - err = cudaDeviceSynchronize(); \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s!\n", __FILE__, \ - __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define CHECK_LAUNCH_ERROR() \ + do { \ + /* Check synchronous errors, i.e. pre-launch */ \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf( \ + stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + /* Check asynchronous errors, i.e. 
kernel failed (ULF) */ \ + err = cudaDeviceSynchronize(); \ + if (cudaSuccess != err) { \ + fprintf( \ + stderr, "Cuda error in file '%s' in line %i : %s!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) // Macro to catch CUDA errors in CUDA runtime calls -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ - __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf( \ + stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -#endif // SAMPLES_SHFL_SCAN_UTIL_H_ +#endif // SAMPLES_SHFL_SCAN_UTIL_H_ diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/bitonicSort.cu b/Samples/2_Concepts_and_Techniques/sortingNetworks/bitonicSort.cu index 46916254..8848aba7 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/bitonicSort.cu +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/bitonicSort.cu @@ -25,69 +25,64 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -//Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm +// Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm #include <assert.h> #include <cooperative_groups.h> namespace cg = cooperative_groups; #include <helper_cuda.h> -#include "sortingNetworks_common.h" + #include "sortingNetworks_common.cuh" +#include "sortingNetworks_common.h" //////////////////////////////////////////////////////////////////////////////// // Monolithic bitonic sort kernel for short arrays fitting into shared memory //////////////////////////////////////////////////////////////////////////////// -__global__ void bitonicSortShared(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength, uint dir) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Shared memory storage for one or more short vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; +__global__ void +bitonicSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint dir) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Shared memory storage for one or more short vectors + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; - // Offset to the beginning of subbatch and load data - d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = d_SrcKey[0]; - s_val[threadIdx.x + 0] = d_SrcVal[0]; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + // Offset to the beginning of subbatch and load data + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[0]; + s_val[threadIdx.x + 0] = d_SrcVal[0]; +
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; - for (uint size = 2; size < arrayLength; size <<= 1) { - // Bitonic merge - uint ddd = dir ^ ((threadIdx.x & (size / 2)) != 0); + for (uint size = 2; size < arrayLength; size <<= 1) { + // Bitonic merge + uint ddd = dir ^ ((threadIdx.x & (size / 2)) != 0); - for (uint stride = size / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], ddd); + for (uint stride = size / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], ddd); + } } - } - // ddd == dir for the last bitonic merge step - { - for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], dir); + // ddd == dir for the last bitonic merge step + { + for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir); + } } - } - cg::sync(cta); - d_DstKey[0] = s_key[threadIdx.x + 0]; - d_DstVal[0] = s_val[threadIdx.x + 0]; - d_DstKey[(SHARED_SIZE_LIMIT / 2)] = - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - d_DstVal[(SHARED_SIZE_LIMIT / 2)] = - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + cg::sync(cta); + d_DstKey[0] = s_key[threadIdx.x + 0]; + d_DstVal[0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; } //////////////////////////////////////////////////////////////////////////////// @@ -98,179 +93,179 @@ __global__ void bitonicSortShared(uint *d_DstKey, uint *d_DstVal, // even / odd subarrays being sorted in opposite directions // Bitonic merge accepts both // Ascending | descending or descending | ascending sorted pairs -__global__ void bitonicSortShared1(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Shared memory storage for current subarray - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; +__global__ void bitonicSortShared1(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Shared memory storage for current subarray + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; - // Offset to the beginning of subarray and load data - d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = d_SrcKey[0]; - s_val[threadIdx.x + 0] = d_SrcVal[0]; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + // Offset to 
the beginning of subarray and load data + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[0]; + s_val[threadIdx.x + 0] = d_SrcVal[0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; - for (uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) { - // Bitonic merge - uint ddd = (threadIdx.x & (size / 2)) != 0; + for (uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) { + // Bitonic merge + uint ddd = (threadIdx.x & (size / 2)) != 0; - for (uint stride = size / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], ddd); + for (uint stride = size / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], ddd); + } } - } - // Odd / even arrays of SHARED_SIZE_LIMIT elements - // sorted in opposite directions - uint ddd = blockIdx.x & 1; - { - for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], ddd); + // Odd / even arrays of SHARED_SIZE_LIMIT elements + // sorted in opposite directions + uint ddd = blockIdx.x & 1; + { + for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], ddd); + } } - } - cg::sync(cta); - d_DstKey[0] = s_key[threadIdx.x + 0]; - d_DstVal[0] = s_val[threadIdx.x + 0]; - d_DstKey[(SHARED_SIZE_LIMIT / 2)] = - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - d_DstVal[(SHARED_SIZE_LIMIT / 2)] = - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + cg::sync(cta); + d_DstKey[0] = s_key[threadIdx.x + 0]; + d_DstVal[0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; } // Bitonic merge iteration for stride >= SHARED_SIZE_LIMIT -__global__ void bitonicMergeGlobal(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength, uint size, uint stride, - uint dir) { - uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x; - uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); +__global__ void bitonicMergeGlobal(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint arrayLength, + uint size, + uint stride, + uint dir) +{ + uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x; + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); - // Bitonic merge - uint ddd = dir ^ ((comparatorI & (size / 2)) != 0); - uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + // Bitonic merge + uint ddd = dir ^ ((comparatorI & (size / 2)) != 0); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); - uint keyA = d_SrcKey[pos + 0]; - uint valA = d_SrcVal[pos + 0]; - uint 
keyB = d_SrcKey[pos + stride]; - uint valB = d_SrcVal[pos + stride]; + uint keyA = d_SrcKey[pos + 0]; + uint valA = d_SrcVal[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + uint valB = d_SrcVal[pos + stride]; - Comparator(keyA, valA, keyB, valB, ddd); + Comparator(keyA, valA, keyB, valB, ddd); - d_DstKey[pos + 0] = keyA; - d_DstVal[pos + 0] = valA; - d_DstKey[pos + stride] = keyB; - d_DstVal[pos + stride] = valB; + d_DstKey[pos + 0] = keyA; + d_DstVal[pos + 0] = valA; + d_DstKey[pos + stride] = keyB; + d_DstVal[pos + stride] = valB; } // Combined bitonic merge steps for // size > SHARED_SIZE_LIMIT and stride = [1 .. SHARED_SIZE_LIMIT / 2] -__global__ void bitonicMergeShared(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength, uint size, uint dir) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Shared memory storage for current subarray - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; +__global__ void bitonicMergeShared(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint arrayLength, + uint size, + uint dir) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Shared memory storage for current subarray + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; - d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = d_SrcKey[0]; - s_val[threadIdx.x + 0] = d_SrcVal[0]; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[0]; + s_val[threadIdx.x + 0] = d_SrcVal[0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; - // Bitonic merge - uint comparatorI = - UMAD(blockIdx.x, blockDim.x, threadIdx.x) & ((arrayLength / 2) - 1); - uint ddd = dir ^ ((comparatorI & (size / 2)) != 0); + // Bitonic merge + uint comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x) & ((arrayLength / 2) - 1); + uint ddd = dir ^ ((comparatorI & (size / 2)) != 0); + + for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], ddd); + } - for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], ddd); - } - - cg::sync(cta); - d_DstKey[0] = s_key[threadIdx.x + 0]; - d_DstVal[0] = s_val[threadIdx.x + 0]; - d_DstKey[(SHARED_SIZE_LIMIT / 2)] = - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - d_DstVal[(SHARED_SIZE_LIMIT / 2)] = - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstKey[0] = s_key[threadIdx.x + 0]; + d_DstVal[0] = s_val[threadIdx.x + 0]; + 
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; } //////////////////////////////////////////////////////////////////////////////// // Interface function //////////////////////////////////////////////////////////////////////////////// // Helper function (also used by odd-even merge sort) -extern "C" uint factorRadix2(uint *log2L, uint L) { - if (!L) { - *log2L = 0; - return 0; - } else { - for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) - ; +extern "C" uint factorRadix2(uint *log2L, uint L) +{ + if (!L) { + *log2L = 0; + return 0; + } + else { + for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) + ; - return L; - } + return L; + } } -extern "C" uint bitonicSort(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, - uint *d_SrcVal, uint batchSize, uint arrayLength, - uint dir) { - // Nothing to sort - if (arrayLength < 2) return 0; +extern "C" uint +bitonicSort(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint batchSize, uint arrayLength, uint dir) +{ + // Nothing to sort + if (arrayLength < 2) + return 0; - // Only power-of-two array lengths are supported by this implementation - uint log2L; - uint factorizationRemainder = factorRadix2(&log2L, arrayLength); - assert(factorizationRemainder == 1); + // Only power-of-two array lengths are supported by this implementation + uint log2L; + uint factorizationRemainder = factorRadix2(&log2L, arrayLength); + assert(factorizationRemainder == 1); - dir = (dir != 0); + dir = (dir != 0); - uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; - uint threadCount = SHARED_SIZE_LIMIT / 2; + uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; + uint threadCount = SHARED_SIZE_LIMIT / 2; - if (arrayLength <= SHARED_SIZE_LIMIT) { - assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0); - bitonicSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, - d_SrcVal, arrayLength, dir); - } else { - bitonicSortShared1<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, - d_SrcKey, d_SrcVal); + if (arrayLength <= SHARED_SIZE_LIMIT) { + assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0); + bitonicSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir); + } + else { + bitonicSortShared1<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal); - for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1) - for (unsigned stride = size / 2; stride > 0; stride >>= 1) - if (stride >= SHARED_SIZE_LIMIT) { - bitonicMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>( - d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, - dir); - } else { - bitonicMergeShared<<<blockCount, threadCount>>>( - d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, dir); - break; - } - } + for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1) + for (unsigned stride = size / 2; stride > 0; stride >>= 1) + if (stride >= SHARED_SIZE_LIMIT) { + bitonicMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>( + d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir); + } + else { + bitonicMergeShared<<<blockCount, threadCount>>>( + d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, dir); + break; + } + } - return threadCount; + return threadCount; } diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/main.cpp b/Samples/2_Concepts_and_Techniques/sortingNetworks/main.cpp index 7a658c28..0d5afb88 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/main.cpp +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/main.cpp @@ -50,114
+50,107 @@ //////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - cudaError_t error; - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + cudaError_t error; + printf("%s Starting...\n\n", argv[0]); - printf("Starting up CUDA context...\n"); - int dev = findCudaDevice(argc, (const char **)argv); + printf("Starting up CUDA context...\n"); + int dev = findCudaDevice(argc, (const char **)argv); - uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; - uint *d_InputKey, *d_InputVal, *d_OutputKey, *d_OutputVal; - StopWatchInterface *hTimer = NULL; + uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; + uint *d_InputKey, *d_InputVal, *d_OutputKey, *d_OutputVal; + StopWatchInterface *hTimer = NULL; - const uint N = 1048576; - const uint DIR = 0; - const uint numValues = 65536; - const uint numIterations = 1; + const uint N = 1048576; + const uint DIR = 0; + const uint numValues = 65536; + const uint numIterations = 1; - printf("Allocating and initializing host arrays...\n\n"); - sdkCreateTimer(&hTimer); - h_InputKey = (uint *)malloc(N * sizeof(uint)); - h_InputVal = (uint *)malloc(N * sizeof(uint)); - h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); - h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); - srand(2001); + printf("Allocating and initializing host arrays...\n\n"); + sdkCreateTimer(&hTimer); + h_InputKey = (uint *)malloc(N * sizeof(uint)); + h_InputVal = (uint *)malloc(N * sizeof(uint)); + h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); + h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); + srand(2001); - for (uint i = 0; i < N; i++) { - h_InputKey[i] = rand() % numValues; - h_InputVal[i] = i; - } - - printf("Allocating and initializing CUDA arrays...\n\n"); - error = cudaMalloc((void **)&d_InputKey, N * sizeof(uint)); - checkCudaErrors(error); - error = cudaMalloc((void **)&d_InputVal, N * sizeof(uint)); - checkCudaErrors(error); - error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint)); - checkCudaErrors(error); - error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint)); - checkCudaErrors(error); - error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), - cudaMemcpyHostToDevice); - checkCudaErrors(error); - error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), - cudaMemcpyHostToDevice); - checkCudaErrors(error); - - int flag = 1; - printf("Running GPU bitonic sort (%u identical iterations)...\n\n", - numIterations); - - for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) { - printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, - N / arrayLength); - error = cudaDeviceSynchronize(); - checkCudaErrors(error); - - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - uint threadCount = 0; - - for (uint i = 0; i < numIterations; i++) - threadCount = bitonicSort(d_OutputKey, d_OutputVal, d_InputKey, - d_InputVal, N / arrayLength, arrayLength, DIR); - - error = cudaDeviceSynchronize(); - checkCudaErrors(error); - - sdkStopTimer(&hTimer); - printf("Average time: %f ms\n\n", - sdkGetTimerValue(&hTimer) / numIterations); - - if (arrayLength == N) { - double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations; - printf( - "sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f " - "s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * (double)arrayLength / dTimeSecs), dTimeSecs, arrayLength, 1, - 
threadCount); + for (uint i = 0; i < N; i++) { + h_InputKey[i] = rand() % numValues; + h_InputVal[i] = i; } - printf("\nValidating the results...\n"); - printf("...reading back GPU results\n"); - error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), - cudaMemcpyDeviceToHost); + printf("Allocating and initializing CUDA arrays...\n\n"); + error = cudaMalloc((void **)&d_InputKey, N * sizeof(uint)); checkCudaErrors(error); - error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), - cudaMemcpyDeviceToHost); + error = cudaMalloc((void **)&d_InputVal, N * sizeof(uint)); + checkCudaErrors(error); + error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint)); + checkCudaErrors(error); + error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint)); + checkCudaErrors(error); + error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice); + checkCudaErrors(error); + error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); - int keysFlag = - validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, - arrayLength, numValues, DIR); - int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, - N / arrayLength, arrayLength); - flag = flag && keysFlag && valuesFlag; + int flag = 1; + printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations); - printf("\n"); - } + for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) { + printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength); + error = cudaDeviceSynchronize(); + checkCudaErrors(error); - printf("Shutting down...\n"); - sdkDeleteTimer(&hTimer); - cudaFree(d_OutputVal); - cudaFree(d_OutputKey); - cudaFree(d_InputVal); - cudaFree(d_InputKey); - free(h_OutputValGPU); - free(h_OutputKeyGPU); - free(h_InputVal); - free(h_InputKey); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + uint threadCount = 0; - exit(flag ? EXIT_SUCCESS : EXIT_FAILURE); + for (uint i = 0; i < numIterations; i++) + threadCount = + bitonicSort(d_OutputKey, d_OutputVal, d_InputKey, d_InputVal, N / arrayLength, arrayLength, DIR); + + error = cudaDeviceSynchronize(); + checkCudaErrors(error); + + sdkStopTimer(&hTimer); + printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations); + + if (arrayLength == N) { + double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations; + printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f " + "s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * (double)arrayLength / dTimeSecs), + dTimeSecs, + arrayLength, + 1, + threadCount); + } + + printf("\nValidating the results...\n"); + printf("...reading back GPU results\n"); + error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost); + checkCudaErrors(error); + error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost); + checkCudaErrors(error); + + int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR); + int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength); + flag = flag && keysFlag && valuesFlag; + + printf("\n"); + } + + printf("Shutting down...\n"); + sdkDeleteTimer(&hTimer); + cudaFree(d_OutputVal); + cudaFree(d_OutputKey); + cudaFree(d_InputVal); + cudaFree(d_InputKey); + free(h_OutputValGPU); + free(h_OutputKeyGPU); + free(h_InputVal); + free(h_InputKey); + + exit(flag ? 
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/oddEvenMergeSort.cu b/Samples/2_Concepts_and_Techniques/sortingNetworks/oddEvenMergeSort.cu index d348e565..8bd0fa37 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/oddEvenMergeSort.cu +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/oddEvenMergeSort.cu @@ -25,7 +25,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -//Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/oemen.htm +// Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/oemen.htm #include <assert.h> @@ -34,106 +34,107 @@ namespace cg = cooperative_groups; #include <helper_cuda.h> -#include "sortingNetworks_common.h" + #include "sortingNetworks_common.cuh" +#include "sortingNetworks_common.h" //////////////////////////////////////////////////////////////////////////////// // Monolithic Batcher's sort kernel for short arrays fitting into shared memory //////////////////////////////////////////////////////////////////////////////// -__global__ void oddEvenMergeSortShared(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength, uint dir) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Shared memory storage for one or more small vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; +__global__ void +oddEvenMergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint dir) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Shared memory storage for one or more small vectors + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; - // Offset to the beginning of subbatch and load data - d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = d_SrcKey[0]; - s_val[threadIdx.x + 0] = d_SrcVal[0]; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = - d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + // Offset to the beginning of subbatch and load data + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[0]; + s_val[threadIdx.x + 0] = d_SrcVal[0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; - for (uint size = 2; size <= arrayLength; size <<= 1) { - uint stride = size / 2; - uint offset = threadIdx.x & (stride - 1); + for (uint size = 2; size <= arrayLength; size <<= 1) { + uint stride = size / 2; + uint offset = threadIdx.x & (stride - 1); - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], - s_val[pos + stride], dir); - stride >>= 1; + { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir); + stride >>= 1; + } + + for (; stride
> 0; stride >>= 1) { + cg::sync(cta); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + + if (offset >= stride) + Comparator(s_key[pos - stride], s_val[pos - stride], s_key[pos + 0], s_val[pos + 0], dir); + } } - for (; stride > 0; stride >>= 1) { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - - if (offset >= stride) - Comparator(s_key[pos - stride], s_val[pos - stride], s_key[pos + 0], - s_val[pos + 0], dir); - } - } - - cg::sync(cta); - d_DstKey[0] = s_key[threadIdx.x + 0]; - d_DstVal[0] = s_val[threadIdx.x + 0]; - d_DstKey[(SHARED_SIZE_LIMIT / 2)] = - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - d_DstVal[(SHARED_SIZE_LIMIT / 2)] = - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + cg::sync(cta); + d_DstKey[0] = s_key[threadIdx.x + 0]; + d_DstVal[0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; } //////////////////////////////////////////////////////////////////////////////// // Odd-even merge sort iteration kernel // for large arrays (not fitting into shared memory) //////////////////////////////////////////////////////////////////////////////// -__global__ void oddEvenMergeGlobal(uint *d_DstKey, uint *d_DstVal, - uint *d_SrcKey, uint *d_SrcVal, - uint arrayLength, uint size, uint stride, - uint dir) { - uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void oddEvenMergeGlobal(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint arrayLength, + uint size, + uint stride, + uint dir) +{ + uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x; - // Odd-even merge - uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + // Odd-even merge + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); - if (stride < size / 2) { - uint offset = global_comparatorI & ((size / 2) - 1); + if (stride < size / 2) { + uint offset = global_comparatorI & ((size / 2) - 1); - if (offset >= stride) { - uint keyA = d_SrcKey[pos - stride]; - uint valA = d_SrcVal[pos - stride]; - uint keyB = d_SrcKey[pos + 0]; - uint valB = d_SrcVal[pos + 0]; + if (offset >= stride) { + uint keyA = d_SrcKey[pos - stride]; + uint valA = d_SrcVal[pos - stride]; + uint keyB = d_SrcKey[pos + 0]; + uint valB = d_SrcVal[pos + 0]; - Comparator(keyA, valA, keyB, valB, dir); + Comparator(keyA, valA, keyB, valB, dir); - d_DstKey[pos - stride] = keyA; - d_DstVal[pos - stride] = valA; - d_DstKey[pos + 0] = keyB; - d_DstVal[pos + 0] = valB; + d_DstKey[pos - stride] = keyA; + d_DstVal[pos - stride] = valA; + d_DstKey[pos + 0] = keyB; + d_DstVal[pos + 0] = valB; + } } - } else { - uint keyA = d_SrcKey[pos + 0]; - uint valA = d_SrcVal[pos + 0]; - uint keyB = d_SrcKey[pos + stride]; - uint valB = d_SrcVal[pos + stride]; + else { + uint keyA = d_SrcKey[pos + 0]; + uint valA = d_SrcVal[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + uint valB = d_SrcVal[pos + stride]; - Comparator(keyA, valA, keyB, valB, dir); + Comparator(keyA, valA, keyB, valB, dir); - d_DstKey[pos + 0] = keyA; - d_DstVal[pos + 0] = valA; - d_DstKey[pos + stride] = keyB; - d_DstVal[pos + stride] = valB; - } + d_DstKey[pos + 0] = keyA; + d_DstVal[pos + 0] = valA; + d_DstKey[pos + stride] = keyB; + d_DstVal[pos + stride] = valB; + } } //////////////////////////////////////////////////////////////////////////////// @@ -142,39 +143,44 @@ __global__ void oddEvenMergeGlobal(uint *d_DstKey, uint 
*d_DstVal, // Helper function extern "C" uint factorRadix2(uint *log2L, uint L); -extern "C" void oddEvenMergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, - uint *d_SrcVal, uint batchSize, - uint arrayLength, uint dir) { - // Nothing to sort - if (arrayLength < 2) return; +extern "C" void oddEvenMergeSort(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint batchSize, + uint arrayLength, + uint dir) +{ + // Nothing to sort + if (arrayLength < 2) + return; - // Only power-of-two array lengths are supported by this implementation - uint log2L; - uint factorizationRemainder = factorRadix2(&log2L, arrayLength); - assert(factorizationRemainder == 1); + // Only power-of-two array lengths are supported by this implementation + uint log2L; + uint factorizationRemainder = factorRadix2(&log2L, arrayLength); + assert(factorizationRemainder == 1); - dir = (dir != 0); + dir = (dir != 0); - uint blockCount = (batchSize * arrayLength) / SHARED_SIZE_LIMIT; - uint threadCount = SHARED_SIZE_LIMIT / 2; + uint blockCount = (batchSize * arrayLength) / SHARED_SIZE_LIMIT; + uint threadCount = SHARED_SIZE_LIMIT / 2; - if (arrayLength <= SHARED_SIZE_LIMIT) { - assert(SHARED_SIZE_LIMIT % arrayLength == 0); - oddEvenMergeSortShared<<<blockCount, threadCount>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir); - } else { - oddEvenMergeSortShared<<<blockCount, threadCount>>>( - d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, SHARED_SIZE_LIMIT, dir); + if (arrayLength <= SHARED_SIZE_LIMIT) { + assert(SHARED_SIZE_LIMIT % arrayLength == 0); + oddEvenMergeSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir); + } + else { + oddEvenMergeSortShared<<<blockCount, threadCount>>>( + d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, SHARED_SIZE_LIMIT, dir); - for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1) - for (unsigned stride = size / 2; stride > 0; stride >>= 1) { - // Unlike with bitonic sort, combining bitonic merge steps with - // stride = [SHARED_SIZE_LIMIT / 2 .. 1] seems to be impossible as there - // are dependencies between data elements crossing the SHARED_SIZE_LIMIT - // borders - oddEvenMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>( - d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, - dir); - } - } + for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1) + for (unsigned stride = size / 2; stride > 0; stride >>= 1) { + // Unlike with bitonic sort, combining bitonic merge steps with + // stride = [SHARED_SIZE_LIMIT / 2 ..
1] seems to be impossible as there + // are dependencies between data elements crossing the SHARED_SIZE_LIMIT + // borders + oddEvenMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>( + d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir); + } + } } diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.cuh b/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.cuh index e2dc3fa3..63ff2799 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.cuh +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.cuh @@ -34,21 +34,21 @@ #define SHARED_SIZE_LIMIT 1024U // Map to single instructions on G8x / G9x / G100 -#define UMUL(a, b) __umul24((a), (b)) +#define UMUL(a, b) __umul24((a), (b)) #define UMAD(a, b, c) (UMUL((a), (b)) + (c)) -__device__ inline void Comparator(uint &keyA, uint &valA, uint &keyB, - uint &valB, uint dir) { - uint t; +__device__ inline void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint dir) +{ + uint t; - if ((keyA > keyB) == dir) { - t = keyA; - keyA = keyB; - keyB = t; - t = valA; - valA = valB; - valB = t; - } + if ((keyA > keyB) == dir) { + t = keyA; + keyA = keyB; + keyB = t; + t = valA; + valA = valB; + valB = t; + } } #endif diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.h b/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.h index 5ee6115b..b26efc30 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.h +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_common.h @@ -34,20 +34,22 @@ typedef unsigned int uint; // Sort result validation routines //////////////////////////////////////////////////////////////////////////////// // Sorted keys array validation (check for integrity and proper order) -extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, - uint arrayLength, uint numValues, uint dir); +extern "C" uint +validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint dir); -extern "C" int validateValues(uint *resKey, uint *resVal, uint *srcKey, - uint batchSize, uint arrayLength); +extern "C" int validateValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength); //////////////////////////////////////////////////////////////////////////////// // CUDA sorting networks //////////////////////////////////////////////////////////////////////////////// -extern "C" uint bitonicSort(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, - uint *d_SrcVal, uint batchSize, uint arrayLength, - uint dir); +extern "C" uint +bitonicSort(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint batchSize, uint arrayLength, uint dir); -extern "C" void oddEvenMergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, - uint *d_SrcVal, uint batchSize, - uint arrayLength, uint dir); +extern "C" void oddEvenMergeSort(uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint batchSize, + uint arrayLength, + uint dir); diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_validate.cpp b/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_validate.cpp index 261f5053..99792e01 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_validate.cpp +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/sortingNetworks_validate.cpp @@ -34,108 +34,108 @@ 
//////////////////////////////////////////////////////////////////////////////// // Validate sorted keys array (check for integrity and proper order) //////////////////////////////////////////////////////////////////////////////// -extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, - uint arrayLength, uint numValues, uint dir) { - uint *srcHist; - uint *resHist; +extern "C" uint +validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint dir) +{ + uint *srcHist; + uint *resHist; - if (arrayLength < 2) { - printf("validateSortedKeys(): arrayLength too short, exiting...\n"); - return 1; - } - - printf("...inspecting keys array: "); - - srcHist = (uint *)malloc(numValues * sizeof(uint)); - resHist = (uint *)malloc(numValues * sizeof(uint)); - - int flag = 1; - - for (uint j = 0; j < batchSize; - j++, srcKey += arrayLength, resKey += arrayLength) { - // Build histograms for keys arrays - memset(srcHist, 0, numValues * sizeof(uint)); - memset(resHist, 0, numValues * sizeof(uint)); - - for (uint i = 0; i < arrayLength; i++) { - if (srcKey[i] < numValues && resKey[i] < numValues) { - srcHist[srcKey[i]]++; - resHist[resKey[i]]++; - } else { - flag = 0; - break; - } + if (arrayLength < 2) { + printf("validateSortedKeys(): arrayLength too short, exiting...\n"); + return 1; } - if (!flag) { - printf("***Set %u source/result key arrays are not limited properly***\n", - j); - goto brk; - } + printf("...inspecting keys array: "); - // Compare the histograms - for (uint i = 0; i < numValues; i++) - if (srcHist[i] != resHist[i]) { - flag = 0; - break; - } + srcHist = (uint *)malloc(numValues * sizeof(uint)); + resHist = (uint *)malloc(numValues * sizeof(uint)); - if (!flag) { - printf("***Set %u source/result keys histograms do not match***\n", j); - goto brk; - } + int flag = 1; - if (dir) { - // Ascending order - for (uint i = 0; i < arrayLength - 1; i++) - if (resKey[i + 1] < resKey[i]) { - flag = 0; - break; + for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) { + // Build histograms for keys arrays + memset(srcHist, 0, numValues * sizeof(uint)); + memset(resHist, 0, numValues * sizeof(uint)); + + for (uint i = 0; i < arrayLength; i++) { + if (srcKey[i] < numValues && resKey[i] < numValues) { + srcHist[srcKey[i]]++; + resHist[resKey[i]]++; + } + else { + flag = 0; + break; + } } - } else { - // Descending order - for (uint i = 0; i < arrayLength - 1; i++) - if (resKey[i + 1] > resKey[i]) { - flag = 0; - break; + + if (!flag) { + printf("***Set %u source/result key arrays are not limited properly***\n", j); + goto brk; + } + + // Compare the histograms + for (uint i = 0; i < numValues; i++) + if (srcHist[i] != resHist[i]) { + flag = 0; + break; + } + + if (!flag) { + printf("***Set %u source/result keys histograms do not match***\n", j); + goto brk; + } + + if (dir) { + // Ascending order + for (uint i = 0; i < arrayLength - 1; i++) + if (resKey[i + 1] < resKey[i]) { + flag = 0; + break; + } + } + else { + // Descending order + for (uint i = 0; i < arrayLength - 1; i++) + if (resKey[i + 1] > resKey[i]) { + flag = 0; + break; + } + } + + if (!flag) { + printf("***Set %u result key array is not ordered properly***\n", j); + goto brk; } } - if (!flag) { - printf("***Set %u result key array is not ordered properly***\n", j); - goto brk; - } - } - brk: - free(resHist); - free(srcHist); + free(resHist); + free(srcHist); - if (flag) printf("OK\n"); + if (flag) + printf("OK\n"); - return flag; + return flag; 
} -extern "C" int validateValues(uint *resKey, uint *resVal, uint *srcKey, - uint batchSize, uint arrayLength) { - int correctFlag = 1, stableFlag = 1; +extern "C" int validateValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength) +{ + int correctFlag = 1, stableFlag = 1; - printf("...inspecting keys and values array: "); + printf("...inspecting keys and values array: "); - for (uint i = 0; i < batchSize; - i++, resKey += arrayLength, resVal += arrayLength) { - for (uint j = 0; j < arrayLength; j++) { - if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0; + for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) { + for (uint j = 0; j < arrayLength; j++) { + if (resKey[j] != srcKey[resVal[j]]) + correctFlag = 0; - if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && - (resVal[j] > resVal[j + 1])) - stableFlag = 0; + if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1])) + stableFlag = 0; + } } - } - printf(correctFlag ? "OK\n" : "***corrupted!!!***\n"); - printf(stableFlag ? "...stability property: stable!\n" - : "...stability property: NOT stable\n"); + printf(correctFlag ? "OK\n" : "***corrupted!!!***\n"); + printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n"); - return correctFlag; + return correctFlag; } diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/streamOrderedAllocation.cu b/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/streamOrderedAllocation.cu index 503a7555..a9344cfe 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/streamOrderedAllocation.cu +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/streamOrderedAllocation.cu @@ -39,8 +39,8 @@ // System includes #include <climits> -#include <stdio.h> #include <assert.h> +#include <stdio.h> // CUDA runtime #include <cuda_runtime.h> @@ -52,105 +52,31 @@ #define MAX_ITER 20 /* Add two vectors on the GPU */ -__global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < N) { - c[idx] = a[idx] + b[idx]; - } + if (idx < N) { + c[idx] = a[idx] + b[idx]; + } } -int basicStreamOrderedAllocation(const int dev, const int nelem, const float *a, - const float *b, float *c) { - float *d_a, *d_b, *d_c; // Device buffers - float errorNorm, refNorm, ref, diff; - size_t bytes = nelem * sizeof(float); +int basicStreamOrderedAllocation(const int dev, const int nelem, const float *a, const float *b, float *c) +{ + float *d_a, *d_b, *d_c; // Device buffers + float errorNorm, refNorm, ref, diff; + size_t bytes = nelem * sizeof(float); - cudaStream_t stream; - printf("Starting basicStreamOrderedAllocation()\n"); - checkCudaErrors(cudaSetDevice(dev)); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cudaStream_t stream; + printf("Starting basicStreamOrderedAllocation()\n"); + checkCudaErrors(cudaSetDevice(dev)); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream)); - checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream)); - checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream)); - checkCudaErrors( - cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors( - cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream)); - - dim3 block(256); - dim3 grid((unsigned
int)ceil(nelem / (float)block.x)); - vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem); - - checkCudaErrors(cudaFreeAsync(d_a, stream)); - checkCudaErrors(cudaFreeAsync(d_b, stream)); - checkCudaErrors( - cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaFreeAsync(d_c, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - /* Compare the results */ - printf("> Checking the results from vectorAddGPU() ...\n"); - errorNorm = 0.f; - refNorm = 0.f; - - for (int n = 0; n < nelem; n++) { - ref = a[n] + b[n]; - diff = c[n] - ref; - errorNorm += diff * diff; - refNorm += ref * ref; - } - - errorNorm = (float)sqrt((double)errorNorm); - refNorm = (float)sqrt((double)refNorm); - if (errorNorm / refNorm < 1.e-6f) - printf("basicStreamOrderedAllocation PASSED\n"); - - checkCudaErrors(cudaStreamDestroy(stream)); - - return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE; -} - -// streamOrderedAllocationPostSync(): demonstrates If the application wants the -// memory to persist in the pool beyond synchronization, then it sets the -// release threshold on the pool. This way, when the application reaches the -// "steady state", it is no longer allocating/freeing memory from the OS. -int streamOrderedAllocationPostSync(const int dev, const int nelem, - const float *a, const float *b, float *c) { - float *d_a, *d_b, *d_c; // Device buffers - float errorNorm, refNorm, ref, diff; - size_t bytes = nelem * sizeof(float); - - cudaStream_t stream; - cudaMemPool_t memPool; - cudaEvent_t start, end; - printf("Starting streamOrderedAllocationPostSync()\n"); - checkCudaErrors(cudaSetDevice(dev)); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&end)); - - checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, dev)); - uint64_t thresholdVal = ULONG_MAX; - // set high release threshold on the default pool so that cudaFreeAsync will - // not actually release memory to the system. By default, the release - // threshold for a memory pool is set to zero. This implies that the CUDA - // driver is allowed to release a memory chunk back to the system as long as - // it does not contain any active suballocations.
- checkCudaErrors(cudaMemPoolSetAttribute( - memPool, cudaMemPoolAttrReleaseThreshold, (void *)&thresholdVal)); - - // Record the start event - checkCudaErrors(cudaEventRecord(start, stream)); - for (int i = 0; i < MAX_ITER; i++) { checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream)); checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream)); checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream)); - checkCudaErrors( - cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors( - cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream)); dim3 block(256); dim3 grid((unsigned int)ceil(nelem / (float)block.x)); @@ -158,86 +84,152 @@ int streamOrderedAllocationPostSync(const int dev, const int nelem, checkCudaErrors(cudaFreeAsync(d_a, stream)); checkCudaErrors(cudaFreeAsync(d_b, stream)); - checkCudaErrors( - cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaFreeAsync(d_c, stream)); checkCudaErrors(cudaStreamSynchronize(stream)); - } - checkCudaErrors(cudaEventRecord(end, stream)); - // Wait for the end event to complete - checkCudaErrors(cudaEventSynchronize(end)); - float msecTotal = 0.0f; - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, end)); - printf("Total elapsed time = %f ms over %d iterations\n", msecTotal, - MAX_ITER); + /* Compare the results */ + printf("> Checking the results from vectorAddGPU() ...\n"); + errorNorm = 0.f; + refNorm = 0.f; - /* Compare the results */ - printf("> Checking the results from vectorAddGPU() ...\n"); - errorNorm = 0.f; - refNorm = 0.f; + for (int n = 0; n < nelem; n++) { + ref = a[n] + b[n]; + diff = c[n] - ref; + errorNorm += diff * diff; + refNorm += ref * ref; + } - for (int n = 0; n < nelem; n++) { - ref = a[n] + b[n]; - diff = c[n] - ref; - errorNorm += diff * diff; - refNorm += ref * ref; - } + errorNorm = (float)sqrt((double)errorNorm); + refNorm = (float)sqrt((double)refNorm); + if (errorNorm / refNorm < 1.e-6f) + printf("basicStreamOrderedAllocation PASSED\n"); - errorNorm = (float)sqrt((double)errorNorm); - refNorm = (float)sqrt((double)refNorm); - if (errorNorm / refNorm < 1.e-6f) - printf("streamOrderedAllocationPostSync PASSED\n"); + checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaStreamDestroy(stream)); - - return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE; + return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE; } -int main(int argc, char **argv) { - int nelem; - int dev = 0; // use default device 0 - size_t bytes; - float *a, *b, *c; // Host +// streamOrderedAllocationPostSync(): demonstrates If the application wants the +// memory to persist in the pool beyond synchronization, then it sets the +// release threshold on the pool. This way, when the application reaches the +// "steady state", it is no longer allocating/freeing memory from the OS. 
+int streamOrderedAllocationPostSync(const int dev, const int nelem, const float *a, const float *b, float *c) +{ + float *d_a, *d_b, *d_c; // Device buffers + float errorNorm, refNorm, ref, diff; + size_t bytes = nelem * sizeof(float); - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Usage: streamOrderedAllocation [OPTION]\n\n"); - printf("Options:\n"); - printf(" --device=[device #] Specify the device to be used\n"); - return EXIT_SUCCESS; - } + cudaStream_t stream; + cudaMemPool_t memPool; + cudaEvent_t start, end; + printf("Starting streamOrderedAllocationPostSync()\n"); + checkCudaErrors(cudaSetDevice(dev)); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&end)); - dev = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, dev)); + uint64_t thresholdVal = ULONG_MAX; + // set high release threshold on the default pool so that cudaFreeAsync will + // not actually release memory to the system. By default, the release + // threshold for a memory pool is set to zero. This implies that the CUDA + // driver is allowed to release a memory chunk back to the system as long as + // it does not contain any active suballocations. + checkCudaErrors(cudaMemPoolSetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, (void *)&thresholdVal)); - int isMemPoolSupported = 0; - checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported, - cudaDevAttrMemoryPoolsSupported, dev)); - if (!isMemPoolSupported) { - printf("Waiving execution as device does not support Memory Pools\n"); - exit(EXIT_WAIVED); - } + // Record the start event + checkCudaErrors(cudaEventRecord(start, stream)); + for (int i = 0; i < MAX_ITER; i++) { + checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream)); + checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream)); + checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream)); + checkCudaErrors(cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream)); - // Allocate CPU memory. - nelem = 1048576; - bytes = nelem * sizeof(float); + dim3 block(256); + dim3 grid((unsigned int)ceil(nelem / (float)block.x)); + vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem); - a = (float *)malloc(bytes); - b = (float *)malloc(bytes); - c = (float *)malloc(bytes); - /* Initialize the vectors. */ - for (int n = 0; n < nelem; n++) { - a[n] = rand() / (float)RAND_MAX; - b[n] = rand() / (float)RAND_MAX; - } + checkCudaErrors(cudaFreeAsync(d_a, stream)); + checkCudaErrors(cudaFreeAsync(d_b, stream)); + checkCudaErrors(cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaFreeAsync(d_c, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + } + checkCudaErrors(cudaEventRecord(end, stream)); + // Wait for the end event to complete + checkCudaErrors(cudaEventSynchronize(end)); - int ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c); - int ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c); + float msecTotal = 0.0f; + checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, end)); + printf("Total elapsed time = %f ms over %d iterations\n", msecTotal, MAX_ITER); - /* Memory clean up */ - free(a); - free(b); - free(c); + /* Compare the results */ + printf("> Checking the results from vectorAddGPU() ...\n"); + errorNorm = 0.f; + refNorm = 0.f; - return ((ret1 == EXIT_SUCCESS && ret2 == EXIT_SUCCESS) ?
diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md
index 8b4ee825..012eb61d 100644
--- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md
+++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md
@@ -33,4 +33,3 @@ cudaDeviceGetAttribute, cudaMemPoolImportFromShareableHandle, cudaSetDevice, cud
 Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 
 ## References (for more details)
-
diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu
index d9c5e38d..199a862c 100644
--- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu
+++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu
@@ -30,10 +30,10 @@
  * using one process per GPU for computation.
  */
+#include
 #include
 #include
 #include
-#include
 #define CUDA_DRIVER_API 1
 #include "helper_cuda.h"
 #include "helper_cuda_drvapi.h"
@@ -45,7 +45,7 @@ static const char ipcName[] = "streamOrderedAllocationIPC_pipe";
 // For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
 // in the same way.
 #define MAX_DEVICES (32)
-#define DATA_SIZE (64ULL << 20ULL)  // 64MB
+#define DATA_SIZE (64ULL << 20ULL) // 64MB
 
 #if defined(__linux__)
 #define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
@@ -55,406 +55,403 @@ static const char ipcName[] = "streamOrderedAllocationIPC_pipe";
 #error Unsupported system
 #endif
 
-typedef struct shmStruct_st {
-  size_t nprocesses;
-  int barrier;
-  int sense;
-  cudaMemAllocationHandleType handleType;
-  int devices[MAX_DEVICES];
-  cudaMemPoolPtrExportData exportPtrData[MAX_DEVICES];
+typedef struct shmStruct_st
+{
+    size_t nprocesses;
+    int barrier;
+    int sense;
+    cudaMemAllocationHandleType handleType;
+    int devices[MAX_DEVICES];
+    cudaMemPoolPtrExportData exportPtrData[MAX_DEVICES];
 } shmStruct;
 
-__global__ void simpleKernel(char *ptr, int sz, char val) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
-    ptr[idx] = val;
-  }
+__global__ void simpleKernel(char *ptr, int sz, char val)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
+        ptr[idx] = val;
+    }
 }
 
-static void barrierWait(volatile int *barrier, volatile int *sense,
-                        unsigned int n) {
-  int count;
+static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
+{
+    int count;
 
-  // Check-in
-  count = cpu_atomic_add32(barrier, 1);
-  if (count == n)  // Last one in
-    *sense = 1;
-  while (!*sense)
-    ;
+    // Check-in
+    count = cpu_atomic_add32(barrier, 1);
+    if (count == n) // Last one in
+        *sense = 1;
+    while (!*sense)
+        ;
 
-  // Check-out
-  count = cpu_atomic_add32(barrier, -1);
-  if (count == 0)  // Last one out
-    *sense = 0;
-  while (*sense)
-    ;
+    // Check-out
+    count = cpu_atomic_add32(barrier, -1);
+    if (count == 0) // Last one out
+        *sense = 0;
+    while (*sense)
+        ;
 }
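barrierWait() above is a two-phase sense-reversing barrier over shared memory: check-in spins until the last arriver flips *sense, check-out spins until the last leaver flips it back, which makes the barrier immediately reusable. A minimal usage sketch, assuming shm points at a zero-initialized shmStruct and nprocesses children plus the parent participate:

    // Every participant calls this at the same logical step; the
    // (nprocesses + 1)-th arrival releases the whole group.
    static void rendezvous(volatile shmStruct *shm, unsigned int nprocesses)
    {
        barrierWait(&shm->barrier, &shm->sense, nprocesses + 1);
        // ... all processes have now reached the same point ...
    }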
-static void childProcess(int id) {
-  volatile shmStruct *shm = NULL;
-  cudaStream_t stream;
-  sharedMemoryInfo info;
-  size_t procCount, i;
-  int blocks = 0;
-  int threads = 128;
-  cudaDeviceProp prop;
-  std::vector<void *> ptrs;
+static void childProcess(int id)
+{
+    volatile shmStruct *shm = NULL;
+    cudaStream_t stream;
+    sharedMemoryInfo info;
+    size_t procCount, i;
+    int blocks = 0;
+    int threads = 128;
+    cudaDeviceProp prop;
+    std::vector<void *> ptrs;
 
-  std::vector<char> verification_buffer(DATA_SIZE);
+    std::vector<char> verification_buffer(DATA_SIZE);
 
-  ipcHandle *ipcChildHandle = NULL;
-  checkIpcErrors(ipcOpenSocket(ipcChildHandle));
+    ipcHandle *ipcChildHandle = NULL;
+    checkIpcErrors(ipcOpenSocket(ipcChildHandle));
 
-  if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
-    printf("Failed to create shared memory slab\n");
-    exit(EXIT_FAILURE);
-  }
-  shm = (volatile shmStruct *)info.addr;
-  procCount = shm->nprocesses;
+    if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
+        printf("Failed to create shared memory slab\n");
+        exit(EXIT_FAILURE);
+    }
+    shm = (volatile shmStruct *)info.addr;
+    procCount = shm->nprocesses;
 
-  barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1));
+    barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1));
 
-  // Receive all allocation handles shared by Parent.
-  std::vector<ShareableHandle> shHandle(shm->nprocesses);
-  checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle));
+    // Receive all allocation handles shared by Parent.
+    std::vector<ShareableHandle> shHandle(shm->nprocesses);
+    checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle));
 
-  checkCudaErrors(cudaSetDevice(shm->devices[id]));
-  checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
-  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &blocks, simpleKernel, threads, 0));
-  blocks *= prop.multiProcessorCount;
+    checkCudaErrors(cudaSetDevice(shm->devices[id]));
+    checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
+    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
+    blocks *= prop.multiProcessorCount;
 
-  std::vector<cudaMemPool_t> pools(shm->nprocesses);
+    std::vector<cudaMemPool_t> pools(shm->nprocesses);
 
-  cudaMemAllocationHandleType handleType = shm->handleType;
+    cudaMemAllocationHandleType handleType = shm->handleType;
 
-  // Import mem pools from all the devices created in the master
-  // process using shareable handles received via socket
-  // and import the pointer to the allocated buffer using
-  // exportData filled in shared memory by the master process.
-  for (i = 0; i < procCount; i++) {
-    checkCudaErrors(cudaMemPoolImportFromShareableHandle(
-        &pools[i], (void *)shHandle[i], handleType, 0));
+    // Import mem pools from all the devices created in the master
+    // process using shareable handles received via socket
+    // and import the pointer to the allocated buffer using
+    // exportData filled in shared memory by the master process.
+    for (i = 0; i < procCount; i++) {
+        checkCudaErrors(cudaMemPoolImportFromShareableHandle(&pools[i], (void *)shHandle[i], handleType, 0));
 
-    cudaMemAccessFlags accessFlags;
-    cudaMemLocation location;
-    location.type = cudaMemLocationTypeDevice;
-    location.id = shm->devices[id];
-    checkCudaErrors(cudaMemPoolGetAccess(&accessFlags, pools[i], &location));
-    if (accessFlags != cudaMemAccessFlagsProtReadWrite) {
-      cudaMemAccessDesc desc;
-      memset(&desc, 0, sizeof(cudaMemAccessDesc));
-      desc.location.type = cudaMemLocationTypeDevice;
-      desc.location.id = shm->devices[id];
-      desc.flags = cudaMemAccessFlagsProtReadWrite;
-      checkCudaErrors(cudaMemPoolSetAccess(pools[i], &desc, 1));
+        cudaMemAccessFlags accessFlags;
+        cudaMemLocation location;
+        location.type = cudaMemLocationTypeDevice;
+        location.id = shm->devices[id];
+        checkCudaErrors(cudaMemPoolGetAccess(&accessFlags, pools[i], &location));
+        if (accessFlags != cudaMemAccessFlagsProtReadWrite) {
+            cudaMemAccessDesc desc;
+            memset(&desc, 0, sizeof(cudaMemAccessDesc));
+            desc.location.type = cudaMemLocationTypeDevice;
+            desc.location.id = shm->devices[id];
+            desc.flags = cudaMemAccessFlagsProtReadWrite;
+            checkCudaErrors(cudaMemPoolSetAccess(pools[i], &desc, 1));
+        }
+
+        // Import the allocation from each memory pool by iterating over exportData
+        // until an import succeeds.
+        for (int j = 0; j < procCount; j++) {
+            void *ptr = NULL;
+            // Import the allocation using the opaque export data retrieved through
+            // the shared memory.
+            cudaError_t ret =
+                cudaMemPoolImportPointer(&ptr, pools[i], (cudaMemPoolPtrExportData *)&shm->exportPtrData[j]);
+
+            if (ret == cudaSuccess) {
+                // Pointer import succeeded, so add it to the ptrs bag.
+                ptrs.push_back(ptr);
+                break;
+            }
+            else {
+                // Clear the failure returned by cudaMemPoolImportPointer
+                // before the next try.
+                cudaGetLastError();
+            }
+        }
+        // Since we have imported allocations shared by the parent with us, we can
+        // close this ShareableHandle.
+        checkIpcErrors(ipcCloseShareableHandle(shHandle[i]));
    }
 
-    // Import the allocation from each memory pool by iterating over exportData
-    // until import is success.
-    for (int j = 0; j < procCount; j++) {
-      void *ptr = NULL;
-      // Import the allocation using the opaque export data retrieved through
-      // the shared memory".
-      cudaError_t ret = cudaMemPoolImportPointer(
-          &ptr, pools[i], (cudaMemPoolPtrExportData *)&shm->exportPtrData[j]);
-
-      if (ret == cudaSuccess) {
-        // Pointer import is successful hence add it to the ptrs bag.
-        ptrs.push_back(ptr);
-        break;
-      } else {
-        // Reset failure error received from cudaMemPoolImportPointer
-        // for further try.
-        cudaGetLastError();
-      }
-    }
-    // Since we have imported allocations shared by the parent with us, we can
-    // close this ShareableHandle.
-    checkIpcErrors(ipcCloseShareableHandle(shHandle[i]));
-  }
+    // close the socket.
+    checkIpcErrors(ipcCloseSocket(ipcChildHandle));
 
-  // Since we have imported allocations shared by the parent with us, we can
-  // close the socket.
-  checkIpcErrors(ipcCloseSocket(ipcChildHandle));
+    // At each iteration of the loop, each sibling process will push work on
+    // their respective devices accessing the next peer mapped buffer allocated
+    // by the master process (these can come from other sibling processes as
+    // well). To coordinate each process' access, we force the stream to wait for
+    // the work already accessing this buffer.
+    for (i = 0; i < procCount; i++) {
+        size_t bufferId = (i + id) % procCount;
 
-  // At each iteration of the loop, each sibling process will push work on
-  // their respective devices accessing the next peer mapped buffer allocated
-  // by the master process (these can come from other sibling processes as
-  // well). To coordinate each process' access, we force the stream to wait for
-  // the work already accessing this buffer.
-  for (i = 0; i < procCount; i++) {
-    size_t bufferId = (i + id) % procCount;
+        // Push a simple kernel on it
+        simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
+        checkCudaErrors(cudaGetLastError());
+        checkCudaErrors(cudaStreamSynchronize(stream));
 
-    // Push a simple kernel on it
-    simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId],
-                                                 DATA_SIZE, id);
-    checkCudaErrors(cudaGetLastError());
+        // Wait for all my sibling processes to push this stage of their work
+        // before proceeding to the next. This prevents siblings from racing
+        // ahead and clobbering the recorded event or waiting on the wrong
+        // recorded event.
+        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
+        if (id == 0) {
+            printf("Step %lld done\n", (unsigned long long)i);
+        }
+    }
+
+    // Now wait for my buffer to be ready so I can copy it locally and verify it
+    checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
+
+    // And wait for all the queued up work to complete
     checkCudaErrors(cudaStreamSynchronize(stream));
 
-    // Wait for all my sibling processes to push this stage of their work
-    // before proceeding to the next. This prevents siblings from racing
-    // ahead and clobbering the recorded event or waiting on the wrong
-    // recorded event.
-    barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
-    if (id == 0) {
-      printf("Step %lld done\n", (unsigned long long)i);
+    printf("Process %d: verifying...\n", id);
+
+    // The contents should have the id of the sibling just after me
+    char compareId = (char)((id + 1) % procCount);
+    for (unsigned long long j = 0; j < DATA_SIZE; j++) {
+        if (verification_buffer[j] != compareId) {
+            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
+                   id,
+                   j,
+                   (int)verification_buffer[j],
+                   (int)compareId);
+        }
    }
-  }
 
-  // Now wait for my buffer to be ready so I can copy it locally and verify it
-  checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE,
-                                  cudaMemcpyDeviceToHost, stream));
-
-  // And wait for all the queued up work to complete
-  checkCudaErrors(cudaStreamSynchronize(stream));
-
-  printf("Process %d: verifying...\n", id);
-
-  // The contents should have the id of the sibling just after me
-  char compareId = (char)((id + 1) % procCount);
-  for (unsigned long long j = 0; j < DATA_SIZE; j++) {
-    if (verification_buffer[j] != compareId) {
-      printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j,
-             (int)verification_buffer[j], (int)compareId);
+    // Clean up!
+    for (i = 0; i < procCount; i++) {
+        // Free the memory before the exporter process frees it
+        checkCudaErrors(cudaFreeAsync(ptrs[i], stream));
    }
-  }
 
-  // Clean up!
-  for (i = 0; i < procCount; i++) {
-    // Free the memory before the exporter process frees it
-    checkCudaErrors(cudaFreeAsync(ptrs[i], stream));
-  }
+    // And wait for all the queued up work to complete
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    checkCudaErrors(cudaStreamDestroy(stream));
 
-  // And wait for all the queued up work to complete
-  checkCudaErrors(cudaStreamSynchronize(stream));
-  checkCudaErrors(cudaStreamDestroy(stream));
-
-  printf("Process %d complete!\n", id);
+    printf("Process %d complete!\n", id);
 }
 
-static void parentProcess(char *app) {
-  sharedMemoryInfo info;
-  int devCount, i;
-  volatile shmStruct *shm = NULL;
-  std::vector<void *> ptrs;
-  std::vector<Process> processes;
-  cudaMemAllocationHandleType handleType = cudaMemHandleTypeNone;
+static void parentProcess(char *app)
+{
+    sharedMemoryInfo info;
+    int devCount, i;
+    volatile shmStruct *shm = NULL;
+    std::vector<void *> ptrs;
+    std::vector<Process> processes;
+    cudaMemAllocationHandleType handleType = cudaMemHandleTypeNone;
 
-  checkCudaErrors(cudaGetDeviceCount(&devCount));
-  std::vector<CUdevice> devices(devCount);
-  for (i = 0; i < devCount; i++) {
-    cuDeviceGet(&devices[i], i);
-  }
-
-  if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
-    printf("Failed to create shared memory slab\n");
-    exit(EXIT_FAILURE);
-  }
-  shm = (volatile shmStruct *)info.addr;
-  memset((void *)shm, 0, sizeof(*shm));
-
-  // Pick all the devices that can access each other's memory for this test
-  // Keep in mind that CUDA has minimal support for fork() without a
-  // corresponding exec() in the child process, but in this case our
-  // spawnProcess will always exec, so no need to worry.
-  for (i = 0; i < devCount; i++) {
-    bool allPeers = true;
-    cudaDeviceProp prop;
-    checkCudaErrors(cudaGetDeviceProperties(&prop, i));
-
-    int isMemPoolSupported = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported,
-                                           cudaDevAttrMemoryPoolsSupported, i));
-    // CUDA IPC is only supported on devices with unified addressing
-    if (!isMemPoolSupported) {
-      printf("Device %d does not support cuda memory pools, skipping...\n", i);
-      continue;
-    }
-    int supportedHandleTypes = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&supportedHandleTypes,
-                                           cudaDevAttrMemoryPoolSupportedHandleTypes, i));
-    if (supportedHandleTypes == 0) {
-      printf("Device %d does not support Memory pool based IPC, skipping...\n", i);
-      continue;
-    }
+    checkCudaErrors(cudaGetDeviceCount(&devCount));
+    std::vector<CUdevice> devices(devCount);
+    for (i = 0; i < devCount; i++) {
+        cuDeviceGet(&devices[i], i);
    }
 
-    if (handleType == cudaMemHandleTypeNone) {
-      if (supportedHandleTypes & cudaMemHandleTypePosixFileDescriptor) {
-        handleType = cudaMemHandleTypePosixFileDescriptor;
+    if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
+        printf("Failed to create shared memory slab\n");
+        exit(EXIT_FAILURE);
+    }
+    shm = (volatile shmStruct *)info.addr;
+    memset((void *)shm, 0, sizeof(*shm));
+
+    // Pick all the devices that can access each other's memory for this test
+    // Keep in mind that CUDA has minimal support for fork() without a
+    // corresponding exec() in the child process, but in this case our
+    // spawnProcess will always exec, so no need to worry.
+    for (i = 0; i < devCount; i++) {
+        bool allPeers = true;
+        cudaDeviceProp prop;
+        checkCudaErrors(cudaGetDeviceProperties(&prop, i));
+
+        int isMemPoolSupported = 0;
+        checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported, cudaDevAttrMemoryPoolsSupported, i));
+        // CUDA IPC is only supported on devices with unified addressing
+        if (!isMemPoolSupported) {
+            printf("Device %d does not support cuda memory pools, skipping...\n", i);
+            continue;
+        }
+        int supportedHandleTypes = 0;
+        checkCudaErrors(cudaDeviceGetAttribute(&supportedHandleTypes, cudaDevAttrMemoryPoolSupportedHandleTypes, i));
+        if (supportedHandleTypes == 0) {
+            printf("Device %d does not support Memory pool based IPC, skipping...\n", i);
+            continue;
+        }
+
+        if (handleType == cudaMemHandleTypeNone) {
+            if (supportedHandleTypes & cudaMemHandleTypePosixFileDescriptor) {
+                handleType = cudaMemHandleTypePosixFileDescriptor;
+            }
+            else if (supportedHandleTypes & cudaMemHandleTypeWin32) {
+                handleType = cudaMemHandleTypeWin32;
+            }
+            else {
+                printf("Device %d does not support any supported handle types, skipping...\n", i);
+                continue;
+            }
+        }
+        else {
+            if ((supportedHandleTypes & handleType) != handleType) {
+                printf("Mixed handle types are not supported, waiving test\n");
+                exit(EXIT_WAIVED);
+            }
+        }
+        // This sample requires two processes accessing each device, so we need
+        // to ensure exclusive or prohibited mode is not set
+        if (prop.computeMode != cudaComputeModeDefault) {
+            printf("Device %d is in an unsupported compute mode for this sample\n", i);
+            continue;
+        }
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // CUDA IPC on Windows is only supported on TCC
+        if (!prop.tccDriver) {
+            printf("Device %d is not in TCC mode\n", i);
+            continue;
+        }
+#endif
+
+        for (int j = 0; j < shm->nprocesses; j++) {
+            int canAccessPeerIJ, canAccessPeerJI;
+            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
+            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
+            if (!canAccessPeerIJ || !canAccessPeerJI) {
+                allPeers = false;
+                break;
+            }
+        }
+        if (allPeers) {
+            // Enable peers here. This isn't necessary for IPC, but it will
+            // set up the peers for the device. For systems that only allow 8
+            // peers per GPU at a time, this acts to remove devices from CanAccessPeer
+            for (int j = 0; j < shm->nprocesses; j++) {
+                checkCudaErrors(cudaSetDevice(i));
+                checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
+                checkCudaErrors(cudaSetDevice(shm->devices[j]));
+                checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
+            }
+            shm->devices[shm->nprocesses++] = i;
+            if (shm->nprocesses >= MAX_DEVICES)
+                break;
+        }
+        else {
+            printf("Device %d is not peer capable with some other selected peers, "
+                   "skipping\n",
+                   i);
+        }
+    }
+
+    if (shm->nprocesses == 0) {
+        printf("No CUDA devices support IPC\n");
+        exit(EXIT_WAIVED);
    }
 
    if (handleType == cudaMemHandleTypeNone) {
-      printf("No supported handle types found, waiving test\n");
-      exit(EXIT_WAIVED);
-    }
+        printf("No supported handle types found, waiving test\n");
+        exit(EXIT_WAIVED);
+    }
+
+    std::vector<ShareableHandle> shareableHandles(shm->nprocesses);
+    std::vector<cudaStream_t> streams(shm->nprocesses);
+    std::vector<cudaMemPool_t> pools(shm->nprocesses);
+
+    // Now allocate memory for each process and fill the shared
+    // memory buffer with the export data and get memPool handles to communicate
+    for (i = 0; i < shm->nprocesses; i++) {
+        void *ptr = NULL;
+        checkCudaErrors(cudaSetDevice(shm->devices[i]));
+        checkCudaErrors(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
+        // Allocate an explicit pool with IPC capabilities
+        cudaMemPoolProps poolProps;
+        memset(&poolProps, 0, sizeof(cudaMemPoolProps));
+        poolProps.allocType = cudaMemAllocationTypePinned;
+        poolProps.handleTypes = handleType;
+
+        poolProps.location.type = cudaMemLocationTypeDevice;
+        poolProps.location.id = shm->devices[i];
+
+        checkCudaErrors(cudaMemPoolCreate(&pools[i], &poolProps));
+
+        // Query the shareable handle for the pool
+        // Allocate memory in a stream from the pool just created
+        checkCudaErrors(cudaMallocAsync(&ptr, DATA_SIZE, pools[i], streams[i]));
+
+        checkCudaErrors(cudaMemPoolExportToShareableHandle(&shareableHandles[i], pools[i], handleType, 0));
+
+        // Memset handle to 0 to make sure call to `cudaMemPoolImportPointer` in
+        // childProcess will fail if the following call to
+        // `cudaMemPoolExportPointer` fails.
+        memset((void *)&shm->exportPtrData[i], 0, sizeof(cudaMemPoolPtrExportData));
+        // Get the opaque ‘bag-of-bits’ representing the allocation
+        checkCudaErrors(cudaMemPoolExportPointer((cudaMemPoolPtrExportData *)&shm->exportPtrData[i], ptr));
+        ptrs.push_back(ptr);
+    }
+
+    shm->handleType = handleType;
+
+    // Launch the child processes!
+    for (i = 0; i < shm->nprocesses; i++) {
+        char devIdx[10];
+        char *const args[] = {app, devIdx, NULL};
+        Process process;
+
+        SPRINTF(devIdx, "%d", i);
+
+        if (spawnProcess(&process, app, args)) {
+            printf("Failed to create process\n");
+            exit(EXIT_FAILURE);
+        }
+
+        processes.push_back(process);
+    }
+
+    barrierWait(&shm->barrier, &shm->sense, (unsigned int)(shm->nprocesses + 1));
+
+    ipcHandle *ipcParentHandle = NULL;
+    checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes));
+    checkIpcErrors(ipcSendShareableHandles(ipcParentHandle, shareableHandles, processes));
+
+    // Close the shareable handles as they are not needed anymore.
+    for (int i = 0; i < shm->nprocesses; i++) {
+        checkIpcErrors(ipcCloseShareableHandle(shareableHandles[i]));
+    }
+    checkIpcErrors(ipcCloseSocket(ipcParentHandle));
+
+    // And wait for them to finish
+    for (i = 0; i < processes.size(); i++) {
+        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
+            printf("Process %d failed!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
-    else {
-      if ((supportedHandleTypes & handleType) != handleType) {
-        printf("Mixed handle types are not supported, waiving test\n");
-        exit(EXIT_WAIVED);
-      }
-    }
-    // This sample requires two processes accessing each device, so we need
-    // to ensure exclusive or prohibited mode is not set
-    if (prop.computeMode != cudaComputeModeDefault) {
-      printf("Device %d is in an unsupported compute mode for this sample\n",
-             i);
-      continue;
-    }
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    // CUDA IPC on Windows is only supported on TCC
-    if (!prop.tccDriver) {
-      printf("Device %d is not in TCC mode\n", i);
-      continue;
-    }
-#endif
-    for (int j = 0; j < shm->nprocesses; j++) {
-      int canAccessPeerIJ, canAccessPeerJI;
-      checkCudaErrors(
-          cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
-      checkCudaErrors(
-          cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
-      if (!canAccessPeerIJ || !canAccessPeerJI) {
-        allPeers = false;
-        break;
-      }
-    }
-    if (allPeers) {
-      // Enable peers here. This isn't necessary for IPC, but it will
-      // setup the peers for the device. For systems that only allow 8
-      // peers per GPU at a time, this acts to remove devices from CanAccessPeer
-      for (int j = 0; j < shm->nprocesses; j++) {
-        checkCudaErrors(cudaSetDevice(i));
-        checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
-        checkCudaErrors(cudaSetDevice(shm->devices[j]));
-        checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
-      }
-      shm->devices[shm->nprocesses++] = i;
-      if (shm->nprocesses >= MAX_DEVICES) break;
-    } else {
-      printf(
-          "Device %d is not peer capable with some other selected peers, "
-          "skipping\n",
-          i);
-    }
-  }
-
-  if (shm->nprocesses == 0) {
-    printf("No CUDA devices support IPC\n");
-    exit(EXIT_WAIVED);
-  }
-
-  if (handleType == cudaMemHandleTypeNone) {
-    printf("No supported handle types found, waiving test\n");
-    exit(EXIT_WAIVED);
-  }
-
-  std::vector<ShareableHandle> shareableHandles(shm->nprocesses);
-  std::vector<cudaStream_t> streams(shm->nprocesses);
-  std::vector<cudaMemPool_t> pools(shm->nprocesses);
-
-  // Now allocate memory for each process and fill the shared
-  // memory buffer with the export data and get memPool handles to communicate
-  for (i = 0; i < shm->nprocesses; i++) {
-    void *ptr = NULL;
-    checkCudaErrors(cudaSetDevice(shm->devices[i]));
-    checkCudaErrors(
-        cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
-    // Allocate an explicit pool with IPC capabilities
-    cudaMemPoolProps poolProps;
-    memset(&poolProps, 0, sizeof(cudaMemPoolProps));
-    poolProps.allocType = cudaMemAllocationTypePinned;
-    poolProps.handleTypes = handleType;
-
-    poolProps.location.type = cudaMemLocationTypeDevice;
-    poolProps.location.id = shm->devices[i];
-
-    checkCudaErrors(cudaMemPoolCreate(&pools[i], &poolProps));
-
-    // Query the shareable handle for the pool
-    // Allocate memory in a stream from the pool just created
-    checkCudaErrors(cudaMallocAsync(&ptr, DATA_SIZE, pools[i], streams[i]));
-
-    checkCudaErrors(cudaMemPoolExportToShareableHandle(
-        &shareableHandles[i], pools[i], handleType, 0));
-
-    // Memset handle to 0 to make sure call to `cudaMemPoolImportPointer` in
-    // childProcess will fail if the following call to
-    // `cudaMemPoolExportPointer` fails.
-    memset((void *)&shm->exportPtrData[i], 0, sizeof(cudaMemPoolPtrExportData));
-    // Get the opaque ‘bag-of-bits’ representing the allocation
-    checkCudaErrors(cudaMemPoolExportPointer(
-        (cudaMemPoolPtrExportData *)&shm->exportPtrData[i], ptr));
-    ptrs.push_back(ptr);
-  }
-
-  shm->handleType = handleType;
-
-  // Launch the child processes!
-  for (i = 0; i < shm->nprocesses; i++) {
-    char devIdx[10];
-    char *const args[] = {app, devIdx, NULL};
-    Process process;
-
-    SPRINTF(devIdx, "%d", i);
-
-    if (spawnProcess(&process, app, args)) {
-      printf("Failed to create process\n");
-      exit(EXIT_FAILURE);
-    }
-
-    processes.push_back(process);
-  }
-
-  barrierWait(&shm->barrier, &shm->sense, (unsigned int)(shm->nprocesses + 1));
-
-  ipcHandle *ipcParentHandle = NULL;
-  checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes));
-  checkIpcErrors(
-      ipcSendShareableHandles(ipcParentHandle, shareableHandles, processes));
-
-  // Close the shareable handles as they are not needed anymore.
-  for (int i = 0; i < shm->nprocesses; i++) {
-    checkIpcErrors(ipcCloseShareableHandle(shareableHandles[i]));
-  }
-  checkIpcErrors(ipcCloseSocket(ipcParentHandle));
-
-  // And wait for them to finish
-  for (i = 0; i < processes.size(); i++) {
-    if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
-      printf("Process %d failed!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
+    // Clean up!
+    for (i = 0; i < shm->nprocesses; i++) {
+        checkCudaErrors(cudaSetDevice(shm->devices[i]));
+        checkCudaErrors(cudaFreeAsync(ptrs[i], streams[i]));
+        checkCudaErrors(cudaStreamSynchronize(streams[i]));
+        checkCudaErrors(cudaMemPoolDestroy(pools[i]));
+    }
 
-  // Clean up!
-  for (i = 0; i < shm->nprocesses; i++) {
-    checkCudaErrors(cudaSetDevice(shm->devices[i]));
-    checkCudaErrors(cudaFreeAsync(ptrs[i], streams[i]));
-    checkCudaErrors(cudaStreamSynchronize(streams[i]));
-    checkCudaErrors(cudaMemPoolDestroy(pools[i]));
-  }
-
-  sharedMemoryClose(&info);
+    sharedMemoryClose(&info);
 }
 
 // Host code
-int main(int argc, char **argv) {
-#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || \
-    defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-  printf("Not supported on ARM or Windows\n");
-  return EXIT_WAIVED;
+int main(int argc, char **argv)
+{
+#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    printf("Not supported on ARM or Windows\n");
+    return EXIT_WAIVED;
 #else
-  if (argc == 1) {
-    parentProcess(argv[0]);
-  } else {
-    childProcess(atoi(argv[1]));
-  }
-  return EXIT_SUCCESS;
+    if (argc == 1) {
+        parentProcess(argv[0]);
+    }
+    else {
+        childProcess(atoi(argv[1]));
+    }
+    return EXIT_SUCCESS;
 #endif
 }
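The handshake the parent and child halves above implement splits the IPC into two channels: the pool's shareable handle travels over a socket, while the per-allocation export data is plain bytes that can sit in shared memory. A condensed sketch of the API sequence, assuming a Linux POSIX file-descriptor handle type and eliding error checks and the fd transport (which the ipc* helpers wrap):

    // Exporter:
    cudaMemPoolProps props = {};
    props.allocType     = cudaMemAllocationTypePinned;
    props.handleTypes   = cudaMemHandleTypePosixFileDescriptor;
    props.location.type = cudaMemLocationTypeDevice;
    props.location.id   = dev; // assumed device ordinal

    cudaMemPool_t pool;
    cudaMemPoolCreate(&pool, &props);

    void *ptr;
    cudaMallocAsync(&ptr, bytes, pool, stream);

    int fd; // must reach the peer process, e.g. over a UNIX-domain socket
    cudaMemPoolExportToShareableHandle(&fd, pool, cudaMemHandleTypePosixFileDescriptor, 0);

    cudaMemPoolPtrExportData exportData; // opaque bytes; any channel works
    cudaMemPoolExportPointer(&exportData, ptr);

    // Importer (after receiving fd and exportData):
    cudaMemPool_t importedPool;
    cudaMemPoolImportFromShareableHandle(&importedPool, (void *)(uintptr_t)fd,
                                         cudaMemHandleTypePosixFileDescriptor, 0);
    void *importedPtr;
    cudaMemPoolImportPointer(&importedPtr, importedPool, &exportData);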
to - // `cudaMemPoolExportPointer` fails. - memset((void *)&shm->exportPtrData[i], 0, sizeof(cudaMemPoolPtrExportData)); - // Get the opaque ‘bag-of-bits’ representing the allocation - checkCudaErrors(cudaMemPoolExportPointer( - (cudaMemPoolPtrExportData *)&shm->exportPtrData[i], ptr)); - ptrs.push_back(ptr); - } - - shm->handleType = handleType; - - // Launch the child processes! - for (i = 0; i < shm->nprocesses; i++) { - char devIdx[10]; - char *const args[] = {app, devIdx, NULL}; - Process process; - - SPRINTF(devIdx, "%d", i); - - if (spawnProcess(&process, app, args)) { - printf("Failed to create process\n"); - exit(EXIT_FAILURE); + // Clean up! + for (i = 0; i < shm->nprocesses; i++) { + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors(cudaFreeAsync(ptrs[i], streams[i])); + checkCudaErrors(cudaStreamSynchronize(streams[i])); + checkCudaErrors(cudaMemPoolDestroy(pools[i])); } - processes.push_back(process); - } - - barrierWait(&shm->barrier, &shm->sense, (unsigned int)(shm->nprocesses + 1)); - - ipcHandle *ipcParentHandle = NULL; - checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes)); - checkIpcErrors( - ipcSendShareableHandles(ipcParentHandle, shareableHandles, processes)); - - // Close the shareable handles as they are not needed anymore. - for (int i = 0; i < shm->nprocesses; i++) { - checkIpcErrors(ipcCloseShareableHandle(shareableHandles[i])); - } - checkIpcErrors(ipcCloseSocket(ipcParentHandle)); - - // And wait for them to finish - for (i = 0; i < processes.size(); i++) { - if (waitProcess(&processes[i]) != EXIT_SUCCESS) { - printf("Process %d failed!\n", i); - exit(EXIT_FAILURE); - } - } - - // Clean up! - for (i = 0; i < shm->nprocesses; i++) { - checkCudaErrors(cudaSetDevice(shm->devices[i])); - checkCudaErrors(cudaFreeAsync(ptrs[i], streams[i])); - checkCudaErrors(cudaStreamSynchronize(streams[i])); - checkCudaErrors(cudaMemPoolDestroy(pools[i])); - } - - sharedMemoryClose(&info); + sharedMemoryClose(&info); } // Host code -int main(int argc, char **argv) { -#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || \ - defined(_WIN32) || defined(WIN64) || defined(_WIN64) - printf("Not supported on ARM or Windows\n"); - return EXIT_WAIVED; +int main(int argc, char **argv) +{ +#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + printf("Not supported on ARM or Windows\n"); + return EXIT_WAIVED; #else - if (argc == 1) { - parentProcess(argv[0]); - } else { - childProcess(atoi(argv[1])); - } - return EXIT_SUCCESS; + if (argc == 1) { + parentProcess(argv[0]); + } + else { + childProcess(atoi(argv[1])); + } + return EXIT_SUCCESS; #endif } diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu index 51916cf6..8324fc17 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu @@ -33,10 +33,10 @@ // System includes #include -#include #include #include #include +#include #include // CUDA runtime @@ -48,205 +48,198 @@ // Simple kernel to demonstrate copying cudaMallocAsync memory via P2P to peer // device -__global__ void copyP2PAndScale(const int *src, int *dst, int N) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void copyP2PAndScale(const int *src, int *dst, int N) +{ + int 
-  if (idx < N) {
-    // scale & store src vector.
-    dst[idx] = 2 * src[idx];
-  }
+    if (idx < N) {
+        // scale & store src vector.
+        dst[idx] = 2 * src[idx];
+    }
 }
 
 // Map of device version to device number
-std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
-  int numGpus = 0;
-  checkCudaErrors(cudaGetDeviceCount(&numGpus));
+std::multimap<std::pair<int, int>, int> getIdenticalGPUs()
+{
+    int numGpus = 0;
+    checkCudaErrors(cudaGetDeviceCount(&numGpus));
 
-  std::multimap<std::pair<int, int>, int> identicalGpus;
+    std::multimap<std::pair<int, int>, int> identicalGpus;
 
-  for (int i = 0; i < numGpus; i++) {
-    int isMemPoolSupported = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported,
-                                           cudaDevAttrMemoryPoolsSupported, i));
+    for (int i = 0; i < numGpus; i++) {
+        int isMemPoolSupported = 0;
+        checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported, cudaDevAttrMemoryPoolsSupported, i));
 
-    // Filter unsupported devices
-    if (isMemPoolSupported) {
-      int major = 0, minor = 0;
-      checkCudaErrors(
-          cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, i));
-      checkCudaErrors(
-          cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, i));
-      identicalGpus.emplace(std::make_pair(major, minor), i);
-    }
-  }
-
-  return identicalGpus;
-}
-
-std::pair<int, int> getP2PCapableGpuPair() {
-  constexpr size_t kNumGpusRequired = 2;
-
-  auto gpusByArch = getIdenticalGPUs();
-
-  auto it = gpusByArch.begin();
-  auto end = gpusByArch.end();
-
-  auto bestFit = std::make_pair(it, it);
-  // use std::distance to find the largest number of GPUs amongst architectures
-  auto distance = [](decltype(bestFit) p) {
-    return std::distance(p.first, p.second);
-  };
-
-  // Read each unique key/pair element in order
-  for (; it != end; it = gpusByArch.upper_bound(it->first)) {
-    // first and second are iterators bounded within the architecture group
-    auto testFit = gpusByArch.equal_range(it->first);
-    // Always use devices with highest architecture version or whichever has the
-    // most devices available
-    if (distance(bestFit) <= distance(testFit)) bestFit = testFit;
-  }
-
-  if (distance(bestFit) < kNumGpusRequired) {
-    printf(
-        "No Two or more GPUs with same architecture capable of cuda Memory "
-        "Pools found."
-        "\nWaiving the sample\n");
-    exit(EXIT_WAIVED);
-  }
-
-  std::set<int> bestFitDeviceIds;
-
-  // check & select peer-to-peer access capable GPU devices.
-  int devIds[2];
-  for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
-    int deviceId = itr->second;
-    checkCudaErrors(cudaSetDevice(deviceId));
-
-    std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds,
-                                        &kNumGpusRequired](
-                                           decltype(*itr) mapPair) {
-      if (deviceId != mapPair.second) {
-        int access = 0;
-        checkCudaErrors(
-            cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
-        printf("Device=%d %s Access Peer Device=%d\n", deviceId,
-               access ? "CAN" : "CANNOT", mapPair.second);
-        if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
-          bestFitDeviceIds.emplace(deviceId);
-          bestFitDeviceIds.emplace(mapPair.second);
-        } else {
-          printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
+        // Filter unsupported devices
+        if (isMemPoolSupported) {
+            int major = 0, minor = 0;
+            checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, i));
+            checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, i));
+            identicalGpus.emplace(std::make_pair(major, minor), i);
        }
-      }
-    });
-
-    if (bestFitDeviceIds.size() >= kNumGpusRequired) {
-      printf("Selected p2p capable devices - ");
-      int i = 0;
-      for (auto devicesItr = bestFitDeviceIds.begin();
-           devicesItr != bestFitDeviceIds.end(); devicesItr++) {
-        devIds[i++] = *devicesItr;
-        printf("deviceId = %d ", *devicesItr);
-      }
-      printf("\n");
-      break;
    }
-  }
-  // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p
-  // capable, hence we add it without p2p capability check.
-  if (!bestFitDeviceIds.size()) {
-    printf("No Two or more Devices p2p capable found.. exiting..\n");
-    exit(EXIT_WAIVED);
-  }
-
-  auto p2pGpuPair = std::make_pair(devIds[0], devIds[1]);
-
-  return p2pGpuPair;
+    return identicalGpus;
 }
 
-int memPoolP2PCopy() {
-  int *dev0_srcVec, *dev1_dstVec;  // Device buffers
-  cudaStream_t stream1, stream2;
-  cudaMemPool_t memPool;
-  cudaEvent_t waitOnStream1;
+std::pair<int, int> getP2PCapableGpuPair()
+{
+    constexpr size_t kNumGpusRequired = 2;
 
-  // Allocate CPU memory.
-  size_t nelem = 1048576;
-  size_t bytes = nelem * sizeof(int);
+    auto gpusByArch = getIdenticalGPUs();
 
-  int *a = (int *)malloc(bytes);
-  int *output = (int *)malloc(bytes);
+    auto it = gpusByArch.begin();
+    auto end = gpusByArch.end();
 
-  /* Initialize the vectors. */
-  for (int n = 0; n < nelem; n++) {
-    a[n] = rand() / (int)RAND_MAX;
-  }
+    auto bestFit = std::make_pair(it, it);
+    // use std::distance to find the largest number of GPUs amongst architectures
+    auto distance = [](decltype(bestFit) p) { return std::distance(p.first, p.second); };
 
-  auto p2pDevices = getP2PCapableGpuPair();
-  printf("selected devices = %d & %d\n", p2pDevices.first, p2pDevices.second);
-  checkCudaErrors(cudaSetDevice(p2pDevices.first));
-  checkCudaErrors(cudaEventCreate(&waitOnStream1));
-
-  checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));
-
-  // Get the default mempool for device p2pDevices.first from the pair
-  checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, p2pDevices.first));
-
-  // Allocate memory in a stream from the pool set above.
-  checkCudaErrors(cudaMallocAsync(&dev0_srcVec, bytes, stream1));
-
-  checkCudaErrors(
-      cudaMemcpyAsync(dev0_srcVec, a, bytes, cudaMemcpyHostToDevice, stream1));
-  checkCudaErrors(cudaEventRecord(waitOnStream1, stream1));
-
-  checkCudaErrors(cudaSetDevice(p2pDevices.second));
-  checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
-
-  // Allocate memory in p2pDevices.second device
-  checkCudaErrors(cudaMallocAsync(&dev1_dstVec, bytes, stream2));
-
-  // Setup peer mappings for p2pDevices.second device
-  cudaMemAccessDesc desc;
-  memset(&desc, 0, sizeof(cudaMemAccessDesc));
-  desc.location.type = cudaMemLocationTypeDevice;
-  desc.location.id = p2pDevices.second;
-  desc.flags = cudaMemAccessFlagsProtReadWrite;
-  checkCudaErrors(cudaMemPoolSetAccess(memPool, &desc, 1));
-
-  printf("> copyP2PAndScale kernel running ...\n");
-  dim3 block(256);
-  dim3 grid((unsigned int)ceil(nelem / (int)block.x));
-  checkCudaErrors(cudaStreamWaitEvent(stream2, waitOnStream1));
-  copyP2PAndScale<<<grid, block, 0, stream2>>>(dev0_srcVec, dev1_dstVec, nelem);
-
-  checkCudaErrors(cudaMemcpyAsync(output, dev1_dstVec, bytes,
-                                  cudaMemcpyDeviceToHost, stream2));
-  checkCudaErrors(cudaFreeAsync(dev0_srcVec, stream2));
-  checkCudaErrors(cudaFreeAsync(dev1_dstVec, stream2));
-  checkCudaErrors(cudaStreamSynchronize(stream2));
-
-  /* Compare the results */
-  printf("> Checking the results from copyP2PAndScale() ...\n");
-
-  for (int n = 0; n < nelem; n++) {
-    if ((2 * a[n]) != output[n]) {
-      printf("mismatch i = %d expected = %d val = %d\n", n, 2 * a[n],
-             output[n]);
-      return EXIT_FAILURE;
+    // Read each unique key/pair element in order
+    for (; it != end; it = gpusByArch.upper_bound(it->first)) {
+        // first and second are iterators bounded within the architecture group
+        auto testFit = gpusByArch.equal_range(it->first);
+        // Always use devices with highest architecture version or whichever has the
+        // most devices available
+        if (distance(bestFit) <= distance(testFit))
+            bestFit = testFit;
    }
-  }
-  free(a);
-  free(output);
-  checkCudaErrors(cudaStreamDestroy(stream1));
-  checkCudaErrors(cudaStreamDestroy(stream2));
-  printf("PASSED\n");
+    if (distance(bestFit) < kNumGpusRequired) {
+        printf("No two or more GPUs with the same architecture and CUDA Memory "
+               "Pool support found."
+               "\nWaiving the sample\n");
+        exit(EXIT_WAIVED);
+    }
 
-  return EXIT_SUCCESS;
+    std::set<int> bestFitDeviceIds;
+
+    // check & select peer-to-peer access capable GPU devices.
+    int devIds[2];
+    for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
+        int deviceId = itr->second;
+        checkCudaErrors(cudaSetDevice(deviceId));
+
+        std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds, &kNumGpusRequired](decltype(*itr) mapPair) {
+            if (deviceId != mapPair.second) {
+                int access = 0;
+                checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
+                printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second);
+                if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
+                    bestFitDeviceIds.emplace(deviceId);
+                    bestFitDeviceIds.emplace(mapPair.second);
+                }
+                else {
+                    printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
+                }
+            }
+        });
+
+        if (bestFitDeviceIds.size() >= kNumGpusRequired) {
+            printf("Selected p2p capable devices - ");
+            int i = 0;
+            for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++) {
+                devIds[i++] = *devicesItr;
+                printf("deviceId = %d ", *devicesItr);
+            }
+            printf("\n");
+            break;
+        }
+    }
+
+    // If bestFitDeviceIds is empty, the GPUs in the system are not p2p capable,
+    // so we waive the sample.
+    if (!bestFitDeviceIds.size()) {
+        printf("No two or more p2p capable devices found, exiting...\n");
+        exit(EXIT_WAIVED);
+    }
+
+    auto p2pGpuPair = std::make_pair(devIds[0], devIds[1]);
+
+    return p2pGpuPair;
 }
 
-int main(int argc, char **argv) {
-  int ret = memPoolP2PCopy();
-  return ret;
+int memPoolP2PCopy()
+{
+    int *dev0_srcVec, *dev1_dstVec; // Device buffers
+    cudaStream_t stream1, stream2;
+    cudaMemPool_t memPool;
+    cudaEvent_t waitOnStream1;
+
+    // Allocate CPU memory.
+    size_t nelem = 1048576;
+    size_t bytes = nelem * sizeof(int);
+
+    int *a = (int *)malloc(bytes);
+    int *output = (int *)malloc(bytes);
+
+    /* Initialize the vectors. */
+    for (int n = 0; n < nelem; n++) {
+        a[n] = rand() / (int)RAND_MAX;
+    }
+
+    auto p2pDevices = getP2PCapableGpuPair();
+    printf("selected devices = %d & %d\n", p2pDevices.first, p2pDevices.second);
+    checkCudaErrors(cudaSetDevice(p2pDevices.first));
+    checkCudaErrors(cudaEventCreate(&waitOnStream1));
+
+    checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));
+
+    // Get the default mempool for device p2pDevices.first from the pair
+    checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, p2pDevices.first));
+
+    // Allocate memory in a stream from the pool set above.
+    checkCudaErrors(cudaMallocAsync(&dev0_srcVec, bytes, stream1));
+
+    checkCudaErrors(cudaMemcpyAsync(dev0_srcVec, a, bytes, cudaMemcpyHostToDevice, stream1));
+    checkCudaErrors(cudaEventRecord(waitOnStream1, stream1));
+
+    checkCudaErrors(cudaSetDevice(p2pDevices.second));
+    checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
+
+    // Allocate memory in p2pDevices.second device
+    checkCudaErrors(cudaMallocAsync(&dev1_dstVec, bytes, stream2));
+
+    // Setup peer mappings for p2pDevices.second device
+    cudaMemAccessDesc desc;
+    memset(&desc, 0, sizeof(cudaMemAccessDesc));
+    desc.location.type = cudaMemLocationTypeDevice;
+    desc.location.id = p2pDevices.second;
+    desc.flags = cudaMemAccessFlagsProtReadWrite;
+    checkCudaErrors(cudaMemPoolSetAccess(memPool, &desc, 1));
+
+    printf("> copyP2PAndScale kernel running ...\n");
+    dim3 block(256);
+    dim3 grid((unsigned int)ceil(nelem / (int)block.x));
+    checkCudaErrors(cudaStreamWaitEvent(stream2, waitOnStream1));
+    copyP2PAndScale<<<grid, block, 0, stream2>>>(dev0_srcVec, dev1_dstVec, nelem);
+
+    checkCudaErrors(cudaMemcpyAsync(output, dev1_dstVec, bytes, cudaMemcpyDeviceToHost, stream2));
+    checkCudaErrors(cudaFreeAsync(dev0_srcVec, stream2));
+    checkCudaErrors(cudaFreeAsync(dev1_dstVec, stream2));
+    checkCudaErrors(cudaStreamSynchronize(stream2));
+
+    /* Compare the results */
+    printf("> Checking the results from copyP2PAndScale() ...\n");
+
+    for (int n = 0; n < nelem; n++) {
+        if ((2 * a[n]) != output[n]) {
+            printf("mismatch i = %d expected = %d val = %d\n", n, 2 * a[n], output[n]);
+            return EXIT_FAILURE;
+        }
+    }
+
+    free(a);
+    free(output);
+    checkCudaErrors(cudaStreamDestroy(stream1));
+    checkCudaErrors(cudaStreamDestroy(stream2));
+    printf("PASSED\n");
+
+    return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+    int ret = memPoolP2PCopy();
+    return ret;
 }
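Two orderings make the copyP2PAndScale path above safe: cudaMemPoolSetAccess() grants the second device read/write access to the first device's pool-backed allocation, and the event recorded on stream1 is waited on by stream2 so the kernel on device two never reads the source buffer before the host-to-device copy finishes. A hedged sketch of that cross-device dependency, with d0/d1, s0/s1, ev, poolOfD0 and the kernel name all illustrative:

    cudaSetDevice(d0);
    cudaMallocAsync(&src, bytes, s0);                              // s0: stream on d0
    cudaMemcpyAsync(src, host, bytes, cudaMemcpyHostToDevice, s0);
    cudaEventRecord(ev, s0);                                       // marks "src is ready"

    cudaMemAccessDesc desc = {};
    desc.location.type = cudaMemLocationTypeDevice;
    desc.location.id   = d1;
    desc.flags         = cudaMemAccessFlagsProtReadWrite;
    cudaMemPoolSetAccess(poolOfD0, &desc, 1);                      // let d1 touch d0's pool

    cudaSetDevice(d1);
    cudaStreamWaitEvent(s1, ev);                                   // s1 (on d1) waits for the copy
    scaleKernel<<<grid, block, 0, s1>>>(src, dst, n);              // now reads peer memory safely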
diff --git a/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction.cu b/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction.cu
index 32b3d572..26d1b2e9 100644
--- a/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction.cu
+++ b/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction.cu
@@ -48,28 +48,28 @@
     COMMAND LINE ARGUMENTS
 
-    "--shmoo":         Test performance for 1 to 32M elements with each of the
+    "--shmoo":         Test performance for 1 to 32M elements with each of the
                        7 different kernels
     "--n=":            Specify the number of elements to reduce (default 1048576)
     "--threads=":      Specify the number of threads per block (default 128)
-    "--maxblocks=":    Specify the maximum number of thread blocks to launch
+    "--maxblocks=":    Specify the maximum number of thread blocks to launch
                        (kernel 6 only, default 64)
-    "--cpufinal":      Read back the per-block results and do final sum of block
+    "--cpufinal":      Read back the per-block results and do final sum of block
                        sums on CPU (default false)
-    "--cputhresh=":    The threshold of number of blocks sums below which to
+    "--cputhresh=":    The threshold of number of blocks sums below which to
                        perform a CPU final reduction (default 1)
     "--multipass":     Use a multipass reduction instead of a single-pass reduction
 */
 
 // includes, system
-#include
-#include
-#include
 #include
+#include
+#include
+#include
 
 // includes, project
-#include
 #include
+#include
 
 #define VERSION_MAJOR (CUDART_VERSION / 1000)
 #define VERSION_MINOR (CUDART_VERSION % 100) / 10
@@ -86,51 +86,52 @@ const char *sSDKsample = "threadFenceReduction";
 // declaration, forward
 bool runTest(int argc, char **argv);
 
-extern "C" {
-void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata);
-void reduceSinglePass(int size, int threads, int blocks, float *d_idata,
-                      float *d_odata);
+extern "C"
+{
+    void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata);
+    void reduceSinglePass(int size, int threads, int blocks, float *d_idata, float *d_odata);
 }
 
 #if CUDART_VERSION < 2020
-void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata) {
-  printf("reduce(), compiler not supported, aborting tests\n");
+void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata)
+{
+    printf("reduce(), compiler not supported, aborting tests\n");
 }
 
-void reduceSinglePass(int size, int threads, int blocks, float *d_idata,
-                      float *d_odata) {
-  printf("reduceSinglePass(), compiler not supported, aborting tests\n");
+void reduceSinglePass(int size, int threads, int blocks, float *d_idata, float *d_odata)
+{
+    printf("reduceSinglePass(), compiler not supported, aborting tests\n");
 }
 #endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  cudaDeviceProp deviceProp;
-  deviceProp.major = 0;
-  deviceProp.minor = 0;
-  int dev;
+int main(int argc, char **argv)
+{
+    cudaDeviceProp deviceProp;
+    deviceProp.major = 0;
+    deviceProp.minor = 0;
+    int dev;
 
-  printf("%s Starting...\n\n", sSDKsample);
+    printf("%s Starting...\n\n", sSDKsample);
 
-  dev = findCudaDevice(argc, (const char **)argv);
+    dev = findCudaDevice(argc, (const char **)argv);
 
-  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
 
-  printf("GPU Device supports SM %d.%d compute capability\n\n",
-         deviceProp.major, deviceProp.minor);
+    printf("GPU Device supports SM %d.%d compute capability\n\n", deviceProp.major, deviceProp.minor);
 
-  bool bTestResult = false;
+    bool bTestResult = false;
 
 #if CUDART_VERSION >= 2020
-  bTestResult = runTest(argc, argv);
+    bTestResult = runTest(argc, argv);
 #else
-  print_NVCC_min_spec(sSDKsample, "2.2", "Version 185");
-  exit(EXIT_SUCCESS);
+    print_NVCC_min_spec(sSDKsample, "2.2", "Version 185");
+    exit(EXIT_SUCCESS);
 #endif
 
-  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -141,137 +142,147 @@ int main(int argc, char **argv) {
 //! @param data pointer to input data
 //! @param size number of input data elements
 ////////////////////////////////////////////////////////////////////////////////
-template <class T>
-T reduceCPU(T *data, int size) {
-  T sum = data[0];
-  T c = (T)0.0;
+template <class T> T reduceCPU(T *data, int size)
+{
+    T sum = data[0];
+    T c = (T)0.0;
 
-  for (int i = 1; i < size; i++) {
-    T y = data[i] - c;
-    T t = sum + y;
-    c = (t - sum) - y;
-    sum = t;
-  }
+    for (int i = 1; i < size; i++) {
+        T y = data[i] - c;
+        T t = sum + y;
+        c = (t - sum) - y;
+        sum = t;
+    }
 
-  return sum;
+    return sum;
 }
 
-unsigned int nextPow2(unsigned int x) {
-  --x;
-  x |= x >> 1;
-  x |= x >> 2;
-  x |= x >> 4;
-  x |= x >> 8;
-  x |= x >> 16;
-  return ++x;
+unsigned int nextPow2(unsigned int x)
+{
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+    return ++x;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Compute the number of threads and blocks to use for the reduction
 // We set threads / block to the minimum of maxThreads and n/2.
 ////////////////////////////////////////////////////////////////////////////////
-void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks,
-                            int &threads) {
-  if (n == 1) {
-    threads = 1;
-    blocks = 1;
-  } else {
-    threads = (n < maxThreads * 2) ? nextPow2(n / 2) : maxThreads;
-    blocks = max(1, n / (threads * 2));
-  }
+void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
+{
+    if (n == 1) {
+        threads = 1;
+        blocks = 1;
+    }
+    else {
+        threads = (n < maxThreads * 2) ? nextPow2(n / 2) : maxThreads;
+        blocks = max(1, n / (threads * 2));
+    }
 
-  blocks = min(maxBlocks, blocks);
+    blocks = min(maxBlocks, blocks);
 }
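Worked through with this sample's defaults, getNumBlocksAndThreads() behaves as follows; the numbers below just trace the code above:

    int blocks, threads;
    // n = 1048576, maxBlocks = 64, maxThreads = 128
    getNumBlocksAndThreads(1048576, 64, 128, blocks, threads);
    // n >= 2 * maxThreads, so threads = 128 (each thread first sums two elements);
    // n / (threads * 2) = 4096, clamped to maxBlocks: blocks = 64.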
 ////////////////////////////////////////////////////////////////////////////////
 // This function performs a reduction of the input data multiple times and
 // measures the average reduction time.
 ////////////////////////////////////////////////////////////////////////////////
-float benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads,
-                      int maxBlocks, int testIterations, bool multiPass,
-                      bool cpuFinalReduction, int cpuFinalThreshold,
-                      StopWatchInterface *timer, float *h_odata, float *d_idata,
-                      float *d_odata) {
-  float gpu_result = 0;
-  bool bNeedReadback = true;
-  cudaError_t error;
+float benchmarkReduce(int n,
+                      int numThreads,
+                      int numBlocks,
+                      int maxThreads,
+                      int maxBlocks,
+                      int testIterations,
+                      bool multiPass,
+                      bool cpuFinalReduction,
+                      int cpuFinalThreshold,
+                      StopWatchInterface *timer,
+                      float *h_odata,
+                      float *d_idata,
+                      float *d_odata)
+{
+    float gpu_result = 0;
+    bool bNeedReadback = true;
+    cudaError_t error;
 
-  for (int i = 0; i < testIterations; ++i) {
-    gpu_result = 0;
-    unsigned int retCnt = 0;
-    error = setRetirementCount(retCnt);
-    checkCudaErrors(error);
-
-    cudaDeviceSynchronize();
-    sdkStartTimer(&timer);
-
-    if (multiPass) {
-      // execute the kernel
-      reduce(n, numThreads, numBlocks, d_idata, d_odata);
-
-      // check if kernel execution generated an error
-      getLastCudaError("Kernel execution failed");
-
-      if (cpuFinalReduction) {
-        // sum partial sums from each block on CPU
-        // copy result from device to host
-        error = cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(float),
-                           cudaMemcpyDeviceToHost);
+    for (int i = 0; i < testIterations; ++i) {
+        gpu_result = 0;
+        unsigned int retCnt = 0;
+        error = setRetirementCount(retCnt);
        checkCudaErrors(error);
 
+        cudaDeviceSynchronize();
+        sdkStartTimer(&timer);
+
+        if (multiPass) {
+            // execute the kernel
+            reduce(n, numThreads, numBlocks, d_idata, d_odata);
+
+            // check if kernel execution generated an error
+            getLastCudaError("Kernel execution failed");
+
+            if (cpuFinalReduction) {
+                // sum partial sums from each block on CPU
+                // copy result from device to host
+                error = cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
+                checkCudaErrors(error);
+
+                for (int i = 0; i < numBlocks; i++) {
+                    gpu_result += h_odata[i];
+                }
+
+                bNeedReadback = false;
+            }
+            else {
+                // sum partial block sums on GPU
+                int s = numBlocks;
+
+                while (s > cpuFinalThreshold) {
+                    int threads = 0, blocks = 0;
+                    getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
+
+                    reduce(s, threads, blocks, d_odata, d_odata);
+
+                    s = s / (threads * 2);
+                }
+
+                if (s > 1) {
+                    // copy result from device to host
+                    error = cudaMemcpy(h_odata, d_odata, s * sizeof(float), cudaMemcpyDeviceToHost);
+                    checkCudaErrors(error);
+
+                    for (int i = 0; i < s; i++) {
+                        gpu_result += h_odata[i];
+                    }
+
+                    bNeedReadback = false;
+                }
+            }
+        }
+        else {
+            getLastCudaError("Kernel execution failed");
+
+            // execute the kernel
+            reduceSinglePass(n, numThreads, numBlocks, d_idata, d_odata);
+
+            // check if kernel execution generated an error
+            getLastCudaError("Kernel execution failed");
        }
 
-        for (int i = 0; i < numBlocks; i++) {
-          gpu_result += h_odata[i];
-        }
-
-        bNeedReadback = false;
-      } else {
-        // sum partial block sums on GPU
-        int s = numBlocks;
-
-        while (s > cpuFinalThreshold) {
-          int threads = 0, blocks = 0;
-          getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
-
-          reduce(s, threads, blocks, d_odata, d_odata);
-
-          s = s / (threads * 2);
-        }
-
-        if (s > 1) {
-          // copy result from device to host
-          error = cudaMemcpy(h_odata, d_odata, s * sizeof(float),
-                             cudaMemcpyDeviceToHost);
-          checkCudaErrors(error);
-
-          for (int i = 0; i < s; i++) {
-            gpu_result += h_odata[i];
-          }
-
-          bNeedReadback = false;
-        }
-      }
-    }
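The single-pass branch above relies on the idiom that names this sample: every block publishes its partial sum, issues __threadfence() so the write is visible device-wide, then takes an atomic ticket; the last block to finish folds all partial sums in the same launch, so no second kernel is needed (benchmarkReduce()'s setRetirementCount() resets that ticket between iterations). A minimal sketch of the device-side pattern; the names and per-block details are illustrative, not the sample's actual kernels:

    __device__ unsigned int retirementCount = 0;

    __global__ void reduceSinglePassSketch(const float *in, float *partial, float *out, int n)
    {
        // ... each block reduces its slice of 'in' into partial[blockIdx.x] ...

        __threadfence(); // make this block's partial sum visible to all blocks

        __shared__ bool amLast;
        if (threadIdx.x == 0) {
            unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
            amLast = (ticket == gridDim.x - 1); // true only in the last block to finish
        }
        __syncthreads();

        if (amLast) {
            // Fold the partial sums; shown serially for clarity, the real
            // kernels do another block-wide tree reduction here.
            if (threadIdx.x == 0) {
                float sum = 0.f;
                for (unsigned int i = 0; i < gridDim.x; i++)
                    sum += partial[i];
                out[0] = sum;
                retirementCount = 0; // reset for the next launch
            }
        }
    }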
-    } else {
-      getLastCudaError("Kernel execution failed");
-
-      // execute the kernel
-      reduceSinglePass(n, numThreads, numBlocks, d_idata, d_odata);
-
-      // check if kernel execution generated an error
-      getLastCudaError("Kernel execution failed");
+        cudaDeviceSynchronize();
+        sdkStopTimer(&timer);
    }
 
-    cudaDeviceSynchronize();
-    sdkStopTimer(&timer);
-  }
+    if (bNeedReadback) {
+        // copy final sum from device to host
+        error = cudaMemcpy(&gpu_result, d_odata, sizeof(float), cudaMemcpyDeviceToHost);
+        checkCudaErrors(error);
+    }
 
-  if (bNeedReadback) {
-    // copy final sum from device to host
-    error =
-        cudaMemcpy(&gpu_result, d_odata, sizeof(float), cudaMemcpyDeviceToHost);
-    checkCudaErrors(error);
-  }
-
-  return gpu_result;
+    return gpu_result;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -280,186 +291,203 @@ float benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads,
 // for generating a "shmoo" plot showing the performance for each kernel
 // variation over a wide range of input sizes.
 ////////////////////////////////////////////////////////////////////////////////
-void shmoo(int minN, int maxN, int maxThreads, int maxBlocks) {
-  // create random input data on CPU
-  unsigned int bytes = maxN * sizeof(float);
-
-  float *h_idata = (float *)malloc(bytes);
-
-  for (int i = 0; i < maxN; i++) {
-    // Keep the numbers small so we don't get truncation error in the sum
-    h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
-  }
-
-  int maxNumBlocks = min(65535, maxN / maxThreads);
-
-  // allocate mem for the result on host side
-  float *h_odata = (float *)malloc(maxNumBlocks * sizeof(float));
-
-  // allocate device memory and data
-  float *d_idata = NULL;
-  float *d_odata = NULL;
-
-  checkCudaErrors(cudaMalloc((void **)&d_idata, bytes));
-  checkCudaErrors(cudaMalloc((void **)&d_odata, maxNumBlocks * sizeof(float)));
-
-  // copy data directly to device memory
-  checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
-  checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(float),
-                             cudaMemcpyHostToDevice));
-
-  // warm-up
-  reduce(maxN, maxThreads, maxNumBlocks, d_idata, d_odata);
-  int testIterations = 100;
-
-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
-
-  // print headers
-  printf("N, %d blocks one pass, %d blocks multipass\n", maxBlocks, maxBlocks);
-
-  for (int i = minN; i <= maxN; i *= 2) {
-    printf("%d, ", i);
-
-    for (int multiPass = 0; multiPass <= 1; multiPass++) {
-      sdkResetTimer(&timer);
-      int numBlocks = 0;
-      int numThreads = 0;
-      getNumBlocksAndThreads(i, maxBlocks, maxThreads, numBlocks, numThreads);
-
-      benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks,
-                      testIterations, multiPass == 1, false, 1, timer, h_odata,
-                      d_idata, d_odata);
-
-      float reduceTime = sdkGetAverageTimerValue(&timer);
-      printf("%f%s", reduceTime, multiPass == 0 ? ", " : "\n");
-    }
-  }
-
-  printf("\n");
-
-  // cleanup
-  sdkDeleteTimer(&timer);
-  free(h_idata);
-  free(h_odata);
-
-  cudaFree(d_idata);
-  cudaFree(d_odata);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// The main function which runs the reduction test.
-////////////////////////////////////////////////////////////////////////////////
-bool runTest(int argc, char **argv) {
-  int size = 1 << 20;    // number of elements to reduce
-  int maxThreads = 128;  // number of threads per block
-  int maxBlocks = 64;
-  bool cpuFinalReduction = false;
-  int cpuFinalThreshold = 1;
-  bool multipass = false;
-  bool bTestResult = false;
-
-  if (checkCmdLineFlag(argc, (const char **)argv, "n")) {
-    size = getCmdLineArgumentInt(argc, (const char **)argv, "n");
-  }
-
-  if (checkCmdLineFlag(argc, (const char **)argv, "threads")) {
-    maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads");
-  }
-
-  if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) {
-    maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks");
-  }
-
-  printf("%d elements\n", size);
-  printf("%d threads (max)\n", maxThreads);
-
-  cpuFinalReduction = checkCmdLineFlag(argc, (const char **)argv, "cpufinal");
-  multipass = checkCmdLineFlag(argc, (const char **)argv, "multipass");
-
-  if (checkCmdLineFlag(argc, (const char **)argv, "cputhresh")) {
-    cpuFinalThreshold =
-        getCmdLineArgumentInt(argc, (const char **)argv, "cputhresh");
-  }
-
-  bool runShmoo = checkCmdLineFlag(argc, (const char **)argv, "shmoo");
-
-  if (runShmoo) {
-    shmoo(1, 33554432, maxThreads, maxBlocks);
-  } else {
+void shmoo(int minN, int maxN, int maxThreads, int maxBlocks)
+{
     // create random input data on CPU
-    unsigned int bytes = size * sizeof(float);
+    unsigned int bytes = maxN * sizeof(float);
 
     float *h_idata = (float *)malloc(bytes);
 
-    for (int i = 0; i < size; i++) {
-      // Keep the numbers small so we don't get truncation error in the sum
-      h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
+    for (int i = 0; i < maxN; i++) {
+        // Keep the numbers small so we don't get truncation error in the sum
+        h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
    }
 
-    int numBlocks = 0;
-    int numThreads = 0;
-    getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
-
-    if (numBlocks == 1) {
-      cpuFinalThreshold = 1;
-    }
+    int maxNumBlocks = min(65535, maxN / maxThreads);
 
     // allocate mem for the result on host side
-    float *h_odata = (float *)malloc(numBlocks * sizeof(float));
-
-    printf("%d blocks\n", numBlocks);
+    float *h_odata = (float *)malloc(maxNumBlocks * sizeof(float));
 
     // allocate device memory and data
     float *d_idata = NULL;
     float *d_odata = NULL;
 
     checkCudaErrors(cudaMalloc((void **)&d_idata, bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&d_odata, maxNumBlocks * sizeof(float)));
 
     // copy data directly to device memory
-    checkCudaErrors(
-        cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(float),
-                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(float), cudaMemcpyHostToDevice));
 
     // warm-up
-    reduce(size, numThreads, numBlocks, d_idata, d_odata);
+    reduce(maxN, maxThreads, maxNumBlocks, d_idata, d_odata);
     int testIterations = 100;
 
-    StopWatchInterface *timer = 0;
+    StopWatchInterface *timer = NULL;
     sdkCreateTimer(&timer);
 
-    float gpu_result = 0;
+    // print headers
+    printf("N, %d blocks one pass, %d blocks multipass\n", maxBlocks, maxBlocks);
 
-    gpu_result =
-        benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks,
-                        testIterations, multipass, cpuFinalReduction,
-                        cpuFinalThreshold, timer, h_odata, d_idata, d_odata);
+    for (int i = minN; i <= maxN; i *= 2) {
+        printf("%d, ", i);
 
-    float reduceTime = sdkGetAverageTimerValue(&timer);
-    printf("Average time: %f ms\n", reduceTime);
-    printf("Bandwidth:    %f GB/s\n\n",
-           (size * sizeof(int)) / (reduceTime * 1.0e6));
+        for (int multiPass = 0; multiPass <= 1; multiPass++) {
+            sdkResetTimer(&timer);
+            int numBlocks = 0;
+            int numThreads = 0;
+            getNumBlocksAndThreads(i, maxBlocks, maxThreads, numBlocks, numThreads);
 
-    // compute reference solution
-    float cpu_result = reduceCPU<float>(h_idata, size);
+            benchmarkReduce(i,
+                            numThreads,
+                            numBlocks,
+                            maxThreads,
+                            maxBlocks,
+                            testIterations,
+                            multiPass == 1,
+                            false,
+                            1,
+                            timer,
+                            h_odata,
+                            d_idata,
+                            d_odata);
 
-    printf("GPU result = %0.12f\n", gpu_result);
-    printf("CPU result = %0.12f\n", cpu_result);
+            float reduceTime = sdkGetAverageTimerValue(&timer);
+            printf("%f%s", reduceTime, multiPass == 0 ? ", " : "\n");
+        }
+    }
 
-    double threshold = 1e-8 * size;
-    double diff = abs((double)gpu_result - (double)cpu_result);
-    bTestResult = (diff < threshold);
+    printf("\n");
 
     // cleanup
     sdkDeleteTimer(&timer);
-    free(h_idata);
     free(h_odata);
+
     cudaFree(d_idata);
     cudaFree(d_odata);
-  }
-
-  return bTestResult;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// The main function which runs the reduction test.
+////////////////////////////////////////////////////////////////////////////////
+bool runTest(int argc, char **argv)
+{
+    int size = 1 << 20;   // number of elements to reduce
+    int maxThreads = 128; // number of threads per block
+    int maxBlocks = 64;
+    bool cpuFinalReduction = false;
+    int cpuFinalThreshold = 1;
+    bool multipass = false;
+    bool bTestResult = false;
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "n")) {
+        size = getCmdLineArgumentInt(argc, (const char **)argv, "n");
+    }
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "threads")) {
+        maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads");
+    }
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) {
+        maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks");
+    }
+
+    printf("%d elements\n", size);
+    printf("%d threads (max)\n", maxThreads);
+
+    cpuFinalReduction = checkCmdLineFlag(argc, (const char **)argv, "cpufinal");
+    multipass = checkCmdLineFlag(argc, (const char **)argv, "multipass");
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "cputhresh")) {
+        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **)argv, "cputhresh");
+    }
+
+    bool runShmoo = checkCmdLineFlag(argc, (const char **)argv, "shmoo");
+
+    if (runShmoo) {
+        shmoo(1, 33554432, maxThreads, maxBlocks);
+    }
+    else {
+        // create random input data on CPU
+        unsigned int bytes = size * sizeof(float);
+
+        float *h_idata = (float *)malloc(bytes);
+
+        for (int i = 0; i < size; i++) {
+            // Keep the numbers small so we don't get truncation error in the sum
+            h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
+        }
+
+        int numBlocks = 0;
+        int numThreads = 0;
+        getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
+
+        if (numBlocks == 1) {
+            cpuFinalThreshold = 1;
+        }
+
+        // allocate mem for the result on host side
+        float *h_odata = (float *)malloc(numBlocks * sizeof(float));
+
+        printf("%d blocks\n", numBlocks);
+
+        // allocate device memory and data
+        float *d_idata = NULL;
+        float *d_odata = NULL;
+
+        checkCudaErrors(cudaMalloc((void **)&d_idata, bytes));
+        checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(float)));
+
+        // copy data
directly to device memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(float), cudaMemcpyHostToDevice)); + + // warm-up + reduce(size, numThreads, numBlocks, d_idata, d_odata); + int testIterations = 100; + + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); + + float gpu_result = 0; + + gpu_result = benchmarkReduce(size, + numThreads, + numBlocks, + maxThreads, + maxBlocks, + testIterations, + multipass, + cpuFinalReduction, + cpuFinalThreshold, + timer, + h_odata, + d_idata, + d_odata); + + float reduceTime = sdkGetAverageTimerValue(&timer); + printf("Average time: %f ms\n", reduceTime); + printf("Bandwidth: %f GB/s\n\n", (size * sizeof(int)) / (reduceTime * 1.0e6)); + + // compute reference solution + float cpu_result = reduceCPU(h_idata, size); + + printf("GPU result = %0.12f\n", gpu_result); + printf("CPU result = %0.12f\n", cpu_result); + + double threshold = 1e-8 * size; + double diff = abs((double)gpu_result - (double)cpu_result); + bTestResult = (diff < threshold); + + // cleanup + sdkDeleteTimer(&timer); + + free(h_idata); + free(h_odata); + cudaFree(d_idata); + cudaFree(d_odata); + } + + return bTestResult; } diff --git a/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction_kernel.cuh b/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction_kernel.cuh index e56d627e..5c553feb 100644 --- a/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction_kernel.cuh +++ b/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction_kernel.cuh @@ -32,8 +32,8 @@ #ifndef _REDUCE_KERNEL_H_ #define _REDUCE_KERNEL_H_ -#include <cuda_runtime.h> #include <cooperative_groups.h> +#include <cuda_runtime.h> namespace cg = cooperative_groups; @@ -51,85 +51,87 @@ namespace cg = cooperative_groups; */ template <unsigned int blockSize> -__device__ void reduceBlock(volatile float *sdata, float mySum, - const unsigned int tid, cg::thread_block cta) { - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - sdata[tid] = mySum; - cg::sync(tile32); - - const int VEC = 32; - const int vid = tid & (VEC - 1); - - float beta = mySum; - float temp; - - for (int i = VEC / 2; i > 0; i >>= 1) { - if (vid < i) { - temp = sdata[tid + i]; - beta += temp; - sdata[tid] = beta; - } +__device__ void reduceBlock(volatile float *sdata, float mySum, const unsigned int tid, cg::thread_block cta) +{ + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + sdata[tid] = mySum; cg::sync(tile32); - } - cg::sync(cta); - if (cta.thread_rank() == 0) { - beta = 0; - for (int i = 0; i < blockDim.x; i += VEC) { - beta += sdata[i]; + const int VEC = 32; + const int vid = tid & (VEC - 1); + + float beta = mySum; + float temp; + + for (int i = VEC / 2; i > 0; i >>= 1) { + if (vid < i) { + temp = sdata[tid + i]; + beta += temp; + sdata[tid] = beta; + } + cg::sync(tile32); } - sdata[0] = beta; - } - cg::sync(cta); + cg::sync(cta); + + if (cta.thread_rank() == 0) { + beta = 0; + for (int i = 0; i < blockDim.x; i += VEC) { + beta += sdata[i]; + } + sdata[0] = beta; + } + cg::sync(cta); } template <unsigned int blockSize, bool nIsPow2> -__device__ void reduceBlocks(const float *g_idata, float *g_odata, - unsigned int n, cg::thread_block cta) { - extern __shared__ float sdata[]; +__device__ void reduceBlocks(const float *g_idata, float *g_odata, unsigned int n, cg::thread_block cta) +{ + extern __shared__ float sdata[]; - // perform first level of reduction, - // reading from global memory, writing to shared memory - unsigned int tid = threadIdx.x; - unsigned 
int i = blockIdx.x * (blockSize * 2) + threadIdx.x; - unsigned int gridSize = blockSize * 2 * gridDim.x; - float mySum = 0; + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x; + unsigned int gridSize = blockSize * 2 * gridDim.x; + float mySum = 0; - // we reduce multiple elements per thread. The number is determined by the - // number of active thread blocks (via gridDim). More blocks will result - // in a larger gridSize and therefore fewer elements per thread - while (i < n) { - mySum += g_idata[i]; + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + while (i < n) { + mySum += g_idata[i]; - // ensure we don't read out of bounds -- this is optimized away for powerOf2 - // sized arrays - if (nIsPow2 || i + blockSize < n) mySum += g_idata[i + blockSize]; + // ensure we don't read out of bounds -- this is optimized away for powerOf2 + // sized arrays + if (nIsPow2 || i + blockSize < n) + mySum += g_idata[i + blockSize]; - i += gridSize; - } + i += gridSize; + } - // do reduction in shared mem - reduceBlock<blockSize>(sdata, mySum, tid, cta); + // do reduction in shared mem + reduceBlock<blockSize>(sdata, mySum, tid, cta); - // write result for this block to global mem - if (tid == 0) g_odata[blockIdx.x] = sdata[0]; + // write result for this block to global mem + if (tid == 0) + g_odata[blockIdx.x] = sdata[0]; } template <unsigned int blockSize, bool nIsPow2> -__global__ void reduceMultiPass(const float *g_idata, float *g_odata, - unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - reduceBlocks<blockSize, nIsPow2>(g_idata, g_odata, n, cta); +__global__ void reduceMultiPass(const float *g_idata, float *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + reduceBlocks<blockSize, nIsPow2>(g_idata, g_odata, n, cta); } // Global variable used by reduceSinglePass to count how many blocks have // finished __device__ unsigned int retirementCount = 0; -cudaError_t setRetirementCount(int retCnt) { - return cudaMemcpyToSymbol(retirementCount, &retCnt, sizeof(unsigned int), 0, - cudaMemcpyHostToDevice); +cudaError_t setRetirementCount(int retCnt) +{ + return cudaMemcpyToSymbol(retirementCount, &retCnt, sizeof(unsigned int), 0, cudaMemcpyHostToDevice); } // This reduction kernel reduces an arbitrary size array in a single kernel @@ -148,59 +150,59 @@ cudaError_t setRetirementCount(int retCnt) { // For more details on the reduction algorithm (notably the multi-pass // approach), see the "reduction" sample in the CUDA SDK. 
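The host-side contract implied by the comment above is small: the retirement counter must be zero before the launch (it is zero-initialized, and the kernel resets it on exit), a single grid is launched, and the final total lands in g_odata[0]. A minimal caller sketch under those assumptions -- sumReduceSinglePass is an illustrative name, not part of the sample; it uses only the extern "C" wrappers declared later in this header:

// Illustrative host-side sketch (CUDA); not part of the patch above.
float sumReduceSinglePass(const float *d_idata, float *d_odata, int n, int threads, int blocks)
{
    checkCudaErrors(setRetirementCount(0));                  // ticket counter must start at zero
    reduceSinglePass(n, threads, blocks, d_idata, d_odata);  // one launch; the last block folds the partials
    float total = 0.0f;
    checkCudaErrors(cudaMemcpy(&total, d_odata, sizeof(float), cudaMemcpyDeviceToHost));
    return total;                                            // g_odata[0] holds the grid-wide sum
}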
template <unsigned int blockSize, bool nIsPow2> -__global__ void reduceSinglePass(const float *g_idata, float *g_odata, - unsigned int n) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // - // PHASE 1: Process all inputs assigned to this block - // +__global__ void reduceSinglePass(const float *g_idata, float *g_odata, unsigned int n) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // + // PHASE 1: Process all inputs assigned to this block + // - reduceBlocks<blockSize, nIsPow2>(g_idata, g_odata, n, cta); + reduceBlocks<blockSize, nIsPow2>(g_idata, g_odata, n, cta); - // - // PHASE 2: Last block finished will process all partial sums - // + // + // PHASE 2: Last block finished will process all partial sums + // - if (gridDim.x > 1) { - const unsigned int tid = threadIdx.x; - __shared__ bool amLast; - extern float __shared__ smem[]; + if (gridDim.x > 1) { + const unsigned int tid = threadIdx.x; + __shared__ bool amLast; + extern float __shared__ smem[]; - // wait until all outstanding memory instructions in this thread are - // finished - __threadfence(); + // wait until all outstanding memory instructions in this thread are + // finished + __threadfence(); - // Thread 0 takes a ticket - if (tid == 0) { - unsigned int ticket = atomicInc(&retirementCount, gridDim.x); - // If the ticket ID is equal to the number of blocks, we are the last - // block! - amLast = (ticket == gridDim.x - 1); + // Thread 0 takes a ticket + if (tid == 0) { + unsigned int ticket = atomicInc(&retirementCount, gridDim.x); + // If the ticket ID is equal to the number of blocks, we are the last + // block! + amLast = (ticket == gridDim.x - 1); + } + + cg::sync(cta); + + // The last block sums the results of all other blocks + if (amLast) { + int i = tid; + float mySum = 0; + + while (i < gridDim.x) { + mySum += g_odata[i]; + i += blockSize; + } + + reduceBlock<blockSize>(smem, mySum, tid, cta); + + if (tid == 0) { + g_odata[0] = smem[0]; + + // reset retirement count so that next run succeeds + retirementCount = 0; + } + } } - - cg::sync(cta); - - // The last block sums the results of all other blocks - if (amLast) { - int i = tid; - float mySum = 0; - - while (i < gridDim.x) { - mySum += g_odata[i]; - i += blockSize; - } - - reduceBlock<blockSize>(smem, mySum, tid, cta); - - if (tid == 0) { - g_odata[0] = smem[0]; - - // reset retirement count so that next run succeeds - retirementCount = 0; - } - } - } } bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); } @@ -208,233 +210,194 @@ bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); } //////////////////////////////////////////////////////////////////////////////// // Wrapper function for kernel launch //////////////////////////////////////////////////////////////////////////////// -extern "C" void reduce(int size, int threads, int blocks, float *d_idata, - float *d_odata) { - dim3 dimBlock(threads, 1, 1); - dim3 dimGrid(blocks, 1, 1); - int smemSize = - (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float); +extern "C" void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata) +{ + dim3 dimBlock(threads, 1, 1); + dim3 dimGrid(blocks, 1, 1); + int smemSize = (threads <= 32) ? 
2 * threads * sizeof(float) : threads * sizeof(float); - // choose which of the optimized versions of reduction to launch - if (isPow2(size)) { - switch (threads) { - case 512: - reduceMultiPass<512, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + // choose which of the optimized versions of reduction to launch + if (isPow2(size)) { + switch (threads) { + case 512: + reduceMultiPass<512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 256: - reduceMultiPass<256, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 256: + reduceMultiPass<256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 128: - reduceMultiPass<128, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 128: + reduceMultiPass<128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 64: - reduceMultiPass<64, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 64: + reduceMultiPass<64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 32: - reduceMultiPass<32, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 32: + reduceMultiPass<32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 16: - reduceMultiPass<16, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 16: + reduceMultiPass<16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 8: - reduceMultiPass<8, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 8: + reduceMultiPass<8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 4: - reduceMultiPass<4, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 4: + reduceMultiPass<4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 2: - reduceMultiPass<2, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 2: + reduceMultiPass<2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 1: - reduceMultiPass<1, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 1: + reduceMultiPass<1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } } - } else { - switch (threads) { - case 512: - reduceMultiPass<512, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + else { + switch (threads) { + case 512: + reduceMultiPass<512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 256: - reduceMultiPass<256, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 256: + reduceMultiPass<256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 128: - reduceMultiPass<128, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 128: + reduceMultiPass<128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 64: - reduceMultiPass<64, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 64: + reduceMultiPass<64, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 32: - reduceMultiPass<32, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 32: + reduceMultiPass<32, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 16: - reduceMultiPass<16, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 16: + reduceMultiPass<16, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 8: - reduceMultiPass<8, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 8: + reduceMultiPass<8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 4: - reduceMultiPass<4, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 4: + reduceMultiPass<4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 2: - reduceMultiPass<2, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 2: + reduceMultiPass<2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 1: - reduceMultiPass<1, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 1: + reduceMultiPass<1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } } - } }
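A note on the shared-memory size picked in the wrapper above: blocks of more than 32 threads get threads * sizeof(float), but blocks of 32 or fewer get twice that. The warp-synchronous loop in reduceBlock reads sdata[tid + i] for i up to VEC / 2 = 16 with only the vid < i guard, so a 16-thread block, for example, still touches indices up to 15 + 16 = 31 and therefore needs 32 float slots; doubling the allocation for small blocks keeps those reads in bounds.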
-extern "C" void reduceSinglePass(int size, int threads, int blocks, - float *d_idata, float *d_odata) { - dim3 dimBlock(threads, 1, 1); - dim3 dimGrid(blocks, 1, 1); - int smemSize = threads * sizeof(float); +extern "C" void reduceSinglePass(int size, int threads, int blocks, float *d_idata, float *d_odata) +{ + dim3 dimBlock(threads, 1, 1); + dim3 dimGrid(blocks, 1, 1); + int smemSize = threads * sizeof(float); - // choose which of the optimized versions of reduction to launch - if (isPow2(size)) { - switch (threads) { - case 512: - reduceSinglePass<512, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + // choose which of the optimized versions of reduction to launch + if (isPow2(size)) { + switch (threads) { + case 512: + reduceSinglePass<512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 256: - reduceSinglePass<256, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 256: + reduceSinglePass<256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 128: - reduceSinglePass<128, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 128: + reduceSinglePass<128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 64: - reduceSinglePass<64, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 64: + reduceSinglePass<64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 32: - reduceSinglePass<32, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 32: + reduceSinglePass<32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 16: - reduceSinglePass<16, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 16: + reduceSinglePass<16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 8: - reduceSinglePass<8, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 8: + reduceSinglePass<8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 4: - reduceSinglePass<4, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 4: + reduceSinglePass<4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 2: - reduceSinglePass<2, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 2: + reduceSinglePass<2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 1: - reduceSinglePass<1, true> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 1: + reduceSinglePass<1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } } - } else { - switch (threads) { - case 512: - reduceSinglePass<512, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + else { + switch (threads) { + case 512: + reduceSinglePass<512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 256: - reduceSinglePass<256, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 256: + reduceSinglePass<256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 128: - reduceSinglePass<128, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 128: + reduceSinglePass<128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 64: - reduceSinglePass<64, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 64: + reduceSinglePass<64, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 32: - reduceSinglePass<32, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 32: + reduceSinglePass<32, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 16: - reduceSinglePass<16, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 16: + reduceSinglePass<16, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 8: - reduceSinglePass<8, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 8: + reduceSinglePass<8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 4: - reduceSinglePass<4, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 4: + reduceSinglePass<4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 2: - 
reduceSinglePass<2, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 2: + reduceSinglePass<2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; - case 1: - reduceSinglePass<1, false> - <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); - break; + case 1: + reduceSinglePass<1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size); + break; + } } - } } -#endif // #ifndef _REDUCE_KERNEL_H_ +#endif // #ifndef _REDUCE_KERNEL_H_ diff --git a/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp b/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp index 12e0a343..96c280aa 100644 --- a/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp +++ b/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp @@ -26,66 +26,66 @@ */ /****************************************************************************** -* -* Module: threadMigration.cpp -* -* Description: -* Simple sample demonstrating multi-GPU/multithread functionality using -* the CUDA Context Management API. This API allows a CUDA context to -* be associated with a CPU process. A host thread may have only one device -* context current at a time. -* -* Refer to the CUDA programming guide 4.5.3.3 on Context Management -* -******************************************************************************/ + * + * Module: threadMigration.cpp + * + * Description: + * Simple sample demonstrating multi-GPU/multithread functionality using + * the CUDA Context Management API. This API allows a CUDA context to + * be associated with a CPU process. A host thread may have only one device + * context current at a time. + * + * Refer to the CUDA programming guide 4.5.3.3 on Context Management + * + ******************************************************************************/ #define MAXTHREADS 256 -#define NUM_INTS 32 +#define NUM_INTS 32 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) // Windows threads use different data structures #include <windows.h> -DWORD rgdwThreadIds[MAXTHREADS]; -HANDLE rghThreads[MAXTHREADS]; +DWORD rgdwThreadIds[MAXTHREADS]; +HANDLE rghThreads[MAXTHREADS]; CRITICAL_SECTION g_cs; #define ENTERCRITICALSECTION EnterCriticalSection(&g_cs); #define LEAVECRITICALSECTION LeaveCriticalSection(&g_cs); -#define STRICMP stricmp +#define STRICMP stricmp #else // Includes POSIX thread headers for Linux thread support #include <pthread.h> #include -pthread_t rghThreads[MAXTHREADS]; +pthread_t rghThreads[MAXTHREADS]; pthread_mutex_t g_mutex; #define ENTERCRITICALSECTION pthread_mutex_lock(&g_mutex); #define LEAVECRITICALSECTION pthread_mutex_unlock(&g_mutex); -#define STRICMP strcasecmp +#define STRICMP strcasecmp #endif -#include -#include +#include #include #include #include - #include -#include +#include +#include using namespace std; int NumThreads; int ThreadLaunchCount; -typedef struct _CUDAContext_st { - CUcontext hcuContext; - CUmodule hcuModule; - CUfunction hcuFunction; - CUdeviceptr dptr; - int deviceID; - int threadNum; +typedef struct _CUDAContext_st +{ + CUcontext hcuContext; + CUmodule hcuModule; + CUfunction hcuFunction; + CUdeviceptr dptr; + int deviceID; + int threadNum; } CUDAContext; CUDAContext g_ThreadParams[MAXTHREADS]; @@ -102,73 +102,76 @@ bool gbAutoQuit = false; bool runTest(int argc, char **argv);
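The context-per-thread idiom the module description above refers to reduces to a push/pop pair around each thread's work. A minimal sketch of that pattern, using the same Driver API calls as ThreadProc below (error handling elided; workOnContext is an illustrative name, not part of the sample):

// Illustrative sketch only -- the real logic lives in ThreadProc below.
void workOnContext(CUcontext ctx)
{
    cuCtxPushCurrent(ctx); // make the context current on this host thread
    // ... cuMemAlloc / cuLaunchKernel / cuMemcpyDtoH run against ctx here ...
    cuCtxPopCurrent(NULL); // release it so another thread can take it
}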
#define CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status) \ - if (dptr) cuMemFree(dptr); \ - if (hcuModule) cuModuleUnload(hcuModule); \ - if (hcuContext) cuCtxDestroy(hcuContext); \ - return status; + if (dptr) \ + cuMemFree(dptr); \ + if (hcuModule) \ + cuModuleUnload(hcuModule); \ + if (hcuContext) \ + cuCtxDestroy(hcuContext); \ + return status; -#define THREAD_QUIT \ - printf("Error\n"); \ - return 0; +#define THREAD_QUIT \ + printf("Error\n"); \ + return 0; // This sample uses the Driver API interface. The CUDA context needs // to be setup and the CUDA module (CUBIN) is built by NVCC -static CUresult InitCUDAContext(CUDAContext *pContext, CUdevice hcuDevice, - int deviceID, char **argv) { - CUcontext hcuContext = 0; - CUmodule hcuModule = 0; - CUfunction hcuFunction = 0; - CUdeviceptr dptr = 0; +static CUresult InitCUDAContext(CUDAContext *pContext, CUdevice hcuDevice, int deviceID, char **argv) +{ + CUcontext hcuContext = 0; + CUmodule hcuModule = 0; + CUfunction hcuFunction = 0; + CUdeviceptr dptr = 0; - // cuCtxCreate: Function works on floating contexts and current context - CUresult status = cuCtxCreate(&hcuContext, 0, hcuDevice); + // cuCtxCreate: Function works on floating contexts and current context + CUresult status = cuCtxCreate(&hcuContext, 0, hcuDevice); - if (CUDA_SUCCESS != status) { - fprintf(stderr, "cuCtxCreate for <device %d> failed %d\n", deviceID, - status); - CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); - } + if (CUDA_SUCCESS != status) { + fprintf(stderr, "cuCtxCreate for <device %d> failed %d\n", deviceID, status); + CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); + } - status = CUDA_ERROR_INVALID_IMAGE; + status = CUDA_ERROR_INVALID_IMAGE; - string module_path, ptx_source; - std::ostringstream fatbin; + string module_path, ptx_source; + std::ostringstream fatbin; - if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { - exit(EXIT_FAILURE); - } else { - printf("> initCUDA loading module: <%s>\n", module_path.c_str()); - } + if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { + exit(EXIT_FAILURE); + } + else { + printf("> initCUDA loading module: <%s>\n", module_path.c_str()); + } - if (!fatbin.str().size()) { - printf("fatbin file empty. exiting..\n"); - exit(EXIT_FAILURE); - } + if (!fatbin.str().size()) { + printf("fatbin file empty. 
exiting..\n"); + exit(EXIT_FAILURE); + } - // Create module from binary file (FATBIN) - checkCudaErrors(cuModuleLoadData(&hcuModule, fatbin.str().c_str())); + // Create module from binary file (FATBIN) + checkCudaErrors(cuModuleLoadData(&hcuModule, fatbin.str().c_str())); - status = cuModuleGetFunction(&hcuFunction, hcuModule, "kernelFunction"); + status = cuModuleGetFunction(&hcuFunction, hcuModule, "kernelFunction"); - if (CUDA_SUCCESS != status) { - fprintf(stderr, "cuModuleGetFunction failed %d\n", status); - CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); - } + if (CUDA_SUCCESS != status) { + fprintf(stderr, "cuModuleGetFunction failed %d\n", status); + CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); + } - // Here we must release the CUDA context from the thread context - status = cuCtxPopCurrent(NULL); + // Here we must release the CUDA context from the thread context + status = cuCtxPopCurrent(NULL); - if (CUDA_SUCCESS != status) { - fprintf(stderr, "cuCtxPopCurrent failed %d\n", status); - CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); - } + if (CUDA_SUCCESS != status) { + fprintf(stderr, "cuCtxPopCurrent failed %d\n", status); + CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); + } - pContext->hcuContext = hcuContext; - pContext->hcuModule = hcuModule; - pContext->hcuFunction = hcuFunction; - pContext->deviceID = deviceID; + pContext->hcuContext = hcuContext; + pContext->hcuModule = hcuModule; + pContext->hcuFunction = hcuFunction; + pContext->deviceID = deviceID; - return CUDA_SUCCESS; + return CUDA_SUCCESS; } // ThreadProc launches the CUDA kernel on a CUDA context. @@ -179,252 +182,256 @@ DWORD WINAPI ThreadProc(CUDAContext *pParams) void *ThreadProc(CUDAContext *pParams) #endif { - int wrong = 0; - int *pInt = 0; + int wrong = 0; + int *pInt = 0; - printf(" - ThreadProc() Launched...\n", - pParams->deviceID, pParams->hcuContext, pParams->threadNum); + printf(" - ThreadProc() Launched...\n", + pParams->deviceID, + pParams->hcuContext, + pParams->threadNum); - // cuCtxPushCurrent: Attach the caller CUDA context to the thread context. - CUresult status = cuCtxPushCurrent(pParams->hcuContext); - - if (CUDA_SUCCESS != status) { - THREAD_QUIT; - } - checkCudaErrors(cuMemAlloc(&pParams->dptr, NUM_INTS * sizeof(int))); - - // There are two ways to launch CUDA kernels via the Driver API. - // In this CUDA Sample, we illustrate both ways to pass parameters - // and specify parameters. By default we use the simpler method. - - if (1) { - // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel - // Launching (simpler method) - void *args[5] = {&pParams->dptr}; - - // new CUDA 4.0 Driver API Kernel launch call - status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, NULL, - args, NULL); + // cuCtxPushCurrent: Attach the caller CUDA context to the thread context. + CUresult status = cuCtxPushCurrent(pParams->hcuContext); if (CUDA_SUCCESS != status) { - fprintf(stderr, "cuLaunch failed %d\n", status); - THREAD_QUIT; + THREAD_QUIT; } - } else { - // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel - // Launching (advanced method) - int offset = 0; - char argBuffer[256]; + checkCudaErrors(cuMemAlloc(&pParams->dptr, NUM_INTS * sizeof(int))); - // pass in launch parameters (not actually de-referencing CUdeviceptr). 
- // CUdeviceptr is storing the value of the parameters - *((CUdeviceptr *)&argBuffer[offset]) = pParams->dptr; - offset += sizeof(CUdeviceptr); + // There are two ways to launch CUDA kernels via the Driver API. + // In this CUDA Sample, we illustrate both ways to pass parameters + // and specify parameters. By default we use the simpler method. - void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, - CU_LAUNCH_PARAM_END}; + if (1) { + // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel + // Launching (simpler method) + void *args[5] = {&pParams->dptr}; - // new CUDA 4.0 Driver API Kernel launch call - status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, 0, NULL, - (void **)&kernel_launch_config); + // new CUDA 4.0 Driver API Kernel launch call + status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, NULL, args, NULL); - if (CUDA_SUCCESS != status) { - fprintf(stderr, "cuLaunch failed %d\n", status); - THREAD_QUIT; + if (CUDA_SUCCESS != status) { + fprintf(stderr, "cuLaunch failed %d\n", status); + THREAD_QUIT; + } } - } + else { + // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel + // Launching (advanced method) + int offset = 0; + char argBuffer[256]; - pInt = (int *)malloc(NUM_INTS * sizeof(int)); + // pass in launch parameters (not actually de-referencing CUdeviceptr). + // CUdeviceptr is storing the value of the parameters + *((CUdeviceptr *)&argBuffer[offset]) = pParams->dptr; + offset += sizeof(CUdeviceptr); - if (!pInt) return 0; + void *kernel_launch_config[5] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END}; - if (CUDA_SUCCESS == - cuMemcpyDtoH(pInt, pParams->dptr, NUM_INTS * sizeof(int))) { - for (int i = 0; i < NUM_INTS; i++) { - if (pInt[i] != 32 - i) { - printf("<CUDA Device=%d, Context=%p, Thread=%d> error [%d]=%d!\n", - pParams->deviceID, pParams->hcuContext, pParams->threadNum, i, - pInt[i]); - wrong++; - } + // new CUDA 4.0 Driver API Kernel launch call + status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, 0, NULL, (void **)&kernel_launch_config); + + if (CUDA_SUCCESS != status) { + fprintf(stderr, "cuLaunch failed %d\n", status); + THREAD_QUIT; + } } - ENTERCRITICALSECTION + pInt = (int *)malloc(NUM_INTS * sizeof(int)); - if (!wrong) ThreadLaunchCount += 1; + if (!pInt) + return 0; - LEAVECRITICALSECTION - } + if (CUDA_SUCCESS == cuMemcpyDtoH(pInt, pParams->dptr, NUM_INTS * sizeof(int))) { + for (int i = 0; i < NUM_INTS; i++) { + if (pInt[i] != 32 - i) { + printf("<CUDA Device=%d, Context=%p, Thread=%d> error [%d]=%d!\n", + pParams->deviceID, + pParams->hcuContext, + pParams->threadNum, + i, + pInt[i]); + wrong++; + } + } - free(pInt); - fflush(stdout); - checkCudaErrors(cuMemFree(pParams->dptr)); + ENTERCRITICALSECTION - // cuCtxPopCurrent: Detach the current CUDA context from the calling thread. - checkCudaErrors(cuCtxPopCurrent(NULL)); + if (!wrong) + ThreadLaunchCount += 1; - printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Finished!\n\n", - pParams->deviceID, pParams->hcuContext, pParams->threadNum); + LEAVECRITICALSECTION + } - return 0; + free(pInt); + fflush(stdout); + checkCudaErrors(cuMemFree(pParams->dptr)); + + // cuCtxPopCurrent: Detach the current CUDA context from the calling thread. + checkCudaErrors(cuCtxPopCurrent(NULL)); + + printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Finished!\n\n", + pParams->deviceID, + pParams->hcuContext, + pParams->threadNum); + + return 0; }
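Distilled from the two branches above: the only difference between the launch paths is how the single CUdeviceptr argument reaches cuLaunchKernel. A side-by-side sketch under the same assumptions (a CUfunction f taking one pointer parameter; error checks omitted; this is illustrative, not the sample's code):

// Path 1: kernelParams -- one void* per kernel argument.
void *args[] = {&dptr};
cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, args, NULL);

// Path 2: a packed parameter buffer handed through the 'extra' argument.
size_t argSize = sizeof(CUdeviceptr);
void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &dptr,
                 CU_LAUNCH_PARAM_BUFFER_SIZE, &argSize,
                 CU_LAUNCH_PARAM_END};
cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, extra);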
-bool FinalErrorCheck(CUDAContext *pContext, int NumThreads, int deviceCount) { - if (ThreadLaunchCount != NumThreads * deviceCount) { - printf("<Expected=%d, Actual=%d> ThreadLaunchCounts(s)\n", - NumThreads * deviceCount, ThreadLaunchCount); - return false; - } else { - for (int iDevice = 0; iDevice < deviceCount; iDevice++) { - // cuCtxDestroy called on current context or a floating context - if (CUDA_SUCCESS != cuCtxDestroy(pContext[iDevice].hcuContext)) +bool FinalErrorCheck(CUDAContext *pContext, int NumThreads, int deviceCount) +{ + if (ThreadLaunchCount != NumThreads * deviceCount) { + printf("<Expected=%d, Actual=%d> ThreadLaunchCounts(s)\n", NumThreads * deviceCount, ThreadLaunchCount); + return false; + } + else { + for (int iDevice = 0; iDevice < deviceCount; iDevice++) { + // cuCtxDestroy called on current context or a floating context + if (CUDA_SUCCESS != cuCtxDestroy(pContext[iDevice].hcuContext)) + return false; + } + + return true; + } +} + +int main(int argc, char **argv) +{ + printf("Starting threadMigration\n"); + + bool bTestResult = runTest(argc, argv); + + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); +} + +bool runTest(int argc, char **argv) +{ + printf("[ threadMigration ] API test...\n"); #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + InitializeCriticalSection(&g_cs); #else + pthread_mutex_init(&g_mutex, NULL); #endif + // By default, we will launch 2 CUDA threads for each device + NumThreads = 2; + + if (argc > 1) { + // If we are doing the QAtest or automated testing, we quit without + // prompting + if (checkCmdLineFlag(argc, (const char **)argv, "qatest") + || checkCmdLineFlag(argc, (const char **)argv, "noprompt")) { + gbAutoQuit = true; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "numthreads")) { + NumThreads = getCmdLineArgumentInt(argc, (const char **)argv, "numthreads"); + + if (NumThreads < 1 || NumThreads > 15) { + printf("Usage: \"threadMigration -n=<threads>\", ranges 1-15\n"); + return 1; + } + } + } + + int deviceCount; + int hcuDevice = 0; + CUresult status; + status = cuInit(0); + + if (CUDA_SUCCESS != status) + return false; + + status = cuDeviceGetCount(&deviceCount); + + if (CUDA_SUCCESS != status) + return false; + + printf("> %d CUDA device(s), %d Thread(s)/device to launch\n\n", deviceCount, NumThreads); + + if (deviceCount == 0) { + return false; + } + + int ihThread = 0; + int ThreadIndex = 0; + + CUDAContext *pContext = (CUDAContext *)malloc(sizeof(CUDAContext) * deviceCount); + + for (int iDevice = 0; iDevice < deviceCount; iDevice++) { + char szName[256]; + status = cuDeviceGet(&hcuDevice, iDevice); + + if (CUDA_SUCCESS != status) + return false; + + status = cuDeviceGetName(szName, 256, hcuDevice); + + if (CUDA_SUCCESS != status) + return false; + + { + int major = 0, minor = 0; + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hcuDevice)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hcuDevice)); + int sharedMemPerBlock; + checkCudaErrors( + cuDeviceGetAttribute(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, hcuDevice)); + int totalConstantMemory; + checkCudaErrors( + cuDeviceGetAttribute(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, hcuDevice)); + int regsPerBlock; + checkCudaErrors( + cuDeviceGetAttribute(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, hcuDevice)); + int clockRate; + checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hcuDevice)); + printf("Device %d: \"%s\" (Compute %d.%d)\n", iDevice, szName, major, minor); + printf("\tsharedMemPerBlock: %d\n", sharedMemPerBlock); + printf("\tconstantMemory : %d\n", totalConstantMemory); + printf("\tregsPerBlock : %d\n", regsPerBlock); + printf("\tclockRate : %d\n", clockRate); + printf("\n"); + } + + if (CUDA_SUCCESS != InitCUDAContext(&pContext[iDevice], hcuDevice, iDevice, argv)) { + return FinalErrorCheck(pContext, NumThreads, deviceCount); + } + else { + for (int iThread = 0; iThread < NumThreads; iThread++, ihThread++) { + g_ThreadParams[ThreadIndex].hcuContext = pContext[iDevice].hcuContext; + g_ThreadParams[ThreadIndex].hcuModule = pContext[iDevice].hcuModule; + g_ThreadParams[ThreadIndex].hcuFunction = pContext[iDevice].hcuFunction; + g_ThreadParams[ThreadIndex].deviceID = pContext[iDevice].deviceID; + g_ThreadParams[ThreadIndex].threadNum = iThread; + // Launch (NumThreads) for each CUDA context #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + rghThreads[ThreadIndex] = CreateThread(NULL, + 0, + (LPTHREAD_START_ROUTINE)ThreadProc, + &g_ThreadParams[ThreadIndex], + 0, + &rgdwThreadIds[ThreadIndex]); +#else // Assume we are running linux + pthread_create( + &rghThreads[ThreadIndex], NULL, (void *(*)(void *))ThreadProc, &g_ThreadParams[ThreadIndex]); #endif + ThreadIndex += 1; + } + } } -int main(int argc, char **argv) { - printf("Starting threadMigration\n"); - - bool bTestResult = runTest(argc, argv); - - exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); -} -bool runTest(int argc, char **argv) { - printf("[ threadMigration ] API test...\n"); #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - InitializeCriticalSection(&g_cs); #else - pthread_mutex_init(&g_mutex, NULL); #endif - // By default, we will launch 2 CUDA threads for each device - NumThreads = 2; - - if (argc > 1) { - // If we are doing the QAtest or automated testing, we quit without - // prompting - if (checkCmdLineFlag(argc, (const char **)argv, "qatest") || - checkCmdLineFlag(argc, (const char **)argv, "noprompt")) { - gbAutoQuit = true; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "numthreads")) { - NumThreads = - getCmdLineArgumentInt(argc, (const char **)argv, "numthreads"); - - if (NumThreads < 1 || NumThreads > 15) { - printf( - "Usage: \"threadMigration -n=<threads>\", ranges 1-15\n"); - return 1; - } - } - } - - int deviceCount; - int hcuDevice = 0; - CUresult status; - status = cuInit(0); - - if (CUDA_SUCCESS != status) return false; - - status = cuDeviceGetCount(&deviceCount); - - if (CUDA_SUCCESS != status) return false; - - printf("> %d CUDA device(s), %d Thread(s)/device to launch\n\n", - deviceCount, NumThreads); - - if (deviceCount == 0) { - return false; - } - - int ihThread = 0; - int ThreadIndex = 0; - - CUDAContext *pContext = - (CUDAContext 
*)malloc(sizeof(CUDAContext) * deviceCount); - - for (int iDevice = 0; iDevice < deviceCount; iDevice++) { - char szName[256]; - status = cuDeviceGet(&hcuDevice, iDevice); - - if (CUDA_SUCCESS != status) return false; - - status = cuDeviceGetName(szName, 256, hcuDevice); - - if (CUDA_SUCCESS != status) return false; - - { - int major = 0, minor = 0; - checkCudaErrors(cuDeviceGetAttribute( - &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hcuDevice)); - checkCudaErrors(cuDeviceGetAttribute( - &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hcuDevice)); - int sharedMemPerBlock; - checkCudaErrors(cuDeviceGetAttribute( - &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - hcuDevice)); - int totalConstantMemory; - checkCudaErrors(cuDeviceGetAttribute( - &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - hcuDevice)); - int regsPerBlock; - checkCudaErrors(cuDeviceGetAttribute( - ®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - hcuDevice)); - int clockRate; - checkCudaErrors(cuDeviceGetAttribute( - &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hcuDevice)); - printf("Device %d: \"%s\" (Compute %d.%d)\n", iDevice, szName, major, - minor); - printf("\tsharedMemPerBlock: %d\n", sharedMemPerBlock); - printf("\tconstantMemory : %d\n", totalConstantMemory); - printf("\tregsPerBlock : %d\n", regsPerBlock); - printf("\tclockRate : %d\n", clockRate); - printf("\n"); - } - - if (CUDA_SUCCESS != - InitCUDAContext(&pContext[iDevice], hcuDevice, iDevice, argv)) { - return FinalErrorCheck(pContext, NumThreads, deviceCount); - } else { - for (int iThread = 0; iThread < NumThreads; iThread++, ihThread++) { - g_ThreadParams[ThreadIndex].hcuContext = pContext[iDevice].hcuContext; - g_ThreadParams[ThreadIndex].hcuModule = pContext[iDevice].hcuModule; - g_ThreadParams[ThreadIndex].hcuFunction = pContext[iDevice].hcuFunction; - g_ThreadParams[ThreadIndex].deviceID = pContext[iDevice].deviceID; - g_ThreadParams[ThreadIndex].threadNum = iThread; - // Launch (NumThreads) for each CUDA context + // Wait until all workers are done #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - rghThreads[ThreadIndex] = CreateThread( - NULL, 0, (LPTHREAD_START_ROUTINE)ThreadProc, - &g_ThreadParams[ThreadIndex], 0, &rgdwThreadIds[ThreadIndex]); -#else // Assume we are running linux - pthread_create(&rghThreads[ThreadIndex], NULL, - (void *(*)(void *))ThreadProc, - &g_ThreadParams[ThreadIndex]); -#endif - ThreadIndex += 1; - } - } - } - - // Wait until all workers are done -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - WaitForMultipleObjects(ThreadIndex, rghThreads, TRUE, INFINITE); + WaitForMultipleObjects(ThreadIndex, rghThreads, TRUE, INFINITE); #else - for (int i = 0; i < ThreadIndex; i++) { - pthread_join(rghThreads[i], NULL); - } + for (int i = 0; i < ThreadIndex; i++) { + pthread_join(rghThreads[i], NULL); + } #endif - bool ret_status = FinalErrorCheck(pContext, NumThreads, deviceCount); - free(pContext); - return ret_status; + bool ret_status = FinalErrorCheck(pContext, NumThreads, deviceCount); + free(pContext); + return ret_status; } diff --git a/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration_kernel.cu b/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration_kernel.cu index 50be0e87..aa0dea41 100644 --- a/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration_kernel.cu +++ b/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration_kernel.cu @@ -25,6 +25,4 @@ * OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -extern "C" __global__ void kernelFunction(int *input) { - input[threadIdx.x] = 32 - threadIdx.x; -} +extern "C" __global__ void kernelFunction(int *input) { input[threadIdx.x] = 32 - threadIdx.x; } diff --git a/Samples/3_CUDA_Features/README.md b/Samples/3_CUDA_Features/README.md index 47b16a4e..74278321 100644 --- a/Samples/3_CUDA_Features/README.md +++ b/Samples/3_CUDA_Features/README.md @@ -76,4 +76,3 @@ This sample demonstrates how using Cooperative Groups (CG) to perform warp aggre ### [graphConditionalNodes](./graphConditionalNodes) Demonstrate the use of CUDA Graph conditional nodes available starting in CUDA 12.4. - diff --git a/Samples/3_CUDA_Features/StreamPriorities/README.md b/Samples/3_CUDA_Features/StreamPriorities/README.md index 9b9d056e..7e9447c5 100644 --- a/Samples/3_CUDA_Features/StreamPriorities/README.md +++ b/Samples/3_CUDA_Features/StreamPriorities/README.md @@ -32,4 +32,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/3_CUDA_Features/StreamPriorities/StreamPriorities.cu b/Samples/3_CUDA_Features/StreamPriorities/StreamPriorities.cu index 5e5c3229..093e51f0 100644 --- a/Samples/3_CUDA_Features/StreamPriorities/StreamPriorities.cu +++ b/Samples/3_CUDA_Features/StreamPriorities/StreamPriorities.cu @@ -30,172 +30,164 @@ // CUDA-C includes #include - #include #define TOTAL_SIZE 256 * 1024 * 1024 -#define EACH_SIZE 128 * 1024 * 1024 +#define EACH_SIZE 128 * 1024 * 1024 // # threadblocks #define TBLOCKS 1024 #define THREADS 512 // throw error on equality -#define ERR_EQ(X, Y) \ - do { \ - if ((X) == (Y)) { \ - fprintf(stderr, "Error in %s at %s:%d\n", __func__, __FILE__, __LINE__); \ - exit(-1); \ - } \ - } while (0) +#define ERR_EQ(X, Y) \ + do { \ + if ((X) == (Y)) { \ + fprintf(stderr, "Error in %s at %s:%d\n", __func__, __FILE__, __LINE__); \ + exit(-1); \ + } \ + } while (0) // throw error on difference -#define ERR_NE(X, Y) \ - do { \ - if ((X) != (Y)) { \ - fprintf(stderr, "Error in %s at %s:%d\n", __func__, __FILE__, __LINE__); \ - exit(-1); \ - } \ - } while (0) +#define ERR_NE(X, Y) \ + do { \ + if ((X) != (Y)) { \ + fprintf(stderr, "Error in %s at %s:%d\n", __func__, __FILE__, __LINE__); \ + exit(-1); \ + } \ + } while (0) // copy from source -> destination arrays -__global__ void memcpy_kernel(int *dst, int *src, size_t n) { - int num = gridDim.x * blockDim.x; - int id = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void memcpy_kernel(int *dst, int *src, size_t n) +{ + int num = gridDim.x * blockDim.x; + int id = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = id; i < n / sizeof(int); i += num) { - dst[i] = src[i]; - } + for (int i = id; i < n / sizeof(int); i += num) { + dst[i] = src[i]; + } } // initialise memory -void mem_init(int *buf, size_t n) { - for (int i = 0; i < n / sizeof(int); i++) { - buf[i] = i; - } +void mem_init(int *buf, size_t n) +{ + for (int i = 0; i < n / sizeof(int); i++) { + buf[i] = i; + } } -int main(int argc, char **argv) { - cudaDeviceProp device_prop; - int dev_id; +int main(int argc, char **argv) +{ + cudaDeviceProp device_prop; + int dev_id; - printf("Starting [%s]...\n", argv[0]); + printf("Starting [%s]...\n", argv[0]); - // set device - dev_id = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); + // set device + dev_id 
= findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); - if ((device_prop.major << 4) + device_prop.minor < 0x35) { - fprintf(stderr, - "%s requires Compute Capability of SM 3.5 or higher to " - "run.\nexiting...\n", - argv[0]); - exit(EXIT_WAIVED); - } + if ((device_prop.major << 4) + device_prop.minor < 0x35) { + fprintf(stderr, + "%s requires Compute Capability of SM 3.5 or higher to " + "run.\nexiting...\n", + argv[0]); + exit(EXIT_WAIVED); + } - // get the range of priorities available - // [ greatest_priority, lowest_priority ] - int priority_low; - int priority_hi; - checkCudaErrors( - cudaDeviceGetStreamPriorityRange(&priority_low, &priority_hi)); + // get the range of priorities available + // [ greatest_priority, lowest_priority ] + int priority_low; + int priority_hi; + checkCudaErrors(cudaDeviceGetStreamPriorityRange(&priority_low, &priority_hi)); - printf("CUDA stream priority range: LOW: %d to HIGH: %d\n", priority_low, - priority_hi); + printf("CUDA stream priority range: LOW: %d to HIGH: %d\n", priority_low, priority_hi); - // create streams with highest and lowest available priorities - cudaStream_t st_low; - cudaStream_t st_hi; - checkCudaErrors(cudaStreamCreateWithPriority(&st_low, cudaStreamNonBlocking, - priority_low)); - checkCudaErrors( - cudaStreamCreateWithPriority(&st_hi, cudaStreamNonBlocking, priority_hi)); + // create streams with highest and lowest available priorities + cudaStream_t st_low; + cudaStream_t st_hi; + checkCudaErrors(cudaStreamCreateWithPriority(&st_low, cudaStreamNonBlocking, priority_low)); + checkCudaErrors(cudaStreamCreateWithPriority(&st_hi, cudaStreamNonBlocking, priority_hi)); - size_t size; - size = TOTAL_SIZE; + size_t size; + size = TOTAL_SIZE; - // initialise host data - int *h_src_low; - int *h_src_hi; - ERR_EQ(h_src_low = (int *)malloc(size), NULL); - ERR_EQ(h_src_hi = (int *)malloc(size), NULL); - mem_init(h_src_low, size); - mem_init(h_src_hi, size); + // initialise host data + int *h_src_low; + int *h_src_hi; + ERR_EQ(h_src_low = (int *)malloc(size), NULL); + ERR_EQ(h_src_hi = (int *)malloc(size), NULL); + mem_init(h_src_low, size); + mem_init(h_src_hi, size); - // initialise device data - int *h_dst_low; - int *h_dst_hi; - ERR_EQ(h_dst_low = (int *)malloc(size), NULL); - ERR_EQ(h_dst_hi = (int *)malloc(size), NULL); - memset(h_dst_low, 0, size); - memset(h_dst_hi, 0, size); + // initialise device data + int *h_dst_low; + int *h_dst_hi; + ERR_EQ(h_dst_low = (int *)malloc(size), NULL); + ERR_EQ(h_dst_hi = (int *)malloc(size), NULL); + memset(h_dst_low, 0, size); + memset(h_dst_hi, 0, size); - // copy source data -> device - int *d_src_low; - int *d_src_hi; - checkCudaErrors(cudaMalloc(&d_src_low, size)); - checkCudaErrors(cudaMalloc(&d_src_hi, size)); - checkCudaErrors( - cudaMemcpy(d_src_low, h_src_low, size, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_src_hi, h_src_hi, size, cudaMemcpyHostToDevice)); + // copy source data -> device + int *d_src_low; + int *d_src_hi; + checkCudaErrors(cudaMalloc(&d_src_low, size)); + checkCudaErrors(cudaMalloc(&d_src_hi, size)); + checkCudaErrors(cudaMemcpy(d_src_low, h_src_low, size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_src_hi, h_src_hi, size, cudaMemcpyHostToDevice)); - // allocate memory for memcopy destination - int *d_dst_low; - int *d_dst_hi; - checkCudaErrors(cudaMalloc(&d_dst_low, size)); - checkCudaErrors(cudaMalloc(&d_dst_hi, size)); + // allocate memory for memcopy destination + int *d_dst_low; + 
int *d_dst_hi; + checkCudaErrors(cudaMalloc(&d_dst_low, size)); + checkCudaErrors(cudaMalloc(&d_dst_hi, size)); - // create some events - cudaEvent_t ev_start_low; - cudaEvent_t ev_start_hi; - cudaEvent_t ev_end_low; - cudaEvent_t ev_end_hi; - checkCudaErrors(cudaEventCreate(&ev_start_low)); - checkCudaErrors(cudaEventCreate(&ev_start_hi)); - checkCudaErrors(cudaEventCreate(&ev_end_low)); - checkCudaErrors(cudaEventCreate(&ev_end_hi)); + // create some events + cudaEvent_t ev_start_low; + cudaEvent_t ev_start_hi; + cudaEvent_t ev_end_low; + cudaEvent_t ev_end_hi; + checkCudaErrors(cudaEventCreate(&ev_start_low)); + checkCudaErrors(cudaEventCreate(&ev_start_hi)); + checkCudaErrors(cudaEventCreate(&ev_end_low)); + checkCudaErrors(cudaEventCreate(&ev_end_hi)); - /* */ + /* */ - // call pair of kernels repeatedly (with different priority streams) - checkCudaErrors(cudaEventRecord(ev_start_low, st_low)); - checkCudaErrors(cudaEventRecord(ev_start_hi, st_hi)); + // call pair of kernels repeatedly (with different priority streams) + checkCudaErrors(cudaEventRecord(ev_start_low, st_low)); + checkCudaErrors(cudaEventRecord(ev_start_hi, st_hi)); - for (int i = 0; i < TOTAL_SIZE; i += EACH_SIZE) { - int j = i / sizeof(int); - memcpy_kernel<<<TBLOCKS, THREADS, 0, st_low>>>(d_dst_low + j, d_src_low + j, - EACH_SIZE); - memcpy_kernel<<<TBLOCKS, THREADS, 0, st_hi>>>(d_dst_hi + j, d_src_hi + j, - EACH_SIZE); - } + for (int i = 0; i < TOTAL_SIZE; i += EACH_SIZE) { + int j = i / sizeof(int); + memcpy_kernel<<<TBLOCKS, THREADS, 0, st_low>>>(d_dst_low + j, d_src_low + j, EACH_SIZE); + memcpy_kernel<<<TBLOCKS, THREADS, 0, st_hi>>>(d_dst_hi + j, d_src_hi + j, EACH_SIZE); + } - checkCudaErrors(cudaEventRecord(ev_end_low, st_low)); - checkCudaErrors(cudaEventRecord(ev_end_hi, st_hi)); + checkCudaErrors(cudaEventRecord(ev_end_low, st_low)); + checkCudaErrors(cudaEventRecord(ev_end_hi, st_hi)); - checkCudaErrors(cudaEventSynchronize(ev_end_low)); - checkCudaErrors(cudaEventSynchronize(ev_end_hi)); + checkCudaErrors(cudaEventSynchronize(ev_end_low)); + checkCudaErrors(cudaEventSynchronize(ev_end_hi)); - /* */ + /* */ - size = TOTAL_SIZE; - checkCudaErrors( - cudaMemcpy(h_dst_low, d_dst_low, size, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_dst_hi, d_dst_hi, size, cudaMemcpyDeviceToHost)); + size = TOTAL_SIZE; + checkCudaErrors(cudaMemcpy(h_dst_low, d_dst_low, size, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_dst_hi, d_dst_hi, size, cudaMemcpyDeviceToHost)); - // check results of kernels - ERR_NE(memcmp(h_dst_low, h_src_low, size), 0); - ERR_NE(memcmp(h_dst_hi, h_src_hi, size), 0); + // check results of kernels + ERR_NE(memcmp(h_dst_low, h_src_low, size), 0); + ERR_NE(memcmp(h_dst_hi, h_src_hi, size), 0); - // check timings - float ms_low; - float ms_hi; - checkCudaErrors(cudaEventElapsedTime(&ms_low, ev_start_low, ev_end_low)); - checkCudaErrors(cudaEventElapsedTime(&ms_hi, ev_start_hi, ev_end_hi)); + // check timings + float ms_low; + float ms_hi; + checkCudaErrors(cudaEventElapsedTime(&ms_low, ev_start_low, ev_end_low)); + checkCudaErrors(cudaEventElapsedTime(&ms_hi, ev_start_hi, ev_end_hi)); - printf("elapsed time of kernels launched to LOW priority stream: %.3lf ms\n", - ms_low); - printf("elapsed time of kernels launched to HI priority stream: %.3lf ms\n", - ms_hi); + printf("elapsed time of kernels launched to LOW priority stream: %.3lf ms\n", ms_low); + printf("elapsed time of kernels launched to HI priority stream: %.3lf ms\n", ms_hi); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } diff --git a/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu 
b/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu index a89e4096..be6b791c 100644 --- a/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu +++ b/Samples/3_CUDA_Features/bf16TensorCoreGemm/bf16TensorCoreGemm.cu @@ -58,15 +58,15 @@ // but carefully enough to avoid local memory use. #include -#include #include +#include #include #include -#include +#include // helper functions and utilities to work with CUDA -#include #include +#include // Externally configurable parameters. @@ -107,7 +107,7 @@ // Implementation constants. -#define WARPS_PER_BLOCK 8 +#define WARPS_PER_BLOCK 8 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) #if SHARED_MEMORY_LIMIT_64K @@ -125,10 +125,10 @@ #define CHUNK_K 8 #endif -#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(__nv_bfloat16)) -#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(__nv_bfloat16)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) #define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) -#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -158,25 +158,24 @@ // we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. #define SKEW_BF16 16 -#define checkKernelErrors(expr) do { \ - expr; \ - \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ - abort(); \ - } \ -} while(0) +#define checkKernelErrors(expr) \ + do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) -enum kernels -{ - bf16mma_shmem_gemm_async_copy = 0, // __nv_bfloat16 MMA shmem using kernel with async_copy - bf16mma_shmem_gemm = 1, // __nv_bfloat16 MMA shmem using kernel normal copy (without async_copy). - simple_bf16mma_gemm = 2 // __nv_bfloat16 MMA non-shmem using simple kernel. +enum kernels { + bf16mma_shmem_gemm_async_copy = 0, // __nv_bfloat16 MMA shmem using kernel with async_copy + bf16mma_shmem_gemm = 1, // __nv_bfloat16 MMA shmem using kernel normal copy (without async_copy). + simple_bf16mma_gemm = 2 // __nv_bfloat16 MMA non-shmem using simple kernel. 
}; -const char* kernelNames[] = {"compute_bf16gemm_async_copy", "compute_bf16gemm", - "simple_wmma_bf16gemm"}; +const char *kernelNames[] = {"compute_bf16gemm_async_copy", "compute_bf16gemm", "simple_wmma_bf16gemm"}; using namespace nvcuda; @@ -184,22 +183,23 @@ __host__ void init_host_matrices(__nv_bfloat16 *a, __nv_bfloat16 *b, float *c) { for (int i = 0; i < M_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - a[i*K_GLOBAL+j] = (__nv_bfloat16)(float)(rand() % 3); + a[i * K_GLOBAL + j] = (__nv_bfloat16)(float)(rand() % 3); } } for (int i = 0; i < N_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - b[i*K_GLOBAL+j] = (__nv_bfloat16)(float)(rand() % 3); + b[i * K_GLOBAL + j] = (__nv_bfloat16)(float)(rand() % 3); } } for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { - c[t] = (float)(rand() % 3); + c[t] = (float)(rand() % 3); } } -__global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, const float *C, float *D, float alpha, float beta) +__global__ void +compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, const float *C, float *D, float alpha, float beta) { #if __CUDA_ARCH__ >= 800 extern __shared__ __nv_bfloat16 shmem[][CHUNK_K * K + SKEW_BF16]; @@ -212,10 +212,11 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; // This pointer is used to access the C and D matrix tiles this warp computes. - float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + float *shmem_warp_tile_ptr = (float *)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. - float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + float *shmem_warp_stream_ptr = (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * N; // Adjust the beta scaler, as it'll be multiplied by alpha at the end of // each tile computation. Technically this is not generally correct (may result @@ -225,7 +226,7 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit. - for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -235,14 +236,14 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. 
#pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = - *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((int4 *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); } __syncthreads(); @@ -266,7 +267,7 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, // Scale the C matrix. #pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { + for (int i = 0; i < WARP_COL_TILES; i++) { #pragma unroll for (int j = 0; j < WARP_ROW_TILES; j++) { #pragma unroll @@ -278,16 +279,19 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const __nv_bfloat16 *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const __nv_bfloat16 *warp_ptr = + (warpId < (WARPS_PER_BLOCK / 2)) + ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); // Go through the global K dimension by a fixed step at a time. #pragma unroll for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. - size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) + ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. @@ -297,9 +301,10 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B, shmem_idx += laneId / CHUNK_COPY_LINE_LANES; #pragma unroll - for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { // Copy 16 bytes at once in each lane. - *((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); + *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = + *((int4 *)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); // Advance the global memory pointer and the shared memory index. 
            lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP;
@@ -316,8 +321,8 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B,
 #pragma unroll
         for (int i = 0; i < WARP_COL_TILES; i++) {
-      size_t shmem_idx_a = (warpId/BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M);
-      const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_a][k_step * K];
+            size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M);
+            const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_a][k_step * K];

            wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_BF16);

@@ -326,7 +331,7 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B,
                 if (i == 0) {
                     // Load the B matrix fragment once, because it is going to be reused
                     // against the other A matrix fragments.
-          size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N);
+                    size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N);
                     const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_b][k_step * K];

                     wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_BF16);
@@ -364,8 +369,8 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B,
 #pragma unroll
         for (int i = 0; i < N; i++) {
-      *((float4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
-          *((float4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
+            *((float4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
+                *((float4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
         }

         __syncthreads();
@@ -373,7 +378,12 @@ __global__ void compute_bf16gemm(const __nv_bfloat16 *A, const __nv_bfloat16 *B,
 #endif
 }

-__global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_bfloat16 *B, const float *C, float *D, float alpha, float beta)
+__global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A,
+                                            const __nv_bfloat16 *B,
+                                            const float *C,
+                                            float *D,
+                                            float alpha,
+                                            float beta)
 {
 #if __CUDA_ARCH__ >= 800
     extern __shared__ __nv_bfloat16 shmem[][CHUNK_K * K + SKEW_BF16];
@@ -386,24 +396,25 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b
     constexpr size_t shmem_idx_b_off = BLOCK_COL_TILES * M;

     // This pointer is used to access the C and D matrix tiles this warp computes.
-  float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET;
+    float *shmem_warp_tile_ptr = (float *)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS +
+                                 (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET;

     // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory.
-  float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N;
+    float *shmem_warp_stream_ptr = (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * N;

     // Adjust the beta scaler, as it'll be multiplied by alpha at the end of
     // each tile computation. Technically this is not generally correct (may result
     // in a loss of precision). Zero still needs to be specially handled though.
     beta /= alpha;

-  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
-  const auto shape4 = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));
-  constexpr int loadStride = 2; // load 4 floats, left-shift by 2.
+    cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
+    const auto shape4 = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));
+    constexpr int loadStride = 2; // load 4 floats, left-shift by 2.
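
For readers new to the libcu++ primitives this kernel relies on: `cuda::memcpy_async` stages a copy through a `cuda::pipeline`, and the producer_acquire/producer_commit plus consumer_wait/consumer_release calls bracket each in-flight batch. A minimal, self-contained sketch of the same single-stage, thread-scope pattern follows; it is an illustration, not part of the patch, and the kernel name `stageTile` and the 128-thread block size are assumptions made up for the example.

    #include <cuda/pipeline>

    // Each thread stages one float4 from global into shared memory through its
    // own single-stage pipeline, then waits for the copy before using the tile.
    // Assumes the kernel is launched with blockDim.x == 128.
    __global__ void stageTile(const float4 *src, float4 *dst)
    {
        __shared__ float4 tile[128];

        cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
        const auto shape = cuda::aligned_size_t<alignof(float4)>(sizeof(float4));

        const int gid = blockIdx.x * blockDim.x + threadIdx.x;

        pipe.producer_acquire();                                        // open a stage
        cuda::memcpy_async(&tile[threadIdx.x], &src[gid], shape, pipe); // queue the 16-byte copy
        pipe.producer_commit();                                         // submit the stage

        pipe.consumer_wait();    // block until this thread's copy has landed
        pipe.consumer_release(); // recycle the stage

        __syncthreads(); // make the whole tile visible to the block
        dst[gid] = tile[threadIdx.x];
    }

On sm_80 and newer the 16-byte aligned shape lets the hardware use the asynchronous copy path that bypasses registers; on older architectures the same call degrades gracefully to a synchronous copy.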
// Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit. - for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -413,7 +424,7 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. @@ -422,8 +433,9 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b pipe.producer_acquire(); cuda::memcpy_async(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i) + (laneId << loadStride)], - &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i) + (laneId << loadStride)], - shape4, pipe); + &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i) + (laneId << loadStride)], + shape4, + pipe); pipe.producer_commit(); } @@ -458,13 +470,15 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b __syncthreads(); // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const __nv_bfloat16 *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const __nv_bfloat16 *warp_ptr = + (warpId < (WARPS_PER_BLOCK / 2)) + ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); - constexpr int chunksPerLane = ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; + constexpr int chunksPerLane = ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; constexpr int loadStrideBfloat8 = 3; // load 8 bfloats, left-shift by 3. - const int laneLoadElem = (laneId % CHUNK_COPY_LINE_LANES) << loadStrideBfloat8; - const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); + const int laneLoadElem = (laneId % CHUNK_COPY_LINE_LANES) << loadStrideBfloat8; + const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); // Go through the global K dimension by a fixed step at a time. #pragma unroll @@ -472,7 +486,8 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. // As for bf16 MMA M == N we use M for warp 4-7 + shmem_idx_b_off. - size_t shmem_idx = (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) + ((warpId / (WARPS_PER_BLOCK/2)) * shmem_idx_b_off); + size_t shmem_idx = + (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + ((warpId / (WARPS_PER_BLOCK / 2)) * shmem_idx_b_off); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. 
@@ -482,7 +497,7 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b shmem_idx += stridePerLaneCopy; #pragma unroll - for(int i = 0; i < chunksPerLane; i++) { + for (int i = 0; i < chunksPerLane; i++) { // Copy 16 bytes at once in each lane. pipe.producer_acquire(); cuda::memcpy_async(&shmem[shmem_idx][laneLoadElem], lane_ptr, shape4, pipe); @@ -502,8 +517,8 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); - const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_a][k_step * K]; wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_BF16); @@ -512,7 +527,7 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b if (i == 0) { // Load the B matrix fragment once, because it is going to be reused // against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); const __nv_bfloat16 *tile_ptr = &shmem[shmem_idx_b][k_step * K]; wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_BF16); @@ -551,8 +566,8 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b #pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } __syncthreads(); @@ -562,73 +577,86 @@ __global__ void compute_bf16gemm_async_copy(const __nv_bfloat16 *A, const __nv_b // Performs an MxNxK bf16 GEMM (C=alpha*A*B + beta*C) assuming: // 1) Matrices are packed in memory. -// 2) M, N and K are multiples of 16, 16 and 16 respectively. +// 2) M, N and K are multiples of 16, 16 and 16 respectively. // 3) A is row major, B is column major matrix. // Note: This is a less performant version of the compute_bf16gemm kernel. It is designed for // demonstration purposes only to show the CUDA WMMA API use without relying on // availability of the shared memory. -__global__ void simple_wmma_bf16gemm(__nv_bfloat16 *a, __nv_bfloat16 *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) +__global__ void simple_wmma_bf16gemm(__nv_bfloat16 *a, + __nv_bfloat16 *b, + float *c, + float *d, + int m_ld, + int n_ld, + int k_ld, + float alpha, + float beta) { #if __CUDA_ARCH__ >= 800 - // Leading dimensions. Packed with no transpositions. + // Leading dimensions. Packed with no transpositions. 
     int lda = k_ld;
     int ldb = k_ld;
     int ldc = n_ld;

-  // Tile using a 2D grid
-  int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
-  int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
-
-  // Declare the fragments
-  wmma::fragment<wmma::matrix_a, M, N, K, __nv_bfloat16, wmma::row_major> a_frag;
-  wmma::fragment<wmma::matrix_b, M, N, K, __nv_bfloat16, wmma::col_major> b_frag;
-  wmma::fragment<wmma::accumulator, M, N, K, float> acc_frag;
-  wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;
+    // Tile using a 2D grid
+    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
+    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

-  wmma::fill_fragment(acc_frag, 0.0f);
+    // Declare the fragments
+    wmma::fragment<wmma::matrix_a, M, N, K, __nv_bfloat16, wmma::row_major> a_frag;
+    wmma::fragment<wmma::matrix_b, M, N, K, __nv_bfloat16, wmma::col_major> b_frag;
+    wmma::fragment<wmma::accumulator, M, N, K, float> acc_frag;
+    wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;

-  // Loop over k
-  for (int i = 0; i < k_ld; i += K) {
-    int aCol = i;
-    int aRow = warpM * M;
+    wmma::fill_fragment(acc_frag, 0.0f);

-    int bCol = i;
-    int bRow = warpN * N;
+    // Loop over k
+    for (int i = 0; i < k_ld; i += K) {
+        int aCol = i;
+        int aRow = warpM * M;

-    // Bounds checking
-    if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
-      // Load the inputs
-      wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
-      wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
-
-      // Perform the matrix multiplication
-      wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
+        int bCol = i;
+        int bRow = warpN * N;

-    }
-  }
+        // Bounds checking
+        if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
+            // Load the inputs
+            wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
+            wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);

-  // Load in the current value of c, scale it by beta, and add this our result scaled by alpha
-  int cCol = warpN * N;
-  int cRow = warpM * M;
+            // Perform the matrix multiplication
+            wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
+        }
+    }

-  if (cRow < m_ld && cCol < n_ld) {
-    wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
+    // Load in the current value of c, scale it by beta, and add this to our result scaled by alpha
+    int cCol = warpN * N;
+    int cRow = warpM * M;

-    for(int i=0; i < c_frag.num_elements; i++) {
-      c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
-    }
+    if (cRow < m_ld && cCol < n_ld) {
+        wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);

-    // Store the output
-    wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
-  }
+        for (int i = 0; i < c_frag.num_elements; i++) {
+            c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
+        }
+
+        // Store the output
+        wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
+    }
 #endif
 }

-__host__ void matMultiplyOnHost(__nv_bfloat16 *A, __nv_bfloat16 *B, float *C,
-                                float alpha, float beta,
-                                int numARows, int numAColumns,
-                                int numBRows, int numBColumns,
-                                int numCRows, int numCColumns)
+__host__ void matMultiplyOnHost(__nv_bfloat16 *A,
+                                __nv_bfloat16 *B,
+                                float *C,
+                                float alpha,
+                                float beta,
+                                int numARows,
+                                int numAColumns,
+                                int numBRows,
+                                int numBColumns,
+                                int numCRows,
+                                int numCColumns)
 {
     for (int i = 0; i < numCRows; i++) {
         for (int j = 0; j < numCColumns; j++) {
@@ -638,7 +666,7 @@ __host__ void matMultiplyOnHost(__nv_bfloat16 *A, __nv_bfloat16 *B, float *C,
                 temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k];
             }

-      C[i*numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
+            C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
         }
     }
 }
@@ -664,29 +692,29 @@ int main(int argc, char **argv)
     __nv_bfloat16 *A_h = NULL;
     __nv_bfloat16 *B_h = NULL;
-    float *C_h = NULL;
+    float         *C_h = NULL;
 #if CPU_DEBUG
-    float *result_hD = NULL;
+    float *result_hD   = NULL;
     float *result_host = NULL;
 #endif

-    A_h = (__nv_bfloat16*) malloc(sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL);
-    B_h = (__nv_bfloat16*) malloc(sizeof(__nv_bfloat16) * K_GLOBAL * N_GLOBAL);
-    C_h = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+    A_h = (__nv_bfloat16 *)malloc(sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL);
+    B_h = (__nv_bfloat16 *)malloc(sizeof(__nv_bfloat16) * K_GLOBAL * N_GLOBAL);
+    C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
 #if CPU_DEBUG
-    result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
-    result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+    result_hD   = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+    result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
 #endif

     __nv_bfloat16 *A = NULL;
     __nv_bfloat16 *B = NULL;
-    float *C = NULL;
-    float *D = NULL;
+    float         *C = NULL;
+    float         *D = NULL;

-    checkCudaErrors(cudaMalloc((void**)&A, sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL));
-    checkCudaErrors(cudaMalloc((void**)&B, sizeof(__nv_bfloat16) * N_GLOBAL * K_GLOBAL));
-    checkCudaErrors(cudaMalloc((void**)&C, sizeof(float) * M_GLOBAL * N_GLOBAL));
-    checkCudaErrors(cudaMalloc((void**)&D, sizeof(float) * M_GLOBAL * N_GLOBAL));
+    checkCudaErrors(cudaMalloc((void **)&A, sizeof(__nv_bfloat16) * M_GLOBAL * K_GLOBAL));
+    checkCudaErrors(cudaMalloc((void **)&B, sizeof(__nv_bfloat16) * N_GLOBAL * K_GLOBAL));
+    checkCudaErrors(cudaMalloc((void **)&C, sizeof(float) * M_GLOBAL * N_GLOBAL));
+    checkCudaErrors(cudaMalloc((void **)&D, sizeof(float) * M_GLOBAL * N_GLOBAL));

     assert(((unsigned long long)A) % 128 == 0);
     assert(((unsigned long long)B) % 128 == 0);
@@ -714,11 +742,11 @@ int main(int argc, char **argv)
     printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);

     const float alpha = 1.1f;
-    const float beta = 1.2f;
+    const float beta  = 1.2f;

     cudaEvent_t start, stop;

-    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));
     checkCudaErrors(cudaEventRecord(start));
@@ -740,26 +768,30 @@ int main(int argc, char **argv)
     if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_bf16mma_gemm)) {
         printf("Computing using high performance kernel = %d - %s\n", selected_kernel, kernelNames[selected_kernel]);

-        switch (selected_kernel)
-        {
-            case bf16mma_shmem_gemm_async_copy :
-            default:
-                checkCudaErrors(cudaFuncSetAttribute(compute_bf16gemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
-                checkKernelErrors((compute_bf16gemm_async_copy<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
-                break;
-            case bf16mma_shmem_gemm :
-                checkCudaErrors(cudaFuncSetAttribute(compute_bf16gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
-                checkKernelErrors((compute_bf16gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
-                break;
+        switch (selected_kernel) {
+        case bf16mma_shmem_gemm_async_copy:
+        default:
+            checkCudaErrors(cudaFuncSetAttribute(
+                compute_bf16gemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+            checkKernelErrors(
+                (compute_bf16gemm_async_copy<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(
+                    A, B, C, D, alpha, beta)));
+            break;
+        case bf16mma_shmem_gemm:
+            checkCudaErrors(
+                cudaFuncSetAttribute(compute_bf16gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+            checkKernelErrors((compute_bf16gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(
+                A, B, C, D, alpha, beta)));
+            break;
         }
 #if CPU_DEBUG
-        checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost));
 #endif
     }
     else {
         dim3 gridDim;
         dim3 blockDim;
-
+
         // blockDim.x must be a multiple of warpSize
         // 128x4 means we have 16 warps and a block computes a 64x64 output tile
         blockDim.x = 128;
@@ -783,11 +815,7 @@ int main(int argc, char **argv)

     memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL);

-    matMultiplyOnHost(A_h, B_h, result_host,
-                      alpha, beta,
-                      M_GLOBAL, K_GLOBAL,
-                      K_GLOBAL, N_GLOBAL,
-                      M_GLOBAL, N_GLOBAL);
+    matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL);

     for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
         if (fabs(result_hD[i] - result_host[i]) > 0.1f) {
@@ -803,15 +831,15 @@ int main(int argc, char **argv)
     checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

     printf("Time: %f ms\n", milliseconds);
-    printf("TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12);
+    printf("TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12);

     free(A_h);
     free(B_h);
     free(C_h);
-    checkCudaErrors(cudaFree((void*)A));
-    checkCudaErrors(cudaFree((void*)B));
-    checkCudaErrors(cudaFree((void*)C));
-    checkCudaErrors(cudaFree((void*)D));
+    checkCudaErrors(cudaFree((void *)A));
+    checkCudaErrors(cudaFree((void *)B));
+    checkCudaErrors(cudaFree((void *)C));
+    checkCudaErrors(cudaFree((void *)D));

     return 0;
 }
diff --git a/Samples/3_CUDA_Features/binaryPartitionCG/binaryPartitionCG.cu b/Samples/3_CUDA_Features/binaryPartitionCG/binaryPartitionCG.cu
index 83cd2d46..858a9229 100644
--- a/Samples/3_CUDA_Features/binaryPartitionCG/binaryPartitionCG.cu
+++ b/Samples/3_CUDA_Features/binaryPartitionCG/binaryPartitionCG.cu
@@ -43,17 +43,18 @@
  * divergence is inevitable one can use binary_partition group.
  */

-#include
 #include
 #include
 #include
+#include

 namespace cg = cooperative_groups;

-void initOddEvenArr(int *inputArr, unsigned int size) {
-  for (int i = 0; i < size; i++) {
-    inputArr[i] = rand() % 50;
-  }
+void initOddEvenArr(int *inputArr, unsigned int size)
+{
+    for (int i = 0; i < size; i++) {
+        inputArr[i] = rand() % 50;
+    }
 }

 /**
@@ -61,99 +62,97 @@ void initOddEvenArr(int *inputArr, unsigned int size) {
  *
  * Creates cooperative groups and performs odd/even counting & summation.
  */
-__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds,
-                                     int *sumOfOddAndEvens, unsigned int size) {
-  cg::thread_block cta = cg::this_thread_block();
-  cg::grid_group grid = cg::this_grid();
-  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
+__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOddAndEvens, unsigned int size)
+{
+    cg::thread_block cta = cg::this_thread_block();
+    cg::grid_group grid = cg::this_grid();
+    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

-  for (int i = grid.thread_rank(); i < size; i += grid.size()) {
-    int elem = inputArr[i];
-    auto subTile = cg::binary_partition(tile32, elem & 1);
-    if (elem & 1) // Odd numbers group
-    {
-      int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
+    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
+        int elem = inputArr[i];
+        auto subTile = cg::binary_partition(tile32, elem & 1);
+        if (elem & 1) // Odd numbers group
+        {
+            int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());

-      if (subTile.thread_rank() == 0) {
-        // Add number of odds present in this group of Odds.
-        atomicAdd(numOfOdds, subTile.size());
+            if (subTile.thread_rank() == 0) {
+                // Add number of odds present in this group of Odds.
+                atomicAdd(numOfOdds, subTile.size());

-        // Add local reduction of odds present in this group of Odds.
-        atomicAdd(&sumOfOddAndEvens[0], oddGroupSum);
-      }
-    } else // Even numbers group
-    {
-      int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
+                // Add local reduction of odds present in this group of Odds.
+                atomicAdd(&sumOfOddAndEvens[0], oddGroupSum);
+            }
+        }
+        else // Even numbers group
+        {
+            int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());

-      if (subTile.thread_rank() == 0) {
-        // Add local reduction of even present in this group of evens.
-        atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
-      }
+            if (subTile.thread_rank() == 0) {
+                // Add local reduction of evens present in this group of evens.
+                atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
+            }
+        }
+        // reconverge warp so for next loop iteration we ensure convergence of
+        // above diverged threads to perform coalesced loads of inputArr.
+        cg::sync(tile32);
     }
-    // reconverge warp so for next loop iteration we ensure convergence of
-    // above diverged threads to perform coalesced loads of inputArr.
-    cg::sync(tile32);
-  }
 }

 /**
  * Host main routine
  */
-int main(int argc, const char **argv) {
-  int deviceId = findCudaDevice(argc, argv);
-  int *h_inputArr, *d_inputArr;
-  int *h_numOfOdds, *d_numOfOdds;
-  int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems;
-  unsigned int arrSize = 1024 * 100;
+int main(int argc, const char **argv)
+{
+    int deviceId = findCudaDevice(argc, argv);
+    int *h_inputArr, *d_inputArr;
+    int *h_numOfOdds, *d_numOfOdds;
+    int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems;
+    unsigned int arrSize = 1024 * 100;

-  checkCudaErrors(cudaMallocHost(&h_inputArr, sizeof(int) * arrSize));
-  checkCudaErrors(cudaMallocHost(&h_numOfOdds, sizeof(int)));
-  checkCudaErrors(cudaMallocHost(&h_sumOfOddEvenElems, sizeof(int) * 2));
-  initOddEvenArr(h_inputArr, arrSize);
+    checkCudaErrors(cudaMallocHost(&h_inputArr, sizeof(int) * arrSize));
+    checkCudaErrors(cudaMallocHost(&h_numOfOdds, sizeof(int)));
+    checkCudaErrors(cudaMallocHost(&h_sumOfOddEvenElems, sizeof(int) * 2));
+    initOddEvenArr(h_inputArr, arrSize);

-  cudaStream_t stream;
-  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int) * arrSize));
-  checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int)));
-  checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2));
+    cudaStream_t stream;
+    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int) * arrSize));
+    checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int)));
+    checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2));

-  checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int) * arrSize,
-                                  cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream));
-  checkCudaErrors(
-      cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2 * sizeof(int), stream));
+    checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int) * arrSize, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream));
+    checkCudaErrors(cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2 * sizeof(int), stream));

-  // Launch the kernel
-  int threadsPerBlock = 0;
-  int blocksPerGrid = 0;
-  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
-      &blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG, 0, 0));
+    // Launch the kernel
+    int threadsPerBlock = 0;
+    int blocksPerGrid = 0;
+    checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG, 0, 0));

-  printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid,
-         threadsPerBlock);
+    printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid, threadsPerBlock);

-  oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
-      d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);
+    oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);

-  checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int),
-                                  cudaMemcpyDeviceToHost, stream));
-  checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems,
-                                  2 * sizeof(int), cudaMemcpyDeviceToHost,
-                                  stream));
-  checkCudaErrors(cudaStreamSynchronize(stream));
+    checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(
+        cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, 2 * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaStreamSynchronize(stream));

-  printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n",
-         arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0],
-         h_sumOfOddEvenElems[1]);
-  printf("\n...Done.\n\n");
+    printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n",
+           arrSize,
+           h_numOfOdds[0],
+           h_sumOfOddEvenElems[0],
+           h_sumOfOddEvenElems[1]);
+    printf("\n...Done.\n\n");

-  checkCudaErrors(cudaFreeHost(h_inputArr));
-  checkCudaErrors(cudaFreeHost(h_numOfOdds));
-  checkCudaErrors(cudaFreeHost(h_sumOfOddEvenElems));
+    checkCudaErrors(cudaFreeHost(h_inputArr));
+    checkCudaErrors(cudaFreeHost(h_numOfOdds));
+    checkCudaErrors(cudaFreeHost(h_sumOfOddEvenElems));

-  checkCudaErrors(cudaFree(d_inputArr));
-  checkCudaErrors(cudaFree(d_numOfOdds));
-  checkCudaErrors(cudaFree(d_sumOfOddEvenElems));
+    checkCudaErrors(cudaFree(d_inputArr));
+    checkCudaErrors(cudaFree(d_numOfOdds));
+    checkCudaErrors(cudaFree(d_sumOfOddEvenElems));

-  return EXIT_SUCCESS;
+    return EXIT_SUCCESS;
 }
diff --git a/Samples/3_CUDA_Features/bindlessTexture/README.md b/Samples/3_CUDA_Features/bindlessTexture/README.md
index d9820b0b..03677f66 100644
--- a/Samples/3_CUDA_Features/bindlessTexture/README.md
+++ b/Samples/3_CUDA_Features/bindlessTexture/README.md
@@ -32,4 +32,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
diff --git a/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.cpp b/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.cpp
index 4ce67710..670c82fa 100644
--- a/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.cpp
+++ b/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.cpp
@@ -37,14 +37,12 @@
   Look into the bindlessTexture_kernel.cu file for most relevant code.
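
A note on the binaryPartitionCG diff just above: stripped of the formatting noise, the cooperative-groups pattern being reformatted is to split the warp by a runtime predicate with cg::binary_partition, reduce within each side, and let one thread per side issue the atomic. A minimal sketch using the same CUDA 11+ APIs; the kernel name sumByParity is made up for this example, and sums[0]/sums[1] are assumed to collect the odd and even totals:

    #include <cooperative_groups.h>
    #include <cooperative_groups/reduce.h>
    namespace cg = cooperative_groups;

    __global__ void sumByParity(const int *in, int *sums, int n)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n) {
            // Group the threads that passed the bounds check...
            cg::coalesced_group active = cg::coalesced_threads();
            int  v   = in[idx];
            bool odd = v & 1;
            // ...then split them into an odd-side and an even-side group.
            cg::coalesced_group side = cg::binary_partition(active, odd);
            // Reduce within this thread's side only.
            int partial = cg::reduce(side, v, cg::plus<int>());
            if (side.thread_rank() == 0)
                atomicAdd(&sums[odd ? 0 : 1], partial); // one atomic per side per warp
        }
    }

The payoff is the same as in the sample: instead of one atomicAdd per thread, each warp issues at most two, one per partition.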
*/ -#include -#include -#include -#include - -#include - #include +#include +#include +#include +#include +#include #if defined(__APPLE__) || defined(MACOSX) #pragma clang diagnostic ignored "-Wdeprecated-declarations" #include @@ -55,31 +53,31 @@ #include #endif +#include +#include + #include "bindlessTexture.h" -#include -#include - #define MAX_EPSILON_ERROR 5.0f -#define THRESHOLD 0.15f +#define THRESHOLD 0.15f const char *sSDKsample = "CUDA bindlessTexture"; const char *imageFilenames[] = { - "flower.ppm", "person.ppm", "sponge.ppm", + "flower.ppm", + "person.ppm", + "sponge.ppm", }; const cudaExtent atlasSize = make_cudaExtent(4, 4, 0); -const dim3 windowSize(512, 512); -const dim3 windowBlockSize(16, 16, 1); -const dim3 windowGridSize(windowSize.x / windowBlockSize.x, - windowSize.y / windowBlockSize.y); +const dim3 windowSize(512, 512); +const dim3 windowBlockSize(16, 16, 1); +const dim3 windowGridSize(windowSize.x / windowBlockSize.x, windowSize.y / windowBlockSize.y); -float lod = 0.5; // texture mip map level +float lod = 0.5; // texture mip map level -GLuint pbo; // OpenGL pixel buffer object -struct cudaGraphicsResource *cuda_pbo_resource = - NULL; // CUDA Graphics Resource (to transfer PBO) +GLuint pbo; // OpenGL pixel buffer object +struct cudaGraphicsResource *cuda_pbo_resource = NULL; // CUDA Graphics Resource (to transfer PBO) bool animate = true; @@ -88,330 +86,335 @@ StopWatchInterface *timer = NULL; uint *d_output = NULL; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; -extern "C" void initAtlasAndImages(const Image *images, size_t numImages, - cudaExtent atlasSize); +extern "C" void initAtlasAndImages(const Image *images, size_t numImages, cudaExtent atlasSize); extern "C" void deinitAtlasAndImages(); extern "C" void randomizeAtlas(); -extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize, uint *d_output, - uint imageW, uint imageH, float lod); +extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float lod); -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = (int)MAX(1.0f, ifps); - sdkResetTimer(&timer); - } + fpsLimit = (int)MAX(1.0f, ifps); + sdkResetTimer(&timer); + } } // render image using CUDA -void render() { - // map PBO to get CUDA device pointer - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_output, &num_bytes, cuda_pbo_resource)); +void render() +{ + // map PBO to get CUDA device pointer + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; 
+ checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); - // call CUDA kernel, writing results to PBO - renderAtlasImage(windowGridSize, windowBlockSize, d_output, windowSize.x, - windowSize.y, lod); + // call CUDA kernel, writing results to PBO + renderAtlasImage(windowGridSize, windowBlockSize, d_output, windowSize.x, windowSize.y, lod); - getLastCudaError("render_kernel failed"); + getLastCudaError("render_kernel failed"); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); } // display results using OpenGL (called by GLUT) -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - render(); + render(); - // display results - glClear(GL_COLOR_BUFFER_BIT); + // display results + glClear(GL_COLOR_BUFFER_BIT); - // draw image from PBO - glDisable(GL_DEPTH_TEST); - glRasterPos2i(0, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glDrawPixels(windowSize.x, windowSize.y, GL_RGBA, GL_UNSIGNED_BYTE, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // draw image from PBO + glDisable(GL_DEPTH_TEST); + glRasterPos2i(0, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glDrawPixels(windowSize.x, windowSize.y, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - glutSwapBuffers(); - glutReportErrors(); + glutSwapBuffers(); + glutReportErrors(); - sdkStopTimer(&timer); - computeFPS(); + sdkStopTimer(&timer); + computeFPS(); } -void idle() { - if (animate) { - lod += 0.02f; - glutPostRedisplay(); - } +void idle() +{ + if (animate) { + lod += 0.02f; + glutPostRedisplay(); + } } -void keyboard(unsigned char key, int x, int y) { - switch (key) { +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case '=': case '+': - lod += 0.25f; - break; + lod += 0.25f; + break; case '-': - lod -= 0.25f; - break; + lod -= 0.25f; + break; case 'r': - randomizeAtlas(); - break; + randomizeAtlas(); + break; case ' ': - animate = !animate; - lod = 0.0f; - break; + animate = !animate; + lod = 0.0f; + break; default: - break; - } + break; + } - glutPostRedisplay(); + glutPostRedisplay(); } -void reshape(int x, int y) { - glViewport(0, 0, x, y); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } // Global cleanup function // Shared by both GL and non-GL code paths -void cleanup() { - sdkDeleteTimer(&timer); +void cleanup() +{ + sdkDeleteTimer(&timer); - // unregister this buffer object from CUDA C - if (cuda_pbo_resource) { - cudaGraphicsUnregisterResource(cuda_pbo_resource); - glDeleteBuffers(1, &pbo); - } + // unregister this buffer object from CUDA C + if (cuda_pbo_resource) { + cudaGraphicsUnregisterResource(cuda_pbo_resource); + glDeleteBuffers(1, &pbo); + } } -void cleanup_all() { - cleanup(); - deinitAtlasAndImages(); +void cleanup_all() +{ + cleanup(); + deinitAtlasAndImages(); } -void initGLBuffers() { - // create pixel buffer object - glGenBuffers(1, &pbo); - 
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, - windowSize.x * windowSize.y * sizeof(GLubyte) * 4, 0, - GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); +void initGLBuffers() +{ + // create pixel buffer object + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, windowSize.x * windowSize.y * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); } // Load raw data from disk -uchar *loadRawFile(const char *filename, size_t size) { - FILE *fp = fopen(filename, "rb"); +uchar *loadRawFile(const char *filename, size_t size) +{ + FILE *fp = fopen(filename, "rb"); - if (!fp) { - fprintf(stderr, "Error opening file '%s'\n", filename); - return 0; - } - - uchar *data = (uchar *)malloc(size); - size_t read = fread(data, 1, size, fp); - fclose(fp); - - printf("Read '%s', %zu bytes\n", filename, read); - - return data; -} - -void initGL(int *argc, char **argv) { - // initialize GLUT callback functions - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); - glutInitWindowSize(windowSize.x, windowSize.y); - glutCreateWindow(sSDKsample); - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - glutIdleFunc(idle); - - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Required OpenGL extensions are missing."); - exit(EXIT_FAILURE); - } -} - -void runAutoTest(const char *ref_file, char *exec_path) { - size_t windowBytes = windowSize.x * windowSize.y * sizeof(GLubyte) * 4; - - checkCudaErrors(cudaMalloc((void **)&d_output, windowBytes)); - - // render the volumeData - renderAtlasImage(windowGridSize, windowBlockSize, d_output, windowSize.x, - windowSize.y, lod); - - checkCudaErrors(cudaDeviceSynchronize()); - getLastCudaError("render_kernel failed"); - - void *h_output = malloc(windowBytes); - checkCudaErrors( - cudaMemcpy(h_output, d_output, windowBytes, cudaMemcpyDeviceToHost)); - sdkDumpBin(h_output, (unsigned int)windowBytes, "bindlessTexture.bin"); - - bool bTestResult = sdkCompareBin2BinFloat( - "bindlessTexture.bin", sdkFindFilePath(ref_file, exec_path), - windowSize.x * windowSize.y, MAX_EPSILON_ERROR, THRESHOLD, exec_path); - - checkCudaErrors(cudaFree(d_output)); - free(h_output); - deinitAtlasAndImages(); - - sdkStopTimer(&timer); - sdkDeleteTimer(&timer); - - exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); -} - -void loadImageData(const char *exe_path) { - std::vector images; - - for (size_t i = 0; i < sizeof(imageFilenames) / sizeof(imageFilenames[0]); - i++) { - unsigned int imgWidth = 0; - unsigned int imgHeight = 0; - uchar *imgData = NULL; - const char *imgPath = 0; - const char *imgFilename = imageFilenames[i]; - - if (exe_path) { - imgPath = sdkFindFilePath(imgFilename, exe_path); + if (!fp) { + fprintf(stderr, "Error opening file '%s'\n", filename); + return 0; } - if (imgPath == 0) { - printf("Error finding image file '%s'\n", imgFilename); - exit(EXIT_FAILURE); + uchar *data = (uchar *)malloc(size); + size_t read = fread(data, 1, size, fp); + fclose(fp); + + printf("Read '%s', %zu bytes\n", filename, read); + + return data; +} + +void initGL(int *argc, char **argv) +{ + // initialize GLUT callback functions + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(windowSize.x, windowSize.y); + glutCreateWindow(sSDKsample); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutIdleFunc(idle); + + if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions are missing."); + exit(EXIT_FAILURE); + } +} + +void runAutoTest(const char *ref_file, char *exec_path) +{ + size_t windowBytes = windowSize.x * windowSize.y * sizeof(GLubyte) * 4; + + checkCudaErrors(cudaMalloc((void **)&d_output, windowBytes)); + + // render the volumeData + renderAtlasImage(windowGridSize, windowBlockSize, d_output, windowSize.x, windowSize.y, lod); + + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("render_kernel failed"); + + void *h_output = malloc(windowBytes); + checkCudaErrors(cudaMemcpy(h_output, d_output, windowBytes, cudaMemcpyDeviceToHost)); + sdkDumpBin(h_output, (unsigned int)windowBytes, "bindlessTexture.bin"); + + bool bTestResult = sdkCompareBin2BinFloat("bindlessTexture.bin", + sdkFindFilePath(ref_file, exec_path), + windowSize.x * windowSize.y, + MAX_EPSILON_ERROR, + THRESHOLD, + exec_path); + + checkCudaErrors(cudaFree(d_output)); + free(h_output); + deinitAtlasAndImages(); + + sdkStopTimer(&timer); + sdkDeleteTimer(&timer); + + exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); +} + +void loadImageData(const char *exe_path) +{ + std::vector images; + + for (size_t i = 0; i < sizeof(imageFilenames) / sizeof(imageFilenames[0]); i++) { + unsigned int imgWidth = 0; + unsigned int imgHeight = 0; + uchar *imgData = NULL; + const char *imgPath = 0; + const char *imgFilename = imageFilenames[i]; + + if (exe_path) { + imgPath = sdkFindFilePath(imgFilename, exe_path); + } + + if (imgPath == 0) { + printf("Error finding image file '%s'\n", imgFilename); + exit(EXIT_FAILURE); + } + + sdkLoadPPM4(imgPath, (unsigned char **)&imgData, &imgWidth, &imgHeight); + + if (!imgData) { + printf("Error opening file '%s'\n", imgPath); + exit(EXIT_FAILURE); + } + + printf("Loaded '%s', %d x %d pixels\n", imgPath, imgWidth, imgHeight); + + checkHost(imgWidth > 1); + checkHost(imgHeight > 1); + + Image img; + img.size = make_cudaExtent(imgWidth, imgHeight, 0); + img.h_data = imgData; + images.push_back(img); } - sdkLoadPPM4(imgPath, (unsigned char **)&imgData, &imgWidth, &imgHeight); - - if (!imgData) { - printf("Error opening file '%s'\n", imgPath); - exit(EXIT_FAILURE); - } - - printf("Loaded '%s', %d x %d pixels\n", imgPath, imgWidth, imgHeight); - - checkHost(imgWidth > 1); - checkHost(imgHeight > 1); - - Image img; - img.size = make_cudaExtent(imgWidth, imgHeight, 0); - img.h_data = imgData; - images.push_back(img); - } - - initAtlasAndImages(&images[0], images.size(), atlasSize); + initAtlasAndImages(&images[0], images.size(), atlasSize); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - sdkCreateTimer(&timer); +int main(int argc, char **argv) +{ + sdkCreateTimer(&timer); - pArgc = &argc; - pArgv = argv; + pArgc = &argc; + pArgv = argv; - char *ref_file = NULL; + char *ref_file = NULL; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - fpsLimit = frameCheckNumber; - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + fpsLimit = frameCheckNumber; + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + } - srand(15234); + srand(15234); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - if (!ref_file) { - initGL(&argc, argv); + if (!ref_file) { + initGL(&argc, argv); - // OpenGL buffers - initGLBuffers(); - } + // OpenGL buffers + initGLBuffers(); + } - if (!checkCudaCapabilities(3, 0)) { - cleanup(); + if (!checkCudaCapabilities(3, 0)) { + cleanup(); - exit(EXIT_WAIVED); - } + exit(EXIT_WAIVED); + } - loadImageData(argv[0]); + loadImageData(argv[0]); - if (ref_file) { - runAutoTest(ref_file, argv[0]); - } + if (ref_file) { + runAutoTest(ref_file, argv[0]); + } - printf( - "Press space to toggle animation\n" - "Press '+' and '-' to change lod level\n" - "Press 'r' to randomize virtual atlas\n"); + printf("Press space to toggle animation\n" + "Press '+' and '-' to change lod level\n" + "Press 'r' to randomize virtual atlas\n"); #if defined(__APPLE__) || defined(MACOSX) - 
atexit(cleanup_all); + atexit(cleanup_all); #else - glutCloseFunc(cleanup_all); + glutCloseFunc(cleanup_all); #endif - glutMainLoop(); + glutMainLoop(); } diff --git a/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.h b/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.h index 52d5ef55..3106a570 100644 --- a/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.h +++ b/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture.h @@ -29,39 +29,38 @@ #define _BINDLESSTEXTURE_CU_ // includes, cuda -#include #include +#include // CUDA utilities and system includes #include #include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; #pragma pack(push, 4) -struct Image { - void *h_data; - cudaExtent size; - cudaResourceType type; - cudaArray_t dataArray; - cudaMipmappedArray_t mipmapArray; - cudaTextureObject_t textureObject; +struct Image +{ + void *h_data; + cudaExtent size; + cudaResourceType type; + cudaArray_t dataArray; + cudaMipmappedArray_t mipmapArray; + cudaTextureObject_t textureObject; - Image() { memset(this, 0, sizeof(Image)); } + Image() { memset(this, 0, sizeof(Image)); } }; #pragma pack(pop) -inline void _checkHost(bool test, const char *condition, const char *file, - int line, const char *func) { - if (!test) { - fprintf(stderr, "HOST error at %s:%d (%s) \"%s\" \n", file, line, condition, - func); - exit(EXIT_FAILURE); - } +inline void _checkHost(bool test, const char *condition, const char *file, int line, const char *func) +{ + if (!test) { + fprintf(stderr, "HOST error at %s:%d (%s) \"%s\" \n", file, line, condition, func); + exit(EXIT_FAILURE); + } } -#define checkHost(condition) \ - _checkHost(condition, #condition, __FILE__, __LINE__, __FUNCTION__) +#define checkHost(condition) _checkHost(condition, #condition, __FILE__, __LINE__, __FUNCTION__) #endif diff --git a/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture_kernel.cu b/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture_kernel.cu index be163a2c..9f346c55 100644 --- a/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture_kernel.cu +++ b/Samples/3_CUDA_Features/bindlessTexture/bindlessTexture_kernel.cu @@ -41,26 +41,24 @@ #ifndef _BINDLESSTEXTURE_KERNEL_CU_ #define _BINDLESSTEXTURE_KERNEL_CU_ -#include -#include -#include -#include - -#include - #include #include +#include +#include +#include +#include +#include #include "bindlessTexture.h" // set this to just see the mipmap chain of first image -//#define SHOW_MIPMAPS +// #define SHOW_MIPMAPS // local references to resources -Image atlasImage; +Image atlasImage; std::vector contentImages; -float highestLod = 1.0f; +float highestLod = 1.0f; #ifndef MAX #define MAX(a, b) ((a > b) ? 
 a : b)
@@ -68,22 +66,21 @@ float highestLod = 1.0f;

 //////////////////////////////////////////////////////////////////////////

-__host__ __device__ __inline__ uint2 encodeTextureObject(
-    cudaTextureObject_t obj) {
-  return make_uint2((uint)(obj & 0xFFFFFFFF), (uint)(obj >> 32));
+__host__ __device__ __inline__ uint2 encodeTextureObject(cudaTextureObject_t obj)
+{
+    return make_uint2((uint)(obj & 0xFFFFFFFF), (uint)(obj >> 32));
 }

-__host__ __device__ __inline__ cudaTextureObject_t decodeTextureObject(
-    uint2 obj) {
-  return (((cudaTextureObject_t)obj.x) | ((cudaTextureObject_t)obj.y) << 32);
+__host__ __device__ __inline__ cudaTextureObject_t decodeTextureObject(uint2 obj)
+{
+    return (((cudaTextureObject_t)obj.x) | ((cudaTextureObject_t)obj.y) << 32);
 }

-__device__ __inline__ float4 to_float4(uchar4 vec) {
-  return make_float4(vec.x, vec.y, vec.z, vec.w);
-}
+__device__ __inline__ float4 to_float4(uchar4 vec) { return make_float4(vec.x, vec.y, vec.z, vec.w); }

-__device__ __inline__ uchar4 to_uchar4(float4 vec) {
-  return make_uchar4((uchar)vec.x, (uchar)vec.y, (uchar)vec.z, (uchar)vec.w);
+__device__ __inline__ uchar4 to_uchar4(float4 vec)
+{
+    return make_uchar4((uchar)vec.x, (uchar)vec.y, (uchar)vec.z, (uchar)vec.w);
 }

 //////////////////////////////////////////////////////////////////////////
@@ -92,48 +89,46 @@ __device__ __inline__ uchar4 to_uchar4(float4 vec) {

 // the atlas texture stores the 64 bit cudaTextureObjects
 // we use it for "virtual" texturing

-__global__ void d_render(uchar4 *d_output, uint imageW, uint imageH, float lod,
-                         cudaTextureObject_t atlasTexture) {
-  uint x = blockIdx.x * blockDim.x + threadIdx.x;
-  uint y = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void d_render(uchar4 *d_output, uint imageW, uint imageH, float lod, cudaTextureObject_t atlasTexture)
+{
+    uint x = blockIdx.x * blockDim.x + threadIdx.x;
+    uint y = blockIdx.y * blockDim.y + threadIdx.y;

-  float u = x / (float)imageW;
-  float v = y / (float)imageH;
+    float u = x / (float)imageW;
+    float v = y / (float)imageH;

-  if ((x < imageW) && (y < imageH)) {
-    // read from 2D atlas texture and decode texture object
-    uint2 texCoded = tex2D<uint2>(atlasTexture, u, v);
-    cudaTextureObject_t tex = decodeTextureObject(texCoded);
+    if ((x < imageW) && (y < imageH)) {
+        // read from 2D atlas texture and decode texture object
+        uint2 texCoded = tex2D<uint2>(atlasTexture, u, v);
+        cudaTextureObject_t tex = decodeTextureObject(texCoded);

-    // read from cuda texture object, use template to specify what data will be
-    // returned. tex2DLod allows us to pass the lod (mip map level) directly.
-    // There is other functions with CUDA 5, e.g. tex2DGrad, that allow you
-    // to pass derivatives to perform automatic mipmap/anisotropic filtering.
-    float4 color = tex2DLod<float4>(tex, u, 1 - v, lod);
-    // In our sample tex is always valid, but for something like your own
-    // sparse texturing you would need to make sure to handle the zero case.
+        // read from cuda texture object, use template to specify what data will be
+        // returned. tex2DLod allows us to pass the lod (mip map level) directly.
+        // There are other functions with CUDA 5, e.g. tex2DGrad, that allow you
+        // to pass derivatives to perform automatic mipmap/anisotropic filtering.
+        float4 color = tex2DLod<float4>(tex, u, 1 - v, lod);
+        // In our sample tex is always valid, but for something like your own
+        // sparse texturing you would need to make sure to handle the zero case.
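
The encode/decode pair above is the heart of the "bindless" trick: a cudaTextureObject_t is an opaque 64-bit handle, so it can be split into a uint2, stored as a two-component integer texel of the atlas texture, fetched with tex2D<uint2>, and reassembled for the nested tex2DLod<float4> sample. A tiny host-side round-trip check of that packing; pack/unpack mirror the sample's functions, and the 64-bit constant is an arbitrary bit pattern for illustration, not a live handle:

    #include <assert.h>
    #include <cuda_runtime.h>

    typedef unsigned int uint;

    // Same packing as encodeTextureObject(): low 32 bits in .x, high 32 bits in .y.
    static uint2 pack(cudaTextureObject_t obj)
    {
        return make_uint2((uint)(obj & 0xFFFFFFFF), (uint)(obj >> 32));
    }

    // Same reassembly as decodeTextureObject().
    static cudaTextureObject_t unpack(uint2 p)
    {
        return ((cudaTextureObject_t)p.x) | (((cudaTextureObject_t)p.y) << 32);
    }

    int main()
    {
        cudaTextureObject_t obj = 0x0123456789ABCDEFull; // arbitrary bit pattern
        assert(unpack(pack(obj)) == obj);                // lossless 64-bit round trip
        return 0;
    }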
-    // write output color
-    uint i = y * imageW + x;
-    d_output[i] = to_uchar4(color * 255.0);
-  }
+        // write output color
+        uint i = y * imageW + x;
+        d_output[i] = to_uchar4(color * 255.0);
+    }
 }

-extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize,
-                                 uchar4 *d_output, uint imageW, uint imageH,
-                                 float lod) {
-  // psuedo animate lod
-  lod = fmodf(lod, highestLod * 2);
-  lod = highestLod - fabs(lod - highestLod);
+extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize, uchar4 *d_output, uint imageW, uint imageH, float lod)
+{
+    // pseudo-animate lod
+    lod = fmodf(lod, highestLod * 2);
+    lod = highestLod - fabs(lod - highestLod);

 #ifdef SHOW_MIPMAPS
-  lod = 0.0f;
+    lod = 0.0f;
 #endif

-  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, lod,
-                                    atlasImage.textureObject);
+    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, lod, atlasImage.textureObject);

-  checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaGetLastError());
 }

 //////////////////////////////////////////////////////////////////////////
@@ -143,287 +138,278 @@ extern "C" void renderAtlasImage(dim3 gridSize, dim3 blockSize,

 // global binding points anymore. We can directly pass them as function
 // arguments.

-__global__ void d_mipmap(cudaSurfaceObject_t mipOutput,
-                         cudaTextureObject_t mipInput, uint imageW,
-                         uint imageH) {
-  uint x = blockIdx.x * blockDim.x + threadIdx.x;
-  uint y = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void d_mipmap(cudaSurfaceObject_t mipOutput, cudaTextureObject_t mipInput, uint imageW, uint imageH)
+{
+    uint x = blockIdx.x * blockDim.x + threadIdx.x;
+    uint y = blockIdx.y * blockDim.y + threadIdx.y;

-  float px = 1.0 / float(imageW);
-  float py = 1.0 / float(imageH);
+    float px = 1.0 / float(imageW);
+    float py = 1.0 / float(imageH);

-  if ((x < imageW) && (y < imageH)) {
-    // take the average of 4 samples
+    if ((x < imageW) && (y < imageH)) {
+        // take the average of 4 samples

-    // we are using the normalized access to make sure non-power-of-two textures
-    // behave well when downsized.
-    float4 color = (tex2D<float4>(mipInput, (x + 0) * px, (y + 0) * py)) +
-                   (tex2D<float4>(mipInput, (x + 1) * px, (y + 0) * py)) +
-                   (tex2D<float4>(mipInput, (x + 1) * px, (y + 1) * py)) +
-                   (tex2D<float4>(mipInput, (x + 0) * px, (y + 1) * py));
+        // we are using the normalized access to make sure non-power-of-two textures
+        // behave well when downsized.
+        float4 color = (tex2D<float4>(mipInput, (x + 0) * px, (y + 0) * py)) +
+                       (tex2D<float4>(mipInput, (x + 1) * px, (y + 0) * py)) +
+                       (tex2D<float4>(mipInput, (x + 1) * px, (y + 1) * py)) +
+                       (tex2D<float4>(mipInput, (x + 0) * px, (y + 1) * py));

-    color /= 4.0;
-    color *= 255.0;
-    color = fminf(color, make_float4(255.0));
+        color /= 4.0;
+        color *= 255.0;
+        color = fminf(color, make_float4(255.0));

-    surf2Dwrite(to_uchar4(color), mipOutput, x * sizeof(uchar4), y);
-  }
+        surf2Dwrite(to_uchar4(color), mipOutput, x * sizeof(uchar4), y);
+    }
 }

-void generateMipMaps(cudaMipmappedArray_t mipmapArray, cudaExtent size) {
-  size_t width = size.width;
-  size_t height = size.height;
+void generateMipMaps(cudaMipmappedArray_t mipmapArray, cudaExtent size)
+{
+    size_t width = size.width;
+    size_t height = size.height;

 #ifdef SHOW_MIPMAPS
-  cudaArray_t levelFirst;
-  checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFirst, mipmapArray, 0));
+    cudaArray_t levelFirst;
+    checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFirst, mipmapArray, 0));
 #endif

-  uint level = 0;
+    uint level = 0;

-  while (width != 1 || height != 1) {
-    width /= 2;
-    width = MAX((size_t)1, width);
-    height /= 2;
-    height = MAX((size_t)1, height);
+    while (width != 1 || height != 1) {
+        width /= 2;
+        width = MAX((size_t)1, width);
+        height /= 2;
+        height = MAX((size_t)1, height);

-    cudaArray_t levelFrom;
-    checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFrom, mipmapArray, level));
-    cudaArray_t levelTo;
-    checkCudaErrors(
-        cudaGetMipmappedArrayLevel(&levelTo, mipmapArray, level + 1));
+        cudaArray_t levelFrom;
+        checkCudaErrors(cudaGetMipmappedArrayLevel(&levelFrom, mipmapArray, level));
+        cudaArray_t levelTo;
+        checkCudaErrors(cudaGetMipmappedArrayLevel(&levelTo, mipmapArray, level + 1));

-    cudaExtent levelToSize;
-    checkCudaErrors(cudaArrayGetInfo(NULL, &levelToSize, NULL, levelTo));
-    checkHost(levelToSize.width == width);
-    checkHost(levelToSize.height == height);
-    checkHost(levelToSize.depth == 0);
+        cudaExtent levelToSize;
+        checkCudaErrors(cudaArrayGetInfo(NULL, &levelToSize, NULL, levelTo));
+        checkHost(levelToSize.width == width);
+        checkHost(levelToSize.height == height);
+        checkHost(levelToSize.depth == 0);

-    // generate texture object for reading
-    cudaTextureObject_t texInput;
-    cudaResourceDesc texRes;
-    memset(&texRes, 0, sizeof(cudaResourceDesc));
+        // generate texture object for reading
+        cudaTextureObject_t texInput;
+        cudaResourceDesc texRes;
+        memset(&texRes, 0, sizeof(cudaResourceDesc));

-    texRes.resType = cudaResourceTypeArray;
-    texRes.res.array.array = levelFrom;
+        texRes.resType = cudaResourceTypeArray;
+        texRes.res.array.array = levelFrom;

-    cudaTextureDesc texDescr;
-    memset(&texDescr, 0, sizeof(cudaTextureDesc));
+        cudaTextureDesc texDescr;
+        memset(&texDescr, 0, sizeof(cudaTextureDesc));

-    texDescr.normalizedCoords = 1;
-    texDescr.filterMode = cudaFilterModeLinear;
+        texDescr.normalizedCoords = 1;
+        texDescr.filterMode = cudaFilterModeLinear;

-    texDescr.addressMode[0] = cudaAddressModeClamp;
-    texDescr.addressMode[1] = cudaAddressModeClamp;
-    texDescr.addressMode[2] = cudaAddressModeClamp;
+        texDescr.addressMode[0] = cudaAddressModeClamp;
+        texDescr.addressMode[1] = cudaAddressModeClamp;
+        texDescr.addressMode[2] = cudaAddressModeClamp;

-    texDescr.readMode = cudaReadModeNormalizedFloat;
+        texDescr.readMode = cudaReadModeNormalizedFloat;

-    checkCudaErrors(
-        cudaCreateTextureObject(&texInput, &texRes, &texDescr, NULL));
+        checkCudaErrors(cudaCreateTextureObject(&texInput, &texRes, &texDescr, NULL));

-    // generate surface object for writing
+ // generate surface object for writing - cudaSurfaceObject_t surfOutput; - cudaResourceDesc surfRes; - memset(&surfRes, 0, sizeof(cudaResourceDesc)); - surfRes.resType = cudaResourceTypeArray; - surfRes.res.array.array = levelTo; + cudaSurfaceObject_t surfOutput; + cudaResourceDesc surfRes; + memset(&surfRes, 0, sizeof(cudaResourceDesc)); + surfRes.resType = cudaResourceTypeArray; + surfRes.res.array.array = levelTo; - checkCudaErrors(cudaCreateSurfaceObject(&surfOutput, &surfRes)); + checkCudaErrors(cudaCreateSurfaceObject(&surfOutput, &surfRes)); - // run mipmap kernel - dim3 blockSize(16, 16, 1); - dim3 gridSize(((uint)width + blockSize.x - 1) / blockSize.x, - ((uint)height + blockSize.y - 1) / blockSize.y, 1); + // run mipmap kernel + dim3 blockSize(16, 16, 1); + dim3 gridSize(((uint)width + blockSize.x - 1) / blockSize.x, ((uint)height + blockSize.y - 1) / blockSize.y, 1); - d_mipmap<<>>(surfOutput, texInput, (uint)width, - (uint)height); + d_mipmap<<>>(surfOutput, texInput, (uint)width, (uint)height); - checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaDestroySurfaceObject(surfOutput)); + checkCudaErrors(cudaDestroySurfaceObject(surfOutput)); - checkCudaErrors(cudaDestroyTextureObject(texInput)); + checkCudaErrors(cudaDestroyTextureObject(texInput)); #ifdef SHOW_MIPMAPS - // we blit the current mipmap back into first level - cudaMemcpy3DParms copyParams = {0}; - copyParams.dstArray = levelFirst; - copyParams.srcArray = levelTo; - copyParams.extent = make_cudaExtent(width, height, 1); - copyParams.kind = cudaMemcpyDeviceToDevice; - checkCudaErrors(cudaMemcpy3D(©Params)); + // we blit the current mipmap back into first level + cudaMemcpy3DParms copyParams = {0}; + copyParams.dstArray = levelFirst; + copyParams.srcArray = levelTo; + copyParams.extent = make_cudaExtent(width, height, 1); + copyParams.kind = cudaMemcpyDeviceToDevice; + checkCudaErrors(cudaMemcpy3D(©Params)); #endif - level++; - } + level++; + } } -uint getMipMapLevels(cudaExtent size) { - size_t sz = MAX(MAX(size.width, size.height), size.depth); +uint getMipMapLevels(cudaExtent size) +{ + size_t sz = MAX(MAX(size.width, size.height), size.depth); - uint levels = 0; + uint levels = 0; - while (sz) { - sz /= 2; - levels++; - } + while (sz) { + sz /= 2; + levels++; + } - return levels; + return levels; } ////////////////////////////////////////////////////////////////////////// // Initalization -extern "C" void randomizeAtlas() { - uint2 *h_data = (uint2 *)atlasImage.h_data; +extern "C" void randomizeAtlas() +{ + uint2 *h_data = (uint2 *)atlasImage.h_data; - // assign random texture object handles to our atlas image tiles - for (size_t i = 0; i < atlasImage.size.width * atlasImage.size.height; i++) { + // assign random texture object handles to our atlas image tiles + for (size_t i = 0; i < atlasImage.size.width * atlasImage.size.height; i++) { #ifdef SHOW_MIPMAPS - h_data[i] = encodeTextureObject(contentImages[0].textureObject); + h_data[i] = encodeTextureObject(contentImages[0].textureObject); #else - h_data[i] = encodeTextureObject( - contentImages[rand() % contentImages.size()].textureObject); + h_data[i] = encodeTextureObject(contentImages[rand() % contentImages.size()].textureObject); #endif - } + } - // copy data to atlas array - cudaMemcpy3DParms copyParams = {0}; - copyParams.srcPtr = make_cudaPitchedPtr( - atlasImage.h_data, atlasImage.size.width * sizeof(uint2), - 
atlasImage.size.width, atlasImage.size.height); - copyParams.dstArray = atlasImage.dataArray; - copyParams.extent = atlasImage.size; - copyParams.extent.depth = 1; - copyParams.kind = cudaMemcpyHostToDevice; - checkCudaErrors(cudaMemcpy3D(©Params)); + // copy data to atlas array + cudaMemcpy3DParms copyParams = {0}; + copyParams.srcPtr = make_cudaPitchedPtr( + atlasImage.h_data, atlasImage.size.width * sizeof(uint2), atlasImage.size.width, atlasImage.size.height); + copyParams.dstArray = atlasImage.dataArray; + copyParams.extent = atlasImage.size; + copyParams.extent.depth = 1; + copyParams.kind = cudaMemcpyHostToDevice; + checkCudaErrors(cudaMemcpy3D(©Params)); }; -extern "C" void deinitAtlasAndImages() { - for (size_t i = 0; i < contentImages.size(); i++) { - Image &image = contentImages[i]; +extern "C" void deinitAtlasAndImages() +{ + for (size_t i = 0; i < contentImages.size(); i++) { + Image &image = contentImages[i]; - if (image.h_data) { - free(image.h_data); + if (image.h_data) { + free(image.h_data); + } + + if (image.textureObject) { + checkCudaErrors(cudaDestroyTextureObject(image.textureObject)); + } + + if (image.mipmapArray) { + checkCudaErrors(cudaFreeMipmappedArray(image.mipmapArray)); + } } - if (image.textureObject) { - checkCudaErrors(cudaDestroyTextureObject(image.textureObject)); + if (atlasImage.h_data) { + free(atlasImage.h_data); } - if (image.mipmapArray) { - checkCudaErrors(cudaFreeMipmappedArray(image.mipmapArray)); + if (atlasImage.textureObject) { + checkCudaErrors(cudaDestroyTextureObject(atlasImage.textureObject)); } - } - if (atlasImage.h_data) { - free(atlasImage.h_data); - } - - if (atlasImage.textureObject) { - checkCudaErrors(cudaDestroyTextureObject(atlasImage.textureObject)); - } - - if (atlasImage.dataArray) { - checkCudaErrors(cudaFreeArray(atlasImage.dataArray)); - } + if (atlasImage.dataArray) { + checkCudaErrors(cudaFreeArray(atlasImage.dataArray)); + } } -extern "C" void initAtlasAndImages(const Image *images, size_t numImages, - cudaExtent atlasSize) { - // create individual textures - contentImages.resize(numImages); +extern "C" void initAtlasAndImages(const Image *images, size_t numImages, cudaExtent atlasSize) +{ + // create individual textures + contentImages.resize(numImages); - for (size_t i = 0; i < numImages; i++) { - Image &image = contentImages[i]; - image.size = images[i].size; - image.size.depth = 0; - image.type = cudaResourceTypeMipmappedArray; + for (size_t i = 0; i < numImages; i++) { + Image &image = contentImages[i]; + image.size = images[i].size; + image.size.depth = 0; + image.type = cudaResourceTypeMipmappedArray; - // how many mipmaps we need - uint levels = getMipMapLevels(image.size); - highestLod = MAX(highestLod, (float)levels - 1); + // how many mipmaps we need + uint levels = getMipMapLevels(image.size); + highestLod = MAX(highestLod, (float)levels - 1); - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - checkCudaErrors(cudaMallocMipmappedArray(&image.mipmapArray, &desc, - image.size, levels)); + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + checkCudaErrors(cudaMallocMipmappedArray(&image.mipmapArray, &desc, image.size, levels)); - // upload level 0 - cudaArray_t level0; - checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, image.mipmapArray, 0)); + // upload level 0 + cudaArray_t level0; + checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, image.mipmapArray, 0)); - cudaMemcpy3DParms copyParams = {0}; - copyParams.srcPtr = - make_cudaPitchedPtr(images[i].h_data, image.size.width * sizeof(uchar4), - 
image.size.width, image.size.height); - copyParams.dstArray = level0; - copyParams.extent = image.size; - copyParams.extent.depth = 1; - copyParams.kind = cudaMemcpyHostToDevice; - checkCudaErrors(cudaMemcpy3D(©Params)); + cudaMemcpy3DParms copyParams = {0}; + copyParams.srcPtr = make_cudaPitchedPtr( + images[i].h_data, image.size.width * sizeof(uchar4), image.size.width, image.size.height); + copyParams.dstArray = level0; + copyParams.extent = image.size; + copyParams.extent.depth = 1; + copyParams.kind = cudaMemcpyHostToDevice; + checkCudaErrors(cudaMemcpy3D(©Params)); - // compute rest of mipmaps based on level 0 - generateMipMaps(image.mipmapArray, image.size); + // compute rest of mipmaps based on level 0 + generateMipMaps(image.mipmapArray, image.size); - // generate bindless texture object + // generate bindless texture object - cudaResourceDesc resDescr; - memset(&resDescr, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc resDescr; + memset(&resDescr, 0, sizeof(cudaResourceDesc)); - resDescr.resType = cudaResourceTypeMipmappedArray; - resDescr.res.mipmap.mipmap = image.mipmapArray; + resDescr.resType = cudaResourceTypeMipmappedArray; + resDescr.res.mipmap.mipmap = image.mipmapArray; + + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + + texDescr.normalizedCoords = 1; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.mipmapFilterMode = cudaFilterModeLinear; + + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.addressMode[2] = cudaAddressModeClamp; + + texDescr.maxMipmapLevelClamp = float(levels - 1); + + texDescr.readMode = cudaReadModeNormalizedFloat; + + checkCudaErrors(cudaCreateTextureObject(&image.textureObject, &resDescr, &texDescr, NULL)); + } + + // create atlas array + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + checkCudaErrors(cudaMallocArray(&atlasImage.dataArray, &channelDesc, atlasSize.width, atlasSize.height)); + atlasImage.h_data = malloc(atlasSize.width * atlasSize.height * sizeof(uint2)); + atlasImage.type = cudaResourceTypeArray; + atlasImage.size = atlasSize; + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = atlasImage.dataArray; cudaTextureDesc texDescr; memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = 1; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.mipmapFilterMode = cudaFilterModeLinear; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeElementType; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.addressMode[2] = cudaAddressModeClamp; + checkCudaErrors(cudaCreateTextureObject(&atlasImage.textureObject, &texRes, &texDescr, NULL)); - texDescr.maxMipmapLevelClamp = float(levels - 1); - - texDescr.readMode = cudaReadModeNormalizedFloat; - - checkCudaErrors(cudaCreateTextureObject(&image.textureObject, &resDescr, - &texDescr, NULL)); - } - - // create atlas array - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - checkCudaErrors(cudaMallocArray(&atlasImage.dataArray, &channelDesc, - atlasSize.width, atlasSize.height)); - atlasImage.h_data = - malloc(atlasSize.width * atlasSize.height * sizeof(uint2)); - atlasImage.type = 
cudaResourceTypeArray; - atlasImage.size = atlasSize; - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = atlasImage.dataArray; - - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeElementType; - - checkCudaErrors(cudaCreateTextureObject(&atlasImage.textureObject, &texRes, - &texDescr, NULL)); - - randomizeAtlas(); + randomizeAtlas(); } -#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ +#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ diff --git a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpAdvancedQuicksort.cu b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpAdvancedQuicksort.cu index 6f47f0a3..bb5ac5a5 100644 --- a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpAdvancedQuicksort.cu +++ b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpAdvancedQuicksort.cu @@ -40,23 +40,25 @@ // and when. // //////////////////////////////////////////////////////////////////////////////// -#include -#include #include +#include +#include namespace cg = cooperative_groups; #include #include + #include "cdpQuicksort.h" //////////////////////////////////////////////////////////////////////////////// // Inline PTX call to return index of highest non-zero bit in a word //////////////////////////////////////////////////////////////////////////////// -static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) { - unsigned int ret; - asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word)); - return ret; +static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) +{ + unsigned int ret; + asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word)); + return ret; } //////////////////////////////////////////////////////////////////////////////// @@ -71,25 +73,25 @@ static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) { // to complete. // //////////////////////////////////////////////////////////////////////////////// -template -static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf) { - // Wait for there to be space in the ring buffer. We'll retry only a fixed - // number of times and then fail, to avoid an out-of-memory deadlock. - unsigned int loop = 10000; +template static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf) +{ + // Wait for there to be space in the ring buffer. We'll retry only a fixed + // number of times and then fail, to avoid an out-of-memory deadlock. + unsigned int loop = 10000; - while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && - (loop-- > 0)) - ; + while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (loop-- > 0)) + ; - if (loop == 0) return NULL; + if (loop == 0) + return NULL; - // Note that the element includes a little index book-keeping, for freeing - // later. - unsigned int index = atomicAdd((unsigned int *)&ringbuf->head, 1); - T *ret = (T *)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1)); - ret->index = index; + // Note that the element includes a little index book-keeping, for freeing + // later. 
+ unsigned int index = atomicAdd((unsigned int *)&ringbuf->head, 1); + T *ret = (T *)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1)); + ret->index = index; - return ret; + return ret; } //////////////////////////////////////////////////////////////////////////////// @@ -101,17 +103,19 @@ static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf) { // space is now available. // //////////////////////////////////////////////////////////////////////////////// -template -static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) { - unsigned int index = data->index; // Non-wrapped index to free - unsigned int count = atomicAdd((unsigned int *)&(ringbuf->count), 1) + 1; - unsigned int max = atomicMax((unsigned int *)&(ringbuf->max), index + 1); +template static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) +{ + unsigned int index = data->index; // Non-wrapped index to free + unsigned int count = atomicAdd((unsigned int *)&(ringbuf->count), 1) + 1; + unsigned int max = atomicMax((unsigned int *)&(ringbuf->max), index + 1); - // Update the tail if need be. Note we update "max" to be the new value in - // ringbuf->max - if (max < (index + 1)) max = index + 1; + // Update the tail if need be. Note we update "max" to be the new value in + // ringbuf->max + if (max < (index + 1)) + max = index + 1; - if (max == count) atomicMax((unsigned int *)&(ringbuf->tail), count); + if (max == count) + atomicMax((unsigned int *)&(ringbuf->tail), count); } //////////////////////////////////////////////////////////////////////////////// @@ -133,201 +137,200 @@ static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) { // and cover the instruction overhead. // //////////////////////////////////////////////////////////////////////////////// -__global__ void qsort_warp(unsigned *indata, unsigned *outdata, - unsigned int offset, unsigned int len, +__global__ void qsort_warp(unsigned *indata, + unsigned *outdata, + unsigned int offset, + unsigned int len, qsortAtomicData *atomicData, - qsortRingbuf *atomicDataStack, - unsigned int source_is_indata, unsigned int depth) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // Find my data offset, based on warp ID - unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT); - // unsigned int warp_id = threadIdx.x >> 5; // Used for debug only - unsigned int lane_id = threadIdx.x & (warpSize - 1); - - // Exit if I'm outside the range of sort to be done - if (thread_id >= len) return; - - // - // First part of the algorithm. Each warp counts the number of elements that - // are - // greater/less than the pivot. - // - // When a warp knows its count, it updates an atomic counter. - // - - // Read in the data and the pivot. Arbitrary pivot selection for now. - unsigned pivot = indata[offset + len / 2]; - unsigned data = indata[offset + thread_id]; - - // Count how many are <= and how many are > pivot. - // If all are <= pivot then we adjust the comparison - // because otherwise the sort will move nothing and - // we'll iterate forever. 
- cg::coalesced_group active = cg::coalesced_threads(); - unsigned int greater = (data > pivot); - unsigned int gt_mask = active.ballot(greater); - - if (gt_mask == 0) { - greater = (data >= pivot); - gt_mask = active.ballot(greater); // Must re-ballot for adjusted comparator - } - - unsigned int lt_mask = active.ballot(!greater); - unsigned int gt_count = __popc(gt_mask); - unsigned int lt_count = __popc(lt_mask); - - // Atomically adjust the lt_ and gt_offsets by this amount. Only one thread - // need do this. Share the result using shfl - unsigned int lt_offset, gt_offset; - - if (lane_id == 0) { - if (lt_count > 0) - lt_offset = atomicAdd((unsigned int *)&atomicData->lt_offset, lt_count); - - if (gt_count > 0) - gt_offset = - len - (atomicAdd((unsigned int *)&atomicData->gt_offset, gt_count) + - gt_count); - } - - lt_offset = - active.shfl((int)lt_offset, 0); // Everyone pulls the offsets from lane 0 - gt_offset = active.shfl((int)gt_offset, 0); - - // Now compute my own personal offset within this. I need to know how many - // threads with a lane ID less than mine are going to write to the same buffer - // as me. We can use popc to implement a single-operation warp scan in this - // case. - unsigned lane_mask_lt; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); - unsigned int my_mask = greater ? gt_mask : lt_mask; - unsigned int my_offset = __popc(my_mask & lane_mask_lt); - - // Move data. - my_offset += greater ? gt_offset : lt_offset; - outdata[offset + my_offset] = data; - - // Count up if we're the last warp in. If so, then Kepler will launch the next - // set of sorts directly from here. - if (lane_id == 0) { - // Count "elements written". If I wrote the last one, then trigger the next - // qsorts - unsigned int mycount = lt_count + gt_count; - - if (atomicAdd((unsigned int *)&atomicData->sorted_count, mycount) + - mycount == - len) { - // We're the last warp to do any sorting. Therefore it's up to us to - // launch the next stage. - unsigned int lt_len = atomicData->lt_offset; - unsigned int gt_len = atomicData->gt_offset; - - cudaStream_t lstream, rstream; - cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking); - cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking); - - // Begin by freeing our atomicData storage. It's better for the ringbuffer - // algorithm - // if we free when we're done, rather than re-using (makes for less - // fragmentation). - ringbufFree(atomicDataStack, atomicData); - - // Exceptional case: if "lt_len" is zero, then all values in the batch - // are equal. We are then done (may need to copy into correct buffer, - // though) - if (lt_len == 0) { - if (source_is_indata) - cudaMemcpyAsync(indata + offset, outdata + offset, - gt_len * sizeof(unsigned), cudaMemcpyDeviceToDevice, - lstream); + qsortRingbuf *atomicDataStack, + unsigned int source_is_indata, + unsigned int depth) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // Find my data offset, based on warp ID + unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT); + // unsigned int warp_id = threadIdx.x >> 5; // Used for debug only + unsigned int lane_id = threadIdx.x & (warpSize - 1); + // Exit if I'm outside the range of sort to be done + if (thread_id >= len) return; - } - // Start with lower half first - if (lt_len > BITONICSORT_LEN) { - // If we've exceeded maximum depth, fall through to backup - // big_bitonicsort - if (depth >= QSORT_MAXDEPTH) { - // The final bitonic stage sorts in-place in "outdata". 
We therefore - // re-use "indata" as the out-of-range tracking buffer. For (2^n)+1 - // elements we need (2^(n+1)) bytes of oor buffer. The backup qsort - // buffer is at least this large when sizeof(QTYPE) >= 2. - big_bitonicsort<<<1, BITONICSORT_LEN, 0, lstream>>>( - outdata, source_is_indata ? indata : outdata, indata, offset, - lt_len); - } else { - // Launch another quicksort. We need to allocate more storage for the - // atomic data. - if ((atomicData = ringbufAlloc(atomicDataStack)) == - NULL) - printf("Stack-allocation error. Failing left child launch.\n"); - else { - atomicData->lt_offset = atomicData->gt_offset = - atomicData->sorted_count = 0; - unsigned int numblocks = - (unsigned int)(lt_len + (QSORT_BLOCKSIZE - 1)) >> - QSORT_BLOCKSIZE_SHIFT; - qsort_warp<<>>( - outdata, indata, offset, lt_len, atomicData, atomicDataStack, - !source_is_indata, depth + 1); - } - } - } else if (lt_len > 1) { - // Final stage uses a bitonic sort instead. It's important to - // make sure the final stage ends up in the correct (original) buffer. - // We launch the smallest power-of-2 number of threads that we can. - unsigned int bitonic_len = 1 << (__qsflo(lt_len - 1U) + 1); - bitonicsort<<<1, bitonic_len, 0, lstream>>>( - outdata, source_is_indata ? indata : outdata, offset, lt_len); - } - // Finally, if we sorted just one single element, we must still make - // sure that it winds up in the correct place. - else if (source_is_indata && (lt_len == 1)) - indata[offset] = outdata[offset]; + // + // First part of the algorithm. Each warp counts the number of elements that + // are + // greater/less than the pivot. + // + // When a warp knows its count, it updates an atomic counter. + // - if (cudaPeekAtLastError() != cudaSuccess) - printf("Left-side launch fail: %s\n", - cudaGetErrorString(cudaGetLastError())); + // Read in the data and the pivot. Arbitrary pivot selection for now. + unsigned pivot = indata[offset + len / 2]; + unsigned data = indata[offset + thread_id]; - // Now the upper half. - if (gt_len > BITONICSORT_LEN) { - // If we've exceeded maximum depth, fall through to backup - // big_bitonicsort - if (depth >= QSORT_MAXDEPTH) - big_bitonicsort<<<1, BITONICSORT_LEN, 0, rstream>>>( - outdata, source_is_indata ? indata : outdata, indata, - offset + lt_len, gt_len); - else { - // Allocate new atomic storage for this launch - if ((atomicData = ringbufAlloc(atomicDataStack)) == - NULL) - printf("Stack allocation error! Failing right-side launch.\n"); - else { - atomicData->lt_offset = atomicData->gt_offset = - atomicData->sorted_count = 0; - unsigned int numblocks = - (unsigned int)(gt_len + (QSORT_BLOCKSIZE - 1)) >> - QSORT_BLOCKSIZE_SHIFT; - qsort_warp<<>>( - outdata, indata, offset + lt_len, gt_len, atomicData, - atomicDataStack, !source_is_indata, depth + 1); - } - } - } else if (gt_len > 1) { - unsigned int bitonic_len = 1 << (__qsflo(gt_len - 1U) + 1); - bitonicsort<<<1, bitonic_len, 0, rstream>>>( - outdata, source_is_indata ? indata : outdata, offset + lt_len, - gt_len); - } else if (source_is_indata && (gt_len == 1)) - indata[offset + lt_len] = outdata[offset + lt_len]; + // Count how many are <= and how many are > pivot. + // If all are <= pivot then we adjust the comparison + // because otherwise the sort will move nothing and + // we'll iterate forever. 
+ cg::coalesced_group active = cg::coalesced_threads(); + unsigned int greater = (data > pivot); + unsigned int gt_mask = active.ballot(greater); - if (cudaPeekAtLastError() != cudaSuccess) - printf("Right-side launch fail: %s\n", - cudaGetErrorString(cudaGetLastError())); + if (gt_mask == 0) { + greater = (data >= pivot); + gt_mask = active.ballot(greater); // Must re-ballot for adjusted comparator + } + + unsigned int lt_mask = active.ballot(!greater); + unsigned int gt_count = __popc(gt_mask); + unsigned int lt_count = __popc(lt_mask); + + // Atomically adjust the lt_ and gt_offsets by this amount. Only one thread + // need do this. Share the result using shfl + unsigned int lt_offset, gt_offset; + + if (lane_id == 0) { + if (lt_count > 0) + lt_offset = atomicAdd((unsigned int *)&atomicData->lt_offset, lt_count); + + if (gt_count > 0) + gt_offset = len - (atomicAdd((unsigned int *)&atomicData->gt_offset, gt_count) + gt_count); + } + + lt_offset = active.shfl((int)lt_offset, 0); // Everyone pulls the offsets from lane 0 + gt_offset = active.shfl((int)gt_offset, 0); + + // Now compute my own personal offset within this. I need to know how many + // threads with a lane ID less than mine are going to write to the same buffer + // as me. We can use popc to implement a single-operation warp scan in this + // case. + unsigned lane_mask_lt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); + unsigned int my_mask = greater ? gt_mask : lt_mask; + unsigned int my_offset = __popc(my_mask & lane_mask_lt); + + // Move data. + my_offset += greater ? gt_offset : lt_offset; + outdata[offset + my_offset] = data; + + // Count up if we're the last warp in. If so, then Kepler will launch the next + // set of sorts directly from here. + if (lane_id == 0) { + // Count "elements written". If I wrote the last one, then trigger the next + // qsorts + unsigned int mycount = lt_count + gt_count; + + if (atomicAdd((unsigned int *)&atomicData->sorted_count, mycount) + mycount == len) { + // We're the last warp to do any sorting. Therefore it's up to us to + // launch the next stage. + unsigned int lt_len = atomicData->lt_offset; + unsigned int gt_len = atomicData->gt_offset; + + cudaStream_t lstream, rstream; + cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking); + cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking); + + // Begin by freeing our atomicData storage. It's better for the ringbuffer + // algorithm + // if we free when we're done, rather than re-using (makes for less + // fragmentation). + ringbufFree(atomicDataStack, atomicData); + + // Exceptional case: if "lt_len" is zero, then all values in the batch + // are equal. We are then done (may need to copy into correct buffer, + // though) + if (lt_len == 0) { + if (source_is_indata) + cudaMemcpyAsync(indata + offset, + outdata + offset, + gt_len * sizeof(unsigned), + cudaMemcpyDeviceToDevice, + lstream); + + return; + } + + // Start with lower half first + if (lt_len > BITONICSORT_LEN) { + // If we've exceeded maximum depth, fall through to backup + // big_bitonicsort + if (depth >= QSORT_MAXDEPTH) { + // The final bitonic stage sorts in-place in "outdata". We therefore + // re-use "indata" as the out-of-range tracking buffer. For (2^n)+1 + // elements we need (2^(n+1)) bytes of oor buffer. The backup qsort + // buffer is at least this large when sizeof(QTYPE) >= 2. + big_bitonicsort<<<1, BITONICSORT_LEN, 0, lstream>>>( + outdata, source_is_indata ? indata : outdata, indata, offset, lt_len); + } + else { + // Launch another quicksort. 
We need to allocate more storage for the + // atomic data. + if ((atomicData = ringbufAlloc(atomicDataStack)) == NULL) + printf("Stack-allocation error. Failing left child launch.\n"); + else { + atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0; + unsigned int numblocks = + (unsigned int)(lt_len + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT; + qsort_warp<<>>( + outdata, indata, offset, lt_len, atomicData, atomicDataStack, !source_is_indata, depth + 1); + } + } + } + else if (lt_len > 1) { + // Final stage uses a bitonic sort instead. It's important to + // make sure the final stage ends up in the correct (original) buffer. + // We launch the smallest power-of-2 number of threads that we can. + unsigned int bitonic_len = 1 << (__qsflo(lt_len - 1U) + 1); + bitonicsort<<<1, bitonic_len, 0, lstream>>>( + outdata, source_is_indata ? indata : outdata, offset, lt_len); + } + // Finally, if we sorted just one single element, we must still make + // sure that it winds up in the correct place. + else if (source_is_indata && (lt_len == 1)) + indata[offset] = outdata[offset]; + + if (cudaPeekAtLastError() != cudaSuccess) + printf("Left-side launch fail: %s\n", cudaGetErrorString(cudaGetLastError())); + + // Now the upper half. + if (gt_len > BITONICSORT_LEN) { + // If we've exceeded maximum depth, fall through to backup + // big_bitonicsort + if (depth >= QSORT_MAXDEPTH) + big_bitonicsort<<<1, BITONICSORT_LEN, 0, rstream>>>( + outdata, source_is_indata ? indata : outdata, indata, offset + lt_len, gt_len); + else { + // Allocate new atomic storage for this launch + if ((atomicData = ringbufAlloc(atomicDataStack)) == NULL) + printf("Stack allocation error! Failing right-side launch.\n"); + else { + atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0; + unsigned int numblocks = + (unsigned int)(gt_len + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT; + qsort_warp<<>>(outdata, + indata, + offset + lt_len, + gt_len, + atomicData, + atomicDataStack, + !source_is_indata, + depth + 1); + } + } + } + else if (gt_len > 1) { + unsigned int bitonic_len = 1 << (__qsflo(gt_len - 1U) + 1); + bitonicsort<<<1, bitonic_len, 0, rstream>>>( + outdata, source_is_indata ? indata : outdata, offset + lt_len, gt_len); + } + else if (source_is_indata && (gt_len == 1)) + indata[offset + lt_len] = outdata[offset + lt_len]; + + if (cudaPeekAtLastError() != cudaSuccess) + printf("Right-side launch fail: %s\n", cudaGetErrorString(cudaGetLastError())); + } } - } } //////////////////////////////////////////////////////////////////////////////// @@ -343,241 +346,236 @@ __global__ void qsort_warp(unsigned *indata, unsigned *outdata, // Returns the time elapsed for the sort. 
// //////////////////////////////////////////////////////////////////////////////// -float run_quicksort_cdp(unsigned *gpudata, unsigned *scratchdata, - unsigned int count, cudaStream_t stream) { - unsigned int stacksize = QSORT_STACK_ELEMS; +float run_quicksort_cdp(unsigned *gpudata, unsigned *scratchdata, unsigned int count, cudaStream_t stream) +{ + unsigned int stacksize = QSORT_STACK_ELEMS; - // This is the stack, for atomic tracking of each sort's status - qsortAtomicData *gpustack; - checkCudaErrors( - cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData))); - checkCudaErrors(cudaMemset( - gpustack, 0, sizeof(qsortAtomicData))); // Only need set first entry to 0 + // This is the stack, for atomic tracking of each sort's status + qsortAtomicData *gpustack; + checkCudaErrors(cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData))); + checkCudaErrors(cudaMemset(gpustack, 0, sizeof(qsortAtomicData))); // Only need set first entry to 0 - // Create the memory ringbuffer used for handling the stack. - // Initialise everything to where it needs to be. - qsortRingbuf buf; - qsortRingbuf *ringbuf; - checkCudaErrors(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf))); - buf.head = 1; // We start with one allocation - buf.tail = 0; - buf.count = 0; - buf.max = 0; - buf.stacksize = stacksize; - buf.stackbase = gpustack; - checkCudaErrors( - cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice)); + // Create the memory ringbuffer used for handling the stack. + // Initialise everything to where it needs to be. + qsortRingbuf buf; + qsortRingbuf *ringbuf; + checkCudaErrors(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf))); + buf.head = 1; // We start with one allocation + buf.tail = 0; + buf.count = 0; + buf.max = 0; + buf.stacksize = stacksize; + buf.stackbase = gpustack; + checkCudaErrors(cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice)); - // Timing events... - cudaEvent_t ev1, ev2; - checkCudaErrors(cudaEventCreate(&ev1)); - checkCudaErrors(cudaEventCreate(&ev2)); - checkCudaErrors(cudaEventRecord(ev1)); + // Timing events... 
+ cudaEvent_t ev1, ev2; + checkCudaErrors(cudaEventCreate(&ev1)); + checkCudaErrors(cudaEventCreate(&ev2)); + checkCudaErrors(cudaEventRecord(ev1)); - // Now we trivially launch the qsort kernel - if (count > BITONICSORT_LEN) { - unsigned int numblocks = - (unsigned int)(count + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT; - qsort_warp<<>>( - gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0); - } else { - bitonicsort<<<1, BITONICSORT_LEN>>>(gpudata, gpudata, 0, count); - } - - checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaEventRecord(ev2)); - checkCudaErrors(cudaDeviceSynchronize()); - - float elapse = 0.0f; - - if (cudaPeekAtLastError() != cudaSuccess) - printf("Launch failure: %s\n", cudaGetErrorString(cudaGetLastError())); - else - checkCudaErrors(cudaEventElapsedTime(&elapse, ev1, ev2)); - - // Sanity check that the stack allocator is doing the right thing - checkCudaErrors( - cudaMemcpy(&buf, ringbuf, sizeof(*ringbuf), cudaMemcpyDeviceToHost)); - - if (count > BITONICSORT_LEN && buf.head != buf.tail) { - printf("Stack allocation error!\nRingbuf:\n"); - printf("\t head = %u\n", buf.head); - printf("\t tail = %u\n", buf.tail); - printf("\tcount = %u\n", buf.count); - printf("\t max = %u\n", buf.max); - } - - // Release our stack data once we're done - checkCudaErrors(cudaFree(ringbuf)); - checkCudaErrors(cudaFree(gpustack)); - - return elapse; -} - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -int run_qsort(unsigned int size, int seed, int debug, int loop, int verbose) { - if (seed > 0) srand(seed); - - // Create and set up our test - unsigned *gpudata, *scratchdata; - checkCudaErrors(cudaMalloc((void **)&gpudata, size * sizeof(unsigned))); - checkCudaErrors(cudaMalloc((void **)&scratchdata, size * sizeof(unsigned))); - - // Create CPU data. - unsigned *data = new unsigned[size]; - unsigned int min = loop ? loop : size; - unsigned int max = size; - loop = (loop == 0) ? 1 : loop; - - for (size = min; size <= max; size += loop) { - if (verbose) printf(" Input: "); - - for (unsigned int i = 0; i < size; i++) { - // Build data 8 bits at a time - data[i] = 0; - char *ptr = (char *)&(data[i]); - - for (unsigned j = 0; j < sizeof(unsigned); j++) { - // Easy-to-read data in debug mode - if (debug) { - *ptr++ = (char)(rand() % 10); - break; - } - - *ptr++ = (char)(rand() & 255); - } - - if (verbose) { - if (i && !(i % 32)) printf("\n "); - - printf("%u ", data[i]); - } + // Now we trivially launch the qsort kernel + if (count > BITONICSORT_LEN) { + unsigned int numblocks = (unsigned int)(count + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT; + qsort_warp<<>>( + gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0); + } + else { + bitonicsort<<<1, BITONICSORT_LEN>>>(gpudata, gpudata, 0, count); } - if (verbose) printf("\n"); - - checkCudaErrors(cudaMemcpy(gpudata, data, size * sizeof(unsigned), - cudaMemcpyHostToDevice)); - - // So we're now populated and ready to go! We size our launch as - // blocks of up to BLOCKSIZE threads, and appropriate grid size. - // One thread is launched per element. 
- float elapse; - elapse = run_quicksort_cdp(gpudata, scratchdata, size, NULL); - - // run_bitonicsort(gpudata, scratchdata, size, verbose); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaEventRecord(ev2)); checkCudaErrors(cudaDeviceSynchronize()); - // Copy back the data and verify correct sort - checkCudaErrors(cudaMemcpy(data, gpudata, size * sizeof(unsigned), - cudaMemcpyDeviceToHost)); + float elapse = 0.0f; - if (verbose) { - printf("Output: "); + if (cudaPeekAtLastError() != cudaSuccess) + printf("Launch failure: %s\n", cudaGetErrorString(cudaGetLastError())); + else + checkCudaErrors(cudaEventElapsedTime(&elapse, ev1, ev2)); - for (unsigned int i = 0; i < size; i++) { - if (i && !(i % 32)) printf("\n "); + // Sanity check that the stack allocator is doing the right thing + checkCudaErrors(cudaMemcpy(&buf, ringbuf, sizeof(*ringbuf), cudaMemcpyDeviceToHost)); - printf("%u ", data[i]); - } - - printf("\n"); + if (count > BITONICSORT_LEN && buf.head != buf.tail) { + printf("Stack allocation error!\nRingbuf:\n"); + printf("\t head = %u\n", buf.head); + printf("\t tail = %u\n", buf.tail); + printf("\tcount = %u\n", buf.count); + printf("\t max = %u\n", buf.max); } - unsigned int check; + // Release our stack data once we're done + checkCudaErrors(cudaFree(ringbuf)); + checkCudaErrors(cudaFree(gpustack)); - for (check = 1; check < size; check++) { - if (data[check] < data[check - 1]) { - printf("FAILED at element: %d\n", check); - break; - } - } - - if (check != size) { - printf(" cdpAdvancedQuicksort FAILED\n"); - exit(EXIT_FAILURE); - } else - printf(" cdpAdvancedQuicksort PASSED\n"); - - // Display the time between event recordings - printf("Sorted %u elems in %.3f ms (%.3f Melems/sec)\n", size, elapse, - (float)size / (elapse * 1000.0f)); - fflush(stdout); - } - - // Release everything and we're done - checkCudaErrors(cudaFree(scratchdata)); - checkCudaErrors(cudaFree(gpudata)); - delete (data); - return 0; + return elapse; } -static void usage() { - printf( - "Syntax: cdpAdvancedQuicksort [-size=] [-seed=] [-debug] " - "[-loop-step=] [-verbose]\n"); - printf( - "If loop_step is non-zero, will run from 1->array_len in steps of " - "loop_step\n"); +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +int run_qsort(unsigned int size, int seed, int debug, int loop, int verbose) +{ + if (seed > 0) + srand(seed); + + // Create and set up our test + unsigned *gpudata, *scratchdata; + checkCudaErrors(cudaMalloc((void **)&gpudata, size * sizeof(unsigned))); + checkCudaErrors(cudaMalloc((void **)&scratchdata, size * sizeof(unsigned))); + + // Create CPU data. + unsigned *data = new unsigned[size]; + unsigned int min = loop ? loop : size; + unsigned int max = size; + loop = (loop == 0) ? 1 : loop; + + for (size = min; size <= max; size += loop) { + if (verbose) + printf(" Input: "); + + for (unsigned int i = 0; i < size; i++) { + // Build data 8 bits at a time + data[i] = 0; + char *ptr = (char *)&(data[i]); + + for (unsigned j = 0; j < sizeof(unsigned); j++) { + // Easy-to-read data in debug mode + if (debug) { + *ptr++ = (char)(rand() % 10); + break; + } + + *ptr++ = (char)(rand() & 255); + } + + if (verbose) { + if (i && !(i % 32)) + printf("\n "); + + printf("%u ", data[i]); + } + } + + if (verbose) + printf("\n"); + + checkCudaErrors(cudaMemcpy(gpudata, data, size * sizeof(unsigned), cudaMemcpyHostToDevice)); + + // So we're now populated and ready to go! 
We size our launch as + // blocks of up to BLOCKSIZE threads, and appropriate grid size. + // One thread is launched per element. + float elapse; + elapse = run_quicksort_cdp(gpudata, scratchdata, size, NULL); + + // run_bitonicsort(gpudata, scratchdata, size, verbose); + checkCudaErrors(cudaDeviceSynchronize()); + + // Copy back the data and verify correct sort + checkCudaErrors(cudaMemcpy(data, gpudata, size * sizeof(unsigned), cudaMemcpyDeviceToHost)); + + if (verbose) { + printf("Output: "); + + for (unsigned int i = 0; i < size; i++) { + if (i && !(i % 32)) + printf("\n "); + + printf("%u ", data[i]); + } + + printf("\n"); + } + + unsigned int check; + + for (check = 1; check < size; check++) { + if (data[check] < data[check - 1]) { + printf("FAILED at element: %d\n", check); + break; + } + } + + if (check != size) { + printf(" cdpAdvancedQuicksort FAILED\n"); + exit(EXIT_FAILURE); + } + else + printf(" cdpAdvancedQuicksort PASSED\n"); + + // Display the time between event recordings + printf("Sorted %u elems in %.3f ms (%.3f Melems/sec)\n", size, elapse, (float)size / (elapse * 1000.0f)); + fflush(stdout); + } + + // Release everything and we're done + checkCudaErrors(cudaFree(scratchdata)); + checkCudaErrors(cudaFree(gpudata)); + delete (data); + return 0; +} + +static void usage() +{ + printf("Syntax: cdpAdvancedQuicksort [-size=] [-seed=] [-debug] " + "[-loop-step=] [-verbose]\n"); + printf("If loop_step is non-zero, will run from 1->array_len in steps of " + "loop_step\n"); } // Host side entry -int main(int argc, char *argv[]) { - int size = 1000000; - unsigned int seed = 0; - int debug = 0; - int loop = 0; - int verbose = 0; +int main(int argc, char *argv[]) +{ + int size = 1000000; + unsigned int seed = 0; + int debug = 0; + int loop = 0; + int verbose = 0; - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "h")) { - usage(); - printf("&&&& cdpAdvancedQuicksort WAIVED\n"); - exit(EXIT_WAIVED); - } + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h")) { + usage(); + printf("&&&& cdpAdvancedQuicksort WAIVED\n"); + exit(EXIT_WAIVED); + } - if (checkCmdLineFlag(argc, (const char **)argv, "size")) { - size = getCmdLineArgumentInt(argc, (const char **)argv, "size"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "size")) { + size = getCmdLineArgumentInt(argc, (const char **)argv, "size"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "seed")) { - seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "seed")) { + seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "loop-step")) { - loop = getCmdLineArgumentInt(argc, (const char **)argv, "loop-step"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "loop-step")) { + loop = getCmdLineArgumentInt(argc, (const char **)argv, "loop-step"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "debug")) { - debug = 1; - } + if (checkCmdLineFlag(argc, (const char **)argv, "debug")) { + debug = 1; + } - if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) { - verbose = 1; - } + if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) { + verbose = 1; + } - // Get device properties - int cuda_device = findCudaDevice(argc, (const char **)argv); - cudaDeviceProp properties; - checkCudaErrors(cudaGetDeviceProperties(&properties, cuda_device)); - int cdpCapable = - 
(properties.major == 3 && properties.minor >= 5) || properties.major >= 4; + // Get device properties + int cuda_device = findCudaDevice(argc, (const char **)argv); + cudaDeviceProp properties; + checkCudaErrors(cudaGetDeviceProperties(&properties, cuda_device)); + int cdpCapable = (properties.major == 3 && properties.minor >= 5) || properties.major >= 4; - printf("GPU device %s has compute capabilities (SM %d.%d)\n", properties.name, - properties.major, properties.minor); + printf("GPU device %s has compute capabilities (SM %d.%d)\n", properties.name, properties.major, properties.minor); - if (!cdpCapable) { - printf( - "cdpAdvancedQuicksort requires SM 3.5 or higher to use CUDA Dynamic " - "Parallelism. Exiting...\n"); - exit(EXIT_WAIVED); - } + if (!cdpCapable) { + printf("cdpAdvancedQuicksort requires SM 3.5 or higher to use CUDA Dynamic " + "Parallelism. Exiting...\n"); + exit(EXIT_WAIVED); + } - printf("Running qsort on %d elements with seed %d, on %s\n", size, seed, - properties.name); + printf("Running qsort on %d elements with seed %d, on %s\n", size, seed, properties.name); - run_qsort(size, seed, debug, loop, verbose); + run_qsort(size, seed, debug, loop, verbose); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } diff --git a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpBitonicSort.cu b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpBitonicSort.cu index 0d1dfae8..b54c39cb 100644 --- a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpBitonicSort.cu +++ b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpBitonicSort.cu @@ -33,18 +33,19 @@ // // The multithread code is from me. -#include #include +#include namespace cg = cooperative_groups; #include "cdpQuicksort.h" // Inline PTX call to return index of highest non-zero bit in a word -static __device__ __forceinline__ unsigned int __btflo(unsigned int word) { - unsigned int ret; - asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word)); - return ret; +static __device__ __forceinline__ unsigned int __btflo(unsigned int word) +{ + unsigned int ret; + asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word)); + return ret; } //////////////////////////////////////////////////////////////////////////////// @@ -57,8 +58,9 @@ static __device__ __forceinline__ unsigned int __btflo(unsigned int word) { // Perhaps it requires a class? // //////////////////////////////////////////////////////////////////////////////// -__device__ __forceinline__ int qcompare(unsigned &val1, unsigned &val2) { - return (val1 > val2) ? 1 : (val1 == val2) ? 0 : -1; +__device__ __forceinline__ int qcompare(unsigned &val1, unsigned &val2) +{ + return (val1 > val2) ? 1 : (val1 == val2) ? 0 : -1; } //////////////////////////////////////////////////////////////////////////////// @@ -84,68 +86,67 @@ __device__ __forceinline__ int qcompare(unsigned &val1, unsigned &val2) { // how much data we can sort per block. // //////////////////////////////////////////////////////////////////////////////// -static __device__ __forceinline__ void bitonicsort_kernel( - unsigned *indata, unsigned *outdata, unsigned int offset, unsigned int len, - cg::thread_block cta) { - __shared__ unsigned - sortbuf[1024]; // Max of 1024 elements - TODO: make this dynamic +static __device__ __forceinline__ void +bitonicsort_kernel(unsigned *indata, unsigned *outdata, unsigned int offset, unsigned int len, cg::thread_block cta) +{ + __shared__ unsigned sortbuf[1024]; // Max of 1024 elements - TODO: make this dynamic - // First copy data into shared memory. 
- unsigned int inside = (threadIdx.x < len); - sortbuf[threadIdx.x] = inside ? indata[threadIdx.x + offset] : 0xffffffffu; - cg::sync(cta); + // First copy data into shared memory. + unsigned int inside = (threadIdx.x < len); + sortbuf[threadIdx.x] = inside ? indata[threadIdx.x + offset] : 0xffffffffu; + cg::sync(cta); - // Now the sort loops - // Here, "k" is the sort level (remember bitonic does a multi-level butterfly - // style sort) - // and "j" is the partner element in the butterfly. - // Two threads each work on one butterfly, because the read/write needs to - // happen - // simultaneously - for (unsigned int k = 2; k <= blockDim.x; - k *= 2) // Butterfly stride increments in powers of 2 - { - for (unsigned int j = k >> 1; j > 0; - j >>= 1) // Strides also in powers of to, up to > 1; j > 0; j >>= 1) // Strides also in powers of to, up to swap_idx, then ascending means mine threadIdx.x); - bool swap = false; + // The k'th bit of my threadid (and hence my sort item ID) + // determines if we sort ascending or descending. + // However, since threads are reading from the top AND the bottom of + // the butterfly, if my ID is > swap_idx, then ascending means mine threadIdx.x); + bool swap = false; - if ((threadIdx.x & k) == ascend) { - if (my_elem > swap_elem) swap = true; - } + if ((threadIdx.x & k) == ascend) { + if (my_elem > swap_elem) + swap = true; + } - if ((threadIdx.x & k) == descend) { - if (my_elem < swap_elem) swap = true; - } + if ((threadIdx.x & k) == descend) { + if (my_elem < swap_elem) + swap = true; + } - // If we had to swap, then write my data to the other element's position. - // Don't forget to track out-of-range status too! - if (swap) { - sortbuf[swap_idx] = my_elem; - } + // If we had to swap, then write my data to the other element's position. + // Don't forget to track out-of-range status too! + if (swap) { + sortbuf[swap_idx] = my_elem; + } - cg::sync(cta); + cg::sync(cta); + } } - } - // Copy the sorted data from shared memory back to the output buffer - if (threadIdx.x < len) outdata[threadIdx.x + offset] = sortbuf[threadIdx.x]; + // Copy the sorted data from shared memory back to the output buffer + if (threadIdx.x < len) + outdata[threadIdx.x + offset] = sortbuf[threadIdx.x]; } ////////////////////////////////////////////////////////////////////////////////// @@ -161,126 +162,129 @@ static __device__ __forceinline__ void bitonicsort_kernel( // type. It must be a directly-comparable (i.e. with max value) type. // //////////////////////////////////////////////////////////////////////////////// -static __device__ __forceinline__ void big_bitonicsort_kernel( - unsigned *indata, unsigned *outdata, unsigned *backbuf, unsigned int offset, - unsigned int len, cg::thread_block cta) { - unsigned int len2 = - 1 << (__btflo(len - 1U) + 1); // Round up len to nearest power-of-2 +static __device__ __forceinline__ void big_bitonicsort_kernel(unsigned *indata, + unsigned *outdata, + unsigned *backbuf, + unsigned int offset, + unsigned int len, + cg::thread_block cta) +{ + unsigned int len2 = 1 << (__btflo(len - 1U) + 1); // Round up len to nearest power-of-2 - if (threadIdx.x >= len2) - return; // Early out for case where more threads launched than there is - // data + if (threadIdx.x >= len2) + return; // Early out for case where more threads launched than there is + // data - // First, set up our unused values to be the max data type. 
- for (unsigned int i = len; i < len2; i += blockDim.x) { - unsigned int index = i + threadIdx.x; + // First, set up our unused values to be the max data type. + for (unsigned int i = len; i < len2; i += blockDim.x) { + unsigned int index = i + threadIdx.x; - if (index < len2) { - // Must split our index between two buffers - if (index < len) - indata[index + offset] = 0xffffffffu; - else - backbuf[index + offset - len] = 0xffffffffu; - } - } - - cg::sync(cta); - - // Now the sort loops - // Here, "k" is the sort level (remember bitonic does a multi-level butterfly - // style sort) - // and "j" is the partner element in the butterfly. - // Two threads each work on one butterfly, because the read/write needs to - // happen - // simultaneously - for (unsigned int k = 2; k <= len2; - k *= 2) // Butterfly stride increments in powers of 2 - { - for (unsigned int j = k >> 1; j > 0; - j >>= 1) // Strides also in powers of to, up to index) { - unsigned my_elem, swap_elem; - - if (index < len) - my_elem = indata[index + offset]; - else - my_elem = backbuf[index + offset - len]; - - if (swap_idx < len) - swap_elem = indata[swap_idx + offset]; - else - swap_elem = backbuf[swap_idx + offset - len]; - - // The k'th bit of my index (and hence my sort item ID) - // determines if we sort ascending or descending. - // Also, if either my_elem or swap_elem is out of range, then it - // ALWAYS acts like it's the largest number. - bool swap = false; - - if ((index & k) == 0) { - if (my_elem > swap_elem) swap = true; - } - - if ((index & k) == k) { - if (my_elem < swap_elem) swap = true; - } - - // If we had to swap, then write my data to the other element's - // position. - if (swap) { - if (swap_idx < len) - indata[swap_idx + offset] = my_elem; + if (index < len2) { + // Must split our index between two buffers + if (index < len) + indata[index + offset] = 0xffffffffu; else - backbuf[swap_idx + offset - len] = my_elem; + backbuf[index + offset - len] = 0xffffffffu; + } + } + + cg::sync(cta); + + // Now the sort loops + // Here, "k" is the sort level (remember bitonic does a multi-level butterfly + // style sort) + // and "j" is the partner element in the butterfly. + // Two threads each work on one butterfly, because the read/write needs to + // happen + // simultaneously + for (unsigned int k = 2; k <= len2; k *= 2) // Butterfly stride increments in powers of 2 + { + for (unsigned int j = k >> 1; j > 0; j >>= 1) // Strides also in powers of to, up to index) { + unsigned my_elem, swap_elem; + + if (index < len) + my_elem = indata[index + offset]; + else + my_elem = backbuf[index + offset - len]; + + if (swap_idx < len) + swap_elem = indata[swap_idx + offset]; + else + swap_elem = backbuf[swap_idx + offset - len]; + + // The k'th bit of my index (and hence my sort item ID) + // determines if we sort ascending or descending. + // Also, if either my_elem or swap_elem is out of range, then it + // ALWAYS acts like it's the largest number. + bool swap = false; + + if ((index & k) == 0) { + if (my_elem > swap_elem) + swap = true; + } + + if ((index & k) == k) { + if (my_elem < swap_elem) + swap = true; + } + + // If we had to swap, then write my data to the other element's + // position. 
+ if (swap) { + if (swap_idx < len) + indata[swap_idx + offset] = my_elem; + else + backbuf[swap_idx + offset - len] = my_elem; + + if (index < len) + indata[index + offset] = swap_elem; + else + backbuf[index + offset - len] = swap_elem; + } + } + } + + cg::sync(cta); // Only need to sync for each "j" pass + } + } + + // Copy the sorted data from the input to the output buffer, because we sort + // in-place + if (outdata != indata) { + for (unsigned int i = 0; i < len; i += blockDim.x) { + unsigned int index = i + threadIdx.x; if (index < len) - indata[index + offset] = swap_elem; - else - backbuf[index + offset - len] = swap_elem; - } + outdata[index + offset] = indata[index + offset]; } - } - - cg::sync(cta); // Only need to sync for each "j" pass } - } - - // Copy the sorted data from the input to the output buffer, because we sort - // in-place - if (outdata != indata) { - for (unsigned int i = 0; i < len; i += blockDim.x) { - unsigned int index = i + threadIdx.x; - - if (index < len) outdata[index + offset] = indata[index + offset]; - } - } } //////////////////////////////////////////////////////////////////////////////// // KERNELS //////////////////////////////////////////////////////////////////////////////// -__global__ void bitonicsort(unsigned *indata, unsigned *outdata, - unsigned int offset, unsigned int len) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - bitonicsort_kernel(indata, outdata, offset, len, cta); +__global__ void bitonicsort(unsigned *indata, unsigned *outdata, unsigned int offset, unsigned int len) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + bitonicsort_kernel(indata, outdata, offset, len, cta); } -__global__ void big_bitonicsort(unsigned *indata, unsigned *outdata, - unsigned *backbuf, unsigned int offset, - unsigned int len) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - big_bitonicsort_kernel(indata, outdata, backbuf, offset, len, cta); +__global__ void +big_bitonicsort(unsigned *indata, unsigned *outdata, unsigned *backbuf, unsigned int offset, unsigned int len) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + big_bitonicsort_kernel(indata, outdata, backbuf, offset, len, cta); } //////////////////////////////////////////////////////////////////////////////// diff --git a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpQuicksort.h b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpQuicksort.h index ae7c0b06..708fb7f7 100644 --- a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpQuicksort.h +++ b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/cdpQuicksort.h @@ -31,49 +31,47 @@ #define QUICKSORT_H #define QSORT_BLOCKSIZE_SHIFT 9 -#define QSORT_BLOCKSIZE (1 << QSORT_BLOCKSIZE_SHIFT) -#define BITONICSORT_LEN 1024 // Must be power of 2! -#define QSORT_MAXDEPTH \ - 16 // Will force final bitonic stage at depth QSORT_MAXDEPTH+1 +#define QSORT_BLOCKSIZE (1 << QSORT_BLOCKSIZE_SHIFT) +#define BITONICSORT_LEN 1024 // Must be power of 2! +#define QSORT_MAXDEPTH 16 // Will force final bitonic stage at depth QSORT_MAXDEPTH+1 //////////////////////////////////////////////////////////////////////////////// // The algorithm uses several variables updated by using atomic operations. 
//////////////////////////////////////////////////////////////////////////////// -typedef struct __align__(128) qsortAtomicData_t { - volatile unsigned int lt_offset; // Current output offset for pivot - volatile unsigned int sorted_count; // Total count sorted, for deciding when - // to launch next wave - volatile unsigned int - index; // Ringbuf tracking index. Can be ignored if not using ringbuf. -} -qsortAtomicData; +typedef struct __align__(128) qsortAtomicData_t +{ + volatile unsigned int lt_offset; // Current output offset for pivot + volatile unsigned int sorted_count; // Total count sorted, for deciding when + // to launch next wave + volatile unsigned int index; // Ringbuf tracking index. Can be ignored if not using ringbuf. +} qsortAtomicData; //////////////////////////////////////////////////////////////////////////////// // A ring-buffer for rapid stack allocation //////////////////////////////////////////////////////////////////////////////// -typedef struct qsortRingbuf_t { - volatile unsigned int head; // Head pointer - we allocate from here - volatile unsigned int - tail; // Tail pointer - indicates last still-in-use element - volatile unsigned int count; // Total count allocated - volatile unsigned int max; // Max index allocated - unsigned int stacksize; // Wrap-around size of buffer (must be power of 2) - volatile void *stackbase; // Pointer to the stack we're allocating from +typedef struct qsortRingbuf_t +{ + volatile unsigned int head; // Head pointer - we allocate from here + volatile unsigned int tail; // Tail pointer - indicates last still-in-use element + volatile unsigned int count; // Total count allocated + volatile unsigned int max; // Max index allocated + unsigned int stacksize; // Wrap-around size of buffer (must be power of 2) + volatile void *stackbase; // Pointer to the stack we're allocating from } qsortRingbuf; // Stack elem count must be power of 2! -#define QSORT_STACK_ELEMS \ - 1 * 1024 * 1024 // One million stack elements is a HUGE number. +#define QSORT_STACK_ELEMS 1 * 1024 * 1024 // One million stack elements is a HUGE number. -__global__ void qsort_warp(unsigned *indata, unsigned *outdata, - unsigned int len, qsortAtomicData *atomicData, - qsortRingbuf *ringbuf, unsigned int source_is_indata, - unsigned int depth); -__global__ void bitonicsort(unsigned *indata, unsigned *outdata, - unsigned int offset, unsigned int len); -__global__ void big_bitonicsort(unsigned *indata, unsigned *outdata, - unsigned *backbuf, unsigned int offset, - unsigned int len); +__global__ void qsort_warp(unsigned *indata, + unsigned *outdata, + unsigned int len, + qsortAtomicData *atomicData, + qsortRingbuf *ringbuf, + unsigned int source_is_indata, + unsigned int depth); +__global__ void bitonicsort(unsigned *indata, unsigned *outdata, unsigned int offset, unsigned int len); +__global__ void +big_bitonicsort(unsigned *indata, unsigned *outdata, unsigned *backbuf, unsigned int offset, unsigned int len); -#endif // QUICKSORT_H +#endif // QUICKSORT_H diff --git a/Samples/3_CUDA_Features/cdpBezierTessellation/BezierLineCDP.cu b/Samples/3_CUDA_Features/cdpBezierTessellation/BezierLineCDP.cu index 1fb82aab..6b1af34e 100644 --- a/Samples/3_CUDA_Features/cdpBezierTessellation/BezierLineCDP.cu +++ b/Samples/3_CUDA_Features/cdpBezierTessellation/BezierLineCDP.cu @@ -25,183 +25,181 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include #include #include +#include #include -__forceinline__ __device__ float2 operator+(float2 a, float2 b) { - float2 c; - c.x = a.x + b.x; - c.y = a.y + b.y; - return c; +__forceinline__ __device__ float2 operator+(float2 a, float2 b) +{ + float2 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; } -__forceinline__ __device__ float2 operator-(float2 a, float2 b) { - float2 c; - c.x = a.x - b.x; - c.y = a.y - b.y; - return c; +__forceinline__ __device__ float2 operator-(float2 a, float2 b) +{ + float2 c; + c.x = a.x - b.x; + c.y = a.y - b.y; + return c; } -__forceinline__ __device__ float2 operator*(float a, float2 b) { - float2 c; - c.x = a * b.x; - c.y = a * b.y; - return c; +__forceinline__ __device__ float2 operator*(float a, float2 b) +{ + float2 c; + c.x = a * b.x; + c.y = a * b.y; + return c; } -__forceinline__ __device__ float length(float2 a) { - return sqrtf(a.x * a.x + a.y * a.y); -} +__forceinline__ __device__ float length(float2 a) { return sqrtf(a.x * a.x + a.y * a.y); } #define MAX_TESSELLATION 32 -struct BezierLine { - float2 CP[3]; - float2 *vertexPos; - int nVertices; +struct BezierLine +{ + float2 CP[3]; + float2 *vertexPos; + int nVertices; }; -__global__ void computeBezierLinePositions(int lidx, BezierLine *bLines, - int nTessPoints) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; +__global__ void computeBezierLinePositions(int lidx, BezierLine *bLines, int nTessPoints) +{ + int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < nTessPoints) { - float u = (float)idx / (float)(nTessPoints - 1); - float omu = 1.0f - u; + if (idx < nTessPoints) { + float u = (float)idx / (float)(nTessPoints - 1); + float omu = 1.0f - u; - float B3u[3]; + float B3u[3]; - B3u[0] = omu * omu; - B3u[1] = 2.0f * u * omu; - B3u[2] = u * u; + B3u[0] = omu * omu; + B3u[1] = 2.0f * u * omu; + B3u[2] = u * u; - float2 position = {0, 0}; + float2 position = {0, 0}; - for (int i = 0; i < 3; i++) { - position = position + B3u[i] * bLines[lidx].CP[i]; + for (int i = 0; i < 3; i++) { + position = position + B3u[i] * bLines[lidx].CP[i]; + } + + bLines[lidx].vertexPos[idx] = position; } - - bLines[lidx].vertexPos[idx] = position; - } } -__global__ void computeBezierLinesCDP(BezierLine *bLines, int nLines) { - int lidx = threadIdx.x + blockDim.x * blockIdx.x; +__global__ void computeBezierLinesCDP(BezierLine *bLines, int nLines) +{ + int lidx = threadIdx.x + blockDim.x * blockIdx.x; - if (lidx < nLines) { - float curvature = length(bLines[lidx].CP[1] - - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) / - length(bLines[lidx].CP[2] - bLines[lidx].CP[0]); - int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION); + if (lidx < nLines) { + float curvature = length(bLines[lidx].CP[1] - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) + / length(bLines[lidx].CP[2] - bLines[lidx].CP[0]); + int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION); - if (bLines[lidx].vertexPos == NULL) { - bLines[lidx].nVertices = nTessPoints; - cudaMalloc((void **)&bLines[lidx].vertexPos, - nTessPoints * sizeof(float2)); + if (bLines[lidx].vertexPos == NULL) { + bLines[lidx].nVertices = nTessPoints; + cudaMalloc((void **)&bLines[lidx].vertexPos, nTessPoints * sizeof(float2)); + } + + computeBezierLinePositions<<<ceil((float)bLines[lidx].nVertices / 32.0f), 32>>>( + lidx, bLines, bLines[lidx].nVertices); } - - computeBezierLinePositions<<<ceil((float)bLines[lidx].nVertices / 32.0f), 32>>>(lidx, bLines, bLines[lidx].nVertices); - } } -__global__ void freeVertexMem(BezierLine *bLines, int nLines) { - int lidx = threadIdx.x + blockDim.x * blockIdx.x; +__global__ void
freeVertexMem(BezierLine *bLines, int nLines) +{ + int lidx = threadIdx.x + blockDim.x * blockIdx.x; - if (lidx < nLines) cudaFree(bLines[lidx].vertexPos); + if (lidx < nLines) + cudaFree(bLines[lidx].vertexPos); } -unsigned int checkCapableSM35Device(int argc, char **argv) { - // Get device properties - cudaDeviceProp properties; - int device_count = 0, device = -1; - - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - device = getCmdLineArgumentInt(argc, (const char **)argv, "device"); - +unsigned int checkCapableSM35Device(int argc, char **argv) +{ + // Get device properties cudaDeviceProp properties; - checkCudaErrors(cudaGetDeviceProperties(&properties, device)); + int device_count = 0, device = -1; - if (properties.major > 3 || - (properties.major == 3 && properties.minor >= 5)) { - printf("Running on GPU %d (%s)\n", device, properties.name); - } else { - printf( - "cdpBezierTessellation requires GPU devices with compute SM 3.5 or " - "higher."); - printf("Current GPU device has compute SM %d.%d. Exiting...\n", - properties.major, properties.minor); - return EXIT_FAILURE; + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + device = getCmdLineArgumentInt(argc, (const char **)argv, "device"); + + cudaDeviceProp properties; + checkCudaErrors(cudaGetDeviceProperties(&properties, device)); + + if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5)) { + printf("Running on GPU %d (%s)\n", device, properties.name); + } + else { + printf("cdpBezierTessellation requires GPU devices with compute SM 3.5 or " + "higher."); + printf("Current GPU device has compute SM %d.%d. Exiting...\n", properties.major, properties.minor); + return EXIT_FAILURE; + } + } + else { + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + for (int i = 0; i < device_count; ++i) { + checkCudaErrors(cudaGetDeviceProperties(&properties, i)); + + if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5)) { + device = i; + printf("Running on GPU %d (%s)\n", i, properties.name); + break; + } + + printf("GPU %d %s does not support CUDA Dynamic Parallelism\n", i, properties.name); + } + } + if (device == -1) { + fprintf(stderr, + "cdpBezierTessellation requires GPU devices with compute SM 3.5 or " + "higher. Exiting...\n"); + return EXIT_WAIVED; } - } else { - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - for (int i = 0; i < device_count; ++i) { - checkCudaErrors(cudaGetDeviceProperties(&properties, i)); - - if (properties.major > 3 || - (properties.major == 3 && properties.minor >= 5)) { - device = i; - printf("Running on GPU %d (%s)\n", i, properties.name); - break; - } - - printf("GPU %d %s does not support CUDA Dynamic Parallelism\n", i, - properties.name); - } - } - if (device == -1) { - fprintf(stderr, - "cdpBezierTessellation requires GPU devices with compute SM 3.5 or " - "higher. 
Exiting...\n"); - return EXIT_WAIVED; - } - - return EXIT_SUCCESS; + return EXIT_SUCCESS; } -#define N_LINES 256 +#define N_LINES 256 #define BLOCK_DIM 64 -int main(int argc, char **argv) { - BezierLine *bLines_h = new BezierLine[N_LINES]; +int main(int argc, char **argv) +{ + BezierLine *bLines_h = new BezierLine[N_LINES]; - float2 last = {0, 0}; + float2 last = {0, 0}; - for (int i = 0; i < N_LINES; i++) { - bLines_h[i].CP[0] = last; + for (int i = 0; i < N_LINES; i++) { + bLines_h[i].CP[0] = last; - for (int j = 1; j < 3; j++) { - bLines_h[i].CP[j].x = (float)rand() / (float)RAND_MAX; - bLines_h[i].CP[j].y = (float)rand() / (float)RAND_MAX; + for (int j = 1; j < 3; j++) { + bLines_h[i].CP[j].x = (float)rand() / (float)RAND_MAX; + bLines_h[i].CP[j].y = (float)rand() / (float)RAND_MAX; + } + + last = bLines_h[i].CP[2]; + bLines_h[i].vertexPos = NULL; + bLines_h[i].nVertices = 0; } - last = bLines_h[i].CP[2]; - bLines_h[i].vertexPos = NULL; - bLines_h[i].nVertices = 0; - } + unsigned int sm35Ret = checkCapableSM35Device(argc, argv); + if (sm35Ret != EXIT_SUCCESS) { + exit(sm35Ret); + } - unsigned int sm35Ret = checkCapableSM35Device(argc, argv); - if (sm35Ret != EXIT_SUCCESS) { - exit(sm35Ret); - } + BezierLine *bLines_d; + checkCudaErrors(cudaMalloc((void **)&bLines_d, N_LINES * sizeof(BezierLine))); + checkCudaErrors(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine), cudaMemcpyHostToDevice)); + printf("Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... "); + computeBezierLinesCDP<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM), BLOCK_DIM>>>(bLines_d, N_LINES); + printf("Done!\n"); - BezierLine *bLines_d; - checkCudaErrors(cudaMalloc((void **)&bLines_d, N_LINES * sizeof(BezierLine))); - checkCudaErrors(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine), - cudaMemcpyHostToDevice)); - printf("Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... "); - computeBezierLinesCDP<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM), - BLOCK_DIM>>>(bLines_d, N_LINES); - printf("Done!\n"); + // Do something to draw the lines here - // Do something to draw the lines here + freeVertexMem<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM), BLOCK_DIM>>>(bLines_d, N_LINES); + checkCudaErrors(cudaFree(bLines_d)); + delete[] bLines_h; - freeVertexMem<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM), - BLOCK_DIM>>>(bLines_d, N_LINES); - checkCudaErrors(cudaFree(bLines_d)); - delete[] bLines_h; - - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } diff --git a/Samples/3_CUDA_Features/cdpQuadtree/cdpQuadtree.cu b/Samples/3_CUDA_Features/cdpQuadtree/cdpQuadtree.cu index 06fbddb7..60112fda 100644 --- a/Samples/3_CUDA_Features/cdpQuadtree/cdpQuadtree.cu +++ b/Samples/3_CUDA_Features/cdpQuadtree/cdpQuadtree.cu @@ -25,10 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include +#include #include #include -#include +#include namespace cg = cooperative_groups; #include @@ -36,164 +36,176 @@ namespace cg = cooperative_groups; //////////////////////////////////////////////////////////////////////////////// // A structure of 2D points (structure of arrays). //////////////////////////////////////////////////////////////////////////////// -class Points { - float *m_x; - float *m_y; +class Points +{ + float *m_x; + float *m_y; - public: - // Constructor. - __host__ __device__ Points() : m_x(NULL), m_y(NULL) {} +public: + // Constructor. + __host__ __device__ Points() + : m_x(NULL) + , m_y(NULL) + { + } - // Constructor. 
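// For reference, the tessellation kernels above evaluate the quadratic Bezier
// form B(u) = (1-u)^2 * P0 + 2u(1-u) * P1 + u^2 * P2, which is exactly what
// the B3u[] weights encode. A host-side sketch of the same evaluation
// (bezier2_host is a hypothetical helper, not part of the sample):
static float2 bezier2_host(float2 p0, float2 p1, float2 p2, float u)
{
    float  omu = 1.0f - u; // "one minus u", as in the kernel
    float2 r;
    r.x = omu * omu * p0.x + 2.0f * u * omu * p1.x + u * u * p2.x;
    r.y = omu * omu * p0.y + 2.0f * u * omu * p1.y + u * u * p2.y;
    return r;
}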
- __host__ __device__ Points(float *x, float *y) : m_x(x), m_y(y) {} + // Constructor. + __host__ __device__ Points(float *x, float *y) + : m_x(x) + , m_y(y) + { + } - // Get a point. - __host__ __device__ __forceinline__ float2 get_point(int idx) const { - return make_float2(m_x[idx], m_y[idx]); - } + // Get a point. + __host__ __device__ __forceinline__ float2 get_point(int idx) const { return make_float2(m_x[idx], m_y[idx]); } - // Set a point. - __host__ __device__ __forceinline__ void set_point(int idx, const float2 &p) { - m_x[idx] = p.x; - m_y[idx] = p.y; - } + // Set a point. + __host__ __device__ __forceinline__ void set_point(int idx, const float2 &p) + { + m_x[idx] = p.x; + m_y[idx] = p.y; + } - // Set the pointers. - __host__ __device__ __forceinline__ void set(float *x, float *y) { - m_x = x; - m_y = y; - } + // Set the pointers. + __host__ __device__ __forceinline__ void set(float *x, float *y) + { + m_x = x; + m_y = y; + } }; //////////////////////////////////////////////////////////////////////////////// // A 2D bounding box //////////////////////////////////////////////////////////////////////////////// -class Bounding_box { - // Extreme points of the bounding box. - float2 m_p_min; - float2 m_p_max; +class Bounding_box +{ + // Extreme points of the bounding box. + float2 m_p_min; + float2 m_p_max; - public: - // Constructor. Create a unit box. - __host__ __device__ Bounding_box() { - m_p_min = make_float2(0.0f, 0.0f); - m_p_max = make_float2(1.0f, 1.0f); - } +public: + // Constructor. Create a unit box. + __host__ __device__ Bounding_box() + { + m_p_min = make_float2(0.0f, 0.0f); + m_p_max = make_float2(1.0f, 1.0f); + } - // Compute the center of the bounding-box. - __host__ __device__ void compute_center(float2 &center) const { - center.x = 0.5f * (m_p_min.x + m_p_max.x); - center.y = 0.5f * (m_p_min.y + m_p_max.y); - } + // Compute the center of the bounding-box. + __host__ __device__ void compute_center(float2 &center) const + { + center.x = 0.5f * (m_p_min.x + m_p_max.x); + center.y = 0.5f * (m_p_min.y + m_p_max.y); + } - // The points of the box. - __host__ __device__ __forceinline__ const float2 &get_max() const { - return m_p_max; - } + // The points of the box. + __host__ __device__ __forceinline__ const float2 &get_max() const { return m_p_max; } - __host__ __device__ __forceinline__ const float2 &get_min() const { - return m_p_min; - } + __host__ __device__ __forceinline__ const float2 &get_min() const { return m_p_min; } - // Does a box contain a point. - __host__ __device__ bool contains(const float2 &p) const { - return p.x >= m_p_min.x && p.x < m_p_max.x && p.y >= m_p_min.y && - p.y < m_p_max.y; - } + // Does a box contain a point. + __host__ __device__ bool contains(const float2 &p) const + { + return p.x >= m_p_min.x && p.x < m_p_max.x && p.y >= m_p_min.y && p.y < m_p_max.y; + } - // Define the bounding box. - __host__ __device__ void set(float min_x, float min_y, float max_x, - float max_y) { - m_p_min.x = min_x; - m_p_min.y = min_y; - m_p_max.x = max_x; - m_p_max.y = max_y; - } + // Define the bounding box. + __host__ __device__ void set(float min_x, float min_y, float max_x, float max_y) + { + m_p_min.x = min_x; + m_p_min.y = min_y; + m_p_max.x = max_x; + m_p_max.y = max_y; + } }; //////////////////////////////////////////////////////////////////////////////// // A node of a quadtree. //////////////////////////////////////////////////////////////////////////////// -class Quadtree_node { - // The identifier of the node.
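// A short usage sketch of the structure-of-arrays Points wrapper and
// Bounding_box::contains defined above (host-side; the coordinate arrays are
// hypothetical example data, not part of the sample):
static void points_usage_sketch()
{
    float xs[3] = {0.1f, 0.6f, 0.9f};
    float ys[3] = {0.7f, 0.3f, 0.5f};
    Points       pts(xs, ys);               // wraps the two coordinate arrays, no copy
    Bounding_box box;                       // default-constructed unit box [0,1) x [0,1)
    float2       p      = pts.get_point(1); // reads (xs[1], ys[1]) = (0.6f, 0.3f)
    bool         inside = box.contains(p);  // true: both coordinates lie in [0,1)
    (void)inside;
}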
- int m_id; - // The bounding box of the tree. - Bounding_box m_bounding_box; - // The range of points. - int m_begin, m_end; +class Quadtree_node +{ + // The identifier of the node. + int m_id; + // The bounding box of the tree. + Bounding_box m_bounding_box; + // The range of points. + int m_begin, m_end; - public: - // Constructor. - __host__ __device__ Quadtree_node() : m_id(0), m_begin(0), m_end(0) {} +public: + // Constructor. + __host__ __device__ Quadtree_node() + : m_id(0) + , m_begin(0) + , m_end(0) + { + } - // The ID of a node at its level. - __host__ __device__ int id() const { return m_id; } + // The ID of a node at its level. + __host__ __device__ int id() const { return m_id; } - // The ID of a node at its level. - __host__ __device__ void set_id(int new_id) { m_id = new_id; } + // The ID of a node at its level. + __host__ __device__ void set_id(int new_id) { m_id = new_id; } - // The bounding box. - __host__ __device__ __forceinline__ const Bounding_box &bounding_box() const { - return m_bounding_box; - } + // The bounding box. + __host__ __device__ __forceinline__ const Bounding_box &bounding_box() const { return m_bounding_box; } - // Set the bounding box. - __host__ __device__ __forceinline__ void set_bounding_box(float min_x, - float min_y, - float max_x, - float max_y) { - m_bounding_box.set(min_x, min_y, max_x, max_y); - } + // Set the bounding box. + __host__ __device__ __forceinline__ void set_bounding_box(float min_x, float min_y, float max_x, float max_y) + { + m_bounding_box.set(min_x, min_y, max_x, max_y); + } - // The number of points in the tree. - __host__ __device__ __forceinline__ int num_points() const { - return m_end - m_begin; - } + // The number of points in the tree. + __host__ __device__ __forceinline__ int num_points() const { return m_end - m_begin; } - // The range of points in the tree. - __host__ __device__ __forceinline__ int points_begin() const { - return m_begin; - } + // The range of points in the tree. + __host__ __device__ __forceinline__ int points_begin() const { return m_begin; } - __host__ __device__ __forceinline__ int points_end() const { return m_end; } + __host__ __device__ __forceinline__ int points_end() const { return m_end; } - // Define the range for that node. - __host__ __device__ __forceinline__ void set_range(int begin, int end) { - m_begin = begin; - m_end = end; - } + // Define the range for that node. + __host__ __device__ __forceinline__ void set_range(int begin, int end) + { + m_begin = begin; + m_end = end; + } }; //////////////////////////////////////////////////////////////////////////////// // Algorithm parameters. //////////////////////////////////////////////////////////////////////////////// -struct Parameters { - // Choose the right set of points to use as in/out. - int point_selector; - // The number of nodes at a given level (2^k for level k). - int num_nodes_at_this_level; - // The recursion depth. - int depth; - // The max value for depth. - const int max_depth; - // The minimum number of points in a node to stop recursion. - const int min_points_per_node; +struct Parameters +{ + // Choose the right set of points to use as in/out. + int point_selector; + // The number of nodes at a given level (2^k for level k). + int num_nodes_at_this_level; + // The recursion depth. + int depth; + // The max value for depth. + const int max_depth; + // The minimum number of points in a node to stop recursion. + const int min_points_per_node; - // Constructor set to default values. 
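// The two constructors documented below drive the level-by-level recursion:
// every child launch passes Parameters(params, true), which flips the
// ping-pong buffer selector, quadruples the node count and deepens by one.
// Progression sketch (hypothetical example values):
static void parameters_progression_sketch()
{
    Parameters p0(8, 16);    // depth 0: 1 node,   selector 0
    Parameters p1(p0, true); // depth 1: 4 nodes,  selector 1
    Parameters p2(p1, true); // depth 2: 16 nodes, selector 0 again
    (void)p2;                // at depth k: 4^k nodes, selector k % 2
}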
- __host__ __device__ Parameters(int max_depth, int min_points_per_node) - : point_selector(0), - num_nodes_at_this_level(1), - depth(0), - max_depth(max_depth), - min_points_per_node(min_points_per_node) {} + // Constructor set to default values. + __host__ __device__ Parameters(int max_depth, int min_points_per_node) + : point_selector(0) + , num_nodes_at_this_level(1) + , depth(0) + , max_depth(max_depth) + , min_points_per_node(min_points_per_node) + { + } - // Copy constructor. Changes the values for next iteration. - __host__ __device__ Parameters(const Parameters &params, bool) - : point_selector((params.point_selector + 1) % 2), - num_nodes_at_this_level(4 * params.num_nodes_at_this_level), - depth(params.depth + 1), - max_depth(params.max_depth), - min_points_per_node(params.min_points_per_node) {} + // Copy constructor. Changes the values for next iteration. + __host__ __device__ Parameters(const Parameters &params, bool) + : point_selector((params.point_selector + 1) % 2) + , num_nodes_at_this_level(4 * params.num_nodes_at_this_level) + , depth(params.depth + 1) + , max_depth(params.max_depth) + , min_points_per_node(params.min_points_per_node) + { + } }; //////////////////////////////////////////////////////////////////////////////// @@ -247,493 +259,474 @@ struct Parameters { // will apply the same algorithm. //////////////////////////////////////////////////////////////////////////////// template <int NUM_THREADS_PER_BLOCK> -__global__ void build_quadtree_kernel(Quadtree_node *nodes, Points *points, - Parameters params) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // The number of warps in a block. - const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warpSize; +__global__ void build_quadtree_kernel(Quadtree_node *nodes, Points *points, Parameters params) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // The number of warps in a block. + const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warpSize; - // Shared memory to store the number of points. - extern __shared__ int smem[]; + // Shared memory to store the number of points. + extern __shared__ int smem[]; - // s_num_pts[4][NUM_WARPS_PER_BLOCK]; - // Addresses of shared memory. - volatile int *s_num_pts[4]; + // s_num_pts[4][NUM_WARPS_PER_BLOCK]; + // Addresses of shared memory. + volatile int *s_num_pts[4]; - for (int i = 0; i < 4; ++i) - s_num_pts[i] = (volatile int *)&smem[i * NUM_WARPS_PER_BLOCK]; + for (int i = 0; i < 4; ++i) + s_num_pts[i] = (volatile int *)&smem[i * NUM_WARPS_PER_BLOCK]; - // Compute the coordinates of the threads in the block. - const int warp_id = threadIdx.x / warpSize; - const int lane_id = threadIdx.x % warpSize; + // Compute the coordinates of the threads in the block. + const int warp_id = threadIdx.x / warpSize; + const int lane_id = threadIdx.x % warpSize; - // Mask for compaction. - // Same as: asm( "mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt) ); - int lane_mask_lt = (1 << lane_id) - 1; + // Mask for compaction. + // Same as: asm( "mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt) ); + int lane_mask_lt = (1 << lane_id) - 1; - // The current node. - Quadtree_node &node = nodes[blockIdx.x]; + // The current node. + Quadtree_node &node = nodes[blockIdx.x]; - // The number of points in the node.
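// The extern shared array above is carved into four per-quadrant counter
// rows, s_num_pts[quadrant][warp], which is why every launch of this kernel
// passes 4 * NUM_WARPS_PER_BLOCK * sizeof(int) as the dynamic shared-memory
// size. Flat indexing sketch (smem_index is a hypothetical helper):
__device__ __forceinline__ int smem_index(int quadrant, int warp, int num_warps)
{
    return quadrant * num_warps + warp; // row-major view of s_num_pts
}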
+ int num_points = node.num_points(); - float2 center; - int range_begin, range_end; - int warp_cnts[4] = {0, 0, 0, 0}; - // - // 1- Check the number of points and its depth. - // + float2 center; + int range_begin, range_end; + int warp_cnts[4] = {0, 0, 0, 0}; + // + // 1- Check the number of points and its depth. + // - // Stop the recursion here. Make sure points[0] contains all the points. - if (params.depth >= params.max_depth || - num_points <= params.min_points_per_node) { - if (params.point_selector == 1) { - int it = node.points_begin(), end = node.points_end(); + // Stop the recursion here. Make sure points[0] contains all the points. + if (params.depth >= params.max_depth || num_points <= params.min_points_per_node) { + if (params.point_selector == 1) { + int it = node.points_begin(), end = node.points_end(); - for (it += threadIdx.x; it < end; it += NUM_THREADS_PER_BLOCK) - if (it < end) points[0].set_point(it, points[1].get_point(it)); + for (it += threadIdx.x; it < end; it += NUM_THREADS_PER_BLOCK) + if (it < end) + points[0].set_point(it, points[1].get_point(it)); + } + + return; } - return; - } + // Compute the center of the bounding box of the points. + const Bounding_box &bbox = node.bounding_box(); - // Compute the center of the bounding box of the points. - const Bounding_box &bbox = node.bounding_box(); + bbox.compute_center(center); - bbox.compute_center(center); + // Find how many points to give to each warp. + int num_points_per_warp = max(warpSize, (num_points + NUM_WARPS_PER_BLOCK - 1) / NUM_WARPS_PER_BLOCK); - // Find how many points to give to each warp. - int num_points_per_warp = max( - warpSize, (num_points + NUM_WARPS_PER_BLOCK - 1) / NUM_WARPS_PER_BLOCK); + // Each warp of threads will compute the number of points to move to each + // quadrant. + range_begin = node.points_begin() + warp_id * num_points_per_warp; + range_end = min(range_begin + num_points_per_warp, node.points_end()); - // Each warp of threads will compute the number of points to move to each - // quadrant. - range_begin = node.points_begin() + warp_id * num_points_per_warp; - range_end = min(range_begin + num_points_per_warp, node.points_end()); + // + // 2- Count the number of points in each child. + // - // - // 2- Count the number of points in each child. - // + // Input points. + const Points &in_points = points[params.point_selector]; - // Input points. - const Points &in_points = points[params.point_selector]; + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + // Compute the number of points. + for (int range_it = range_begin + tile32.thread_rank(); tile32.any(range_it < range_end); range_it += warpSize) { + // Is it still an active thread? + bool is_active = range_it < range_end; - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - // Compute the number of points. - for (int range_it = range_begin + tile32.thread_rank(); - tile32.any(range_it < range_end); range_it += warpSize) { - // Is it still an active thread? - bool is_active = range_it < range_end; + // Load the coordinates of the point. + float2 p = is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f); - // Load the coordinates of the point. - float2 p = - is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f); + // Count top-left points. + int num_pts = __popc(tile32.ballot(is_active && p.x < center.x && p.y >= center.y)); + warp_cnts[0] += tile32.shfl(num_pts, 0); - // Count top-left points. 
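// Each of the four quadrant counts below follows the same three-step warp
// idiom: ballot packs one vote bit per lane, __popc counts the set bits, and
// shfl(..., 0) broadcasts lane 0's result so all lanes accumulate the same
// per-warp total. Condensed sketch (warp_count_sketch is hypothetical):
__device__ int warp_count_sketch(cg::thread_block_tile<32> tile32, bool pred)
{
    int num_pts = __popc(tile32.ballot(pred)); // how many lanes voted true
    return tile32.shfl(num_pts, 0);            // broadcast lane 0's count
}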
- int num_pts = - __popc(tile32.ballot(is_active && p.x < center.x && p.y >= center.y)); - warp_cnts[0] += tile32.shfl(num_pts, 0); + // Count top-right points. + num_pts = __popc(tile32.ballot(is_active && p.x >= center.x && p.y >= center.y)); + warp_cnts[1] += tile32.shfl(num_pts, 0); - // Count top-right points. - num_pts = - __popc(tile32.ballot(is_active && p.x >= center.x && p.y >= center.y)); - warp_cnts[1] += tile32.shfl(num_pts, 0); + // Count bottom-left points. + num_pts = __popc(tile32.ballot(is_active && p.x < center.x && p.y < center.y)); + warp_cnts[2] += tile32.shfl(num_pts, 0); - // Count bottom-left points. - num_pts = - __popc(tile32.ballot(is_active && p.x < center.x && p.y < center.y)); - warp_cnts[2] += tile32.shfl(num_pts, 0); + // Count bottom-right points. + num_pts = __popc(tile32.ballot(is_active && p.x >= center.x && p.y < center.y)); + warp_cnts[3] += tile32.shfl(num_pts, 0); + } - // Count bottom-right points. - num_pts = - __popc(tile32.ballot(is_active && p.x >= center.x && p.y < center.y)); - warp_cnts[3] += tile32.shfl(num_pts, 0); - } + if (tile32.thread_rank() == 0) { + s_num_pts[0][warp_id] = warp_cnts[0]; + s_num_pts[1][warp_id] = warp_cnts[1]; + s_num_pts[2][warp_id] = warp_cnts[2]; + s_num_pts[3][warp_id] = warp_cnts[3]; + } - if (tile32.thread_rank() == 0) { - s_num_pts[0][warp_id] = warp_cnts[0]; - s_num_pts[1][warp_id] = warp_cnts[1]; - s_num_pts[2][warp_id] = warp_cnts[2]; - s_num_pts[3][warp_id] = warp_cnts[3]; - } + // Make sure warps have finished counting. + cg::sync(cta); - // Make sure warps have finished counting. - cg::sync(cta); + // + // 3- Scan the warps' results to know the "global" numbers. + // - // - // 3- Scan the warps' results to know the "global" numbers. - // - - // First 4 warps scan the numbers of points per child (inclusive scan). - if (warp_id < 4) { - int num_pts = tile32.thread_rank() < NUM_WARPS_PER_BLOCK - ? s_num_pts[warp_id][tile32.thread_rank()] - : 0; + // First 4 warps scan the numbers of points per child (inclusive scan). + if (warp_id < 4) { + int num_pts = tile32.thread_rank() < NUM_WARPS_PER_BLOCK ? s_num_pts[warp_id][tile32.thread_rank()] : 0; #pragma unroll - for (int offset = 1; offset < NUM_WARPS_PER_BLOCK; offset *= 2) { - int n = tile32.shfl_up(num_pts, offset); + for (int offset = 1; offset < NUM_WARPS_PER_BLOCK; offset *= 2) { + int n = tile32.shfl_up(num_pts, offset); - if (tile32.thread_rank() >= offset) num_pts += n; + if (tile32.thread_rank() >= offset) + num_pts += n; + } + + if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK) + s_num_pts[warp_id][tile32.thread_rank()] = num_pts; } - if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK) - s_num_pts[warp_id][tile32.thread_rank()] = num_pts; - } + cg::sync(cta); - cg::sync(cta); + // Compute global offsets. + if (warp_id == 0) { + int sum = s_num_pts[0][NUM_WARPS_PER_BLOCK - 1]; - // Compute global offsets. - if (warp_id == 0) { - int sum = s_num_pts[0][NUM_WARPS_PER_BLOCK - 1]; + for (int row = 1; row < 4; ++row) { + int tmp = s_num_pts[row][NUM_WARPS_PER_BLOCK - 1]; + cg::sync(tile32); - for (int row = 1; row < 4; ++row) { - int tmp = s_num_pts[row][NUM_WARPS_PER_BLOCK - 1]; - cg::sync(tile32); + if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK) + s_num_pts[row][tile32.thread_rank()] += sum; - if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK) - s_num_pts[row][tile32.thread_rank()] += sum; - - cg::sync(tile32); - sum += tmp; + cg::sync(tile32); + sum += tmp; + } } - } - cg::sync(cta); + cg::sync(cta); - // Make the scan exclusive. 
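// The scan loop above is the classic shfl_up (Kogge-Stone) inclusive scan:
// after log2(width) doubling steps, each lane holds the sum of all values at
// or below its rank. Stand-alone sketch over a full 32-wide tile
// (inclusive_scan_sketch is hypothetical):
__device__ int inclusive_scan_sketch(cg::thread_block_tile<32> tile32, int v)
{
    for (int offset = 1; offset < 32; offset *= 2) {
        int n = tile32.shfl_up(v, offset); // value from `offset` lanes below
        if (tile32.thread_rank() >= offset)
            v += n; // lanes without a source keep their partial sum
    }
    return v; // lane i now holds v_0 + ... + v_i
}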
- int val = 0; - if (threadIdx.x < 4 * NUM_WARPS_PER_BLOCK) { - val = threadIdx.x == 0 ? 0 : smem[threadIdx.x - 1]; - val += node.points_begin(); - } - - cg::sync(cta); - - if (threadIdx.x < 4 * NUM_WARPS_PER_BLOCK) { - smem[threadIdx.x] = val; - } - - cg::sync(cta); - - // - // 4- Move points. - // - if (!(params.depth >= params.max_depth || - num_points <= params.min_points_per_node)) { - // Output points. - Points &out_points = points[(params.point_selector + 1) % 2]; - - warp_cnts[0] = s_num_pts[0][warp_id]; - warp_cnts[1] = s_num_pts[1][warp_id]; - warp_cnts[2] = s_num_pts[2][warp_id]; - warp_cnts[3] = s_num_pts[3][warp_id]; - - const Points &in_points = points[params.point_selector]; - // Reorder points. - for (int range_it = range_begin + tile32.thread_rank(); - tile32.any(range_it < range_end); range_it += warpSize) { - // Is it still an active thread? - bool is_active = range_it < range_end; - - // Load the coordinates of the point. - float2 p = - is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f); - - // Count top-left points. - bool pred = is_active && p.x < center.x && p.y >= center.y; - int vote = tile32.ballot(pred); - int dest = warp_cnts[0] + __popc(vote & lane_mask_lt); - - if (pred) out_points.set_point(dest, p); - - warp_cnts[0] += tile32.shfl(__popc(vote), 0); - - // Count top-right points. - pred = is_active && p.x >= center.x && p.y >= center.y; - vote = tile32.ballot(pred); - dest = warp_cnts[1] + __popc(vote & lane_mask_lt); - - if (pred) out_points.set_point(dest, p); - - warp_cnts[1] += tile32.shfl(__popc(vote), 0); - - // Count bottom-left points. - pred = is_active && p.x < center.x && p.y < center.y; - vote = tile32.ballot(pred); - dest = warp_cnts[2] + __popc(vote & lane_mask_lt); - - if (pred) out_points.set_point(dest, p); - - warp_cnts[2] += tile32.shfl(__popc(vote), 0); - - // Count bottom-right points. - pred = is_active && p.x >= center.x && p.y < center.y; - vote = tile32.ballot(pred); - dest = warp_cnts[3] + __popc(vote & lane_mask_lt); - - if (pred) out_points.set_point(dest, p); - - warp_cnts[3] += tile32.shfl(__popc(vote), 0); + // Make the scan exclusive. + int val = 0; + if (threadIdx.x < 4 * NUM_WARPS_PER_BLOCK) { + val = threadIdx.x == 0 ? 0 : smem[threadIdx.x - 1]; + val += node.points_begin(); } - } - cg::sync(cta); + cg::sync(cta); - if (tile32.thread_rank() == 0) { - s_num_pts[0][warp_id] = warp_cnts[0]; - s_num_pts[1][warp_id] = warp_cnts[1]; - s_num_pts[2][warp_id] = warp_cnts[2]; - s_num_pts[3][warp_id] = warp_cnts[3]; - } - - cg::sync(cta); - - // - // 5- Launch new blocks. - // - if (!(params.depth >= params.max_depth || - num_points <= params.min_points_per_node)) { - // The last thread launches new blocks. - if (threadIdx.x == NUM_THREADS_PER_BLOCK - 1) { - // The children. - Quadtree_node *children = - &nodes[params.num_nodes_at_this_level - (node.id() & ~3)]; - - // The offsets of the children at their level. - int child_offset = 4 * node.id(); - - // Set IDs. - children[child_offset + 0].set_id(4 * node.id() + 0); - children[child_offset + 1].set_id(4 * node.id() + 1); - children[child_offset + 2].set_id(4 * node.id() + 2); - children[child_offset + 3].set_id(4 * node.id() + 3); - - const Bounding_box &bbox = node.bounding_box(); - // Points of the bounding-box. - const float2 &p_min = bbox.get_min(); - const float2 &p_max = bbox.get_max(); - - // Set the bounding boxes of the children. - children[child_offset + 0].set_bounding_box(p_min.x, center.y, center.x, - p_max.y); // Top-left. 
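// The reordering pass turns a vote into a collision-free destination: masking
// the ballot with lane_mask_lt keeps only lower-ranked lanes, so
// __popc(vote & lane_mask_lt) is this lane's rank among the voters and
// base + rank is a unique output slot. Condensed sketch (hypothetical):
__device__ int compaction_dest_sketch(cg::thread_block_tile<32> tile32, bool pred, int base, int lane_mask_lt)
{
    int vote = tile32.ballot(pred);            // one bit per voting lane
    return base + __popc(vote & lane_mask_lt); // rank of this lane's vote
}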
- children[child_offset + 1].set_bounding_box(center.x, center.y, p_max.x, - p_max.y); // Top-right. - children[child_offset + 2].set_bounding_box(p_min.x, p_min.y, center.x, - center.y); // Bottom-left. - children[child_offset + 3].set_bounding_box(center.x, p_min.y, p_max.x, - center.y); // Bottom-right. - - // Set the ranges of the children. - - children[child_offset + 0].set_range(node.points_begin(), - s_num_pts[0][warp_id]); - children[child_offset + 1].set_range(s_num_pts[0][warp_id], - s_num_pts[1][warp_id]); - children[child_offset + 2].set_range(s_num_pts[1][warp_id], - s_num_pts[2][warp_id]); - children[child_offset + 3].set_range(s_num_pts[2][warp_id], - s_num_pts[3][warp_id]); - - // Launch 4 children. - build_quadtree_kernel<NUM_THREADS_PER_BLOCK><<< - 4, NUM_THREADS_PER_BLOCK, 4 * NUM_WARPS_PER_BLOCK * sizeof(int)>>>( - &children[child_offset], points, Parameters(params, true)); + if (threadIdx.x < 4 * NUM_WARPS_PER_BLOCK) { + smem[threadIdx.x] = val; + } + + cg::sync(cta); + + // + // 4- Move points. + // + if (!(params.depth >= params.max_depth || num_points <= params.min_points_per_node)) { + // Output points. + Points &out_points = points[(params.point_selector + 1) % 2]; + + warp_cnts[0] = s_num_pts[0][warp_id]; + warp_cnts[1] = s_num_pts[1][warp_id]; + warp_cnts[2] = s_num_pts[2][warp_id]; + warp_cnts[3] = s_num_pts[3][warp_id]; + + const Points &in_points = points[params.point_selector]; + // Reorder points. + for (int range_it = range_begin + tile32.thread_rank(); tile32.any(range_it < range_end); + range_it += warpSize) { + // Is it still an active thread? + bool is_active = range_it < range_end; + + // Load the coordinates of the point. + float2 p = is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f); + + // Count top-left points. + bool pred = is_active && p.x < center.x && p.y >= center.y; + int vote = tile32.ballot(pred); + int dest = warp_cnts[0] + __popc(vote & lane_mask_lt); + + if (pred) + out_points.set_point(dest, p); + + warp_cnts[0] += tile32.shfl(__popc(vote), 0); + + // Count top-right points. + pred = is_active && p.x >= center.x && p.y >= center.y; + vote = tile32.ballot(pred); + dest = warp_cnts[1] + __popc(vote & lane_mask_lt); + + if (pred) + out_points.set_point(dest, p); + + warp_cnts[1] += tile32.shfl(__popc(vote), 0); + + // Count bottom-left points. + pred = is_active && p.x < center.x && p.y < center.y; + vote = tile32.ballot(pred); + dest = warp_cnts[2] + __popc(vote & lane_mask_lt); + + if (pred) + out_points.set_point(dest, p); + + warp_cnts[2] += tile32.shfl(__popc(vote), 0); + + // Count bottom-right points. + pred = is_active && p.x >= center.x && p.y < center.y; + vote = tile32.ballot(pred); + dest = warp_cnts[3] + __popc(vote & lane_mask_lt); + + if (pred) + out_points.set_point(dest, p); + + warp_cnts[3] += tile32.shfl(__popc(vote), 0); + } + } + + cg::sync(cta); + + if (tile32.thread_rank() == 0) { + s_num_pts[0][warp_id] = warp_cnts[0]; + s_num_pts[1][warp_id] = warp_cnts[1]; + s_num_pts[2][warp_id] = warp_cnts[2]; + s_num_pts[3][warp_id] = warp_cnts[3]; + } + + cg::sync(cta); + + // + // 5- Launch new blocks. + // + if (!(params.depth >= params.max_depth || num_points <= params.min_points_per_node)) { + // The last thread launches new blocks. + if (threadIdx.x == NUM_THREADS_PER_BLOCK - 1) { + // The children. + Quadtree_node *children = &nodes[params.num_nodes_at_this_level - (node.id() & ~3)]; + + // The offsets of the children at their level. + int child_offset = 4 * node.id(); + + // Set IDs.
+ children[child_offset + 0].set_id(4 * node.id() + 0); + children[child_offset + 1].set_id(4 * node.id() + 1); + children[child_offset + 2].set_id(4 * node.id() + 2); + children[child_offset + 3].set_id(4 * node.id() + 3); + + const Bounding_box &bbox = node.bounding_box(); + // Points of the bounding-box. + const float2 &p_min = bbox.get_min(); + const float2 &p_max = bbox.get_max(); + + // Set the bounding boxes of the children. + children[child_offset + 0].set_bounding_box(p_min.x, center.y, center.x, + p_max.y); // Top-left. + children[child_offset + 1].set_bounding_box(center.x, center.y, p_max.x, + p_max.y); // Top-right. + children[child_offset + 2].set_bounding_box(p_min.x, p_min.y, center.x, + center.y); // Bottom-left. + children[child_offset + 3].set_bounding_box(center.x, p_min.y, p_max.x, + center.y); // Bottom-right. + + // Set the ranges of the children. + + children[child_offset + 0].set_range(node.points_begin(), s_num_pts[0][warp_id]); + children[child_offset + 1].set_range(s_num_pts[0][warp_id], s_num_pts[1][warp_id]); + children[child_offset + 2].set_range(s_num_pts[1][warp_id], s_num_pts[2][warp_id]); + children[child_offset + 3].set_range(s_num_pts[2][warp_id], s_num_pts[3][warp_id]); + + // Launch 4 children. + build_quadtree_kernel<NUM_THREADS_PER_BLOCK> + <<<4, NUM_THREADS_PER_BLOCK, 4 * NUM_WARPS_PER_BLOCK * sizeof(int)>>>( + &children[child_offset], points, Parameters(params, true)); + } + } } //////////////////////////////////////////////////////////////////////////////// // Make sure a Quadtree is properly defined. //////////////////////////////////////////////////////////////////////////////// -bool check_quadtree(const Quadtree_node *nodes, int idx, int num_pts, - Points *pts, Parameters params) { - const Quadtree_node &node = nodes[idx]; - int num_points = node.num_points(); +bool check_quadtree(const Quadtree_node *nodes, int idx, int num_pts, Points *pts, Parameters params) +{ + const Quadtree_node &node = nodes[idx]; + int num_points = node.num_points(); - if (!(params.depth == params.max_depth || - num_points <= params.min_points_per_node)) { - int num_points_in_children = 0; + if (!(params.depth == params.max_depth || num_points <= params.min_points_per_node)) { + int num_points_in_children = 0; - num_points_in_children += - nodes[params.num_nodes_at_this_level + 4 * idx + 0].num_points(); - num_points_in_children += - nodes[params.num_nodes_at_this_level + 4 * idx + 1].num_points(); - num_points_in_children += - nodes[params.num_nodes_at_this_level + 4 * idx + 2].num_points(); - num_points_in_children += - nodes[params.num_nodes_at_this_level + 4 * idx + 3].num_points(); + num_points_in_children += nodes[params.num_nodes_at_this_level + 4 * idx + 0].num_points(); + num_points_in_children += nodes[params.num_nodes_at_this_level + 4 * idx + 1].num_points(); + num_points_in_children += nodes[params.num_nodes_at_this_level + 4 * idx + 2].num_points(); + num_points_in_children += nodes[params.num_nodes_at_this_level + 4 * idx + 3].num_points(); - if (num_points_in_children != node.num_points()) return false; + if (num_points_in_children != node.num_points()) + return false; - return check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 0, - num_pts, pts, Parameters(params, true)) && - check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 1, - num_pts, pts, Parameters(params, true)) && - check_quadtree(&nodes[params.num_nodes_at_this_level], 4 * idx + 2, - num_pts, pts, Parameters(params, true)) && - check_quadtree(&nodes[params.num_nodes_at_this_level], 4
* idx + 3, - num_pts, pts, Parameters(params, true)); - } + return check_quadtree( + &nodes[params.num_nodes_at_this_level], 4 * idx + 0, num_pts, pts, Parameters(params, true)) + && check_quadtree( + &nodes[params.num_nodes_at_this_level], 4 * idx + 1, num_pts, pts, Parameters(params, true)) + && check_quadtree( + &nodes[params.num_nodes_at_this_level], 4 * idx + 2, num_pts, pts, Parameters(params, true)) + && check_quadtree( + &nodes[params.num_nodes_at_this_level], 4 * idx + 3, num_pts, pts, Parameters(params, true)); + } - const Bounding_box &bbox = node.bounding_box(); + const Bounding_box &bbox = node.bounding_box(); - for (int it = node.points_begin(); it < node.points_end(); ++it) { - if (it >= num_pts) return false; + for (int it = node.points_begin(); it < node.points_end(); ++it) { + if (it >= num_pts) + return false; - float2 p = pts->get_point(it); + float2 p = pts->get_point(it); - if (!bbox.contains(p)) return false; - } + if (!bbox.contains(p)) + return false; + } - return true; + return true; } //////////////////////////////////////////////////////////////////////////////// // Parallel random number generator. //////////////////////////////////////////////////////////////////////////////// -struct Random_generator { - int count; +struct Random_generator +{ + int count; - __host__ __device__ Random_generator() : count(0) {} - __host__ __device__ unsigned int hash(unsigned int a) { - a = (a + 0x7ed55d16) + (a << 12); - a = (a ^ 0xc761c23c) ^ (a >> 19); - a = (a + 0x165667b1) + (a << 5); - a = (a + 0xd3a2646c) ^ (a << 9); - a = (a + 0xfd7046c5) + (a << 3); - a = (a ^ 0xb55a4f09) ^ (a >> 16); - return a; - } + __host__ __device__ Random_generator() + : count(0) + { + } + __host__ __device__ unsigned int hash(unsigned int a) + { + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; + } - __host__ __device__ __forceinline__ thrust::tuple<float, float> operator()() { + __host__ __device__ __forceinline__ thrust::tuple<float, float> operator()() + { #ifdef __CUDA_ARCH__ - unsigned seed = hash(blockIdx.x * blockDim.x + threadIdx.x + count); - // thrust::generate may call operator() more than once per thread. - // Hence, increment count by grid size to ensure uniqueness of seed - count += blockDim.x * gridDim.x; + unsigned seed = hash(blockIdx.x * blockDim.x + threadIdx.x + count); + // thrust::generate may call operator() more than once per thread. + // Hence, increment count by grid size to ensure uniqueness of seed + count += blockDim.x * gridDim.x; #else - unsigned seed = hash(0); + unsigned seed = hash(0); #endif - thrust::default_random_engine rng(seed); - thrust::random::uniform_real_distribution<float> distrib; - return thrust::make_tuple(distrib(rng), distrib(rng)); - } + thrust::default_random_engine rng(seed); + thrust::random::uniform_real_distribution<float> distrib; + return thrust::make_tuple(distrib(rng), distrib(rng)); + } }; //////////////////////////////////////////////////////////////////////////////// // Allocate GPU structs, launch kernel and clean up //////////////////////////////////////////////////////////////////////////////// -bool cdpQuadtree(int warp_size) { - // Constants to control the algorithm. +bool cdpQuadtree(int warp_size) +{ + // Constants to control the algorithm.
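// The hash above is a Wang/Jenkins-style integer mixer; its only job is to
// turn consecutive thread indices into well-spread RNG seeds. Host-side
// rehearsal (hash_demo_sketch is hypothetical, inputs arbitrary):
static void hash_demo_sketch()
{
    Random_generator rg;
    unsigned int     s0 = rg.hash(0); // nearby inputs ...
    unsigned int     s1 = rg.hash(1); // ... map to widely separated outputs
    (void)s0;
    (void)s1;
}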
+ const int num_points = 1024; + const int max_depth = 8; + const int min_points_per_node = 16; - // Allocate memory for points. - thrust::device_vector<float> x_d0(num_points); - thrust::device_vector<float> x_d1(num_points); - thrust::device_vector<float> y_d0(num_points); - thrust::device_vector<float> y_d1(num_points); + // Allocate memory for points. + thrust::device_vector<float> x_d0(num_points); + thrust::device_vector<float> x_d1(num_points); + thrust::device_vector<float> y_d0(num_points); + thrust::device_vector<float> y_d1(num_points); - // Generate random points. - Random_generator rnd; - thrust::generate( - thrust::make_zip_iterator(thrust::make_tuple(x_d0.begin(), y_d0.begin())), - thrust::make_zip_iterator(thrust::make_tuple(x_d0.end(), y_d0.end())), - rnd); + // Generate random points. + Random_generator rnd; + thrust::generate(thrust::make_zip_iterator(thrust::make_tuple(x_d0.begin(), y_d0.begin())), + thrust::make_zip_iterator(thrust::make_tuple(x_d0.end(), y_d0.end())), + rnd); - // Host structures to analyze the device ones. - Points points_init[2]; - points_init[0].set(thrust::raw_pointer_cast(&x_d0[0]), - thrust::raw_pointer_cast(&y_d0[0])); - points_init[1].set(thrust::raw_pointer_cast(&x_d1[0]), - thrust::raw_pointer_cast(&y_d1[0])); + // Host structures to analyze the device ones. + Points points_init[2]; + points_init[0].set(thrust::raw_pointer_cast(&x_d0[0]), thrust::raw_pointer_cast(&y_d0[0])); + points_init[1].set(thrust::raw_pointer_cast(&x_d1[0]), thrust::raw_pointer_cast(&y_d1[0])); - // Allocate memory to store points. - Points *points; - checkCudaErrors(cudaMalloc((void **)&points, 2 * sizeof(Points))); - checkCudaErrors(cudaMemcpy(points, points_init, 2 * sizeof(Points), - cudaMemcpyHostToDevice)); + // Allocate memory to store points. + Points *points; + checkCudaErrors(cudaMalloc((void **)&points, 2 * sizeof(Points))); + checkCudaErrors(cudaMemcpy(points, points_init, 2 * sizeof(Points), cudaMemcpyHostToDevice)); - // We could use a close form... - int max_nodes = 0; + // We could use a closed form... + int max_nodes = 0; - for (int i = 0, num_nodes_at_level = 1; i < max_depth; - ++i, num_nodes_at_level *= 4) - max_nodes += num_nodes_at_level; + for (int i = 0, num_nodes_at_level = 1; i < max_depth; ++i, num_nodes_at_level *= 4) + max_nodes += num_nodes_at_level; - // Allocate memory to store the tree. - Quadtree_node root; - root.set_range(0, num_points); - Quadtree_node *nodes; - checkCudaErrors( - cudaMalloc((void **)&nodes, max_nodes * sizeof(Quadtree_node))); - checkCudaErrors( - cudaMemcpy(nodes, &root, sizeof(Quadtree_node), cudaMemcpyHostToDevice)); + // Allocate memory to store the tree. + Quadtree_node root; + root.set_range(0, num_points); + Quadtree_node *nodes; + checkCudaErrors(cudaMalloc((void **)&nodes, max_nodes * sizeof(Quadtree_node))); + checkCudaErrors(cudaMemcpy(nodes, &root, sizeof(Quadtree_node), cudaMemcpyHostToDevice)); - // Build the quadtree. - Parameters params(max_depth, min_points_per_node); - std::cout << "Launching CDP kernel to build the quadtree" << std::endl; - const int NUM_THREADS_PER_BLOCK = 128; // Do not use less than 128 threads. - const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warp_size; - const size_t smem_size = 4 * NUM_WARPS_PER_BLOCK * sizeof(int); - build_quadtree_kernel< - NUM_THREADS_PER_BLOCK><<<1, NUM_THREADS_PER_BLOCK, smem_size>>>( - nodes, points, params); - checkCudaErrors(cudaGetLastError()); + // Build the quadtree.
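// The closed form alluded to above: a full 4-ary tree with max_depth levels
// has 1 + 4 + ... + 4^(max_depth-1) = (4^max_depth - 1) / 3 nodes, i.e.
// 21845 for max_depth = 8. Equivalent to the loop (max_nodes_closed_form is
// a hypothetical helper):
static int max_nodes_closed_form(int max_depth)
{
    return ((1 << (2 * max_depth)) - 1) / 3; // 4^d == 1 << (2 * d)
}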
+ Parameters params(max_depth, min_points_per_node); + std::cout << "Launching CDP kernel to build the quadtree" << std::endl; + const int NUM_THREADS_PER_BLOCK = 128; // Do not use less than 128 threads. + const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warp_size; + const size_t smem_size = 4 * NUM_WARPS_PER_BLOCK * sizeof(int); + build_quadtree_kernel<NUM_THREADS_PER_BLOCK><<<1, NUM_THREADS_PER_BLOCK, smem_size>>>(nodes, points, params); + checkCudaErrors(cudaGetLastError()); - // Copy points to CPU. - thrust::host_vector<float> x_h(x_d0); - thrust::host_vector<float> y_h(y_d0); - Points host_points; - host_points.set(thrust::raw_pointer_cast(&x_h[0]), - thrust::raw_pointer_cast(&y_h[0])); + // Copy points to CPU. + thrust::host_vector<float> x_h(x_d0); + thrust::host_vector<float> y_h(y_d0); + Points host_points; + host_points.set(thrust::raw_pointer_cast(&x_h[0]), thrust::raw_pointer_cast(&y_h[0])); - // Copy nodes to CPU. - Quadtree_node *host_nodes = new Quadtree_node[max_nodes]; - checkCudaErrors(cudaMemcpy(host_nodes, nodes, - max_nodes * sizeof(Quadtree_node), - cudaMemcpyDeviceToHost)); + // Copy nodes to CPU. + Quadtree_node *host_nodes = new Quadtree_node[max_nodes]; + checkCudaErrors(cudaMemcpy(host_nodes, nodes, max_nodes * sizeof(Quadtree_node), cudaMemcpyDeviceToHost)); - // Validate the results. - bool ok = check_quadtree(host_nodes, 0, num_points, &host_points, params); - std::cout << "Results: " << (ok ? "OK" : "FAILED") << std::endl; + // Validate the results. + bool ok = check_quadtree(host_nodes, 0, num_points, &host_points, params); + std::cout << "Results: " << (ok ? "OK" : "FAILED") << std::endl; - // Free CPU memory. - delete[] host_nodes; + // Free CPU memory. + delete[] host_nodes; - // Free memory. - checkCudaErrors(cudaFree(nodes)); - checkCudaErrors(cudaFree(points)); + // Free memory. + checkCudaErrors(cudaFree(nodes)); + checkCudaErrors(cudaFree(points)); - return ok; + return ok; } //////////////////////////////////////////////////////////////////////////////// // Main entry point. //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // Find/set the device. - // The test requires an architecture SM35 or greater (CDP capable). - int cuda_device = findCudaDevice(argc, (const char **)argv); - cudaDeviceProp deviceProps; - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, cuda_device)); - int cdpCapable = (deviceProps.major == 3 && deviceProps.minor >= 5) || - deviceProps.major >= 4; +int main(int argc, char **argv) +{ + // Find/set the device. + // The test requires an architecture SM35 or greater (CDP capable). + int cuda_device = findCudaDevice(argc, (const char **)argv); + cudaDeviceProp deviceProps; + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, cuda_device)); + int cdpCapable = (deviceProps.major == 3 && deviceProps.minor >= 5) || deviceProps.major >= 4; - printf("GPU device %s has compute capabilities (SM %d.%d)\n", - deviceProps.name, deviceProps.major, deviceProps.minor); + printf( + "GPU device %s has compute capabilities (SM %d.%d)\n", deviceProps.name, deviceProps.major, deviceProps.minor); - if (!cdpCapable) { - std::cerr << "cdpQuadTree requires SM 3.5 or higher to use CUDA Dynamic " - "Parallelism. Exiting...\n" - << std::endl; - exit(EXIT_WAIVED); - } + if (!cdpCapable) { + std::cerr << "cdpQuadTree requires SM 3.5 or higher to use CUDA Dynamic " + "Parallelism.
Exiting...\n" + << std::endl; + exit(EXIT_WAIVED); + } - bool ok = cdpQuadtree(deviceProps.warpSize); + bool ok = cdpQuadtree(deviceProps.warpSize); - return (ok ? EXIT_SUCCESS : EXIT_FAILURE); + return (ok ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/3_CUDA_Features/cdpSimplePrint/cdpSimplePrint.cu b/Samples/3_CUDA_Features/cdpSimplePrint/cdpSimplePrint.cu index 61eed09d..b2d34b2f 100644 --- a/Samples/3_CUDA_Features/cdpSimplePrint/cdpSimplePrint.cu +++ b/Samples/3_CUDA_Features/cdpSimplePrint/cdpSimplePrint.cu @@ -25,11 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include -#include - #include #include +#include +#include #include //////////////////////////////////////////////////////////////////////////////// @@ -40,26 +39,26 @@ __device__ int g_uids = 0; //////////////////////////////////////////////////////////////////////////////// // Print a simple message to signal the block which is currently executing. //////////////////////////////////////////////////////////////////////////////// -__device__ void print_info(int depth, int thread, int uid, int parent_uid) { - if (threadIdx.x == 0) { - if (depth == 0) - printf("BLOCK %d launched by the host\n", uid); - else { - char buffer[32]; +__device__ void print_info(int depth, int thread, int uid, int parent_uid) +{ + if (threadIdx.x == 0) { + if (depth == 0) + printf("BLOCK %d launched by the host\n", uid); + else { + char buffer[32]; - for (int i = 0; i < depth; ++i) { - buffer[3 * i + 0] = '|'; - buffer[3 * i + 1] = ' '; - buffer[3 * i + 2] = ' '; - } + for (int i = 0; i < depth; ++i) { + buffer[3 * i + 0] = '|'; + buffer[3 * i + 1] = ' '; + buffer[3 * i + 2] = ' '; + } - buffer[3 * depth] = '\0'; - printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, - thread, parent_uid); + buffer[3 * depth] = '\0'; + printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid); + } } - } - __syncthreads(); + __syncthreads(); } //////////////////////////////////////////////////////////////////////////////// @@ -69,101 +68,94 @@ __device__ void print_info(int depth, int thread, int uid, int parent_uid) { // about that block. Finally, if the 'max_depth' has not been reached, the // block launches new blocks directly from the GPU. //////////////////////////////////////////////////////////////////////////////// -__global__ void cdp_kernel(int max_depth, int depth, int thread, - int parent_uid) { - // We create a unique ID per block. Thread 0 does that and shares the value - // with the other threads. - __shared__ int s_uid; +__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid) +{ + // We create a unique ID per block. Thread 0 does that and shares the value + // with the other threads. + __shared__ int s_uid; - if (threadIdx.x == 0) { - s_uid = atomicAdd(&g_uids, 1); - } + if (threadIdx.x == 0) { + s_uid = atomicAdd(&g_uids, 1); + } - __syncthreads(); + __syncthreads(); - // We print the ID of the block and information about its parent. - print_info(depth, thread, s_uid, parent_uid); + // We print the ID of the block and information about its parent. + print_info(depth, thread, s_uid, parent_uid); - // We launch new blocks if we haven't reached the max_depth yet. - if (++depth >= max_depth) { - return; - } + // We launch new blocks if we haven't reached the max_depth yet. 
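// Sizing note for the indentation buffer above: each level consumes three
// characters ("|  ") plus one terminator, so the 32-char buffer holds at most
// (32 - 1) / 3 = 10 levels; the sample's depth limit of 8 stays safely inside
// that. The same bound as a checked helper (indent_fits is hypothetical):
__host__ __device__ inline bool indent_fits(int depth) { return 3 * depth + 1 <= 32; }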
+ if (++depth >= max_depth) { + return; + } - cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid); + cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid); } //////////////////////////////////////////////////////////////////////////////// // Main entry point. //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("starting Simple Print (CUDA Dynamic Parallelism)\n"); +int main(int argc, char **argv) +{ + printf("starting Simple Print (CUDA Dynamic Parallelism)\n"); - // Parse a few command-line arguments. - int max_depth = 2; + // Parse a few command-line arguments. + int max_depth = 2; - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "h")) { - printf( - "Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 " - "and 8).\n", - argv[0]); - exit(EXIT_SUCCESS); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "depth")) { - max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth"); - - if (max_depth < 1 || max_depth > 8) { - printf("depth parameter has to be between 1 and 8\n"); - exit(EXIT_FAILURE); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h")) { + printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 " + "and 8).\n", + argv[0]); + exit(EXIT_SUCCESS); } - } - // Find/set the device. - int device = -1; - cudaDeviceProp deviceProp; - device = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); + if (checkCmdLineFlag(argc, (const char **)argv, "depth")) { + max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth"); - if (!(deviceProp.major > 3 || - (deviceProp.major == 3 && deviceProp.minor >= 5))) { - printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", - device, deviceProp.name); - exit(EXIT_WAIVED); - } + if (max_depth < 1 || max_depth > 8) { + printf("depth parameter has to be between 1 and 8\n"); + exit(EXIT_FAILURE); + } + } - // Print a message describing what the sample does. - printf( - "*********************************************************************" - "******\n"); - printf( - "The CPU launches 2 blocks of 2 threads each. On the device each thread " - "will\n"); - printf( - "launch 2 blocks of 2 threads each. The GPU we will do that " - "recursively\n"); - printf("until it reaches max_depth=%d\n\n", max_depth); - printf("In total 2"); - int num_blocks = 2, sum = 2; + // Find/set the device. + int device = -1; + cudaDeviceProp deviceProp; + device = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); - for (int i = 1; i < max_depth; ++i) { - num_blocks *= 4; - printf("+%d", num_blocks); - sum += num_blocks; - } + if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5))) { + printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name); + exit(EXIT_WAIVED); + } - printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum - 2); - printf( - "************************************************************************" - "***\n\n"); + // Print a message describing what the sample does. + printf("*********************************************************************" + "******\n"); + printf("The CPU launches 2 blocks of 2 threads each. On the device each thread " + "will\n"); + printf("launch 2 blocks of 2 threads each.
The GPU will do that " "recursively\n"); + printf("until it reaches max_depth=%d\n\n", max_depth); + printf("In total 2"); + int num_blocks = 2, sum = 2; - // Launch the kernel from the CPU. - printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n"); - cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1); - checkCudaErrors(cudaGetLastError()); + for (int i = 1; i < max_depth; ++i) { + num_blocks *= 4; + printf("+%d", num_blocks); + sum += num_blocks; + } - // Finalize. - checkCudaErrors(cudaDeviceSynchronize()); + printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum - 2); + printf("************************************************************************" + "***\n\n"); - exit(EXIT_SUCCESS); + // Launch the kernel from the CPU. + printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n"); + cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1); + checkCudaErrors(cudaGetLastError()); + + // Finalize. + checkCudaErrors(cudaDeviceSynchronize()); + + exit(EXIT_SUCCESS); } diff --git a/Samples/3_CUDA_Features/cdpSimpleQuicksort/cdpSimpleQuicksort.cu b/Samples/3_CUDA_Features/cdpSimpleQuicksort/cdpSimpleQuicksort.cu index 0334c0f4..dc1bc9b7 100644 --- a/Samples/3_CUDA_Features/cdpSimpleQuicksort/cdpSimpleQuicksort.cu +++ b/Samples/3_CUDA_Features/cdpSimpleQuicksort/cdpSimpleQuicksort.cu @@ -25,221 +25,221 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include #include #include +#include -#define MAX_DEPTH 16 +#define MAX_DEPTH 16 #define INSERTION_SORT 32 //////////////////////////////////////////////////////////////////////////////// // Selection sort used when depth gets too big or the number of elements drops // below a threshold. //////////////////////////////////////////////////////////////////////////////// -__device__ void selection_sort(unsigned int *data, int left, int right) { - for (int i = left; i <= right; ++i) { - unsigned min_val = data[i]; - int min_idx = i; +__device__ void selection_sort(unsigned int *data, int left, int right) +{ + for (int i = left; i <= right; ++i) { + unsigned min_val = data[i]; + int min_idx = i; - // Find the smallest value in the range [left, right]. - for (int j = i + 1; j <= right; ++j) { - unsigned val_j = data[j]; + // Find the smallest value in the range [left, right]. + for (int j = i + 1; j <= right; ++j) { + unsigned val_j = data[j]; - if (val_j < min_val) { - min_idx = j; - min_val = val_j; - } + if (val_j < min_val) { + min_idx = j; + min_val = val_j; + } + } + + // Swap the values. + if (i != min_idx) { + data[min_idx] = data[i]; + data[i] = min_val; + } } - - // Swap the values. - if (i != min_idx) { - data[min_idx] = data[i]; - data[i] = min_val; - } - } } //////////////////////////////////////////////////////////////////////////////// // Very basic quicksort algorithm, recursively launching the next level. //////////////////////////////////////////////////////////////////////////////// -__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, - int depth) { - // If we're too deep or there are few elements left, we use an insertion - // sort... - if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT) { - selection_sort(data, left, right); - return; - } - - unsigned int *lptr = data + left; - unsigned int *rptr = data + right; - unsigned int pivot = data[(left + right) / 2]; - - // Do the partitioning.
- while (lptr <= rptr) { - // Find the next left- and right-hand values to swap - unsigned int lval = *lptr; - unsigned int rval = *rptr; - - // Move the left pointer as long as the pointed element is smaller than the - // pivot. - while (lval < pivot) { - lptr++; - lval = *lptr; +__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth) +{ + // If we're too deep or there are few elements left, we use an insertion + // sort... + if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT) { + selection_sort(data, left, right); + return; } - // Move the right pointer as long as the pointed element is larger than the - // pivot. - while (rval > pivot) { - rptr--; - rval = *rptr; + unsigned int *lptr = data + left; + unsigned int *rptr = data + right; + unsigned int pivot = data[(left + right) / 2]; + + // Do the partitioning. + while (lptr <= rptr) { + // Find the next left- and right-hand values to swap + unsigned int lval = *lptr; + unsigned int rval = *rptr; + + // Move the left pointer as long as the pointed element is smaller than the + // pivot. + while (lval < pivot) { + lptr++; + lval = *lptr; + } + + // Move the right pointer as long as the pointed element is larger than the + // pivot. + while (rval > pivot) { + rptr--; + rval = *rptr; + } + + // If the swap points are valid, do the swap! + if (lptr <= rptr) { + *lptr++ = rval; + *rptr-- = lval; + } } - // If the swap points are valid, do the swap! - if (lptr <= rptr) { - *lptr++ = rval; - *rptr-- = lval; + // Now the recursive part + int nright = rptr - data; + int nleft = lptr - data; + + // Launch a new block to sort the left part. + if (left < (rptr - data)) { + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); + cdp_simple_quicksort<<<1, 1, 0, s>>>(data, left, nright, depth + 1); + cudaStreamDestroy(s); } - } - // Now the recursive part - int nright = rptr - data; - int nleft = lptr - data; - - // Launch a new block to sort the left part. - if (left < (rptr - data)) { - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cdp_simple_quicksort<<<1, 1, 0, s>>>(data, left, nright, depth + 1); - cudaStreamDestroy(s); - } - - // Launch a new block to sort the right part. - if ((lptr - data) < right) { - cudaStream_t s1; - cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking); - cdp_simple_quicksort<<<1, 1, 0, s1>>>(data, nleft, right, depth + 1); - cudaStreamDestroy(s1); - } + // Launch a new block to sort the right part. + if ((lptr - data) < right) { + cudaStream_t s1; + cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking); + cdp_simple_quicksort<<<1, 1, 0, s1>>>(data, nleft, right, depth + 1); + cudaStreamDestroy(s1); + } } //////////////////////////////////////////////////////////////////////////////// // Call the quicksort kernel from the host. //////////////////////////////////////////////////////////////////////////////// -void run_qsort(unsigned int *data, unsigned int nitems) { - // Prepare CDP for the max depth 'MAX_DEPTH'. +void run_qsort(unsigned int *data, unsigned int nitems) +{ + // Prepare CDP for the max depth 'MAX_DEPTH'. 
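The "Prepare CDP for the max depth" comment refers to configuring the device runtime before the first recursive launch. On the legacy CDP interface this meant raising the device-side synchronization depth limit, which defaults to 2; a sketch of that host-side call, an assumption about what the elided body of run_qsort() does rather than a copy of it (the limit is deprecated under CDP2 in CUDA 12, where device-side cudaDeviceSynchronize() no longer exists):

    #include <cuda_runtime.h>

    // Hypothetical helper: allow device-side sync up to max_depth levels deep.
    cudaError_t prepare_cdp(int max_depth)
    {
        return cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
    }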
- // Launch on device - int left = 0; - int right = nitems - 1; - std::cout << "Launching kernel on the GPU" << std::endl; - cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0); - checkCudaErrors(cudaDeviceSynchronize()); + // Launch on device + int left = 0; + int right = nitems - 1; + std::cout << "Launching kernel on the GPU" << std::endl; + cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0); + checkCudaErrors(cudaDeviceSynchronize()); } //////////////////////////////////////////////////////////////////////////////// // Initialize data on the host. //////////////////////////////////////////////////////////////////////////////// -void initialize_data(unsigned int *dst, unsigned int nitems) { - // Fixed seed for illustration - srand(2047); +void initialize_data(unsigned int *dst, unsigned int nitems) +{ + // Fixed seed for illustration + srand(2047); - // Fill dst with random values - for (unsigned i = 0; i < nitems; i++) dst[i] = rand() % nitems; + // Fill dst with random values + for (unsigned i = 0; i < nitems; i++) + dst[i] = rand() % nitems; } //////////////////////////////////////////////////////////////////////////////// // Verify the results. //////////////////////////////////////////////////////////////////////////////// -void check_results(int n, unsigned int *results_d) { - unsigned int *results_h = new unsigned[n]; - checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), - cudaMemcpyDeviceToHost)); +void check_results(int n, unsigned int *results_d) +{ + unsigned int *results_h = new unsigned[n]; + checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost)); - for (int i = 1; i < n; ++i) - if (results_h[i - 1] > results_h[i]) { - std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] - << " greater than " << results_h[i] << std::endl; - exit(EXIT_FAILURE); - } + for (int i = 1; i < n; ++i) + if (results_h[i - 1] > results_h[i]) { + std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] << " greater than " << results_h[i] + << std::endl; + exit(EXIT_FAILURE); + } - std::cout << "OK" << std::endl; - delete[] results_h; + std::cout << "OK" << std::endl; + delete[] results_h; } //////////////////////////////////////////////////////////////////////////////// // Main entry point. 
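check_results above walks adjacent pairs by hand; an equivalent host-side check can lean on the standard library. A sketch (not the sample's code) that copies the device buffer back and asks std::is_sorted:

    #include <algorithm>
    #include <cuda_runtime.h>
    #include <vector>

    // Returns true when the device buffer is non-decreasing.
    bool is_sorted_on_host(const unsigned int *d_data, int n)
    {
        std::vector<unsigned int> h(n);
        if (cudaMemcpy(h.data(), d_data, n * sizeof(unsigned int), cudaMemcpyDeviceToHost) != cudaSuccess)
            return false;
        return std::is_sorted(h.begin(), h.end());
    }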
//////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - int num_items = 128; - bool verbose = false; +int main(int argc, char **argv) +{ + int num_items = 128; + bool verbose = false; - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "h")) { - std::cerr << "Usage: " << argv[0] - << " num_items=\twhere num_items is the number of " - "items to sort" - << std::endl; - exit(EXIT_SUCCESS); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "v")) { - verbose = true; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "num_items")) { - num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items"); - - if (num_items < 1) { - std::cerr << "ERROR: num_items has to be greater than 1" << std::endl; - exit(EXIT_FAILURE); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h")) { + std::cerr << "Usage: " << argv[0] + << " num_items=\twhere num_items is the number of " + "items to sort" + << std::endl; + exit(EXIT_SUCCESS); } - } - // Find/set device and get device properties - int device = -1; - cudaDeviceProp deviceProp; - device = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); + if (checkCmdLineFlag(argc, (const char **)argv, "v")) { + verbose = true; + } - if (!(deviceProp.major > 3 || - (deviceProp.major == 3 && deviceProp.minor >= 5))) { - printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", - device, deviceProp.name); - exit(EXIT_WAIVED); - } + if (checkCmdLineFlag(argc, (const char **)argv, "num_items")) { + num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items"); - // Create input data - unsigned int *h_data = 0; - unsigned int *d_data = 0; + if (num_items < 1) { + std::cerr << "ERROR: num_items has to be greater than 1" << std::endl; + exit(EXIT_FAILURE); + } + } - // Allocate CPU memory and initialize data. - std::cout << "Initializing data:" << std::endl; - h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int)); - initialize_data(h_data, num_items); + // Find/set device and get device properties + int device = -1; + cudaDeviceProp deviceProp; + device = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); - if (verbose) { - for (int i = 0; i < num_items; i++) - std::cout << "Data [" << i << "]: " << h_data[i] << std::endl; - } + if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5))) { + printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name); + exit(EXIT_WAIVED); + } - // Allocate GPU memory. - checkCudaErrors( - cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int))); - checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), - cudaMemcpyHostToDevice)); + // Create input data + unsigned int *h_data = 0; + unsigned int *d_data = 0; - // Execute - std::cout << "Running quicksort on " << num_items << " elements" << std::endl; - run_qsort(d_data, num_items); + // Allocate CPU memory and initialize data. 
+ std::cout << "Initializing data:" << std::endl; + h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int)); + initialize_data(h_data, num_items); - // Check result - std::cout << "Validating results: "; - check_results(num_items, d_data); + if (verbose) { + for (int i = 0; i < num_items; i++) + std::cout << "Data [" << i << "]: " << h_data[i] << std::endl; + } - free(h_data); - checkCudaErrors(cudaFree(d_data)); + // Allocate GPU memory. + checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int))); + checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice)); - exit(EXIT_SUCCESS); + // Execute + std::cout << "Running quicksort on " << num_items << " elements" << std::endl; + run_qsort(d_data, num_items); + + // Check result + std::cout << "Validating results: "; + check_results(num_items, d_data); + + free(h_data); + checkCudaErrors(cudaFree(d_data)); + + exit(EXIT_SUCCESS); } diff --git a/Samples/3_CUDA_Features/cudaCompressibleMemory/compMalloc.cpp b/Samples/3_CUDA_Features/cudaCompressibleMemory/compMalloc.cpp index 7554a1af..ac81f77b 100644 --- a/Samples/3_CUDA_Features/cudaCompressibleMemory/compMalloc.cpp +++ b/Samples/3_CUDA_Features/cudaCompressibleMemory/compMalloc.cpp @@ -25,11 +25,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include -#include -#include #include #include +#include +#include +#include cudaError_t setProp(CUmemAllocationProp *prop, bool UseCompressibleMemory) { @@ -38,9 +38,9 @@ cudaError_t setProp(CUmemAllocationProp *prop, bool UseCompressibleMemory) return cudaErrorMemoryAllocation; memset(prop, 0, sizeof(CUmemAllocationProp)); - prop->type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop->type = CU_MEM_ALLOCATION_TYPE_PINNED; prop->location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop->location.id = currentDevice; + prop->location.id = currentDevice; if (UseCompressibleMemory) prop->allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_GENERIC; @@ -51,13 +51,12 @@ cudaError_t setProp(CUmemAllocationProp *prop, bool UseCompressibleMemory) cudaError_t allocateCompressible(void **adr, size_t size, bool UseCompressibleMemory) { CUmemAllocationProp prop = {}; - cudaError_t err = setProp(&prop, UseCompressibleMemory); + cudaError_t err = setProp(&prop, UseCompressibleMemory); if (err != cudaSuccess) return err; size_t granularity = 0; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) + if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) return cudaErrorMemoryAllocation; size = ((size - 1) / granularity + 1) * granularity; CUdeviceptr dptr; @@ -85,9 +84,9 @@ cudaError_t allocateCompressible(void **adr, size_t size, bool UseCompressibleMe return cudaErrorMemoryAllocation; CUmemAccessDesc accessDescriptor; - accessDescriptor.location.id = prop.location.id; + accessDescriptor.location.id = prop.location.id; accessDescriptor.location.type = prop.location.type; - accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(dptr, size, &accessDescriptor, 1) != CUDA_SUCCESS) return cudaErrorMemoryAllocation; @@ -99,20 +98,18 @@ cudaError_t allocateCompressible(void **adr, size_t size, bool UseCompressibleMe cudaError_t freeCompressible(void *ptr, size_t size, bool UseCompressibleMemory) { CUmemAllocationProp prop = {}; - cudaError_t err = setProp(&prop, UseCompressibleMemory); + 
cudaError_t err = setProp(&prop, UseCompressibleMemory); if (err != cudaSuccess) return err; size_t granularity = 0; - if (cuMemGetAllocationGranularity(&granularity, &prop, - CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) + if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) return cudaErrorMemoryAllocation; size = ((size - 1) / granularity + 1) * granularity; if (ptr == NULL) return cudaSuccess; - if (cuMemUnmap((CUdeviceptr)ptr, size) != CUDA_SUCCESS || - cuMemAddressFree((CUdeviceptr)ptr, size) != CUDA_SUCCESS) + if (cuMemUnmap((CUdeviceptr)ptr, size) != CUDA_SUCCESS || cuMemAddressFree((CUdeviceptr)ptr, size) != CUDA_SUCCESS) return cudaErrorInvalidValue; return cudaSuccess; } diff --git a/Samples/3_CUDA_Features/cudaCompressibleMemory/saxpy.cu b/Samples/3_CUDA_Features/cudaCompressibleMemory/saxpy.cu index 11fbbb5e..0113e913 100644 --- a/Samples/3_CUDA_Features/cudaCompressibleMemory/saxpy.cu +++ b/Samples/3_CUDA_Features/cudaCompressibleMemory/saxpy.cu @@ -27,53 +27,54 @@ // // This sample uses the compressible memory allocation if device supports it -// and performs saxpy on it. -// Compressible memory may give better performance if the data is amenable to +// and performs saxpy on it. +// Compressible memory may give better performance if the data is amenable to // compression. -#include #include +#include #define CUDA_DRIVER_API -#include "helper_cuda.h" #include "compMalloc.h" +#include "helper_cuda.h" __global__ void saxpy(const float a, const float4 *x, const float4 *y, float4 *z, const size_t n) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) - { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { const float4 x4 = x[i]; const float4 y4 = y[i]; - z[i] = make_float4(a * x4.x + y4.x, a * x4.y + y4.y, - a * x4.z + y4.z, a * x4.w + y4.w); + z[i] = make_float4(a * x4.x + y4.x, a * x4.y + y4.y, a * x4.z + y4.z, a * x4.w + y4.w); } } __global__ void init(float4 *x, float4 *y, const float val, const size_t n) { const float4 val4 = make_float4(val, val, val, val); - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) - { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) { x[i] = y[i] = val4; } } -void launchSaxpy(const float a, float4 *x, float4 *y, float4 *z, const size_t n, const float init_val, const bool compressibleZbuf) +void launchSaxpy(const float a, + float4 *x, + float4 *y, + float4 *z, + const size_t n, + const float init_val, + const bool compressibleZbuf) { cudaEvent_t start, stop; - float ms; - int blockSize; - int minGridSize; - dim3 threads, blocks; + float ms; + int blockSize; + int minGridSize; + dim3 threads, blocks; - if (!compressibleZbuf) - { + if (!compressibleZbuf) { // We are on config where compressible buffer can only be initialized through cudaMemcpy // hence, x & y buffers are allocated as compressible and initialized via cudaMemcpy // whereas z buffer is allocated as non-compressible. 
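allocateCompressible() in compMalloc.cpp above is built on the driver's virtual memory management API rather than cudaMalloc: reserve a virtual range, create a physical allocation whose CUmemAllocationProp may request generic compression, map the two together, then grant access. A condensed sketch of that sequence, with error handling elided and the prop/access arguments assumed to be filled in the way setProp() does:

    #include <cuda.h>

    // Sketch only; the real allocateCompressible() checks every return value.
    CUdeviceptr vmm_alloc(size_t size, const CUmemAllocationProp *prop, const CUmemAccessDesc *access)
    {
        size_t granularity = 0;
        cuMemGetAllocationGranularity(&granularity, prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        size = ((size - 1) / granularity + 1) * granularity; // round up to granularity

        CUdeviceptr p = 0;
        cuMemAddressReserve(&p, size, 0, 0, 0); // reserve a virtual address range

        CUmemGenericAllocationHandle h;
        cuMemCreate(&h, size, prop, 0);     // physical backing, compressible if flagged
        cuMemMap(p, size, 0, h, 0);         // map the range onto the allocation
        cuMemRelease(h);                    // the mapping keeps the allocation alive
        cuMemSetAccess(p, size, access, 1); // enable read/write from the device
        return p;
    }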
- float4 *h_x = (float4 *) malloc(sizeof(float4) * n); - float4 *h_y = (float4 *) malloc(sizeof(float4) * n); - for (int i = 0; i < n; i++) - { + float4 *h_x = (float4 *)malloc(sizeof(float4) * n); + float4 *h_y = (float4 *)malloc(sizeof(float4) * n); + for (int i = 0; i < n; i++) { h_x[i].x = h_x[i].y = h_x[i].z = h_x[i].w = init_val; h_y[i].x = h_y[i].y = h_y[i].z = h_y[i].w = init_val; } @@ -82,15 +83,14 @@ void launchSaxpy(const float a, float4 *x, float4 *y, float4 *z, const size_t n, free(h_x); free(h_y); } - else - { - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)init)); + else { + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)init)); threads = dim3(blockSize, 1, 1); blocks = dim3(minGridSize, 1, 1); init<<>>(x, y, init_val, n); } - checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)saxpy)); + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)saxpy)); threads = dim3(blockSize, 1, 1); blocks = dim3(minGridSize, 1, 1); @@ -103,39 +103,39 @@ void launchSaxpy(const float a, float4 *x, float4 *y, float4 *z, const size_t n, checkCudaErrors(cudaEventElapsedTime(&ms, start, stop)); const size_t size = n * sizeof(float4); - printf("Running saxpy with %d blocks x %d threads = %.3f ms %.3f TB/s\n", blocks.x, threads.x, ms, (size*3)/ms/1e9); + printf("Running saxpy with %d blocks x %d threads = %.3f ms %.3f TB/s\n", + blocks.x, + threads.x, + ms, + (size * 3) / ms / 1e9); } int main(int argc, char **argv) { const size_t n = 10485760; - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) { + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { printf("Usage -device=n (n >= 0 for deviceID)\n"); exit(EXIT_SUCCESS); } - findCudaDevice(argc, (const char**)argv); + findCudaDevice(argc, (const char **)argv); CUdevice currentDevice; checkCudaErrors(cuCtxGetDevice(¤tDevice)); // Check that the selected device supports virtual memory management int vmm_supported = -1; - checkCudaErrors(cuDeviceGetAttribute(&vmm_supported, - CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - currentDevice)); + checkCudaErrors( + cuDeviceGetAttribute(&vmm_supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, currentDevice)); if (vmm_supported == 0) { printf("Device %d doesn't support Virtual Memory Management, waiving the execution.\n", currentDevice); exit(EXIT_WAIVED); } int isCompressionAvailable; - checkCudaErrors(cuDeviceGetAttribute(&isCompressionAvailable, - CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED, - currentDevice)); - if (isCompressionAvailable == 0) - { + checkCudaErrors(cuDeviceGetAttribute( + &isCompressionAvailable, CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED, currentDevice)); + if (isCompressionAvailable == 0) { printf("Device %d doesn't support Generic memory compression, waiving the execution.\n", currentDevice); exit(EXIT_WAIVED); } @@ -143,39 +143,33 @@ int main(int argc, char **argv) printf("Generic memory compression support is available\n"); int major, minor; - checkCudaErrors(cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - currentDevice)); - checkCudaErrors(cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - currentDevice)); - float4 *x, *y, *z; + checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, 
currentDevice)); + checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, currentDevice)); + float4 *x, *y, *z; const size_t size = n * sizeof(float4); // Allocating compressible memory checkCudaErrors(allocateCompressible((void **)&x, size, true)); checkCudaErrors(allocateCompressible((void **)&y, size, true)); bool compressibleZbuf = 0; - if ((major == 8 && minor == 0) || (major == 8 && minor == 6)) - { - // On SM 8.0 and 8.6 GPUs compressible buffer can only be initialized + if ((major == 8 && minor == 0) || (major == 8 && minor == 6)) { + // On SM 8.0 and 8.6 GPUs compressible buffer can only be initialized // through cudaMemcpy. printf("allocating non-compressible Z buffer\n"); checkCudaErrors(allocateCompressible((void **)&z, size, false)); compressibleZbuf = 0; } - else - { + else { checkCudaErrors(allocateCompressible((void **)&z, size, true)); compressibleZbuf = 1; } printf("Running saxpy on %zu bytes of Compressible memory\n", size); - const float a = 1.0f; + const float a = 1.0f; const float init_val = 1.0f; launchSaxpy(a, x, y, z, n, init_val, compressibleZbuf); - + checkCudaErrors(freeCompressible(x, size, true)); checkCudaErrors(freeCompressible(y, size, true)); checkCudaErrors(freeCompressible(z, size, true)); @@ -193,6 +187,6 @@ int main(int argc, char **argv) checkCudaErrors(freeCompressible(z, size, false)); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); + "Results may vary when GPU Boost is enabled.\n"); return EXIT_SUCCESS; -} \ No newline at end of file +} diff --git a/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu b/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu index 9ced5c28..d6431f01 100644 --- a/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu +++ b/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu @@ -112,7 +112,7 @@ // Implementation constants. -#define WARPS_PER_BLOCK 8 +#define WARPS_PER_BLOCK 8 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) #if SHARED_MEMORY_LIMIT_64K @@ -129,10 +129,10 @@ #define CHUNK_K 8 #endif -#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half)) -#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) #define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) -#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -162,237 +162,220 @@ // we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. 
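The copy-geometry macros above have concrete values in the default half-precision configuration, and the 256-bit row alignment the comment demands can be checked mechanically. A sketch of the arithmetic, assuming this file's defines (WARP_SIZE = 32, K = 16, CHUNK_K = 8, SKEW_HALF = 16):

    // CHUNK_LINE_BYTES          = 8 * 16 * sizeof(half) = 256 bytes per line
    // WARP_COPY_BYTES           = 32 * sizeof(int4)     = 512 bytes per warp pass
    // CHUNK_COPY_LINES_PER_WARP = 512 / 256 = 2 lines per pass
    // CHUNK_COPY_LINE_LANES     = 32 / 2    = 16 lanes cooperating per line
    static_assert(WARP_COPY_BYTES % CHUNK_LINE_BYTES == 0, "a warp pass must cover whole lines");

    // Each shared-memory row holds CHUNK_K * K + SKEW_HALF halves:
    // (8 * 16 + 16) * 2 = 288 bytes, a multiple of 32 bytes (256 bits).
    static_assert((CHUNK_K * K + SKEW_HALF) * sizeof(half) % 32 == 0,
                  "rows must stay 256-bit aligned for load_matrix_sync");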
#define SKEW_HALF 16 -#define checkKernelErrors(expr) \ - do { \ - expr; \ - \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, \ - cudaGetErrorString(__err)); \ - abort(); \ - } \ - } while (0) +#define checkKernelErrors(expr) \ + do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) using namespace nvcuda; -__host__ void init_host_matrices(half *a, half *b, float *c) { - for (int i = 0; i < M_GLOBAL; i++) { - for (int j = 0; j < K_GLOBAL; j++) { - a[i * K_GLOBAL + j] = (half)(rand() % 3); +__host__ void init_host_matrices(half *a, half *b, float *c) +{ + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i * K_GLOBAL + j] = (half)(rand() % 3); + } } - } - for (int i = 0; i < N_GLOBAL; i++) { - for (int j = 0; j < K_GLOBAL; j++) { - b[i * K_GLOBAL + j] = (half)(rand() % 3); + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i * K_GLOBAL + j] = (half)(rand() % 3); + } } - } - for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { - c[t] = static_cast(rand() % 3); - } + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = static_cast(rand() % 3); + } } -__global__ void compute_gemm(const half *A, const half *B, const float *C, - float *D, float alpha, float beta) { - extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF]; +__global__ void compute_gemm(const half *A, const half *B, const float *C, float *D, float alpha, float beta) +{ + extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF]; - // Warp and lane identification. - const unsigned int warpId = threadIdx.x / WARP_SIZE; - const unsigned int laneId = threadIdx.x % WARP_SIZE; + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; - // Offset in shared memory from which the B matrix is stored. - const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; - // This pointer is used to access the C and D matrix tiles this warp computes. - float *shmem_warp_tile_ptr = (float *)&shmem[0][0] + - (warpId / 2) * SHMEM_STRIDE * K * 2 + - (warpId % 2) * SHMEM_OFFSET; + // This pointer is used to access the C and D matrix tiles this warp computes. + float *shmem_warp_tile_ptr = + (float *)&shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 + (warpId % 2) * SHMEM_OFFSET; - // This pointer is used to stream the C and D matrices block-wide tile to and - // from shared memory. - float *shmem_warp_stream_ptr = - (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * K; + // This pointer is used to stream the C and D matrices block-wide tile to and + // from shared memory. + float *shmem_warp_stream_ptr = (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * K; - // Adjust the beta scaler, as it'll be multiplied by alpha at the end of - // each tile computation. Technically this is not generally correct (may - // result in a loss of precision). Zero still needs to be specially handled - // though. - beta /= alpha; + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may + // result in a loss of precision). Zero still needs to be specially handled + // though. 
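The beta /= alpha adjustment that follows works because the epilogue factors the scale: alpha*(A*B) + beta*C == alpha*(A*B + (beta/alpha)*C), so C is pre-scaled once per tile and a single multiply by alpha finishes the job. A scalar illustration (hypothetical helper; alpha must be nonzero):

    // Same value both ways, up to rounding; the division by alpha is the
    // precision caveat the comment above warns about.
    float fused_scale(float ab, float c, float alpha, float beta)
    {
        float beta_over_alpha = beta / alpha; // hoisted out of the tile loop
        return alpha * (ab + beta_over_alpha * c);
    }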
+ beta /= alpha; - // Each CTA slides along the 128 x 128 tiles from the top left corner of the - // matrix to the right and down, and selects the next tile to compute. Once - // there's no such tile, all warps in this CTA exit. - for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { - const unsigned int block_tile_i = - ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); - const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + // Each CTA slides along the 128 x 128 tiles from the top left corner of the + // matrix to the right and down, and selects the next tile to compute. Once + // there's no such tile, all warps in this CTA exit. + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; - // Stop when there are no more D matrix tiles to compute in this CTA. - if (block_tile_i >= M_TILES) { - break; - } - - // This warp's pointer to the C matrix data to copy memory from to shared - // memory. - const size_t gmem_idx = - (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; - const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; - - // Stream multiple C tiles to shared memory. -#pragma unroll - for (int i = 0; i < K; i++) { - typedef int4 copy_t; - - *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = - *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + - laneId); - } - - __syncthreads(); - - // These fragments will accumulate the result of A and B matrix fragment - // multiplications along the K_GLOBAL dimension. - wmma::fragment c[WARP_COL_TILES] - [WARP_ROW_TILES]; - - // Load the C matrix tiles into fragments from shared memory. -#pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { -#pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { - const float *tile_ptr = - shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; - - wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); - } - } - - __syncthreads(); - - // Scale the C matrix. -#pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { -#pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { -#pragma unroll - for (int t = 0; t < c[i][j].num_elements; t++) { - c[i][j].x[t] *= beta; + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; } - } - } - // Select what warp copies what matrix to shared memory. - // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const half *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + - M * K_GLOBAL * (warpId % 4) * 2) - : (&B[block_tile_j * N * K_GLOBAL] + - N * K_GLOBAL * (warpId % 4) * 2); + // This warp's pointer to the C matrix data to copy memory from to shared + // memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; - // Go through the global K dimension by a fixed step at a time. + // Stream multiple C tiles to shared memory. #pragma unroll - for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { - // Copy slices of the A and B matrices to shared memory. - // The first half of the warps in the CTA copy the A matrix, the rest copy - // the B matrix. - size_t shmem_idx = - warpId < (WARPS_PER_BLOCK / 2) - ? 
(M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) - : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); + for (int i = 0; i < K; i++) { + typedef int4 copy_t; - // First half of the warp copies the first row / column of the matrix, - // the second half of the warp copies the next. - int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K + - (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + - (laneId % CHUNK_COPY_LINE_LANES); + *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + } - // Shift the second half of the warp to the next row / column in the - // shared memory. - shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + __syncthreads(); -#pragma unroll - for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; - i++) { - // Copy 16 bytes at once in each lane. - *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = - *lane_ptr; - - // Advance the global memory pointer and the shared memory index. - lane_ptr = - (int4 *)((half *)lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); - shmem_idx += CHUNK_COPY_LINES_PER_WARP; - } - - __syncthreads(); - - // Compute a grid of C matrix tiles in each warp. -#pragma unroll - for (int k_step = 0; k_step < CHUNK_K; k_step++) { - wmma::fragment - a[WARP_COL_TILES]; - wmma::fragment - b[WARP_ROW_TILES]; + // These fragments will accumulate the result of A and B matrix fragment + // multiplications along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + // Load the C matrix tiles into fragments from shared memory. #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); - const half *tile_ptr = &shmem[shmem_idx_a][k_step * K]; +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; - wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_HALF); + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const half *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % 4) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % 4) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy + // the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) + ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + + (laneId % CHUNK_COPY_LINE_LANES); + + // Shift the second half of the warp to the next row / column in the + // shared memory. 
+ shmem_idx += laneId / CHUNK_COPY_LINE_LANES; #pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { - if (i == 0) { - // Load the B matrix fragment once, because it is going to be - // reused against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + - (WARP_ROW_TILES * N) * (warpId % 2) + - (j * N); - const half *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. + *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *lane_ptr; - wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_HALF); + // Advance the global memory pointer and the shared memory index. + lane_ptr = (int4 *)((half *)lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); + shmem_idx += CHUNK_COPY_LINES_PER_WARP; } - wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); - } + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const half *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_HALF); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be + // reused against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const half *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_HALF); + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); } - } - __syncthreads(); + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL + // threads in the warp are well-defined even though element indices + // within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global + // memory. + float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < K; i++) { + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); } - - // Store the D fragments to shared memory. -#pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { -#pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { -#pragma unroll - // Uniform, point-wise transformations of ALL fragment elements by ALL - // threads in the warp are well-defined even though element indices - // within fragment storage are not defined. 
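The simple_wmma_gemm kernel further down drives one tensor-core tile per warp using this same fragment vocabulary. A minimal self-contained sketch of that warp-level pattern, assuming sm_70+ and the 16x16x16 half shape (the leading dimensions here are placeholders):

    #include <mma.h>
    using namespace nvcuda;

    // One warp computes one 16x16 tile of D = A * B (half in, float out).
    __global__ void wmma_tile(const half *a, const half *b, float *d)
    {
        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, 16, 16, 16, float>              acc;

        wmma::fill_fragment(acc, 0.0f);
        wmma::load_matrix_sync(a_frag, a, 16); // 16 = leading dimension
        wmma::load_matrix_sync(b_frag, b, 16);
        wmma::mma_sync(acc, a_frag, b_frag, acc);
        wmma::store_matrix_sync(d, acc, 16, wmma::mem_row_major);
    }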
- for (int t = 0; t < c[i][j].num_elements; t++) c[i][j].x[t] *= alpha; - - float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; - - wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); - } - } - - __syncthreads(); - - // Now that shared memory contains all the D tiles, stream them to global - // memory. - float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; - -#pragma unroll - for (int i = 0; i < K; i++) { - *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); - } - - __syncthreads(); - } } // Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming: @@ -403,246 +386,232 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, // designed for // demonstration purposes only to show the CUDA WMMA API use without // relying on availability of the shared memory. -__global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, - int n_ld, int k_ld, float alpha, float beta) { - // Leading dimensions. Packed with no transpositions. - int lda = k_ld; - int ldb = k_ld; - int ldc = n_ld; +__global__ void +simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) +{ + // Leading dimensions. Packed with no transpositions. + int lda = k_ld; + int ldb = k_ld; + int ldc = n_ld; - // Tile using a 2D grid - int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; - int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); - // Declare the fragments - wmma::fragment - a_frag; - wmma::fragment - b_frag; - wmma::fragment acc_frag; - wmma::fragment c_frag; + // Declare the fragments + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; - wmma::fill_fragment(acc_frag, 0.0f); + wmma::fill_fragment(acc_frag, 0.0f); - // Loop over k - for (int i = 0; i < k_ld; i += WMMA_K) { - int aCol = i; - int aRow = warpM * WMMA_M; - int bCol = warpN * N; - int bRow = i; + // Loop over k + for (int i = 0; i < k_ld; i += WMMA_K) { + int aCol = i; + int aRow = warpM * WMMA_M; + int bCol = warpN * N; + int bRow = i; - // Bounds checking - if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { - // Load the inputs - wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); - wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); - // Perform the matrix multiplication - wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); - } - } - - // Load in the current value of c, scale it by beta, and add this our result - // scaled by alpha - int cCol = warpN * WMMA_N; - int cRow = warpM * WMMA_M; - - if (cRow < m_ld && cCol < n_ld) { - wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, - wmma::mem_row_major); - - for (int i = 0; i < c_frag.num_elements; i++) { - c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } } - // Store the output - wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, - wmma::mem_row_major); - } -} + // Load in the current value of c, scale it by beta, and add this our 
result + // scaled by alpha + int cCol = warpN * WMMA_N; + int cRow = warpM * WMMA_M; -__host__ void matMultiplyOnHost(half *A, half *B, float *C, float alpha, - float beta, int numARows, int numAColumns, - int numBRows, int numBColumns, int numCRows, - int numCColumns) { - for (int i = 0; i < numCRows; i++) { - for (int j = 0; j < numCColumns; j++) { - float temp = 0.0; + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); - for (int k = 0; k < numAColumns; k++) { - temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k]; - } + for (int i = 0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } - C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); } - } } -int main(int argc, char **argv) { - printf("Initializing...\n"); +__host__ void matMultiplyOnHost(half *A, + half *B, + float *C, + float alpha, + float beta, + int numARows, + int numAColumns, + int numBRows, + int numBColumns, + int numCRows, + int numCColumns) +{ + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + float temp = 0.0; - int dev = findCudaDevice(argc, (const char **)argv); + for (int k = 0; k < numAColumns; k++) { + temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k]; + } - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - - // Tensor cores require a GPU of Volta (SM7X) architecture or higher. - if (deviceProp.major < 7) { - printf( - "cudaTensorCoreGemm requires SM 7.0 or higher to use Tensor " - "Cores. Exiting...\n"); - exit(EXIT_WAIVED); - } - - printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); - printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); - printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); - - half *A_h = NULL; - half *B_h = NULL; - float *C_h = NULL; -#if CPU_DEBUG - float *result_hD = NULL; - float *result_host = NULL; -#endif - - A_h = (half *)malloc(sizeof(half) * M_GLOBAL * K_GLOBAL); - B_h = (half *)malloc(sizeof(half) * K_GLOBAL * N_GLOBAL); - C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); -#if CPU_DEBUG - result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); - result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); -#endif - - half *A = NULL; - half *B = NULL; - float *C = NULL; - float *D = NULL; - - checkCudaErrors(cudaMalloc(reinterpret_cast(&A), - sizeof(half) * M_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&B), - sizeof(half) * N_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&C), - sizeof(float) * M_GLOBAL * N_GLOBAL)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&D), - sizeof(float) * M_GLOBAL * N_GLOBAL)); - - assert(((unsigned long long)A) % 128 == 0); - assert(((unsigned long long)B) % 128 == 0); - assert(((unsigned long long)C) % 128 == 0); - assert(((unsigned long long)D) % 128 == 0); - - init_host_matrices(A_h, B_h, C_h); - - printf("Preparing data for GPU...\n"); - - checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL)); - - enum { - // Compute the right amount of 
shared memory to request. - // We need shared memory to hold per-CTA C and D matrix tiles, and to cache - // per-CTA chunks - // of the A and B matrices. Therefore, the right amount to request is the - // maximum of those - // two numbers. - SHMEM_SZ = MAX( - sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2, - M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * - (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float)) - }; - - printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); - - const float alpha = 1.1f; - const float beta = 1.2f; - - cudaEvent_t start, stop; - - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start)); - - // If enough shared memory available on the GPU use high performant kernel - if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { - printf("Computing... using high performance kernel compute_gemm \n"); - - checkCudaErrors(cudaFuncSetAttribute( - compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - checkKernelErrors( - (compute_gemm<<>>(A, B, C, D, alpha, beta))); -#if CPU_DEBUG - checkCudaErrors(cudaMemcpy(result_hD, D, - sizeof(float) * M_GLOBAL * N_GLOBAL, - cudaMemcpyDeviceToHost)); -#endif - } else { - dim3 gridDim; - dim3 blockDim; - - // blockDim.x must be a multple of warpSize - // 128x4 means we have 16 warps and a block computes a 64x64 output tile - blockDim.x = 128; - blockDim.y = 4; - - gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / - (WMMA_M * blockDim.x / 32); - gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); - - printf("Computing... using simple_wmma_gemm kernel\n"); - simple_wmma_gemm<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, - K_GLOBAL, alpha, beta); -#if CPU_DEBUG - checkCudaErrors(cudaMemcpy(result_hD, D, - sizeof(float) * M_GLOBAL * N_GLOBAL, - cudaMemcpyDeviceToHost)); -#endif - } - - checkCudaErrors(cudaEventRecord(stop)); - checkCudaErrors(cudaEventSynchronize(stop)); - -#if CPU_DEBUG - printf("Verifying correctness of the computations...\n"); - - memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL); - - matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, - K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); - - for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { - if (fabs(result_hD[i] - result_host[i]) > 0.1f) - printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], - result_host[i]); - } - free(result_hD); - free(result_host); -#endif - - float milliseconds = 0; - - checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); - - printf("Time: %f ms\n", milliseconds); - printf("TFLOPS: %.2f\n", static_cast((static_cast(M_GLOBAL) * - N_GLOBAL * K_GLOBAL * 2) / - (milliseconds / 1000.)) / - 1e12); - - free(A_h); - free(B_h); - free(C_h); - checkCudaErrors(cudaFree(reinterpret_cast(A))); - checkCudaErrors(cudaFree(reinterpret_cast(B))); - checkCudaErrors(cudaFree(reinterpret_cast(C))); - checkCudaErrors(cudaFree(reinterpret_cast(D))); - - return 0; + C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + +int main(int argc, char **argv) +{ + printf("Initializing...\n"); + + int dev = findCudaDevice(argc, (const char **)argv); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + // Tensor cores require a GPU of Volta (SM7X) architecture or higher. + if (deviceProp.major < 7) { + printf("cudaTensorCoreGemm requires SM 7.0 or higher to use Tensor " + "Cores. 
Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + half *A_h = NULL; + half *B_h = NULL; + float *C_h = NULL; +#if CPU_DEBUG + float *result_hD = NULL; + float *result_host = NULL; +#endif + + A_h = (half *)malloc(sizeof(half) * M_GLOBAL * K_GLOBAL); + B_h = (half *)malloc(sizeof(half) * K_GLOBAL * N_GLOBAL); + C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#endif + + half *A = NULL; + half *B = NULL; + float *C = NULL; + float *D = NULL; + + checkCudaErrors(cudaMalloc(reinterpret_cast(&A), sizeof(half) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&B), sizeof(half) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&C), sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&D), sizeof(float) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + printf("Preparing data for GPU...\n"); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + enum { + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache + // per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the + // maximum of those + // two numbers. + SHMEM_SZ = MAX(sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float)) + }; + + printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); + + const float alpha = 1.1f; + const float beta = 1.2f; + + cudaEvent_t start, stop; + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start)); + + // If enough shared memory available on the GPU use high performant kernel + if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { + printf("Computing... using high performance kernel compute_gemm \n"); + + checkCudaErrors(cudaFuncSetAttribute(compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors( + (compute_gemm<<>>(A, B, C, D, alpha, beta))); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + else { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); + + printf("Computing... 
using simple_wmma_gemm kernel\n"); + simple_wmma_gemm<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); + + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (fabs(result_hD[i] - result_host[i]) > 0.1f) + printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]); + } + free(result_hD); + free(result_host); +#endif + + float milliseconds = 0; + + checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); + + printf("Time: %f ms\n", milliseconds); + printf("TFLOPS: %.2f\n", + static_cast((static_cast(M_GLOBAL) * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) + / 1e12); + + free(A_h); + free(B_h); + free(C_h); + checkCudaErrors(cudaFree(reinterpret_cast(A))); + checkCudaErrors(cudaFree(reinterpret_cast(B))); + checkCudaErrors(cudaFree(reinterpret_cast(C))); + checkCudaErrors(cudaFree(reinterpret_cast(D))); + + return 0; } diff --git a/Samples/3_CUDA_Features/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu b/Samples/3_CUDA_Features/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu index a104c3fb..cf5fbef9 100644 --- a/Samples/3_CUDA_Features/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu +++ b/Samples/3_CUDA_Features/dmmaTensorCoreGemm/dmmaTensorCoreGemm.cu @@ -59,18 +59,18 @@ // but carefully enough to avoid local memory use. #include -#include -#include -#include #include #include -#include +#include #include #include +#include +#include +#include // helper functions and utilities to work with CUDA -#include #include +#include // Externally configurable parameters. @@ -111,7 +111,7 @@ // Implementation constants. -#define WARPS_PER_BLOCK 8 +#define WARPS_PER_BLOCK 8 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) #if SHARED_MEMORY_LIMIT_64K @@ -127,10 +127,10 @@ #define CHUNK_K 16 #endif -#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(double)) -#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(double)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) #define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) -#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -160,26 +160,28 @@ // we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. 
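dmmaTensorCoreGemm is the double-precision counterpart: on sm_80+ the tensor cores expose an 8x8x4 shape for double, so fragments are declared with M = N = 8, K = 4 and double throughout. A sketch of the declarations (the layouts shown mirror the half-precision sample and are an assumption here):

    #include <mma.h>
    using namespace nvcuda;

    // DMMA fragment shapes; sketch only, the real kernels load and multiply
    // these against shared-memory tiles.
    __device__ void dmma_fragments_sketch()
    {
        wmma::fragment<wmma::matrix_a, 8, 8, 4, double, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 8, 8, 4, double, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, 8, 8, 4, double>               c_frag;
        wmma::fill_fragment(c_frag, 0.0); // accumulate D = A * B + C from here
    }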
#define SKEW_DOUBLE 4 -#define checkKernelErrors(expr) do { \ - expr; \ - \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ - abort(); \ - } \ -} while(0) +#define checkKernelErrors(expr) \ + do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) -enum kernels -{ - dmma_shmem_gemm_async_copy = 0, // DMMA shmem using kernel with async_copy - dmma_shmem_gemm_cg_async_copy = 1, // DMMA shmem using kernel with cooperative groups async_copy - dmma_shmem_gemm = 2, // DMMA shmem using kernel normal copy (without async_copy). - simple_dmma_gemm = 3 // DMMA non-shmem using simple kernel. +enum kernels { + dmma_shmem_gemm_async_copy = 0, // DMMA shmem using kernel with async_copy + dmma_shmem_gemm_cg_async_copy = 1, // DMMA shmem using kernel with cooperative groups async_copy + dmma_shmem_gemm = 2, // DMMA shmem using kernel normal copy (without async_copy). + simple_dmma_gemm = 3 // DMMA non-shmem using simple kernel. }; -const char* kernelNames[] = {"compute_dgemm_async_copy", "compute_dgemm_cg_async_copy", - "compute_dgemm", "simple_wmma_gemm"}; +const char *kernelNames[] = {"compute_dgemm_async_copy", + "compute_dgemm_cg_async_copy", + "compute_dgemm", + "simple_wmma_gemm"}; using namespace nvcuda; namespace cg = cooperative_groups; @@ -188,18 +190,18 @@ __host__ void init_host_matrices(double *a, double *b, double *c) { for (int i = 0; i < M_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - a[i*K_GLOBAL+j] = (double) (rand() % 3); + a[i * K_GLOBAL + j] = (double)(rand() % 3); } } for (int i = 0; i < N_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - b[i*K_GLOBAL+j] = (double) (rand() % 3); + b[i * K_GLOBAL + j] = (double)(rand() % 3); } } for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { - c[t] = (double) (rand() % 3); + c[t] = (double)(rand() % 3); } } @@ -217,10 +219,12 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, // This pointer is used to access the C and D matrix tiles this warp computes. - double *shmem_warp_tile_ptr = (double*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + double *shmem_warp_tile_ptr = (double *)&shmem[0][0] + + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. - double *shmem_warp_stream_ptr = (double*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + double *shmem_warp_stream_ptr = (double *)&shmem[0][0] + warpId * SHMEM_STRIDE * N; // Adjust the beta scaler, as it'll be multiplied by alpha at the end of // each tile computation. Technically this is not generally correct (may result @@ -230,7 +234,7 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, // Each CTA slides along the 64 x 64 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit. 
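Both GEMM kernels in this patch schedule work with a persistent-CTA loop: the grid is sized roughly to the machine, and each resident block keeps claiming tile indices (block_pos += gridDim.x) until none remain, which is why the for loop above carries no termination condition of its own. The same idea in a bounded, stand-alone sketch:

    // Persistent-block tile scheduling: each block strides through the tile
    // space until the work runs out (hypothetical tile kernel).
    __global__ void persistent_tiles(float *tiles, unsigned int num_tiles)
    {
        for (unsigned int pos = blockIdx.x; pos < num_tiles; pos += gridDim.x) {
            tiles[pos] += 1.0f; // stand-in for processing tile `pos`
        }
    }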
- for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -240,7 +244,7 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const double *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. @@ -271,7 +275,7 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, // Scale the C matrix. #pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { + for (int i = 0; i < WARP_COL_TILES; i++) { #pragma unroll for (int j = 0; j < WARP_ROW_TILES; j++) { #pragma unroll @@ -283,16 +287,19 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const double *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const double *warp_ptr = + (warpId < (WARPS_PER_BLOCK / 2)) + ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); // Go through the global K dimension by a fixed step at a time. #pragma unroll for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. - size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) + ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. @@ -302,9 +309,10 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, shmem_idx += laneId / CHUNK_COPY_LINE_LANES; #pragma unroll - for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP); i++) { - // Copy 16 bytes at once in each lane. - *((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP); i++) { + // Copy 16 bytes at once in each lane. + *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = + *((int4 *)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); // Advance the global memory pointer and the shared memory index. 
lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; @@ -321,8 +329,8 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId/2) * M * 2 + (i * M); - const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); @@ -331,11 +339,10 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, if (i == 0) { // Load the B matrix fragment once, because it is going to be reused // against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); - const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); - } wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); @@ -370,8 +377,8 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, #pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } __syncthreads(); @@ -379,7 +386,8 @@ __global__ void compute_dgemm(const double *A, const double *B, const double *C, #endif } -__global__ void compute_dgemm_async_copy(const double *A, const double *B, const double *C, double *D, double alpha, double beta) +__global__ void +compute_dgemm_async_copy(const double *A, const double *B, const double *C, double *D, double alpha, double beta) { #if __CUDA_ARCH__ >= 800 extern __shared__ double shmem[][CHUNK_K * K + SKEW_DOUBLE]; @@ -392,7 +400,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const constexpr size_t shmem_idx_b_off = BLOCK_COL_TILES * M; // This pointer is used to access the C and D matrix tiles this warp computes. - double *shmem_warp_tile_ptr = &shmem[0][0] + (warpId/BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + double *shmem_warp_tile_ptr = &shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. double *shmem_warp_stream_ptr = &shmem[0][0] + warpId * SHMEM_STRIDE * N; @@ -404,13 +413,13 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline(); - const auto shape2 = cuda::aligned_size_t<alignof(double2)>(sizeof(double2)); + const auto shape2 = cuda::aligned_size_t<alignof(double2)>(sizeof(double2)); constexpr int loadStride = 1; // load 2 double, left-shift by 1. // Each CTA slides along the 64 x 64 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit.
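Isolated from the kernel above, the per-thread pipeline pattern is: acquire a stage, enqueue cuda::memcpy_async, commit, then wait and release around the code that consumes the data. A minimal sketch assuming the libcu++ <cuda/pipeline> header and an sm_80 target; names and sizes are illustrative, and src is assumed 16-byte aligned with two doubles per launched thread:

    #include <cuda/pipeline>

    // Launch with <<<grid, 128>>>: each thread stages one double2 through shared memory.
    __global__ void staged_copy(double *__restrict__ dst, const double *__restrict__ src)
    {
        __shared__ alignas(16) double buf[256];

        // Thread-scope pipeline: the same thread acts as producer and consumer.
        cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
        // Promising double2 alignment lets the copy lower to a single cp.async on sm_80+.
        const auto shape2 = cuda::aligned_size_t<alignof(double2)>(sizeof(double2));

        const int i = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
        const int s = threadIdx.x * 2;

        pipe.producer_acquire();
        cuda::memcpy_async(&buf[s], &src[i], shape2, pipe);
        pipe.producer_commit();

        pipe.consumer_wait(); // this thread's committed stage has now landed
        dst[i]     = buf[s];
        dst[i + 1] = buf[s + 1];
        pipe.consumer_release();
    }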
- for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -420,7 +429,7 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const double *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. @@ -428,8 +437,9 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const for (int i = 0; i < N; i++) { pipe.producer_acquire(); cuda::memcpy_async(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i) + (laneId << loadStride)], - &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i) + (laneId << loadStride)], - shape2, pipe); + &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i) + (laneId << loadStride)], + shape2, + pipe); pipe.producer_commit(); } @@ -463,12 +473,14 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const double *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const double *warp_ptr = + (warpId < (WARPS_PER_BLOCK / 2)) + ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); - const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); - constexpr int chunksPerLane = ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP); - const int laneLoadElem = (laneId % CHUNK_COPY_LINE_LANES) << loadStride; + const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); + constexpr int chunksPerLane = ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP); + const int laneLoadElem = (laneId % CHUNK_COPY_LINE_LANES) << loadStride; // Go through the global K dimension by a fixed step at a time. #pragma unroll @@ -476,7 +488,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. // As for DMMA M == N we use M for warp 4-7 + shmem_idx_b_off. - size_t shmem_idx = (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) + (shmem_idx_b_off * (warpId/(WARPS_PER_BLOCK/2))); + size_t shmem_idx = + (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + (shmem_idx_b_off * (warpId / (WARPS_PER_BLOCK / 2))); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. @@ -485,8 +498,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const // Shift the second half of the warp to the next row / column in the shared memory. shmem_idx += stridePerLaneCopy; #pragma unroll - for(int i = 0; i < chunksPerLane; i++) { - // Copy 16 bytes at once in each lane. + for (int i = 0; i < chunksPerLane; i++) { + // Copy 16 bytes at once in each lane. 
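The unbounded for (block_pos = blockIdx.x;; block_pos += gridDim.x) loop above is a grid-stride loop over output tile blocks: each resident CTA keeps claiming the next block of tiles until the derived row index walks off the matrix. A host-side sketch of the same index arithmetic, handy for checking tile coverage; the tile counts and grid size are illustrative, and the exit test against M_TILES is assumed from this sample family:

    #include <cstdio>

    int main()
    {
        const int N_TILES = 32, M_TILES = 32;              // 32x32 checkerboard of 8x8 tiles (illustrative)
        const int BLOCK_ROW_TILES = 8, BLOCK_COL_TILES = 8;
        const int gridDimX = 3;                            // pretend three resident CTAs
        for (int blockIdxX = 0; blockIdxX < gridDimX; blockIdxX++) {
            for (unsigned int block_pos = blockIdxX;; block_pos += gridDimX) {
                const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
                const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
                if (block_tile_i >= M_TILES)
                    break;                                 // no tile left for this CTA
                printf("CTA %d computes tile block (%u, %u)\n", blockIdxX, block_tile_i, block_tile_j);
            }
        }
        return 0;
    }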
pipe.producer_acquire(); cuda::memcpy_async(&shmem[shmem_idx][laneLoadElem], lane_ptr, shape2, pipe); @@ -508,8 +521,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const wmma::fragment b[WARP_ROW_TILES]; #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId/2) * M * 2 + (i * M); - const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); #pragma unroll @@ -517,8 +530,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const if (i == 0) { // Load the B matrix fragment once, because it is going to be reused // against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); - const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); } @@ -554,8 +567,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const #pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } __syncthreads(); @@ -563,15 +576,16 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const #endif } - __global__ void compute_dgemm_cg_async_copy(const double *A, const double *B, const double *C, double *D, double alpha, double beta) +__global__ void +compute_dgemm_cg_async_copy(const double *A, const double *B, const double *C, double *D, double alpha, double beta) { #if __CUDA_ARCH__ >= 800 extern __shared__ double shmem[][CHUNK_K * K + SKEW_DOUBLE]; - auto cta = cg::this_thread_block(); - auto tile32 = cg::tiled_partition<32>(cta); + auto cta = cg::this_thread_block(); + auto tile32 = cg::tiled_partition<32>(cta); constexpr int tileChunkCopySize = WARP_SIZE / CHUNK_COPY_LINES_PER_WARP; - auto tileChunkCopy = cg::tiled_partition(cta); + auto tileChunkCopy = cg::tiled_partition(cta); // Warp and lane identification. const unsigned int warpId = threadIdx.x / WARP_SIZE; @@ -581,10 +595,11 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const constexpr size_t shmem_idx_b_off = BLOCK_COL_TILES * M; // This pointer is used to access the C and D matrix tiles this warp computes. - double *shmem_warp_tile_ptr = (double*)&shmem[0][0] + (warpId/2) * SHMEM_STRIDE * N * 2 + (warpId%2) * SHMEM_OFFSET; + double *shmem_warp_tile_ptr = + (double *)&shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * N * 2 + (warpId % 2) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. - double *shmem_warp_stream_ptr = (double*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + double *shmem_warp_stream_ptr = (double *)&shmem[0][0] + warpId * SHMEM_STRIDE * N; // Adjust the beta scaler, as it'll be multiplied by alpha at the end of // each tile computation. 
Technically this is not generally correct (may result @@ -594,7 +609,7 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const // Each CTA slides along the 64 x 64 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit. - for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -604,7 +619,7 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const double *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. @@ -612,7 +627,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const for (int i = 0; i < N; i++) { auto dst_ptr = &shmem_warp_stream_ptr[(SHMEM_STRIDE * i)]; auto src_ptr = &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i)]; - cg::memcpy_async(tile32, dst_ptr, src_ptr, cuda::aligned_size_t{tile32.size() * sizeof(double2)}); + cg::memcpy_async( + tile32, dst_ptr, src_ptr, cuda::aligned_size_t{tile32.size() * sizeof(double2)}); } cg::wait(cta); @@ -647,18 +663,20 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const cg::wait(cta); // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const double *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const double *warp_ptr = + (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); - const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); - constexpr int chunksPerLane = ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP); + const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); + constexpr int chunksPerLane = ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP); // Go through the global K dimension by a fixed step at a time. #pragma unroll for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. // As for DMMA M == N we use M for warp 4-7 + shmem_idx_b_off. - size_t shmem_idx = (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) + (shmem_idx_b_off * (warpId/(WARPS_PER_BLOCK/2))); + size_t shmem_idx = + (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + (shmem_idx_b_off * (warpId / (WARPS_PER_BLOCK / 2))); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. 
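The cooperative-groups variant above trades hand-written lane arithmetic for collective copies: a statically sized 32-thread tile issues one cg::memcpy_async per row, and a later cg::wait(cta) joins every copy the block has issued. A reduced sketch assuming <cooperative_groups/memcpy_async.h>; the buffer shape, launch <<<1, 256>>>, and 8 x 64 layout are illustrative:

    #include <cooperative_groups.h>
    #include <cooperative_groups/memcpy_async.h>
    #include <cuda/barrier> // for cuda::aligned_size_t

    namespace cg = cooperative_groups;

    // Launch with <<<1, 256>>>; src holds 8 * 64 doubles, 16-byte aligned.
    __global__ void cg_staged_copy(double *__restrict__ dst, const double *__restrict__ src)
    {
        __shared__ alignas(16) double buf[8][64];

        auto cta    = cg::this_thread_block();
        auto tile32 = cg::tiled_partition<32>(cta);

        // Each 32-thread tile cooperatively copies one 64-double row (32 double2 chunks).
        const int row = tile32.meta_group_rank(); // 0..7 for a 256-thread block
        cg::memcpy_async(tile32, &buf[row][0], &src[row * 64],
                         cuda::aligned_size_t<alignof(double2)>{32 * sizeof(double2)});

        cg::wait(cta); // all copies issued by this block have completed

        dst[row * 64 + tile32.thread_rank()]      = buf[row][tile32.thread_rank()];
        dst[row * 64 + 32 + tile32.thread_rank()] = buf[row][32 + tile32.thread_rank()];
    }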
@@ -668,13 +686,15 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const shmem_idx += stridePerLaneCopy; #pragma unroll - for(int i = 0; i < chunksPerLane; i++) { + for (int i = 0; i < chunksPerLane; i++) { // Copy 16 bytes at once in each lane. auto dst_ptr = &shmem[shmem_idx][0]; auto src_ptr = lane_ptr; - cg::memcpy_async(tileChunkCopy, dst_ptr, src_ptr, - cuda::aligned_size_t{tileChunkCopySize * sizeof(double2)}); + cg::memcpy_async(tileChunkCopy, + dst_ptr, + src_ptr, + cuda::aligned_size_t{tileChunkCopySize * sizeof(double2)}); // Advance the global memory pointer and the shared memory index. lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; @@ -690,8 +710,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId/2) * M * 2 + (i * M); - const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const double *tile_ptr = &shmem[shmem_idx_a][k_step * K]; wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); @@ -700,11 +720,10 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const if (i == 0) { // Load the B matrix fragment once, because it is going to be reused // against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); - const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const double *tile_ptr = &shmem[shmem_idx_b][k_step * K]; wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_DOUBLE); - } wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); @@ -738,8 +757,8 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const #pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } cg::sync(cta); } @@ -748,12 +767,13 @@ __global__ void compute_dgemm_async_copy(const double *A, const double *B, const // Performs an MxNxK DGEMM (C=alpha*A*B + beta*C) assuming: // 1) Matrices are packed in memory. -// 2) M, N and K are multiples of 8, 8 and 4 respectively. +// 2) M, N and K are multiples of 8, 8 and 4 respectively. // 3) A is row major, B is column major matrix. // Note: This is a less performant version of the compute_dgemm kernel. It is designed for // demonstration purposes only to show the CUDA WMMA API use without relying on // availability of the shared memory. -__global__ void simple_wmma_gemm(double *a, double *b, double *c, double *d, int m_ld, int n_ld, int k_ld, double alpha, double beta) +__global__ void +simple_wmma_gemm(double *a, double *b, double *c, double *d, int m_ld, int n_ld, int k_ld, double alpha, double beta) { #if __CUDA_ARCH__ >= 800 // Leading dimensions. Packed with no transpositions. 
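simple_wmma_gemm, whose body continues below, programs the FP64 tensor cores directly through nvcuda::wmma. Double precision supports exactly one fragment shape, M x N x K = 8 x 8 x 4, which is what the sample's M, N and K macros expand to. A standalone single-tile sketch of the fragment declarations and the load/mma/store sequence (launch as one warp, <<<1, 32>>>; matrices sized to one tile to keep it short):

    #include <mma.h>
    using namespace nvcuda;

    // One warp computes a single 8x8 FP64 tile: D = A(8x4) * B(4x8) + C. Requires sm_80+.
    __global__ void one_tile_dmma(const double *A, const double *B, const double *C, double *D)
    {
    #if __CUDA_ARCH__ >= 800
        wmma::fragment<wmma::matrix_a, 8, 8, 4, double, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 8, 8, 4, double, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, 8, 8, 4, double>               acc_frag;

        wmma::load_matrix_sync(a_frag, A, 4);                        // lda = K = 4 (row major)
        wmma::load_matrix_sync(b_frag, B, 4);                        // ldb = K = 4 (col major)
        wmma::load_matrix_sync(acc_frag, C, 8, wmma::mem_row_major); // ldc = N = 8

        wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);

        wmma::store_matrix_sync(D, acc_frag, 8, wmma::mem_row_major);
    #endif
    }

In the shared-memory kernels above, the leading dimension passed to load_matrix_sync is K * CHUNK_K + SKEW_DOUBLE instead; the SKEW_DOUBLE padding staggers rows so the tensor-core loads avoid shared-memory bank conflicts.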
@@ -768,8 +788,8 @@ __global__ void simple_wmma_gemm(double *a, double *b, double *c, double *d, int // Declare the fragments wmma::fragment a_frag; wmma::fragment b_frag; - wmma::fragment acc_frag; - wmma::fragment c_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; wmma::fill_fragment(acc_frag, 0.0f); @@ -799,7 +819,7 @@ __global__ void simple_wmma_gemm(double *a, double *b, double *c, double *d, int if (cRow < m_ld && cCol < n_ld) { wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); - for(int i=0; i < c_frag.num_elements; i++) { + for (int i = 0; i < c_frag.num_elements; i++) { c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; } @@ -809,11 +829,17 @@ __global__ void simple_wmma_gemm(double *a, double *b, double *c, double *d, int #endif } -__host__ void matMultiplyOnHost(double *A, double *B, double *C, - float alpha, float beta, - int numARows, int numAColumns, - int numBRows, int numBColumns, - int numCRows, int numCColumns) +__host__ void matMultiplyOnHost(double *A, + double *B, + double *C, + float alpha, + float beta, + int numARows, + int numAColumns, + int numBRows, + int numBColumns, + int numCRows, + int numCColumns) { for (int i = 0; i < numCRows; i++) { for (int j = 0; j < numCColumns; j++) { @@ -824,7 +850,7 @@ __host__ void matMultiplyOnHost(double *A, double *B, double *C, temp += A[i * numAColumns + k] * B[j * numBRows + k]; } - C[i*numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; } } } @@ -852,16 +878,16 @@ int main(int argc, char **argv) double *B_h = NULL; double *C_h = NULL; #if CPU_DEBUG - double *result_hD = NULL; + double *result_hD = NULL; double *result_host = NULL; #endif - A_h = (double*) malloc(sizeof(double) * M_GLOBAL * K_GLOBAL); - B_h = (double*) malloc(sizeof(double) * K_GLOBAL * N_GLOBAL); - C_h = (double*) malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); + A_h = (double *)malloc(sizeof(double) * M_GLOBAL * K_GLOBAL); + B_h = (double *)malloc(sizeof(double) * K_GLOBAL * N_GLOBAL); + C_h = (double *)malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); #if CPU_DEBUG - result_hD = (double*) malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); - result_host = (double*) malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); + result_hD = (double *)malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); + result_host = (double *)malloc(sizeof(double) * M_GLOBAL * N_GLOBAL); #endif double *A = NULL; @@ -869,10 +895,10 @@ int main(int argc, char **argv) double *C = NULL; double *D = NULL; - checkCudaErrors(cudaMalloc((void**)&A, sizeof(double) * M_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc((void**)&B, sizeof(double) * N_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc((void**)&C, sizeof(double) * M_GLOBAL * N_GLOBAL)); - checkCudaErrors(cudaMalloc((void**)&D, sizeof(double) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&A, sizeof(double) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&B, sizeof(double) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&C, sizeof(double) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&D, sizeof(double) * M_GLOBAL * N_GLOBAL)); assert(((unsigned long long)A) % 128 == 0); assert(((unsigned long long)B) % 128 == 0); @@ -900,7 +926,7 @@ int main(int argc, char **argv) printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); const double alpha = 1.1f; - const double beta = 1.2f; + const double beta = 1.2f; cudaEvent_t start, stop; @@ -913,45 +939,47 @@ int 
main(int argc, char **argv) // kernel to run - default (dmma_shmem_gemm_async_copy == 0) if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); - if (kernel_number < 4) - { + if (kernel_number < 4) { selected_kernel = (kernels)kernel_number; } - else - { + else { printf("Error: kernel number should be between 0 to 3, you have entered %d\n", kernel_number); exit(EXIT_FAILURE); } } // If enough shared memory available on the GPU use high performant kernel - if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_dmma_gemm)) - { + if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_dmma_gemm)) { printf("Computing using high performance kernel = %d - %s\n", selected_kernel, kernelNames[selected_kernel]); - switch (selected_kernel) - { - case dmma_shmem_gemm_async_copy : - default: - checkCudaErrors(cudaFuncSetAttribute(compute_dgemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - checkKernelErrors((compute_dgemm_async_copy<<>>(A, B, C, D, alpha, beta))); - break; - case dmma_shmem_gemm_cg_async_copy : - checkCudaErrors(cudaFuncSetAttribute(compute_dgemm_cg_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - checkKernelErrors((compute_dgemm_cg_async_copy<<>>(A, B, C, D, alpha, beta))); - break; - case dmma_shmem_gemm : - checkCudaErrors(cudaFuncSetAttribute(compute_dgemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - checkKernelErrors((compute_dgemm<<>>(A, B, C, D, alpha, beta))); - break; + switch (selected_kernel) { + case dmma_shmem_gemm_async_copy: + default: + checkCudaErrors( + cudaFuncSetAttribute(compute_dgemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors( + (compute_dgemm_async_copy<<>>( + A, B, C, D, alpha, beta))); + break; + case dmma_shmem_gemm_cg_async_copy: + checkCudaErrors(cudaFuncSetAttribute( + compute_dgemm_cg_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors( + (compute_dgemm_cg_async_copy<<>>( + A, B, C, D, alpha, beta))); + break; + case dmma_shmem_gemm: + checkCudaErrors(cudaFuncSetAttribute(compute_dgemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_dgemm<<>>( + A, B, C, D, alpha, beta))); + break; } #if CPU_DEBUG - checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(double)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(double) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); #endif } - else - { + else { dim3 gridDim; dim3 blockDim; @@ -978,25 +1006,19 @@ int main(int argc, char **argv) memcpy(result_host, C_h, sizeof(double) * M_GLOBAL * N_GLOBAL); - matMultiplyOnHost(A_h, B_h, result_host, - alpha, beta, - M_GLOBAL, K_GLOBAL, - K_GLOBAL, N_GLOBAL, - M_GLOBAL, N_GLOBAL); + matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); size_t number_of_matches = 0; - for (int i = 0; i < N_GLOBAL*M_GLOBAL; i++) { - if (fabs(result_hD[i] - result_host[i]) > 0.1f) - { + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (fabs(result_hD[i] - result_host[i]) > 0.1f) { printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]); break; } - else - { + else { number_of_matches++; } } - printf("number_of_matches = %zu out of = %d \n", number_of_matches, N_GLOBAL*M_GLOBAL); + printf("number_of_matches = %zu out of = %d \n", 
number_of_matches, N_GLOBAL * M_GLOBAL); free(result_hD); free(result_host); #endif @@ -1006,15 +1028,15 @@ int main(int argc, char **argv) checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); printf("Time: %f ms\n", milliseconds); - printf("FP64 TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + printf("FP64 TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12); free(A_h); free(B_h); free(C_h); - checkCudaErrors(cudaFree((void*)A)); - checkCudaErrors(cudaFree((void*)B)); - checkCudaErrors(cudaFree((void*)C)); - checkCudaErrors(cudaFree((void*)D)); + checkCudaErrors(cudaFree((void *)A)); + checkCudaErrors(cudaFree((void *)B)); + checkCudaErrors(cudaFree((void *)C)); + checkCudaErrors(cudaFree((void *)D)); return 0; } diff --git a/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu b/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu index 4b3c4875..513bc2a9 100644 --- a/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu +++ b/Samples/3_CUDA_Features/globalToShmemAsyncCopy/globalToShmemAsyncCopy.cu @@ -39,12 +39,12 @@ */ // System includes -#include #include +#include // CUDA runtime -#include #include +#include #if __CUDA_ARCH__ >= 700 #include @@ -54,18 +54,18 @@ namespace cg = cooperative_groups; // Helper functions and utilities to work with CUDA -#include #include +#include enum kernels { - AsyncCopyMultiStageLargeChunk = 0, - AsyncCopyLargeChunk = 1, - AsyncCopyLargeChunkAWBarrier = 2, - AsyncCopyMultiStageSharedState = 3, - AsyncCopyMultiStage = 4, - AsyncCopySingleStage = 5, - Naive = 6, - NaiveLargeChunk = 7 + AsyncCopyMultiStageLargeChunk = 0, + AsyncCopyLargeChunk = 1, + AsyncCopyLargeChunkAWBarrier = 2, + AsyncCopyMultiStageSharedState = 3, + AsyncCopyMultiStage = 4, + AsyncCopySingleStage = 5, + Naive = 6, + NaiveLargeChunk = 7 }; const char *kernelNames[] = {"AsyncCopyMultiStageLargeChunk", @@ -81,91 +81,88 @@ constexpr int blockSize = 16; // Multi Stage memcpy_async pipeline with large chunk copy template -__global__ void MatrixMulAsyncCopyMultiStageLargeChunk( - float *__restrict__ C, const float *__restrict__ A, - const float *__restrict__ B, int wA, int wB) { - // Requires BLOCK_SIZE % 4 == 0 +__global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, + int wB) +{ + // Requires BLOCK_SIZE % 4 == 0 - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ alignas( - alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ alignas(alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ alignas( - alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ alignas(alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int 
aBegin = wA * (BLOCK_SIZE)*blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * (BLOCK_SIZE)*blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - const int t4x = threadIdx.x * 4; - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + const int t4x = threadIdx.x * 4; + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - cuda::pipeline pipe = cuda::make_pipeline(); + cuda::pipeline pipe = cuda::make_pipeline(); + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; + a += aStep, b += bStep, ++i) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) { + pipe.producer_acquire(); + if (aStage <= aEnd && t4x < BLOCK_SIZE) { + // Rotating buffer + const int j = iStage % maxPipelineStages; + cuda::memcpy_async(&As[j][threadIdx.y][t4x], &A[aStage + wA * threadIdx.y + t4x], shape4, pipe); + cuda::memcpy_async(&Bs[j][threadIdx.y][t4x], &B[aStage + wA * threadIdx.y + t4x], shape4, pipe); + } + pipe.producer_commit(); + } + + pipe.consumer_wait(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, - iStage = 0; - a <= aEnd; a += aStep, b += bStep, ++i) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix - for (; aStage <= a + aStep * maxPipelineStages; - aStage += aStep, bStage += bStep, ++iStage) { - pipe.producer_acquire(); - if (aStage <= aEnd && t4x < BLOCK_SIZE) { // Rotating buffer - const int j = iStage % maxPipelineStages; - cuda::memcpy_async(&As[j][threadIdx.y][t4x], - &A[aStage + wA * threadIdx.y + t4x], shape4, pipe); - cuda::memcpy_async(&Bs[j][threadIdx.y][t4x], - &B[aStage + wA * threadIdx.y + t4x], shape4, pipe); - } - pipe.producer_commit(); - } - - pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Rotating buffer - const int j = i % maxPipelineStages; + const int j = i % maxPipelineStages; // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + pipe.consumer_release(); + + // Don't have to synchronize because maxPipelineStages is greater than one + // therefore 
next iteration is loading to a different buffer. } - pipe.consumer_release(); - // Don't have to synchronize because maxPipelineStages is greater than one - // therefore next iteration is loading to a different buffer. - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Single Stage memcpy_async pipeline with Large copy chunk (float4) @@ -173,245 +170,244 @@ template __global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C, const float *__restrict__ A, const float *__restrict__ B, - int wA, int wB) { - // Requires BLOCK_SIZE % 4 == 0 + int wA, + int wB) +{ + // Requires BLOCK_SIZE % 4 == 0 - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Single-stage pipeline version - float Csub = 0.0; + // Single-stage pipeline version + float Csub = 0.0; - const int t4x = threadIdx.x * 4; - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - cuda::pipeline pipe = cuda::make_pipeline(); + const int t4x = threadIdx.x * 4; + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + cuda::pipeline pipe = cuda::make_pipeline(); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; - // a subset of threads loads a contiguous chunk of elements. + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. 
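Stepping back to the multi-stage kernel above: with maxPipelineStages = 4, up to four tiles are in flight at once, and the iStage % maxPipelineStages index rotates through four shared-memory buffers so a prefetch never lands in a buffer still being read. A compressed sketch of that schedule, with the copy and the math reduced to one float per thread (launch <<<1, 256>>>; in holds nTiles * 256 floats; all names illustrative):

    #include <cuda/pipeline>

    constexpr int kStages = 4; // same stage count as the sample's maxPipelineStages

    __global__ void rotating_stages(float *out, const float *in, int nTiles)
    {
        __shared__ float stage[kStages][256];
        cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();

        float acc = 0.0f;
        // i consumes tile i; iStage prefetches up to kStages tiles ahead of it.
        for (int i = 0, iStage = 0; i < nTiles; ++i) {
            for (; iStage < i + kStages && iStage < nTiles; ++iStage) {
                pipe.producer_acquire();
                cuda::memcpy_async(&stage[iStage % kStages][threadIdx.x],
                                   &in[iStage * 256 + threadIdx.x], sizeof(float), pipe);
                pipe.producer_commit();
            }
            pipe.consumer_wait();    // oldest committed stage -- tile i -- is now resident
            acc += stage[i % kStages][threadIdx.x];
            pipe.consumer_release(); // hand buffer i % kStages back to the producer side
            // No __syncthreads() needed here: each thread consumes only what it prefetched.
            // (The sample does need one, because its threads read each other's elements.)
        }
        out[threadIdx.x] = acc;
    }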
- // Previously, per-thread: - // As[ty][tx] = A[a + wA * ty + tx]; - // Bs[ty][tx] = B[b + wB * ty + tx]; + // Previously, per-thread: + // As[ty][tx] = A[a + wA * ty + tx]; + // Bs[ty][tx] = B[b + wB * ty + tx]; - // Now, one fourth of the threads load four elements of each matrix - if (t4x < BLOCK_SIZE) { - pipe.producer_acquire(); + // Now, one fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + pipe.producer_acquire(); - cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x], - shape4, pipe); - cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x], - shape4, pipe); + cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x], shape4, pipe); + cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x], shape4, pipe); - pipe.producer_commit(); - pipe.consumer_wait(); - } + pipe.producer_commit(); + pipe.consumer_wait(); + } - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + pipe.consumer_release(); + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. + __syncthreads(); } - pipe.consumer_release(); - - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. 
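One detail worth calling out in the single-stage kernels: the __syncthreads() after the k-loop is load-bearing. Threads read each other's shared-memory elements, so iteration t+1 must not overwrite As and Bs until every thread has finished with tile t; the multi-stage kernels may drop that barrier only because the next tile lands in a different rotating buffer. A tiny illustration of the two-barrier schedule (not from the sample; launch <<<1, 256>>> with nTiles * 256 input floats):

    // Two-barrier tile loop: the barrier *after* the compute protects the shared
    // buffer from being overwritten for tile t+1 while stragglers still read tile t.
    __global__ void two_barrier_tiles(float *out, const float *in, int nTiles)
    {
        __shared__ float tile[256];
        float acc = 0.0f;
        for (int t = 0; t < nTiles; ++t) {
            tile[threadIdx.x] = in[t * 256 + threadIdx.x]; // stage tile t
            __syncthreads();                               // 1: all loads visible to all threads
            acc += tile[255 - threadIdx.x];                // read another thread's element
            __syncthreads();                               // 2: all reads done before t+1 overwrites
        }
        out[threadIdx.x] = acc;
    }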
- __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Single Stage memcpy_async pipeline with Large copy chunk (float4) using // arrive-wait barrier template -__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier( - float *__restrict__ C, const float *__restrict__ A, - const float *__restrict__ B, int wA, int wB) { +__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, + int wB) +{ #if __CUDA_ARCH__ >= 700 #pragma diag_suppress static_var_with_dynamic_init - // Requires BLOCK_SIZE % 4 == 0 + // Requires BLOCK_SIZE % 4 == 0 - __shared__ cuda::barrier bar; + __shared__ cuda::barrier bar; - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - if (threadIdx.x == 0) { - init(&bar, blockDim.x * blockDim.y); - } - __syncthreads(); - - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; - - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; - - float Csub = 0.0; - - const int t4x = threadIdx.x * 4; - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; - // a subset of threads loads a contiguous chunk of elements. 
- - // Now, one fourth of the threads load four elements of each matrix - if (t4x < BLOCK_SIZE) { - float4 *const A4s = reinterpret_cast(&As[threadIdx.y][t4x]); - float4 *const B4s = reinterpret_cast(&Bs[threadIdx.y][t4x]); - const float4 *const A4 = - reinterpret_cast(&A[a + wA * threadIdx.y + t4x]); - const float4 *const B4 = - reinterpret_cast(&B[a + wA * threadIdx.y + t4x]); - - cuda::memcpy_async(A4s, A4, sizeof(float4), bar); - cuda::memcpy_async(B4s, B4, sizeof(float4), bar); + if (threadIdx.x == 0) { + init(&bar, blockDim.x * blockDim.y); } + __syncthreads(); - // Synchronize to make sure the matrices are loaded - bar.arrive_and_wait(); + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + float Csub = 0.0; + + const int t4x = threadIdx.x * 4; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; + // a subset of threads loads a contiguous chunk of elements. + + // Now, one fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + float4 *const A4s = reinterpret_cast(&As[threadIdx.y][t4x]); + float4 *const B4s = reinterpret_cast(&Bs[threadIdx.y][t4x]); + const float4 *const A4 = reinterpret_cast(&A[a + wA * threadIdx.y + t4x]); + const float4 *const B4 = reinterpret_cast(&B[a + wA * threadIdx.y + t4x]); + + cuda::memcpy_async(A4s, A4, sizeof(float4), bar); + cuda::memcpy_async(B4s, B4, sizeof(float4), bar); + } + + // Synchronize to make sure the matrices are loaded + bar.arrive_and_wait(); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. + bar.arrive_and_wait(); } - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. 
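The arrive-wait-barrier variant above binds copy completion to a cuda::barrier instead of a pipeline, so a single arrive_and_wait() doubles as both the copy join and the block-wide sync. A reduced sketch assuming <cuda/barrier> and an sm_70+ target (buffer size and launch <<<grid, 256>>> are illustrative):

    #include <cuda/barrier>

    __global__ void barrier_staged_copy(float *__restrict__ dst, const float *__restrict__ src)
    {
    #pragma diag_suppress static_var_with_dynamic_init
        __shared__ cuda::barrier<cuda::thread_scope_block> bar;
        __shared__ alignas(16) float buf[256];

        if (threadIdx.x == 0) {
            init(&bar, blockDim.x); // one arrival slot per participating thread
        }
        __syncthreads();

        // Completion of this copy is tracked by the barrier, not a pipeline.
        cuda::memcpy_async(&buf[threadIdx.x], &src[blockIdx.x * 256 + threadIdx.x], sizeof(float), bar);

        bar.arrive_and_wait(); // everyone has arrived AND all bound copies have landed

        dst[blockIdx.x * 256 + threadIdx.x] = buf[threadIdx.x] * 2.0f;
    }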
- bar.arrive_and_wait(); - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; #endif } // Single Stage memcpy_async pipeline with float copy template -__global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, - const float *B, int wA, int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; +__global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A, const float *B, int wA, int wB) +{ + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Single-stage pipeline version - float Csub = 0.0; + // Single-stage pipeline version + float Csub = 0.0; - cuda::pipeline pipe = cuda::make_pipeline(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape1 = cuda::aligned_size_t(sizeof(float)); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix - { - pipe.producer_acquire(); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix + { + pipe.producer_acquire(); - cuda::memcpy_async(&As[threadIdx.y][threadIdx.x], - &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe); - cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x], - &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async(&As[threadIdx.y][threadIdx.x], &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x], &B[b + wB * threadIdx.y + 
threadIdx.x], shape1, pipe); - pipe.producer_commit(); - } + pipe.producer_commit(); + } - pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); + pipe.consumer_wait(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before overwriting the + // shared memory sub-matrix buffers As and Bs in the next iteration. + __syncthreads(); } - // Synchronize to make sure that the preceding - // computation is done before overwriting the - // shared memory sub-matrix buffers As and Bs in the next iteration. - __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Multi Stage memcpy_async thread_scope_thread pipeline with single-element @@ -420,87 +416,85 @@ template __global__ void MatrixMulAsyncCopyMultiStage(float *__restrict__ C, const float *__restrict__ A, const float *__restrict__ B, - int wA, int wB) { - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; + int wA, + int wB) +{ + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - cuda::pipeline pipe = 
cuda::make_pipeline(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape1 = cuda::aligned_size_t(sizeof(float)); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, - iStage = 0; - a <= aEnd; a += aStep, b += bStep, ++i) { - // Load the matrices from device memory to shared memory; each thread loads - // one element of each matrix + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; + a += aStep, b += bStep, ++i) { + // Load the matrices from device memory to shared memory; each thread loads + // one element of each matrix - for (; aStage <= a + aStep * maxPipelineStages; - aStage += aStep, bStage += bStep, ++iStage) { - if (aStage <= aEnd) { - // Rotating buffer - const int j = iStage % maxPipelineStages; + for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) { + if (aStage <= aEnd) { + // Rotating buffer + const int j = iStage % maxPipelineStages; - pipe.producer_acquire(); + pipe.producer_acquire(); - cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x], - &A[aStage + wA * threadIdx.y + threadIdx.x], shape1, - pipe); - cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x], - &B[bStage + wB * threadIdx.y + threadIdx.x], shape1, - pipe); + cuda::memcpy_async( + &As[j][threadIdx.y][threadIdx.x], &A[aStage + wA * threadIdx.y + threadIdx.x], shape1, pipe); + cuda::memcpy_async( + &Bs[j][threadIdx.y][threadIdx.x], &B[bStage + wB * threadIdx.y + threadIdx.x], shape1, pipe); - pipe.producer_commit(); - } - } - pipe.consumer_wait(); + pipe.producer_commit(); + } + } + pipe.consumer_wait(); - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - const int j = i % maxPipelineStages; + const int j = i % maxPipelineStages; - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + + pipe.consumer_release(); + // Don't have to synchronize because maxPipelineStages is greater than one + // therefore next iteration is loading to a different buffer. } - pipe.consumer_release(); - // Don't have to synchronize because maxPipelineStages is greater than one - // therefore next iteration is loading to a different buffer. - } - - // Write the block sub-matrix to device memory; - // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes four element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Multi Stage shared state memcpy_async pipeline thread_scope_block @@ -509,563 +503,526 @@ __global__ void MatrixMulAsyncCopyMultiStage(float *__restrict__ C, // consumer group which perform gemm computation on the loaded matrices by // producer. 
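The comment above describes the role split that the next hunk implements: some rows of the thread block only copy (producers) while the rest only compute (consumers), coordinated through one cuda::pipeline_shared_state rather than per-thread pipelines. A reduced skeleton of that wiring; the stage count matches the sample's, but the 16 + 4 block shape, tile size, and the reduction are illustrative:

    #include <cooperative_groups.h>
    #include <cuda/pipeline>

    namespace cg = cooperative_groups;

    constexpr int kStages       = 4;
    constexpr int kConsumerRows = 16;
    constexpr int kTile         = 16;

    // Launch with dim3 block(16, 20): rows 0-15 compute, rows 16-19 copy.
    // `in` holds nTiles * kTile * kTile floats; single block for brevity.
    __global__ void producer_consumer(float *__restrict__ out, const float *__restrict__ in, int nTiles)
    {
        __shared__ float stage[kStages][kTile][kTile];
        __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, kStages> state;

        auto cta  = cg::this_thread_block();
        auto role = (cta.thread_index().y < kConsumerRows) ? cuda::pipeline_role::consumer
                                                           : cuda::pipeline_role::producer;
        auto pipe = cuda::make_pipeline(cta, &state, role);

        float acc = 0.0f;
        for (int i = 0; i < nTiles; ++i) {
            const int j = i % kStages; // rotating buffer, as in the sample
            if (role == cuda::pipeline_role::producer) {
                const int strideRows = blockDim.y - kConsumerRows;
                pipe.producer_acquire();
                for (int row = threadIdx.y - kConsumerRows; row < kTile; row += strideRows)
                    cuda::memcpy_async(&stage[j][row][threadIdx.x],
                                       &in[(i * kTile + row) * kTile + threadIdx.x],
                                       sizeof(float), pipe);
                pipe.producer_commit();
            }
            else {
                pipe.consumer_wait(); // producers have filled stage[j]
                acc += stage[j][threadIdx.y][threadIdx.x];
                pipe.consumer_release();
            }
        }
        if (threadIdx.y < kConsumerRows)
            out[threadIdx.y * kTile + threadIdx.x] = acc;
    }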
template -__global__ void MatrixMulAsyncCopyMultiStageSharedState( - float *__restrict__ C, const float *__restrict__ A, - const float *__restrict__ B, int wA, int wB) { - // Multi-stage pipeline version - constexpr size_t maxPipelineStages = 4; +__global__ void MatrixMulAsyncCopyMultiStageSharedState(float *__restrict__ C, + const float *__restrict__ A, + const float *__restrict__ B, + int wA, + int wB) +{ + // Multi-stage pipeline version + constexpr size_t maxPipelineStages = 4; - // Declaration of the shared memory array As used to - // store the sub-matrix of A for each stage - __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; + // Declaration of the shared memory array As used to + // store the sub-matrix of A for each stage + __shared__ float As[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B for each stage - __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B for each stage + __shared__ float Bs[maxPipelineStages][BLOCK_SIZE_X][BLOCK_SIZE_X]; - float Csub = 0.0; + float Csub = 0.0; - // Index of the first sub-matrix of A processed by the block - const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + const int aBegin = wA * BLOCK_SIZE_X * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - const int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + const int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - constexpr int aStep = BLOCK_SIZE_X; + // Step size used to iterate through the sub-matrices of A + constexpr int aStep = BLOCK_SIZE_X; - // Index of the first sub-matrix of B processed by the block - const int bBegin = BLOCK_SIZE_X * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + const int bBegin = BLOCK_SIZE_X * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE_X * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE_X * wB; - auto cta = cg::this_thread_block(); + auto cta = cg::this_thread_block(); - const auto shape1 = cuda::aligned_size_t(sizeof(float)); - __shared__ cuda::pipeline_shared_state shared_state; - constexpr int consumer_row_count = BLOCK_SIZE_X; + const auto shape1 = cuda::aligned_size_t(sizeof(float)); + __shared__ cuda::pipeline_shared_state shared_state; + constexpr int consumer_row_count = BLOCK_SIZE_X; - const auto thread_role = (cta.thread_index().y < consumer_row_count) - ? cuda::pipeline_role::consumer - : cuda::pipeline_role::producer; - auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role); + const auto thread_role = + (cta.thread_index().y < consumer_row_count) ? 
cuda::pipeline_role::consumer : cuda::pipeline_role::producer; + auto pipe = cuda::make_pipeline(cta, &shared_state, thread_role); - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, - iStage = 0; - a <= aEnd; a += aStep, b += bStep, ++i) { - if (threadIdx.y >= consumer_row_count) { - // this is a whole producer warp because threadIdx.y >= 16 where 16 == - // consumer_row_count, - // which loads the matrices from device memory to shared memory; - for (; aStage <= a + aStep * maxPipelineStages; - aStage += aStep, bStage += bStep, ++iStage) { - if (aStage <= aEnd) { - // Rotating buffer - const int j = iStage % maxPipelineStages; - const int strideRows = (blockDim.y - consumer_row_count); - pipe.producer_acquire(); - for (int rowId = threadIdx.y - consumer_row_count; - rowId < BLOCK_SIZE_X; rowId += strideRows) { - cuda::memcpy_async(&As[j][rowId][threadIdx.x], - &A[aStage + wA * rowId + threadIdx.x], shape1, - pipe); - cuda::memcpy_async(&Bs[j][rowId][threadIdx.x], - &B[bStage + wB * rowId + threadIdx.x], shape1, - pipe); - } - pipe.producer_commit(); + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0; a <= aEnd; + a += aStep, b += bStep, ++i) { + if (threadIdx.y >= consumer_row_count) { + // this is a whole producer warp because threadIdx.y >= 16 where 16 == + // consumer_row_count, + // which loads the matrices from device memory to shared memory; + for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) { + if (aStage <= aEnd) { + // Rotating buffer + const int j = iStage % maxPipelineStages; + const int strideRows = (blockDim.y - consumer_row_count); + pipe.producer_acquire(); + for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) { + cuda::memcpy_async( + &As[j][rowId][threadIdx.x], &A[aStage + wA * rowId + threadIdx.x], shape1, pipe); + cuda::memcpy_async( + &Bs[j][rowId][threadIdx.x], &B[bStage + wB * rowId + threadIdx.x], shape1, pipe); + } + pipe.producer_commit(); + } + } } - } - } else { - // this is a whole set of consumer group because threadIdx.y < - // consumer_row_count where consumer_row_count == 16, - // which computes gemm operation on matrices loaded in shared memory by - // producer warp. - const int j = i % maxPipelineStages; - // Synchronize consumer group to make sure the matrices are loaded by - // producer group. - pipe.consumer_wait(); + else { + // this is a whole set of consumer group because threadIdx.y < + // consumer_row_count where consumer_row_count == 16, + // which computes gemm operation on matrices loaded in shared memory by + // producer warp. + const int j = i % maxPipelineStages; + // Synchronize consumer group to make sure the matrices are loaded by + // producer group. 
+ pipe.consumer_wait(); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE_X; ++k) { - Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; - } - pipe.consumer_release(); + for (int k = 0; k < BLOCK_SIZE_X; ++k) { + Csub += As[j][threadIdx.y][k] * Bs[j][k][threadIdx.x]; + } + pipe.consumer_release(); + } } - } - // Write the block sub-matrix to device memory; - // each thread writes four element - if (threadIdx.y < consumer_row_count) { - const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; - } + // Write the block sub-matrix to device memory; + // each thread writes one element + if (threadIdx.y < consumer_row_count) { + const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; + } } /** * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ -template <int BLOCK_SIZE> -__global__ void MatrixMulNaive(float *C, float *A, float *B, int wA, int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; +template <int BLOCK_SIZE> __global__ void MatrixMulNaive(float *C, float *A, float *B, int wA, int wB) +{ + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; - Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the
matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[threadIdx.y][threadIdx.x] = A[a + wA * threadIdx.y + threadIdx.x]; + Bs[threadIdx.y][threadIdx.x] = B[b + wB * threadIdx.y + threadIdx.x]; - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); } - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -template <int BLOCK_SIZE> -__global__ void MatrixMulNaiveLargeChunk(float *C, float *A, float *B, int wA, - int wB) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; +template <int BLOCK_SIZE> __global__ void MatrixMulNaiveLargeChunk(float *C, float *A, float *B, int wA, int wB) +{ + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ alignas(alignof(float4)) float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ alignas(alignof(float4)) float Bs[BLOCK_SIZE][BLOCK_SIZE]; - int t4x = threadIdx.x * 4; + int t4x = threadIdx.x * 4; - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * blockIdx.y; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * blockIdx.y; - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * blockIdx.x; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * blockIdx.x; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the
sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory - // to shared memory; + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Load the matrices from device memory + // to shared memory; - // One fourth of the threads load four elements of each matrix - if (t4x < BLOCK_SIZE) { - float4 *const A4s = reinterpret_cast<float4 *>(&As[threadIdx.y][t4x]); - float4 *const B4s = reinterpret_cast<float4 *>(&Bs[threadIdx.y][t4x]); - const float4 *const A4 = - reinterpret_cast<const float4 *>(&A[a + wA * threadIdx.y + t4x]); - const float4 *const B4 = - reinterpret_cast<const float4 *>(&B[a + wA * threadIdx.y + t4x]); - *A4s = *A4; - *B4s = *B4; - } + // One fourth of the threads load four elements of each matrix + if (t4x < BLOCK_SIZE) { + float4 *const A4s = reinterpret_cast<float4 *>(&As[threadIdx.y][t4x]); + float4 *const B4s = reinterpret_cast<float4 *>(&Bs[threadIdx.y][t4x]); + const float4 *const A4 = reinterpret_cast<const float4 *>(&A[a + wA * threadIdx.y + t4x]); + const float4 *const B4 = reinterpret_cast<const float4 *>(&B[b + wB * threadIdx.y + t4x]); + *A4s = *A4; + *B4s = *B4; + } - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); // Multiply the two matrices together; // each thread computes one element // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[threadIdx.y][k] * Bs[k][threadIdx.x]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); } - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; - C[c + wB * threadIdx.y + threadIdx.x] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -void ConstantInit(float *data, int size, float val) { - for (int i = 0; i < size; ++i) { - data[i] = val; - } +void ConstantInit(float *data, int size, float val) +{ + for (int i = 0; i < size; ++i) { + data[i] = val; + } } /** * Run matrix multiplication using CUDA */ -int MatrixMultiply(int argc, char **argv, const dim3 &dimsA, const dim3 &dimsB, - kernels kernel_number) { - // Allocate host memory for matrices A and B - unsigned int size_A = dimsA.x * dimsA.y; - unsigned int mem_size_A = sizeof(float) * size_A; - float *h_A; - checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); - unsigned int size_B = dimsB.x * dimsB.y; - unsigned int mem_size_B = sizeof(float) * size_B; - float *h_B; - checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); - cudaStream_t stream; +int MatrixMultiply(int argc, char **argv, const dim3 &dimsA, const dim3 &dimsB, kernels kernel_number) +{ + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A; + checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); + unsigned int size_B
= dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B; + checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); + cudaStream_t stream; - // Initialize host memory - const float valB = 2.10f; - ConstantInit(h_A, size_A, 1.0f); - ConstantInit(h_B, size_B, valB); + // Initialize host memory + const float valB = 2.10f; + ConstantInit(h_A, size_A, 1.0f); + ConstantInit(h_B, size_B, valB); - // Allocate device memory - float *d_A, *d_B, *d_C; + // Allocate device memory + float *d_A, *d_B, *d_C; - // Allocate host matrix C - dim3 dimsC(dimsB.x, dimsA.y, 1); - unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); - float *h_C; - checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C; + checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); - if (h_C == NULL) { - fprintf(stderr, "Failed to allocate host matrix C!\n"); - exit(EXIT_FAILURE); - } + if (h_C == NULL) { + fprintf(stderr, "Failed to allocate host matrix C!\n"); + exit(EXIT_FAILURE); + } - checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A)); - checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B)); - checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C)); - // Allocate CUDA events that we'll use for timing - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A)); + checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B)); + checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C)); + // Allocate CUDA events that we'll use for timing + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - // copy host memory to device - checkCudaErrors( - cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); - checkCudaErrors( - cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); + // copy host memory to device + checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream)); - // Setup execution parameters - dim3 threads(blockSize, blockSize); - dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + // Setup execution parameters + dim3 threads(blockSize, blockSize); + dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); - // Here the block size is 16x18, where first 16 rows are consumer thread group - // and last 2 rows (1 warp) is producer thread group - dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1); - dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, - dimsA.y / threadsSharedStateKernel.x); + // Here the block size is 16x18, where the first 16 rows are the consumer thread group + // and the last 2 rows (1 warp) are the producer thread group + dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1); + dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x, dimsA.y / threadsSharedStateKernel.x); - printf("Running kernel = %d - %s\n", kernel_number, - kernelNames[kernel_number]); - // Create and start timer - 
printf("Computing result using CUDA Kernel...\n"); + printf("Running kernel = %d - %s\n", kernel_number, kernelNames[kernel_number]); + // Create and start timer + printf("Computing result using CUDA Kernel...\n"); - // Performs warmup operation using matrixMul CUDA kernel - switch (kernel_number) { + // Performs warmup operation using matrixMul CUDA kernel + switch (kernel_number) { case AsyncCopyMultiStageLargeChunk: default: - MatrixMulAsyncCopyMultiStageLargeChunk< - blockSize><<>>(d_C, d_A, d_B, dimsA.x, - dimsB.x); - break; + MatrixMulAsyncCopyMultiStageLargeChunk + <<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case AsyncCopyLargeChunk: - MatrixMulAsyncCopyLargeChunk<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; + MatrixMulAsyncCopyLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case AsyncCopyLargeChunkAWBarrier: - MatrixMulAsyncCopyLargeChunkAWBarrier< - blockSize><<>>(d_C, d_A, d_B, dimsA.x, - dimsB.x); - break; + MatrixMulAsyncCopyLargeChunkAWBarrier<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case AsyncCopyMultiStageSharedState: - MatrixMulAsyncCopyMultiStageSharedState<<< - gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; + MatrixMulAsyncCopyMultiStageSharedState + <<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case AsyncCopyMultiStage: - MatrixMulAsyncCopyMultiStage<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; + MatrixMulAsyncCopyMultiStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case AsyncCopySingleStage: - MatrixMulAsyncCopySingleStage<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; + MatrixMulAsyncCopySingleStage<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case Naive: - MatrixMulNaive<<>>(d_C, d_A, d_B, - dimsA.x, dimsB.x); - break; + MatrixMulNaive<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; case NaiveLargeChunk: - MatrixMulNaiveLargeChunk<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - } - - printf("done\n"); - checkCudaErrors(cudaStreamSynchronize(stream)); - - // Execute the kernel - int nIter = 100; - - // Record the start event - checkCudaErrors(cudaEventRecord(start, stream)); - - for (int j = 0; j < nIter; j++) { - switch (kernel_number) { - case AsyncCopyMultiStageLargeChunk: - default: - MatrixMulAsyncCopyMultiStageLargeChunk< - blockSize><<>>(d_C, d_A, d_B, dimsA.x, - dimsB.x); - break; - case AsyncCopyLargeChunk: - MatrixMulAsyncCopyLargeChunk<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyLargeChunkAWBarrier: - MatrixMulAsyncCopyLargeChunkAWBarrier< - blockSize><<>>(d_C, d_A, d_B, dimsA.x, - dimsB.x); - break; - case AsyncCopyMultiStageSharedState: - MatrixMulAsyncCopyMultiStageSharedState<<< - gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopyMultiStage: - MatrixMulAsyncCopyMultiStage<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case AsyncCopySingleStage: - MatrixMulAsyncCopySingleStage<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case Naive: - MatrixMulNaive<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); - break; - case NaiveLargeChunk: - MatrixMulNaiveLargeChunk<<>>( - d_C, d_A, d_B, dimsA.x, dimsB.x); + MatrixMulNaiveLargeChunk<<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); break; } - } - // Record the stop event - checkCudaErrors(cudaEventRecord(stop, stream)); + printf("done\n"); + checkCudaErrors(cudaStreamSynchronize(stream)); - // Wait for the stop event to complete - checkCudaErrors(cudaEventSynchronize(stop)); + // Execute the kernel + int nIter = 100; - float 
msecTotal = 0.0f; - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); + // Record the start event + checkCudaErrors(cudaEventRecord(start, stream)); - // Compute and print the performance - float msecPerMatrixMul = msecTotal / nIter; - double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) * - static_cast<double>(dimsA.y) * - static_cast<double>(dimsB.x); - double gigaFlops = - (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); - printf( - "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," - " WorkgroupSize= %u threads/block\n", - gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); - - // Copy result from device to host - checkCudaErrors( - cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - printf("Checking computed result for correctness: "); - bool correct = true; - - // test relative error by the formula - // |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|> < eps - double eps = 1.e-6; // machine zero - - for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) { - double abs_err = fabs(h_C[i] - (dimsA.x * valB)); - double dot_length = dimsA.x; - double abs_val = fabs(h_C[i]); - double rel_err = abs_err / abs_val / dot_length; - - if (rel_err > eps) { - printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, - h_C[i], dimsA.x * valB, eps); - correct = false; + for (int j = 0; j < nIter; j++) { + switch (kernel_number) { + case AsyncCopyMultiStageLargeChunk: + default: + MatrixMulAsyncCopyMultiStageLargeChunk<blockSize> + <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunk: + MatrixMulAsyncCopyLargeChunk<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyLargeChunkAWBarrier: + MatrixMulAsyncCopyLargeChunkAWBarrier<blockSize> + <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStageSharedState: + MatrixMulAsyncCopyMultiStageSharedState<blockSize> + <<<gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopyMultiStage: + MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case AsyncCopySingleStage: + MatrixMulAsyncCopySingleStage<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case Naive: + MatrixMulNaive<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + case NaiveLargeChunk: + MatrixMulNaiveLargeChunk<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + break; + } + } - } - printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + // Record the stop event + checkCudaErrors(cudaEventRecord(stop, stream)); - // Clean up memory - checkCudaErrors(cudaFreeHost(h_A)); - checkCudaErrors(cudaFreeHost(h_B)); - checkCudaErrors(cudaFreeHost(h_C)); - checkCudaErrors(cudaFree(d_A)); - checkCudaErrors(cudaFree(d_B)); - checkCudaErrors(cudaFree(d_C)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); - printf( - "\nNOTE: The CUDA Samples are not meant for performance " - "measurements. 
Results may vary when GPU Boost is enabled.\n"); + // Wait for the stop event to complete + checkCudaErrors(cudaEventSynchronize(stop)); - if (correct) { - return EXIT_SUCCESS; - } else { - return EXIT_FAILURE; - } + float msecTotal = 0.0f; + checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); + + // Compute and print the performance + float msecPerMatrixMul = msecTotal / nIter; + double flopsPerMatrixMul = + 2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x); + double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); + printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," + " WorkgroupSize= %u threads/block\n", + gigaFlops, + msecPerMatrixMul, + flopsPerMatrixMul, + threads.x * threads.y); + + // Copy result from device to host + checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + printf("Checking computed result for correctness: "); + bool correct = true; + + // test relative error by the formula + // |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|> < eps + double eps = 1.e-6; // machine zero + + for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) { + double abs_err = fabs(h_C[i] - (dimsA.x * valB)); + double dot_length = dimsA.x; + double abs_val = fabs(h_C[i]); + double rel_err = abs_err / abs_val / dot_length; + + if (rel_err > eps) { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps); + correct = false; + } + } + + printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + + // Clean up memory + checkCudaErrors(cudaFreeHost(h_A)); + checkCudaErrors(cudaFreeHost(h_B)); + checkCudaErrors(cudaFreeHost(h_C)); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + printf("\nNOTE: The CUDA Samples are not meant for performance " + "measurements. 
Results may vary when GPU Boost is enabled.\n"); + + if (correct) { + return EXIT_SUCCESS; + } + else { + return EXIT_FAILURE; + } } -int main(int argc, char **argv) { - printf("[globalToShmemAsyncCopy] - Starting...\n"); +int main(int argc, char **argv) +{ + printf("[globalToShmemAsyncCopy] - Starting...\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) { - printf("Usage -device=n (n >= 0 for deviceID)\n"); - printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); - printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); - printf( - " -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - " - "AsyncCopyLargeChunk)\n"); - printf( - " (2 - AsyncCopyLargeChunkAWBarrier; 3 - " - "AsyncCopyMultiStageSharedState)\n"); - printf( - " (4 - AsyncCopyMultiStage; 5 - " - "AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); - printf( - " (7 - NaiveLargeChunk without " - "memcpy_async)\n"); - printf( - " Note: Outer matrix dimensions of A & B matrices must be equal.\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); + printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); + printf(" -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - " + "AsyncCopyLargeChunk)\n"); + printf(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - " + "AsyncCopyMultiStageSharedState)\n"); + printf(" (4 - AsyncCopyMultiStage; 5 - " + "AsyncCopySingleStage; 6 - Naive without memcpy_async)\n"); + printf(" (7 - NaiveLargeChunk without " + "memcpy_async)\n"); + printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); - exit(EXIT_SUCCESS); - } - - // This will pick the best possible CUDA capable device, otherwise - // override the device ID based on input provided at the command line - int dev = findCudaDevice(argc, (const char **)argv); - - int matrixBlock = 32; - dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); - dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); - - // width of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { - dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); - } - - // height of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { - dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); - } - - // width of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { - dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); - } - - // height of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { - dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); - } - - if (dimsA.x != dimsB.y) { - printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", - dimsA.x, dimsB.y); - exit(EXIT_FAILURE); - } - - kernels selected_kernel = AsyncCopyMultiStageLargeChunk; - - // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) - if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { - int kernel_number = - getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); - if (kernel_number < 8) { - selected_kernel = (kernels)kernel_number; - } else { - printf( - "Error: kernel number should be between 0 to 6, you have entered " - "%d\n", - kernel_number); - exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); } - } - int major = 0; - checkCudaErrors( - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - if (major < 7) { - printf("globalToShmemAsyncCopy requires SM 7.0 or higher. Exiting...\n"); - exit(EXIT_WAIVED); - } + // This will pick the best possible CUDA capable device, otherwise + // override the device ID based on input provided at the command line + int dev = findCudaDevice(argc, (const char **)argv); - printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, - dimsB.y); + int matrixBlock = 32; + dim3 dimsA(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); + dim3 dimsB(10 * 4 * matrixBlock, 10 * 4 * matrixBlock, 1); - int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); + // width of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { + dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + } - exit(matrix_result); + // height of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { + dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + } + + // width of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { + dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + } + + // height of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { + dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + } + + if (dimsA.x != dimsB.y) { + printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y); + exit(EXIT_FAILURE); + } + + kernels selected_kernel = AsyncCopyMultiStageLargeChunk; + + // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + int kernel_number = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + if (kernel_number < 8) { + selected_kernel = (kernels)kernel_number; + } + else { + printf("Error: kernel number should be between 0 to 6, you have entered " + "%d\n", + kernel_number); + exit(EXIT_FAILURE); + } + } + + int major = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + if (major < 7) { + printf("globalToShmemAsyncCopy requires SM 7.0 or higher. 
Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); + + int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel); + + exit(matrix_result); } diff --git a/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu b/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu index 9fb6deb0..5fbdbfbe 100644 --- a/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu +++ b/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu @@ -66,15 +66,12 @@ __global__ void ifGraphKernelA(char *dPtr, cudaGraphConditionalHandle handle) } // This kernel will only be executed if the condition is true -__global__ void ifGraphKernelC(void) -{ - printf("GPU: Hello from the GPU! The condition was true.\n"); -} +__global__ void ifGraphKernelC(void) { printf("GPU: Hello from the GPU! The condition was true.\n"); } // Setup and launch the graph void simpleIfGraph(void) { - cudaGraph_t graph; + cudaGraph_t graph; cudaGraphExec_t graphExec; cudaGraphNode_t kernelNode; cudaGraphNode_t conditionalNode; @@ -94,25 +91,25 @@ void simpleIfGraph(void) // Use a kernel upstream of the conditional to set the handle value cudaGraphNodeParams params = {cudaGraphNodeTypeKernel}; - params.kernel.func = (void *)ifGraphKernelA; + params.kernel.func = (void *)ifGraphKernelA; params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1; params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1; - params.kernel.kernelParams = kernelArgs; - kernelArgs[0] = &dPtr; - kernelArgs[1] = &handle; + params.kernel.kernelParams = kernelArgs; + kernelArgs[0] = &dPtr; + kernelArgs[1] = &handle; checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, ¶ms)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; - cParams.conditional.handle = handle; - cParams.conditional.type = cudaGraphCondTypeIf; - cParams.conditional.size = 1; + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeIf; + cParams.conditional.size = 1; checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; // Populate the body of the conditional node cudaGraphNode_t bodyNode; - params.kernel.func = (void *)ifGraphKernelC; + params.kernel.func = (void *)ifGraphKernelC; params.kernel.kernelParams = nullptr; checkCudaErrors(cudaGraphAddNode(&bodyNode, bodyGraph, NULL, 0, ¶ms)); @@ -159,8 +156,7 @@ __global__ void doWhileEmptyKernel(void) __global__ void doWhileLoopKernel(char *dPtr, cudaGraphConditionalHandle handle) { - if (--(*dPtr) == 0) - { + if (--(*dPtr) == 0) { cudaGraphSetConditional(handle, 0); } printf("GPU: counter = %d\n", *dPtr); @@ -168,7 +164,7 @@ __global__ void doWhileLoopKernel(char *dPtr, cudaGraphConditionalHandle handle) void simpleDoWhileGraph(void) { - cudaGraph_t graph; + cudaGraph_t graph; cudaGraphExec_t graphExec; cudaGraphNode_t conditionalNode; @@ -183,9 +179,9 @@ void simpleDoWhileGraph(void) checkCudaErrors(cudaGraphConditionalHandleCreate(&handle, graph, 1, cudaGraphCondAssignDefault)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; - cParams.conditional.handle = handle; - cParams.conditional.type = cudaGraphCondTypeWhile; - cParams.conditional.size = 1; + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeWhile; + cParams.conditional.size = 1; 
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, NULL, 0, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; @@ -193,7 +189,8 @@ void simpleDoWhileGraph(void) cudaStream_t captureStream; checkCudaErrors(cudaStreamCreate(&captureStream)); - checkCudaErrors(cudaStreamBeginCaptureToGraph(captureStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeGlobal)); + checkCudaErrors( + cudaStreamBeginCaptureToGraph(captureStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeGlobal)); doWhileEmptyKernel<<<1, 1, 0, captureStream>>>(); doWhileEmptyKernel<<<1, 1, 0, captureStream>>>(); doWhileLoopKernel<<<1, 1, 0, captureStream>>>(dPtr, handle); @@ -238,8 +235,7 @@ void simpleDoWhileGraph(void) __global__ void capturedWhileKernel(char *dPtr, cudaGraphConditionalHandle handle) { printf("GPU: counter = %d\n", *dPtr); - if (*dPtr) - { + if (*dPtr) { (*dPtr)--; } cudaGraphSetConditional(handle, *dPtr); @@ -253,12 +249,12 @@ __global__ void capturedWhileEmptyKernel(void) void capturedWhileGraph(void) { - cudaGraph_t graph; + cudaGraph_t graph; cudaGraphExec_t graphExec; cudaStreamCaptureStatus status; - const cudaGraphNode_t *dependencies; - size_t numDependencies; + const cudaGraphNode_t *dependencies; + size_t numDependencies; // Allocate a byte of device memory to use as input char *dPtr; @@ -284,17 +280,18 @@ void capturedWhileGraph(void) checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies)); // Insert conditional node B - cudaGraphNode_t conditionalNode; + cudaGraphNode_t conditionalNode; cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; - cParams.conditional.handle = handle; - cParams.conditional.type = cudaGraphCondTypeWhile; - cParams.conditional.size = 1; + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeWhile; + cParams.conditional.size = 1; checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, dependencies, numDependencies, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; // Update stream capture dependencies to account for the node we manually added - checkCudaErrors(cudaStreamUpdateCaptureDependencies(captureStream, &conditionalNode, 1, cudaStreamSetCaptureDependencies)); + checkCudaErrors( + cudaStreamUpdateCaptureDependencies(captureStream, &conditionalNode, 1, cudaStreamSetCaptureDependencies)); // Insert kernel node D capturedWhileEmptyKernel<<<1, 1, 0, captureStream>>>(); @@ -306,7 +303,8 @@ void capturedWhileGraph(void) cudaStream_t bodyStream; checkCudaErrors(cudaStreamCreate(&bodyStream)); - checkCudaErrors(cudaStreamBeginCaptureToGraph(bodyStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeGlobal)); + checkCudaErrors( + cudaStreamBeginCaptureToGraph(bodyStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeGlobal)); // Insert kernel node C capturedWhileKernel<<<1, 1, 0, bodyStream>>>(dPtr, handle); @@ -351,15 +349,12 @@ void capturedWhileGraph(void) */ // This kernel will only be executed if the condition is false -__global__ void ifGraphKernelD(void) -{ - printf("GPU: Hello from the GPU! The condition was false.\n"); -} +__global__ void ifGraphKernelD(void) { printf("GPU: Hello from the GPU! 
The condition was false.\n"); } // Setup and launch the graph void simpleIfElseGraph(void) { - cudaGraph_t graph; + cudaGraph_t graph; cudaGraphExec_t graphExec; cudaGraphNode_t kernelNode; cudaGraphNode_t conditionalNode; @@ -379,25 +374,25 @@ void simpleIfElseGraph(void) // Use a kernel upstream of the conditional to set the handle value cudaGraphNodeParams params = {cudaGraphNodeTypeKernel}; - params.kernel.func = (void *)ifGraphKernelA; + params.kernel.func = (void *)ifGraphKernelA; params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1; params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1; - params.kernel.kernelParams = kernelArgs; - kernelArgs[0] = &dPtr; - kernelArgs[1] = &handle; + params.kernel.kernelParams = kernelArgs; + kernelArgs[0] = &dPtr; + kernelArgs[1] = &handle; checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, ¶ms)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; - cParams.conditional.handle = handle; - cParams.conditional.type = cudaGraphCondTypeIf; - cParams.conditional.size = 2; // Set size to 2 to indicate an ELSE graph will be used + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeIf; + cParams.conditional.size = 2; // Set size to 2 to indicate an ELSE graph will be used checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; // Populate the body of the first graph in the conditional node, executed if the condition is true cudaGraphNode_t trueBodyNode; - params.kernel.func = (void *)ifGraphKernelC; + params.kernel.func = (void *)ifGraphKernelC; params.kernel.kernelParams = nullptr; checkCudaErrors(cudaGraphAddNode(&trueBodyNode, bodyGraph, NULL, 0, ¶ms)); @@ -405,7 +400,7 @@ void simpleIfElseGraph(void) bodyGraph = cParams.conditional.phGraph_out[1]; cudaGraphNode_t falseBodyNode; - params.kernel.func = (void *)ifGraphKernelD; + params.kernel.func = (void *)ifGraphKernelD; params.kernel.kernelParams = nullptr; checkCudaErrors(cudaGraphAddNode(&falseBodyNode, bodyGraph, NULL, 0, ¶ms)); @@ -452,30 +447,18 @@ __global__ void switchGraphKernelA(char *dPtr, cudaGraphConditionalHandle handle printf("GPU: Handle set to %d\n", value); } -__global__ void switchGraphKernelC(void) -{ - printf("GPU: Hello from switchGraphKernelC(), running on the GPU!\n"); -} +__global__ void switchGraphKernelC(void) { printf("GPU: Hello from switchGraphKernelC(), running on the GPU!\n"); } -__global__ void switchGraphKernelD(void) -{ - printf("GPU: Hello from switchGraphKernelD(), running on the GPU!\n"); -} +__global__ void switchGraphKernelD(void) { printf("GPU: Hello from switchGraphKernelD(), running on the GPU!\n"); } -__global__ void switchGraphKernelE(void) -{ - printf("GPU: Hello from switchGraphKernelE(), running on the GPU!\n"); -} +__global__ void switchGraphKernelE(void) { printf("GPU: Hello from switchGraphKernelE(), running on the GPU!\n"); } -__global__ void switchGraphKernelF(void) -{ - printf("GPU: Hello from switchGraphKernelF(), running on the GPU!\n"); -} +__global__ void switchGraphKernelF(void) { printf("GPU: Hello from switchGraphKernelF(), running on the GPU!\n"); } // Setup and launch the graph void simpleSwitchGraph(void) { - cudaGraph_t graph; + cudaGraph_t graph; cudaGraphExec_t graphExec; cudaGraphNode_t kernelNode; cudaGraphNode_t conditionalNode; @@ -495,24 +478,24 @@ void simpleSwitchGraph(void) // Use a kernel upstream of the conditional to set the handle 
value cudaGraphNodeParams params = {cudaGraphNodeTypeKernel}; - params.kernel.func = (void *)switchGraphKernelA; + params.kernel.func = (void *)switchGraphKernelA; params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1; params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1; - params.kernel.kernelParams = kernelArgs; - kernelArgs[0] = &dPtr; - kernelArgs[1] = &handle; + params.kernel.kernelParams = kernelArgs; + kernelArgs[0] = &dPtr; + kernelArgs[1] = &handle; checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, &params)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; - cParams.conditional.handle = handle; - cParams.conditional.type = cudaGraphCondTypeSwitch; - cParams.conditional.size = 4; + cParams.conditional.handle = handle; + cParams.conditional.type = cudaGraphCondTypeSwitch; + cParams.conditional.size = 4; checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams)); // Populate the four graph bodies within the SWITCH conditional graph cudaGraphNode_t bodyNode; params.kernel.kernelParams = nullptr; - params.kernel.func = (void *)switchGraphKernelC; + params.kernel.func = (void *)switchGraphKernelC; checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[0], NULL, 0, &params)); params.kernel.func = (void *)switchGraphKernelD; checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[1], NULL, 0, &params)); @@ -523,8 +506,7 @@ void simpleSwitchGraph(void) checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - for (char i = 0; i < 5; i++) - { + for (char i = 0; i < 5; i++) { // Initialize device memory and launch the graph checkCudaErrors(cudaMemset(dPtr, i, 1)); printf("Host: Launching graph with device memory set to %d\n", i); @@ -547,11 +529,9 @@ int main(int argc, char **argv) int driverVersion = 0; cudaDriverGetVersion(&driverVersion); - printf("Driver version is: %d.%d\n", driverVersion / 1000, - (driverVersion % 100) / 10); + printf("Driver version is: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); - if (driverVersion < 12030) - { + if (driverVersion < 12030) { printf("Skipping execution as driver does not support Graph Conditional Nodes\n"); return 0; } @@ -560,8 +540,7 @@ int main(int argc, char **argv) simpleDoWhileGraph(); capturedWhileGraph(); - if (driverVersion < 12080) - { + if (driverVersion < 12080) { printf("Skipping execution as driver does not support if/else and switch type Graph Conditional Nodes\n"); return 0; } diff --git a/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu b/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu index 8ed66b09..84fa45f1 100644 --- a/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu +++ b/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu @@ -33,375 +33,358 @@ #include #include -#define NUM_GRAPHS 8 +#define NUM_GRAPHS 8 #define THREADS_PER_BLOCK 512 -void printMemoryFootprint(int device) { - size_t footprint; - checkCudaErrors(cudaDeviceGetGraphMemAttribute( - device, (cudaGraphMemAttributeType)0, &footprint)); - printf(" FOOTPRINT: %lu bytes\n", footprint); +void printMemoryFootprint(int device) +{ + size_t footprint; + checkCudaErrors(cudaDeviceGetGraphMemAttribute(device, (cudaGraphMemAttributeType)0, &footprint)); + printf(" FOOTPRINT: %lu bytes\n", footprint); } -void prepareAllocParams(cudaMemAllocNodeParams *allocParams, size_t bytes, - int device) { - memset(allocParams, 0, 
sizeof(*allocParams)); +void prepareAllocParams(cudaMemAllocNodeParams *allocParams, size_t bytes, int device) +{ + memset(allocParams, 0, sizeof(*allocParams)); - allocParams->bytesize = bytes; - allocParams->poolProps.allocType = cudaMemAllocationTypePinned; - allocParams->poolProps.location.id = device; - allocParams->poolProps.location.type = cudaMemLocationTypeDevice; + allocParams->bytesize = bytes; + allocParams->poolProps.allocType = cudaMemAllocationTypePinned; + allocParams->poolProps.location.id = device; + allocParams->poolProps.location.type = cudaMemLocationTypeDevice; } -void createVirtAddrReuseGraph(cudaGraphExec_t *graphExec, size_t bytes, - int device) { - cudaGraph_t graph; - cudaGraphNode_t allocNodeA, allocNodeB, freeNodeA, freeNodeB; - cudaMemAllocNodeParams allocParams; - float *d_a, *d_b; +void createVirtAddrReuseGraph(cudaGraphExec_t *graphExec, size_t bytes, int device) +{ + cudaGraph_t graph; + cudaGraphNode_t allocNodeA, allocNodeB, freeNodeA, freeNodeB; + cudaMemAllocNodeParams allocParams; + float *d_a, *d_b; - checkCudaErrors(cudaGraphCreate(&graph, 0)); - prepareAllocParams(&allocParams, bytes, device); + checkCudaErrors(cudaGraphCreate(&graph, 0)); + prepareAllocParams(&allocParams, bytes, device); - checkCudaErrors( - cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); - d_a = (float *)allocParams.dptr; - checkCudaErrors( - cudaGraphAddMemFreeNode(&freeNodeA, graph, &allocNodeA, 1, (void *)d_a)); + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); + d_a = (float *)allocParams.dptr; + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &allocNodeA, 1, (void *)d_a)); - // The dependency between the allocation of d_b and the free of d_a allows d_b - // to reuse the same VA. - checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeB, graph, &freeNodeA, 1, - &allocParams)); - d_b = (float *)allocParams.dptr; + // The dependency between the allocation of d_b and the free of d_a allows d_b + // to reuse the same VA. 
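+ // The same lifetime reasoning applies to the stream-ordered allocator
+ // outside of graphs: freeing and then reallocating in stream order lets
+ // the driver hand back the same address. A minimal sketch, assuming an
+ // existing stream and the byte count used above (illustrative only, not
+ // part of this sample):
+ #if 0
+ float *p1 = NULL, *p2 = NULL;
+ checkCudaErrors(cudaMallocAsync(reinterpret_cast<void **>(&p1), bytes, stream));
+ checkCudaErrors(cudaFreeAsync(p1, stream));
+ // Ordered after the free in the same stream, so p2 may equal p1.
+ checkCudaErrors(cudaMallocAsync(reinterpret_cast<void **>(&p2), bytes, stream));
+ checkCudaErrors(cudaFreeAsync(p2, stream));
+ #endif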
+ checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeB, graph, &freeNodeA, 1, &allocParams)); + d_b = (float *)allocParams.dptr; - if (d_a == d_b) { - printf("Check confirms that d_a and d_b share a virtual address.\n"); - } else { - printf("Check shows that d_a and d_b DO NOT share a virtual address.\n"); - } + if (d_a == d_b) { + printf("Check confirms that d_a and d_b share a virtual address.\n"); + } + else { + printf("Check shows that d_a and d_b DO NOT share a virtual address.\n"); + } - checkCudaErrors( - cudaGraphAddMemFreeNode(&freeNodeB, graph, &allocNodeB, 1, (void *)d_b)); + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeB, graph, &allocNodeB, 1, (void *)d_b)); - checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); } -void virtualAddressReuseSingleGraph(size_t bytes, int device) { - cudaStream_t stream; - cudaGraphExec_t graphExec; +void virtualAddressReuseSingleGraph(size_t bytes, int device) +{ + cudaStream_t stream; + cudaGraphExec_t graphExec; - printf("================================\n"); - printf("Running virtual address reuse example.\n"); - printf( - "Sequential allocations & frees within a single graph enable CUDA to " - "reuse virtual addresses.\n\n"); + printf("================================\n"); + printf("Running virtual address reuse example.\n"); + printf("Sequential allocations & frees within a single graph enable CUDA to " + "reuse virtual addresses.\n\n"); - createVirtAddrReuseGraph(&graphExec, bytes, device); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + createVirtAddrReuseGraph(&graphExec, bytes, device); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printMemoryFootprint(device); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printMemoryFootprint(device); - checkCudaErrors(cudaGraphExecDestroy(graphExec)); - checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaStreamDestroy(stream)); } // This is a kernel that does no real work but runs at least for a specified // number of clocks -__global__ void clockBlock(clock_t clock_count) { - unsigned int start_clock = (unsigned int)clock(); +__global__ void clockBlock(clock_t clock_count) +{ + unsigned int start_clock = (unsigned int)clock(); - clock_t clock_offset = 0; + clock_t clock_offset = 0; - while (clock_offset < clock_count) { - unsigned int end_clock = (unsigned int)clock(); + while (clock_offset < clock_count) { + unsigned int end_clock = (unsigned int)clock(); - // The code below should work like - // this (thanks to modular arithmetics): - // - // clock_offset = (clock_t) (end_clock > start_clock ? - // end_clock - start_clock : - // end_clock + (0xffffffffu - start_clock)); - // - // Indeed, let m = 2^32 then - // end - start = end + m - start (mod m). + // The code below should work like + // this (thanks to modular arithmetics): + // + // clock_offset = (clock_t) (end_clock > start_clock ? + // end_clock - start_clock : + // end_clock + (0xffffffffu - start_clock)); + // + // Indeed, let m = 2^32 then + // end - start = end + m - start (mod m). 
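+ // Worked example of the wraparound case: with start_clock = 0xFFFFFFF0
+ // and end_clock = 0x00000010, the unsigned difference is
+ // 0x10 - 0xFFFFFFF0 = 0x20 (mod 2^32), i.e. 32 ticks, which is exactly
+ // the elapsed count across the 32-bit counter rollover.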
- clock_offset = (clock_t)(end_clock - start_clock); - } + clock_offset = (clock_t)(end_clock - start_clock); + } } // A pointer to the allocated device buffer is returned in dPtr so the caller // can compare virtual addresses. The kernel node is added to increase the // graph's runtime. -void createSimpleAllocFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, - size_t bytes, int device) { - cudaGraph_t graph; - cudaGraphNode_t allocNodeA, freeNodeA, blockDeviceNode; - cudaMemAllocNodeParams allocParams; - cudaKernelNodeParams blockDeviceNodeParams = {0}; - int numElements = bytes / sizeof(float); - float kernelTime = 5; // time for each thread to run in microseconds +void createSimpleAllocFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, size_t bytes, int device) +{ + cudaGraph_t graph; + cudaGraphNode_t allocNodeA, freeNodeA, blockDeviceNode; + cudaMemAllocNodeParams allocParams; + cudaKernelNodeParams blockDeviceNodeParams = {0}; + int numElements = bytes / sizeof(float); + float kernelTime = 5; // time for each thread to run in microseconds - checkCudaErrors(cudaGraphCreate(&graph, 0)); - prepareAllocParams(&allocParams, bytes, device); + checkCudaErrors(cudaGraphCreate(&graph, 0)); + prepareAllocParams(&allocParams, bytes, device); - checkCudaErrors( - cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); - *dPtr = (float *)allocParams.dptr; + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); + *dPtr = (float *)allocParams.dptr; - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); - clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * deviceProp.clockRate); + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); + clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * deviceProp.clockRate); - void *blockDeviceArgs[1] = {(void *)&time_clocks}; + void *blockDeviceArgs[1] = {(void *)&time_clocks}; - size_t numBlocks = numElements / (size_t)THREADS_PER_BLOCK; - blockDeviceNodeParams.gridDim = dim3(numBlocks, 1, 1); - blockDeviceNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); - blockDeviceNodeParams.sharedMemBytes = 0; - blockDeviceNodeParams.extra = NULL; - blockDeviceNodeParams.func = (void *)clockBlock; - blockDeviceNodeParams.kernelParams = (void **)blockDeviceArgs; - checkCudaErrors(cudaGraphAddKernelNode(&blockDeviceNode, graph, &allocNodeA, - 1, &blockDeviceNodeParams)); + size_t numBlocks = numElements / (size_t)THREADS_PER_BLOCK; + blockDeviceNodeParams.gridDim = dim3(numBlocks, 1, 1); + blockDeviceNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + blockDeviceNodeParams.sharedMemBytes = 0; + blockDeviceNodeParams.extra = NULL; + blockDeviceNodeParams.func = (void *)clockBlock; + blockDeviceNodeParams.kernelParams = (void **)blockDeviceArgs; + checkCudaErrors(cudaGraphAddKernelNode(&blockDeviceNode, graph, &allocNodeA, 1, &blockDeviceNodeParams)); - checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &blockDeviceNode, - 1, (void *)*dPtr)); + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeA, graph, &blockDeviceNode, 1, (void *)*dPtr)); - checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); } -void physicalMemoryReuseSingleStream(size_t bytes, int device) { - cudaStream_t stream; - cudaGraphExec_t graphExecs[NUM_GRAPHS]; - float *dPtrs[NUM_GRAPHS]; - 
bool virtualAddrDiffer = true; +void physicalMemoryReuseSingleStream(size_t bytes, int device) +{ + cudaStream_t stream; + cudaGraphExec_t graphExecs[NUM_GRAPHS]; + float *dPtrs[NUM_GRAPHS]; + bool virtualAddrDiffer = true; - printf("================================\n"); - printf("Running physical memory reuse example.\n"); - printf( - "CUDA reuses the same physical memory for allocations from separate " - "graphs when the allocation lifetimes don't overlap.\n\n"); + printf("================================\n"); + printf("Running physical memory reuse example.\n"); + printf("CUDA reuses the same physical memory for allocations from separate " + "graphs when the allocation lifetimes don't overlap.\n\n"); - for (int i = 0; i < NUM_GRAPHS; i++) { - createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); - } - - printf("Creating the graph execs does not reserve any physical memory.\n"); - printMemoryFootprint(device); - - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream)); - printf("\nThe first graph launched reserves the memory it needs.\n"); - printMemoryFootprint(device); - - checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream)); - printf( - "A subsequent launch of the same graph in the same stream reuses the " - "same physical memory. "); - printf("Thus the memory footprint does not grow here.\n"); - printMemoryFootprint(device); - - printf( - "\nSubsequent launches of other graphs in the same stream also reuse the " - "physical memory. "); - printf("Thus the memory footprint does not grow here.\n"); - for (int i = 1; i < NUM_GRAPHS; i++) { - checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream)); - printf("%02d: ", i); - printMemoryFootprint(device); - } - - checkCudaErrors(cudaStreamSynchronize(stream)); - - for (int i = 0; i < NUM_GRAPHS; i++) { - for (int j = i + 1; j < NUM_GRAPHS; j++) { - if (dPtrs[i] == dPtrs[j]) { - virtualAddrDiffer = false; - printf("Error: Graph exec %d and %d have the same virtual address!\n", - i - 1, i); - } + for (int i = 0; i < NUM_GRAPHS; i++) { + createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); } - checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); - } - if (virtualAddrDiffer) { - printf("\nCheck confirms all graphs use a different virtual address.\n"); - } else { - printf( - "\nAll graphs do NOT use different virtual addresses. Exiting test.\n"); - exit(EXIT_FAILURE); - } - checkCudaErrors(cudaStreamDestroy(stream)); -} - -void simultaneousStreams(size_t bytes, int device) { - cudaStream_t streams[NUM_GRAPHS]; - cudaGraphExec_t graphExecs[NUM_GRAPHS]; - float *dPtrs[NUM_GRAPHS]; - - printf("================================\n"); - printf("Running simultaneous streams example.\n"); - printf("Graphs that can run concurrently need separate physical memory. "); - printf( - "In this example, each graph launched in a separate stream increases the " - "total memory footprint.\n\n"); - - printf( - "When launching a new graph, CUDA may reuse physical memory from a graph " - "whose execution has already "); - printf( - "finished -- even if the new graph is being launched in a different " - "stream from the completed graph. 
"); - printf( - "Therefore, a kernel node is added to the graphs to increase " - "runtime.\n\n"); - - for (int i = 0; i < NUM_GRAPHS; i++) { - createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); - checkCudaErrors( - cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking)); - } - - printf("Initial footprint:\n"); - printMemoryFootprint(device); - - printf( - "\nEach graph launch in a seperate stream grows the memory footprint:\n"); - for (int i = 1; i < NUM_GRAPHS; i++) { - checkCudaErrors(cudaGraphLaunch(graphExecs[i], streams[i])); - printf("%02d: ", i); + printf("Creating the graph execs does not reserve any physical memory.\n"); printMemoryFootprint(device); - } - for (int i = 0; i < NUM_GRAPHS; i++) { - checkCudaErrors(cudaStreamSynchronize(streams[i])); - checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); - checkCudaErrors(cudaStreamDestroy(streams[i])); - } -} + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); -void createSimpleAllocNoFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, - size_t bytes, int device) { - cudaGraph_t graph; - cudaGraphNode_t allocNodeA; - cudaMemAllocNodeParams allocParams; - - checkCudaErrors(cudaGraphCreate(&graph, 0)); - prepareAllocParams(&allocParams, bytes, device); - - checkCudaErrors( - cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); - *dPtr = (float *)allocParams.dptr; - - checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaGraphDestroy(graph)); -} - -void unfreedAllocations(size_t bytes, int device) { - cudaStream_t stream; - cudaGraphExec_t graphExecs[NUM_GRAPHS]; - float *dPtrs[NUM_GRAPHS]; - - printf("================================\n"); - printf("Running unfreed streams example.\n"); - printf( - "CUDA cannot reuse phyiscal memory from graphs which do not free their " - "allocations.\n\n"); - - for (int i = 0; i < NUM_GRAPHS; i++) { - createSimpleAllocNoFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); - } - - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - printf( - "Despite being launched in the same stream, each graph launch grows the " - "memory footprint. "); - printf( - "Since the allocation is not freed, CUDA keeps the memory valid for " - "use.\n"); - for (int i = 0; i < NUM_GRAPHS; i++) { - checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream)); - printf("%02d: ", i); + checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream)); + printf("\nThe first graph launched reserves the memory it needs.\n"); printMemoryFootprint(device); - } - checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaGraphLaunch(graphExecs[0], stream)); + printf("A subsequent launch of the same graph in the same stream reuses the " + "same physical memory. "); + printf("Thus the memory footprint does not grow here.\n"); + printMemoryFootprint(device); - checkCudaErrors(cudaDeviceGraphMemTrim(device)); - printf( - "\nTrimming does not impact the memory footprint since the un-freed " - "allocations are still holding onto the memory.\n"); - printMemoryFootprint(device); + printf("\nSubsequent launches of other graphs in the same stream also reuse the " + "physical memory. 
"); + printf("Thus the memory footprint does not grow here.\n"); + for (int i = 1; i < NUM_GRAPHS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream)); + printf("%02d: ", i); + printMemoryFootprint(device); + } - for (int i = 0; i < NUM_GRAPHS; i++) { - checkCudaErrors(cudaFree(dPtrs[i])); - } - printf("\nFreeing the allocations does not shrink the footprint.\n"); - printMemoryFootprint(device); + checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaDeviceGraphMemTrim(device)); - printf( - "\nSince the allocations are now freed, trimming does reduce the " - "footprint even when the graph execs are not yet destroyed.\n"); - printMemoryFootprint(device); + for (int i = 0; i < NUM_GRAPHS; i++) { + for (int j = i + 1; j < NUM_GRAPHS; j++) { + if (dPtrs[i] == dPtrs[j]) { + virtualAddrDiffer = false; + printf("Error: Graph exec %d and %d have the same virtual address!\n", i - 1, i); + } + } + checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); + } + if (virtualAddrDiffer) { + printf("\nCheck confirms all graphs use a different virtual address.\n"); + } + else { + printf("\nAll graphs do NOT use different virtual addresses. Exiting test.\n"); + exit(EXIT_FAILURE); + } - for (int i = 0; i < NUM_GRAPHS; i++) { - checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); - } - checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaStreamDestroy(stream)); } -void cleanupMemory(int device) { - checkCudaErrors(cudaDeviceGraphMemTrim(device)); - printf("\nCleaning up example by trimming device memory.\n"); - printMemoryFootprint(device); - printf("\n"); +void simultaneousStreams(size_t bytes, int device) +{ + cudaStream_t streams[NUM_GRAPHS]; + cudaGraphExec_t graphExecs[NUM_GRAPHS]; + float *dPtrs[NUM_GRAPHS]; + + printf("================================\n"); + printf("Running simultaneous streams example.\n"); + printf("Graphs that can run concurrently need separate physical memory. "); + printf("In this example, each graph launched in a separate stream increases the " + "total memory footprint.\n\n"); + + printf("When launching a new graph, CUDA may reuse physical memory from a graph " + "whose execution has already "); + printf("finished -- even if the new graph is being launched in a different " + "stream from the completed graph. 
"); + printf("Therefore, a kernel node is added to the graphs to increase " + "runtime.\n\n"); + + for (int i = 0; i < NUM_GRAPHS; i++) { + createSimpleAllocFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); + checkCudaErrors(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking)); + } + + printf("Initial footprint:\n"); + printMemoryFootprint(device); + + printf("\nEach graph launch in a seperate stream grows the memory footprint:\n"); + for (int i = 1; i < NUM_GRAPHS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExecs[i], streams[i])); + printf("%02d: ", i); + printMemoryFootprint(device); + } + + for (int i = 0; i < NUM_GRAPHS; i++) { + checkCudaErrors(cudaStreamSynchronize(streams[i])); + checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); + checkCudaErrors(cudaStreamDestroy(streams[i])); + } } -int main(int argc, char **argv) { - size_t bytes = 64 * 1024 * 1024; - int device = findCudaDevice(argc, (const char **)argv); +void createSimpleAllocNoFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, size_t bytes, int device) +{ + cudaGraph_t graph; + cudaGraphNode_t allocNodeA; + cudaMemAllocNodeParams allocParams; - int driverVersion = 0; - int deviceSupportsMemoryPools = 0; + checkCudaErrors(cudaGraphCreate(&graph, 0)); + prepareAllocParams(&allocParams, bytes, device); - cudaDriverGetVersion(&driverVersion); - printf("Driver version is: %d.%d\n", driverVersion / 1000, - (driverVersion % 100) / 10); + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); + *dPtr = (float *)allocParams.dptr; - if (driverVersion < 11040) { - printf("Waiving execution as driver does not support Graph Memory Nodes\n"); - exit(EXIT_WAIVED); - } - - cudaDeviceGetAttribute(&deviceSupportsMemoryPools, - cudaDevAttrMemoryPoolsSupported, device); - if (!deviceSupportsMemoryPools) { - printf("Waiving execution as device does not support Memory Pools\n"); - exit(EXIT_WAIVED); - } else { - printf("Running sample.\n"); - } - - virtualAddressReuseSingleGraph(bytes, device); - cleanupMemory(device); - - physicalMemoryReuseSingleStream(bytes, device); - cleanupMemory(device); - - simultaneousStreams(bytes, device); - cleanupMemory(device); - - unfreedAllocations(bytes, device); - cleanupMemory(device); - - printf("================================\n"); - printf("Sample complete.\n"); + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); +} + +void unfreedAllocations(size_t bytes, int device) +{ + cudaStream_t stream; + cudaGraphExec_t graphExecs[NUM_GRAPHS]; + float *dPtrs[NUM_GRAPHS]; + + printf("================================\n"); + printf("Running unfreed streams example.\n"); + printf("CUDA cannot reuse phyiscal memory from graphs which do not free their " + "allocations.\n\n"); + + for (int i = 0; i < NUM_GRAPHS; i++) { + createSimpleAllocNoFreeGraph(&graphExecs[i], &dPtrs[i], bytes, device); + } + + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + printf("Despite being launched in the same stream, each graph launch grows the " + "memory footprint. 
"); + printf("Since the allocation is not freed, CUDA keeps the memory valid for " + "use.\n"); + for (int i = 0; i < NUM_GRAPHS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExecs[i], stream)); + printf("%02d: ", i); + printMemoryFootprint(device); + } + + checkCudaErrors(cudaStreamSynchronize(stream)); + + checkCudaErrors(cudaDeviceGraphMemTrim(device)); + printf("\nTrimming does not impact the memory footprint since the un-freed " + "allocations are still holding onto the memory.\n"); + printMemoryFootprint(device); + + for (int i = 0; i < NUM_GRAPHS; i++) { + checkCudaErrors(cudaFree(dPtrs[i])); + } + printf("\nFreeing the allocations does not shrink the footprint.\n"); + printMemoryFootprint(device); + + checkCudaErrors(cudaDeviceGraphMemTrim(device)); + printf("\nSince the allocations are now freed, trimming does reduce the " + "footprint even when the graph execs are not yet destroyed.\n"); + printMemoryFootprint(device); + + for (int i = 0; i < NUM_GRAPHS; i++) { + checkCudaErrors(cudaGraphExecDestroy(graphExecs[i])); + } + checkCudaErrors(cudaStreamDestroy(stream)); +} + +void cleanupMemory(int device) +{ + checkCudaErrors(cudaDeviceGraphMemTrim(device)); + printf("\nCleaning up example by trimming device memory.\n"); + printMemoryFootprint(device); + printf("\n"); +} + +int main(int argc, char **argv) +{ + size_t bytes = 64 * 1024 * 1024; + int device = findCudaDevice(argc, (const char **)argv); + + int driverVersion = 0; + int deviceSupportsMemoryPools = 0; + + cudaDriverGetVersion(&driverVersion); + printf("Driver version is: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + + if (driverVersion < 11040) { + printf("Waiving execution as driver does not support Graph Memory Nodes\n"); + exit(EXIT_WAIVED); + } + + cudaDeviceGetAttribute(&deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, device); + if (!deviceSupportsMemoryPools) { + printf("Waiving execution as device does not support Memory Pools\n"); + exit(EXIT_WAIVED); + } + else { + printf("Running sample.\n"); + } + + virtualAddressReuseSingleGraph(bytes, device); + cleanupMemory(device); + + physicalMemoryReuseSingleStream(bytes, device); + cleanupMemory(device); + + simultaneousStreams(bytes, device); + cleanupMemory(device); + + unfreedAllocations(bytes, device); + cleanupMemory(device); + + printf("================================\n"); + printf("Sample complete.\n"); } diff --git a/Samples/3_CUDA_Features/graphMemoryNodes/graphMemoryNodes.cu b/Samples/3_CUDA_Features/graphMemoryNodes/graphMemoryNodes.cu index 628bccb8..bba8e2a8 100644 --- a/Samples/3_CUDA_Features/graphMemoryNodes/graphMemoryNodes.cu +++ b/Samples/3_CUDA_Features/graphMemoryNodes/graphMemoryNodes.cu @@ -27,9 +27,8 @@ // System includes #include -#include - #include +#include #include // CUDA runtime @@ -39,81 +38,85 @@ #include #include -#define THREADS_PER_BLOCK 512 +#define THREADS_PER_BLOCK 512 #define ALLOWABLE_VARIANCE 1.e-6f -#define NUM_ELEMENTS 8000000 +#define NUM_ELEMENTS 8000000 // Stores the square of each input element in output array -__global__ void squareArray(const float *input, float *output, - int numElements) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void squareArray(const float *input, float *output, int numElements) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numElements) { - output[idx] = input[idx] * input[idx]; - } + if (idx < numElements) { + output[idx] = input[idx] * input[idx]; + } } // Stores the negative of each input element in output array -__global__ 
void negateArray(const float *input, float *output, - int numElements) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void negateArray(const float *input, float *output, int numElements) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numElements) { - output[idx] = input[idx] * -1; - } + if (idx < numElements) { + output[idx] = input[idx] * -1; + } } -struct negSquareArrays { - float *input; - float *square; - float *negSquare; - int numElements; - size_t bytes; - size_t numBlocks; +struct negSquareArrays +{ + float *input; + float *square; + float *negSquare; + int numElements; + size_t bytes; + size_t numBlocks; }; -void fillRandomly(float *array, int numElements) { - for (int n = 0; n < numElements; n++) { - array[n] = rand() / (float)RAND_MAX; - } +void fillRandomly(float *array, int numElements) +{ + for (int n = 0; n < numElements; n++) { + array[n] = rand() / (float)RAND_MAX; + } } -void resetOutputArrays(negSquareArrays *hostArrays) { - fillRandomly(hostArrays->square, hostArrays->numElements); - fillRandomly(hostArrays->negSquare, hostArrays->numElements); +void resetOutputArrays(negSquareArrays *hostArrays) +{ + fillRandomly(hostArrays->square, hostArrays->numElements); + fillRandomly(hostArrays->negSquare, hostArrays->numElements); } -void prepareHostArrays(negSquareArrays *hostArrays) { - hostArrays->numElements = NUM_ELEMENTS; - size_t bytes = hostArrays->numElements * sizeof(float); +void prepareHostArrays(negSquareArrays *hostArrays) +{ + hostArrays->numElements = NUM_ELEMENTS; + size_t bytes = hostArrays->numElements * sizeof(float); - size_t numBlocks = hostArrays->numElements / (size_t)THREADS_PER_BLOCK; - if ((numBlocks % (size_t)THREADS_PER_BLOCK) != 0) { - numBlocks++; - } + size_t numBlocks = hostArrays->numElements / (size_t)THREADS_PER_BLOCK; + if ((hostArrays->numElements % (size_t)THREADS_PER_BLOCK) != 0) { + numBlocks++; + } - hostArrays->input = (float *)malloc(bytes); - hostArrays->square = (float *)malloc(bytes); - hostArrays->negSquare = (float *)malloc(bytes); - hostArrays->bytes = bytes; - hostArrays->numBlocks = numBlocks; + hostArrays->input = (float *)malloc(bytes); + hostArrays->square = (float *)malloc(bytes); + hostArrays->negSquare = (float *)malloc(bytes); + hostArrays->bytes = bytes; + hostArrays->numBlocks = numBlocks; - fillRandomly(hostArrays->input, hostArrays->numElements); - fillRandomly(hostArrays->square, hostArrays->numElements); - fillRandomly(hostArrays->negSquare, hostArrays->numElements); + fillRandomly(hostArrays->input, hostArrays->numElements); + fillRandomly(hostArrays->square, hostArrays->numElements); + fillRandomly(hostArrays->negSquare, hostArrays->numElements); } -void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr) { - cudaGraph_t graph; - cudaGraphNode_t freeNode; +void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr) +{ + cudaGraph_t graph; + cudaGraphNode_t freeNode; - checkCudaErrors(cudaGraphCreate(&graph, 0)); + checkCudaErrors(cudaGraphCreate(&graph, 0)); - checkCudaErrors( - cudaGraphAddMemFreeNode(&freeNode, graph, NULL, 0, (void *)dPtr)); + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNode, graph, NULL, 0, (void *)dPtr)); - checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); } /** @@ -145,115 +148,122 @@ void createFreeGraph(cudaGraphExec_t *graphExec, float *dPtr) { * | * free d_negSquare */ -void 
createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec, int device, +void createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec, + int device, negSquareArrays *hostArrays, - float **d_negSquare_out = NULL) { - // Array buffers on device - float *d_input, *d_square, *d_negSquare; + float **d_negSquare_out = NULL) +{ + // Array buffers on device + float *d_input, *d_square, *d_negSquare; - // Memory allocation parameters - cudaMemAllocNodeParams allocParams; - memset(&allocParams, 0, sizeof(allocParams)); - allocParams.bytesize = hostArrays->bytes; - allocParams.poolProps.allocType = cudaMemAllocationTypePinned; - allocParams.poolProps.location.id = device; - allocParams.poolProps.location.type = cudaMemLocationTypeDevice; + // Memory allocation parameters + cudaMemAllocNodeParams allocParams; + memset(&allocParams, 0, sizeof(allocParams)); + allocParams.bytesize = hostArrays->bytes; + allocParams.poolProps.allocType = cudaMemAllocationTypePinned; + allocParams.poolProps.location.id = device; + allocParams.poolProps.location.type = cudaMemLocationTypeDevice; - // Kernel launch parameters - cudaKernelNodeParams kernelNodeParams = {0}; - kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1); - kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); - kernelNodeParams.sharedMemBytes = 0; - kernelNodeParams.extra = NULL; + // Kernel launch parameters + cudaKernelNodeParams kernelNodeParams = {0}; + kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.extra = NULL; - cudaGraph_t graph; - cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare; - cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare; - cudaGraphNode_t squareKernelNode, negateKernelNode; - cudaGraphNode_t freeNodeInput, freeNodeSquare; + cudaGraph_t graph; + cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare; + cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare; + cudaGraphNode_t squareKernelNode, negateKernelNode; + cudaGraphNode_t freeNodeInput, freeNodeSquare; - // Buffer for storing graph node dependencies - std::vector nodeDependencies; + // Buffer for storing graph node dependencies + std::vector nodeDependencies; - checkCudaErrors(cudaGraphCreate(&graph, 0)); + checkCudaErrors(cudaGraphCreate(&graph, 0)); - checkCudaErrors( - cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams)); - d_input = (float *)allocParams.dptr; + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams)); + d_input = (float *)allocParams.dptr; - // To keep the graph structure simple (fewer branching dependencies), - // allocNodeSquare should depend on allocNodeInput - checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph, - &allocNodeInput, 1, &allocParams)); - d_square = (float *)allocParams.dptr; + // To keep the graph structure simple (fewer branching dependencies), + // allocNodeSquare should depend on allocNodeInput + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph, &allocNodeInput, 1, &allocParams)); + d_square = (float *)allocParams.dptr; - // copyNodeInput needs to depend on allocNodeInput because copyNodeInput - // writes to d_input. It does so here indirectly through allocNodeSquare. 
- checkCudaErrors(cudaGraphAddMemcpyNode1D( - ©NodeInput, graph, &allocNodeSquare, 1, d_input, hostArrays->input, - hostArrays->bytes, cudaMemcpyHostToDevice)); + // copyNodeInput needs to depend on allocNodeInput because copyNodeInput + // writes to d_input. It does so here indirectly through allocNodeSquare. + checkCudaErrors(cudaGraphAddMemcpyNode1D(©NodeInput, + graph, + &allocNodeSquare, + 1, + d_input, + hostArrays->input, + hostArrays->bytes, + cudaMemcpyHostToDevice)); - void *squareKernelArgs[3] = {(void *)&d_input, (void *)&d_square, - (void *)&(hostArrays->numElements)}; - kernelNodeParams.func = (void *)squareArray; - kernelNodeParams.kernelParams = (void **)squareKernelArgs; + void *squareKernelArgs[3] = {(void *)&d_input, (void *)&d_square, (void *)&(hostArrays->numElements)}; + kernelNodeParams.func = (void *)squareArray; + kernelNodeParams.kernelParams = (void **)squareKernelArgs; - // Square kernel depends on copyNodeInput to ensure all data is on the device - // before kernel launch. - checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph, - ©NodeInput, 1, &kernelNodeParams)); + // Square kernel depends on copyNodeInput to ensure all data is on the device + // before kernel launch. + checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph, ©NodeInput, 1, &kernelNodeParams)); - checkCudaErrors(cudaGraphAddMemcpyNode1D( - ©NodeSquare, graph, &squareKernelNode, 1, hostArrays->square, - d_square, hostArrays->bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaGraphAddMemcpyNode1D(©NodeSquare, + graph, + &squareKernelNode, + 1, + hostArrays->square, + d_square, + hostArrays->bytes, + cudaMemcpyDeviceToHost)); - // Free of d_input depends on the square kernel to ensure that d_input is not - // freed while being read by the kernel. It also depends on the alloc of - // d_input via squareKernelNode > copyNodeInput > allocNodeSquare > - // allocNodeInput. - checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph, - &squareKernelNode, 1, d_input)); + // Free of d_input depends on the square kernel to ensure that d_input is not + // freed while being read by the kernel. It also depends on the alloc of + // d_input via squareKernelNode > copyNodeInput > allocNodeSquare > + // allocNodeInput. + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph, &squareKernelNode, 1, d_input)); - // Allocation of C depends on free of A so CUDA can reuse the virtual address. - checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph, - &freeNodeInput, 1, &allocParams)); - d_negSquare = (float *)allocParams.dptr; + // Allocation of C depends on free of A so CUDA can reuse the virtual address. 
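    // A minimal sketch of the dependency chain this relies on (illustrative
    // node and pointer names, not part of this sample): an alloc node ordered
    // after a free node may be handed the freed virtual address, e.g.
    //
    //   cudaGraphNode_t freeA, allocC;
    //   checkCudaErrors(cudaGraphAddMemFreeNode(&freeA, graph, &lastUseOfA, 1, dA));
    //   checkCudaErrors(cudaGraphAddMemAllocNode(&allocC, graph, &freeA, 1, &allocParams));
    //   // allocParams.dptr may now equal dA, since the two lifetimes cannot overlap.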
+ checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph, &freeNodeInput, 1, &allocParams)); + d_negSquare = (float *)allocParams.dptr; - if (d_negSquare == d_input) { - printf( - "Check verified that d_negSquare and d_input share a virtual " - "address.\n"); - } + if (d_negSquare == d_input) { + printf("Check verified that d_negSquare and d_input share a virtual " + "address.\n"); + } - void *negateKernelArgs[3] = {(void *)&d_square, (void *)&d_negSquare, - (void *)&(hostArrays->numElements)}; - kernelNodeParams.func = (void *)negateArray; - kernelNodeParams.kernelParams = (void **)negateKernelArgs; + void *negateKernelArgs[3] = {(void *)&d_square, (void *)&d_negSquare, (void *)&(hostArrays->numElements)}; + kernelNodeParams.func = (void *)negateArray; + kernelNodeParams.kernelParams = (void **)negateKernelArgs; - checkCudaErrors(cudaGraphAddKernelNode( - &negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams)); + checkCudaErrors(cudaGraphAddKernelNode(&negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams)); - nodeDependencies.push_back(copyNodeSquare); - nodeDependencies.push_back(negateKernelNode); - checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeSquare, graph, - nodeDependencies.data(), - nodeDependencies.size(), d_square)); - nodeDependencies.clear(); + nodeDependencies.push_back(copyNodeSquare); + nodeDependencies.push_back(negateKernelNode); + checkCudaErrors( + cudaGraphAddMemFreeNode(&freeNodeSquare, graph, nodeDependencies.data(), nodeDependencies.size(), d_square)); + nodeDependencies.clear(); - checkCudaErrors(cudaGraphAddMemcpyNode1D( - ©NodeNegSquare, graph, &negateKernelNode, 1, hostArrays->negSquare, - d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaGraphAddMemcpyNode1D(©NodeNegSquare, + graph, + &negateKernelNode, + 1, + hostArrays->negSquare, + d_negSquare, + hostArrays->bytes, + cudaMemcpyDeviceToHost)); - if (d_negSquare_out == NULL) { - cudaGraphNode_t freeNodeNegSquare; - checkCudaErrors(cudaGraphAddMemFreeNode( - &freeNodeNegSquare, graph, ©NodeNegSquare, 1, d_negSquare)); - } else { - *d_negSquare_out = d_negSquare; - } + if (d_negSquare_out == NULL) { + cudaGraphNode_t freeNodeNegSquare; + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeNegSquare, graph, ©NodeNegSquare, 1, d_negSquare)); + } + else { + *d_negSquare_out = d_negSquare; + } - checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); } /** @@ -293,60 +303,55 @@ void createNegateSquaresGraphExplicitly(cudaGraphExec_t *graphExec, int device, * | | * wait squareFreeEvent --------------<---- record squareFreeEvent */ -void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays, - float **d_negSquare_out = NULL) { - float *d_input, *d_square, *d_negSquare; - cudaStream_t stream2; - cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent, - squareFreeEvent; +void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays, float **d_negSquare_out = NULL) +{ + float *d_input, *d_square, *d_negSquare; + cudaStream_t stream2; + cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent, squareFreeEvent; - checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking)); + checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking)); - 
checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent)); - checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent)); - checkCudaErrors(cudaEventCreate(&squareFreeEvent)); + checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent)); + checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent)); + checkCudaErrors(cudaEventCreate(&squareFreeEvent)); - // Virtual addresses are assigned synchronously when cudaMallocAsync is - // called, thus there is no performace benefit gained by separating the - // allocations into two streams. - checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1)); - checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1)); + // Virtual addresses are assigned synchronously when cudaMallocAsync is + // called, thus there is no performance benefit gained by separating the + // allocations into two streams. + checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1)); + checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1)); - checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes, - cudaMemcpyHostToDevice, stream1)); - squareArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( - d_input, d_square, hostArrays->numElements); - checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes, cudaMemcpyHostToDevice, stream1)); + squareArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>(d_input, d_square, hostArrays->numElements); + checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1)); - checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0)); - checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square, - hostArrays->bytes, cudaMemcpyDeviceToHost, - stream2)); + checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0)); + checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square, hostArrays->bytes, cudaMemcpyDeviceToHost, stream2)); - checkCudaErrors(cudaFreeAsync(d_input, stream1)); - checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1)); - negateArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( - d_square, d_negSquare, hostArrays->numElements); - checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1)); - checkCudaErrors(cudaMemcpyAsync(hostArrays->negSquare, d_negSquare, - hostArrays->bytes, cudaMemcpyDeviceToHost, - stream1)); - if (d_negSquare_out == NULL) { - checkCudaErrors(cudaFreeAsync(d_negSquare, stream1)); - } else { - *d_negSquare_out = d_negSquare; - } + checkCudaErrors(cudaFreeAsync(d_input, stream1)); + checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1)); + negateArray<<<hostArrays->numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( + d_square, d_negSquare, hostArrays->numElements); + checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1)); + checkCudaErrors( + cudaMemcpyAsync(hostArrays->negSquare, d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost, stream1)); + if (d_negSquare_out == NULL) { + checkCudaErrors(cudaFreeAsync(d_negSquare, stream1)); + } + else { + *d_negSquare_out = d_negSquare; + } - checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0)); - checkCudaErrors(cudaFreeAsync(d_square, stream2)); - checkCudaErrors(cudaEventRecord(squareFreeEvent, 
stream2)); - checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0)); + checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0)); - checkCudaErrors(cudaStreamDestroy(stream2)); - checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent)); - checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent)); - checkCudaErrors(cudaEventDestroy(squareFreeEvent)); + checkCudaErrors(cudaStreamDestroy(stream2)); + checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent)); + checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent)); + checkCudaErrors(cudaEventDestroy(squareFreeEvent)); } /** @@ -356,199 +361,195 @@ void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays, */ void createNegateSquaresGraphWithStreamCapture(cudaGraphExec_t *graphExec, negSquareArrays *hostArrays, - float **d_negSquare_out = NULL) { - cudaGraph_t graph; - cudaStream_t stream; + float **d_negSquare_out = NULL) +{ + cudaGraph_t graph; + cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - doNegateSquaresInStream(stream, hostArrays, d_negSquare_out); - checkCudaErrors(cudaStreamEndCapture(stream, &graph)); + checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + doNegateSquaresInStream(stream, hostArrays, d_negSquare_out); + checkCudaErrors(cudaStreamEndCapture(stream, &graph)); - checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaGraphDestroy(graph)); } -void prepareRefArrays(negSquareArrays *hostArrays, - negSquareArrays *deviceRefArrays, - bool **foundValidationFailure) { - deviceRefArrays->bytes = hostArrays->bytes; - deviceRefArrays->numElements = hostArrays->numElements; +void prepareRefArrays(negSquareArrays *hostArrays, negSquareArrays *deviceRefArrays, bool **foundValidationFailure) +{ + deviceRefArrays->bytes = hostArrays->bytes; + deviceRefArrays->numElements = hostArrays->numElements; - for (int i = 0; i < hostArrays->numElements; i++) { - hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i]; - hostArrays->negSquare[i] = hostArrays->square[i] * -1; - } - - checkCudaErrors( - cudaMalloc((void **)&deviceRefArrays->negSquare, deviceRefArrays->bytes)); - checkCudaErrors(cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare, - hostArrays->bytes, cudaMemcpyHostToDevice)); - - checkCudaErrors( - cudaMallocManaged((void **)foundValidationFailure, sizeof(bool))); -} - -int checkValidationFailure(bool *foundValidationFailure) { - if (*foundValidationFailure) { - printf("Validation FAILURE!\n\n"); - *foundValidationFailure = false; - return EXIT_FAILURE; - } else { - printf("Validation PASSED!\n\n"); - return EXIT_SUCCESS; - } -} - -__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays, - bool *foundValidationFailure) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - float ref, diff; - - if (idx < devRefArrays.numElements) { - ref = devRefArrays.negSquare[idx]; - diff = d_negSquare[idx] - ref; - diff *= diff; - ref *= ref; - if (diff / ref > ALLOWABLE_VARIANCE) { - *foundValidationFailure = true; + for (int i = 0; i < hostArrays->numElements; i++) { + 
hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i]; + hostArrays->negSquare[i] = hostArrays->square[i] * -1; } - } + + checkCudaErrors(cudaMalloc((void **)&deviceRefArrays->negSquare, deviceRefArrays->bytes)); + checkCudaErrors( + cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare, hostArrays->bytes, cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMallocManaged((void **)foundValidationFailure, sizeof(bool))); } -void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) { - float ref, diff; - - for (int i = 0; i < hostArrays->numElements; i++) { - ref = hostArrays->input[i] * hostArrays->input[i] * -1; - diff = hostArrays->negSquare[i] - ref; - diff *= diff; - ref *= ref; - if (diff / ref > ALLOWABLE_VARIANCE) { - *foundValidationFailure = true; +int checkValidationFailure(bool *foundValidationFailure) +{ + if (*foundValidationFailure) { + printf("Validation FAILURE!\n\n"); + *foundValidationFailure = false; + return EXIT_FAILURE; + } + else { + printf("Validation PASSED!\n\n"); + return EXIT_SUCCESS; } - } } -int main(int argc, char **argv) { - negSquareArrays hostArrays, deviceRefArrays; - cudaStream_t stream; - cudaGraphExec_t graphExec, graphExecFreeC; +__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays, bool *foundValidationFailure) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float ref, diff; - // Declare pointers for GPU buffers - float *d_negSquare = NULL; - bool *foundValidationFailure = NULL; + if (idx < devRefArrays.numElements) { + ref = devRefArrays.negSquare[idx]; + diff = d_negSquare[idx] - ref; + diff *= diff; + ref *= ref; + if (diff / ref > ALLOWABLE_VARIANCE) { + *foundValidationFailure = true; + } + } +} - srand(time(0)); - int device = findCudaDevice(argc, (const char **)argv); +void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) +{ + float ref, diff; - int driverVersion = 0; - int deviceSupportsMemoryPools = 0; + for (int i = 0; i < hostArrays->numElements; i++) { + ref = hostArrays->input[i] * hostArrays->input[i] * -1; + diff = hostArrays->negSquare[i] - ref; + diff *= diff; + ref *= ref; + if (diff / ref > ALLOWABLE_VARIANCE) { + *foundValidationFailure = true; + } + } +} - cudaDriverGetVersion(&driverVersion); - printf("Driver version is: %d.%d\n", driverVersion / 1000, - (driverVersion % 100) / 10); +int main(int argc, char **argv) +{ + negSquareArrays hostArrays, deviceRefArrays; + cudaStream_t stream; + cudaGraphExec_t graphExec, graphExecFreeC; - if (driverVersion < 11040) { - printf("Waiving execution as driver does not support Graph Memory Nodes\n"); - exit(EXIT_WAIVED); - } + // Declare pointers for GPU buffers + float *d_negSquare = NULL; + bool *foundValidationFailure = NULL; - cudaDeviceGetAttribute(&deviceSupportsMemoryPools, - cudaDevAttrMemoryPoolsSupported, device); - if (!deviceSupportsMemoryPools) { - printf("Waiving execution as device does not support Memory Pools\n"); - exit(EXIT_WAIVED); - } else { - printf("Setting up sample.\n"); - } + srand(time(0)); + int device = findCudaDevice(argc, (const char **)argv); - prepareHostArrays(&hostArrays); - prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure); - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - printf("Setup complete.\n\n"); + int driverVersion = 0; + int deviceSupportsMemoryPools = 0; - printf("Running negateSquares in a stream.\n"); - doNegateSquaresInStream(stream, &hostArrays); - checkCudaErrors(cudaStreamSynchronize(stream)); - 
printf("Validating negateSquares in a stream...\n"); - validateHost(&hostArrays, foundValidationFailure); - checkValidationFailure(foundValidationFailure); - resetOutputArrays(&hostArrays); + cudaDriverGetVersion(&driverVersion); + printf("Driver version is: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); - printf("Running negateSquares in a stream-captured graph.\n"); - createNegateSquaresGraphWithStreamCapture(&graphExec, &hostArrays); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Validating negateSquares in a stream-captured graph...\n"); - validateHost(&hostArrays, foundValidationFailure); - checkValidationFailure(foundValidationFailure); - resetOutputArrays(&hostArrays); + if (driverVersion < 11040) { + printf("Waiving execution as driver does not support Graph Memory Nodes\n"); + exit(EXIT_WAIVED); + } - printf("Running negateSquares in an explicitly constructed graph.\n"); - createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Validating negateSquares in an explicitly constructed graph...\n"); - validateHost(&hostArrays, foundValidationFailure); - checkValidationFailure(foundValidationFailure); - resetOutputArrays(&hostArrays); + cudaDeviceGetAttribute(&deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, device); + if (!deviceSupportsMemoryPools) { + printf("Waiving execution as device does not support Memory Pools\n"); + exit(EXIT_WAIVED); + } + else { + printf("Setting up sample.\n"); + } - // Each of the three examples below free d_negSquare outside the graph. As - // demonstrated by validateGPU, d_negSquare can be accessed by outside the - // graph before d_negSquare is freed. + prepareHostArrays(&hostArrays); + prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + printf("Setup complete.\n\n"); - printf("Running negateSquares with d_negSquare freed outside the stream.\n"); - createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays, - &d_negSquare); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - validateGPU<<>>( - d_negSquare, deviceRefArrays, foundValidationFailure); - // Since cudaFree is synchronous, the stream must synchronize before freeing - // d_negSquare to ensure d_negSquare no longer being accessed. 
- checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaFree(d_negSquare)); - printf( - "Validating negateSquares with d_negSquare freed outside the " - "stream...\n"); - validateHost(&hostArrays, foundValidationFailure); - checkValidationFailure(foundValidationFailure); - resetOutputArrays(&hostArrays); + printf("Running negateSquares in a stream.\n"); + doNegateSquaresInStream(stream, &hostArrays); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Validating negateSquares in a stream...\n"); + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); - printf("Running negateSquares with d_negSquare freed outside the graph.\n"); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>( - d_negSquare, deviceRefArrays, foundValidationFailure); - checkCudaErrors(cudaFreeAsync(d_negSquare, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf( - "Validating negateSquares with d_negSquare freed outside the graph...\n"); - checkValidationFailure(foundValidationFailure); - resetOutputArrays(&hostArrays); + printf("Running negateSquares in a stream-captured graph.\n"); + createNegateSquaresGraphWithStreamCapture(&graphExec, &hostArrays); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Validating negateSquares in a stream-captured graph...\n"); + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); - printf( - "Running negateSquares with d_negSquare freed in a different graph.\n"); - createFreeGraph(&graphExecFreeC, d_negSquare); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - validateGPU<<<hostArrays.numBlocks, THREADS_PER_BLOCK, 0, stream>>>( - d_negSquare, deviceRefArrays, foundValidationFailure); - checkCudaErrors(cudaGraphLaunch(graphExecFreeC, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf( - "Validating negateSquares with d_negSquare freed in a different " - "graph...\n"); - checkValidationFailure(foundValidationFailure); + printf("Running negateSquares in an explicitly constructed graph.\n"); + createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Validating negateSquares in an explicitly constructed graph...\n"); + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); - printf("Cleaning up sample.\n"); - checkCudaErrors(cudaGraphExecDestroy(graphExec)); - checkCudaErrors(cudaGraphExecDestroy(graphExecFreeC)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaFree(foundValidationFailure)); - checkCudaErrors(cudaFree(deviceRefArrays.negSquare)); - free(hostArrays.input); - free(hostArrays.square); - free(hostArrays.negSquare); - printf("Cleanup complete. Exiting sample.\n"); -} \ No newline at end of file + // Each of the three examples below frees d_negSquare outside the graph. As + // demonstrated by validateGPU, d_negSquare can be accessed outside the + // graph before d_negSquare is freed. 
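    // A minimal sketch of that pattern (illustrative names, assuming a graph
    // exec `ge` whose alloc node returned `p` and which contains no matching
    // free node):
    //
    //   checkCudaErrors(cudaGraphLaunch(ge, s));   // the graph allocates p
    //   consume<<<blocks, threads, 0, s>>>(p);     // p is still valid after the graph
    //   checkCudaErrors(cudaFreeAsync(p, s));      // stream-ordered free outside the graph
    //
    // The free only has to be ordered after every access to p, which stream s
    // provides here.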
+ + printf("Running negateSquares with d_negSquare freed outside the stream.\n"); + createNegateSquaresGraphExplicitly(&graphExec, device, &hostArrays, &d_negSquare); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + validateGPU<<>>( + d_negSquare, deviceRefArrays, foundValidationFailure); + // Since cudaFree is synchronous, the stream must synchronize before freeing + // d_negSquare to ensure d_negSquare no longer being accessed. + checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaFree(d_negSquare)); + printf("Validating negateSquares with d_negSquare freed outside the " + "stream...\n"); + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + + printf("Running negateSquares with d_negSquare freed outside the graph.\n"); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + validateGPU<<>>( + d_negSquare, deviceRefArrays, foundValidationFailure); + checkCudaErrors(cudaFreeAsync(d_negSquare, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Validating negateSquares with d_negSquare freed outside the graph...\n"); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + + printf("Running negateSquares with d_negSquare freed in a different graph.\n"); + createFreeGraph(&graphExecFreeC, d_negSquare); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + validateGPU<<>>( + d_negSquare, deviceRefArrays, foundValidationFailure); + checkCudaErrors(cudaGraphLaunch(graphExecFreeC, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Validating negateSquares with d_negSquare freed in a different " + "graph...\n"); + checkValidationFailure(foundValidationFailure); + + printf("Cleaning up sample.\n"); + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphExecDestroy(graphExecFreeC)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaFree(foundValidationFailure)); + checkCudaErrors(cudaFree(deviceRefArrays.negSquare)); + free(hostArrays.input); + free(hostArrays.square); + free(hostArrays.negSquare); + printf("Cleanup complete. Exiting sample.\n"); +} diff --git a/Samples/3_CUDA_Features/immaTensorCoreGemm/immaTensorCoreGemm.cu b/Samples/3_CUDA_Features/immaTensorCoreGemm/immaTensorCoreGemm.cu index b00b5db3..df8c17fd 100644 --- a/Samples/3_CUDA_Features/immaTensorCoreGemm/immaTensorCoreGemm.cu +++ b/Samples/3_CUDA_Features/immaTensorCoreGemm/immaTensorCoreGemm.cu @@ -115,7 +115,7 @@ // Implementation constants. -#define WARPS_PER_BLOCK 8 +#define WARPS_PER_BLOCK 8 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) #if SHARED_MEMORY_LIMIT_64K @@ -132,10 +132,10 @@ #define CHUNK_K 16 #endif -#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(uint8_t)) -#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(uint8_t)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) #define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) -#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -166,236 +166,219 @@ // nvcuda::wmma::load_matrix_sync. 
#define SKEW_UINT8 32 -#define checkKernelErrors(expr) \ - do { \ - expr; \ - \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, \ - cudaGetErrorString(__err)); \ - abort(); \ - } \ - } while (0) +#define checkKernelErrors(expr) \ + do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) using namespace nvcuda; -__host__ void init_host_matrices(uint8_t *a, uint8_t *b, int *c) { - for (int i = 0; i < M_GLOBAL; i++) { - for (int j = 0; j < K_GLOBAL; j++) { - a[i * K_GLOBAL + j] = (uint8_t)(rand() % 3); +__host__ void init_host_matrices(uint8_t *a, uint8_t *b, int *c) +{ + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i * K_GLOBAL + j] = (uint8_t)(rand() % 3); + } } - } - for (int i = 0; i < N_GLOBAL; i++) { - for (int j = 0; j < K_GLOBAL; j++) { - b[i * K_GLOBAL + j] = (uint8_t)(rand() % 3); + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i * K_GLOBAL + j] = (uint8_t)(rand() % 3); + } } - } - for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { - c[t] = (rand() % 3); - } + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = (rand() % 3); + } } -__global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B, - const int *C, int *D, int alpha, int beta) { - extern __shared__ uint8_t shmem[][CHUNK_K * K + SKEW_UINT8]; +__global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B, const int *C, int *D, int alpha, int beta) +{ + extern __shared__ uint8_t shmem[][CHUNK_K * K + SKEW_UINT8]; - // Warp and lane identification. - const unsigned int warpId = threadIdx.x / WARP_SIZE; - const unsigned int laneId = threadIdx.x % WARP_SIZE; + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; - // Offset in shared memory from which the B matrix is stored. - const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; - // This pointer is used to access the C and D matrix tiles this warp computes. - int *shmem_warp_tile_ptr = (int *)&shmem[0][0] + - (warpId / 2) * SHMEM_STRIDE * K * 2 + - (warpId % 2) * SHMEM_OFFSET; + // This pointer is used to access the C and D matrix tiles this warp computes. + int *shmem_warp_tile_ptr = (int *)&shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 + (warpId % 2) * SHMEM_OFFSET; - // This pointer is used to stream the C and D matrices block-wide tile to and - // from shared memory. - int *shmem_warp_stream_ptr = (int *)&shmem[0][0] + warpId * SHMEM_STRIDE * K; + // This pointer is used to stream the C and D matrices block-wide tile to and + // from shared memory. + int *shmem_warp_stream_ptr = (int *)&shmem[0][0] + warpId * SHMEM_STRIDE * K; - // Adjust the beta scaler, as it'll be multiplied by alpha at the end of - // each tile computation. Technically this is not generally correct (may - // result in a loss of precision). Zero still needs to be specially handled - // though. - beta /= alpha; + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may + // result in a loss of precision). Zero still needs to be specially handled + // though. 
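    // Written out once, the identity being exploited (assuming alpha != 0) is
    //
    //   D = alpha * (A x B) + beta * C
    //     = alpha * ((A x B) + (beta / alpha) * C),
    //
    // so C is pre-scaled by beta / alpha here, and the single multiply by
    // alpha before the final store covers both terms.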
+ beta /= alpha; - // Each CTA slides along the 128 x 128 tiles from the top left corner of the - // matrix to the right and down, and selects the next tile to compute. Once - // there's no such tile, all warps in this CTA exit. - for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { - const unsigned int block_tile_i = - ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); - const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + // Each CTA slides along the 128 x 128 tiles from the top left corner of the + // matrix to the right and down, and selects the next tile to compute. Once + // there's no such tile, all warps in this CTA exit. + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; - // Stop when there are no more D matrix tiles to compute in this CTA. - if (block_tile_i >= M_TILES) { - break; - } - - // This warp's pointer to the C matrix data to copy memory from to shared - // memory. - const size_t gmem_idx = - (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; - const int *src_gmem_warp_stream_ptr = &C[gmem_idx]; - - // Stream multiple C tiles to shared memory. -#pragma unroll - for (int i = 0; i < K; i++) { - typedef int4 copy_t; - - *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = - *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + - laneId); - } - - __syncthreads(); - - // These fragments will accumulate the result of A and B matrix fragment - // multiplications along the K_GLOBAL dimension. - wmma::fragment c[WARP_COL_TILES] - [WARP_ROW_TILES]; - - // Load the C matrix tiles into fragments from shared memory. -#pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { -#pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { - const int *tile_ptr = - shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; - - wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); - } - } - - __syncthreads(); - - // Scale the C matrix. -#pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { -#pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { -#pragma unroll - for (int t = 0; t < c[i][j].num_elements; t++) { - c[i][j].x[t] *= beta; + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; } - } - } - // Select what warp copies what matrix to shared memory. - // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const uint8_t *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + - M * K_GLOBAL * (warpId % 4) * 2) - : (&B[block_tile_j * N * K_GLOBAL] + - N * K_GLOBAL * (warpId % 4) * 2); + // This warp's pointer to the C matrix data to copy memory from to shared + // memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const int *src_gmem_warp_stream_ptr = &C[gmem_idx]; - // Go through the global K dimension by a fixed step at a time. + // Stream multiple C tiles to shared memory. #pragma unroll - for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { - // Copy slices of the A and B matrices to shared memory. - // The first half of the warps in the CTA copy the A matrix, the rest copy - // the B matrix. - size_t shmem_idx = - warpId < (WARPS_PER_BLOCK / 2) - ? 
(M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) - : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); + for (int i = 0; i < K; i++) { + typedef int4 copy_t; - // First half of the warp copies the first row / column of the matrix, - // the second half of the warp copies the next. - int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K + - (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + - (laneId % CHUNK_COPY_LINE_LANES); + *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + } - // Shift the second half of the warp to the next row / column in the - // shared memory. - shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + __syncthreads(); -#pragma unroll - for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; - i++) { - // Copy 16 bytes at once in each lane. - *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = - *lane_ptr; - - // Advance the global memory pointer and the shared memory index. - lane_ptr = (int4 *)((uint8_t *)lane_ptr + - K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); - shmem_idx += CHUNK_COPY_LINES_PER_WARP; - } - - __syncthreads(); - - // Compute a grid of C matrix tiles in each warp. -#pragma unroll - for (int k_step = 0; k_step < CHUNK_K; k_step++) { - wmma::fragment - a[WARP_COL_TILES]; - wmma::fragment - b[WARP_ROW_TILES]; + // These fragments will accumulate the result of A and B matrix fragment + // multiplications along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + // Load the C matrix tiles into fragments from shared memory. #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); - const uint8_t *tile_ptr = &shmem[shmem_idx_a][k_step * K]; +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; - wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_UINT8); + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const uint8_t *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % 4) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % 4) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy + // the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) + ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + + (laneId % CHUNK_COPY_LINE_LANES); + + // Shift the second half of the warp to the next row / column in the + // shared memory. 
+ shmem_idx += laneId / CHUNK_COPY_LINE_LANES; #pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { - if (i == 0) { - // Load the B matrix fragment once, because it is going to be - // reused against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + - (WARP_ROW_TILES * N) * (warpId % 2) + - (j * N); - const uint8_t *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. + *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *lane_ptr; - wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_UINT8); + // Advance the global memory pointer and the shared memory index. + lane_ptr = (int4 *)((uint8_t *)lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); + shmem_idx += CHUNK_COPY_LINES_PER_WARP; } - wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); - } + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const uint8_t *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_UINT8); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be + // reused against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const uint8_t *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_UINT8); + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); } - } - __syncthreads(); + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL + // threads in the warp are well-defined even though element indices + // within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global + // memory. + int *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < K; i++) { + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); } - - // Store the D fragments to shared memory. -#pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { -#pragma unroll - for (int j = 0; j < WARP_ROW_TILES; j++) { -#pragma unroll - // Uniform, point-wise transformations of ALL fragment elements by ALL - // threads in the warp are well-defined even though element indices - // within fragment storage are not defined. 
- for (int t = 0; t < c[i][j].num_elements; t++) c[i][j].x[t] *= alpha; - - int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; - - wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); - } - } - - __syncthreads(); - - // Now that shared memory contains all the D tiles, stream them to global - // memory. - int *dst_gmem_warp_stream_ptr = &D[gmem_idx]; - -#pragma unroll - for (int i = 0; i < K; i++) { - *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); - } - - __syncthreads(); - } } // Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming: @@ -406,250 +389,244 @@ __global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B, // is designed for // demonstration purposes only to show the CUDA WMMA API use without // relying on availability of the shared memory. -__global__ void simple_wmma_gemm_imma(const uint8_t *a, const uint8_t *b, - const int *c, int *d, int m_ld, int n_ld, - int k_ld, int alpha, int beta) { - // Leading dimensions. Packed with no transpositions. - int lda = m_ld; - int ldb = k_ld; - int ldc = n_ld; +__global__ void simple_wmma_gemm_imma(const uint8_t *a, + const uint8_t *b, + const int *c, + int *d, + int m_ld, + int n_ld, + int k_ld, + int alpha, + int beta) +{ + // Leading dimensions. Packed with no transpositions. + int lda = m_ld; + int ldb = k_ld; + int ldc = n_ld; - // Tile using a 2D grid - int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; - int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); - // Declare the fragments - wmma::fragment - a_frag; - wmma::fragment - b_frag; - wmma::fragment acc_frag; - wmma::fragment c_frag; + // Declare the fragments + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; - wmma::fill_fragment(acc_frag, 0.0f); + wmma::fill_fragment(acc_frag, 0.0f); - // Loop over k - for (int i = 0; i < k_ld; i += WMMA_K) { - int aCol = i; - int aRow = warpM * WMMA_M; + // Loop over k + for (int i = 0; i < k_ld; i += WMMA_K) { + int aCol = i; + int aRow = warpM * WMMA_M; - int bCol = i; - int bRow = warpN * WMMA_N; + int bCol = i; + int bRow = warpN * WMMA_N; - // Bounds checking - if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { - // Load the inputs - wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); - wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb); + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb); - // Perform the matrix multiplication - wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); - } - } - - // Load in the current value of c, scale it by beta, and add this our result - // scaled by alpha - int cCol = warpN * WMMA_N; - int cRow = warpM * WMMA_M; - - if (cRow < m_ld && cCol < n_ld) { - wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, - wmma::mem_row_major); - - for (int i = 0; i < c_frag.num_elements; i++) { - c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } } - // Store the output - wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, - wmma::mem_row_major); - } + // Load in 
the current value of c, scale it by beta, and add it to our result
+    // scaled by alpha
+    int cCol = warpN * WMMA_N;
+    int cRow = warpM * WMMA_M;
+
+    if (cRow < m_ld && cCol < n_ld) {
+        wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
+
+        for (int i = 0; i < c_frag.num_elements; i++) {
+            c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
+        }
+
+        // Store the output
+        wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
+    }
 }

-__host__ void matMultiplyOnHost(uint8_t *A, uint8_t *B, int *C, int alpha,
-                                int beta, int numARows, int numAColumns,
-                                int numBRows, int numBColumns, int numCRows,
-                                int numCColumns) {
-  for (int i = 0; i < numCRows; i++) {
-    for (int j = 0; j < numCColumns; j++) {
-      int temp = 0;
+__host__ void matMultiplyOnHost(uint8_t *A,
+                                uint8_t *B,
+                                int *C,
+                                int alpha,
+                                int beta,
+                                int numARows,
+                                int numAColumns,
+                                int numBRows,
+                                int numBColumns,
+                                int numCRows,
+                                int numCColumns)
+{
+    for (int i = 0; i < numCRows; i++) {
+        for (int j = 0; j < numCColumns; j++) {
+            int temp = 0;

-      for (int k = 0; k < numAColumns; k++) {
-        temp += A[i * numAColumns + k] * B[j * numBRows + k];
-      }
+            for (int k = 0; k < numAColumns; k++) {
+                temp += A[i * numAColumns + k] * B[j * numBRows + k];
+            }

-      C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
+            C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
+        }
     }
-  }
 }

-int main(int argc, char **argv) {
-  printf("Initializing...\n");
+int main(int argc, char **argv)
+{
+    printf("Initializing...\n");

-  int dev = findCudaDevice(argc, (const char **)argv);
+    int dev = findCudaDevice(argc, (const char **)argv);

-  cudaDeviceProp deviceProp;
-  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

-  // Tensor cores require a GPU of Volta (SM72) architecture or higher.
-  if (deviceProp.major < 7 || (deviceProp.major <= 7 && deviceProp.minor < 2)) {
-    printf(
-        "immaTensorCoreGemm requires SM 7.2 or higher to use Tensor Cores. 
" - "Exiting...\n"); - exit(EXIT_WAIVED); - } - - printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); - printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); - printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); - - uint8_t *A_h = NULL; - uint8_t *B_h = NULL; - int *C_h = NULL; -#if CPU_DEBUG - int *result_hD = NULL; - int *result_host = NULL; -#endif - - A_h = (uint8_t *)malloc(sizeof(uint8_t) * M_GLOBAL * K_GLOBAL); - B_h = (uint8_t *)malloc(sizeof(uint8_t) * K_GLOBAL * N_GLOBAL); - C_h = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); -#if CPU_DEBUG - result_hD = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); - result_host = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); -#endif - - uint8_t *A = NULL; - uint8_t *B = NULL; - int *C = NULL; - int *D = NULL; - - checkCudaErrors( - cudaMalloc(reinterpret_cast(&A), sizeof(uint8_t) * M_GLOBAL * K_GLOBAL)); - checkCudaErrors( - cudaMalloc(reinterpret_cast(&B), sizeof(uint8_t) * N_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&C), sizeof(int) * M_GLOBAL * N_GLOBAL)); - checkCudaErrors(cudaMalloc(reinterpret_cast(&D), sizeof(int) * M_GLOBAL * N_GLOBAL)); - - assert(((unsigned long long)A) % 128 == 0); - assert(((unsigned long long)B) % 128 == 0); - assert(((unsigned long long)C) % 128 == 0); - assert(((unsigned long long)D) % 128 == 0); - - init_host_matrices(A_h, B_h, C_h); - - checkCudaErrors(cudaMemcpy(A, A_h, sizeof(uint8_t) * M_GLOBAL * K_GLOBAL, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(B, B_h, sizeof(uint8_t) * N_GLOBAL * K_GLOBAL, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(C, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(D, 0, sizeof(int) * M_GLOBAL * N_GLOBAL)); - - printf("Preparing data for GPU...\n"); - - assert(((unsigned long long)A) % 128 == 0); - assert(((unsigned long long)B) % 128 == 0); - assert(((unsigned long long)C) % 128 == 0); - assert(((unsigned long long)D) % 128 == 0); - - enum { - // Compute the right amount of shared memory to request. - // We need shared memory to hold per-CTA C and D matrix tiles, and to cache - // per-CTA chunks - // of the A and B matrices. Therefore, the right amount to request is the - // maximum of those - // two numbers. - SHMEM_SZ = MAX(sizeof(uint8_t) * (BLOCK_COL_TILES * M) * - (CHUNK_K * K + SKEW_UINT8) * 2, - M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * - (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(int)) - }; - - printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); - - int alpha = 1; - int beta = 1; - - cudaEvent_t start, stop; - - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start)); - - // If enough shared memory available on the GPU use high performant kernel - if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { - printf("Computing... 
using high performance kernel compute_gemm_imma \n"); - - checkCudaErrors(cudaFuncSetAttribute( - compute_gemm_imma, cudaFuncAttributeMaxDynamicSharedMemorySize, - SHMEM_SZ)); - checkKernelErrors( - (compute_gemm_imma<<>>(A, B, C, D, alpha, beta))); -#if CPU_DEBUG - checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL, - cudaMemcpyDeviceToHost)); -#endif - } else { - dim3 gridDim; - dim3 blockDim; - - // blockDim.x must be a multiple of warpSize - // 128x4 means we have 16 warps and a block computes a 64x64 output tile - blockDim.x = 128; - blockDim.y = 4; - - gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / - (WMMA_M * blockDim.x / 32); - gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); - - printf("Computing... using simple_wmma_gemm_imma kernel\n"); - simple_wmma_gemm_imma<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, - K_GLOBAL, alpha, beta); -#if CPU_DEBUG - checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL, - cudaMemcpyDeviceToHost)); -#endif - } - - checkCudaErrors(cudaEventRecord(stop)); - checkCudaErrors(cudaEventSynchronize(stop)); - -#if CPU_DEBUG - printf("Verifying correctness of the computations...\n"); - - memcpy(result_host, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL); - - matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, - K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); - - for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { - if (abs(result_hD[i] - result_host[i]) > 0) { - printf("mismatch i=%d result_hD=%d result_host=%d\n", i, result_hD[i], - result_host[i]); + // Tensor cores require a GPU of Volta (SM72) architecture or higher. + if (deviceProp.major < 7 || (deviceProp.major <= 7 && deviceProp.minor < 2)) { + printf("immaTensorCoreGemm requires SM 7.2 or higher to use Tensor Cores. 
" + "Exiting...\n"); + exit(EXIT_WAIVED); } - } - free(result_host); - free(result_hD); + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + uint8_t *A_h = NULL; + uint8_t *B_h = NULL; + int *C_h = NULL; +#if CPU_DEBUG + int *result_hD = NULL; + int *result_host = NULL; #endif - float milliseconds = 0; + A_h = (uint8_t *)malloc(sizeof(uint8_t) * M_GLOBAL * K_GLOBAL); + B_h = (uint8_t *)malloc(sizeof(uint8_t) * K_GLOBAL * N_GLOBAL); + C_h = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); + result_host = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); +#endif - checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); + uint8_t *A = NULL; + uint8_t *B = NULL; + int *C = NULL; + int *D = NULL; + + checkCudaErrors(cudaMalloc(reinterpret_cast(&A), sizeof(uint8_t) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&B), sizeof(uint8_t) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&C), sizeof(int) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&D), sizeof(int) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(uint8_t) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(uint8_t) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(int) * M_GLOBAL * N_GLOBAL)); + + printf("Preparing data for GPU...\n"); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + enum { + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache + // per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the + // maximum of those + // two numbers. + SHMEM_SZ = MAX(sizeof(uint8_t) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_UINT8) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(int)) + }; + + printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); + + int alpha = 1; + int beta = 1; + + cudaEvent_t start, stop; + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start)); + + // If enough shared memory available on the GPU use high performant kernel + if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { + printf("Computing... 
using high performance kernel compute_gemm_imma \n"); + + checkCudaErrors(cudaFuncSetAttribute(compute_gemm_imma, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_gemm_imma<<>>( + A, B, C, D, alpha, beta))); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + else { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multiple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); + + printf("Computing... using simple_wmma_gemm_imma kernel\n"); + simple_wmma_gemm_imma<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } + + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); + + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (abs(result_hD[i] - result_host[i]) > 0) { + printf("mismatch i=%d result_hD=%d result_host=%d\n", i, result_hD[i], result_host[i]); + } + } + free(result_host); + free(result_hD); +#endif + + float milliseconds = 0; + + checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); printf("Time: %f ms\n", milliseconds); - printf("TOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + printf("TOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12); - free(A_h); - free(B_h); - free(C_h); - checkCudaErrors(cudaFree(reinterpret_cast(A))); - checkCudaErrors(cudaFree(reinterpret_cast(B))); - checkCudaErrors(cudaFree(reinterpret_cast(C))); - checkCudaErrors(cudaFree(reinterpret_cast(D))); + free(A_h); + free(B_h); + free(C_h); + checkCudaErrors(cudaFree(reinterpret_cast(A))); + checkCudaErrors(cudaFree(reinterpret_cast(B))); + checkCudaErrors(cudaFree(reinterpret_cast(C))); + checkCudaErrors(cudaFree(reinterpret_cast(D))); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu b/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu index dd83b4bb..c719973b 100644 --- a/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu +++ b/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu @@ -29,6 +29,7 @@ #include #include #include + #include "jacobi.h" namespace cg = cooperative_groups; @@ -39,355 +40,356 @@ namespace cg = cooperative_groups; #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 #else -__device__ double atomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +__device__ double atomicAdd(double *address, double val) +{ + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); + do { + assumed = old; + old = atomicCAS(address_as_ull, 
assumed, __double_as_longlong(val + __longlong_as_double(assumed))); - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != - // NaN) - } while (assumed != old); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != + // NaN) + } while (assumed != old); - return __longlong_as_double(old); + return __longlong_as_double(old); } #endif -static __global__ void JacobiMethod(const float *A, const double *b, - const float conv_threshold, double *x, - double *x_new, double *sum) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ double x_shared[N_ROWS]; // N_ROWS == n - __shared__ double b_shared[ROWS_PER_CTA + 1]; +static __global__ void +JacobiMethod(const float *A, const double *b, const float conv_threshold, double *x, double *x_new, double *sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ double x_shared[N_ROWS]; // N_ROWS == n + __shared__ double b_shared[ROWS_PER_CTA + 1]; - for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) { - x_shared[i] = x[i]; - } - - if (threadIdx.x < ROWS_PER_CTA) { - int k = threadIdx.x; - for (int i = k + (blockIdx.x * ROWS_PER_CTA); - (k < ROWS_PER_CTA) && (i < N_ROWS); - k += ROWS_PER_CTA, i += ROWS_PER_CTA) { - b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; - } - } - - cg::sync(cta); - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; - (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) { - double rowThreadSum = 0.0; - for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) { - rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); + for (int i = threadIdx.x; i < N_ROWS; i += blockDim.x) { + x_shared[i] = x[i]; } - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - rowThreadSum += tile32.shfl_down(rowThreadSum, offset); + if (threadIdx.x < ROWS_PER_CTA) { + int k = threadIdx.x; + for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) { + b_shared[i % (ROWS_PER_CTA + 1)] = b[i]; + } } - if (tile32.thread_rank() == 0) { - atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); - } - } + cg::sync(cta); - cg::sync(cta); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - if (threadIdx.x < ROWS_PER_CTA) { - cg::thread_block_tile tile8 = - cg::tiled_partition(cta); - double temp_sum = 0.0; + for (int k = 0, i = blockIdx.x * ROWS_PER_CTA; (k < ROWS_PER_CTA) && (i < N_ROWS); k++, i++) { + double rowThreadSum = 0.0; + for (int j = threadIdx.x; j < N_ROWS; j += blockDim.x) { + rowThreadSum += (A[i * N_ROWS + j] * x_shared[j]); + } - int k = threadIdx.x; + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + rowThreadSum += tile32.shfl_down(rowThreadSum, offset); + } - for (int i = k + (blockIdx.x * ROWS_PER_CTA); - (k < ROWS_PER_CTA) && (i < N_ROWS); - k += ROWS_PER_CTA, i += ROWS_PER_CTA) { - double dx = b_shared[i % (ROWS_PER_CTA + 1)]; - dx /= A[i * N_ROWS + i]; - - x_new[i] = (x_shared[i] + dx); - temp_sum += fabs(dx); + if (tile32.thread_rank() == 0) { + atomicAdd(&b_shared[i % (ROWS_PER_CTA + 1)], -rowThreadSum); + } } - for (int offset = tile8.size() / 2; offset > 0; offset /= 2) { - temp_sum += tile8.shfl_down(temp_sum, offset); - } + cg::sync(cta); - if (tile8.thread_rank() == 0) { - atomicAdd(sum, temp_sum); + if (threadIdx.x < ROWS_PER_CTA) { + cg::thread_block_tile tile8 = cg::tiled_partition(cta); + double temp_sum = 0.0; + + int k = 
threadIdx.x; + + for (int i = k + (blockIdx.x * ROWS_PER_CTA); (k < ROWS_PER_CTA) && (i < N_ROWS); + k += ROWS_PER_CTA, i += ROWS_PER_CTA) { + double dx = b_shared[i % (ROWS_PER_CTA + 1)]; + dx /= A[i * N_ROWS + i]; + + x_new[i] = (x_shared[i] + dx); + temp_sum += fabs(dx); + } + + for (int offset = tile8.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile8.shfl_down(temp_sum, offset); + } + + if (tile8.thread_rank() == 0) { + atomicAdd(sum, temp_sum); + } } - } } // Thread block size for finalError kernel should be multiple of 32 -static __global__ void finalError(double *x, double *g_sum) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - extern __shared__ double warpSum[]; - double sum = 0.0; +static __global__ void finalError(double *x, double *g_sum) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ double warpSum[]; + double sum = 0.0; - int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) { - double d = x[i] - 1.0; - sum += fabs(d); - } + for (int i = globalThreadId; i < N_ROWS; i += blockDim.x * gridDim.x) { + double d = x[i] - 1.0; + sum += fabs(d); + } - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - sum += tile32.shfl_down(sum, offset); - } - - if (tile32.thread_rank() == 0) { - warpSum[threadIdx.x / warpSize] = sum; - } - - cg::sync(cta); - - double blockSum = 0.0; - if (threadIdx.x < (blockDim.x / warpSize)) { - blockSum = warpSum[threadIdx.x]; - } - - if (threadIdx.x < 32) { for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - blockSum += tile32.shfl_down(blockSum, offset); + sum += tile32.shfl_down(sum, offset); } + if (tile32.thread_rank() == 0) { - atomicAdd(g_sum, blockSum); + warpSum[threadIdx.x / warpSize] = sum; } - } -} -double JacobiMethodGpuCudaGraphExecKernelSetParams( - const float *A, const double *b, const float conv_threshold, - const int max_iter, double *x, double *x_new, cudaStream_t stream) { - // CTA size - dim3 nthreads(256, 1, 1); - // grid size - dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); - cudaGraph_t graph; - cudaGraphExec_t graphExec = NULL; + cg::sync(cta); - double sum = 0.0; - double *d_sum = NULL; - checkCudaErrors(cudaMalloc(&d_sum, sizeof(double))); - - std::vector nodeDependencies; - cudaGraphNode_t memcpyNode, jacobiKernelNode, memsetNode; - cudaMemcpy3DParms memcpyParams = {0}; - cudaMemsetParams memsetParams = {0}; - - memsetParams.dst = (void *)d_sum; - memsetParams.value = 0; - memsetParams.pitch = 0; - // elementSize can be max 4 bytes, so we take sizeof(float) and width=2 - memsetParams.elementSize = sizeof(float); - memsetParams.width = 2; - memsetParams.height = 1; - - checkCudaErrors(cudaGraphCreate(&graph, 0)); - checkCudaErrors( - cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); - nodeDependencies.push_back(memsetNode); - - cudaKernelNodeParams NodeParams0, NodeParams1; - NodeParams0.func = (void *)JacobiMethod; - NodeParams0.gridDim = nblocks; - NodeParams0.blockDim = nthreads; - NodeParams0.sharedMemBytes = 0; - void *kernelArgs0[6] = {(void *)&A, (void *)&b, (void *)&conv_threshold, - (void *)&x, (void *)&x_new, (void *)&d_sum}; - NodeParams0.kernelParams = kernelArgs0; - NodeParams0.extra = NULL; - - 
checkCudaErrors( - cudaGraphAddKernelNode(&jacobiKernelNode, graph, nodeDependencies.data(), - nodeDependencies.size(), &NodeParams0)); - - nodeDependencies.clear(); - nodeDependencies.push_back(jacobiKernelNode); - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = make_cudaPitchedPtr(d_sum, sizeof(double), 1, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = make_cudaPitchedPtr(&sum, sizeof(double), 1, 1); - memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); - memcpyParams.kind = cudaMemcpyDeviceToHost; - - checkCudaErrors( - cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), - nodeDependencies.size(), &memcpyParams)); - - checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - - NodeParams1.func = (void *)JacobiMethod; - NodeParams1.gridDim = nblocks; - NodeParams1.blockDim = nthreads; - NodeParams1.sharedMemBytes = 0; - void *kernelArgs1[6] = {(void *)&A, (void *)&b, (void *)&conv_threshold, - (void *)&x_new, (void *)&x, (void *)&d_sum}; - NodeParams1.kernelParams = kernelArgs1; - NodeParams1.extra = NULL; - - int k = 0; - for (k = 0; k < max_iter; k++) { - checkCudaErrors(cudaGraphExecKernelNodeSetParams( - graphExec, jacobiKernelNode, - ((k & 1) == 0) ? &NodeParams0 : &NodeParams1)); - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - if (sum <= conv_threshold) { - checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - nblocks.x = (N_ROWS / nthreads.x) + 1; - size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); - if ((k & 1) == 0) { - finalError<<>>(x_new, d_sum); - } else { - finalError<<>>(x, d_sum); - } - - checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("GPU iterations : %d\n", k + 1); - printf("GPU error : %.3e\n", sum); - break; + double blockSum = 0.0; + if (threadIdx.x < (blockDim.x / warpSize)) { + blockSum = warpSum[threadIdx.x]; } - } - checkCudaErrors(cudaFree(d_sum)); - return sum; -} - -double JacobiMethodGpuCudaGraphExecUpdate(const float *A, const double *b, - const float conv_threshold, - const int max_iter, double *x, - double *x_new, cudaStream_t stream) { - // CTA size - dim3 nthreads(256, 1, 1); - // grid size - dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); - cudaGraph_t graph; - cudaGraphExec_t graphExec = NULL; - - double sum = 0.0; - double *d_sum; - checkCudaErrors(cudaMalloc(&d_sum, sizeof(double))); - - int k = 0; - for (k = 0; k < max_iter; k++) { - checkCudaErrors( - cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - if ((k & 1) == 0) { - JacobiMethod<<>>(A, b, conv_threshold, x, - x_new, d_sum); - } else { - JacobiMethod<<>>(A, b, conv_threshold, - x_new, x, d_sum); - } - checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamEndCapture(stream, &graph)); - - if (graphExec == NULL) { - checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - } else { - cudaGraphExecUpdateResult updateResult_out; - checkCudaErrors( - cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out)); - if (updateResult_out != cudaGraphExecUpdateSuccess) { - if (graphExec != NULL) { - checkCudaErrors(cudaGraphExecDestroy(graphExec)); + if (threadIdx.x < 32) { + for 
(int offset = tile32.size() / 2; offset > 0; offset /= 2) { + blockSum += tile32.shfl_down(blockSum, offset); + } + if (tile32.thread_rank() == 0) { + atomicAdd(g_sum, blockSum); } - printf("k = %d graph update failed with error - %d\n", k, - updateResult_out); - checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - } } - checkCudaErrors(cudaGraphLaunch(graphExec, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - if (sum <= conv_threshold) { - checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - nblocks.x = (N_ROWS / nthreads.x) + 1; - size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); - if ((k & 1) == 0) { - finalError<<>>(x_new, d_sum); - } else { - finalError<<>>(x, d_sum); - } - - checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("GPU iterations : %d\n", k + 1); - printf("GPU error : %.3e\n", sum); - break; - } - } - - checkCudaErrors(cudaFree(d_sum)); - return sum; } -double JacobiMethodGpu(const float *A, const double *b, - const float conv_threshold, const int max_iter, - double *x, double *x_new, cudaStream_t stream) { - // CTA size - dim3 nthreads(256, 1, 1); - // grid size - dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); +double JacobiMethodGpuCudaGraphExecKernelSetParams(const float *A, + const double *b, + const float conv_threshold, + const int max_iter, + double *x, + double *x_new, + cudaStream_t stream) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + cudaGraph_t graph; + cudaGraphExec_t graphExec = NULL; - double sum = 0.0; - double *d_sum; - checkCudaErrors(cudaMalloc(&d_sum, sizeof(double))); - int k = 0; + double sum = 0.0; + double *d_sum = NULL; + checkCudaErrors(cudaMalloc(&d_sum, sizeof(double))); - for (k = 0; k < max_iter; k++) { - checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - if ((k & 1) == 0) { - JacobiMethod<<>>(A, b, conv_threshold, x, - x_new, d_sum); - } else { - JacobiMethod<<>>(A, b, conv_threshold, - x_new, x, d_sum); + std::vector nodeDependencies; + cudaGraphNode_t memcpyNode, jacobiKernelNode, memsetNode; + cudaMemcpy3DParms memcpyParams = {0}; + cudaMemsetParams memsetParams = {0}; + + memsetParams.dst = (void *)d_sum; + memsetParams.value = 0; + memsetParams.pitch = 0; + // elementSize can be max 4 bytes, so we take sizeof(float) and width=2 + memsetParams.elementSize = sizeof(float); + memsetParams.width = 2; + memsetParams.height = 1; + + checkCudaErrors(cudaGraphCreate(&graph, 0)); + checkCudaErrors(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); + nodeDependencies.push_back(memsetNode); + + cudaKernelNodeParams NodeParams0, NodeParams1; + NodeParams0.func = (void *)JacobiMethod; + NodeParams0.gridDim = nblocks; + NodeParams0.blockDim = nthreads; + NodeParams0.sharedMemBytes = 0; + void *kernelArgs0[6] = { + (void *)&A, (void *)&b, (void *)&conv_threshold, (void *)&x, (void *)&x_new, (void *)&d_sum}; + NodeParams0.kernelParams = kernelArgs0; + NodeParams0.extra = NULL; + + checkCudaErrors(cudaGraphAddKernelNode( + &jacobiKernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &NodeParams0)); + + nodeDependencies.clear(); + nodeDependencies.push_back(jacobiKernelNode); + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(d_sum, sizeof(double), 1, 1); + memcpyParams.dstArray = NULL; + 
memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&sum, sizeof(double), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + + checkCudaErrors( + cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams)); + + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + NodeParams1.func = (void *)JacobiMethod; + NodeParams1.gridDim = nblocks; + NodeParams1.blockDim = nthreads; + NodeParams1.sharedMemBytes = 0; + void *kernelArgs1[6] = { + (void *)&A, (void *)&b, (void *)&conv_threshold, (void *)&x_new, (void *)&x, (void *)&d_sum}; + NodeParams1.kernelParams = kernelArgs1; + NodeParams1.extra = NULL; + + int k = 0; + for (k = 0; k < max_iter; k++) { + checkCudaErrors(cudaGraphExecKernelNodeSetParams( + graphExec, jacobiKernelNode, ((k & 1) == 0) ? &NodeParams0 : &NodeParams1)); + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + if (sum <= conv_threshold) { + checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + if ((k & 1) == 0) { + finalError<<>>(x_new, d_sum); + } + else { + finalError<<>>(x, d_sum); + } + + checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("GPU iterations : %d\n", k + 1); + printf("GPU error : %.3e\n", sum); + break; + } } - checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - if (sum <= conv_threshold) { - checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); - nblocks.x = (N_ROWS / nthreads.x) + 1; - size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); - if ((k & 1) == 0) { - finalError<<>>(x_new, d_sum); - } else { - finalError<<>>(x, d_sum); - } - - checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("GPU iterations : %d\n", k + 1); - printf("GPU error : %.3e\n", sum); - break; - } - } - - checkCudaErrors(cudaFree(d_sum)); - return sum; + checkCudaErrors(cudaFree(d_sum)); + return sum; +} + +double JacobiMethodGpuCudaGraphExecUpdate(const float *A, + const double *b, + const float conv_threshold, + const int max_iter, + double *x, + double *x_new, + cudaStream_t stream) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + cudaGraph_t graph; + cudaGraphExec_t graphExec = NULL; + + double sum = 0.0; + double *d_sum; + checkCudaErrors(cudaMalloc(&d_sum, sizeof(double))); + + int k = 0; + for (k = 0; k < max_iter; k++) { + checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + if ((k & 1) == 0) { + JacobiMethod<<>>(A, b, conv_threshold, x, x_new, d_sum); + } + else { + JacobiMethod<<>>(A, b, conv_threshold, x_new, x, d_sum); + } + checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamEndCapture(stream, &graph)); + + if (graphExec == NULL) { + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + } + else { + cudaGraphExecUpdateResult updateResult_out; + 
checkCudaErrors(cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out)); + if (updateResult_out != cudaGraphExecUpdateSuccess) { + if (graphExec != NULL) { + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + } + printf("k = %d graph update failed with error - %d\n", k, updateResult_out); + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + } + } + checkCudaErrors(cudaGraphLaunch(graphExec, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + if (sum <= conv_threshold) { + checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + if ((k & 1) == 0) { + finalError<<>>(x_new, d_sum); + } + else { + finalError<<>>(x, d_sum); + } + + checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("GPU iterations : %d\n", k + 1); + printf("GPU error : %.3e\n", sum); + break; + } + } + + checkCudaErrors(cudaFree(d_sum)); + return sum; +} + +double JacobiMethodGpu(const float *A, + const double *b, + const float conv_threshold, + const int max_iter, + double *x, + double *x_new, + cudaStream_t stream) +{ + // CTA size + dim3 nthreads(256, 1, 1); + // grid size + dim3 nblocks((N_ROWS / ROWS_PER_CTA) + 2, 1, 1); + + double sum = 0.0; + double *d_sum; + checkCudaErrors(cudaMalloc(&d_sum, sizeof(double))); + int k = 0; + + for (k = 0; k < max_iter; k++) { + checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + if ((k & 1) == 0) { + JacobiMethod<<>>(A, b, conv_threshold, x, x_new, d_sum); + } + else { + JacobiMethod<<>>(A, b, conv_threshold, x_new, x, d_sum); + } + checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + if (sum <= conv_threshold) { + checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream)); + nblocks.x = (N_ROWS / nthreads.x) + 1; + size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double); + if ((k & 1) == 0) { + finalError<<>>(x_new, d_sum); + } + else { + finalError<<>>(x, d_sum); + } + + checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("GPU iterations : %d\n", k + 1); + printf("GPU error : %.3e\n", sum); + break; + } + } + + checkCudaErrors(cudaFree(d_sum)); + return sum; } diff --git a/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.h b/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.h index fd1104ac..8b73b811 100644 --- a/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.h +++ b/Samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.h @@ -30,4 +30,4 @@ #define N_ROWS 512 -#endif \ No newline at end of file +#endif diff --git a/Samples/3_CUDA_Features/jacobiCudaGraphs/main.cpp b/Samples/3_CUDA_Features/jacobiCudaGraphs/main.cpp index 12dcd5c2..b17d8815 100644 --- a/Samples/3_CUDA_Features/jacobiCudaGraphs/main.cpp +++ b/Samples/3_CUDA_Features/jacobiCudaGraphs/main.cpp @@ -43,24 +43,37 @@ #include #include #include + #include "jacobi.h" // Run the Jacobi method for A*x = b on GPU with CUDA Graph - // cudaGraphExecKernelNodeSetParams(). 
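Aside: the three entry points declared below differ only in how each Jacobi iteration is resubmitted — swapping kernel-node parameters on an instantiated graph, re-capturing the stream and updating the executable graph, or plain stream launches. A minimal sketch of the capture-and-update pattern, using the same pre-CUDA-12 signatures as the code above (iterKernel and its launch configuration are placeholders, not part of the sample):

    __global__ void iterKernel(int k); // placeholder kernel, not part of the sample

    cudaGraph_t     graph;
    cudaGraphExec_t graphExec = NULL;
    for (int k = 0; k < max_iter; k++) {
        checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        iterKernel<<<nblocks, nthreads, 0, stream>>>(k); // same topology every iteration
        checkCudaErrors(cudaStreamEndCapture(stream, &graph));
        if (graphExec == NULL) {
            checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
        }
        else {
            cudaGraphExecUpdateResult updateResult;
            // A topology change rejects the update; discard the executable graph
            // and re-instantiate instead of failing the run.
            if (cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult) != cudaSuccess
                || updateResult != cudaGraphExecUpdateSuccess) {
                cudaGetLastError(); // clear the update error
                checkCudaErrors(cudaGraphExecDestroy(graphExec));
                checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
            }
        }
        checkCudaErrors(cudaGraphLaunch(graphExec, stream));
        checkCudaErrors(cudaGraphDestroy(graph)); // the executable graph keeps what it needs
        checkCudaErrors(cudaStreamSynchronize(stream));
    }

With a stable topology, cudaGraphExecUpdate amortizes instantiation cost across iterations; JacobiMethodGpuCudaGraphExecKernelSetParams goes further and avoids re-capture entirely by toggling only the kernel arguments each iteration.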
-extern double JacobiMethodGpuCudaGraphExecKernelSetParams(
-    const float *A, const double *b, const float conv_threshold,
-    const int max_iter, double *x, double *x_new, cudaStream_t stream);
+extern double JacobiMethodGpuCudaGraphExecKernelSetParams(const float *A,
+                                                          const double *b,
+                                                          const float conv_threshold,
+                                                          const int max_iter,
+                                                          double *x,
+                                                          double *x_new,
+                                                          cudaStream_t stream);

 // Run the Jacobi method for A*x = b on GPU with Instantiated CUDA Graph Update
 // API - cudaGraphExecUpdate().
-extern double JacobiMethodGpuCudaGraphExecUpdate(
-    const float *A, const double *b, const float conv_threshold,
-    const int max_iter, double *x, double *x_new, cudaStream_t stream);
+extern double JacobiMethodGpuCudaGraphExecUpdate(const float *A,
+                                                 const double *b,
+                                                 const float conv_threshold,
+                                                 const int max_iter,
+                                                 double *x,
+                                                 double *x_new,
+                                                 cudaStream_t stream);

 // Run the Jacobi method for A*x = b on GPU without CUDA Graph.
-extern double JacobiMethodGpu(const float *A, const double *b,
-                              const float conv_threshold, const int max_iter,
-                              double *x, double *x_new, cudaStream_t stream);
+extern double JacobiMethodGpu(const float *A,
+                              const double *b,
+                              const float conv_threshold,
+                              const int max_iter,
+                              double *x,
+                              double *x_new,
+                              cudaStream_t stream);

 // creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1
 // elsewhere. The elements of the right hand side b all equal 2*n, hence the
@@ -68,148 +81,146 @@ extern double JacobiMethodGpu(const float *A, const double *b,
 void createLinearSystem(float *A, double *b);
 // Run the Jacobi method for A*x = b on CPU.
-void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter,
-                     int *numit, double *x);
+void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, int *numit, double *x);

-int main(int argc, char **argv) {
-  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
-    printf("Command line: jacobiCudaGraphs [-option]\n");
-    printf("Valid options:\n");
-    printf(
-        "-gpumethod=<0,1 or 2> : 0 - [Default] "
-        "JacobiMethodGpuCudaGraphExecKernelSetParams\n");
-    printf("                      : 1 - JacobiMethodGpuCudaGraphExecUpdate\n");
-    printf("                      : 2 - JacobiMethodGpu - Non CUDA Graph\n");
-    printf("-device=device_num : cuda device id");
-    printf("-help : Output a help message\n");
-    exit(EXIT_SUCCESS);
-  }
-
-  int gpumethod = 0;
-  if (checkCmdLineFlag(argc, (const char **)argv, "gpumethod")) {
-    gpumethod = getCmdLineArgumentInt(argc, (const char **)argv, "gpumethod");
-
-    if (gpumethod < 0 || gpumethod > 2) {
-      printf("Error: gpumethod must be 0 or 1 or 2, gpumethod=%d is invalid\n",
-             gpumethod);
-      exit(EXIT_SUCCESS);
+int main(int argc, char **argv)
+{
+    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+        printf("Command line: jacobiCudaGraphs [-option]\n");
+        printf("Valid options:\n");
+        printf("-gpumethod=<0,1 or 2> : 0 - [Default] "
+               "JacobiMethodGpuCudaGraphExecKernelSetParams\n");
+        printf("                      : 1 - JacobiMethodGpuCudaGraphExecUpdate\n");
+        printf("                      : 2 - JacobiMethodGpu - Non CUDA Graph\n");
+        printf("-device=device_num : cuda device id\n");
+        printf("-help : Output a help message\n");
+        exit(EXIT_SUCCESS);
    }
-  }

-  int dev = findCudaDevice(argc, (const char **)argv);
+    int gpumethod = 0;
+    if (checkCmdLineFlag(argc, (const char **)argv, "gpumethod")) {
+        gpumethod = getCmdLineArgumentInt(argc, (const char **)argv, "gpumethod");

-  double *b = NULL;
-  float *A = NULL;
-  checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double)));
-  memset(b, 0, N_ROWS * sizeof(double));
-  
checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); - memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); + if (gpumethod < 0 || gpumethod > 2) { + printf("Error: gpumethod must be 0 or 1 or 2, gpumethod=%d is invalid\n", gpumethod); + exit(EXIT_SUCCESS); + } + } - createLinearSystem(A, b); - double *x = NULL; - // start with array of all zeroes - x = (double *)calloc(N_ROWS, sizeof(double)); + int dev = findCudaDevice(argc, (const char **)argv); - float conv_threshold = 1.0e-2; - int max_iter = 4 * N_ROWS * N_ROWS; - int cnt = 0; + double *b = NULL; + float *A = NULL; + checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double))); + memset(b, 0, N_ROWS * sizeof(double)); + checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); + memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); - // create timer - StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; - sdkCreateTimer(&timerCPU); + createLinearSystem(A, b); + double *x = NULL; + // start with array of all zeroes + x = (double *)calloc(N_ROWS, sizeof(double)); - sdkStartTimer(&timerCPU); - JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x); + float conv_threshold = 1.0e-2; + int max_iter = 4 * N_ROWS * N_ROWS; + int cnt = 0; - double sum = 0.0; - // Compute error - for (int i = 0; i < N_ROWS; i++) { - double d = x[i] - 1.0; - sum += fabs(d); - } - sdkStopTimer(&timerCPU); - printf("CPU iterations : %d\n", cnt); - printf("CPU error : %.3e\n", sum); - printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); + // create timer + StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; + sdkCreateTimer(&timerCPU); - float *d_A; - double *d_b, *d_x, *d_x_new; - cudaStream_t stream1; - checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)); - checkCudaErrors(cudaMalloc(&d_b, sizeof(double) * N_ROWS)); - checkCudaErrors(cudaMalloc(&d_A, sizeof(float) * N_ROWS * N_ROWS)); - checkCudaErrors(cudaMalloc(&d_x, sizeof(double) * N_ROWS)); - checkCudaErrors(cudaMalloc(&d_x_new, sizeof(double) * N_ROWS)); + sdkStartTimer(&timerCPU); + JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x); - checkCudaErrors(cudaMemsetAsync(d_x, 0, sizeof(double) * N_ROWS, stream1)); - checkCudaErrors( - cudaMemsetAsync(d_x_new, 0, sizeof(double) * N_ROWS, stream1)); - checkCudaErrors(cudaMemcpyAsync(d_A, A, sizeof(float) * N_ROWS * N_ROWS, - cudaMemcpyHostToDevice, stream1)); - checkCudaErrors(cudaMemcpyAsync(d_b, b, sizeof(double) * N_ROWS, - cudaMemcpyHostToDevice, stream1)); - - sdkCreateTimer(&timerGpu); - sdkStartTimer(&timerGpu); - - double sumGPU = 0.0; - if (gpumethod == 0) { - sumGPU = JacobiMethodGpuCudaGraphExecKernelSetParams( - d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); - } else if (gpumethod == 1) { - sumGPU = JacobiMethodGpuCudaGraphExecUpdate( - d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); - } else if (gpumethod == 2) { - sumGPU = JacobiMethodGpu(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, - stream1); - } - - sdkStopTimer(&timerGpu); - printf("GPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerGpu)); - - checkCudaErrors(cudaFree(d_b)); - checkCudaErrors(cudaFree(d_A)); - checkCudaErrors(cudaFree(d_x)); - checkCudaErrors(cudaFree(d_x_new)); - - checkCudaErrors(cudaFreeHost(A)); - checkCudaErrors(cudaFreeHost(b)); - - printf("&&&& jacobiCudaGraphs %s\n", - (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); - - return (fabs(sum - sumGPU) < conv_threshold) ? 
EXIT_SUCCESS : EXIT_FAILURE; -} - -void createLinearSystem(float *A, double *b) { - int i, j; - for (i = 0; i < N_ROWS; i++) { - b[i] = 2.0 * N_ROWS; - for (j = 0; j < N_ROWS; j++) A[i * N_ROWS + j] = 1.0; - A[i * N_ROWS + i] = N_ROWS + 1.0; - } -} - -void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, - int *num_iter, double *x) { - double *x_new; - x_new = (double *)calloc(N_ROWS, sizeof(double)); - int k; - - for (k = 0; k < max_iter; k++) { double sum = 0.0; + // Compute error for (int i = 0; i < N_ROWS; i++) { - double temp_dx = b[i]; - for (int j = 0; j < N_ROWS; j++) temp_dx -= A[i * N_ROWS + j] * x[j]; - temp_dx /= A[i * N_ROWS + i]; - x_new[i] += temp_dx; - sum += fabs(temp_dx); + double d = x[i] - 1.0; + sum += fabs(d); + } + sdkStopTimer(&timerCPU); + printf("CPU iterations : %d\n", cnt); + printf("CPU error : %.3e\n", sum); + printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); + + float *d_A; + double *d_b, *d_x, *d_x_new; + cudaStream_t stream1; + checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)); + checkCudaErrors(cudaMalloc(&d_b, sizeof(double) * N_ROWS)); + checkCudaErrors(cudaMalloc(&d_A, sizeof(float) * N_ROWS * N_ROWS)); + checkCudaErrors(cudaMalloc(&d_x, sizeof(double) * N_ROWS)); + checkCudaErrors(cudaMalloc(&d_x_new, sizeof(double) * N_ROWS)); + + checkCudaErrors(cudaMemsetAsync(d_x, 0, sizeof(double) * N_ROWS, stream1)); + checkCudaErrors(cudaMemsetAsync(d_x_new, 0, sizeof(double) * N_ROWS, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_A, A, sizeof(float) * N_ROWS * N_ROWS, cudaMemcpyHostToDevice, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_b, b, sizeof(double) * N_ROWS, cudaMemcpyHostToDevice, stream1)); + + sdkCreateTimer(&timerGpu); + sdkStartTimer(&timerGpu); + + double sumGPU = 0.0; + if (gpumethod == 0) { + sumGPU = JacobiMethodGpuCudaGraphExecKernelSetParams(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); + } + else if (gpumethod == 1) { + sumGPU = JacobiMethodGpuCudaGraphExecUpdate(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); + } + else if (gpumethod == 2) { + sumGPU = JacobiMethodGpu(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); } - for (int i = 0; i < N_ROWS; i++) x[i] = x_new[i]; + sdkStopTimer(&timerGpu); + printf("GPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerGpu)); - if (sum <= conv_threshold) break; - } - *num_iter = k + 1; - free(x_new); + checkCudaErrors(cudaFree(d_b)); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_x)); + checkCudaErrors(cudaFree(d_x_new)); + + checkCudaErrors(cudaFreeHost(A)); + checkCudaErrors(cudaFreeHost(b)); + + printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); + + return (fabs(sum - sumGPU) < conv_threshold) ? 
EXIT_SUCCESS : EXIT_FAILURE; +} + +void createLinearSystem(float *A, double *b) +{ + int i, j; + for (i = 0; i < N_ROWS; i++) { + b[i] = 2.0 * N_ROWS; + for (j = 0; j < N_ROWS; j++) + A[i * N_ROWS + j] = 1.0; + A[i * N_ROWS + i] = N_ROWS + 1.0; + } +} + +void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, int *num_iter, double *x) +{ + double *x_new; + x_new = (double *)calloc(N_ROWS, sizeof(double)); + int k; + + for (k = 0; k < max_iter; k++) { + double sum = 0.0; + for (int i = 0; i < N_ROWS; i++) { + double temp_dx = b[i]; + for (int j = 0; j < N_ROWS; j++) + temp_dx -= A[i * N_ROWS + j] * x[j]; + temp_dx /= A[i * N_ROWS + i]; + x_new[i] += temp_dx; + sum += fabs(temp_dx); + } + + for (int i = 0; i < N_ROWS; i++) + x[i] = x_new[i]; + + if (sum <= conv_threshold) + break; + } + *num_iter = k + 1; + free(x_new); } diff --git a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp index 541692c5..7811a9c0 100644 --- a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp +++ b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp @@ -30,15 +30,16 @@ * using cuMemMap APIs and with one process per GPU for computation. */ -#include #include #include -#include "cuda.h" +#include +#include "cuda.h" #include "helper_multiprocess.h" // includes, project #include + #include "helper_cuda_drvapi.h" // includes, CUDA @@ -52,15 +53,16 @@ using namespace std; #define MAX_DEVICES (32) #define PROCESSES_PER_DEVICE 1 -#define DATA_BUF_SIZE 4ULL * 1024ULL * 1024ULL +#define DATA_BUF_SIZE 4ULL * 1024ULL * 1024ULL static const char ipcName[] = "memmap_ipc_pipe"; static const char shmName[] = "memmap_ipc_shm"; -typedef struct shmStruct_st { - size_t nprocesses; - int barrier; - int sense; +typedef struct shmStruct_st +{ + size_t nprocesses; + int barrier; + int sense; } shmStruct; bool findModulePath(const char *, string &, char **, string &); @@ -80,8 +82,7 @@ bool findModulePath(const char *, string &, char **, string &); // ipcHandleTypeFlag variable is a convenience variable and is passed by value // to individual requests. 
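Aside on the DATA_BUF_SIZE definition this hunk keeps: the macro body is unparenthesized. The uses visible in this diff (procCount * DATA_BUF_SIZE, bufferId * DATA_BUF_SIZE) group correctly because multiplication associates left to right, but a hypothetical bytes / DATA_BUF_SIZE would parse as ((bytes / 4ULL) * 1024ULL) * 1024ULL. If this macro is ever edited again, the conventional guard is:

    #define DATA_BUF_SIZE (4ULL * 1024ULL * 1024ULL)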
#if defined(__linux__) || defined(__QNX__) -CUmemAllocationHandleType ipcHandleTypeFlag = - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #else CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32; #endif @@ -94,554 +95,539 @@ CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32; #error Unsupported system #endif -CUmodule cuModule; +CUmodule cuModule; CUfunction _memMapIpc_kernel; -static void barrierWait(volatile int *barrier, volatile int *sense, - unsigned int n) { - int count; +static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n) +{ + int count; - // Check-in - count = cpu_atomic_add32(barrier, 1); - if (count == n) { // Last one in - *sense = 1; - } - while (!*sense) - ; + // Check-in + count = cpu_atomic_add32(barrier, 1); + if (count == n) { // Last one in + *sense = 1; + } + while (!*sense) + ; - // Check-out - count = cpu_atomic_add32(barrier, -1); - if (count == 0) { // Last one out - *sense = 0; - } - while (*sense) - ; + // Check-out + count = cpu_atomic_add32(barrier, -1); + if (count == 0) { // Last one out + *sense = 0; + } + while (*sense) + ; } // Windows-specific LPSECURITYATTRIBUTES -void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) { +void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) +{ #if defined(__linux__) || defined(__QNX__) - return; + return; #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)"; - static OBJECT_ATTRIBUTES objAttributes; - static bool objAttributesConfigured = false; + static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)"; + static OBJECT_ATTRIBUTES objAttributes; + static bool objAttributesConfigured = false; - if (!objAttributesConfigured) { - PSECURITY_DESCRIPTOR secDesc; - BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA( - sddl, SDDL_REVISION_1, &secDesc, NULL); - if (result == 0) { - printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n", - GetLastError()); + if (!objAttributesConfigured) { + PSECURITY_DESCRIPTOR secDesc; + BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(sddl, SDDL_REVISION_1, &secDesc, NULL); + if (result == 0) { + printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n", GetLastError()); + } + + InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc); + + objAttributesConfigured = true; } - InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc); - - objAttributesConfigured = true; - } - - prop->win32HandleMetaData = &objAttributes; - return; + prop->win32HandleMetaData = &objAttributes; + return; #endif } -static void memMapAllocateAndExportMemory( - unsigned char backingDevice, size_t allocSize, - std::vector &allocationHandles, - std::vector &shareableHandles) { - // This property structure describes the physical location where the memory - // will be allocated via cuMemCreate along with additional properties. - CUmemAllocationProp prop = {}; +static void memMapAllocateAndExportMemory(unsigned char backingDevice, + size_t allocSize, + std::vector &allocationHandles, + std::vector &shareableHandles) +{ + // This property structure describes the physical location where the memory + // will be allocated via cuMemCreate along with additional properties. 
+ CUmemAllocationProp prop = {}; - // The allocations will be device pinned memory backed on backingDevice and - // exportable with the specified handle type. - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + // The allocations will be device pinned memory backed on backingDevice and + // exportable with the specified handle type. + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - // Back all allocations on backingDevice. - prop.location.id = (int)backingDevice; + // Back all allocations on backingDevice. + prop.location.id = (int)backingDevice; - // Passing a requestedHandleTypes indicates intention to export this - // allocation to a platform-specific handle. This sample requests a file - // descriptor on Linux and NT Handle on Windows. - prop.requestedHandleTypes = ipcHandleTypeFlag; + // Passing a requestedHandleTypes indicates intention to export this + // allocation to a platform-specific handle. This sample requests a file + // descriptor on Linux and NT Handle on Windows. + prop.requestedHandleTypes = ipcHandleTypeFlag; - // Get the minimum granularity supported for allocation with cuMemCreate() - size_t granularity = 0; - checkCudaErrors(cuMemGetAllocationGranularity( - &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - if (allocSize % granularity) { - printf( - "Allocation size is not a multiple of minimum supported granularity " - "for this device. Exiting...\n"); - exit(EXIT_FAILURE); - } - - // Windows-specific LPSECURITYATTRIBUTES is required when - // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope - // of which exported allocations may be tranferred to other processes. For all - // other handle types, pass NULL. - getDefaultSecurityDescriptor(&prop); - - for (int i = 0; i < allocationHandles.size(); i++) { - // Create the allocation as a pinned allocation on device specified in - // prop.location.id - checkCudaErrors(cuMemCreate(&allocationHandles[i], allocSize, &prop, 0)); - - // Export the allocation to a platform-specific handle. The type of handle - // requested here must match the requestedHandleTypes field in the prop - // structure passed to cuMemCreate. - checkCudaErrors(cuMemExportToShareableHandle((void *)&shareableHandles[i], - allocationHandles[i], - ipcHandleTypeFlag, 0)); - } -} - -static void memMapImportAndMapMemory( - CUdeviceptr d_ptr, size_t mapSize, - std::vector &shareableHandles, int mapDevice) { - std::vector allocationHandles; - allocationHandles.resize(shareableHandles.size()); - - // The accessDescriptor will describe the mapping requirement for the - // mapDevice passed as argument - CUmemAccessDesc accessDescriptor; - - // Specify location for mapping the imported allocations. - accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDescriptor.location.id = mapDevice; - - // Specify both read and write accesses. - accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - - for (int i = 0; i < shareableHandles.size(); i++) { - // Import the memory allocation back into a CUDA handle from the platform - // specific handle. - checkCudaErrors(cuMemImportFromShareableHandle( - &allocationHandles[i], (void *)(uintptr_t)shareableHandles[i], - ipcHandleTypeFlag)); - - // Assign the chunk to the appropriate VA range and release the handle. - // After mapping the memory, it can be referenced by virtual address. 
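For orientation while the -/+ hunks of several functions interleave, this is the driver-API virtual-memory lifecycle that memMapAllocateAndExportMemory, memMapImportAndMapMemory, and memMapUnmapAndFreeMemory implement between them, condensed into call order (a sketch, not sample code; prop and accessDesc are populated exactly as in the surrounding functions, and size is assumed to be granularity-aligned):

    CUdeviceptr                  va = 0;
    CUmemGenericAllocationHandle handle;
    CUmemAllocationProp          prop = {};       // pinned, device-located, exportable
    CUmemAccessDesc              accessDesc = {}; // device location + READWRITE flags

    checkCudaErrors(cuMemAddressReserve(&va, size, 0, 0, 0));  // reserve VA only, no backing
    checkCudaErrors(cuMemCreate(&handle, size, &prop, 0));     // create physical backing
    checkCudaErrors(cuMemMap(va, size, 0, handle, 0));         // bind the VA range to it
    checkCudaErrors(cuMemRelease(handle));                     // the mapping keeps it alive
    checkCudaErrors(cuMemSetAccess(va, size, &accessDesc, 1)); // grant read/write access
    // ... launch kernels that dereference va ...
    checkCudaErrors(cuMemUnmap(va, size));                     // last reference: backing freed
    checkCudaErrors(cuMemAddressFree(va, size));               // return the VA range

size must be a multiple of the granularity reported by cuMemGetAllocationGranularity, which is exactly what the allocSize check in the hunk below enforces.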
-    checkCudaErrors(
-        cuMemMap(d_ptr + (i * mapSize), mapSize, 0, allocationHandles[i], 0));
-
-    // Since we do not need to make any other mappings of this memory or export
-    // it, we no longer need and can release the allocationHandle. The
-    // allocation will be kept live until it is unmapped.
-    checkCudaErrors(cuMemRelease(allocationHandles[i]));
-  }
-
-  // Retain peer access and map all chunks to mapDevice
-  checkCudaErrors(cuMemSetAccess(d_ptr, shareableHandles.size() * mapSize,
-                                 &accessDescriptor, 1));
-}
-
-static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size) {
-  CUresult status = CUDA_SUCCESS;
-
-  // Unmap the mapped virtual memory region
-  // Since the handles to the mapped backing stores have already been released
-  // by cuMemRelease, and these are the only/last mappings referencing them,
-  // The backing stores will be freed.
-  // Since the memory has been unmapped after this call, accessing the specified
-  // va range will result in a fault (unitll it is remapped).
-  checkCudaErrors(cuMemUnmap(dptr, size));
-
-  // Free the virtual address region. This allows the virtual address region
-  // to be reused by future cuMemAddressReserve calls. This also allows the
-  // virtual address region to be used by other allocation made through
-  // opperating system calls like malloc & mmap.
-  checkCudaErrors(cuMemAddressFree(dptr, size));
-}
-
-static void memMapGetDeviceFunction(char **argv) {
-  // first search for the module path before we load the results
-  string module_path, ptx_source;
-  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
-    if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv,
-                        ptx_source)) {
-      printf(
-          "> findModulePath could not find ptx or cubin\n");
-      exit(EXIT_FAILURE);
+    // Get the minimum granularity supported for allocation with cuMemCreate()
+    size_t granularity = 0;
+    checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+    if (allocSize % granularity) {
+        printf("Allocation size is not a multiple of minimum supported granularity "
+               "for this device. Exiting...\n");
+        exit(EXIT_FAILURE);
+    }
-  } else {
-    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
-  }
+
+    // Windows-specific LPSECURITYATTRIBUTES is required when
+    // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope
+    // of which exported allocations may be transferred to other processes. For all
+    // other handle types, pass NULL.
+ getDefaultSecurityDescriptor(&prop); - // Clean up dynamically allocated memory - delete[] jitOptions; - delete[] jitOptVals; - delete[] jitLogBuffer; - } else { - checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str())); - } + for (int i = 0; i < allocationHandles.size(); i++) { + // Create the allocation as a pinned allocation on device specified in + // prop.location.id + checkCudaErrors(cuMemCreate(&allocationHandles[i], allocSize, &prop, 0)); - // Get function handle from module - checkCudaErrors( - cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel")); + // Export the allocation to a platform-specific handle. The type of handle + // requested here must match the requestedHandleTypes field in the prop + // structure passed to cuMemCreate. + checkCudaErrors( + cuMemExportToShareableHandle((void *)&shareableHandles[i], allocationHandles[i], ipcHandleTypeFlag, 0)); + } } -static void childProcess(int devId, int id, char **argv) { - volatile shmStruct *shm = NULL; - sharedMemoryInfo info; - ipcHandle *ipcChildHandle = NULL; - int blocks = 0; - int threads = 128; +static void memMapImportAndMapMemory(CUdeviceptr d_ptr, + size_t mapSize, + std::vector &shareableHandles, + int mapDevice) +{ + std::vector allocationHandles; + allocationHandles.resize(shareableHandles.size()); - checkIpcErrors(ipcOpenSocket(ipcChildHandle)); + // The accessDescriptor will describe the mapping requirement for the + // mapDevice passed as argument + CUmemAccessDesc accessDescriptor; - if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { - printf("Failed to create shared memory slab\n"); - exit(EXIT_FAILURE); - } - shm = (volatile shmStruct *)info.addr; - int procCount = (int)shm->nprocesses; + // Specify location for mapping the imported allocations. + accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDescriptor.location.id = mapDevice; - barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1)); + // Specify both read and write accesses. + accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - // Receive all allocation handles shared by Parent. - std::vector shHandle(procCount); - checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle)); + for (int i = 0; i < shareableHandles.size(); i++) { + // Import the memory allocation back into a CUDA handle from the platform + // specific handle. + checkCudaErrors(cuMemImportFromShareableHandle( + &allocationHandles[i], (void *)(uintptr_t)shareableHandles[i], ipcHandleTypeFlag)); - CUcontext ctx; - CUdevice device; - CUstream stream; - int multiProcessorCount; + // Assign the chunk to the appropriate VA range and release the handle. + // After mapping the memory, it can be referenced by virtual address. + checkCudaErrors(cuMemMap(d_ptr + (i * mapSize), mapSize, 0, allocationHandles[i], 0)); - checkCudaErrors(cuDeviceGet(&device, devId)); - checkCudaErrors(cuCtxCreate(&ctx, 0, device)); - checkCudaErrors(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + // Since we do not need to make any other mappings of this memory or export + // it, we no longer need and can release the allocationHandle. The + // allocation will be kept live until it is unmapped. 
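One orientation note before the release/unmap hunks: the shareable-handle round trip that connects the parent's export to each child's import is symmetric, and both ends must pass the same ipcHandleTypeFlag (a sketch, not sample code; the inter-process transport is the sample's ipc socket helper seen in childProcess, ipcRecvShareableHandles):

    CUmemGenericAllocationHandle allocationHandle; // from cuMemCreate in the exporting process
    int shareableHandle; // POSIX file descriptor here; an NT HANDLE on Windows builds

    // Exporting process:
    checkCudaErrors(cuMemExportToShareableHandle((void *)&shareableHandle, allocationHandle, ipcHandleTypeFlag, 0));
    // ... pass shareableHandle across the process boundary (SCM_RIGHTS / DuplicateHandle) ...
    // Importing process:
    checkCudaErrors(cuMemImportFromShareableHandle(&allocationHandle, (void *)(uintptr_t)shareableHandle, ipcHandleTypeFlag));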
+        checkCudaErrors(cuMemRelease(allocationHandles[i]));
+    }
-  // Obtain kernel function for the sample
-  memMapGetDeviceFunction(argv);
+    // Retain peer access and map all chunks to mapDevice
+    checkCudaErrors(cuMemSetAccess(d_ptr, shareableHandles.size() * mapSize, &accessDescriptor, 1));
+}
-  checkCudaErrors(cuOccupancyMaxActiveBlocksPerMultiprocessor(
-      &blocks, _memMapIpc_kernel, threads, 0));
-  checkCudaErrors(cuDeviceGetAttribute(
-      &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
-  blocks *= multiProcessorCount;
+static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size)
+{
+    CUresult status = CUDA_SUCCESS;
-  CUdeviceptr d_ptr = 0ULL;
+    // Unmap the mapped virtual memory region
+    // Since the handles to the mapped backing stores have already been released
+    // by cuMemRelease, and these are the only/last mappings referencing them,
+    // the backing stores will be freed.
+    // Since the memory has been unmapped after this call, accessing the specified
+    // va range will result in a fault (until it is remapped).
+    checkCudaErrors(cuMemUnmap(dptr, size));
-  // Reserve the required contiguous VA space for the allocations
-  checkCudaErrors(cuMemAddressReserve(&d_ptr, procCount * DATA_BUF_SIZE,
-                                      DATA_BUF_SIZE, 0, 0));
+    // Free the virtual address region. This allows the virtual address region
+    // to be reused by future cuMemAddressReserve calls. This also allows the
+    // virtual address region to be used by other allocations made through
+    // operating system calls like malloc & mmap.
+    checkCudaErrors(cuMemAddressFree(dptr, size));
+}
-  // Import the memory allocations shared by the parent with us and map them in
-  // our address space.
-  memMapImportAndMapMemory(d_ptr, DATA_BUF_SIZE, shHandle, devId);
+static void memMapGetDeviceFunction(char **argv)
+{
+    // first search for the module path before we load the results
+    string module_path, ptx_source;
+    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
+        if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv, ptx_source)) {
+            printf("> findModulePath could not find ptx or cubin\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+    else {
+        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
+    }
-  // Since we have imported allocations shared by the parent with us, we can
-  // close all the ShareableHandles.
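(Aside: memMapUnmapAndFreeMemory() above is the teardown half of the CUDA virtual memory management pattern this sample is built around. The whole lifecycle, condensed into one hedged sketch with prop, accessDesc and size assumed to be set up as elsewhere in this file:

    CUdeviceptr va = 0;
    CUmemGenericAllocationHandle handle;
    checkCudaErrors(cuMemAddressReserve(&va, size, 0, 0, 0));  // 1. reserve a VA range
    checkCudaErrors(cuMemCreate(&handle, size, &prop, 0));     // 2. create physical backing
    checkCudaErrors(cuMemMap(va, size, 0, handle, 0));         // 3. map backing into the range
    checkCudaErrors(cuMemRelease(handle));                     //    the mapping keeps it alive
    checkCudaErrors(cuMemSetAccess(va, size, &accessDesc, 1)); // 4. grant read/write access
    // ... use va ...
    checkCudaErrors(cuMemUnmap(va, size));       // frees the backing store (last reference)
    checkCudaErrors(cuMemAddressFree(va, size)); // returns the VA range

The child process continues below by closing the shareable handles it has imported.)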
- for (int i = 0; i < procCount; i++) { - checkIpcErrors(ipcCloseShareableHandle(shHandle[i])); - } - checkIpcErrors(ipcCloseSocket(ipcChildHandle)); + // Create module from binary file (PTX or CUBIN) + if (module_path.rfind("ptx") != string::npos) { + // in this branch we use compilation with parameters + const unsigned int jitNumOptions = 3; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void *[jitNumOptions]; + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; + checkCudaErrors( + cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals)); + printf("> PTX JIT log:\n%s\n", jitLogBuffer); - for (int i = 0; i < procCount; i++) { - size_t bufferId = (i + id) % procCount; + // Clean up dynamically allocated memory + delete[] jitOptions; + delete[] jitOptVals; + delete[] jitLogBuffer; + } + else { + checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str())); + } - // Build arguments to be passed to cuda kernel. - CUdeviceptr ptr = d_ptr + (bufferId * DATA_BUF_SIZE); - int size = DATA_BUF_SIZE; - char val = (char)id; + // Get function handle from module + checkCudaErrors(cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel")); +} - void *args[] = {&ptr, &size, &val}; +static void childProcess(int devId, int id, char **argv) +{ + volatile shmStruct *shm = NULL; + sharedMemoryInfo info; + ipcHandle *ipcChildHandle = NULL; + int blocks = 0; + int threads = 128; - // Push a simple kernel on th buffer. - checkCudaErrors(cuLaunchKernel(_memMapIpc_kernel, blocks, 1, 1, threads, 1, - 1, 0, stream, args, 0)); + checkIpcErrors(ipcOpenSocket(ipcChildHandle)); + + if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + int procCount = (int)shm->nprocesses; + + barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1)); + + // Receive all allocation handles shared by Parent. + std::vector shHandle(procCount); + checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle)); + + CUcontext ctx; + CUdevice device; + CUstream stream; + int multiProcessorCount; + + checkCudaErrors(cuDeviceGet(&device, devId)); + checkCudaErrors(cuCtxCreate(&ctx, 0, device)); + checkCudaErrors(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + // Obtain kernel function for the sample + memMapGetDeviceFunction(argv); + + checkCudaErrors(cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, _memMapIpc_kernel, threads, 0)); + checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + blocks *= multiProcessorCount; + + CUdeviceptr d_ptr = 0ULL; + + // Reserve the required contiguous VA space for the allocations + checkCudaErrors(cuMemAddressReserve(&d_ptr, procCount * DATA_BUF_SIZE, DATA_BUF_SIZE, 0, 0)); + + // Import the memory allocations shared by the parent with us and map them in + // our address space. 
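(Aside: the occupancy-based launch sizing used in childProcess() above generalizes to any driver-API kernel. A sketch, assuming a loaded CUfunction kernel, a CUdevice device and a CUstream stream; the argument values are placeholders, not the sample's data:

    int blocksPerSm = 0, smCount = 0, threads = 128;
    checkCudaErrors(cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, kernel, threads, 0));
    checkCudaErrors(cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    int blocks = blocksPerSm * smCount; // enough resident blocks to fill every SM

    // Driver-API launches take an array of pointers to each argument's storage.
    CUdeviceptr buf = 0; // placeholder: a real launch passes a mapped buffer
    int n = 0;
    char fill = 0;
    void *args[] = {&buf, &n, &fill};
    checkCudaErrors(cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, stream, args, 0));

The import-and-map step that the comment above announces follows.)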
+    memMapImportAndMapMemory(d_ptr, DATA_BUF_SIZE, shHandle, devId);
+
+    // Since we have imported allocations shared by the parent with us, we can
+    // close all the ShareableHandles.
+    for (int i = 0; i < procCount; i++) {
+        checkIpcErrors(ipcCloseShareableHandle(shHandle[i]));
+    }
+    checkIpcErrors(ipcCloseSocket(ipcChildHandle));
+
+    for (int i = 0; i < procCount; i++) {
+        size_t bufferId = (i + id) % procCount;
+
+        // Build arguments to be passed to cuda kernel.
+        CUdeviceptr ptr = d_ptr + (bufferId * DATA_BUF_SIZE);
+        int size = DATA_BUF_SIZE;
+        char val = (char)id;
+
+        void *args[] = {&ptr, &size, &val};
+
+        // Push a simple kernel on the buffer.
+        checkCudaErrors(cuLaunchKernel(_memMapIpc_kernel, blocks, 1, 1, threads, 1, 1, 0, stream, args, 0));
+        checkCudaErrors(cuStreamSynchronize(stream));
+
+        // Wait for all my sibling processes to push this stage of their work
+        // before proceeding to the next. This makes the data in the buffer
+        // deterministic.
+        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
+        if (id == 0) {
+            printf("Step %llu done\n", (unsigned long long)i);
+        }
+    }
+
+    printf("Process %d: verifying...\n", id);
+
+    // Copy the data onto host and verify value if it matches expected value or
+    // not.
+    std::vector<char> verification_buffer(DATA_BUF_SIZE);
+    checkCudaErrors(cuMemcpyDtoHAsync(&verification_buffer[0], d_ptr + (id * DATA_BUF_SIZE), DATA_BUF_SIZE, stream));
     checkCudaErrors(cuStreamSynchronize(stream));
-    // Wait for all my sibling processes to push this stage of their work
-    // before proceeding to the next. This makes the data in the buffer
-    // deterministic.
-    barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
-    if (id == 0) {
-      printf("Step %llu done\n", (unsigned long long)i);
+    // The contents should have the id of the sibling just after me
+    char compareId = (char)((id + 1) % procCount);
+    for (unsigned long long j = 0; j < DATA_BUF_SIZE; j++) {
+        if (verification_buffer[j] != compareId) {
+            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
+                   id,
+                   j,
+                   (int)verification_buffer[j],
+                   (int)compareId);
+            break;
+        }
     }
-  }
-  printf("Process %d: verifying...\n", id);
+    // Clean up!
+    checkCudaErrors(cuStreamDestroy(stream));
+    checkCudaErrors(cuCtxDestroy(ctx));
-  // Copy the data onto host and verify value if it matches expected value or
-  // not.
-  std::vector<char> verification_buffer(DATA_BUF_SIZE);
-  checkCudaErrors(cuMemcpyDtoHAsync(&verification_buffer[0],
-                                    d_ptr + (id * DATA_BUF_SIZE), DATA_BUF_SIZE,
-                                    stream));
-  checkCudaErrors(cuStreamSynchronize(stream));
+    // Unmap the allocations from our address space. Unmapping will also free the
+    // handle as we have already released the imported handle with the call to
+    // cuMemRelease. Finally, free up the Virtual Address space we reserved with
+    // cuMemAddressReserve.
+    memMapUnmapAndFreeMemory(d_ptr, procCount * DATA_BUF_SIZE);
-  // The contents should have the id of the sibling just after me
-  char compareId = (char)((id + 1) % procCount);
-  for (unsigned long long j = 0; j < DATA_BUF_SIZE; j++) {
-    if (verification_buffer[j] != compareId) {
-      printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j,
-             (int)verification_buffer[j], (int)compareId);
-      break;
-    }
-  }
-
-  // Clean up!
-  checkCudaErrors(cuStreamDestroy(stream));
-  checkCudaErrors(cuCtxDestroy(ctx));
-
-  // Unmap the allocations from our address space. Unmapping will also free the
-  // handle as we have already released the imported handle with the call to
-  // cuMemRelease.
Finally, free up the Virtual Address space we reserved with - // cuMemAddressReserve. - memMapUnmapAndFreeMemory(d_ptr, procCount * DATA_BUF_SIZE); - - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } -static void parentProcess(char *app) { - int devCount, i, nprocesses = 0; - volatile shmStruct *shm = NULL; - sharedMemoryInfo info; - std::vector processes; +static void parentProcess(char *app) +{ + int devCount, i, nprocesses = 0; + volatile shmStruct *shm = NULL; + sharedMemoryInfo info; + std::vector processes; - checkCudaErrors(cuDeviceGetCount(&devCount)); - std::vector devices(devCount); + checkCudaErrors(cuDeviceGetCount(&devCount)); + std::vector devices(devCount); - if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { - printf("Failed to create shared memory slab\n"); - exit(EXIT_FAILURE); - } + if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } - shm = (volatile shmStruct *)info.addr; - memset((void *)shm, 0, sizeof(*shm)); + shm = (volatile shmStruct *)info.addr; + memset((void *)shm, 0, sizeof(*shm)); - for (i = 0; i < devCount; i++) { - checkCudaErrors(cuDeviceGet(&devices[i], i)); - } + for (i = 0; i < devCount; i++) { + checkCudaErrors(cuDeviceGet(&devices[i], i)); + } - std::vector ctxs; - std::vector selectedDevices; + std::vector ctxs; + std::vector selectedDevices; - // Pick all the devices that can access each other's memory for this test - // Keep in mind that CUDA has minimal support for fork() without a - // corresponding exec() in the child process, but in this case our - // spawnProcess will always exec, so no need to worry. - for (i = 0; i < devCount; i++) { - bool allPeers = true; - int deviceComputeMode; - int deviceSupportsIpcHandle; - int attributeVal = 0; + // Pick all the devices that can access each other's memory for this test + // Keep in mind that CUDA has minimal support for fork() without a + // corresponding exec() in the child process, but in this case our + // spawnProcess will always exec, so no need to worry. 
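(Aside: the selection loop that follows admits a device only if it is mutually peer-capable with every device chosen so far. Distilled into a helper, the check is just two directed queries; a sketch, with the helper name being the editor's own:

    // Returns true only if each device can access the other's memory.
    static bool devicesAreMutualPeers(CUdevice a, CUdevice b)
    {
        int aToB = 0, bToA = 0;
        checkCudaErrors(cuDeviceCanAccessPeer(&aToB, a, b));
        checkCudaErrors(cuDeviceCanAccessPeer(&bToA, b, a));
        return aToB && bToA;
    }
)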
+ for (i = 0; i < devCount; i++) { + bool allPeers = true; + int deviceComputeMode; + int deviceSupportsIpcHandle; + int attributeVal = 0; - checkCudaErrors(cuDeviceGet(&devices[i], i)); - checkCudaErrors(cuDeviceGetAttribute( - &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, devices[i])); - checkCudaErrors(cuDeviceGetAttribute( - &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - devices[i])); + checkCudaErrors(cuDeviceGet(&devices[i], i)); + checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, devices[i])); + checkCudaErrors( + cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, devices[i])); #if defined(__linux__) || defined(__QNX__) - checkCudaErrors(cuDeviceGetAttribute( - &deviceSupportsIpcHandle, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, - devices[i])); + checkCudaErrors(cuDeviceGetAttribute( + &deviceSupportsIpcHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, devices[i])); #else - checkCudaErrors(cuDeviceGetAttribute( - &deviceSupportsIpcHandle, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, devices[i])); + checkCudaErrors(cuDeviceGetAttribute( + &deviceSupportsIpcHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, devices[i])); #endif - // Check that the selected device supports virtual address management - if (attributeVal == 0) { - printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", - devices[i]); - continue; + // Check that the selected device supports virtual address management + if (attributeVal == 0) { + printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", devices[i]); + continue; + } + + // This sample requires two processes accessing each device, so we need + // to ensure exclusive or prohibited mode is not set + if (deviceComputeMode != CU_COMPUTEMODE_DEFAULT) { + printf("Device %d is in an unsupported compute mode for this sample\n", i); + continue; + } + + if (!deviceSupportsIpcHandle) { + printf("Device %d does not support requested handle type for IPC, " + "skipping...\n", + i); + continue; + } + + for (int j = 0; j < selectedDevices.size(); j++) { + int canAccessPeerIJ, canAccessPeerJI; + checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerJI, devices[selectedDevices[j]], devices[i])); + checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerIJ, devices[i], devices[selectedDevices[j]])); + if (!canAccessPeerIJ || !canAccessPeerJI) { + allPeers = false; + break; + } + } + if (allPeers) { + CUcontext ctx; + checkCudaErrors(cuCtxCreate(&ctx, 0, devices[i])); + ctxs.push_back(ctx); + + // Enable peers here. This isn't necessary for IPC, but it will + // setup the peers for the device. 
For systems that only allow 8 + // peers per GPU at a time, this acts to remove devices from CanAccessPeer + for (int j = 0; j < nprocesses; j++) { + checkCudaErrors(cuCtxSetCurrent(ctxs.back())); + checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0)); + checkCudaErrors(cuCtxSetCurrent(ctxs[j])); + checkCudaErrors(cuCtxEnablePeerAccess(ctxs.back(), 0)); + } + selectedDevices.push_back(i); + nprocesses++; + if (nprocesses >= MAX_DEVICES) { + break; + } + } + else { + printf("Device %d is not peer capable with some other selected peers, " + "skipping\n", + i); + } } - // This sample requires two processes accessing each device, so we need - // to ensure exclusive or prohibited mode is not set - if (deviceComputeMode != CU_COMPUTEMODE_DEFAULT) { - printf("Device %d is in an unsupported compute mode for this sample\n", - i); - continue; + for (int i = 0; i < ctxs.size(); ++i) { + checkCudaErrors(cuCtxDestroy(ctxs[i])); + }; + + if (nprocesses == 0) { + printf("No CUDA devices support IPC\n"); + exit(EXIT_WAIVED); + } + shm->nprocesses = nprocesses; + + unsigned char firstSelectedDevice = selectedDevices[0]; + + std::vector shHandles(nprocesses); + std::vector allocationHandles(nprocesses); + + // Allocate `nprocesses` number of memory chunks and obtain a shareable handle + // for each allocation. Share all memory allocations with all children. + memMapAllocateAndExportMemory(firstSelectedDevice, DATA_BUF_SIZE, allocationHandles, shHandles); + + // Launch the child processes! + for (i = 0; i < nprocesses; i++) { + char devIdx[10]; + char procIdx[12]; + char *const args[] = {app, devIdx, procIdx, NULL}; + Process process; + + SPRINTF(devIdx, "%d", selectedDevices[i]); + SPRINTF(procIdx, "%d", i); + + if (spawnProcess(&process, app, args)) { + printf("Failed to create process\n"); + exit(EXIT_FAILURE); + } + + processes.push_back(process); } - if (!deviceSupportsIpcHandle) { - printf( - "Device %d does not support requested handle type for IPC, " - "skipping...\n", - i); - continue; + barrierWait(&shm->barrier, &shm->sense, (unsigned int)(nprocesses + 1)); + + ipcHandle *ipcParentHandle = NULL; + checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes)); + checkIpcErrors(ipcSendShareableHandles(ipcParentHandle, shHandles, processes)); + + // Close the shareable handles as they are not needed anymore. + for (int i = 0; i < nprocesses; i++) { + checkIpcErrors(ipcCloseShareableHandle(shHandles[i])); } - for (int j = 0; j < selectedDevices.size(); j++) { - int canAccessPeerIJ, canAccessPeerJI; - checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerJI, - devices[selectedDevices[j]], - devices[i])); - checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerIJ, - devices[i], - devices[selectedDevices[j]])); - if (!canAccessPeerIJ || !canAccessPeerJI) { - allPeers = false; - break; - } - } - if (allPeers) { - CUcontext ctx; - checkCudaErrors(cuCtxCreate(&ctx, 0, devices[i])); - ctxs.push_back(ctx); - - // Enable peers here. This isn't necessary for IPC, but it will - // setup the peers for the device. 
For systems that only allow 8 - // peers per GPU at a time, this acts to remove devices from CanAccessPeer - for (int j = 0; j < nprocesses; j++) { - checkCudaErrors(cuCtxSetCurrent(ctxs.back())); - checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0)); - checkCudaErrors(cuCtxSetCurrent(ctxs[j])); - checkCudaErrors(cuCtxEnablePeerAccess(ctxs.back(), 0)); - } - selectedDevices.push_back(i); - nprocesses++; - if (nprocesses >= MAX_DEVICES) { - break; - } - } else { - printf( - "Device %d is not peer capable with some other selected peers, " - "skipping\n", - i); - } - } - - for (int i = 0; i < ctxs.size(); ++i) { - checkCudaErrors(cuCtxDestroy(ctxs[i])); - }; - - if (nprocesses == 0) { - printf("No CUDA devices support IPC\n"); - exit(EXIT_WAIVED); - } - shm->nprocesses = nprocesses; - - unsigned char firstSelectedDevice = selectedDevices[0]; - - std::vector shHandles(nprocesses); - std::vector allocationHandles(nprocesses); - - // Allocate `nprocesses` number of memory chunks and obtain a shareable handle - // for each allocation. Share all memory allocations with all children. - memMapAllocateAndExportMemory(firstSelectedDevice, DATA_BUF_SIZE, - allocationHandles, shHandles); - - // Launch the child processes! - for (i = 0; i < nprocesses; i++) { - char devIdx[10]; - char procIdx[12]; - char *const args[] = {app, devIdx, procIdx, NULL}; - Process process; - - SPRINTF(devIdx, "%d", selectedDevices[i]); - SPRINTF(procIdx, "%d", i); - - if (spawnProcess(&process, app, args)) { - printf("Failed to create process\n"); - exit(EXIT_FAILURE); + // And wait for them to finish + for (i = 0; i < processes.size(); i++) { + if (waitProcess(&processes[i]) != EXIT_SUCCESS) { + printf("Process %d failed!\n", i); + exit(EXIT_FAILURE); + } } - processes.push_back(process); - } - - barrierWait(&shm->barrier, &shm->sense, (unsigned int)(nprocesses + 1)); - - ipcHandle *ipcParentHandle = NULL; - checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes)); - checkIpcErrors( - ipcSendShareableHandles(ipcParentHandle, shHandles, processes)); - - // Close the shareable handles as they are not needed anymore. 
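(Aside: on Linux, a CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR shareable handle is an ordinary file descriptor, so a helper like ipcSendShareableHandles() ultimately relies on SCM_RIGHTS ancillary data over a Unix-domain socket. A hedged sketch of that mechanism, not the sample's actual helper:

    #include <string.h>
    #include <sys/socket.h>

    // Send one file descriptor over a connected AF_UNIX socket.
    static int sendFd(int sock, int fd)
    {
        char byte = 0;
        struct iovec iov = {&byte, sizeof(byte)};
        char ctrl[CMSG_SPACE(sizeof(int))];
        struct msghdr msg;
        memset(&msg, 0, sizeof(msg));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = ctrl;
        msg.msg_controllen = sizeof(ctrl);
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS; // the kernel duplicates the fd into the receiver
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
        return (sendmsg(sock, &msg, 0) < 0) ? -1 : 0;
    }

The handle-closing loop announced by the comment above continues below.)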
- for (int i = 0; i < nprocesses; i++) { - checkIpcErrors(ipcCloseShareableHandle(shHandles[i])); - } - - // And wait for them to finish - for (i = 0; i < processes.size(); i++) { - if (waitProcess(&processes[i]) != EXIT_SUCCESS) { - printf("Process %d failed!\n", i); - exit(EXIT_FAILURE); + for (i = 0; i < nprocesses; i++) { + checkCudaErrors(cuMemRelease(allocationHandles[i])); } - } - for (i = 0; i < nprocesses; i++) { - checkCudaErrors(cuMemRelease(allocationHandles[i])); - } - - checkIpcErrors(ipcCloseSocket(ipcParentHandle)); - sharedMemoryClose(&info); + checkIpcErrors(ipcCloseSocket(ipcParentHandle)); + sharedMemoryClose(&info); } // Host code -int main(int argc, char **argv) { - // Initialize - checkCudaErrors(cuInit(0)); +int main(int argc, char **argv) +{ + // Initialize + checkCudaErrors(cuInit(0)); - if (argc == 1) { - parentProcess(argv[0]); - } else { - childProcess(atoi(argv[1]), atoi(argv[2]), argv); - } - return EXIT_SUCCESS; + if (argc == 1) { + parentProcess(argv[0]); + } + else { + childProcess(atoi(argv[1]), atoi(argv[2]), argv); + } + return EXIT_SUCCESS; } -bool inline findModulePath(const char *module_file, string &module_path, - char **argv, string &ptx_source) { - char *actual_path = sdkFindFilePath(module_file, argv[0]); +bool inline findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source) +{ + char *actual_path = sdkFindFilePath(module_file, argv[0]); - if (actual_path) { - module_path = actual_path; - } else { - printf("> findModulePath file not found: <%s> \n", module_file); - return false; - } - - if (module_path.empty()) { - printf("> findModulePath could not find file: <%s> \n", module_file); - return false; - } else { - printf("> findModulePath found file at <%s>\n", module_path.c_str()); - - if (module_path.rfind(".ptx") != string::npos) { - FILE *fp = fopen(module_path.c_str(), "rb"); - fseek(fp, 0, SEEK_END); - int file_size = ftell(fp); - char *buf = new char[file_size + 1]; - fseek(fp, 0, SEEK_SET); - fread(buf, sizeof(char), file_size, fp); - fclose(fp); - buf[file_size] = '\0'; - ptx_source = buf; - delete[] buf; + if (actual_path) { + module_path = actual_path; + } + else { + printf("> findModulePath file not found: <%s> \n", module_file); + return false; } - return true; - } + if (module_path.empty()) { + printf("> findModulePath could not find file: <%s> \n", module_file); + return false; + } + else { + printf("> findModulePath found file at <%s>\n", module_path.c_str()); + + if (module_path.rfind(".ptx") != string::npos) { + FILE *fp = fopen(module_path.c_str(), "rb"); + fseek(fp, 0, SEEK_END); + int file_size = ftell(fp); + char *buf = new char[file_size + 1]; + fseek(fp, 0, SEEK_SET); + fread(buf, sizeof(char), file_size, fp); + fclose(fp); + buf[file_size] = '\0'; + ptx_source = buf; + delete[] buf; + } + + return true; + } } diff --git a/Samples/3_CUDA_Features/newdelete/container.hpp b/Samples/3_CUDA_Features/newdelete/container.hpp index 17490476..59875fe6 100644 --- a/Samples/3_CUDA_Features/newdelete/container.hpp +++ b/Samples/3_CUDA_Features/newdelete/container.hpp @@ -31,16 +31,16 @@ // //////////////////////////////////////////////////////////////////////////////// -template -class Container { - public: - __device__ Container() { ; } +template class Container +{ +public: + __device__ Container() { ; } - __device__ virtual ~Container() { ; } + __device__ virtual ~Container() { ; } - __device__ virtual void push(T e) = 0; + __device__ virtual void push(T e) = 0; - __device__ virtual bool pop(T& e) 
= 0; + __device__ virtual bool pop(T &e) = 0; }; //////////////////////////////////////////////////////////////////////////////// @@ -52,47 +52,57 @@ class Container { // //////////////////////////////////////////////////////////////////////////////// -template -class Vector : public Container { - public: - // Constructor, data is allocated on the heap - // NOTE: This must be called from only one thread - __device__ Vector(int max_size) : m_top(-1) { m_data = new T[max_size]; } - - // Constructor, data uses preallocated buffer via placement new - __device__ Vector(int max_size, T* preallocated_buffer) : m_top(-1) { - m_data = new (preallocated_buffer) T[max_size]; - } - - // Destructor, data is freed - // NOTE: This must be called from only one thread - __device__ ~Vector() { - if (m_data) delete[] m_data; - } - - __device__ virtual void push(T e) { - if (m_data) { - // Atomically increment the top idx - int idx = atomicAdd(&(this->m_top), 1); - m_data[idx + 1] = e; +template class Vector : public Container +{ +public: + // Constructor, data is allocated on the heap + // NOTE: This must be called from only one thread + __device__ Vector(int max_size) + : m_top(-1) + { + m_data = new T[max_size]; } - } - __device__ virtual bool pop(T& e) { - if (m_data && m_top >= 0) { - // Atomically decrement the top idx - int idx = atomicAdd(&(this->m_top), -1); - if (idx >= 0) { - e = m_data[idx]; - return true; - } + // Constructor, data uses preallocated buffer via placement new + __device__ Vector(int max_size, T *preallocated_buffer) + : m_top(-1) + { + m_data = new (preallocated_buffer) T[max_size]; } - return false; - } - private: - int m_size; - T* m_data; + // Destructor, data is freed + // NOTE: This must be called from only one thread + __device__ ~Vector() + { + if (m_data) + delete[] m_data; + } - int m_top; + __device__ virtual void push(T e) + { + if (m_data) { + // Atomically increment the top idx + int idx = atomicAdd(&(this->m_top), 1); + m_data[idx + 1] = e; + } + } + + __device__ virtual bool pop(T &e) + { + if (m_data && m_top >= 0) { + // Atomically decrement the top idx + int idx = atomicAdd(&(this->m_top), -1); + if (idx >= 0) { + e = m_data[idx]; + return true; + } + } + return false; + } + +private: + int m_size; + T *m_data; + + int m_top; }; diff --git a/Samples/3_CUDA_Features/newdelete/newdelete.cu b/Samples/3_CUDA_Features/newdelete/newdelete.cu index 2d9b6f74..aa9c7306 100644 --- a/Samples/3_CUDA_Features/newdelete/newdelete.cu +++ b/Samples/3_CUDA_Features/newdelete/newdelete.cu @@ -29,17 +29,14 @@ // new and delete operators and virtual function declarations available with // CUDA 4.0. +#include #include -#include - namespace cg = cooperative_groups; -#include - -#include - -#include #include +#include +#include +#include const char *sSDKsample = "newdelete"; @@ -51,12 +48,13 @@ const char *sSDKsample = "newdelete"; // //////////////////////////////////////////////////////////////////////////////// -__global__ void vectorCreate(Container **g_container, int max_size) { - // The Vector object and the data storage are allocated in device heap memory. - // This makes it persistent for the lifetime of the CUDA context. - // The grid has only one thread as only a single object instance is needed. +__global__ void vectorCreate(Container **g_container, int max_size) +{ + // The Vector object and the data storage are allocated in device heap memory. + // This makes it persistent for the lifetime of the CUDA context. 
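(Aside: device-side new, used by vectorCreate() below, draws from a fixed-size device heap that must be sized with cudaDeviceSetLimit() before the first allocating kernel launches; it cannot be grown afterwards. The sample's main() does exactly this, shown here in isolation:

    // Reserve a 128 MB device heap for in-kernel new/delete (set before any launch).
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * (1 << 20)));

vectorCreate()'s listing continues below.)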
+    // The grid has only one thread as only a single object instance is needed.
-  *g_container = new Vector<int>(max_size);
+    *g_container = new Vector<int>(max_size);
 }

////////////////////////////////////////////////////////////////////////////////
@@ -65,26 +63,29 @@ __global__ void vectorCreate(Container<int> **g_container, int max_size) {
-__global__ void containerFill(Container<int> **g_container) {
-  // All threads of the grid cooperatively populate the shared Container object
-  // with data.
-  if (threadIdx.x == 0) {
-    (*g_container)->push(blockIdx.x);
-  }
+__global__ void containerFill(Container<int> **g_container)
+{
+    // All threads of the grid cooperatively populate the shared Container object
+    // with data.
+    if (threadIdx.x == 0) {
+        (*g_container)->push(blockIdx.x);
+    }
 }

-__global__ void containerConsume(Container<int> **g_container, int *d_result) {
-  // All threads of the grid cooperatively consume the data from the shared
-  // Container object.
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void containerConsume(Container<int> **g_container, int *d_result)
+{
+    // All threads of the grid cooperatively consume the data from the shared
+    // Container object.
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;

-  int v;
+    int v;

-  if ((*g_container)->pop(v)) {
-    d_result[idx] = v;
-  } else {
-    d_result[idx] = -1;
-  }
+    if ((*g_container)->pop(v)) {
+        d_result[idx] = v;
+    }
+    else {
+        d_result[idx] = -1;
+    }
 }

////////////////////////////////////////////////////////////////////////////////
@@ -93,9 +94,7 @@ __global__ void containerConsume(Container<int> **g_container, int *d_result) {
-__global__ void containerDelete(Container<int> **g_container) {
-  delete *g_container;
-}
+__global__ void containerDelete(Container<int> **g_container) { delete *g_container; }

////////////////////////////////////////////////////////////////////////////////
//
@@ -104,87 +103,92 @@ __global__ void containerDelete(Container<int> **g_container) {
-__global__ void placementNew(int *d_result) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  __shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<int>)];
-  __shared__ int __align__(8) s_data[1024];
-  __shared__ Vector<int> *s_vector;
+__global__ void placementNew(int *d_result)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    __shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<int>)];
+    __shared__ int __align__(8) s_data[1024];
+    __shared__ Vector<int> *s_vector;

-  // The first thread of the block initializes the shared Vector object.
-  // The placement new operator enables the Vector object and the data array to
-  // be placed in shared memory.
-  if (threadIdx.x == 0) {
-    s_vector = new (s_buffer) Vector<int>(1024, s_data);
-  }
+    // The first thread of the block initializes the shared Vector object.
+    // The placement new operator enables the Vector object and the data array to
+    // be placed in shared memory.
+    if (threadIdx.x == 0) {
+        s_vector = new (s_buffer) Vector<int>(1024, s_data);
+    }

-  cg::sync(cta);
+    cg::sync(cta);

-  if ((threadIdx.x & 1) == 0) {
-    s_vector->push(threadIdx.x >> 1);
-  }
+    if ((threadIdx.x & 1) == 0) {
+        s_vector->push(threadIdx.x >> 1);
+    }

-  // Need to sync as the vector implementation does not support concurrent
-  // push/pop operations.
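(Aside: placementNew() above leans on two details that are easy to miss: the raw byte buffer must be suitably aligned for the object constructed into it, and exactly one thread runs the constructor before the block synchronizes. A distilled sketch of the pattern, reusing the Vector<T> defined earlier; the kernel name is the editor's own:

    __global__ void placementNewSketch()
    {
        __shared__ __align__(8) unsigned char buf[sizeof(Vector<int>)];
        __shared__ __align__(8) int storage[32];
        __shared__ Vector<int> *vec;
        if (threadIdx.x == 0) {
            vec = new (buf) Vector<int>(32, storage); // construct in shared memory
        }
        __syncthreads(); // every thread now sees the constructed object
        // No delete: shared memory lives exactly as long as the block.
    }

The cg::sync() call referenced by the comment above follows.)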
-  cg::sync(cta);
+    // Need to sync as the vector implementation does not support concurrent
+    // push/pop operations.
+    cg::sync(cta);

-  int v;
+    int v;

-  if (s_vector->pop(v)) {
-    d_result[threadIdx.x] = v;
-  } else {
-    d_result[threadIdx.x] = -1;
-  }
+    if (s_vector->pop(v)) {
+        d_result[threadIdx.x] = v;
+    }
+    else {
+        d_result[threadIdx.x] = -1;
+    }

-  // Note: deleting objects placed in shared memory is not necessary (lifetime
-  // of shared memory is that of the block)
+    // Note: deleting objects placed in shared memory is not necessary (lifetime
+    // of shared memory is that of the block)
 }

-struct ComplexType_t {
-  int a;
-  int b;
-  float c;
-  float d;
+struct ComplexType_t
+{
+    int a;
+    int b;
+    float c;
+    float d;
 };

-__global__ void complexVector(int *d_result) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  __shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<ComplexType_t>)];
-  __shared__ ComplexType_t __align__(8) s_data[1024];
-  __shared__ Vector<ComplexType_t> *s_vector;
+__global__ void complexVector(int *d_result)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    __shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<ComplexType_t>)];
+    __shared__ ComplexType_t __align__(8) s_data[1024];
+    __shared__ Vector<ComplexType_t> *s_vector;

-  // The first thread of the block initializes the shared Vector object.
-  // The placement new operator enables the Vector object and the data array to
-  // be placed in shared memory.
-  if (threadIdx.x == 0) {
-    s_vector = new (s_buffer) Vector<ComplexType_t>(1024, s_data);
-  }
+    // The first thread of the block initializes the shared Vector object.
+    // The placement new operator enables the Vector object and the data array to
+    // be placed in shared memory.
+    if (threadIdx.x == 0) {
+        s_vector = new (s_buffer) Vector<ComplexType_t>(1024, s_data);
+    }

-  cg::sync(cta);
+    cg::sync(cta);

-  if ((threadIdx.x & 1) == 0) {
-    ComplexType_t data;
-    data.a = threadIdx.x >> 1;
-    data.b = blockIdx.x;
-    data.c = threadIdx.x / (float)(blockDim.x);
-    data.d = blockIdx.x / (float)(gridDim.x);
+    if ((threadIdx.x & 1) == 0) {
+        ComplexType_t data;
+        data.a = threadIdx.x >> 1;
+        data.b = blockIdx.x;
+        data.c = threadIdx.x / (float)(blockDim.x);
+        data.d = blockIdx.x / (float)(gridDim.x);

-    s_vector->push(data);
-  }
+        s_vector->push(data);
+    }

-  cg::sync(cta);
+    cg::sync(cta);

-  ComplexType_t v;
+    ComplexType_t v;

-  if (s_vector->pop(v)) {
-    d_result[threadIdx.x] = v.a;
-  } else {
-    d_result[threadIdx.x] = -1;
-  }
+    if (s_vector->pop(v)) {
+        d_result[threadIdx.x] = v.a;
+    }
+    else {
+        d_result[threadIdx.x] = -1;
+    }

-  // Note: deleting objects placed in shared memory is not necessary (lifetime
-  // of shared memory is that of the block)
+    // Note: deleting objects placed in shared memory is not necessary (lifetime
+    // of shared memory is that of the block)
 }

////////////////////////////////////////////////////////////////////////////////
@@ -193,74 +197,77 @@ __global__ void complexVector(int *d_result) {
-bool checkResult(int *d_result, int N) {
-  std::vector<int> h_result;
-  h_result.resize(N);
+bool checkResult(int *d_result, int N)
+{
+    std::vector<int> h_result;
+    h_result.resize(N);

-  checkCudaErrors(cudaMemcpy(&h_result[0], d_result, N * sizeof(int),
-                             cudaMemcpyDeviceToHost));
-  std::sort(h_result.begin(), h_result.end());
+    checkCudaErrors(cudaMemcpy(&h_result[0], d_result, N * sizeof(int), cudaMemcpyDeviceToHost));
+    std::sort(h_result.begin(), h_result.end());

-  bool
success = true; - bool test = false; + bool success = true; + bool test = false; - int value = 0; + int value = 0; - for (int i = 0; i < N; ++i) { - if (h_result[i] != -1) { - test = true; + for (int i = 0; i < N; ++i) { + if (h_result[i] != -1) { + test = true; + } + + if (test && (value++) != h_result[i]) { + success = false; + } } - if (test && (value++) != h_result[i]) { - success = false; - } - } - - return success; + return success; } -bool testContainer(Container **d_container, int blocks, int threads) { - int *d_result; - cudaMalloc(&d_result, blocks * threads * sizeof(int)); +bool testContainer(Container **d_container, int blocks, int threads) +{ + int *d_result; + cudaMalloc(&d_result, blocks * threads * sizeof(int)); - containerFill<<>>(d_container); - containerConsume<<>>(d_container, d_result); - containerDelete<<<1, 1>>>(d_container); - checkCudaErrors(cudaDeviceSynchronize()); + containerFill<<>>(d_container); + containerConsume<<>>(d_container, d_result); + containerDelete<<<1, 1>>>(d_container); + checkCudaErrors(cudaDeviceSynchronize()); - bool success = checkResult(d_result, blocks * threads); + bool success = checkResult(d_result, blocks * threads); - cudaFree(d_result); + cudaFree(d_result); - return success; + return success; } -bool testPlacementNew(int threads) { - int *d_result; - cudaMalloc(&d_result, threads * sizeof(int)); +bool testPlacementNew(int threads) +{ + int *d_result; + cudaMalloc(&d_result, threads * sizeof(int)); - placementNew<<<1, threads>>>(d_result); - checkCudaErrors(cudaDeviceSynchronize()); + placementNew<<<1, threads>>>(d_result); + checkCudaErrors(cudaDeviceSynchronize()); - bool success = checkResult(d_result, threads); + bool success = checkResult(d_result, threads); - cudaFree(d_result); + cudaFree(d_result); - return success; + return success; } -bool testComplexType(int threads) { - int *d_result; - cudaMalloc(&d_result, threads * sizeof(int)); +bool testComplexType(int threads) +{ + int *d_result; + cudaMalloc(&d_result, threads * sizeof(int)); - complexVector<<<1, threads>>>(d_result); - checkCudaErrors(cudaDeviceSynchronize()); + complexVector<<<1, threads>>>(d_result); + checkCudaErrors(cudaDeviceSynchronize()); - bool success = checkResult(d_result, threads); + bool success = checkResult(d_result, threads); - cudaFree(d_result); + cudaFree(d_result); - return success; + return success; } //////////////////////////////////////////////////////////////////////////////// @@ -269,41 +276,42 @@ bool testComplexType(int threads) { // //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("%s Starting...\n\n", sSDKsample); +int main(int argc, char **argv) +{ + printf("%s Starting...\n\n", sSDKsample); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - // set the heap size for device size new/delete to 128 MB - checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * (1 << 20))); + // set the heap size for device size new/delete to 128 MB + checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * (1 << 20))); - Container **d_container; - checkCudaErrors(cudaMalloc(&d_container, sizeof(Container **))); + Container **d_container; + checkCudaErrors(cudaMalloc(&d_container, sizeof(Container **))); - bool bTest = false; - int 
test_passed = 0;
+    bool bTest = false;
+    int test_passed = 0;

-  printf(" > Container = Vector test ");
-  vectorCreate<<<1, 1>>>(d_container, 128 * 128);
-  bTest = testContainer(d_container, 128, 128);
-  printf(bTest ? "OK\n\n" : "NOT OK\n\n");
-  test_passed += (bTest ? 1 : 0);
+    printf(" > Container = Vector test ");
+    vectorCreate<<<1, 1>>>(d_container, 128 * 128);
+    bTest = testContainer(d_container, 128, 128);
+    printf(bTest ? "OK\n\n" : "NOT OK\n\n");
+    test_passed += (bTest ? 1 : 0);

-  checkCudaErrors(cudaFree(d_container));
+    checkCudaErrors(cudaFree(d_container));

-  printf(" > Container = Vector, using placement new on SMEM buffer test ");
-  bTest = testPlacementNew(1024);
-  printf(bTest ? "OK\n\n" : "NOT OK\n\n");
-  test_passed += (bTest ? 1 : 0);
+    printf(" > Container = Vector, using placement new on SMEM buffer test ");
+    bTest = testPlacementNew(1024);
+    printf(bTest ? "OK\n\n" : "NOT OK\n\n");
+    test_passed += (bTest ? 1 : 0);

-  printf(" > Container = Vector, with user defined datatype test ");
-  bTest = testComplexType(1024);
-  printf(bTest ? "OK\n\n" : "NOT OK\n\n");
-  test_passed += (bTest ? 1 : 0);
+    printf(" > Container = Vector, with user defined datatype test ");
+    bTest = testComplexType(1024);
+    printf(bTest ? "OK\n\n" : "NOT OK\n\n");
+    test_passed += (bTest ? 1 : 0);

-  printf("Test Summary: %d/3 successfully run\n", test_passed);
+    printf("Test Summary: %d/3 successfully run\n", test_passed);

-  exit(test_passed == 3 ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit(test_passed == 3 ? EXIT_SUCCESS : EXIT_FAILURE);
 }
diff --git a/Samples/3_CUDA_Features/ptxjit/ptxjit.cpp b/Samples/3_CUDA_Features/ptxjit/ptxjit.cpp
index f0ef3421..c68a6a5a 100644
--- a/Samples/3_CUDA_Features/ptxjit/ptxjit.cpp
+++ b/Samples/3_CUDA_Features/ptxjit/ptxjit.cpp
@@ -34,10 +34,10 @@
 */

// System includes
+#include
 #include
 #include
 #include
-#include

// CUDA driver & runtime
#include
@@ -47,7 +47,7 @@
#define CUDA_DRIVER_API
#include
#include
-#include // helper for shared that are common to CUDA Samples
+#include // helper for shared that are common to CUDA Samples

#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "ptxjit_kernel64.ptx"
@@ -57,189 +57,191 @@

const char *sSDKname = "PTX Just In Time (JIT) Compilation (no-qatest)";

-bool inline findModulePath(const char *module_file, std::string &module_path,
-                           char **argv, std::string &ptx_source) {
-  char *actual_path = sdkFindFilePath(module_file, argv[0]);
+bool inline findModulePath(const char *module_file, std::string &module_path, char **argv, std::string &ptx_source)
+{
+    char *actual_path = sdkFindFilePath(module_file, argv[0]);

-  if (actual_path) {
-    module_path = actual_path;
-  } else {
-    printf("> findModulePath file not found: <%s> \n", module_file);
-    return false;
-  }
-
-  if (module_path.empty()) {
-    printf("> findModulePath file not found: <%s> \n", module_file);
-    return false;
-  } else {
-    printf("> findModulePath <%s>\n", module_path.c_str());
-
-    if (module_path.rfind(".ptx") != std::string::npos) {
-      FILE *fp = fopen(module_path.c_str(), "rb");
-      fseek(fp, 0, SEEK_END);
-      int file_size = ftell(fp);
-      char *buf = new char[file_size + 1];
-      fseek(fp, 0, SEEK_SET);
-      fread(buf, sizeof(char), file_size, fp);
-      fclose(fp);
-      buf[file_size] = '\0';
-      ptx_source = buf;
-      delete[] buf;
+    if (actual_path) {
+        module_path = actual_path;
+    }
+    else {
+        printf("> findModulePath file not found: <%s> \n", module_file);
+        return false;
     }

-    return true;
-  }
-}
-
-void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel,
-
CUlinkState *lState) { - CUjit_option options[6]; - void *optionVals[6]; - float walltime; - char error_log[8192], info_log[8192]; - unsigned int logSize = 8192; - void *cuOut; - size_t outSize; - int myErr = 0; - std::string module_path, ptx_source; - - // Setup linker options - // Return walltime from JIT compilation - options[0] = CU_JIT_WALL_TIME; - optionVals[0] = (void *)&walltime; - // Pass a buffer for info messages - options[1] = CU_JIT_INFO_LOG_BUFFER; - optionVals[1] = (void *)info_log; - // Pass the size of the info buffer - options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optionVals[2] = (void *)(long)logSize; - // Pass a buffer for error message - options[3] = CU_JIT_ERROR_LOG_BUFFER; - optionVals[3] = (void *)error_log; - // Pass the size of the error buffer - options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optionVals[4] = (void *)(long)logSize; - // Make the linker verbose - options[5] = CU_JIT_LOG_VERBOSE; - optionVals[5] = (void *)1; - - // Create a pending linker invocation - checkCudaErrors(cuLinkCreate(6, options, optionVals, lState)); - - // first search for the module path before we load the results - if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) { - printf("> findModulePath could not find ptx\n"); - exit(EXIT_FAILURE); - } else { - printf("> initCUDA loading module: <%s>\n", module_path.c_str()); - } - - // Load the PTX from the ptx file - printf("Loading ptxjit_kernel[] program\n"); - myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(), - strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0); - - if (myErr != CUDA_SUCCESS) { - // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option - // above. - fprintf(stderr, "PTX Linker Error:\n%s\n", error_log); - } - - // Complete the linker step - checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize)); - - // Linker walltime and info_log were requested in options above. - printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime, - info_log); - - // Load resulting cuBin into module - checkCudaErrors(cuModuleLoadData(phModule, cuOut)); - - // Locate the kernel entry poin - checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "myKernel")); - - // Destroy the linker invocation - checkCudaErrors(cuLinkDestroy(*lState)); -} - -int main(int argc, char **argv) { - const unsigned int nThreads = 256; - const unsigned int nBlocks = 64; - const size_t memSize = nThreads * nBlocks * sizeof(int); - - CUmodule hModule = 0; - CUfunction hKernel = 0; - CUlinkState lState; - int *d_data = 0; - int *h_data = 0; - - int cuda_device = 0; - - printf("[%s] - Starting...\n", sSDKname); - - CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv); - int driverVersion; - cudaDriverGetVersion(&driverVersion); - if (driverVersion < CUDART_VERSION) { - printf( - "driverVersion = %d < CUDART_VERSION = %d \n" - "Enhanced compatibility is not supported for this sample.. 
waving " - "execution\n", - driverVersion, CUDART_VERSION); - exit(EXIT_WAIVED); - } - - // Allocate memory on host and device (Runtime API) - // NOTE: The runtime API will create the GPU Context implicitly here - if ((h_data = (int *)malloc(memSize)) == NULL) { - std::cerr << "Could not allocate host memory" << std::endl; - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaMalloc(&d_data, memSize)); - - // JIT Compile the Kernel from PTX and get the Handles (Driver API) - ptxJIT(argc, argv, &hModule, &hKernel, &lState); - - // Set the kernel parameters (Driver API) - dim3 block(nThreads, 1, 1); - dim3 grid(nBlocks, 1, 1); - - void *args[1] = {&d_data}; - - // Launch the kernel (Driver API_) - checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x, - block.y, block.z, 0, NULL, args, NULL)); - std::cout << "CUDA kernel launched" << std::endl; - - // Copy the result back to the host - checkCudaErrors(cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost)); - - // Check the result - bool dataGood = true; - - for (unsigned int i = 0; dataGood && i < nBlocks * nThreads; i++) { - if (h_data[i] != (int)i) { - std::cerr << "Error at " << i << std::endl; - dataGood = false; + if (module_path.empty()) { + printf("> findModulePath file not found: <%s> \n", module_file); + return false; } - } + else { + printf("> findModulePath <%s>\n", module_path.c_str()); - // Cleanup - if (d_data) { - checkCudaErrors(cudaFree(d_data)); - d_data = 0; - } + if (module_path.rfind(".ptx") != std::string::npos) { + FILE *fp = fopen(module_path.c_str(), "rb"); + fseek(fp, 0, SEEK_END); + int file_size = ftell(fp); + char *buf = new char[file_size + 1]; + fseek(fp, 0, SEEK_SET); + fread(buf, sizeof(char), file_size, fp); + fclose(fp); + buf[file_size] = '\0'; + ptx_source = buf; + delete[] buf; + } - if (h_data) { - free(h_data); - h_data = 0; - } - - if (hModule) { - checkCudaErrors(cuModuleUnload(hModule)); - hModule = 0; - } - - return dataGood ? 
EXIT_SUCCESS : EXIT_FAILURE;
+        return true;
+    }
 }

+void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel, CUlinkState *lState)
+{
+    CUjit_option options[6];
+    void *optionVals[6];
+    float walltime;
+    char error_log[8192], info_log[8192];
+    unsigned int logSize = 8192;
+    void *cuOut;
+    size_t outSize;
+    int myErr = 0;
+    std::string module_path, ptx_source;
+
+    // Setup linker options
+    // Return walltime from JIT compilation
+    options[0] = CU_JIT_WALL_TIME;
+    optionVals[0] = (void *)&walltime;
+    // Pass a buffer for info messages
+    options[1] = CU_JIT_INFO_LOG_BUFFER;
+    optionVals[1] = (void *)info_log;
+    // Pass the size of the info buffer
+    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+    optionVals[2] = (void *)(long)logSize;
+    // Pass a buffer for error message
+    options[3] = CU_JIT_ERROR_LOG_BUFFER;
+    optionVals[3] = (void *)error_log;
+    // Pass the size of the error buffer
+    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+    optionVals[4] = (void *)(long)logSize;
+    // Make the linker verbose
+    options[5] = CU_JIT_LOG_VERBOSE;
+    optionVals[5] = (void *)1;
+
+    // Create a pending linker invocation
+    checkCudaErrors(cuLinkCreate(6, options, optionVals, lState));
+
+    // first search for the module path before we load the results
+    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
+        printf("> findModulePath could not find ptx\n");
+        exit(EXIT_FAILURE);
+    }
+    else {
+        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
+    }
+
+    // Load the PTX from the ptx file
+    printf("Loading ptxjit_kernel[] program\n");
+    myErr = cuLinkAddData(
+        *lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(), strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0);
+
+    if (myErr != CUDA_SUCCESS) {
+        // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
+        // above.
+        fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
+    }
+
+    // Complete the linker step
+    checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));
+
+    // Linker walltime and info_log were requested in options above.
+    printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime, info_log);
+
+    // Load resulting cuBin into module
+    checkCudaErrors(cuModuleLoadData(phModule, cuOut));
+
+    // Locate the kernel entry point
+    checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "myKernel"));
+
+    // Destroy the linker invocation
+    checkCudaErrors(cuLinkDestroy(*lState));
+}
+
+int main(int argc, char **argv)
+{
+    const unsigned int nThreads = 256;
+    const unsigned int nBlocks = 64;
+    const size_t memSize = nThreads * nBlocks * sizeof(int);
+
+    CUmodule hModule = 0;
+    CUfunction hKernel = 0;
+    CUlinkState lState;
+    int *d_data = 0;
+    int *h_data = 0;
+
+    int cuda_device = 0;
+
+    printf("[%s] - Starting...\n", sSDKname);
+
+    CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
+    int driverVersion;
+    cudaDriverGetVersion(&driverVersion);
+    if (driverVersion < CUDART_VERSION) {
+        printf("driverVersion = %d < CUDART_VERSION = %d \n"
+               "Enhanced compatibility is not supported for this sample, waiving "
waving " + "execution\n", + driverVersion, + CUDART_VERSION); + exit(EXIT_WAIVED); + } + + // Allocate memory on host and device (Runtime API) + // NOTE: The runtime API will create the GPU Context implicitly here + if ((h_data = (int *)malloc(memSize)) == NULL) { + std::cerr << "Could not allocate host memory" << std::endl; + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaMalloc(&d_data, memSize)); + + // JIT Compile the Kernel from PTX and get the Handles (Driver API) + ptxJIT(argc, argv, &hModule, &hKernel, &lState); + + // Set the kernel parameters (Driver API) + dim3 block(nThreads, 1, 1); + dim3 grid(nBlocks, 1, 1); + + void *args[1] = {&d_data}; + + // Launch the kernel (Driver API_) + checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, NULL, args, NULL)); + std::cout << "CUDA kernel launched" << std::endl; + + // Copy the result back to the host + checkCudaErrors(cudaMemcpy(h_data, d_data, memSize, cudaMemcpyDeviceToHost)); + + // Check the result + bool dataGood = true; + + for (unsigned int i = 0; dataGood && i < nBlocks * nThreads; i++) { + if (h_data[i] != (int)i) { + std::cerr << "Error at " << i << std::endl; + dataGood = false; + } + } + + // Cleanup + if (d_data) { + checkCudaErrors(cudaFree(d_data)); + d_data = 0; + } + + if (h_data) { + free(h_data); + h_data = 0; + } + + if (hModule) { + checkCudaErrors(cuModuleUnload(hModule)); + hModule = 0; + } + + return dataGood ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/Samples/3_CUDA_Features/ptxjit/ptxjit_kernel.cu b/Samples/3_CUDA_Features/ptxjit/ptxjit_kernel.cu index a184df49..b3a9c112 100644 --- a/Samples/3_CUDA_Features/ptxjit/ptxjit_kernel.cu +++ b/Samples/3_CUDA_Features/ptxjit/ptxjit_kernel.cu @@ -29,7 +29,8 @@ * Simple kernel for ptxjit demonstration. 
* */ -extern "C" __global__ void myKernel(int *data) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - data[tid] = tid; +extern "C" __global__ void myKernel(int *data) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + data[tid] = tid; } diff --git a/Samples/3_CUDA_Features/simpleCudaGraphs/simpleCudaGraphs.cu b/Samples/3_CUDA_Features/simpleCudaGraphs/simpleCudaGraphs.cu index 98f740df..fa6eb507 100644 --- a/Samples/3_CUDA_Features/simpleCudaGraphs/simpleCudaGraphs.cu +++ b/Samples/3_CUDA_Features/simpleCudaGraphs/simpleCudaGraphs.cu @@ -32,382 +32,376 @@ namespace cg = cooperative_groups; -#define THREADS_PER_BLOCK 512 +#define THREADS_PER_BLOCK 512 #define GRAPH_LAUNCH_ITERATIONS 3 -typedef struct callBackData { - const char *fn_name; - double *data; +typedef struct callBackData +{ + const char *fn_name; + double *data; } callBackData_t; -__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, - size_t outputSize) { - __shared__ double tmp[THREADS_PER_BLOCK]; +__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, size_t outputSize) +{ + __shared__ double tmp[THREADS_PER_BLOCK]; - cg::thread_block cta = cg::this_thread_block(); - size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; - double temp_sum = 0.0; - for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { - temp_sum += (double)inputVec[i]; - } - tmp[cta.thread_rank()] = temp_sum; - - cg::sync(cta); - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - double beta = temp_sum; - double temp; - - for (int i = tile32.size() / 2; i > 0; i >>= 1) { - if (tile32.thread_rank() < i) { - temp = tmp[cta.thread_rank() + i]; - beta += temp; - tmp[cta.thread_rank()] = beta; + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double)inputVec[i]; } - cg::sync(tile32); - } - cg::sync(cta); + tmp[cta.thread_rank()] = temp_sum; - if (cta.thread_rank() == 0 && blockIdx.x < outputSize) { - beta = 0.0; - for (int i = 0; i < cta.size(); i += tile32.size()) { - beta += tmp[i]; + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + double beta = temp_sum; + double temp; + + for (int i = tile32.size() / 2; i > 0; i >>= 1) { + if (tile32.thread_rank() < i) { + temp = tmp[cta.thread_rank() + i]; + beta += temp; + tmp[cta.thread_rank()] = beta; + } + cg::sync(tile32); } - outputVec[blockIdx.x] = beta; - } -} + cg::sync(cta); -__global__ void reduceFinal(double *inputVec, double *result, - size_t inputSize) { - __shared__ double tmp[THREADS_PER_BLOCK]; - - cg::thread_block cta = cg::this_thread_block(); - size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; - - double temp_sum = 0.0; - for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { - temp_sum += (double)inputVec[i]; - } - tmp[cta.thread_rank()] = temp_sum; - - cg::sync(cta); - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - // do reduction in shared mem - if ((blockDim.x >= 512) && (cta.thread_rank() < 256)) { - tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256]; - } - - cg::sync(cta); - - if ((blockDim.x >= 256) && (cta.thread_rank() < 128)) { - tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128]; - } - - cg::sync(cta); - - if ((blockDim.x >= 128) && (cta.thread_rank() < 64)) { - tmp[cta.thread_rank()] = temp_sum = 
temp_sum + tmp[cta.thread_rank() + 64]; - } - - cg::sync(cta); - - if (cta.thread_rank() < 32) { - // Fetch final intermediate sum from 2nd warp - if (blockDim.x >= 64) temp_sum += tmp[cta.thread_rank() + 32]; - // Reduce final warp using shuffle - for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { - temp_sum += tile32.shfl_down(temp_sum, offset); + if (cta.thread_rank() == 0 && blockIdx.x < outputSize) { + beta = 0.0; + for (int i = 0; i < cta.size(); i += tile32.size()) { + beta += tmp[i]; + } + outputVec[blockIdx.x] = beta; } - } - // write result for this block to global mem - if (cta.thread_rank() == 0) result[0] = temp_sum; } -void init_input(float *a, size_t size) { - for (size_t i = 0; i < size; i++) a[i] = (rand() & 0xFF) / (float)RAND_MAX; +__global__ void reduceFinal(double *inputVec, double *result, size_t inputSize) +{ + __shared__ double tmp[THREADS_PER_BLOCK]; + + cg::thread_block cta = cg::this_thread_block(); + size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x; + + double temp_sum = 0.0; + for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) { + temp_sum += (double)inputVec[i]; + } + tmp[cta.thread_rank()] = temp_sum; + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + // do reduction in shared mem + if ((blockDim.x >= 512) && (cta.thread_rank() < 256)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256]; + } + + cg::sync(cta); + + if ((blockDim.x >= 256) && (cta.thread_rank() < 128)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128]; + } + + cg::sync(cta); + + if ((blockDim.x >= 128) && (cta.thread_rank() < 64)) { + tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 64]; + } + + cg::sync(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockDim.x >= 64) + temp_sum += tmp[cta.thread_rank() + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + temp_sum += tile32.shfl_down(temp_sum, offset); + } + } + // write result for this block to global mem + if (cta.thread_rank() == 0) + result[0] = temp_sum; } -void CUDART_CB myHostNodeCallback(void *data) { - // Check status of GPU after stream operations are done - callBackData_t *tmp = (callBackData_t *)(data); - // checkCudaErrors(tmp->status); - - double *result = (double *)(tmp->data); - char *function = (char *)(tmp->fn_name); - printf("[%s] Host callback final reduced sum = %lf\n", function, *result); - *result = 0.0; // reset the result +void init_input(float *a, size_t size) +{ + for (size_t i = 0; i < size; i++) + a[i] = (rand() & 0xFF) / (float)RAND_MAX; } -void cudaGraphsManual(float *inputVec_h, float *inputVec_d, double *outputVec_d, - double *result_d, size_t inputSize, size_t numOfBlocks) { - cudaStream_t streamForGraph; - cudaGraph_t graph; - std::vector nodeDependencies; - cudaGraphNode_t memcpyNode, kernelNode, memsetNode; - double result_h = 0.0; +void CUDART_CB myHostNodeCallback(void *data) +{ + // Check status of GPU after stream operations are done + callBackData_t *tmp = (callBackData_t *)(data); + // checkCudaErrors(tmp->status); - checkCudaErrors(cudaStreamCreate(&streamForGraph)); - - cudaKernelNodeParams kernelNodeParams = {0}; - cudaMemcpy3DParms memcpyParams = {0}; - cudaMemsetParams memsetParams = {0}; - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = - make_cudaPitchedPtr(inputVec_h, 
sizeof(float) * inputSize, inputSize, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = - make_cudaPitchedPtr(inputVec_d, sizeof(float) * inputSize, inputSize, 1); - memcpyParams.extent = make_cudaExtent(sizeof(float) * inputSize, 1, 1); - memcpyParams.kind = cudaMemcpyHostToDevice; - - memsetParams.dst = (void *)outputVec_d; - memsetParams.value = 0; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(float); // elementSize can be max 4 bytes - memsetParams.width = numOfBlocks * 2; - memsetParams.height = 1; - - checkCudaErrors(cudaGraphCreate(&graph, 0)); - checkCudaErrors( - cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); - checkCudaErrors( - cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); - - nodeDependencies.push_back(memsetNode); - nodeDependencies.push_back(memcpyNode); - - void *kernelArgs[4] = {(void *)&inputVec_d, (void *)&outputVec_d, &inputSize, - &numOfBlocks}; - - kernelNodeParams.func = (void *)reduce; - kernelNodeParams.gridDim = dim3(numOfBlocks, 1, 1); - kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); - kernelNodeParams.sharedMemBytes = 0; - kernelNodeParams.kernelParams = (void **)kernelArgs; - kernelNodeParams.extra = NULL; - - checkCudaErrors( - cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), - nodeDependencies.size(), &kernelNodeParams)); - - nodeDependencies.clear(); - nodeDependencies.push_back(kernelNode); - - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = result_d; - memsetParams.value = 0; - memsetParams.elementSize = sizeof(float); - memsetParams.width = 2; - memsetParams.height = 1; - checkCudaErrors( - cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); - - nodeDependencies.push_back(memsetNode); - - memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); - kernelNodeParams.func = (void *)reduceFinal; - kernelNodeParams.gridDim = dim3(1, 1, 1); - kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); - kernelNodeParams.sharedMemBytes = 0; - void *kernelArgs2[3] = {(void *)&outputVec_d, (void *)&result_d, - &numOfBlocks}; - kernelNodeParams.kernelParams = kernelArgs2; - kernelNodeParams.extra = NULL; - - checkCudaErrors( - cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), - nodeDependencies.size(), &kernelNodeParams)); - nodeDependencies.clear(); - nodeDependencies.push_back(kernelNode); - - memset(&memcpyParams, 0, sizeof(memcpyParams)); - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = make_cudaPitchedPtr(result_d, sizeof(double), 1, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = make_cudaPitchedPtr(&result_h, sizeof(double), 1, 1); - memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); - memcpyParams.kind = cudaMemcpyDeviceToHost; - checkCudaErrors( - cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), - nodeDependencies.size(), &memcpyParams)); - nodeDependencies.clear(); - nodeDependencies.push_back(memcpyNode); - - cudaGraphNode_t hostNode; - cudaHostNodeParams hostParams = {0}; - hostParams.fn = myHostNodeCallback; - callBackData_t hostFnData; - hostFnData.data = &result_h; - hostFnData.fn_name = "cudaGraphsManual"; - hostParams.userData = &hostFnData; - - checkCudaErrors(cudaGraphAddHostNode(&hostNode, graph, - nodeDependencies.data(), - nodeDependencies.size(), &hostParams)); - - cudaGraphNode_t *nodes = NULL; - size_t 
numNodes = 0; - checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); - printf("\nNum of nodes in the graph created manually = %zu\n", numNodes); - - cudaGraphExec_t graphExec; - checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - - cudaGraph_t clonedGraph; - cudaGraphExec_t clonedGraphExec; - checkCudaErrors(cudaGraphClone(&clonedGraph, graph)); - checkCudaErrors( - cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0)); - - for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { - checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); - } - - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); - - printf("Cloned Graph Output.. \n"); - for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { - checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph)); - } - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); - - checkCudaErrors(cudaGraphExecDestroy(graphExec)); - checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec)); - checkCudaErrors(cudaGraphDestroy(graph)); - checkCudaErrors(cudaGraphDestroy(clonedGraph)); - checkCudaErrors(cudaStreamDestroy(streamForGraph)); + double *result = (double *)(tmp->data); + char *function = (char *)(tmp->fn_name); + printf("[%s] Host callback final reduced sum = %lf\n", function, *result); + *result = 0.0; // reset the result } -void cudaGraphsUsingStreamCapture(float *inputVec_h, float *inputVec_d, - double *outputVec_d, double *result_d, - size_t inputSize, size_t numOfBlocks) { - cudaStream_t stream1, stream2, stream3, streamForGraph; - cudaEvent_t forkStreamEvent, memsetEvent1, memsetEvent2; - cudaGraph_t graph; - double result_h = 0.0; +void cudaGraphsManual(float *inputVec_h, + float *inputVec_d, + double *outputVec_d, + double *result_d, + size_t inputSize, + size_t numOfBlocks) +{ + cudaStream_t streamForGraph; + cudaGraph_t graph; + std::vector nodeDependencies; + cudaGraphNode_t memcpyNode, kernelNode, memsetNode; + double result_h = 0.0; - checkCudaErrors(cudaStreamCreate(&stream1)); - checkCudaErrors(cudaStreamCreate(&stream2)); - checkCudaErrors(cudaStreamCreate(&stream3)); - checkCudaErrors(cudaStreamCreate(&streamForGraph)); + checkCudaErrors(cudaStreamCreate(&streamForGraph)); - checkCudaErrors(cudaEventCreate(&forkStreamEvent)); - checkCudaErrors(cudaEventCreate(&memsetEvent1)); - checkCudaErrors(cudaEventCreate(&memsetEvent2)); + cudaKernelNodeParams kernelNodeParams = {0}; + cudaMemcpy3DParms memcpyParams = {0}; + cudaMemsetParams memsetParams = {0}; - checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal)); + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(inputVec_h, sizeof(float) * inputSize, inputSize, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(inputVec_d, sizeof(float) * inputSize, inputSize, 1); + memcpyParams.extent = make_cudaExtent(sizeof(float) * inputSize, 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; - checkCudaErrors(cudaEventRecord(forkStreamEvent, stream1)); - checkCudaErrors(cudaStreamWaitEvent(stream2, forkStreamEvent, 0)); - checkCudaErrors(cudaStreamWaitEvent(stream3, forkStreamEvent, 0)); + memsetParams.dst = (void *)outputVec_d; + memsetParams.value = 0; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(float); // elementSize can be max 4 bytes + memsetParams.width = numOfBlocks * 2; + memsetParams.height = 1; - checkCudaErrors(cudaMemcpyAsync(inputVec_d, inputVec_h, 
- sizeof(float) * inputSize, cudaMemcpyDefault, - stream1)); + checkCudaErrors(cudaGraphCreate(&graph, 0)); + checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); + checkCudaErrors(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); - checkCudaErrors( - cudaMemsetAsync(outputVec_d, 0, sizeof(double) * numOfBlocks, stream2)); + nodeDependencies.push_back(memsetNode); + nodeDependencies.push_back(memcpyNode); - checkCudaErrors(cudaEventRecord(memsetEvent1, stream2)); + void *kernelArgs[4] = {(void *)&inputVec_d, (void *)&outputVec_d, &inputSize, &numOfBlocks}; - checkCudaErrors(cudaMemsetAsync(result_d, 0, sizeof(double), stream3)); - checkCudaErrors(cudaEventRecord(memsetEvent2, stream3)); + kernelNodeParams.func = (void *)reduce; + kernelNodeParams.gridDim = dim3(numOfBlocks, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void **)kernelArgs; + kernelNodeParams.extra = NULL; - checkCudaErrors(cudaStreamWaitEvent(stream1, memsetEvent1, 0)); + checkCudaErrors(cudaGraphAddKernelNode( + &kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams)); - reduce<<>>( - inputVec_d, outputVec_d, inputSize, numOfBlocks); + nodeDependencies.clear(); + nodeDependencies.push_back(kernelNode); - checkCudaErrors(cudaStreamWaitEvent(stream1, memsetEvent2, 0)); + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = result_d; + memsetParams.value = 0; + memsetParams.elementSize = sizeof(float); + memsetParams.width = 2; + memsetParams.height = 1; + checkCudaErrors(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams)); - reduceFinal<<<1, THREADS_PER_BLOCK, 0, stream1>>>(outputVec_d, result_d, - numOfBlocks); - checkCudaErrors(cudaMemcpyAsync(&result_h, result_d, sizeof(double), - cudaMemcpyDefault, stream1)); + nodeDependencies.push_back(memsetNode); - callBackData_t hostFnData = {0}; - hostFnData.data = &result_h; - hostFnData.fn_name = "cudaGraphsUsingStreamCapture"; - cudaHostFn_t fn = myHostNodeCallback; - checkCudaErrors(cudaLaunchHostFunc(stream1, fn, &hostFnData)); - checkCudaErrors(cudaStreamEndCapture(stream1, &graph)); + memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); + kernelNodeParams.func = (void *)reduceFinal; + kernelNodeParams.gridDim = dim3(1, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + void *kernelArgs2[3] = {(void *)&outputVec_d, (void *)&result_d, &numOfBlocks}; + kernelNodeParams.kernelParams = kernelArgs2; + kernelNodeParams.extra = NULL; - cudaGraphNode_t *nodes = NULL; - size_t numNodes = 0; - checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); - printf("\nNum of nodes in the graph created using stream capture API = %zu\n", - numNodes); + checkCudaErrors(cudaGraphAddKernelNode( + &kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams)); + nodeDependencies.clear(); + nodeDependencies.push_back(kernelNode); - cudaGraphExec_t graphExec; - checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + memset(&memcpyParams, 0, sizeof(memcpyParams)); - cudaGraph_t clonedGraph; - cudaGraphExec_t clonedGraphExec; - checkCudaErrors(cudaGraphClone(&clonedGraph, graph)); - checkCudaErrors( - cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0)); + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = 
make_cudaPitchedPtr(result_d, sizeof(double), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&result_h, sizeof(double), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + checkCudaErrors( + cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams)); + nodeDependencies.clear(); + nodeDependencies.push_back(memcpyNode); - for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { - checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); - } + cudaGraphNode_t hostNode; + cudaHostNodeParams hostParams = {0}; + hostParams.fn = myHostNodeCallback; + callBackData_t hostFnData; + hostFnData.data = &result_h; + hostFnData.fn_name = "cudaGraphsManual"; + hostParams.userData = &hostFnData; - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + checkCudaErrors( + cudaGraphAddHostNode(&hostNode, graph, nodeDependencies.data(), nodeDependencies.size(), &hostParams)); - printf("Cloned Graph Output.. \n"); - for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { - checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph)); - } + cudaGraphNode_t *nodes = NULL; + size_t numNodes = 0; + checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); + printf("\nNum of nodes in the graph created manually = %zu\n", numNodes); - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - checkCudaErrors(cudaGraphExecDestroy(graphExec)); - checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec)); - checkCudaErrors(cudaGraphDestroy(graph)); - checkCudaErrors(cudaGraphDestroy(clonedGraph)); - checkCudaErrors(cudaStreamDestroy(stream1)); - checkCudaErrors(cudaStreamDestroy(stream2)); - checkCudaErrors(cudaStreamDestroy(streamForGraph)); + cudaGraph_t clonedGraph; + cudaGraphExec_t clonedGraphExec; + checkCudaErrors(cudaGraphClone(&clonedGraph, graph)); + checkCudaErrors(cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0)); + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); + } + + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + printf("Cloned Graph Output.. 
\n"); + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph)); + } + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphDestroy(clonedGraph)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); } -int main(int argc, char **argv) { - size_t size = 1 << 24; // number of elements to reduce - size_t maxBlocks = 512; +void cudaGraphsUsingStreamCapture(float *inputVec_h, + float *inputVec_d, + double *outputVec_d, + double *result_d, + size_t inputSize, + size_t numOfBlocks) +{ + cudaStream_t stream1, stream2, stream3, streamForGraph; + cudaEvent_t forkStreamEvent, memsetEvent1, memsetEvent2; + cudaGraph_t graph; + double result_h = 0.0; - // This will pick the best possible CUDA capable device - int devID = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaStreamCreate(&stream1)); + checkCudaErrors(cudaStreamCreate(&stream2)); + checkCudaErrors(cudaStreamCreate(&stream3)); + checkCudaErrors(cudaStreamCreate(&streamForGraph)); - printf("%zu elements\n", size); - printf("threads per block = %d\n", THREADS_PER_BLOCK); - printf("Graph Launch iterations = %d\n", GRAPH_LAUNCH_ITERATIONS); + checkCudaErrors(cudaEventCreate(&forkStreamEvent)); + checkCudaErrors(cudaEventCreate(&memsetEvent1)); + checkCudaErrors(cudaEventCreate(&memsetEvent2)); - float *inputVec_d = NULL, *inputVec_h = NULL; - double *outputVec_d = NULL, *result_d; + checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal)); - checkCudaErrors(cudaMallocHost(&inputVec_h, sizeof(float) * size)); - checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float) * size)); - checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double) * maxBlocks)); - checkCudaErrors(cudaMalloc(&result_d, sizeof(double))); + checkCudaErrors(cudaEventRecord(forkStreamEvent, stream1)); + checkCudaErrors(cudaStreamWaitEvent(stream2, forkStreamEvent, 0)); + checkCudaErrors(cudaStreamWaitEvent(stream3, forkStreamEvent, 0)); - init_input(inputVec_h, size); + checkCudaErrors(cudaMemcpyAsync(inputVec_d, inputVec_h, sizeof(float) * inputSize, cudaMemcpyDefault, stream1)); - cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, - maxBlocks); - cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, - size, maxBlocks); + checkCudaErrors(cudaMemsetAsync(outputVec_d, 0, sizeof(double) * numOfBlocks, stream2)); - checkCudaErrors(cudaFree(inputVec_d)); - checkCudaErrors(cudaFree(outputVec_d)); - checkCudaErrors(cudaFree(result_d)); - checkCudaErrors(cudaFreeHost(inputVec_h)); - return EXIT_SUCCESS; + checkCudaErrors(cudaEventRecord(memsetEvent1, stream2)); + + checkCudaErrors(cudaMemsetAsync(result_d, 0, sizeof(double), stream3)); + checkCudaErrors(cudaEventRecord(memsetEvent2, stream3)); + + checkCudaErrors(cudaStreamWaitEvent(stream1, memsetEvent1, 0)); + + reduce<<>>(inputVec_d, outputVec_d, inputSize, numOfBlocks); + + checkCudaErrors(cudaStreamWaitEvent(stream1, memsetEvent2, 0)); + + reduceFinal<<<1, THREADS_PER_BLOCK, 0, stream1>>>(outputVec_d, result_d, numOfBlocks); + checkCudaErrors(cudaMemcpyAsync(&result_h, result_d, sizeof(double), cudaMemcpyDefault, stream1)); + + callBackData_t hostFnData = {0}; + hostFnData.data = &result_h; + hostFnData.fn_name = "cudaGraphsUsingStreamCapture"; + cudaHostFn_t fn = myHostNodeCallback; + 
checkCudaErrors(cudaLaunchHostFunc(stream1, fn, &hostFnData)); + checkCudaErrors(cudaStreamEndCapture(stream1, &graph)); + + cudaGraphNode_t *nodes = NULL; + size_t numNodes = 0; + checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); + printf("\nNum of nodes in the graph created using stream capture API = %zu\n", numNodes); + + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + cudaGraph_t clonedGraph; + cudaGraphExec_t clonedGraphExec; + checkCudaErrors(cudaGraphClone(&clonedGraph, graph)); + checkCudaErrors(cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0)); + + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); + } + + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + printf("Cloned Graph Output.. \n"); + for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) { + checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph)); + } + + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaGraphDestroy(clonedGraph)); + checkCudaErrors(cudaStreamDestroy(stream1)); + checkCudaErrors(cudaStreamDestroy(stream2)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); +} + +int main(int argc, char **argv) +{ + size_t size = 1 << 24; // number of elements to reduce + size_t maxBlocks = 512; + + // This will pick the best possible CUDA capable device + int devID = findCudaDevice(argc, (const char **)argv); + + printf("%zu elements\n", size); + printf("threads per block = %d\n", THREADS_PER_BLOCK); + printf("Graph Launch iterations = %d\n", GRAPH_LAUNCH_ITERATIONS); + + float *inputVec_d = NULL, *inputVec_h = NULL; + double *outputVec_d = NULL, *result_d; + + checkCudaErrors(cudaMallocHost(&inputVec_h, sizeof(float) * size)); + checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float) * size)); + checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double) * maxBlocks)); + checkCudaErrors(cudaMalloc(&result_d, sizeof(double))); + + init_input(inputVec_h, size); + + cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks); + cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks); + + checkCudaErrors(cudaFree(inputVec_d)); + checkCudaErrors(cudaFree(outputVec_d)); + checkCudaErrors(cudaFree(result_d)); + checkCudaErrors(cudaFreeHost(inputVec_h)); + return EXIT_SUCCESS; } diff --git a/Samples/3_CUDA_Features/tf32TensorCoreGemm/tf32TensorCoreGemm.cu b/Samples/3_CUDA_Features/tf32TensorCoreGemm/tf32TensorCoreGemm.cu index 40a324b4..d95afb50 100644 --- a/Samples/3_CUDA_Features/tf32TensorCoreGemm/tf32TensorCoreGemm.cu +++ b/Samples/3_CUDA_Features/tf32TensorCoreGemm/tf32TensorCoreGemm.cu @@ -58,14 +58,14 @@ // but carefully enough to avoid local memory use. #include -#include #include -#include #include +#include +#include // helper functions and utilities to work with CUDA -#include #include +#include // Externally configurable parameters. @@ -106,7 +106,7 @@ // Implementation constants. 
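The simpleCudaGraphs change above reflows both construction paths: explicit cudaGraphAddMemcpyNode/cudaGraphAddMemsetNode/cudaGraphAddKernelNode/cudaGraphAddHostNode calls, and stream capture. As a reference for the capture path, a minimal self-contained sketch (the kernel name and launch shape here are illustrative, not taken from the sample):

// Record work issued to a stream once, then replay it as a graph.
__global__ void scale(float *p) { p[blockIdx.x * blockDim.x + threadIdx.x] *= 2.0f; }

void captureAndReplay(float *d_p, int iters)
{
    cudaStream_t    s;
    cudaGraph_t     graph;
    cudaGraphExec_t graphExec;
    checkCudaErrors(cudaStreamCreate(&s));
    checkCudaErrors(cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal));
    scale<<<256, 512, 0, s>>>(d_p); // recorded into the graph, not executed yet
    checkCudaErrors(cudaStreamEndCapture(s, &graph));
    checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
    for (int i = 0; i < iters; i++) {
        checkCudaErrors(cudaGraphLaunch(graphExec, s)); // cheap repeated launch
    }
    checkCudaErrors(cudaStreamSynchronize(s));
    checkCudaErrors(cudaGraphExecDestroy(graphExec));
    checkCudaErrors(cudaGraphDestroy(graph));
    checkCudaErrors(cudaStreamDestroy(s));
}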
-#define WARPS_PER_BLOCK 8 +#define WARPS_PER_BLOCK 8 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) #if SHARED_MEMORY_LIMIT_64K @@ -124,10 +124,10 @@ #define CHUNK_K 8 #endif -#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(float)) -#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(float)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) #define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) -#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -157,25 +157,24 @@ // we must keep each row and column 256-bit aligned, as required by nvcuda::wmma::load_matrix_sync. #define SKEW_FLOAT 8 -#define checkKernelErrors(expr) do { \ - expr; \ - \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ - abort(); \ - } \ -} while(0) +#define checkKernelErrors(expr) \ + do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) -enum kernels -{ - tf32mma_shmem_gemm_async_copy = 0, // tf32 MMA shmem using kernel with async_copy - tf32mma_shmem_gemm = 1, // tf32 MMA shmem using kernel normal copy (without async_copy). - simple_tf32mma_gemm = 2 // tf32 MMA non-shmem using simple kernel. +enum kernels { + tf32mma_shmem_gemm_async_copy = 0, // tf32 MMA shmem using kernel with async_copy + tf32mma_shmem_gemm = 1, // tf32 MMA shmem using kernel normal copy (without async_copy). + simple_tf32mma_gemm = 2 // tf32 MMA non-shmem using simple kernel. }; -const char* kernelNames[] = {"compute_tf32gemm_async_copy", "compute_tf32gemm", - "simple_wmma_tf32gemm"}; +const char *kernelNames[] = {"compute_tf32gemm_async_copy", "compute_tf32gemm", "simple_wmma_tf32gemm"}; using namespace nvcuda; @@ -183,18 +182,18 @@ __host__ void init_host_matrices(float *a, float *b, float *c) { for (int i = 0; i < M_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - a[i*K_GLOBAL+j] = (float)(rand() % 3); + a[i * K_GLOBAL + j] = (float)(rand() % 3); } } for (int i = 0; i < N_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - b[i*K_GLOBAL+j] = (float)(rand() % 3); + b[i * K_GLOBAL + j] = (float)(rand() % 3); } } for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { - c[t] = (float)(rand() % 3); + c[t] = (float)(rand() % 3); } } @@ -211,10 +210,11 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; // This pointer is used to access the C and D matrix tiles this warp computes. - float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + float *shmem_warp_tile_ptr = (float *)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. - float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + float *shmem_warp_stream_ptr = (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * N; // Adjust the beta scaler, as it'll be multiplied by alpha at the end of // each tile computation. 
Technically this is not generally correct (may result @@ -224,7 +224,7 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit. - for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -234,14 +234,14 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. #pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = - *((int4*)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((int4 *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); } __syncthreads(); @@ -265,7 +265,7 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, // Scale the C matrix. #pragma unroll - for (int i = 0; i < WARP_COL_TILES; i++) { + for (int i = 0; i < WARP_COL_TILES; i++) { #pragma unroll for (int j = 0; j < WARP_ROW_TILES; j++) { #pragma unroll @@ -277,16 +277,19 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const float *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const float *warp_ptr = + (warpId < (WARPS_PER_BLOCK / 2)) + ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); // Go through the global K dimension by a fixed step at a time. #pragma unroll for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. - size_t shmem_idx = warpId < (WARPS_PER_BLOCK/2) ? (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (N * (warpId % (WARPS_PER_BLOCK/2)) * 2 + shmem_idx_b_off); + size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) + ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. 
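The checkKernelErrors macro reformatted above deserves a usage note: the extra pair of parentheses around the launch keeps the commas inside <<<...>>> from splitting the macro argument, and the do { ... } while (0) wrapper makes the expansion a single statement, so it stays safe after an unbraced if. A short sketch, with an illustrative kernel and pointer name:

__global__ void fill(int *p) { p[threadIdx.x] = threadIdx.x; }

void launchChecked(int *d_p)
{
    if (d_p)
        checkKernelErrors((fill<<<1, 32>>>(d_p))); // cudaGetLastError() runs right after the launch
}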
@@ -296,9 +299,10 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, shmem_idx += laneId / CHUNK_COPY_LINE_LANES; #pragma unroll - for(int i = 0; i < ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { // Copy 16 bytes at once in each lane. - *((int4*)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *((int4*)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); + *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = + *((int4 *)lane_ptr + (laneId % CHUNK_COPY_LINE_LANES)); // Advance the global memory pointer and the shared memory index. lane_ptr = lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP; @@ -315,8 +319,8 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId/BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); - const float *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const float *tile_ptr = &shmem[shmem_idx_a][k_step * K]; wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_FLOAT); #pragma unroll @@ -328,8 +332,8 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, if (i == 0) { // Load the B matrix fragment once, because it is going to be reused // against the other A matrix fragments. - size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); - const float *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const float *tile_ptr = &shmem[shmem_idx_b][k_step * K]; wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_FLOAT); #pragma unroll @@ -370,8 +374,8 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, #pragma unroll for (int i = 0; i < N; i++) { - *((int4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((int4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } __syncthreads(); @@ -379,7 +383,8 @@ __global__ void compute_tf32gemm(const float *A, const float *B, const float *C, #endif } -__global__ void compute_tf32gemm_async_copy(const float *A, const float *B, const float *C, float *D, const float alpha, float beta) +__global__ void +compute_tf32gemm_async_copy(const float *A, const float *B, const float *C, float *D, const float alpha, float beta) { #if __CUDA_ARCH__ >= 800 extern __shared__ float shmem[][CHUNK_K * K + SKEW_FLOAT]; @@ -389,10 +394,11 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons const unsigned int laneId = threadIdx.x % WARP_SIZE; // This pointer is used to access the C and D matrix tiles this warp computes. - float *shmem_warp_tile_ptr = (float*)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; + float *shmem_warp_tile_ptr = (float *)&shmem[0][0] + (warpId / BLOCK_ROW_WARPS) * SHMEM_STRIDE * N * BLOCK_ROW_WARPS + + (warpId % BLOCK_ROW_WARPS) * SHMEM_OFFSET; // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. 
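compute_tf32gemm_async_copy, reformatted in the hunks below, stages its tiles with cuda::memcpy_async on a thread-scope pipeline (the declaration shown further down is cuda::pipeline<cuda::thread_scope_thread> from <cuda/pipeline>). A minimal sketch of that staging pattern, assuming blockDim.x <= 128 and 16-byte-aligned pointers (not code from the sample):

#include <cuda/pipeline>

__global__ void stageTile(const float4 *src, float4 *dst)
{
    __shared__ float4 smem[128];
    cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    pipe.producer_acquire();
    cuda::memcpy_async(&smem[threadIdx.x], &src[i], cuda::aligned_size_t<16>(sizeof(float4)), pipe);
    pipe.producer_commit();
    pipe.consumer_wait(); // this thread's copy has landed in shared memory
    __syncthreads();

    dst[i] = smem[threadIdx.x];
    pipe.consumer_release();
}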
- float *shmem_warp_stream_ptr = (float*)&shmem[0][0] + warpId * SHMEM_STRIDE * N; + float *shmem_warp_stream_ptr = (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * N; // Offset in shared memory from which the B matrix is stored. constexpr size_t shmem_idx_b_off = BLOCK_COL_TILES * M; @@ -402,14 +408,14 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons // in a loss of precision). Zero still needs to be specially handled though. beta /= alpha; - cuda::pipeline pipe = cuda::make_pipeline(); - const auto shape4 = cuda::aligned_size_t(sizeof(float4)); - constexpr int loadStride = 2; // load 4 floats, so left-shift by 2. + cuda::pipeline pipe = cuda::make_pipeline(); + const auto shape4 = cuda::aligned_size_t(sizeof(float4)); + constexpr int loadStride = 2; // load 4 floats, so left-shift by 2. // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the // right and down, and selects the next tile to compute. Once there's no such tile, // all warps in this CTA exit. - for(unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; @@ -419,7 +425,7 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons } // This warp's pointer to the C matrix data to copy memory from to shared memory. - const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; // Stream multiple C tiles to shared memory. @@ -427,8 +433,9 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons for (int i = 0; i < N; i++) { pipe.producer_acquire(); cuda::memcpy_async(&shmem_warp_stream_ptr[(SHMEM_STRIDE * i) + (laneId << loadStride)], - &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i) + (laneId << loadStride)], - shape4, pipe); + &src_gmem_warp_stream_ptr[(GLOBAL_MEM_STRIDE * i) + (laneId << loadStride)], + shape4, + pipe); pipe.producer_commit(); } // Now wait for all the above issued 8 batches to complete. @@ -461,19 +468,22 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons // Select what warp copies what matrix to shared memory. // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. - const float *warp_ptr = (warpId < (WARPS_PER_BLOCK/2)) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2) : - (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK/2)) * 2); + const float *warp_ptr = + (warpId < (WARPS_PER_BLOCK / 2)) + ? 
(&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % (WARPS_PER_BLOCK / 2)) * 2); - constexpr int chunksPerLane = ((WARP_SIZE/2) / CHUNK_COPY_LINES_PER_WARP) * 2; - const int laneLoadElem = (laneId % CHUNK_COPY_LINE_LANES) << loadStride; - const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); + constexpr int chunksPerLane = ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; + const int laneLoadElem = (laneId % CHUNK_COPY_LINE_LANES) << loadStride; + const int stridePerLaneCopy = (laneId / CHUNK_COPY_LINE_LANES); // Go through the global K dimension by a fixed step at a time. #pragma unroll for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { // Copy slices of the A and B matrices to shared memory. // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. // As for tf32 MMA M == N we use M for warp 4-7 + shmem_idx_b_off. - size_t shmem_idx = (M * (warpId % (WARPS_PER_BLOCK/2)) * 2) + ((warpId / (WARPS_PER_BLOCK/2)) * shmem_idx_b_off); + size_t shmem_idx = + (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + ((warpId / (WARPS_PER_BLOCK / 2)) * shmem_idx_b_off); // First half of the warp copies the first row / column of the matrix, // the second half of the warp copies the next. const float *lane_ptr = (warp_ptr + tile_k * K + stridePerLaneCopy * K_GLOBAL + laneLoadElem); @@ -482,7 +492,7 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons shmem_idx += stridePerLaneCopy; #pragma unroll - for(int i = 0; i < chunksPerLane; i++) { + for (int i = 0; i < chunksPerLane; i++) { // Copy 16 bytes at once in each lane. pipe.producer_acquire(); cuda::memcpy_async(&shmem[shmem_idx][laneLoadElem], lane_ptr, shape4, pipe); @@ -504,8 +514,8 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons #pragma unroll for (int i = 0; i < WARP_COL_TILES; i++) { - size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); - const float *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + size_t shmem_idx_a = (warpId / BLOCK_ROW_WARPS) * M * BLOCK_ROW_WARPS + (i * M); + const float *tile_ptr = &shmem[shmem_idx_a][k_step * K]; wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_FLOAT); @@ -518,13 +528,13 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons if (i == 0) { // Load the B matrix fragment once, because it is going to be reused // against the other A matrix fragments. 
- size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId%2) + (j * N); - const float *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const float *tile_ptr = &shmem[shmem_idx_b][k_step * K]; wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_FLOAT); #pragma unroll for (int t = 0; t < b[j].num_elements; t++) { - b[j].x[t] = wmma::__float_to_tf32(b[j].x[t]); + b[j].x[t] = wmma::__float_to_tf32(b[j].x[t]); } } @@ -560,8 +570,8 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons #pragma unroll for (int i = 0; i < N; i++) { - *((float4*)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = - *((float4*)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + *((float4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((float4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); } __syncthreads(); @@ -571,84 +581,90 @@ __global__ void compute_tf32gemm_async_copy(const float *A, const float *B, cons // Performs an MxNxK tf32 GEMM (C=alpha*A*B + beta*C) assuming: // 1) Matrices are packed in memory. -// 2) M, N and K are multiples of 16, 16 and 8 respectively. +// 2) M, N and K are multiples of 16, 16 and 8 respectively. // 3) A is row major, B is column major matrix. // Note: This is a less performant version of the compute_tf32gemm kernel. It is designed for // demonstration purposes only to show the CUDA WMMA API use without relying on // availability of the shared memory. -__global__ void simple_wmma_tf32gemm(float *a, float *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) +__global__ void +simple_wmma_tf32gemm(float *a, float *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) { #if __CUDA_ARCH__ >= 800 - // Leading dimensions. Packed with no transpositions. + // Leading dimensions. Packed with no transpositions. 
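simple_wmma_tf32gemm, reformatted below, is the deliberately simple no-shared-memory kernel. The tf32 WMMA flow it demonstrates is: declare m16n16k8 fragments, load, down-convert with wmma::__float_to_tf32, accumulate with mma_sync, store. A single-tile sketch under those shapes (one warp; names illustrative):

#include <mma.h>
using namespace nvcuda;

// One warp computes one 16x16 output tile: D = A(16x8) * B(8x16), tf32 inputs.
__global__ void oneTileTf32(const float *A, const float *B, float *D)
{
#if __CUDA_ARCH__ >= 800
    wmma::fragment<wmma::matrix_a, 16, 16, 8, wmma::precision::tf32, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, 16, 16, 8, wmma::precision::tf32, wmma::col_major> b;
    wmma::fragment<wmma::accumulator, 16, 16, 8, float> acc;

    wmma::fill_fragment(acc, 0.0f);
    wmma::load_matrix_sync(a, A, 8); // leading dimension K = 8
    wmma::load_matrix_sync(b, B, 8);
    for (int t = 0; t < a.num_elements; t++)
        a.x[t] = wmma::__float_to_tf32(a.x[t]);
    for (int t = 0; t < b.num_elements; t++)
        b.x[t] = wmma::__float_to_tf32(b.x[t]);
    wmma::mma_sync(acc, a, b, acc);
    wmma::store_matrix_sync(D, acc, 16, wmma::mem_row_major);
#endif
}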
int lda = k_ld;
    int ldb = k_ld;
    int ldc = n_ld;

-    // Tile using a 2D grid
-    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
-    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
-
-    // Declare the fragments
-    wmma::fragment<wmma::matrix_a, M, N, K, wmma::precision::tf32, wmma::row_major> a_frag;
-    wmma::fragment<wmma::matrix_b, M, N, K, wmma::precision::tf32, wmma::col_major> b_frag;
-    wmma::fragment<wmma::accumulator, M, N, K, float> acc_frag;
-    wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;
+    // Tile using a 2D grid
+    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
+    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

-    wmma::fill_fragment(acc_frag, 0.0f);
+    // Declare the fragments
+    wmma::fragment<wmma::matrix_a, M, N, K, wmma::precision::tf32, wmma::row_major> a_frag;
+    wmma::fragment<wmma::matrix_b, M, N, K, wmma::precision::tf32, wmma::col_major> b_frag;
+    wmma::fragment<wmma::accumulator, M, N, K, float> acc_frag;
+    wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;

-    // Loop over k
-    for (int i = 0; i < k_ld; i += K) {
-        int aCol = i;
-        int aRow = warpM * M;
+    wmma::fill_fragment(acc_frag, 0.0f);

-        //int bCol = i;
-        //int bRow = warpN * N;
-        int bCol = warpN * N;
-        int bRow = i;
+    // Loop over k
+    for (int i = 0; i < k_ld; i += K) {
+        int aCol = i;
+        int aRow = warpM * M;

-        // Bounds checking
-        if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
-            // Load the inputs
-            wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
-            wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
-
-            #pragma unroll
-            for (int t = 0; t < a_frag.num_elements; t++) {
-                a_frag.x[t] = wmma::__float_to_tf32(a_frag.x[t]);
+        // int bCol = i;
+        // int bRow = warpN * N;
+        int bCol = warpN * N;
+        int bRow = i;
+
+        // Bounds checking
+        if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
+            // Load the inputs
+            wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
+            wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
+
+#pragma unroll
+            for (int t = 0; t < a_frag.num_elements; t++) {
+                a_frag.x[t] = wmma::__float_to_tf32(a_frag.x[t]);
+            }
+
+#pragma unroll
+            for (int t = 0; t < b_frag.num_elements; t++) {
+                b_frag.x[t] = wmma::__float_to_tf32(b_frag.x[t]);
+            }
+            // Perform the matrix multiplication
+            wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
+        }
+    }
+
+    // Load in the current value of c, scale it by beta, and add this to our result scaled by alpha
+    int cCol = warpN * N;
+    int cRow = warpM * M;
+
+    if (cRow < m_ld && cCol < n_ld) {
+        wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
+
+        for (int i = 0; i < c_frag.num_elements; i++) {
+            c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
        }

-            #pragma unroll
-            for (int t = 0; t < b_frag.num_elements; t++) {
-                b_frag.x[t] = wmma::__float_to_tf32(b_frag.x[t]);
-            }
-            // Perform the matrix multiplication
-            wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
-
-        }
-    }
-
-    // Load in the current value of c, scale it by beta, and add this to our result scaled by alpha
-    int cCol = warpN * N;
-    int cRow = warpM * M;
-
-    if (cRow < m_ld && cCol < n_ld) {
-        wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
-
-        for(int i=0; i < c_frag.num_elements; i++) {
-            c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
-        }
-
-        // Store the output
-        wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
-    }
+        // Store the output
+        wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
+    }
#endif
}

-__host__ void matMultiplyOnHost(float *A, float *B, float *C,
-                                float alpha, float beta,
-                                int numARows, int numAColumns,
-                                int numBRows, int numBColumns,
-                                int numCRows, int numCColumns)
+__host__ void matMultiplyOnHost(float *A,
+                                float *B,
+                                float *C,
+                                float alpha,
+                                float beta,
+                                int numARows,
+                                int numAColumns,
+                                int numBRows,
+                                int numBColumns,
+                                int
numCRows, + int numCColumns) { for (int i = 0; i < numCRows; i++) { for (int j = 0; j < numCColumns; j++) { @@ -658,7 +674,7 @@ __host__ void matMultiplyOnHost(float *A, float *B, float *C, temp += A[i * numAColumns + k] * B[j * numBRows + k]; } - C[i*numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; } } } @@ -686,16 +702,16 @@ int main(int argc, char **argv) float *B_h = NULL; float *C_h = NULL; #if CPU_DEBUG - float *result_hD = NULL; + float *result_hD = NULL; float *result_host = NULL; #endif - A_h = (float*) malloc(sizeof(float) * M_GLOBAL * K_GLOBAL); - B_h = (float*) malloc(sizeof(float) * K_GLOBAL * N_GLOBAL); - C_h = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + A_h = (float *)malloc(sizeof(float) * M_GLOBAL * K_GLOBAL); + B_h = (float *)malloc(sizeof(float) * K_GLOBAL * N_GLOBAL); + C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); #if CPU_DEBUG - result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); - result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); #endif float *A = NULL; @@ -703,10 +719,10 @@ int main(int argc, char **argv) float *C = NULL; float *D = NULL; - checkCudaErrors(cudaMalloc((void**)&A, sizeof(float) * M_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc((void**)&B, sizeof(float) * N_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMalloc((void**)&C, sizeof(float) * M_GLOBAL * N_GLOBAL)); - checkCudaErrors(cudaMalloc((void**)&D, sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&A, sizeof(float) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&B, sizeof(float) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&C, sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void **)&D, sizeof(float) * M_GLOBAL * N_GLOBAL)); assert(((unsigned long long)A) % 128 == 0); assert(((unsigned long long)B) % 128 == 0); @@ -734,11 +750,11 @@ int main(int argc, char **argv) printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); const float alpha = 1.1f; - const float beta = 1.2f; + const float beta = 1.2f; cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); checkCudaErrors(cudaEventRecord(start)); @@ -760,26 +776,30 @@ int main(int argc, char **argv) if ((deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) && (selected_kernel != simple_tf32mma_gemm)) { printf("Computing using high performance kernel = %d - %s\n", selected_kernel, kernelNames[selected_kernel]); - switch (selected_kernel) - { - case tf32mma_shmem_gemm_async_copy : - default: - checkCudaErrors(cudaFuncSetAttribute(compute_tf32gemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - checkKernelErrors((compute_tf32gemm_async_copy<<>>(A, B, C, D, alpha, beta))); - break; - case tf32mma_shmem_gemm : - checkCudaErrors(cudaFuncSetAttribute(compute_tf32gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); - checkKernelErrors((compute_tf32gemm<<>>(A, B, C, D, alpha, beta))); - break; + switch (selected_kernel) { + case tf32mma_shmem_gemm_async_copy: + default: + checkCudaErrors(cudaFuncSetAttribute( + compute_tf32gemm_async_copy, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors( + (compute_tf32gemm_async_copy<<>>( + A, B, C, D, 
alpha, beta))); + break; + case tf32mma_shmem_gemm: + checkCudaErrors( + cudaFuncSetAttribute(compute_tf32gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors((compute_tf32gemm<<>>( + A, B, C, D, alpha, beta))); + break; } #if CPU_DEBUG - checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost)); #endif } else { dim3 gridDim; dim3 blockDim; - + // blockDim.x must be a multple of warpSize // 128x4 means we have 16 warps and a block computes a 64x64 output tile blockDim.x = 128; @@ -803,11 +823,7 @@ int main(int argc, char **argv) memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL); - matMultiplyOnHost(A_h, B_h, result_host, - alpha, beta, - M_GLOBAL, K_GLOBAL, - K_GLOBAL, N_GLOBAL, - M_GLOBAL, N_GLOBAL); + matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { if (fabs(result_hD[i] - result_host[i]) > 0.1f) { @@ -823,15 +839,15 @@ int main(int argc, char **argv) checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); printf("Time: %f ms\n", milliseconds); - printf("TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + printf("TFLOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12); free(A_h); free(B_h); free(C_h); - checkCudaErrors(cudaFree((void*)A)); - checkCudaErrors(cudaFree((void*)B)); - checkCudaErrors(cudaFree((void*)C)); - checkCudaErrors(cudaFree((void*)D)); + checkCudaErrors(cudaFree((void *)A)); + checkCudaErrors(cudaFree((void *)B)); + checkCudaErrors(cudaFree((void *)C)); + checkCudaErrors(cudaFree((void *)D)); return 0; } diff --git a/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu b/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu index 7ffc0879..f6cdaa90 100644 --- a/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu +++ b/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG.cu @@ -27,301 +27,287 @@ #include // includes, project +#include +#include +#include #include #include -#include - -#include -#include - namespace cg = cooperative_groups; -#define NUM_ELEMS 10000000 +#define NUM_ELEMS 10000000 #define NUM_THREADS_PER_BLOCK 512 // warp-aggregated atomic increment -__device__ int atomicAggInc(int *counter) { - cg::coalesced_group active = cg::coalesced_threads(); +__device__ int atomicAggInc(int *counter) +{ + cg::coalesced_group active = cg::coalesced_threads(); - // leader does the update - int res = 0; - if (active.thread_rank() == 0) { - res = atomicAdd(counter, active.size()); - } + // leader does the update + int res = 0; + if (active.thread_rank() == 0) { + res = atomicAdd(counter, active.size()); + } - // broadcast result - res = active.shfl(res, 0); + // broadcast result + res = active.shfl(res, 0); - // each thread computes its own value - return res + active.thread_rank(); + // each thread computes its own value + return res + active.thread_rank(); } -__global__ void filter_arr(int *dst, int *nres, const int *src, int n) { - int id = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void filter_arr(int *dst, int *nres, const int *src, int n) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = id; i < n; i += gridDim.x * blockDim.x) { - if (src[i] > 0) 
dst[atomicAggInc(nres)] = src[i]; - } + for (int i = id; i < n; i += gridDim.x * blockDim.x) { + if (src[i] > 0) + dst[atomicAggInc(nres)] = src[i]; + } } // warp-aggregated atomic multi bucket increment #if __CUDA_ARCH__ >= 700 __device__ int atomicAggIncMulti(const int bucket, int *counter) { - cg::coalesced_group active = cg::coalesced_threads(); - // group all threads with same bucket value. - auto labeledGroup = cg::labeled_partition(active, bucket); + cg::coalesced_group active = cg::coalesced_threads(); + // group all threads with same bucket value. + auto labeledGroup = cg::labeled_partition(active, bucket); - int res = 0; - if (labeledGroup.thread_rank() == 0) - { - res = atomicAdd(&counter[bucket], labeledGroup.size()); - } + int res = 0; + if (labeledGroup.thread_rank() == 0) { + res = atomicAdd(&counter[bucket], labeledGroup.size()); + } - // broadcast result - res = labeledGroup.shfl(res, 0); + // broadcast result + res = labeledGroup.shfl(res, 0); - // each thread computes its own value - return res + labeledGroup.thread_rank(); + // each thread computes its own value + return res + labeledGroup.thread_rank(); } #endif // Places individual value indices into its corresponding buckets. -__global__ void mapToBuckets(const int *srcArr, int *indicesBuckets, int *bucketCounters, const int srcSize, const int numOfBuckets) +__global__ void +mapToBuckets(const int *srcArr, int *indicesBuckets, int *bucketCounters, const int srcSize, const int numOfBuckets) { #if __CUDA_ARCH__ >= 700 - cg::grid_group grid = cg::this_grid(); + cg::grid_group grid = cg::this_grid(); - for (int i=grid.thread_rank(); i < srcSize; i += grid.size()) - { - const int bucket = srcArr[i]; - if (bucket < numOfBuckets) - { - indicesBuckets[atomicAggIncMulti(bucket, bucketCounters)] = i; + for (int i = grid.thread_rank(); i < srcSize; i += grid.size()) { + const int bucket = srcArr[i]; + if (bucket < numOfBuckets) { + indicesBuckets[atomicAggIncMulti(bucket, bucketCounters)] = i; + } } - } #endif } int mapIndicesToBuckets(int *h_srcArr, int *d_srcArr, int numOfBuckets) { - int *d_indicesBuckets, *d_bucketCounters; - int *cpuBucketCounters = new int[numOfBuckets]; - int *h_bucketCounters = new int[numOfBuckets]; + int *d_indicesBuckets, *d_bucketCounters; + int *cpuBucketCounters = new int[numOfBuckets]; + int *h_bucketCounters = new int[numOfBuckets]; - memset(cpuBucketCounters, 0, sizeof(int)*numOfBuckets); - // Initialize each bucket counters. 
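For contrast with atomicAggInc above: a naive filter issues one global atomic per passing thread, while the aggregated version issues one atomicAdd per coalesced group and broadcasts the base index with shfl. A sketch of the naive form (illustrative, not in the sample):

__global__ void filter_plain(int *dst, int *nres, const int *src, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n && src[i] > 0)
        dst[atomicAdd(nres, 1)] = src[i]; // contended: one atomic per thread
}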
- for (int i = 0; i < numOfBuckets; i++)
-    {
-        h_bucketCounters[i] = i*NUM_ELEMS;
-    }
-
-    checkCudaErrors(cudaMalloc(&d_indicesBuckets, sizeof(int) * NUM_ELEMS * numOfBuckets));
-    checkCudaErrors(cudaMalloc(&d_bucketCounters, sizeof(int) * numOfBuckets));
-
-    checkCudaErrors(cudaMemcpy(d_bucketCounters, h_bucketCounters, sizeof(int)*numOfBuckets, cudaMemcpyHostToDevice));
-
-    dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
-    dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK), 1, 1);
-
-    mapToBuckets<<<dimGrid, dimBlock>>>(d_srcArr, d_indicesBuckets, d_bucketCounters, NUM_ELEMS, numOfBuckets);
-
-    checkCudaErrors(cudaMemcpy(h_bucketCounters, d_bucketCounters, sizeof(int)*numOfBuckets, cudaMemcpyDeviceToHost));
-
-    for (int i=0; i < NUM_ELEMS; i++)
-    {
-        cpuBucketCounters[h_srcArr[i]]++;
-    }
-
-    bool allMatch = true;
-    int finalElems = 0;
-    for (int i=0; i < numOfBuckets; i++)
-    {
-        finalElems += (h_bucketCounters[i] - i*NUM_ELEMS);
-        if (cpuBucketCounters[i] != (h_bucketCounters[i] - i*NUM_ELEMS))
-        {
-            allMatch = false;
-            break;
+    memset(cpuBucketCounters, 0, sizeof(int) * numOfBuckets);
+    // Initialize each bucket counter.
+    for (int i = 0; i < numOfBuckets; i++) {
+        h_bucketCounters[i] = i * NUM_ELEMS;
    }
-    }
-    if (!allMatch && finalElems != NUM_ELEMS)
-    {
-        return EXIT_FAILURE;
-    }
-    return EXIT_SUCCESS;
+    checkCudaErrors(cudaMalloc(&d_indicesBuckets, sizeof(int) * NUM_ELEMS * numOfBuckets));
+    checkCudaErrors(cudaMalloc(&d_bucketCounters, sizeof(int) * numOfBuckets));
+
+    checkCudaErrors(cudaMemcpy(d_bucketCounters, h_bucketCounters, sizeof(int) * numOfBuckets, cudaMemcpyHostToDevice));
+
+    dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
+    dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK), 1, 1);
+
+    mapToBuckets<<<dimGrid, dimBlock>>>(d_srcArr, d_indicesBuckets, d_bucketCounters, NUM_ELEMS, numOfBuckets);
+
+    checkCudaErrors(cudaMemcpy(h_bucketCounters, d_bucketCounters, sizeof(int) * numOfBuckets, cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < NUM_ELEMS; i++) {
+        cpuBucketCounters[h_srcArr[i]]++;
+    }
+
+    bool allMatch = true;
+    int finalElems = 0;
+    for (int i = 0; i < numOfBuckets; i++) {
+        finalElems += (h_bucketCounters[i] - i * NUM_ELEMS);
+        if (cpuBucketCounters[i] != (h_bucketCounters[i] - i * NUM_ELEMS)) {
+            allMatch = false;
+            break;
+        }
+    }
+
+    if (!allMatch && finalElems != NUM_ELEMS) {
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
}

// Warp-aggregated atomic Max in multi bucket
#if __CUDA_ARCH__ >= 700
__device__ void atomicAggMaxMulti(const int bucket, int *counter, const int valueForMax)
{
-    cg::coalesced_group active = cg::coalesced_threads();
-    // group all threads with the same bucket value.
-    auto labeledGroup = cg::labeled_partition(active, bucket);
+    cg::coalesced_group active = cg::coalesced_threads();
+    // group all threads with the same bucket value.
+    auto labeledGroup = cg::labeled_partition(active, bucket);

-    const int maxValueInGroup = cg::reduce(labeledGroup, valueForMax, cg::greater<int>());
+    const int maxValueInGroup = cg::reduce(labeledGroup, valueForMax, cg::greater<int>());

-    if (labeledGroup.thread_rank() == 0)
-    {
-        atomicMax(&counter[bucket], maxValueInGroup);
-    }
+    if (labeledGroup.thread_rank() == 0) {
+        atomicMax(&counter[bucket], maxValueInGroup);
+    }
}
#endif

// Performs max calculation in each bucket.
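Both multi-bucket helpers here hinge on cg::labeled_partition, which splits the coalesced warp into one group per distinct label (backed by the MATCH instruction on SM7x and later). The same idea as a self-contained histogram sketch (bin count and names assumed, not from the sample):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void histogram8(const int *values, int *bins, int n)
{
#if __CUDA_ARCH__ >= 700
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        int bin = values[i] & 7; // 8 bins
        cg::coalesced_group active = cg::coalesced_threads();
        auto group = cg::labeled_partition(active, bin);
        if (group.thread_rank() == 0)
            atomicAdd(&bins[bin], group.size()); // one atomic per bin per warp
    }
#endif
}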
-__global__ void calculateMaxInEachBuckets(const int *srcArr, const int *valueInBuckets, int *bucketsMax, const int srcSize, const int numOfBuckets) +__global__ void calculateMaxInEachBuckets(const int *srcArr, + const int *valueInBuckets, + int *bucketsMax, + const int srcSize, + const int numOfBuckets) { #if __CUDA_ARCH__ >= 700 - cg::grid_group grid = cg::this_grid(); + cg::grid_group grid = cg::this_grid(); - for (int i=grid.thread_rank(); i < srcSize; i += grid.size()) - { - const int bucket = srcArr[i]; - if (bucket < numOfBuckets) - { - atomicAggMaxMulti(bucket, bucketsMax, valueInBuckets[i]); + for (int i = grid.thread_rank(); i < srcSize; i += grid.size()) { + const int bucket = srcArr[i]; + if (bucket < numOfBuckets) { + atomicAggMaxMulti(bucket, bucketsMax, valueInBuckets[i]); + } } - } #endif } int calculateMaxInBuckets(int *h_srcArr, int *d_srcArr, int numOfBuckets) { - int *d_valueInBuckets, *d_bucketsMax; - int *h_valueInBuckets = new int[NUM_ELEMS]; - int *cpuBucketsMax = new int[numOfBuckets]; - int *h_bucketsMax = new int[numOfBuckets]; + int *d_valueInBuckets, *d_bucketsMax; + int *h_valueInBuckets = new int[NUM_ELEMS]; + int *cpuBucketsMax = new int[numOfBuckets]; + int *h_bucketsMax = new int[numOfBuckets]; - memset(cpuBucketsMax, 0, sizeof(int) * numOfBuckets); + memset(cpuBucketsMax, 0, sizeof(int) * numOfBuckets); - // Here we create values which is assumed to correspond to each - // buckets of srcArr at same array index. - for (int i=0; i < NUM_ELEMS; i++) - { - h_valueInBuckets[i] = rand(); - } - - checkCudaErrors(cudaMalloc(&d_valueInBuckets, sizeof(int) * NUM_ELEMS)); - checkCudaErrors(cudaMalloc(&d_bucketsMax, sizeof(int) * numOfBuckets)); - - checkCudaErrors(cudaMemset(d_bucketsMax, 0, sizeof(int) * numOfBuckets)); - checkCudaErrors(cudaMemcpy(d_valueInBuckets, h_valueInBuckets, sizeof(int) * NUM_ELEMS, cudaMemcpyHostToDevice)); - - dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1); - dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK), 1, 1); - - calculateMaxInEachBuckets<<>>(d_srcArr, d_valueInBuckets, d_bucketsMax, NUM_ELEMS, numOfBuckets); - - checkCudaErrors(cudaMemcpy(h_bucketsMax, d_bucketsMax, sizeof(int) * numOfBuckets, cudaMemcpyDeviceToHost)); - - for (int i = 0; i < NUM_ELEMS; i++) - { - if (cpuBucketsMax[h_srcArr[i]] < h_valueInBuckets[i]) - { - cpuBucketsMax[h_srcArr[i]] = h_valueInBuckets[i]; + // Here we create values which is assumed to correspond to each + // buckets of srcArr at same array index. 
+ for (int i = 0; i < NUM_ELEMS; i++) { + h_valueInBuckets[i] = rand(); } - } - bool allMatch = true; - int finalElems = 0; - for (int i=0; i < numOfBuckets; i++) - { - if (cpuBucketsMax[i] != h_bucketsMax[i]) - { - allMatch = false; - printf("CPU i=%d max = %d mismatches GPU max = %d\n", i, cpuBucketsMax[i], h_bucketsMax[i]); - break; + checkCudaErrors(cudaMalloc(&d_valueInBuckets, sizeof(int) * NUM_ELEMS)); + checkCudaErrors(cudaMalloc(&d_bucketsMax, sizeof(int) * numOfBuckets)); + + checkCudaErrors(cudaMemset(d_bucketsMax, 0, sizeof(int) * numOfBuckets)); + checkCudaErrors(cudaMemcpy(d_valueInBuckets, h_valueInBuckets, sizeof(int) * NUM_ELEMS, cudaMemcpyHostToDevice)); + + dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1); + dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK), 1, 1); + + calculateMaxInEachBuckets<<>>(d_srcArr, d_valueInBuckets, d_bucketsMax, NUM_ELEMS, numOfBuckets); + + checkCudaErrors(cudaMemcpy(h_bucketsMax, d_bucketsMax, sizeof(int) * numOfBuckets, cudaMemcpyDeviceToHost)); + + for (int i = 0; i < NUM_ELEMS; i++) { + if (cpuBucketsMax[h_srcArr[i]] < h_valueInBuckets[i]) { + cpuBucketsMax[h_srcArr[i]] = h_valueInBuckets[i]; + } } - } - if (allMatch) - { - printf("CPU max matches GPU max\n"); - } - delete[] h_valueInBuckets; - delete[] cpuBucketsMax; - delete[] h_bucketsMax; - checkCudaErrors(cudaFree(d_valueInBuckets)); - checkCudaErrors(cudaFree(d_bucketsMax)); + bool allMatch = true; + int finalElems = 0; + for (int i = 0; i < numOfBuckets; i++) { + if (cpuBucketsMax[i] != h_bucketsMax[i]) { + allMatch = false; + printf("CPU i=%d max = %d mismatches GPU max = %d\n", i, cpuBucketsMax[i], h_bucketsMax[i]); + break; + } + } + if (allMatch) { + printf("CPU max matches GPU max\n"); + } - if (!allMatch && finalElems != NUM_ELEMS) - { - return EXIT_FAILURE; - } + delete[] h_valueInBuckets; + delete[] cpuBucketsMax; + delete[] h_bucketsMax; + checkCudaErrors(cudaFree(d_valueInBuckets)); + checkCudaErrors(cudaFree(d_bucketsMax)); - return EXIT_SUCCESS; + if (!allMatch && finalElems != NUM_ELEMS) { + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; } -int main(int argc, char **argv) { - int *data_to_filter, *filtered_data, nres = 0; - int *d_data_to_filter, *d_filtered_data, *d_nres; +int main(int argc, char **argv) +{ + int *data_to_filter, *filtered_data, nres = 0; + int *d_data_to_filter, *d_filtered_data, *d_nres; - int numOfBuckets = 5; + int numOfBuckets = 5; - data_to_filter = reinterpret_cast(malloc(sizeof(int) * NUM_ELEMS)); + data_to_filter = reinterpret_cast(malloc(sizeof(int) * NUM_ELEMS)); - // Generate input data. 
-  // Generate input data.
-  for (int i = 0; i < NUM_ELEMS; i++) {
-    data_to_filter[i] = rand() % numOfBuckets;
-  }
-
-  int devId = findCudaDevice(argc, (const char **)argv);
-
-  checkCudaErrors(cudaMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS));
-  checkCudaErrors(cudaMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS));
-  checkCudaErrors(cudaMalloc(&d_nres, sizeof(int)));
-
-  checkCudaErrors(cudaMemcpy(d_data_to_filter, data_to_filter,
-                             sizeof(int) * NUM_ELEMS, cudaMemcpyHostToDevice));
-  checkCudaErrors(cudaMemset(d_nres, 0, sizeof(int)));
-
-  dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
-  dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);
-
-  filter_arr<<<dimGrid, dimBlock>>>(d_filtered_data, d_nres, d_data_to_filter,
-                                    NUM_ELEMS);
-
-  checkCudaErrors(
-      cudaMemcpy(&nres, d_nres, sizeof(int), cudaMemcpyDeviceToHost));
-
-  filtered_data = reinterpret_cast<int *>(malloc(sizeof(int) * nres));
-
-  checkCudaErrors(cudaMemcpy(filtered_data, d_filtered_data, sizeof(int) * nres,
-                             cudaMemcpyDeviceToHost));
-
-  int *host_filtered_data =
-      reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS));
-
-  // Generate host output with host filtering code.
-  int host_flt_count = 0;
-  for (int i = 0; i < NUM_ELEMS; i++) {
-    if (data_to_filter[i] > 0) {
-      host_filtered_data[host_flt_count++] = data_to_filter[i];
+    // Generate input data.
+    for (int i = 0; i < NUM_ELEMS; i++) {
+        data_to_filter[i] = rand() % numOfBuckets;
+    }
-  }

-  int major = 0;
-  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId));
+    int devId = findCudaDevice(argc, (const char **)argv);

-  int mapIndicesToBucketsStatus = EXIT_SUCCESS;
-  int calculateMaxInBucketsStatus = EXIT_SUCCESS;
-  // atomicAggIncMulti & atomicAggMaxMulti require a GPU of Volta (SM7X) architecture or higher,
-  // so that it can take advantage of the new MATCH capability of Volta hardware
-  if (major >= 7) {
-    mapIndicesToBucketsStatus = mapIndicesToBuckets(data_to_filter, d_data_to_filter, numOfBuckets);
-    calculateMaxInBucketsStatus = calculateMaxInBuckets(data_to_filter, d_data_to_filter, numOfBuckets);
-  }
+    checkCudaErrors(cudaMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS));
+    checkCudaErrors(cudaMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS));
+    checkCudaErrors(cudaMalloc(&d_nres, sizeof(int)));

-  printf("\nWarp Aggregated Atomics %s \n",
-         (host_flt_count == nres) && (mapIndicesToBucketsStatus == EXIT_SUCCESS) &&
-         (calculateMaxInBucketsStatus == EXIT_SUCCESS) ? "PASSED" : "FAILED");
+    checkCudaErrors(cudaMemcpy(d_data_to_filter, data_to_filter, sizeof(int) * NUM_ELEMS, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemset(d_nres, 0, sizeof(int)));

-  checkCudaErrors(cudaFree(d_data_to_filter));
-  checkCudaErrors(cudaFree(d_filtered_data));
-  checkCudaErrors(cudaFree(d_nres));
-  free(data_to_filter);
-  free(filtered_data);
-  free(host_filtered_data);
+    dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
+    dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);
+
+    filter_arr<<<dimGrid, dimBlock>>>(d_filtered_data, d_nres, d_data_to_filter, NUM_ELEMS);
+
+    checkCudaErrors(cudaMemcpy(&nres, d_nres, sizeof(int), cudaMemcpyDeviceToHost));
+
+    filtered_data = reinterpret_cast<int *>(malloc(sizeof(int) * nres));
+
+    checkCudaErrors(cudaMemcpy(filtered_data, d_filtered_data, sizeof(int) * nres, cudaMemcpyDeviceToHost));
+
+    int *host_filtered_data = reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS));
+
+    // Generate host output with host filtering code.
+ int host_flt_count = 0; + for (int i = 0; i < NUM_ELEMS; i++) { + if (data_to_filter[i] > 0) { + host_filtered_data[host_flt_count++] = data_to_filter[i]; + } + } + + int major = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId)); + + int mapIndicesToBucketsStatus = EXIT_SUCCESS; + int calculateMaxInBucketsStatus = EXIT_SUCCESS; + // atomicAggIncMulti & atomicAggMaxMulti require a GPU of Volta (SM7X) architecture or higher, + // so that it can take advantage of the new MATCH capability of Volta hardware + if (major >= 7) { + mapIndicesToBucketsStatus = mapIndicesToBuckets(data_to_filter, d_data_to_filter, numOfBuckets); + calculateMaxInBucketsStatus = calculateMaxInBuckets(data_to_filter, d_data_to_filter, numOfBuckets); + } + + printf("\nWarp Aggregated Atomics %s \n", + (host_flt_count == nres) && (mapIndicesToBucketsStatus == EXIT_SUCCESS) + && (calculateMaxInBucketsStatus == EXIT_SUCCESS) + ? "PASSED" + : "FAILED"); + + checkCudaErrors(cudaFree(d_data_to_filter)); + checkCudaErrors(cudaFree(d_filtered_data)); + checkCudaErrors(cudaFree(d_nres)); + free(data_to_filter); + free(filtered_data); + free(host_filtered_data); } diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp index d57bdd9b..3aa2d152 100644 --- a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp +++ b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/FilterBorderControlNPP.cpp @@ -45,486 +45,558 @@ #include #include #include +#include #include #include +#include #include #include -#include -#include +inline int cudaDeviceInit(int argc, const char **argv) +{ + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); -inline int cudaDeviceInit(int argc, const char **argv) { - int deviceCount; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + if (deviceCount == 0) { + std::cerr << "CUDA error: no devices supporting CUDA." << std::endl; + exit(EXIT_FAILURE); + } - if (deviceCount == 0) { - std::cerr << "CUDA error: no devices supporting CUDA." << std::endl; - exit(EXIT_FAILURE); - } + int dev = findCudaDevice(argc, argv); - int dev = findCudaDevice(argc, argv); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name << std::endl; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name - << std::endl; + checkCudaErrors(cudaSetDevice(dev)); - checkCudaErrors(cudaSetDevice(dev)); - - return dev; + return dev; } -int main(int argc, char *argv[]) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char *argv[]) +{ + printf("%s Starting...\n\n", argv[0]); - try { - const char *inputFile = "teapot512.pgm"; - std::string sFilename = inputFile; - std::string sOutputDir = "./"; + try { + const char *inputFile = "teapot512.pgm"; + std::string sFilename = inputFile; + std::string sOutputDir = "./"; - cudaDeviceInit(argc, (const char **)argv); + cudaDeviceInit(argc, (const char **)argv); - NppStreamContext nppStreamCtx; - nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. + NppStreamContext nppStreamCtx; + nppStreamCtx.hStream = + 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. 
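The comment above on nppStreamCtx.hStream says to point it at your own stream when you are not on the NULL stream. A minimal sketch of that wiring; the stream name is hypothetical, and checkCudaErrors comes from helper_cuda.h as elsewhere in this sample:

    cudaStream_t appStream; // hypothetical application-owned stream
    checkCudaErrors(cudaStreamCreateWithFlags(&appStream, cudaStreamNonBlocking));
    nppStreamCtx.hStream = appStream; // *_Ctx NPP calls below then run on appStream
    // ... NPP work ...
    checkCudaErrors(cudaStreamSynchronize(appStream));
    checkCudaErrors(cudaStreamDestroy(appStream));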
- cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - { - printf("CUDA error: no devices supporting CUDA.\n"); - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) { + printf("CUDA error: no devices supporting CUDA.\n"); + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + } + + const NppLibraryVersion *libVer = nppGetLibVersion(); + + printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); + + int driverVersion, runtimeVersion; + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + + printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, + cudaDevAttrComputeCapabilityMajor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, + cudaDevAttrComputeCapabilityMinor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); + + cudaDeviceProp oDeviceProperties; + + cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); + + nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; + nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; + nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; + nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; + + char *filePath; + + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); + } + else { + filePath = sdkFindFilePath(inputFile, argv[0]); + } + + if (!filePath) { + std::cerr << "Couldn't find input file " << sFilename << std::endl; + exit(1); + } + + sFilename = filePath; + + // if we specify the filename at the command line, then we only test + // sFilename[0]. + int file_errors = 0; + std::ifstream infile(sFilename.data(), std::ifstream::in); + + if (infile.good()) { + std::cout << "gradientFilterBorderNPP opened <" << sFilename.data() << "> successfully!" 
<< std::endl; + file_errors = 0; + infile.close(); + } + else { + std::cout << "gradientFilterBorderNPP unable to open <" << sFilename.data() << ">" << std::endl; + file_errors++; + infile.close(); + } + + if (file_errors > 0) { + cudaDeviceReset(); + exit(EXIT_FAILURE); + } + + std::string sResultBaseFilename = sFilename; + + std::string::size_type dot = sResultBaseFilename.rfind('.'); + + if (dot != std::string::npos) { + sResultBaseFilename = sResultBaseFilename.substr(0, dot); + } + + std::string sResultXFilename = sOutputDir + sFilename + "_gradientVectorPrewittBorderX_Vertical.pgm"; + std::string sResultYFilename = sResultBaseFilename; + + // sResultXFilename += "_gradientVectorPrewittBorderX_Vertical.pgm"; + sResultYFilename += "_gradientVectorPrewittBorderY_Horizontal.pgm"; + + // if (checkCmdLineFlag(argc, (const char **)argv, "output")) + // { + // char *outputFilePath; + // getCmdLineArgumentString(argc, (const char **)argv, "output", + // &outputFilePath); sResultBaseFilename = outputFilePath; + // } + + // declare a host image object for an 8-bit grayscale image + npp::ImageCPU_8u_C1 oHostSrc; + // load gray-scale image from disk + npp::loadImage(sFilename, oHostSrc); + // declare a device image and copy construct from the host image, + // i.e. upload host to device + npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); + + NppiSize oSrcSize = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; + NppiPoint oSrcOffset = {0, 0}; + + // create struct with ROI size + NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; + // allocate device destination images of appropriatedly size + npp::ImageNPP_16s_C1 oDeviceDstX(oSizeROI.width, oSizeROI.height); + npp::ImageNPP_16s_C1 oDeviceDstY(oSizeROI.width, oSizeROI.height); + + // run Prewitt edge detection gradient vector filter + NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(oDeviceSrc.data(), + oDeviceSrc.pitch(), + oSrcSize, + oSrcOffset, + oDeviceDstX.data(), + oDeviceDstX.pitch(), + oDeviceDstY.data(), + oDeviceDstY.pitch(), + 0, + 0, + 0, + 0, + oSizeROI, + NPP_MASK_SIZE_3_X_3, + nppiNormL1, + NPP_BORDER_REPLICATE, + nppStreamCtx)); + + // allocate device destination images of appropriatedly size + npp::ImageNPP_8u_C1 oDeviceDstOutX(oSizeROI.width, oSizeROI.height); + npp::ImageNPP_8u_C1 oDeviceDstOutY(oSizeROI.width, oSizeROI.height); + + // convert 16s_C1 result images to binary 8u_C1 output images using constant + // value to adjust amount of visible detail + NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), + oDeviceDstX.pitch(), + 32, + oDeviceDstOutX.data(), + oDeviceDstOutX.pitch(), + oSizeROI, + NPP_CMP_GREATER_EQ, + nppStreamCtx)); + + NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), + oDeviceDstY.pitch(), + 32, + oDeviceDstOutY.data(), + oDeviceDstOutY.pitch(), + oSizeROI, + NPP_CMP_GREATER_EQ, + nppStreamCtx)); + + // create host images for the results + npp::ImageCPU_8u_C1 oHostDstX(oDeviceDstOutX.size()); + npp::ImageCPU_8u_C1 oHostDstY(oDeviceDstOutY.size()); + // and copy the device result data into them + oDeviceDstOutX.copyTo(oHostDstX.data(), oHostDstX.pitch()); + oDeviceDstOutY.copyTo(oHostDstY.data(), oHostDstY.pitch()); + + saveImage(sResultXFilename, oHostDstX); + std::cout << "Saved image: " << sResultXFilename << std::endl; + saveImage(sResultYFilename, oHostDstY); + std::cout << "Saved image: " << sResultYFilename << std::endl; + + // now use the Prewitt gradient border filter function in such a way that no + // border replication operations will be applied + + // 
create a Prewitt filter mask size object, Prewitt uses a 3x3 filter
+        // kernel
+        NppiSize oMaskSize = {3, 3};
+        // create a size object for the enlarged source image
+        NppiSize oEnlargedSrcSize = {oSrcSize.width + oMaskSize.width - 1, oSrcSize.height + oMaskSize.height - 1};
+
+        // create an enlarged device source image
+        npp::ImageNPP_8u_C1 oEnlargedDeviceSrc(oEnlargedSrcSize.width, oEnlargedSrcSize.height);
+
+        // copy and enlarge the original device source image and surround it with a
+        // white edge (border)
+        NPP_CHECK_NPP(nppiCopyConstBorder_8u_C1R_Ctx(oDeviceSrc.data(),
+                                                     oDeviceSrc.pitch(),
+                                                     oSrcSize,
+                                                     oEnlargedDeviceSrc.data(),
+                                                     oEnlargedDeviceSrc.pitch(),
+                                                     oEnlargedSrcSize,
+                                                     oMaskSize.width / 2,
+                                                     oMaskSize.height / 2,
+                                                     255,
+                                                     nppStreamCtx));
+
+        // adjust oEnlargedDeviceSrc pixel pointer to point to the first pixel of
+        // the original source image in the enlarged source image
+        const Npp8u *pTemp = reinterpret_cast<const Npp8u *>(oEnlargedDeviceSrc.data());
+        pTemp += (oMaskSize.height / 2) * oEnlargedDeviceSrc.pitch();
+        const Npp8u *pAdjustedSrc = reinterpret_cast<const Npp8u *>((void *)(pTemp));
+        pAdjustedSrc += oMaskSize.width / 2;
+
+        // create device output images for the no source border results
+        npp::ImageNPP_8u_C1 oDeviceDstOutXNoBorders(oSizeROI.width, oSizeROI.height);
+        npp::ImageNPP_8u_C1 oDeviceDstOutYNoBorders(oSizeROI.width, oSizeROI.height);
+
+        // tell the filter function what cartesian pixel position pAdjustedSrc is
+        // pointing to within the enlarged source image
+        oSrcOffset.x += oMaskSize.width / 2;
+        oSrcOffset.y += oMaskSize.height / 2;
+
+        // run Prewitt edge detection gradient vector filter bypassing border
+        // control due to enlarged source image
+        NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(pAdjustedSrc,
+                                                                    oEnlargedDeviceSrc.pitch(),
+                                                                    oEnlargedSrcSize,
+                                                                    oSrcOffset,
+                                                                    oDeviceDstX.data(),
+                                                                    oDeviceDstX.pitch(),
+                                                                    oDeviceDstY.data(),
+                                                                    oDeviceDstY.pitch(),
+                                                                    0,
+                                                                    0,
+                                                                    0,
+                                                                    0,
+                                                                    oSizeROI,
+                                                                    NPP_MASK_SIZE_3_X_3,
+                                                                    nppiNormL1,
+                                                                    NPP_BORDER_REPLICATE,
+                                                                    nppStreamCtx));
+
+        // convert 16s_C1 result images to binary 8u_C1 output images using constant
+        // value to adjust amount of visible detail
+        NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(),
+                                               oDeviceDstX.pitch(),
+                                               32,
+                                               oDeviceDstOutXNoBorders.data(),
+                                               oDeviceDstOutXNoBorders.pitch(),
+                                               oSizeROI,
+                                               NPP_CMP_GREATER_EQ,
+                                               nppStreamCtx));
+
+        NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(),
+                                               oDeviceDstY.pitch(),
+                                               32,
+                                               oDeviceDstOutYNoBorders.data(),
+                                               oDeviceDstOutYNoBorders.pitch(),
+                                               oSizeROI,
+                                               NPP_CMP_GREATER_EQ,
+                                               nppStreamCtx));
+        // create additional output files
+        std::string sResultXNoBordersFilename = sResultBaseFilename;
+        std::string sResultYNoBordersFilename = sResultBaseFilename;
+
+        sResultXNoBordersFilename += "_gradientVectorPrewittBorderX_Vertical_WithNoSourceBorders.pgm";
+        sResultYNoBordersFilename += "_gradientVectorPrewittBorderY_Horizontal_WithNoSourceBorders.pgm";
+
+        // copy the device result data into the host output images
+        oDeviceDstOutXNoBorders.copyTo(oHostDstX.data(), oHostDstX.pitch());
+        oDeviceDstOutYNoBorders.copyTo(oHostDstY.data(), oHostDstY.pitch());
+
+        saveImage(sResultXNoBordersFilename, oHostDstX);
+        std::cout << "Saved image: " << sResultXNoBordersFilename << std::endl;
+        saveImage(sResultYNoBordersFilename, oHostDstY);
+        std::cout << "Saved image: " << sResultYNoBordersFilename << std::endl;
+
+        // now diff the two output images, one using border control and one
+        // bypassing border control
+
+        // create device output images for the diff
results + npp::ImageNPP_8u_C1 oDeviceDstOutXDiff(oSizeROI.width, oSizeROI.height); + npp::ImageNPP_8u_C1 oDeviceDstOutYDiff(oSizeROI.width, oSizeROI.height); + + // diff the two 8u_C1 result images one with and one without border control + + NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(oDeviceDstOutXNoBorders.data(), + oDeviceDstOutXNoBorders.pitch(), + oDeviceDstOutX.data(), + oDeviceDstOutX.pitch(), + oDeviceDstOutXDiff.data(), + oDeviceDstOutXDiff.pitch(), + oSizeROI, + nppStreamCtx)); + + NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(oDeviceDstOutYNoBorders.data(), + oDeviceDstOutYNoBorders.pitch(), + oDeviceDstOutY.data(), + oDeviceDstOutY.pitch(), + oDeviceDstOutYDiff.data(), + oDeviceDstOutYDiff.pitch(), + oSizeROI, + nppStreamCtx)); + + // create additional output files + std::string sResultXDiffFilename = sResultBaseFilename; + std::string sResultYDiffFilename = sResultBaseFilename; + + sResultXDiffFilename += "_gradientVectorPrewittBorderX_Vertical_BorderDiffs.pgm"; + sResultYDiffFilename += "_gradientVectorPrewittBorderY_Horizontal_BorderDiffs.pgm"; + + // copy the device result data into the host output images + oDeviceDstOutXDiff.copyTo(oHostDstX.data(), oHostDstX.pitch()); + oDeviceDstOutYDiff.copyTo(oHostDstY.data(), oHostDstY.pitch()); + + saveImage(sResultXDiffFilename, oHostDstX); + std::cout << "Saved image: " << sResultXDiffFilename << std::endl; + saveImage(sResultYDiffFilename, oHostDstY); + std::cout << "Saved image: " << sResultYDiffFilename << std::endl; + + // if you closely examine the above difference files (recommend using GIMP + // for viewing using scaling with no interpolation) you will see several + // single pixel differences (white pixels) along the right and bottom edges + // of the default vs. borderless images this happens because border pixels + // in the original source image are duplicated when the filter kernels + // overlap the edge of the source image when using the first version of the + // filter call but are actually sampled from the enlarged source image when + // using the second version of the filter call the technique used in the + // second filter call can be used with any filter border function in NPP to + // duplicate results that would be generated from a non-border filter + // function call by filling the border pixel outside the embedded source + // image with the appropriate border pixel values + + // here is how to use border control to process a source image in multiple + // calls and get correct output in the destination image + + // since the source image pointer already points to the beginning of the + // source image in the enlarged source image it doesn't need changed + + // tighten up the top and left source image borders - this will enable + // border replication on the left and top borders of the original source + // image + oSrcOffset.x = 0; + oSrcOffset.y = 0; + // tighten up the right and bottom side source image borders - this will + // enable border replication on the right and bottom borders of the original + // source image + oEnlargedSrcSize.width = oSrcSize.width; + oEnlargedSrcSize.height = oSrcSize.height; + + // create device output images for the mixed edge results + npp::ImageNPP_8u_C1 oDeviceDstOutXMixedBorders(oSizeROI.width, oSizeROI.height); + npp::ImageNPP_8u_C1 oDeviceDstOutYMixedBorders(oSizeROI.width, oSizeROI.height); + + // shrink output ROI width so that only the left half of the destination + // image will be generated however since oEnlargedSrcSize.width is still set + // to oSrcSize.width then border control 
will be disabled when the filter + // needs to access source pixels beyond the right side of the left half of + // the source image + int nLeftWidth = oSizeROI.width / 2; + int nRightWidth = oSizeROI.width - nLeftWidth; + oSizeROI.width = nLeftWidth; + + // run Prewitt edge detection gradient vector filter to generate the left + // side of the output image + NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(pAdjustedSrc, + oEnlargedDeviceSrc.pitch(), + oEnlargedSrcSize, + oSrcOffset, + oDeviceDstX.data(), + oDeviceDstX.pitch(), + oDeviceDstY.data(), + oDeviceDstY.pitch(), + 0, + 0, + 0, + 0, + oSizeROI, + NPP_MASK_SIZE_3_X_3, + nppiNormL1, + NPP_BORDER_REPLICATE, + nppStreamCtx)); + + // now move the enlarged source pointer to the horizontal middle of the + // enlarged source image and tell the function where it was moved to + pAdjustedSrc += nLeftWidth; + // and adjust the source offset parameter accordingly - this will in effect + // turn off border control for the left border allowing the necessary source + // pixels to be used + oSrcOffset.x += nLeftWidth; + + // update oSizeROI.width so that only enough destination pixels will be + // produced to fill the right half of the destination image + oSizeROI.width = nRightWidth; + + // run Prewitt edge detection gradient vector filter to generate the right + // side of the output image adjusting the destination image pointers + // appropriately + NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(pAdjustedSrc, + oEnlargedDeviceSrc.pitch(), + oEnlargedSrcSize, + oSrcOffset, + oDeviceDstX.data() + nLeftWidth, + oDeviceDstX.pitch(), + oDeviceDstY.data() + nLeftWidth, + oDeviceDstY.pitch(), + 0, + 0, + 0, + 0, + oSizeROI, + NPP_MASK_SIZE_3_X_3, + nppiNormL1, + NPP_BORDER_REPLICATE, + nppStreamCtx)); + + // convert 16s_C1 result images to binary 8u_C1 output images using constant + // value to adjust amount of visible detail + NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), + oDeviceDstX.pitch(), + 32, + oDeviceDstOutXMixedBorders.data(), + oDeviceDstOutXMixedBorders.pitch(), + oSizeROI, + NPP_CMP_GREATER_EQ, + nppStreamCtx)); + + NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), + oDeviceDstY.pitch(), + 32, + oDeviceDstOutYMixedBorders.data(), + oDeviceDstOutYMixedBorders.pitch(), + oSizeROI, + NPP_CMP_GREATER_EQ, + nppStreamCtx)); + // create additional output files + std::string sResultXMixedBordersFilename = sResultBaseFilename; + std::string sResultYMixedBordersFilename = sResultBaseFilename; + + sResultXMixedBordersFilename += "_gradientVectorPrewittBorderX_Vertical_WithMixedBorders.pgm"; + sResultYMixedBordersFilename += "_gradientVectorPrewittBorderY_Horizontal_WithMixedBorders.pgm"; + + // copy the device result data into the host output images + oDeviceDstOutXMixedBorders.copyTo(oHostDstX.data(), oHostDstX.pitch()); + oDeviceDstOutYMixedBorders.copyTo(oHostDstY.data(), oHostDstY.pitch()); + + saveImage(sResultXMixedBordersFilename, oHostDstX); + std::cout << "Saved image: " << sResultXMixedBordersFilename << std::endl; + saveImage(sResultYMixedBordersFilename, oHostDstY); + std::cout << "Saved image: " << sResultYMixedBordersFilename << std::endl; + + // diff the original 8u_C1 result images with border control and the mixed + // border control images, they should match (diff image will be all black) + + NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(oDeviceDstOutXMixedBorders.data(), + oDeviceDstOutXMixedBorders.pitch(), + oDeviceDstOutX.data(), + oDeviceDstOutX.pitch(), + oDeviceDstOutXDiff.data(), + 
oDeviceDstOutXDiff.pitch(), + oSizeROI, + nppStreamCtx)); + + NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx(oDeviceDstOutYMixedBorders.data(), + oDeviceDstOutYMixedBorders.pitch(), + oDeviceDstOutY.data(), + oDeviceDstOutY.pitch(), + oDeviceDstOutYDiff.data(), + oDeviceDstOutYDiff.pitch(), + oSizeROI, + nppStreamCtx)); + + // create additional output files + std::string sResultXMixedDiffFilename = sResultBaseFilename; + std::string sResultYMixedDiffFilename = sResultBaseFilename; + + sResultXMixedDiffFilename += "_gradientVectorPrewittBorderX_Vertical_MixedBorderDiffs.pgm"; + sResultYMixedDiffFilename += "_gradientVectorPrewittBorderY_Horizontal_MixedBorderDiffs.pgm"; + + // copy the device result data into the host output images + oDeviceDstOutXDiff.copyTo(oHostDstX.data(), oHostDstX.pitch()); + oDeviceDstOutYDiff.copyTo(oHostDstY.data(), oHostDstY.pitch()); + + saveImage(sResultXMixedDiffFilename, oHostDstX); + std::cout << "Saved image: " << sResultXMixedDiffFilename << std::endl; + saveImage(sResultYMixedDiffFilename, oHostDstY); + std::cout << "Saved image: " << sResultYMixedDiffFilename << std::endl; + + nppiFree(oDeviceSrc.data()); + nppiFree(oDeviceDstX.data()); + nppiFree(oDeviceDstY.data()); + nppiFree(oDeviceDstOutX.data()); + nppiFree(oDeviceDstOutY.data()); + nppiFree(oDeviceDstOutXNoBorders.data()); + nppiFree(oDeviceDstOutYNoBorders.data()); + nppiFree(oDeviceDstOutXDiff.data()); + nppiFree(oDeviceDstOutYDiff.data()); + nppiFree(oDeviceDstOutXMixedBorders.data()); + nppiFree(oDeviceDstOutYMixedBorders.data()); + nppiFree(oEnlargedDeviceSrc.data()); + + cudaDeviceReset(); + exit(EXIT_SUCCESS); + } + catch (npp::Exception &rException) { + std::cerr << "Program error! The following exception occurred: \n"; + std::cerr << rException << std::endl; + std::cerr << "Aborting." << std::endl; + + cudaDeviceReset(); + exit(EXIT_FAILURE); + } + catch (...) { + std::cerr << "Program error! An unknow type of exception occurred. \n"; + std::cerr << "Aborting." 
<< std::endl; + + cudaDeviceReset(); + exit(EXIT_FAILURE); + return -1; } - const NppLibraryVersion *libVer = nppGetLibVersion(); - - printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); - - int driverVersion, runtimeVersion; - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); - - printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); - printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10); - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, - cudaDevAttrComputeCapabilityMajor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, - cudaDevAttrComputeCapabilityMinor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); - - cudaDeviceProp oDeviceProperties; - - cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); - - nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; - nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; - nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; - nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; - - char *filePath; - - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); - } else { - filePath = sdkFindFilePath(inputFile, argv[0]); - } - - if (!filePath) { - std::cerr << "Couldn't find input file " << sFilename << std::endl; - exit(1); - } - - sFilename = filePath; - - // if we specify the filename at the command line, then we only test - // sFilename[0]. - int file_errors = 0; - std::ifstream infile(sFilename.data(), std::ifstream::in); - - if (infile.good()) { - std::cout << "gradientFilterBorderNPP opened <" << sFilename.data() - << "> successfully!" << std::endl; - file_errors = 0; - infile.close(); - } else { - std::cout << "gradientFilterBorderNPP unable to open <" - << sFilename.data() << ">" << std::endl; - file_errors++; - infile.close(); - } - - if (file_errors > 0) { - cudaDeviceReset(); - exit(EXIT_FAILURE); - } - - std::string sResultBaseFilename = sFilename; - - std::string::size_type dot = sResultBaseFilename.rfind('.'); - - if (dot != std::string::npos) { - sResultBaseFilename = sResultBaseFilename.substr(0, dot); - } - - std::string sResultXFilename = - sOutputDir + sFilename + "_gradientVectorPrewittBorderX_Vertical.pgm"; - std::string sResultYFilename = sResultBaseFilename; - - // sResultXFilename += "_gradientVectorPrewittBorderX_Vertical.pgm"; - sResultYFilename += "_gradientVectorPrewittBorderY_Horizontal.pgm"; - - // if (checkCmdLineFlag(argc, (const char **)argv, "output")) - // { - // char *outputFilePath; - // getCmdLineArgumentString(argc, (const char **)argv, "output", - // &outputFilePath); sResultBaseFilename = outputFilePath; - // } - - // declare a host image object for an 8-bit grayscale image - npp::ImageCPU_8u_C1 oHostSrc; - // load gray-scale image from disk - npp::loadImage(sFilename, oHostSrc); - // declare a device image and copy construct from the host image, - // i.e. 
upload host to device
-    npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);
-
-    NppiSize oSrcSize = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()};
-    NppiPoint oSrcOffset = {0, 0};
-
-    // create struct with ROI size
-    NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()};
-    // allocate device destination images of appropriatedly size
-    npp::ImageNPP_16s_C1 oDeviceDstX(oSizeROI.width, oSizeROI.height);
-    npp::ImageNPP_16s_C1 oDeviceDstY(oSizeROI.width, oSizeROI.height);
-
-    // run Prewitt edge detection gradient vector filter
-    NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx(
-        oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset,
-        oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(),
-        oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3,
-        nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx));
-
-    // allocate device destination images of appropriatedly size
-    npp::ImageNPP_8u_C1 oDeviceDstOutX(oSizeROI.width, oSizeROI.height);
-    npp::ImageNPP_8u_C1 oDeviceDstOutY(oSizeROI.width, oSizeROI.height);
-
-    // convert 16s_C1 result images to binary 8u_C1 output images using constant
-    // value to adjust amount of visible detail
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(
-        oDeviceDstX.data(), oDeviceDstX.pitch(), 32, oDeviceDstOutX.data(),
-        oDeviceDstOutX.pitch(), oSizeROI, NPP_CMP_GREATER_EQ, nppStreamCtx));
-
-    NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(
-        oDeviceDstY.data(), oDeviceDstY.pitch(), 32, oDeviceDstOutY.data(),
-        oDeviceDstOutY.pitch(), oSizeROI, NPP_CMP_GREATER_EQ, nppStreamCtx));
-
-    // create host images for the results
-    npp::ImageCPU_8u_C1 oHostDstX(oDeviceDstOutX.size());
-    npp::ImageCPU_8u_C1 oHostDstY(oDeviceDstOutY.size());
-    // and copy the device result data into them
-    oDeviceDstOutX.copyTo(oHostDstX.data(), oHostDstX.pitch());
-    oDeviceDstOutY.copyTo(oHostDstY.data(), oHostDstY.pitch());
-
-    saveImage(sResultXFilename, oHostDstX);
-    std::cout << "Saved image: " << sResultXFilename << std::endl;
-    saveImage(sResultYFilename, oHostDstY);
-    std::cout << "Saved image: " << sResultYFilename << std::endl;
-
-    // now use the Prewitt gradient border filter function in such a way that no
-    // border replication operations will be applied
-
-    // create a Prewitt filter mask size object, Prewitt uses a 3x3 filter
-    // kernel
-    NppiSize oMaskSize = {3, 3};
-    // create a size object for the enlarged source image
-    NppiSize oEnlargedSrcSize = {oSrcSize.width + oMaskSize.width - 1,
-                                 oSrcSize.height + oMaskSize.height - 1};
-
-    // create an enlarged device source image
-    npp::ImageNPP_8u_C1 oEnlargedDeviceSrc(oEnlargedSrcSize.width,
-                                           oEnlargedSrcSize.height);
-
-    // copy and enlarge the original device source image and surround it with a
-    // white edge (border)
-    NPP_CHECK_NPP(nppiCopyConstBorder_8u_C1R_Ctx(
-        oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize,
-        oEnlargedDeviceSrc.data(), oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize,
-        oMaskSize.width / 2, oMaskSize.height / 2, 255, nppStreamCtx));
-
-    // adjust oEnlargedDeviceSrc pixel pointer to point to the first pixel of
-    // the original source image in the enlarged source image
-    const Npp8u *pTemp =
-        reinterpret_cast<const Npp8u *>(oEnlargedDeviceSrc.data());
-    pTemp += (oMaskSize.height / 2) * oEnlargedDeviceSrc.pitch();
-    const Npp8u *pAdjustedSrc =
-        reinterpret_cast<const Npp8u *>((void *)(pTemp));
-    pAdjustedSrc += oMaskSize.width / 2;
-
-    // create device output images for the no source border results
-    npp::ImageNPP_8u_C1 oDeviceDstOutXNoBorders(oSizeROI.width,
-                                                oSizeROI.height);
-    npp::ImageNPP_8u_C1
oDeviceDstOutYNoBorders(oSizeROI.width, - oSizeROI.height); - - // tell the filter function what cartesian pixel position pAdjustedSrc is - // pointing to within the enlarged source image - oSrcOffset.x += oMaskSize.width / 2; - oSrcOffset.y += oMaskSize.height / 2; - - // run Prewitt edge detection gradient vector filter bypassing border - // control due to enlarged source image - NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( - pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset, - oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(), - oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3, - nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx)); - - // convert 16s_C1 result images to binary 8u_C1 output images using constant - // value to adjust amount of visible detail - NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), oDeviceDstX.pitch(), - 32, oDeviceDstOutXNoBorders.data(), - oDeviceDstOutXNoBorders.pitch(), - oSizeROI, NPP_CMP_GREATER_EQ, - nppStreamCtx)); - - NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), oDeviceDstY.pitch(), - 32, oDeviceDstOutYNoBorders.data(), - oDeviceDstOutYNoBorders.pitch(), - oSizeROI, NPP_CMP_GREATER_EQ, - nppStreamCtx)); - // create additional output files - std::string sResultXNoBordersFilename = sResultBaseFilename; - std::string sResultYNoBordersFilename = sResultBaseFilename; - - sResultXNoBordersFilename += - "_gradientVectorPrewittBorderX_Vertical_WithNoSourceBorders.pgm"; - sResultYNoBordersFilename += - "_gradientVectorPrewittBorderY_Horizontal_WithNoSourceBorders.pgm"; - - // copy the device result data into the host output images - oDeviceDstOutXNoBorders.copyTo(oHostDstX.data(), oHostDstX.pitch()); - oDeviceDstOutYNoBorders.copyTo(oHostDstY.data(), oHostDstY.pitch()); - - saveImage(sResultXNoBordersFilename, oHostDstX); - std::cout << "Saved image: " << sResultXNoBordersFilename << std::endl; - saveImage(sResultYNoBordersFilename, oHostDstY); - std::cout << "Saved image: " << sResultYNoBordersFilename << std::endl; - - // now diff the two output images, one using border control and one - // bypassing border control - - // create device output images for the diff results - npp::ImageNPP_8u_C1 oDeviceDstOutXDiff(oSizeROI.width, oSizeROI.height); - npp::ImageNPP_8u_C1 oDeviceDstOutYDiff(oSizeROI.width, oSizeROI.height); - - // diff the two 8u_C1 result images one with and one without border control - - NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( - oDeviceDstOutXNoBorders.data(), oDeviceDstOutXNoBorders.pitch(), - oDeviceDstOutX.data(), oDeviceDstOutX.pitch(), - oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI, - nppStreamCtx)); - - NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( - oDeviceDstOutYNoBorders.data(), oDeviceDstOutYNoBorders.pitch(), - oDeviceDstOutY.data(), oDeviceDstOutY.pitch(), - oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI, - nppStreamCtx)); - - // create additional output files - std::string sResultXDiffFilename = sResultBaseFilename; - std::string sResultYDiffFilename = sResultBaseFilename; - - sResultXDiffFilename += - "_gradientVectorPrewittBorderX_Vertical_BorderDiffs.pgm"; - sResultYDiffFilename += - "_gradientVectorPrewittBorderY_Horizontal_BorderDiffs.pgm"; - - // copy the device result data into the host output images - oDeviceDstOutXDiff.copyTo(oHostDstX.data(), oHostDstX.pitch()); - oDeviceDstOutYDiff.copyTo(oHostDstY.data(), oHostDstY.pitch()); - - saveImage(sResultXDiffFilename, oHostDstX); - std::cout << "Saved image: " << 
sResultXDiffFilename << std::endl; - saveImage(sResultYDiffFilename, oHostDstY); - std::cout << "Saved image: " << sResultYDiffFilename << std::endl; - - // if you closely examine the above difference files (recommend using GIMP - // for viewing using scaling with no interpolation) you will see several - // single pixel differences (white pixels) along the right and bottom edges - // of the default vs. borderless images this happens because border pixels - // in the original source image are duplicated when the filter kernels - // overlap the edge of the source image when using the first version of the - // filter call but are actually sampled from the enlarged source image when - // using the second version of the filter call the technique used in the - // second filter call can be used with any filter border function in NPP to - // duplicate results that would be generated from a non-border filter - // function call by filling the border pixel outside the embedded source - // image with the appropriate border pixel values - - // here is how to use border control to process a source image in multiple - // calls and get correct output in the destination image - - // since the source image pointer already points to the beginning of the - // source image in the enlarged source image it doesn't need changed - - // tighten up the top and left source image borders - this will enable - // border replication on the left and top borders of the original source - // image - oSrcOffset.x = 0; - oSrcOffset.y = 0; - // tighten up the right and bottom side source image borders - this will - // enable border replication on the right and bottom borders of the original - // source image - oEnlargedSrcSize.width = oSrcSize.width; - oEnlargedSrcSize.height = oSrcSize.height; - - // create device output images for the mixed edge results - npp::ImageNPP_8u_C1 oDeviceDstOutXMixedBorders(oSizeROI.width, - oSizeROI.height); - npp::ImageNPP_8u_C1 oDeviceDstOutYMixedBorders(oSizeROI.width, - oSizeROI.height); - - // shrink output ROI width so that only the left half of the destination - // image will be generated however since oEnlargedSrcSize.width is still set - // to oSrcSize.width then border control will be disabled when the filter - // needs to access source pixels beyond the right side of the left half of - // the source image - int nLeftWidth = oSizeROI.width / 2; - int nRightWidth = oSizeROI.width - nLeftWidth; - oSizeROI.width = nLeftWidth; - - // run Prewitt edge detection gradient vector filter to generate the left - // side of the output image - NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( - pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset, - oDeviceDstX.data(), oDeviceDstX.pitch(), oDeviceDstY.data(), - oDeviceDstY.pitch(), 0, 0, 0, 0, oSizeROI, NPP_MASK_SIZE_3_X_3, - nppiNormL1, NPP_BORDER_REPLICATE, nppStreamCtx)); - - // now move the enlarged source pointer to the horizontal middle of the - // enlarged source image and tell the function where it was moved to - pAdjustedSrc += nLeftWidth; - // and adjust the source offset parameter accordingly - this will in effect - // turn off border control for the left border allowing the necessary source - // pixels to be used - oSrcOffset.x += nLeftWidth; - - // update oSizeROI.width so that only enough destination pixels will be - // produced to fill the right half of the destination image - oSizeROI.width = nRightWidth; - - // run Prewitt edge detection gradient vector filter to generate the right - // side of the output 
image adjusting the destination image pointers - // appropriately - NPP_CHECK_NPP(nppiGradientVectorPrewittBorder_8u16s_C1R_Ctx( - pAdjustedSrc, oEnlargedDeviceSrc.pitch(), oEnlargedSrcSize, oSrcOffset, - oDeviceDstX.data() + nLeftWidth, oDeviceDstX.pitch(), - oDeviceDstY.data() + nLeftWidth, oDeviceDstY.pitch(), 0, 0, 0, 0, - oSizeROI, NPP_MASK_SIZE_3_X_3, nppiNormL1, NPP_BORDER_REPLICATE, - nppStreamCtx)); - - // convert 16s_C1 result images to binary 8u_C1 output images using constant - // value to adjust amount of visible detail - NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstX.data(), oDeviceDstX.pitch(), - 32, oDeviceDstOutXMixedBorders.data(), - oDeviceDstOutXMixedBorders.pitch(), - oSizeROI, NPP_CMP_GREATER_EQ, - nppStreamCtx)); - - NPP_CHECK_NPP(nppiCompareC_16s_C1R_Ctx(oDeviceDstY.data(), oDeviceDstY.pitch(), - 32, oDeviceDstOutYMixedBorders.data(), - oDeviceDstOutYMixedBorders.pitch(), - oSizeROI, NPP_CMP_GREATER_EQ, - nppStreamCtx)); - // create additional output files - std::string sResultXMixedBordersFilename = sResultBaseFilename; - std::string sResultYMixedBordersFilename = sResultBaseFilename; - - sResultXMixedBordersFilename += - "_gradientVectorPrewittBorderX_Vertical_WithMixedBorders.pgm"; - sResultYMixedBordersFilename += - "_gradientVectorPrewittBorderY_Horizontal_WithMixedBorders.pgm"; - - // copy the device result data into the host output images - oDeviceDstOutXMixedBorders.copyTo(oHostDstX.data(), oHostDstX.pitch()); - oDeviceDstOutYMixedBorders.copyTo(oHostDstY.data(), oHostDstY.pitch()); - - saveImage(sResultXMixedBordersFilename, oHostDstX); - std::cout << "Saved image: " << sResultXMixedBordersFilename << std::endl; - saveImage(sResultYMixedBordersFilename, oHostDstY); - std::cout << "Saved image: " << sResultYMixedBordersFilename << std::endl; - - // diff the original 8u_C1 result images with border control and the mixed - // border control images, they should match (diff image will be all black) - - NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( - oDeviceDstOutXMixedBorders.data(), oDeviceDstOutXMixedBorders.pitch(), - oDeviceDstOutX.data(), oDeviceDstOutX.pitch(), - oDeviceDstOutXDiff.data(), oDeviceDstOutXDiff.pitch(), oSizeROI, - nppStreamCtx)); - - NPP_CHECK_NPP(nppiAbsDiff_8u_C1R_Ctx( - oDeviceDstOutYMixedBorders.data(), oDeviceDstOutYMixedBorders.pitch(), - oDeviceDstOutY.data(), oDeviceDstOutY.pitch(), - oDeviceDstOutYDiff.data(), oDeviceDstOutYDiff.pitch(), oSizeROI, - nppStreamCtx)); - - // create additional output files - std::string sResultXMixedDiffFilename = sResultBaseFilename; - std::string sResultYMixedDiffFilename = sResultBaseFilename; - - sResultXMixedDiffFilename += - "_gradientVectorPrewittBorderX_Vertical_MixedBorderDiffs.pgm"; - sResultYMixedDiffFilename += - "_gradientVectorPrewittBorderY_Horizontal_MixedBorderDiffs.pgm"; - - // copy the device result data into the host output images - oDeviceDstOutXDiff.copyTo(oHostDstX.data(), oHostDstX.pitch()); - oDeviceDstOutYDiff.copyTo(oHostDstY.data(), oHostDstY.pitch()); - - saveImage(sResultXMixedDiffFilename, oHostDstX); - std::cout << "Saved image: " << sResultXMixedDiffFilename << std::endl; - saveImage(sResultYMixedDiffFilename, oHostDstY); - std::cout << "Saved image: " << sResultYMixedDiffFilename << std::endl; - - nppiFree(oDeviceSrc.data()); - nppiFree(oDeviceDstX.data()); - nppiFree(oDeviceDstY.data()); - nppiFree(oDeviceDstOutX.data()); - nppiFree(oDeviceDstOutY.data()); - nppiFree(oDeviceDstOutXNoBorders.data()); - nppiFree(oDeviceDstOutYNoBorders.data()); - 
nppiFree(oDeviceDstOutXDiff.data()); - nppiFree(oDeviceDstOutYDiff.data()); - nppiFree(oDeviceDstOutXMixedBorders.data()); - nppiFree(oDeviceDstOutYMixedBorders.data()); - nppiFree(oEnlargedDeviceSrc.data()); - - cudaDeviceReset(); - exit(EXIT_SUCCESS); - } catch (npp::Exception &rException) { - std::cerr << "Program error! The following exception occurred: \n"; - std::cerr << rException << std::endl; - std::cerr << "Aborting." << std::endl; - - cudaDeviceReset(); - exit(EXIT_FAILURE); - } catch (...) { - std::cerr << "Program error! An unknow type of exception occurred. \n"; - std::cerr << "Aborting." << std::endl; - - cudaDeviceReset(); - exit(EXIT_FAILURE); - return -1; - } - - return 0; + return 0; } diff --git a/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/MersenneTwister.cpp b/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/MersenneTwister.cpp index 1b129f54..1d6af8a2 100644 --- a/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/MersenneTwister.cpp +++ b/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/MersenneTwister.cpp @@ -32,149 +32,148 @@ // Utilities and system includes // includes, system -#include +#include #include +#include #include -#include - // Utilities and system includes -#include -#include - #include #include +#include +#include float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU); -const int DEFAULT_RAND_N = 2400000; -const unsigned int DEFAULT_SEED = 777; +const int DEFAULT_RAND_N = 2400000; +const unsigned int DEFAULT_SEED = 777; /////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // Start logs - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + // Start logs + printf("%s Starting...\n\n", argv[0]); - // initialize the GPU, either identified by --device - // or by picking the device with highest flop rate. - int devID = findCudaDevice(argc, (const char **)argv); + // initialize the GPU, either identified by --device + // or by picking the device with highest flop rate. 
+ int devID = findCudaDevice(argc, (const char **)argv); - // parsing the number of random numbers to generate - int rand_n = DEFAULT_RAND_N; + // parsing the number of random numbers to generate + int rand_n = DEFAULT_RAND_N; - if (checkCmdLineFlag(argc, (const char **)argv, "count")) { - rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count"); - } - - printf("Allocating data for %i samples...\n", rand_n); - - // parsing the seed - int seed = DEFAULT_SEED; - - if (checkCmdLineFlag(argc, (const char **)argv, "seed")) { - seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed"); - } - - printf("Seeding with %i ...\n", seed); - - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - float *d_Rand; - checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); - - curandGenerator_t prngGPU; - checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); - checkCudaErrors(curandSetStream(prngGPU, stream)); - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); - - curandGenerator_t prngCPU; - checkCudaErrors( - curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); - - // - // Example 1: Compare random numbers generated on GPU and CPU - float *h_RandGPU; - checkCudaErrors(cudaMallocHost(&h_RandGPU, rand_n * sizeof(float))); - - printf("Generating random numbers on GPU...\n\n"); - checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); - - printf("\nReading back the results...\n"); - checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), - cudaMemcpyDeviceToHost, stream)); - - float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); - - printf("Generating random numbers on CPU...\n\n"); - checkCudaErrors(curandGenerateUniform(prngCPU, (float *)h_RandCPU, rand_n)); - - checkCudaErrors(cudaStreamSynchronize(stream)); - printf("Comparing CPU/GPU random numbers...\n\n"); - float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); - - // - // Example 2: Timing of random number generation on GPU - const int numIterations = 10; - int i; - StopWatchInterface *hTimer; - - sdkCreateTimer(&hTimer); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - - for (i = 0; i < numIterations; i++) { - checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); - } - - checkCudaErrors(cudaStreamSynchronize(stream)); - sdkStopTimer(&hTimer); - - double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations; - - printf( - "MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, " - "Size = %u Numbers\n", - 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); - - printf("Shutting down...\n"); - - checkCudaErrors(curandDestroyGenerator(prngGPU)); - checkCudaErrors(curandDestroyGenerator(prngCPU)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaFree(d_Rand)); - sdkDeleteTimer(&hTimer); - checkCudaErrors(cudaFreeHost(h_RandGPU)); - free(h_RandCPU); - - exit(L1norm < 1e-6 ? 
EXIT_SUCCESS : EXIT_FAILURE); -} - -float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) { - int i; - float rCPU, rGPU, delta; - float max_delta = 0.; - float sum_delta = 0.; - float sum_ref = 0.; - - for (i = 0; i < rand_n; i++) { - rCPU = h_RandCPU[i]; - rGPU = h_RandGPU[i]; - delta = fabs(rCPU - rGPU); - sum_delta += delta; - sum_ref += fabs(rCPU); - - if (delta >= max_delta) { - max_delta = delta; + if (checkCmdLineFlag(argc, (const char **)argv, "count")) { + rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count"); } - } - float L1norm = (float)(sum_delta / sum_ref); - printf("Max absolute error: %E\n", max_delta); - printf("L1 norm: %E\n\n", L1norm); + printf("Allocating data for %i samples...\n", rand_n); - return L1norm; + // parsing the seed + int seed = DEFAULT_SEED; + + if (checkCmdLineFlag(argc, (const char **)argv, "seed")) { + seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed"); + } + + printf("Seeding with %i ...\n", seed); + + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + float *d_Rand; + checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); + + curandGenerator_t prngGPU; + checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); + checkCudaErrors(curandSetStream(prngGPU, stream)); + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); + + curandGenerator_t prngCPU; + checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); + + // + // Example 1: Compare random numbers generated on GPU and CPU + float *h_RandGPU; + checkCudaErrors(cudaMallocHost(&h_RandGPU, rand_n * sizeof(float))); + + printf("Generating random numbers on GPU...\n\n"); + checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); + + printf("\nReading back the results...\n"); + checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost, stream)); + + float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); + + printf("Generating random numbers on CPU...\n\n"); + checkCudaErrors(curandGenerateUniform(prngCPU, (float *)h_RandCPU, rand_n)); + + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("Comparing CPU/GPU random numbers...\n\n"); + float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); + + // + // Example 2: Timing of random number generation on GPU + const int numIterations = 10; + int i; + StopWatchInterface *hTimer; + + sdkCreateTimer(&hTimer); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + + for (i = 0; i < numIterations; i++) { + checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n)); + } + + checkCudaErrors(cudaStreamSynchronize(stream)); + sdkStopTimer(&hTimer); + + double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations; + + printf("MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, " + "Size = %u Numbers\n", + 1.0e-9 * rand_n / gpuTime, + gpuTime, + rand_n); + + printf("Shutting down...\n"); + + checkCudaErrors(curandDestroyGenerator(prngGPU)); + checkCudaErrors(curandDestroyGenerator(prngCPU)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaFree(d_Rand)); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cudaFreeHost(h_RandGPU)); + free(h_RandCPU); + + exit(L1norm < 1e-6 ? 
EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU)
+{
+    int i;
+    float rCPU, rGPU, delta;
+    float max_delta = 0.;
+    float sum_delta = 0.;
+    float sum_ref = 0.;
+
+    for (i = 0; i < rand_n; i++) {
+        rCPU = h_RandCPU[i];
+        rGPU = h_RandGPU[i];
+        delta = fabs(rCPU - rGPU);
+        sum_delta += delta;
+        sum_ref += fabs(rCPU);
+
+        if (delta >= max_delta) {
+            max_delta = delta;
+        }
+    }
+
+    float L1norm = (float)(sum_delta / sum_ref);
+    printf("Max absolute error: %E\n", max_delta);
+    printf("L1 norm: %E\n\n", L1norm);
+
+    return L1norm;
 }
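Read directly from compareResults above: the two sequences are reduced to one relative-error figure, L1norm = (sum over i of |rCPU_i - rGPU_i|) / (sum over i of |rCPU_i|), and main() exits with success only when this falls below 1e-6, i.e. when the host and device MTGP32 generators, seeded identically, produce matching output.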
diff --git a/Samples/4_CUDA_Libraries/README.md b/Samples/4_CUDA_Libraries/README.md
index b2c25a8f..0ad27bbd 100644
--- a/Samples/4_CUDA_Libraries/README.md
+++ b/Samples/4_CUDA_Libraries/README.md
@@ -114,4 +114,3 @@ Example of using CUFFT. In this example, CUFFT is used to compute the 1D-convolu
 ### [watershedSegmentationNPP](./watershedSegmentationNPP)
 An NPP CUDA Sample that demonstrates how to use the NPP watershed segmentation function.
-
diff --git a/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.cpp b/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.cpp
index 863fcfb9..a8a2c1c2 100644
--- a/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.cpp
+++ b/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.cpp
@@ -55,40 +55,43 @@ const char *sSDKname = "batchCUBLAS";
 //==============================================================================
 #if defined(__cplusplus)
-extern "C" {
+extern "C"
+{
 #endif /* __cplusplus */
-int getDeviceVersion(void) {
-  int device;
-  struct cudaDeviceProp properties;
+    int getDeviceVersion(void)
+    {
+        int device;
+        struct cudaDeviceProp properties;
-  if (cudaGetDevice(&device) != cudaSuccess) {
-    printf("failed to get device\n");
-    return 0;
-  }
+        if (cudaGetDevice(&device) != cudaSuccess) {
+            printf("failed to get device\n");
+            return 0;
+        }
-  if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
-    printf("failed to get properties\n");
-    return 0;
-  }
+        if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
+            printf("failed to get properties\n");
+            return 0;
+        }
-  return properties.major * 100 + properties.minor * 10;
-}
+        return properties.major * 100 + properties.minor * 10;
+    }
-size_t getDeviceMemory(void) {
-  struct cudaDeviceProp properties;
-  int device;
+    size_t getDeviceMemory(void)
+    {
+        struct cudaDeviceProp properties;
+        int device;
-  if (cudaGetDevice(&device) != cudaSuccess) {
-    return 0;
-  }
+        if (cudaGetDevice(&device) != cudaSuccess) {
+            return 0;
+        }
-  if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
-    return 0;
-  }
+        if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
+            return 0;
+        }
-  return properties.totalGlobalMem;
-}
+        return properties.totalGlobalMem;
+    }
 #if defined(__cplusplus)
 }
 #endif /* __cplusplus */
@@ -97,135 +100,178 @@ size_t getDeviceMemory(void) {
 // random utilities
 //==============================================================================
-template <typename T_ELEM>
-void fillupMatrix(T_ELEM *A, int lda, int rows, int cols, int seed = 0);
+template <typename T_ELEM> void fillupMatrix(T_ELEM *A, int lda, int rows, int cols, int seed = 0);
-template <typename T_ELEM>
-void fillupMatrix(T_ELEM *A, int lda, int rows, int cols, int seed) {
-  for (int j = 0; j < cols; j++) {
-    for (int i = 0; i < rows; i++) {
-      A[i + lda * j] = cuGet<T_ELEM>(
-          ((double)(((lda * i + j + seed) % 253) + 1)) / 256.0,
-          ((double)((((cols * i + j) + 123 + seed) % 253) + 1)) / 256.0);
+template <typename T_ELEM> void fillupMatrix(T_ELEM *A, int lda, int rows, int cols, int seed)
+{
+    for (int j = 0; j < cols; j++) {
+        for (int i = 0; i < rows; i++) {
+            A[i + lda * j] = cuGet<T_ELEM>(((double)(((lda * i + j + seed) % 253) + 1)) / 256.0,
+                                           ((double)((((cols * i + j) + 123 + seed) % 253) + 1)) / 256.0);
+        }
+    }
-    }
-  }
 }
 /* Explicit instantiation */
-template void fillupMatrix<float>(float *A, int lda, int rows, int cols,
-                                  int seed);
-template void fillupMatrix<double>(double *A, int lda, int rows, int cols,
-                                   int seed);
+template void fillupMatrix<float>(float *A, int lda, int rows, int cols, int seed);
+template void fillupMatrix<double>(double *A, int lda, int rows, int cols, int seed);
 /* For debugging */
-void printCuType(const char *str, float A) {
-  fprintf(stdout, "%s (0x%08x, %g)", str, floatAsUInt(A), A);
-}
+void printCuType(const char *str, float A) { fprintf(stdout, "%s (0x%08x, %g)", str, floatAsUInt(A), A); }
-void printCuType(const char *str, double A) {
-  fprintf(stdout, "%s (0x%016llx, %g)", str, doubleAsULL(A), A);
-}
+void printCuType(const char *str, double A) { fprintf(stdout, "%s (0x%016llx, %g)", str, doubleAsULL(A), A); }
 //==============================================================================
 // defines and structures
 //==============================================================================
-#define CUBLAS_SGEMM_MAX_ULP_ERR (.3)
-#define CUBLAS_DGEMM_MAX_ULP_ERR (1.e-3)
+#define CUBLAS_SGEMM_MAX_ULP_ERR      (.3)
+#define CUBLAS_DGEMM_MAX_ULP_ERR      (1.e-3)
 #define CUBLAS_SGEMM_MAX_RELATIVE_ERR (6.e-6)
 #define CUBLAS_DGEMM_MAX_RELATIVE_ERR (0.0)
-#define CUBLAS_GEMM_TEST_COUNT (30)
-#define BENCH_MATRIX_M (128)
-#define BENCH_MATRIX_K (128)
-#define BENCH_MATRIX_N (128)
+#define CUBLAS_GEMM_TEST_COUNT        (30)
+#define BENCH_MATRIX_M                (128)
+#define BENCH_MATRIX_K                (128)
+#define BENCH_MATRIX_N                (128)
-#define CLEANUP()                                \
-  do {                                           \
-    if (A) free(A);                              \
-    if (B) free(B);                              \
-    if (C) free(C);                              \
-    for (int i = 0; i < opts.N; ++i) {           \
-      if (devPtrA[i]) cudaFree(devPtrA[i]);      \
-      if (devPtrB[i]) cudaFree(devPtrB[i]);      \
-      if (devPtrC[i]) cudaFree(devPtrC[i]);      \
-    }                                            \
-    if (devPtrA) free(devPtrA);                  \
-    if (devPtrB) free(devPtrB);                  \
-    if (devPtrC) free(devPtrC);                  \
-    if (devPtrA_dev) cudaFree(devPtrA_dev);      \
-    if (devPtrB_dev) cudaFree(devPtrB_dev);      \
-    if (devPtrC_dev) cudaFree(devPtrC_dev);      \
-    fflush(stdout);                              \
-  } while (0)
+#define CLEANUP()                                \
+    do {                                         \
+        if (A)                                   \
+            free(A);                             \
+        if (B)                                   \
+            free(B);                             \
+        if (C)                                   \
+            free(C);                             \
+        for (int i = 0; i < opts.N; ++i) {       \
+            if (devPtrA[i])                      \
+                cudaFree(devPtrA[i]);            \
+            if (devPtrB[i])                      \
+                cudaFree(devPtrB[i]);            \
+            if (devPtrC[i])                      \
+                cudaFree(devPtrC[i]);            \
+        }                                        \
+        if (devPtrA)                             \
+            free(devPtrA);                       \
+        if (devPtrB)                             \
+            free(devPtrB);                       \
+        if (devPtrC)                             \
+            free(devPtrC);                       \
+        if (devPtrA_dev)                         \
+            cudaFree(devPtrA_dev);               \
+        if (devPtrB_dev)                         \
+            cudaFree(devPtrB_dev);               \
+        if (devPtrC_dev)                         \
+            cudaFree(devPtrC_dev);               \
+        fflush(stdout);                          \
+    } while (0)
 enum testMethod { tmRegular, tmStream, tmBatched };
-struct gemmOpts {
-  int m;
-  int n;
-  int k;
-  testMethod test_method;
-  char *elem_type;
-  int N;  // number of multiplications
+struct gemmOpts
+{
+    int m;
+    int n;
+    int k;
+    testMethod test_method;
+    char *elem_type;
+    int N; // number of multiplications
 };
-template <typename T_ELEM>
-struct gemmTestParams {
-  cublasOperation_t transa;
-  cublasOperation_t transb;
-  int m;
-  int n;
-  int k;
-  T_ELEM alpha;
-  T_ELEM beta;
+template <typename T_ELEM> struct gemmTestParams
+{
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+    int m;
+    int n;
+    int k;
+    T_ELEM alpha;
+    T_ELEM beta;
 };
 //==============================================================================
functions //============================================================================== -static inline cublasStatus_t cublasXgemm(cublasHandle_t handle, +static inline cublasStatus_t cublasXgemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - int k, float *alpha, const float *A, - int lda, float *B, int ldb, - float *beta, float *C, int ldc) { - return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); + cublasOperation_t transb, + int m, + int n, + int k, + float *alpha, + const float *A, + int lda, + float *B, + int ldb, + float *beta, + float *C, + int ldc) +{ + return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -static inline cublasStatus_t cublasXgemm(cublasHandle_t handle, +static inline cublasStatus_t cublasXgemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, - int k, double *alpha, const double *A, - int lda, double *B, int ldb, - double *beta, double *C, int ldc) { - return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); + cublasOperation_t transb, + int m, + int n, + int k, + double *alpha, + const double *A, + int lda, + double *B, + int ldb, + double *beta, + double *C, + int ldc) +{ + return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -static inline cublasStatus_t cublasXgemmBatched( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, float *alpha, const float *Aarray[], int lda, - const float *Barray[], int ldb, float *beta, float *Carray[], int ldc, - int batchCount) { +static inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + float *alpha, + const float *Aarray[], + int lda, + const float *Barray[], + int ldb, + float *beta, + float *Carray[], + int ldc, + int batchCount) +{ #if CUDART_VERSION >= 4010 - return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasSgemmBatched( + handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); #else - return CUBLAS_STATUS_SUCCESS; + return CUBLAS_STATUS_SUCCESS; #endif } -static inline cublasStatus_t cublasXgemmBatched( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, double *alpha, const double *Aarray[], int lda, - const double *Barray[], int ldb, double *beta, double *Carray[], int ldc, - int batchCount) { +static inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + double *alpha, + const double *Aarray[], + int lda, + const double *Barray[], + int ldb, + double *beta, + double *Carray[], + int ldc, + int batchCount) +{ #if CUDART_VERSION >= 4010 - return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasDgemmBatched( + handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); #else - return CUBLAS_STATUS_SUCCESS; + return CUBLAS_STATUS_SUCCESS; #endif } @@ -233,433 +279,449 @@ static inline cublasStatus_t cublasXgemmBatched( // Primary Application code //============================================================================== -static int processArgs(int 
argc, char *argv[], struct gemmOpts *opts) { - int error = 0; - int oldError; - memset(opts, 0, sizeof(*opts)); - static char default_type[] = "d"; // default double - opts->elem_type = default_type; - opts->N = 10; +static int processArgs(int argc, char *argv[], struct gemmOpts *opts) +{ + int error = 0; + int oldError; + memset(opts, 0, sizeof(*opts)); + static char default_type[] = "d"; // default double + opts->elem_type = default_type; + opts->N = 10; - while (argc) { - oldError = error; + while (argc) { + oldError = error; - if (*argv[0] == SWITCH_CHAR) { - switch (*(argv[0] + 1)) { - case 'm': - opts->m = (int)atol(argv[0] + 2); - break; + if (*argv[0] == SWITCH_CHAR) { + switch (*(argv[0] + 1)) { + case 'm': + opts->m = (int)atol(argv[0] + 2); + break; - case 'n': - opts->n = (int)atol(argv[0] + 2); - break; + case 'n': + opts->n = (int)atol(argv[0] + 2); + break; - case 'k': - opts->k = (int)atol(argv[0] + 2); - break; + case 'k': + opts->k = (int)atol(argv[0] + 2); + break; - case 'N': - opts->N = (int)atol(argv[0] + 2); - break; + case 'N': + opts->N = (int)atol(argv[0] + 2); + break; - default: - break; - } + default: + break; + } + } + + if (error > oldError) { + fprintf(stderr, "Invalid switch '%c%s'\n", SWITCH_CHAR, argv[0] + 1); + } + + argc -= 1; + argv++; } - if (error > oldError) { - fprintf(stderr, "Invalid switch '%c%s'\n", SWITCH_CHAR, argv[0] + 1); - } - - argc -= 1; - argv++; - } - - return error; + return error; } template -static int TESTGEN(gemm)(const struct gemmOpts *opts, int matrixM, int matrixN, - int matrixK, int &numTests, - struct gemmTestParams *params) { - static T_ELEM alpha[] = {cuGet(0, 0), cuGet(-1, -1), - cuGet(1, -2), cuGet(2, -1), - cuGet(0, -3)}; - static T_ELEM beta[] = {cuGet(0, 0), cuGet(-1, -1), - cuGet(1, -2), cuGet(2, -1), - cuGet(0, -3)}; +static int TESTGEN(gemm)(const struct gemmOpts *opts, + int matrixM, + int matrixN, + int matrixK, + int &numTests, + struct gemmTestParams *params) +{ + static T_ELEM alpha[] = { + cuGet(0, 0), cuGet(-1, -1), cuGet(1, -2), cuGet(2, -1), cuGet(0, -3)}; + static T_ELEM beta[] = { + cuGet(0, 0), cuGet(-1, -1), cuGet(1, -2), cuGet(2, -1), cuGet(0, -3)}; #define NBR_ALPHAS (sizeof(alpha) / sizeof(alpha[0])) -#define NBR_BETAS (sizeof(beta) / sizeof(beta[0])) - static T_ELEM theAlpha; - static T_ELEM theBeta; - static int state; - static int m; - static int n; - static int k; +#define NBR_BETAS (sizeof(beta) / sizeof(beta[0])) + static T_ELEM theAlpha; + static T_ELEM theBeta; + static int state; + static int m; + static int n; + static int k; - if (numTests-- <= 0) { - return -1; - } + if (numTests-- <= 0) { + return -1; + } - theAlpha = alpha[cuRand() % NBR_ALPHAS]; - theBeta = beta[cuRand() % NBR_BETAS]; - params->transa = CUBLAS_OP_N; - params->transb = CUBLAS_OP_N; - m = matrixM; - n = matrixN; - k = matrixK; - params->m = m; - params->n = n; - params->k = k; - params->alpha = theAlpha; - params->beta = theBeta; + theAlpha = alpha[cuRand() % NBR_ALPHAS]; + theBeta = beta[cuRand() % NBR_BETAS]; + params->transa = CUBLAS_OP_N; + params->transb = CUBLAS_OP_N; + m = matrixM; + n = matrixN; + k = matrixK; + params->m = m; + params->n = n; + params->k = k; + params->alpha = theAlpha; + params->beta = theBeta; - printf("#### args: ta=%d tb=%d m=%d n=%d k=%d ", (unsigned int)params->transa, - (unsigned int)params->transb, params->m, params->n, params->k); - printCuType(" alpha =", params->alpha); - printCuType(" beta=", params->beta); - printf("\n"); + printf("#### args: ta=%d tb=%d m=%d n=%d k=%d ", + 
(unsigned int)params->transa, + (unsigned int)params->transb, + params->m, + params->n, + params->k); + printCuType(" alpha =", params->alpha); + printCuType(" beta=", params->beta); + printf("\n"); - m = cuRand() % matrixM; - n = cuRand() % matrixN; - k = cuRand() % matrixK; + m = cuRand() % matrixM; + n = cuRand() % matrixN; + k = cuRand() % matrixK; - state = cuRand() % 9; - return 0; + state = cuRand() % 9; + return 0; +} + +template void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols) +{ + for (int j = 0; j < cols; j++) { + for (int i = 0; i < rows; i++) { + A[i + lda * j] = cuGet(i + j); + } + } } template -void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols) { - for (int j = 0; j < cols; j++) { - for (int i = 0; i < rows; i++) { - A[i + lda * j] = cuGet(i + j); - } - } -} +int test_gemm_loop(struct gemmOpts &opts, float err, double max_relative_error, cublasHandle_t handle) +{ + struct gemmTestParams params; + cudaStream_t *streamArray = 0; + cublasStatus_t status1, status2, status3; + T_ELEM *A = NULL; + T_ELEM *B = NULL; + T_ELEM *C = NULL; + T_ELEM **devPtrA = 0; + T_ELEM **devPtrB = 0; + T_ELEM **devPtrC = 0; + T_ELEM **devPtrA_dev = NULL; + T_ELEM **devPtrB_dev = NULL; + T_ELEM **devPtrC_dev = NULL; + int matrixM, matrixN, matrixK; + int rowsA, rowsB, rowsC; + int colsA, colsB, colsC; + int matrixSizeA, matrixSizeB, matrixSizeC; + int errors; + double start, stop; -template -int test_gemm_loop(struct gemmOpts &opts, float err, double max_relative_error, - cublasHandle_t handle) { - struct gemmTestParams params; - cudaStream_t *streamArray = 0; - cublasStatus_t status1, status2, status3; - T_ELEM *A = NULL; - T_ELEM *B = NULL; - T_ELEM *C = NULL; - T_ELEM **devPtrA = 0; - T_ELEM **devPtrB = 0; - T_ELEM **devPtrC = 0; - T_ELEM **devPtrA_dev = NULL; - T_ELEM **devPtrB_dev = NULL; - T_ELEM **devPtrC_dev = NULL; - int matrixM, matrixN, matrixK; - int rowsA, rowsB, rowsC; - int colsA, colsB, colsC; - int matrixSizeA, matrixSizeB, matrixSizeC; - int errors; - double start, stop; + printf("Testing %cgemm\n", *opts.elem_type); - printf("Testing %cgemm\n", *opts.elem_type); + matrixM = (opts.m) ? opts.m : BENCH_MATRIX_M; + matrixN = (opts.n) ? opts.n : BENCH_MATRIX_N; + matrixK = (opts.k) ? opts.k : BENCH_MATRIX_K; - matrixM = (opts.m) ? opts.m : BENCH_MATRIX_M; - matrixN = (opts.n) ? opts.n : BENCH_MATRIX_N; - matrixK = (opts.k) ? 
opts.k : BENCH_MATRIX_K; + rowsA = imax(1, matrixM); + colsA = imax(1, matrixK); + rowsB = imax(1, matrixK); + colsB = imax(1, matrixN); + rowsC = imax(1, matrixM); + colsC = imax(1, matrixN); - rowsA = imax(1, matrixM); - colsA = imax(1, matrixK); - rowsB = imax(1, matrixK); - colsB = imax(1, matrixN); - rowsC = imax(1, matrixM); - colsC = imax(1, matrixN); + matrixSizeA = rowsA * colsA; + matrixSizeB = rowsB * colsB; + matrixSizeC = rowsC * colsC; - matrixSizeA = rowsA * colsA; - matrixSizeB = rowsB * colsB; - matrixSizeC = rowsC * colsC; - - devPtrA = (T_ELEM **)malloc(opts.N * sizeof(*devPtrA)); - devPtrB = (T_ELEM **)malloc(opts.N * sizeof(*devPtrB)); - devPtrC = (T_ELEM **)malloc(opts.N * sizeof(*devPtrC)); - - for (int i = 0; i < opts.N; i++) { - cudaError_t err1 = - cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(devPtrA[0][0])); - cudaError_t err2 = - cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0])); - cudaError_t err3 = - cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0])); - - if ((err1 != cudaSuccess) || (err2 != cudaSuccess) || - (err3 != cudaSuccess)) { - CLEANUP(); - fprintf(stderr, "!!!! GPU memory allocation error\n"); - return CUBLASTEST_FAILED; - } - } - - // For batched processing we need those arrays on the device - if (opts.test_method == tmBatched) { - cudaError_t err1 = - cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA)); - cudaError_t err2 = - cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB)); - cudaError_t err3 = - cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC)); - - if ((err1 != cudaSuccess) || (err2 != cudaSuccess) || - (err3 != cudaSuccess)) { - CLEANUP(); - fprintf(stderr, "!!!! GPU memory allocation error\n"); - return CUBLASTEST_FAILED; - } - - err1 = cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), - cudaMemcpyHostToDevice); - err2 = cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), - cudaMemcpyHostToDevice); - err3 = cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), - cudaMemcpyHostToDevice); - - if ((err1 != cudaSuccess) || (err2 != cudaSuccess) || - (err3 != cudaSuccess)) { - CLEANUP(); - fprintf(stderr, "!!!! cannot copy pointer array to device\n"); - return CUBLASTEST_FAILED; - } - } - - A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0])); - B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0])); - C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0])); - - if ((!A) || (!B) || (!C)) { - CLEANUP(); - fprintf(stderr, "!!!! system memory allocation error\n"); - return CUBLASTEST_FAILED; - } - - streamArray = (cudaStream_t *)malloc(opts.N * sizeof(cudaStream_t *)); - - for (int i = 0; i < opts.N; i++) { - if (opts.test_method == tmStream) { - cudaError_t cudaErr = cudaStreamCreate(&streamArray[i]); - - if (cudaErr != cudaSuccess) { - CLEANUP(); - fprintf(stderr, "!!!! 
cannot create stream\n"); - return CUBLASTEST_FAILED; - } - } else { - streamArray[i] = 0; - } - } - - errors = 0; - int numTests = 1; - - while (TESTGEN(gemm)(&opts, matrixM, matrixN, matrixK, numTests, ¶ms) == - 0) { - printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC); - - // fillup with Nan first (so lda padding is full on Nan) - memset(A, 0xFF, matrixSizeA * sizeof(A[0])); - fillupMatrixDebug(A, rowsA, params.m, params.k); - memset(B, 0xFF, matrixSizeB * sizeof(B[0])); - fillupMatrix(B, rowsB, params.k, params.n, 121); - - if (!cuEqual(params.beta, cuGet(0))) { - fillupMatrix(C, rowsC, params.m, params.n); - } else { - /* fill with SNaNs to make sure ZGEMM doesn't access C */ - memset(C, 0xFF, matrixSizeC * sizeof(C[0])); - } - - double flopsCoef = 2.0; + devPtrA = (T_ELEM **)malloc(opts.N * sizeof(*devPtrA)); + devPtrB = (T_ELEM **)malloc(opts.N * sizeof(*devPtrB)); + devPtrC = (T_ELEM **)malloc(opts.N * sizeof(*devPtrC)); for (int i = 0; i < opts.N; i++) { - status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, - devPtrA[i], rowsA); - status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, - devPtrB[i], rowsB); - status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, - devPtrC[i], rowsC); + cudaError_t err1 = cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(devPtrA[0][0])); + cudaError_t err2 = cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0])); + cudaError_t err3 = cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0])); - if ((status1 != CUBLAS_STATUS_SUCCESS) || (status2 != status1) || - (status3 != status1)) { - CLEANUP(); - fprintf(stderr, "!!!! GPU access error (write)\n"); - return CUBLASTEST_FAILED; - } - } - - start = second(); - - if (opts.test_method == tmBatched) { - cublasSetStream(handle, streamArray[0]); - status1 = cublasXgemmBatched(handle, params.transa, params.transb, - params.m, params.n, params.k, ¶ms.alpha, - (const T_ELEM **)devPtrA_dev, rowsA, - (const T_ELEM **)devPtrB_dev, rowsB, - ¶ms.beta, devPtrC_dev, rowsC, opts.N); - - if (status1 != CUBLAS_STATUS_SUCCESS) { - cudaError_t cudaStatus = cudaGetLastError(); - CLEANUP(); - fprintf(stderr, - "!!!! GPU program execution error : cublas Error=%d, cuda " - "Error=%d,(%s)\n", - status1, cudaStatus, cudaGetErrorString(cudaStatus)); - return CUBLASTEST_FAILED; - } - } else { - for (int i = 0; i < opts.N; i++) { - cublasSetStream(handle, streamArray[i]); - status1 = - cublasXgemm(handle, params.transa, params.transb, params.m, - params.n, params.k, ¶ms.alpha, devPtrA[i], rowsA, - devPtrB[i], rowsB, ¶ms.beta, devPtrC[i], rowsC); - - if (status1 != CUBLAS_STATUS_SUCCESS) { - cudaError_t cudaStatus = cudaGetLastError(); - CLEANUP(); - fprintf(stderr, - "!!!! GPU program execution error : cublas Error=%d, cuda " - "Error=%d,(%s)\n", - status1, cudaStatus, cudaGetErrorString(cudaStatus)); - return CUBLASTEST_FAILED; + if ((err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) { + CLEANUP(); + fprintf(stderr, "!!!! 
GPU memory allocation error\n"); + return CUBLASTEST_FAILED; } - } } - cudaError_t cudaStatus = cudaDeviceSynchronize(); + // For batched processing we need those arrays on the device + if (opts.test_method == tmBatched) { + cudaError_t err1 = cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA)); + cudaError_t err2 = cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB)); + cudaError_t err3 = cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC)); - if (cudaStatus != cudaSuccess) { - CLEANUP(); - fprintf(stderr, - "!!!! GPU program execution error on cudaDeviceSynchronize : " - "cudaError=%d,(%s)\n", - cudaStatus, cudaGetErrorString(cudaStatus)); - return CUBLASTEST_FAILED; + if ((err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) { + CLEANUP(); + fprintf(stderr, "!!!! GPU memory allocation error\n"); + return CUBLASTEST_FAILED; + } + + err1 = cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA), cudaMemcpyHostToDevice); + err2 = cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB), cudaMemcpyHostToDevice); + err3 = cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC), cudaMemcpyHostToDevice); + + if ((err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) { + CLEANUP(); + fprintf(stderr, "!!!! cannot copy pointer array to device\n"); + return CUBLASTEST_FAILED; + } } - stop = second(); + A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0])); + B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0])); + C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0])); - fprintf(stdout, "^^^^ elapsed = %10.8f sec GFLOPS=%g\n", (stop - start), - opts.N * (1e-9 * flopsCoef * params.m * params.n * params.k) / - (stop - start)); + if ((!A) || (!B) || (!C)) { + CLEANUP(); + fprintf(stderr, "!!!! system memory allocation error\n"); + return CUBLASTEST_FAILED; + } - } // end while (TESTGEN.. + streamArray = (cudaStream_t *)malloc(opts.N * sizeof(cudaStream_t *)); - CLEANUP(); - fprintf(stdout, "@@@@ %cgemm test %s\n", *opts.elem_type, - errors ? "FAIL" : "OK"); - return CUBLASTEST_PASSED; + for (int i = 0; i < opts.N; i++) { + if (opts.test_method == tmStream) { + cudaError_t cudaErr = cudaStreamCreate(&streamArray[i]); + + if (cudaErr != cudaSuccess) { + CLEANUP(); + fprintf(stderr, "!!!! cannot create stream\n"); + return CUBLASTEST_FAILED; + } + } + else { + streamArray[i] = 0; + } + } + + errors = 0; + int numTests = 1; + + while (TESTGEN(gemm)(&opts, matrixM, matrixN, matrixK, numTests, ¶ms) == 0) { + printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC); + + // fillup with Nan first (so lda padding is full on Nan) + memset(A, 0xFF, matrixSizeA * sizeof(A[0])); + fillupMatrixDebug(A, rowsA, params.m, params.k); + memset(B, 0xFF, matrixSizeB * sizeof(B[0])); + fillupMatrix(B, rowsB, params.k, params.n, 121); + + if (!cuEqual(params.beta, cuGet(0))) { + fillupMatrix(C, rowsC, params.m, params.n); + } + else { + /* fill with SNaNs to make sure ZGEMM doesn't access C */ + memset(C, 0xFF, matrixSizeC * sizeof(C[0])); + } + + double flopsCoef = 2.0; + + for (int i = 0; i < opts.N; i++) { + status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA[i], rowsA); + status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB[i], rowsB); + status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC[i], rowsC); + + if ((status1 != CUBLAS_STATUS_SUCCESS) || (status2 != status1) || (status3 != status1)) { + CLEANUP(); + fprintf(stderr, "!!!! 
GPU access error (write)\n"); + return CUBLASTEST_FAILED; + } + } + + start = second(); + + if (opts.test_method == tmBatched) { + cublasSetStream(handle, streamArray[0]); + status1 = cublasXgemmBatched(handle, + params.transa, + params.transb, + params.m, + params.n, + params.k, + ¶ms.alpha, + (const T_ELEM **)devPtrA_dev, + rowsA, + (const T_ELEM **)devPtrB_dev, + rowsB, + ¶ms.beta, + devPtrC_dev, + rowsC, + opts.N); + + if (status1 != CUBLAS_STATUS_SUCCESS) { + cudaError_t cudaStatus = cudaGetLastError(); + CLEANUP(); + fprintf(stderr, + "!!!! GPU program execution error : cublas Error=%d, cuda " + "Error=%d,(%s)\n", + status1, + cudaStatus, + cudaGetErrorString(cudaStatus)); + return CUBLASTEST_FAILED; + } + } + else { + for (int i = 0; i < opts.N; i++) { + cublasSetStream(handle, streamArray[i]); + status1 = cublasXgemm(handle, + params.transa, + params.transb, + params.m, + params.n, + params.k, + ¶ms.alpha, + devPtrA[i], + rowsA, + devPtrB[i], + rowsB, + ¶ms.beta, + devPtrC[i], + rowsC); + + if (status1 != CUBLAS_STATUS_SUCCESS) { + cudaError_t cudaStatus = cudaGetLastError(); + CLEANUP(); + fprintf(stderr, + "!!!! GPU program execution error : cublas Error=%d, cuda " + "Error=%d,(%s)\n", + status1, + cudaStatus, + cudaGetErrorString(cudaStatus)); + return CUBLASTEST_FAILED; + } + } + } + + cudaError_t cudaStatus = cudaDeviceSynchronize(); + + if (cudaStatus != cudaSuccess) { + CLEANUP(); + fprintf(stderr, + "!!!! GPU program execution error on cudaDeviceSynchronize : " + "cudaError=%d,(%s)\n", + cudaStatus, + cudaGetErrorString(cudaStatus)); + return CUBLASTEST_FAILED; + } + + stop = second(); + + fprintf(stdout, + "^^^^ elapsed = %10.8f sec GFLOPS=%g\n", + (stop - start), + opts.N * (1e-9 * flopsCoef * params.m * params.n * params.k) / (stop - start)); + + } // end while (TESTGEN.. + + CLEANUP(); + fprintf(stdout, "@@@@ %cgemm test %s\n", *opts.elem_type, errors ? "FAIL" : "OK"); + return CUBLASTEST_PASSED; } -int main(int argc, char *argv[]) { - struct gemmOpts opts; - int errors, nTimes, nTotalErrors = 0; - int status = CUBLASTEST_PASSED; +int main(int argc, char *argv[]) +{ + struct gemmOpts opts; + int errors, nTimes, nTotalErrors = 0; + int status = CUBLASTEST_PASSED; - printf("%s Starting...\n\n", sSDKname); + printf("%s Starting...\n\n", sSDKname); - int dev = findCudaDevice(argc, (const char **)argv); + int dev = findCudaDevice(argc, (const char **)argv); - if (dev == -1) { - return CUBLASTEST_FAILED; - } - - errors = processArgs(argc, argv, &opts); - - if (errors) { - fprintf(stdout, - "\n Usage: batchcublas [-mSIZE_M] [-nSIZE_N] [-kSIZE_N] " - "[-NSIZE_NUM_ITERATIONS] [-qatest] [-noprompt]\n"); - return CUBLASTEST_FAILED; - } - - cublasHandle_t handle; - - if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { - fprintf(stdout, "CUBLAS initialization failed!\n"); - exit(EXIT_FAILURE); - } - - // Run single kernels - fprintf(stdout, "\n ==== Running single kernels ==== \n\n"); - nTimes = opts.N; - opts.N = 1; - *(opts.elem_type) = 's'; - status = test_gemm_loop(opts, (float)CUBLAS_SGEMM_MAX_ULP_ERR, - (double)CUBLAS_SGEMM_MAX_RELATIVE_ERR, handle); - - // Run Double version - *(opts.elem_type) = 'd'; - - if (getDeviceVersion() < DEV_VER_DBL_SUPPORT) { - fprintf(stdout, "@@@@ dgemm test WAIVED due to lack of DP support\n"); - exit(EXIT_WAIVED); - } - - status = - test_gemm_loop(opts, (float)CUBLAS_DGEMM_MAX_ULP_ERR, - (double)CUBLAS_DGEMM_MAX_RELATIVE_ERR, handle); - nTotalErrors += (status == CUBLASTEST_PASSED ? 
0 : 1); - opts.N = nTimes; - - // Run with and without streams and then batched. The batched functions are a - // feature new feature in 4.1 -#if CUDART_VERSION >= 4010 - - for (int ii = 0; ii < 3; ii++) { -#else - - for (int ii = 0; ii < 2; ii++) { -#endif - - switch (ii) { - case 0: - opts.test_method = tmRegular; - fprintf(stdout, "\n ==== Running N=%d without streams ==== \n\n", - opts.N); - break; - - case 1: - opts.test_method = tmStream; - fprintf(stdout, "\n ==== Running N=%d with streams ==== \n\n", opts.N); - break; - - case 2: - opts.test_method = tmBatched; - fprintf(stdout, "\n ==== Running N=%d batched ==== \n\n", opts.N); - break; + if (dev == -1) { + return CUBLASTEST_FAILED; } - // Run single version + errors = processArgs(argc, argv, &opts); + + if (errors) { + fprintf(stdout, + "\n Usage: batchcublas [-mSIZE_M] [-nSIZE_N] [-kSIZE_N] " + "[-NSIZE_NUM_ITERATIONS] [-qatest] [-noprompt]\n"); + return CUBLASTEST_FAILED; + } + + cublasHandle_t handle; + + if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { + fprintf(stdout, "CUBLAS initialization failed!\n"); + exit(EXIT_FAILURE); + } + + // Run single kernels + fprintf(stdout, "\n ==== Running single kernels ==== \n\n"); + nTimes = opts.N; + opts.N = 1; *(opts.elem_type) = 's'; status = - test_gemm_loop(opts, (float)CUBLAS_SGEMM_MAX_ULP_ERR, - (double)CUBLAS_SGEMM_MAX_RELATIVE_ERR, handle); - nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); + test_gemm_loop(opts, (float)CUBLAS_SGEMM_MAX_ULP_ERR, (double)CUBLAS_SGEMM_MAX_RELATIVE_ERR, handle); // Run Double version *(opts.elem_type) = 'd'; - // Test doesn't meet minSpec, will will wave the DP test if (getDeviceVersion() < DEV_VER_DBL_SUPPORT) { - fprintf(stdout, "@@@@ dgemm test WAIVED due to lack of DP support\n"); - exit(EXIT_WAIVED); - } else { - status = - test_gemm_loop(opts, (float)CUBLAS_DGEMM_MAX_ULP_ERR, - (double)CUBLAS_DGEMM_MAX_RELATIVE_ERR, handle); - nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); + fprintf(stdout, "@@@@ dgemm test WAIVED due to lack of DP support\n"); + exit(EXIT_WAIVED); } - } - cublasDestroy(handle); + status = + test_gemm_loop(opts, (float)CUBLAS_DGEMM_MAX_ULP_ERR, (double)CUBLAS_DGEMM_MAX_RELATIVE_ERR, handle); + nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); + opts.N = nTimes; - printf("\nTest Summary\n"); - printf("%d error(s)\n", nTotalErrors); - exit(nTotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + // Run with and without streams and then batched. The batched functions are a + // new feature in 4.1 +#if CUDART_VERSION >= 4010 + + for (int ii = 0; ii < 3; ii++) { +#else + + for (int ii = 0; ii < 2; ii++) { +#endif + + switch (ii) { + case 0: + opts.test_method = tmRegular; + fprintf(stdout, "\n ==== Running N=%d without streams ==== \n\n", opts.N); + break; + + case 1: + opts.test_method = tmStream; + fprintf(stdout, "\n ==== Running N=%d with streams ==== \n\n", opts.N); + break; + + case 2: + opts.test_method = tmBatched; + fprintf(stdout, "\n ==== Running N=%d batched ==== \n\n", opts.N); + break; + } + + // Run single version + *(opts.elem_type) = 's'; + status = + test_gemm_loop(opts, (float)CUBLAS_SGEMM_MAX_ULP_ERR, (double)CUBLAS_SGEMM_MAX_RELATIVE_ERR, handle); + nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1);
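An aside on the API behind the tmBatched path above: cublasSgemmBatched consumes arrays of device pointers that must themselves live in device memory, which is why test_gemm_loop copies devPtrA/devPtrB/devPtrC into devPtrA_dev/devPtrB_dev/devPtrC_dev before the call. A minimal, self-contained sketch of that call pattern follows; the names and sizes are invented for illustration and this is not part of the patch:

```cpp
// Sketch of the cublasSgemmBatched call pattern (error handling reduced to
// asserts, matrix contents left uninitialized; only the structure matters).
#include <cassert>
#include <cstdio>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main()
{
    const int m = 64, n = 64, k = 64, batch = 8;

    cublasHandle_t handle;
    assert(cublasCreate(&handle) == CUBLAS_STATUS_SUCCESS);

    // One device buffer per matrix in the batch; the host keeps the pointers.
    std::vector<float *> hA(batch), hB(batch), hC(batch);
    for (int i = 0; i < batch; i++) {
        assert(cudaMalloc(&hA[i], sizeof(float) * m * k) == cudaSuccess);
        assert(cudaMalloc(&hB[i], sizeof(float) * k * n) == cudaSuccess);
        assert(cudaMalloc(&hC[i], sizeof(float) * m * n) == cudaSuccess);
    }

    // The pointer arrays themselves must also reside in device memory.
    float **dA, **dB, **dC;
    assert(cudaMalloc(&dA, batch * sizeof(float *)) == cudaSuccess);
    assert(cudaMalloc(&dB, batch * sizeof(float *)) == cudaSuccess);
    assert(cudaMalloc(&dC, batch * sizeof(float *)) == cudaSuccess);
    cudaMemcpy(dA, hA.data(), batch * sizeof(float *), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), batch * sizeof(float *), cudaMemcpyHostToDevice);
    cudaMemcpy(dC, hC.data(), batch * sizeof(float *), cudaMemcpyHostToDevice);

    // One call issues all `batch` products: C[i] = alpha*A[i]*B[i] + beta*C[i].
    const float alpha = 1.0f, beta = 0.0f;
    assert(cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                              (const float **)dA, m, (const float **)dB, k,
                              &beta, dC, m, batch) == CUBLAS_STATUS_SUCCESS);
    cudaDeviceSynchronize();

    for (int i = 0; i < batch; i++) {
        cudaFree(hA[i]); cudaFree(hB[i]); cudaFree(hC[i]);
    }
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    printf("batched sgemm issued for %d matrices\n", batch);
    return 0;
}
```

This single-call path is what the "Running N=%d batched" run above measures against the per-stream variant, which instead loops over opts.N individual cublasXgemm calls.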
+ + // Run Double version + *(opts.elem_type) = 'd'; + + // Test doesn't meet minSpec; we will waive the DP test + if (getDeviceVersion() < DEV_VER_DBL_SUPPORT) { + fprintf(stdout, "@@@@ dgemm test WAIVED due to lack of DP support\n"); + exit(EXIT_WAIVED); + } + else { + status = test_gemm_loop( + opts, (float)CUBLAS_DGEMM_MAX_ULP_ERR, (double)CUBLAS_DGEMM_MAX_RELATIVE_ERR, handle); + nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); + } + } + + cublasDestroy(handle); + + printf("\nTest Summary\n"); + printf("%d error(s)\n", nTotalErrors); + exit(nTotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.h b/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.h index 13704bd1..5bc88780 100644 --- a/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.h +++ b/Samples/4_CUDA_Libraries/batchCUBLAS/batchCUBLAS.h @@ -37,8 +37,8 @@ #define SWITCH_CHAR '-' -#define REFFUNC(funcname) ref_##funcname -#define TESTGEN(funcname) get_##funcname##_params +#define REFFUNC(funcname) ref_##funcname +#define TESTGEN(funcname) get_##funcname##_params #define TESTPARAMS(funcname) funcname##TestParams #define DEV_VER_DBL_SUPPORT (130) @@ -54,10 +54,11 @@ //============================================================================== #if defined(__cplusplus) -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -int getDeviceVersion(void); -size_t getDeviceMemory(void); + int getDeviceVersion(void); + size_t getDeviceMemory(void); #if defined(__cplusplus) } #endif /* __cplusplus */ @@ -68,71 +69,58 @@ size_t getDeviceMemory(void); static __inline__ int imax(int x, int y) { return (x > y) ? x : y; } -static __inline__ unsigned floatAsUInt(float x) { - volatile union { - float f; - unsigned i; - } xx; - xx.f = x; - return xx.i; +static __inline__ unsigned floatAsUInt(float x) +{ + volatile union + { + float f; + unsigned i; + } xx; + xx.f = x; + return xx.i; } -static __inline__ unsigned long long doubleAsULL(double x) { - volatile union { - double f; - unsigned long long i; - } xx; - xx.f = x; - return xx.i; +static __inline__ unsigned long long doubleAsULL(double x) +{ + volatile union + { + double f; + unsigned long long i; + } xx; + xx.f = x; + return xx.i; } -static __inline__ unsigned cuRand(void) { - /* George Marsaglia's fast inline random number generator */ +static __inline__ unsigned cuRand(void) +{ + /* George Marsaglia's fast inline random number generator */ #define CUDA_ZNEW (cuda_z = 36969 * (cuda_z & 65535) + (cuda_z >> 16)) #define CUDA_WNEW (cuda_w = 18000 * (cuda_w & 65535) + (cuda_w >> 16)) -#define CUDA_MWC ((CUDA_ZNEW << 16) + CUDA_WNEW) -#define CUDA_SHR3 \ - (cuda_jsr = cuda_jsr ^ (cuda_jsr << 17), \ - cuda_jsr = cuda_jsr ^ (cuda_jsr >> 13), \ - cuda_jsr = cuda_jsr ^ (cuda_jsr << 5)) +#define CUDA_MWC ((CUDA_ZNEW << 16) + CUDA_WNEW) +#define CUDA_SHR3 \ + (cuda_jsr = cuda_jsr ^ (cuda_jsr << 17), \ + cuda_jsr = cuda_jsr ^ (cuda_jsr >> 13), \ + cuda_jsr = cuda_jsr ^ (cuda_jsr << 5)) #define CUDA_CONG (cuda_jcong = 69069 * cuda_jcong + 1234567) -#define KISS ((CUDA_MWC ^ CUDA_CONG) + CUDA_SHR3) - static unsigned int cuda_z = 362436069, cuda_w = 521288629; - static unsigned int cuda_jsr = 123456789, cuda_jcong = 380116160; - return KISS; +#define KISS ((CUDA_MWC ^ CUDA_CONG) + CUDA_SHR3) + static unsigned int cuda_z = 362436069, cuda_w = 521288629; + static unsigned int cuda_jsr = 123456789, cuda_jcong = 380116160; + return KISS; }
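The floatAsUInt/doubleAsULL helpers above exist so that printCuType can log the exact IEEE-754 bit pattern of each operand alongside its value. A tiny host-only check of the same union reinterpretation, for illustration only (strict ISO C++ would prefer memcpy or C++20 std::bit_cast for this):

```cpp
#include <cstdio>

// Same union trick as floatAsUInt above, minus the volatile qualifier.
static unsigned floatBits(float x)
{
    union {
        float    f;
        unsigned i;
    } u;
    u.f = x;
    return u.i;
}

int main()
{
    printf("%08x\n", floatBits(1.0f));  // 3f800000: sign 0, exponent 127, mantissa 0
    printf("%08x\n", floatBits(-2.0f)); // c0000000: sign 1, exponent 128, mantissa 0
    return 0;
}
```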
//============================================================================== // cuGet and cuEqual versions //============================================================================== -template -__inline__ __device__ __host__ T_ELEM cuGet(double); -template <> -__inline__ __device__ __host__ float cuGet(double x) { - return float(x); -} -template <> -__inline__ __device__ __host__ double cuGet(double x) { - return double(x); -} +template __inline__ __device__ __host__ T_ELEM cuGet(double); +template <> __inline__ __device__ __host__ float cuGet(double x) { return float(x); } +template <> __inline__ __device__ __host__ double cuGet(double x) { return double(x); } -template -__inline__ __device__ __host__ T_ELEM cuGet(double, double); -template <> -__inline__ __device__ __host__ float cuGet(double x, double y) { - return float(x); -} -template <> -__inline__ __device__ __host__ double cuGet(double x, double y) { - return double(x); -} -static __inline__ __device__ __host__ bool cuEqual(float x, float y) { - return (x == y); -} -static __inline__ __device__ __host__ bool cuEqual(double x, double y) { - return (x == y); -} +template __inline__ __device__ __host__ T_ELEM cuGet(double, double); +template <> __inline__ __device__ __host__ float cuGet(double x, double y) { return float(x); } +template <> __inline__ __device__ __host__ double cuGet(double x, double y) { return double(x); } +static __inline__ __device__ __host__ bool cuEqual(float x, float y) { return (x == y); } +static __inline__ __device__ __host__ bool cuEqual(double x, double y) { return (x == y); } //============================================================================== // Platform dependent timing utility @@ -143,33 +131,36 @@ static __inline__ __device__ __host__ bool cuEqual(double x, double y) { #define WIN32_LEAN_AND_MEAN #endif #include -static __inline__ double second(void) { - LARGE_INTEGER t; - static double oofreq; - static int checkedForHighResTimer; - static BOOL hasHighResTimer; +static __inline__ double second(void) +{ + LARGE_INTEGER t; + static double oofreq; + static int checkedForHighResTimer; + static BOOL hasHighResTimer; - if (!checkedForHighResTimer) { - hasHighResTimer = QueryPerformanceFrequency(&t); - oofreq = 1.0 / (double)t.QuadPart; - checkedForHighResTimer = 1; - } + if (!checkedForHighResTimer) { + hasHighResTimer = QueryPerformanceFrequency(&t); + oofreq = 1.0 / (double)t.QuadPart; + checkedForHighResTimer = 1; + } - if (hasHighResTimer) { - QueryPerformanceCounter(&t); - return (double)t.QuadPart * oofreq; - } else { - return (double)GetTickCount() / 1000.0; - } + if (hasHighResTimer) { + QueryPerformanceCounter(&t); + return (double)t.QuadPart * oofreq; + } + else { + return (double)GetTickCount() / 1000.0; + } } #elif defined(__linux__) || defined(__QNX__) #include #include #include -static double second(void) { - struct timeval tv; - gettimeofday(&tv, NULL); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +static double second(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; } #elif defined(__APPLE__) #include @@ -177,10 +168,11 @@ static double second(void) { #include #include #include -static double second(void) { - struct timeval tv; - gettimeofday(&tv, NULL); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +static double second(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; } #else #error unsupported platform
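The second() timer above is the wall-clock source behind the "^^^^ elapsed ... GFLOPS=..." line printed by test_gemm_loop, and the reported rate uses the standard GEMM flop count of roughly 2*m*n*k (one multiply plus one add per inner-product term; that is the flopsCoef = 2.0 in the sample). A small illustrative helper showing the same accounting, assuming start and stop come from two second() calls bracketing the GEMM launches:

```cpp
// GFLOP/s accounting as used by batchCUBLAS: N multiplications of
// (m x k) * (k x n), about 2*m*n*k flops each, over the elapsed time.
static double gemmGflops(int N, int m, int n, int k, double start, double stop)
{
    const double flopsCoef = 2.0; // multiply + add per inner-product term
    return N * (1e-9 * flopsCoef * m * n * k) / (stop - start);
}
```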
diff --git a/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp b/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp index 10cef4af..8ced11db 100644 --- a/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp +++ b/Samples/4_CUDA_Libraries/boxFilterNPP/boxFilterNPP.cpp @@ -36,171 +36,178 @@ #include #include #include - -#include -#include -#include - #include -#include - +#include #include #include +#include +#include +#include -int main(int argc, char *argv[]) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char *argv[]) +{ + printf("%s Starting...\n\n", argv[0]); - try { - std::string sFilename; - char *filePath; - NppStreamContext nppStreamCtx; - nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. + try { + std::string sFilename; + char *filePath; + NppStreamContext nppStreamCtx; + nppStreamCtx.hStream = + 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. - cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - { - printf("CUDA error: no devices supporting CUDA.\n"); - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) { + printf("CUDA error: no devices supporting CUDA.\n"); + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + } + + const NppLibraryVersion *libVer = nppGetLibVersion(); + + printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); + + int driverVersion, runtimeVersion; + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + + printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, + cudaDevAttrComputeCapabilityMajor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, + cudaDevAttrComputeCapabilityMinor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); + + cudaDeviceProp oDeviceProperties; + + cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); + + nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; + nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; + nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; + nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; + + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); + } + else { + filePath = sdkFindFilePath("teapot512.pgm", argv[0]); + } + + if (filePath) { + sFilename = filePath; + } + else { + sFilename = "teapot512.pgm"; + } + + // if we specify the filename at the command line, then we only test + // sFilename[0]. + int file_errors = 0; + std::ifstream infile(sFilename.data(), std::ifstream::in); + + if (infile.good()) { + std::cout << "boxFilterNPP opened: <" << sFilename.data() << "> successfully!" 
<< std::endl; + file_errors = 0; + infile.close(); + } + else { + std::cout << "boxFilterNPP unable to open: <" << sFilename.data() << ">" << std::endl; + file_errors++; + infile.close(); + } + + if (file_errors > 0) { + exit(EXIT_FAILURE); + } + + std::string sResultFilename = sFilename; + + std::string::size_type dot = sResultFilename.rfind('.'); + + if (dot != std::string::npos) { + sResultFilename = sResultFilename.substr(0, dot); + } + + sResultFilename += "_boxFilter.pgm"; + + if (checkCmdLineFlag(argc, (const char **)argv, "output")) { + char *outputFilePath; + getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath); + sResultFilename = outputFilePath; + } + + // declare a host image object for an 8-bit grayscale image + npp::ImageCPU_8u_C1 oHostSrc; + // load gray-scale image from disk + npp::loadImage(sFilename, oHostSrc); + // declare a device image and copy construct from the host image, + // i.e. upload host to device + npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); + + // create struct with box-filter mask size + NppiSize oMaskSize = {5, 5}; + + NppiSize oSrcSize = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; + NppiPoint oSrcOffset = {0, 0}; + + // create struct with ROI size + NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; + // allocate device image of appropriately reduced size + npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height); + // set anchor point inside the mask to (oMaskSize.width / 2, + // oMaskSize.height / 2) It should round down when odd + NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2}; + + // run box filter + NPP_CHECK_NPP(nppiFilterBoxBorder_8u_C1R_Ctx(oDeviceSrc.data(), + oDeviceSrc.pitch(), + oSrcSize, + oSrcOffset, + oDeviceDst.data(), + oDeviceDst.pitch(), + oSizeROI, + oMaskSize, + oAnchor, + NPP_BORDER_REPLICATE, + nppStreamCtx)); + + // declare a host image for the result + npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); + // and copy the device result data into it + oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); + + saveImage(sResultFilename, oHostDst); + std::cout << "Saved image: " << sResultFilename << std::endl; + + nppiFree(oDeviceSrc.data()); + nppiFree(oDeviceDst.data()); + + exit(EXIT_SUCCESS); + } + catch (npp::Exception &rException) { + std::cerr << "Program error! The following exception occurred: \n"; + std::cerr << rException << std::endl; + std::cerr << "Aborting." << std::endl; + + exit(EXIT_FAILURE); + } + catch (...) { + std::cerr << "Program error! An unknown type of exception occurred. \n"; + std::cerr << "Aborting." 
<< std::endl; + + exit(EXIT_FAILURE); + return -1; } - const NppLibraryVersion *libVer = nppGetLibVersion(); - - printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); - - int driverVersion, runtimeVersion; - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); - - printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); - printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10); - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, - cudaDevAttrComputeCapabilityMajor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, - cudaDevAttrComputeCapabilityMinor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); - - cudaDeviceProp oDeviceProperties; - - cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); - - nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; - nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; - nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; - nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; - - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); - } else { - filePath = sdkFindFilePath("teapot512.pgm", argv[0]); - } - - if (filePath) { - sFilename = filePath; - } else { - sFilename = "teapot512.pgm"; - } - - // if we specify the filename at the command line, then we only test - // sFilename[0]. - int file_errors = 0; - std::ifstream infile(sFilename.data(), std::ifstream::in); - - if (infile.good()) { - std::cout << "boxFilterNPP opened: <" << sFilename.data() - << "> successfully!" << std::endl; - file_errors = 0; - infile.close(); - } else { - std::cout << "boxFilterNPP unable to open: <" << sFilename.data() << ">" - << std::endl; - file_errors++; - infile.close(); - } - - if (file_errors > 0) { - exit(EXIT_FAILURE); - } - - std::string sResultFilename = sFilename; - - std::string::size_type dot = sResultFilename.rfind('.'); - - if (dot != std::string::npos) { - sResultFilename = sResultFilename.substr(0, dot); - } - - sResultFilename += "_boxFilter.pgm"; - - if (checkCmdLineFlag(argc, (const char **)argv, "output")) { - char *outputFilePath; - getCmdLineArgumentString(argc, (const char **)argv, "output", - &outputFilePath); - sResultFilename = outputFilePath; - } - - // declare a host image object for an 8-bit grayscale image - npp::ImageCPU_8u_C1 oHostSrc; - // load gray-scale image from disk - npp::loadImage(sFilename, oHostSrc); - // declare a device image and copy construct from the host image, - // i.e. 
upload host to device - npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); - - // create struct with box-filter mask size - NppiSize oMaskSize = {5, 5}; - - NppiSize oSrcSize = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; - NppiPoint oSrcOffset = {0, 0}; - - // create struct with ROI size - NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; - // allocate device image of appropriately reduced size - npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height); - // set anchor point inside the mask to (oMaskSize.width / 2, - // oMaskSize.height / 2) It should round down when odd - NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2}; - - // run box filter - NPP_CHECK_NPP(nppiFilterBoxBorder_8u_C1R_Ctx( - oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset, - oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, oMaskSize, oAnchor, - NPP_BORDER_REPLICATE, nppStreamCtx)); - - // declare a host image for the result - npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); - // and copy the device result data into it - oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); - - saveImage(sResultFilename, oHostDst); - std::cout << "Saved image: " << sResultFilename << std::endl; - - nppiFree(oDeviceSrc.data()); - nppiFree(oDeviceDst.data()); - - exit(EXIT_SUCCESS); - } catch (npp::Exception &rException) { - std::cerr << "Program error! The following exception occurred: \n"; - std::cerr << rException << std::endl; - std::cerr << "Aborting." << std::endl; - - exit(EXIT_FAILURE); - } catch (...) { - std::cerr << "Program error! An unknow type of exception occurred. \n"; - std::cerr << "Aborting." << std::endl; - - exit(EXIT_FAILURE); - return -1; - } - - return 0; + return 0; } diff --git a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp index 4a8ae7f7..174f4176 100644 --- a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp +++ b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/cannyEdgeDetectorNPP.cpp @@ -36,215 +36,225 @@ #include #include #include - -#include -#include -#include - #include -#include - +#include #include #include +#include +#include +#include -inline int cudaDeviceInit(int argc, const char **argv) { - int deviceCount; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); +inline int cudaDeviceInit(int argc, const char **argv) +{ + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - if (deviceCount == 0) { - std::cerr << "CUDA error: no devices supporting CUDA." << std::endl; - exit(EXIT_FAILURE); - } + if (deviceCount == 0) { + std::cerr << "CUDA error: no devices supporting CUDA." 
<< std::endl; + exit(EXIT_FAILURE); + } - int dev = findCudaDevice(argc, argv); + int dev = findCudaDevice(argc, argv); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name - << std::endl; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name << std::endl; - checkCudaErrors(cudaSetDevice(dev)); + checkCudaErrors(cudaSetDevice(dev)); - return dev; + return dev; } -int main(int argc, char *argv[]) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char *argv[]) +{ + printf("%s Starting...\n\n", argv[0]); - try { - std::string sFilename; - char *filePath; + try { + std::string sFilename; + char *filePath; - cudaDeviceInit(argc, (const char **)argv); + cudaDeviceInit(argc, (const char **)argv); - NppStreamContext nppStreamCtx; - nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. + NppStreamContext nppStreamCtx; + nppStreamCtx.hStream = + 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. - cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - { - printf("CUDA error: no devices supporting CUDA.\n"); - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) { + printf("CUDA error: no devices supporting CUDA.\n"); + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + } + + const NppLibraryVersion *libVer = nppGetLibVersion(); + + printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); + + int driverVersion, runtimeVersion; + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + + printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, + cudaDevAttrComputeCapabilityMajor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, + cudaDevAttrComputeCapabilityMinor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); + + cudaDeviceProp oDeviceProperties; + + cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); + + nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; + nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; + nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; + nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; + + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); + } + else { + filePath = sdkFindFilePath("teapot512.pgm", argv[0]); + } + + if (filePath) { + sFilename = filePath; + } + else { + sFilename = "teapot512.pgm"; + } + + // if we specify the filename at the command line, then we only test + // sFilename[0]. 
+ int file_errors = 0; + std::ifstream infile(sFilename.data(), std::ifstream::in); + + if (infile.good()) { + std::cout << "cannyEdgeDetectionNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; + file_errors = 0; + infile.close(); + } + else { + std::cout << "cannyEdgeDetectionNPP unable to open: <" << sFilename.data() << ">" << std::endl; + file_errors++; + infile.close(); + } + + if (file_errors > 0) { + exit(EXIT_FAILURE); + } + + std::string sResultFilename = sFilename; + + std::string::size_type dot = sResultFilename.rfind('.'); + + if (dot != std::string::npos) { + sResultFilename = sResultFilename.substr(0, dot); + } + + sResultFilename += "_cannyEdgeDetection.pgm"; + + if (checkCmdLineFlag(argc, (const char **)argv, "output")) { + char *outputFilePath; + getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath); + sResultFilename = outputFilePath; + } + + // declare a host image object for an 8-bit grayscale image + npp::ImageCPU_8u_C1 oHostSrc; + // load gray-scale image from disk + npp::loadImage(sFilename, oHostSrc); + // declare a device image and copy construct from the host image, + // i.e. upload host to device + npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); + + NppiSize oSrcSize = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; + NppiPoint oSrcOffset = {0, 0}; + + // create struct with ROI size + NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; + // allocate device image of appropriately reduced size + npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height); + + int nBufferSize = 0; + Npp8u *pScratchBufferNPP = 0; + + // get necessary scratch buffer size and allocate that much device memory + NPP_CHECK_NPP(nppiFilterCannyBorderGetBufferSize(oSizeROI, &nBufferSize)); + + cudaMalloc((void **)&pScratchBufferNPP, nBufferSize); + + // now run the canny edge detection filter + // Using nppiNormL2 will produce larger magnitude values allowing for finer + // control of threshold values while nppiNormL1 will be slightly faster. + // Also, selecting the sobel gradient filter allows up to a 5x5 kernel size + // which can produce more precise results but is a bit slower. Commonly + // nppiNormL2 and sobel gradient filter size of 3x3 are used. Canny + // recommends that the high threshold value should be about 3 times the low + // threshold value. The threshold range will depend on the range of + // magnitude values that the sobel gradient filter generates for a + // particular image. + + Npp16s nLowThreshold = 72; + Npp16s nHighThreshold = 256; + + if ((nBufferSize > 0) && (pScratchBufferNPP != 0)) { + NPP_CHECK_NPP(nppiFilterCannyBorder_8u_C1R_Ctx(oDeviceSrc.data(), + oDeviceSrc.pitch(), + oSrcSize, + oSrcOffset, + oDeviceDst.data(), + oDeviceDst.pitch(), + oSizeROI, + NPP_FILTER_SOBEL, + NPP_MASK_SIZE_3_X_3, + nLowThreshold, + nHighThreshold, + nppiNormL2, + NPP_BORDER_REPLICATE, + pScratchBufferNPP, + nppStreamCtx)); + } + + // free scratch buffer memory + cudaFree(pScratchBufferNPP); + + // declare a host image for the result + npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); + // and copy the device result data into it + oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); + + saveImage(sResultFilename, oHostDst); + std::cout << "Saved image: " << sResultFilename << std::endl; + + nppiFree(oDeviceSrc.data()); + nppiFree(oDeviceDst.data()); + + exit(EXIT_SUCCESS); + } + catch (npp::Exception &rException) { + std::cerr << "Program error! 
The following exception occurred: \n"; + std::cerr << rException << std::endl; + std::cerr << "Aborting." << std::endl; + + exit(EXIT_FAILURE); + } + catch (...) { + std::cerr << "Program error! An unknown type of exception occurred. \n"; + std::cerr << "Aborting." << std::endl; + + exit(EXIT_FAILURE); + return -1; } - const NppLibraryVersion *libVer = nppGetLibVersion(); - - printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); - - int driverVersion, runtimeVersion; - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); - - printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); - printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10); - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, - cudaDevAttrComputeCapabilityMajor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, - cudaDevAttrComputeCapabilityMinor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); - - cudaDeviceProp oDeviceProperties; - - cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); - - nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; - nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; - nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; - nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; - - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); - } else { - filePath = sdkFindFilePath("teapot512.pgm", argv[0]); - } - - if (filePath) { - sFilename = filePath; - } else { - sFilename = "teapot512.pgm"; - } - - // if we specify the filename at the command line, then we only test - // sFilename[0]. - int file_errors = 0; - std::ifstream infile(sFilename.data(), std::ifstream::in); - - if (infile.good()) { - std::cout << "cannyEdgeDetectionNPP opened: <" << sFilename.data() - << "> successfully!" << std::endl; - file_errors = 0; - infile.close(); - } else { - std::cout << "cannyEdgeDetectionNPP unable to open: <" << sFilename.data() - << ">" << std::endl; - file_errors++; - infile.close(); - } - - if (file_errors > 0) { - exit(EXIT_FAILURE); - } - - std::string sResultFilename = sFilename; - - std::string::size_type dot = sResultFilename.rfind('.'); - - if (dot != std::string::npos) { - sResultFilename = sResultFilename.substr(0, dot); - } - - sResultFilename += "_cannyEdgeDetection.pgm"; - - if (checkCmdLineFlag(argc, (const char **)argv, "output")) { - char *outputFilePath; - getCmdLineArgumentString(argc, (const char **)argv, "output", - &outputFilePath); - sResultFilename = outputFilePath; - } - - // declare a host image object for an 8-bit grayscale image - npp::ImageCPU_8u_C1 oHostSrc; - // load gray-scale image from disk - npp::loadImage(sFilename, oHostSrc); - // declare a device image and copy construct from the host image, - // i.e. 
upload host to device - npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); - - NppiSize oSrcSize = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; - NppiPoint oSrcOffset = {0, 0}; - - // create struct with ROI size - NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; - // allocate device image of appropriately reduced size - npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height); - - int nBufferSize = 0; - Npp8u *pScratchBufferNPP = 0; - - // get necessary scratch buffer size and allocate that much device memory - NPP_CHECK_NPP(nppiFilterCannyBorderGetBufferSize(oSizeROI, &nBufferSize)); - - cudaMalloc((void **)&pScratchBufferNPP, nBufferSize); - - // now run the canny edge detection filter - // Using nppiNormL2 will produce larger magnitude values allowing for finer - // control of threshold values while nppiNormL1 will be slightly faster. - // Also, selecting the sobel gradient filter allows up to a 5x5 kernel size - // which can produce more precise results but is a bit slower. Commonly - // nppiNormL2 and sobel gradient filter size of 3x3 are used. Canny - // recommends that the high threshold value should be about 3 times the low - // threshold value. The threshold range will depend on the range of - // magnitude values that the sobel gradient filter generates for a - // particular image. - - Npp16s nLowThreshold = 72; - Npp16s nHighThreshold = 256; - - if ((nBufferSize > 0) && (pScratchBufferNPP != 0)) { - NPP_CHECK_NPP(nppiFilterCannyBorder_8u_C1R_Ctx( - oDeviceSrc.data(), oDeviceSrc.pitch(), oSrcSize, oSrcOffset, - oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, NPP_FILTER_SOBEL, - NPP_MASK_SIZE_3_X_3, nLowThreshold, nHighThreshold, nppiNormL2, - NPP_BORDER_REPLICATE, pScratchBufferNPP, nppStreamCtx)); - } - - // free scratch buffer memory - cudaFree(pScratchBufferNPP); - - // declare a host image for the result - npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); - // and copy the device result data into it - oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); - - saveImage(sResultFilename, oHostDst); - std::cout << "Saved image: " << sResultFilename << std::endl; - - nppiFree(oDeviceSrc.data()); - nppiFree(oDeviceDst.data()); - - exit(EXIT_SUCCESS); - } catch (npp::Exception &rException) { - std::cerr << "Program error! The following exception occurred: \n"; - std::cerr << rException << std::endl; - std::cerr << "Aborting." << std::endl; - - exit(EXIT_FAILURE); - } catch (...) { - std::cerr << "Program error! An unknow type of exception occurred. \n"; - std::cerr << "Aborting." 
<< std::endl; - - exit(EXIT_FAILURE); - return -1; - } - - return 0; + return 0; } diff --git a/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp b/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp index 50d9470f..d2391bc0 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp +++ b/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp @@ -42,227 +42,261 @@ #include // Utilities and system includes -#include <helper_cuda.h> // helper function CUDA error checking and initialization -#include <helper_functions.h> // helper for shared functions common to CUDA Samples +#include <helper_cuda.h> // helper function CUDA error checking and initialization +#include <helper_functions.h> // helper for shared functions common to CUDA Samples const char *sSDKname = "conjugateGradient"; /* genTridiag: generate a random tridiagonal symmetric matrix */ -void genTridiag(int *I, int *J, float *val, int N, int nz) { - I[0] = 0, J[0] = 0, J[1] = 1; - val[0] = (float)rand() / RAND_MAX + 10.0f; - val[1] = (float)rand() / RAND_MAX; - int start; +void genTridiag(int *I, int *J, float *val, int N, int nz) +{ + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = (float)rand() / RAND_MAX + 10.0f; + val[1] = (float)rand() / RAND_MAX; + int start; - for (int i = 1; i < N; i++) { - if (i > 1) { - I[i] = I[i - 1] + 3; - } else { - I[1] = 2; + for (int i = 1; i < N; i++) { + if (i > 1) { + I[i] = I[i - 1] + 3; + } + else { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = (float)rand() / RAND_MAX + 10.0f; + + if (i < N - 1) { + val[start + 2] = (float)rand() / RAND_MAX; + } } - start = (i - 1) * 3 + 2; - J[start] = i - 1; - J[start + 1] = i; - - if (i < N - 1) { - J[start + 2] = i + 1; - } - - val[start] = val[start - 1]; - val[start + 1] = (float)rand() / RAND_MAX + 10.0f; - - if (i < N - 1) { - val[start + 2] = (float)rand() / RAND_MAX; - } - } - - I[N] = nz; + I[N] = nz; } -int main(int argc, char **argv) { - int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL; - float *val = NULL; - const float tol = 1e-5f; - const int max_iter = 10000; - float *x; - float *rhs; - float a, b, na, r0, r1; - int *d_col, *d_row; - float *d_val, *d_x, dot; - float *d_r, *d_p, *d_Ax; - int k; - float alpha, beta, alpham1; +int main(int argc, char **argv) +{ + int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL; + float *val = NULL; + const float tol = 1e-5f; + const int max_iter = 10000; + float *x; + float *rhs; + float a, b, na, r0, r1; + int *d_col, *d_row; + float *d_val, *d_x, dot; + float *d_r, *d_p, *d_Ax; + int k; + float alpha, beta, alpham1; - // This will pick the best possible CUDA capable device - cudaDeviceProp deviceProp; - int devID = findCudaDevice(argc, (const char **)argv); - if (devID < 0) { - printf("exiting...\n"); - exit(EXIT_SUCCESS); - } - - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - // Statistics about the GPU device - printf( - "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", - deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); - - /* Generate a random tridiagonal symmetric matrix in CSR format */ - M = N = 1048576; - nz = (N - 2) * 3 + 4; - I = (int *)malloc(sizeof(int) * (N + 1)); - J = (int *)malloc(sizeof(int) * nz); - val = (float *)malloc(sizeof(float) * nz); - genTridiag(I, J, val, N, nz); - - x = (float *)malloc(sizeof(float) * N); - rhs =
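
NOTE: genTridiag above is easiest to follow with a tiny worked instance. For N = 4 the sample allocates nz = (N - 2) * 3 + 4 = 10 non-zeros, and the loop fills the CSR arrays like this (a sketch, with val written symbolically):

    // CSR arrays produced by genTridiag for N = 4, nz = 10.
    // Row i owns entries J[I[i]] .. J[I[i + 1] - 1].
    int I[5]  = {0, 2, 5, 8, 10};                  // first/last rows: 2 entries
    int J[10] = {0, 1,  0, 1, 2,  1, 2, 3,  2, 3}; // middle rows: 3 entries
    // val: diagonal entries are rand()/RAND_MAX + 10.0f (diagonal dominance,
    // hence positive definite); off-diagonals are rand()/RAND_MAX; and
    // val[start] = val[start - 1] copies each super-diagonal into the next
    // row's sub-diagonal, which is what makes the matrix symmetric.
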
(float *)malloc(sizeof(float) * N); - - for (int i = 0; i < N; i++) { - rhs[i] = 1.0; - x[i] = 0.0; - } - - /* Get handle to the CUBLAS context */ - cublasHandle_t cublasHandle = 0; - cublasStatus_t cublasStatus; - cublasStatus = cublasCreate(&cublasHandle); - - checkCudaErrors(cublasStatus); - - /* Get handle to the CUSPARSE context */ - cusparseHandle_t cusparseHandle = 0; - checkCudaErrors(cusparseCreate(&cusparseHandle)); - - checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float))); - - /* Wrap raw data into cuSPARSE generic API objects */ - cusparseSpMatDescr_t matA = NULL; - checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - cusparseDnVecDescr_t vecx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); - cusparseDnVecDescr_t vecp = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecp, N, d_p, CUDA_R_32F)); - cusparseDnVecDescr_t vecAx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecAx, N, d_Ax, CUDA_R_32F)); - - /* Initialize problem data */ - cudaMemcpy(d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice); - - alpha = 1.0; - alpham1 = -1.0; - beta = 0.0; - r0 = 0.; - - /* Allocate workspace for cuSPARSE */ - size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - - /* Begin CG */ - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1); - cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); - - k = 1; - - while (r1 > tol * tol && k <= max_iter) { - if (k > 1) { - b = r1 / r0; - cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1); - cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1); - } else { - cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1); + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_SUCCESS); } - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot); - a = r1 / dot; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1); - na = -a; - cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1); + // Statistics about the GPU device + printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", + deviceProp.multiProcessorCount, + deviceProp.major, + 
deviceProp.minor); - r0 = r1; + /* Generate a random tridiagonal symmetric matrix in CSR format */ + M = N = 1048576; + nz = (N - 2) * 3 + 4; + I = (int *)malloc(sizeof(int) * (N + 1)); + J = (int *)malloc(sizeof(int) * nz); + val = (float *)malloc(sizeof(float) * nz); + genTridiag(I, J, val, N, nz); + + x = (float *)malloc(sizeof(float) * N); + rhs = (float *)malloc(sizeof(float) * N); + + for (int i = 0; i < N; i++) { + rhs[i] = 1.0; + x[i] = 0.0; + } + + /* Get handle to the CUBLAS context */ + cublasHandle_t cublasHandle = 0; + cublasStatus_t cublasStatus; + cublasStatus = cublasCreate(&cublasHandle); + + checkCudaErrors(cublasStatus); + + /* Get handle to the CUSPARSE context */ + cusparseHandle_t cusparseHandle = 0; + checkCudaErrors(cusparseCreate(&cusparseHandle)); + + checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float))); + + /* Wrap raw data into cuSPARSE generic API objects */ + cusparseSpMatDescr_t matA = NULL; + checkCudaErrors(cusparseCreateCsr(&matA, + N, + N, + nz, + d_row, + d_col, + d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + cusparseDnVecDescr_t vecx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); + cusparseDnVecDescr_t vecp = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecp, N, d_p, CUDA_R_32F)); + cusparseDnVecDescr_t vecAx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecAx, N, d_Ax, CUDA_R_32F)); + + /* Initialize problem data */ + cudaMemcpy(d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice); + + alpha = 1.0; + alpham1 = -1.0; + beta = 0.0; + r0 = 0.; + + /* Allocate workspace for cuSPARSE */ + size_t bufferSize = 0; + checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecx, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + void *buffer = NULL; + checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + + /* Begin CG */ + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecx, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1); cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); - cudaDeviceSynchronize(); - printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); - k++; - } - cudaMemcpy(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost); + k = 1; - float rsum, diff, err = 0.0; + while (r1 > tol * tol && k <= max_iter) { + if (k > 1) { + b = r1 / r0; + cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1); + cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1); + } + else { + cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1); + } - for (int i = 0; i < N; i++) { - rsum = 0.0; + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + 
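
NOTE: The hunk above is also a compact demo of the cuSPARSE generic API: wrap the raw CSR arrays and dense vectors in descriptors once, query the workspace size once, then reuse the same buffer for every SpMV in the solver loop. A minimal, hedged sketch of that sequence, with the handle and device arrays named as in the sample:

    cusparseSpMatDescr_t A = NULL;
    cusparseDnVecDescr_t X = NULL, Y = NULL;
    checkCudaErrors(cusparseCreateCsr(&A, N, N, nz, d_row, d_col, d_val,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
    checkCudaErrors(cusparseCreateDnVec(&X, N, d_x, CUDA_R_32F));
    checkCudaErrors(cusparseCreateDnVec(&Y, N, d_Ax, CUDA_R_32F));

    size_t bytes = 0; // workspace for Y = alpha * A * X + beta * Y
    checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle,
                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
                                            &alpha, A, X, &beta, Y, CUDA_R_32F,
                                            CUSPARSE_SPMV_ALG_DEFAULT, &bytes));
    void *ws = NULL;
    checkCudaErrors(cudaMalloc(&ws, bytes));
    checkCudaErrors(cusparseSpMV(cusparseHandle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, A, X, &beta, Y, CUDA_R_32F,
                                 CUSPARSE_SPMV_ALG_DEFAULT, ws));
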
&alpha, + matA, + vecp, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot); + a = r1 / dot; - for (int j = I[i]; j < I[i + 1]; j++) { - rsum += val[j] * x[J[j]]; + cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1); + na = -a; + cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1); + + r0 = r1; + cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); + cudaDeviceSynchronize(); + printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; } - diff = fabs(rsum - rhs[i]); + cudaMemcpy(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost); - if (diff > err) { - err = diff; + float rsum, diff, err = 0.0; + + for (int i = 0; i < N; i++) { + rsum = 0.0; + + for (int j = I[i]; j < I[i + 1]; j++) { + rsum += val[j] * x[J[j]]; + } + + diff = fabs(rsum - rhs[i]); + + if (diff > err) { + err = diff; + } } - } - if (buffer) { - checkCudaErrors(cudaFree(buffer)); - } - - cusparseDestroy(cusparseHandle); - cublasDestroy(cublasHandle); - if (matA) { - checkCudaErrors(cusparseDestroySpMat(matA)); - } - if (vecx) { - checkCudaErrors(cusparseDestroyDnVec(vecx)); - } - if (vecAx) { - checkCudaErrors(cusparseDestroyDnVec(vecAx)); - } - if (vecp) { - checkCudaErrors(cusparseDestroyDnVec(vecp)); - } + if (buffer) { + checkCudaErrors(cudaFree(buffer)); + } - free(I); - free(J); - free(val); - free(x); - free(rhs); - cudaFree(d_col); - cudaFree(d_row); - cudaFree(d_val); - cudaFree(d_x); - cudaFree(d_r); - cudaFree(d_p); - cudaFree(d_Ax); + cusparseDestroy(cusparseHandle); + cublasDestroy(cublasHandle); + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + if (vecp) { + checkCudaErrors(cusparseDestroyDnVec(vecp)); + } - printf("Test Summary: Error amount = %f\n", err); - exit((k <= max_iter) ? 0 : 1); + free(I); + free(J); + free(val); + free(x); + free(rhs); + cudaFree(d_col); + cudaFree(d_row); + cudaFree(d_val); + cudaFree(d_x); + cudaFree(d_r); + cudaFree(d_p); + cudaFree(d_Ax); + + printf("Test Summary: Error amount = %f\n", err); + exit((k <= max_iter) ? 
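
NOTE: For readers mapping the loop above back to the algorithm, one conjugate-gradient iteration corresponds line for line to these library calls (variable names as in the hunk):

    // k > 1:  b = r1 / r0;  p = r + b * p        -> cublasSscal + cublasSaxpy
    // k == 1:               p = r                -> cublasScopy
    //          Ax = A * p                        -> cusparseSpMV
    //          a  = r1 / (p . Ax)                -> cublasSdot, divide on host
    //          x  = x + a * p                    -> cublasSaxpy
    //          r  = r - a * Ax                   -> cublasSaxpy with na = -a
    //          r0 = r1;  r1 = r . r              -> cublasSdot
    // The loop exits when r1 <= tol * tol, i.e. ||r||_2 <= tol.
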
0 : 1); } diff --git a/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu b/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu index 18523d7b..8f0eb748 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu +++ b/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu @@ -42,8 +42,8 @@ #include // Utilities and system includes -#include <helper_cuda.h> // helper function CUDA error checking and initialization -#include <helper_functions.h> // helper for shared functions common to CUDA Samples +#include <helper_cuda.h> // helper function CUDA error checking and initialization +#include <helper_functions.h> // helper for shared functions common to CUDA Samples const char *sSDKname = "conjugateGradientCudaGraphs"; @@ -52,291 +52,235 @@ const char *sSDKname = "conjugateGradientCudaGraphs"; #endif /* genTridiag: generate a random tridiagonal symmetric matrix */ -void genTridiag(int *I, int *J, float *val, int N, int nz) { - I[0] = 0, J[0] = 0, J[1] = 1; - val[0] = (float)rand() / RAND_MAX + 10.0f; - val[1] = (float)rand() / RAND_MAX; - int start; +void genTridiag(int *I, int *J, float *val, int N, int nz) +{ + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = (float)rand() / RAND_MAX + 10.0f; + val[1] = (float)rand() / RAND_MAX; + int start; - for (int i = 1; i < N; i++) { - if (i > 1) { - I[i] = I[i - 1] + 3; - } else { - I[1] = 2; + for (int i = 1; i < N; i++) { + if (i > 1) { + I[i] = I[i - 1] + 3; + } + else { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = (float)rand() / RAND_MAX + 10.0f; + + if (i < N - 1) { + val[start + 2] = (float)rand() / RAND_MAX; + } } - start = (i - 1) * 3 + 2; - J[start] = i - 1; - J[start + 1] = i; + I[N] = nz; +} - if (i < N - 1) { - J[start + 2] = i + 1; +__global__ void initVectors(float *rhs, float *x, int N) +{ + size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + + for (size_t i = gid; i < N; i += gridDim.x * blockDim.x) { + rhs[i] = 1.0; + x[i] = 0.0; + } +} + +__global__ void r1_div_x(float *r1, float *r0, float *b) +{ + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid == 0) { + b[0] = r1[0] / r0[0]; + } +} + +__global__ void a_minus(float *a, float *na) +{ + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid == 0) { + na[0] = -(a[0]); + } +} + +int main(int argc, char **argv) +{ + int N = 0, nz = 0, *I = NULL, *J = NULL; + float *val = NULL; + const float tol = 1e-5f; + const int max_iter = 10000; + float *x; + float *rhs; + float r1; + + int *d_col, *d_row; + float *d_val, *d_x; + float *d_r, *d_p, *d_Ax; + int k; + float alpha, beta, alpham1; + + cudaStream_t stream1, streamForGraph; + + // This will pick the best possible CUDA capable device + cudaDeviceProp deviceProp; + int devID = findCudaDevice(argc, (const char **)argv); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_SUCCESS); } - val[start] = val[start - 1]; - val[start + 1] = (float)rand() / RAND_MAX + 10.0f; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - if (i < N - 1) { - val[start + 2] = (float)rand() / RAND_MAX; + // Statistics about the GPU device + printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", + deviceProp.multiProcessorCount, + deviceProp.major, + deviceProp.minor); + + /* Generate a random tridiagonal symmetric matrix in CSR format */ + N = 1048576; + nz = (N - 2) * 3 + 4; + checkCudaErrors(cudaMallocHost(&I,
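
NOTE: r1_div_x and a_minus above are deliberately launched with a single thread: they compute the scalars beta = r1/r0 and na = -a directly in device memory, which removes the per-iteration host round-trip and is what later allows the whole iteration to be captured into a CUDA graph. The idiom, restated as a hedged sketch:

    // One thread updates a device-resident scalar in place.
    __global__ void r1_div_x(float *r1, float *r0, float *b)
    {
        if (blockIdx.x * blockDim.x + threadIdx.x == 0) {
            b[0] = r1[0] / r0[0]; // beta stays on the GPU
        }
    }
    // launched as: r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
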
sizeof(int) * (N + 1))); + checkCudaErrors(cudaMallocHost(&J, sizeof(int) * nz)); + checkCudaErrors(cudaMallocHost(&val, sizeof(float) * nz)); + genTridiag(I, J, val, N, nz); + + checkCudaErrors(cudaMallocHost(&x, sizeof(float) * N)); + rhs = (float *)malloc(sizeof(float) * N); + + for (int i = 0; i < N; i++) { + rhs[i] = 1.0; + x[i] = 0.0; } - } - I[N] = nz; -} + /* Get handle to the CUBLAS context */ + cublasHandle_t cublasHandle = 0; + cublasStatus_t cublasStatus; + cublasStatus = cublasCreate(&cublasHandle); -__global__ void initVectors(float *rhs, float *x, int N) { - size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + checkCudaErrors(cublasStatus); - for (size_t i = gid; i < N; i += gridDim.x * blockDim.x) { - rhs[i] = 1.0; - x[i] = 0.0; - } -} + /* Get handle to the CUSPARSE context */ + cusparseHandle_t cusparseHandle = 0; + cusparseStatus_t cusparseStatus; + cusparseStatus = cusparseCreate(&cusparseHandle); -__global__ void r1_div_x(float *r1, float *r0, float *b) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - if (gid == 0) { - b[0] = r1[0] / r0[0]; - } -} + checkCudaErrors(cusparseStatus); -__global__ void a_minus(float *a, float *na) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - if (gid == 0) { - na[0] = -(a[0]); - } -} + checkCudaErrors(cudaStreamCreate(&stream1)); -int main(int argc, char **argv) { - int N = 0, nz = 0, *I = NULL, *J = NULL; - float *val = NULL; - const float tol = 1e-5f; - const int max_iter = 10000; - float *x; - float *rhs; - float r1; + checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float))); - int *d_col, *d_row; - float *d_val, *d_x; - float *d_r, *d_p, *d_Ax; - int k; - float alpha, beta, alpham1; + float *d_r1, *d_r0, *d_dot, *d_a, *d_na, *d_b; + checkCudaErrors(cudaMalloc((void **)&d_r1, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_r0, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_dot, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_a, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_na, sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float))); - cudaStream_t stream1, streamForGraph; + /* Wrap raw data into cuSPARSE generic API objects */ + cusparseSpMatDescr_t matA = NULL; + checkCudaErrors(cusparseCreateCsr(&matA, + N, + N, + nz, + d_row, + d_col, + d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + cusparseDnVecDescr_t vecx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); + cusparseDnVecDescr_t vecp = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecp, N, d_p, CUDA_R_32F)); + cusparseDnVecDescr_t vecAx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecAx, N, d_Ax, CUDA_R_32F)); - // This will pick the best possible CUDA capable device - cudaDeviceProp deviceProp; - int devID = findCudaDevice(argc, (const char **)argv); + /* Allocate workspace for cuSPARSE */ + size_t bufferSize = 0; + checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecx, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + void *buffer = NULL; 
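
NOTE: This sample allocates I, J, val, and x with cudaMallocHost rather than malloc because the setup copies are issued with cudaMemcpyAsync on stream1; host-to-device copies only overlap and order correctly with other stream work when the host buffer is pinned, otherwise the transfer degrades to an effectively synchronous copy. A hedged sketch of the pairing (h_rows is an illustrative name, not from the sample):

    int *h_rows = NULL;                              // pinned host buffer
    checkCudaErrors(cudaMallocHost(&h_rows, (N + 1) * sizeof(int)));
    checkCudaErrors(cudaMemcpyAsync(d_row, h_rows, (N + 1) * sizeof(int),
                                    cudaMemcpyHostToDevice, stream1));
    checkCudaErrors(cudaStreamSynchronize(stream1)); // before reuse or free
    checkCudaErrors(cudaFreeHost(h_rows));           // not free()
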
+ checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - if (devID < 0) { - printf("exiting...\n"); - exit(EXIT_SUCCESS); - } + cusparseMatDescr_t descr = 0; + checkCudaErrors(cusparseCreateMatDescr(&descr)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + checkCudaErrors(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + checkCudaErrors(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - // Statistics about the GPU device - printf( - "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", - deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); + int numBlocks = 0, blockSize = 0; + checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, initVectors)); - /* Generate a random tridiagonal symmetric matrix in CSR format */ - N = 1048576; - nz = (N - 2) * 3 + 4; - checkCudaErrors(cudaMallocHost(&I, sizeof(int) * (N + 1))); - checkCudaErrors(cudaMallocHost(&J, sizeof(int) * nz)); - checkCudaErrors(cudaMallocHost(&val, sizeof(float) * nz)); - genTridiag(I, J, val, N, nz); + checkCudaErrors(cudaMemcpyAsync(d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice, stream1)); - checkCudaErrors(cudaMallocHost(&x, sizeof(float) * N)); - rhs = (float *)malloc(sizeof(float) * N); + initVectors<<<numBlocks, blockSize, 0, stream1>>>(d_r, d_x, N); - for (int i = 0; i < N; i++) { - rhs[i] = 1.0; - x[i] = 0.0; - } + alpha = 1.0; + alpham1 = -1.0; + beta = 0.0; - /* Get handle to the CUBLAS context */ - cublasHandle_t cublasHandle = 0; - cublasStatus_t cublasStatus; - cublasStatus = cublasCreate(&cublasHandle); + checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecx, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); - checkCudaErrors(cublasStatus); + checkCudaErrors(cublasSetStream(cublasHandle, stream1)); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1)); - /* Get handle to the CUSPARSE context */ - cusparseHandle_t cusparseHandle = 0; - cusparseStatus_t cusparseStatus; - cusparseStatus = cusparseCreate(&cusparseHandle); + checkCudaErrors(cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE)); + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); - checkCudaErrors(cusparseStatus); + k = 1; + // First Iteration when k=1 starts + checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecp, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); - checkCudaErrors(cudaStreamCreate(&stream1)); - - checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float))); - - float *d_r1, *d_r0, *d_dot, *d_a, *d_na, *d_b; - checkCudaErrors(cudaMalloc((void **)&d_r1, sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_r0, sizeof(float))); - checkCudaErrors(cudaMalloc((void
**)&d_dot, sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_a, sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_na, sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float))); - - /* Wrap raw data into cuSPARSE generic API objects */ - cusparseSpMatDescr_t matA = NULL; - checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - cusparseDnVecDescr_t vecx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); - cusparseDnVecDescr_t vecp = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecp, N, d_p, CUDA_R_32F)); - cusparseDnVecDescr_t vecAx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecAx, N, d_Ax, CUDA_R_32F)); - - /* Allocate workspace for cuSPARSE */ - size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - - cusparseMatDescr_t descr = 0; - checkCudaErrors(cusparseCreateMatDescr(&descr)); - - checkCudaErrors(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - checkCudaErrors(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - - int numBlocks = 0, blockSize = 0; - checkCudaErrors( - cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, initVectors)); - - checkCudaErrors(cudaMemcpyAsync(d_col, J, nz * sizeof(int), - cudaMemcpyHostToDevice, stream1)); - checkCudaErrors(cudaMemcpyAsync(d_row, I, (N + 1) * sizeof(int), - cudaMemcpyHostToDevice, stream1)); - checkCudaErrors(cudaMemcpyAsync(d_val, val, nz * sizeof(float), - cudaMemcpyHostToDevice, stream1)); - - initVectors<<<numBlocks, blockSize, 0, stream1>>>(d_r, d_x, N); - - alpha = 1.0; - alpham1 = -1.0; - beta = 0.0; - - checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors(cublasSetStream(cublasHandle, stream1)); - checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1)); - - checkCudaErrors( - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE)); - checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); - - k = 1; - // First Iteration when k=1 starts - checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); - - r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); - - checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); - - a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); - - checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); - - checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), - cudaMemcpyDeviceToDevice, stream1)); - - checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); - - checkCudaErrors(cudaMemcpyAsync(&r1, d_r1, sizeof(float), - cudaMemcpyDeviceToHost, stream1)); - checkCudaErrors(cudaStreamSynchronize(stream1)); - printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); - // First Iteration when k=1 ends - k++; - -#if WITH_GRAPH - cudaGraph_t initGraph; - checkCudaErrors(cudaStreamCreate(&streamForGraph)); -
checkCudaErrors(cublasSetStream(cublasHandle, stream1)); - checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); - checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal)); - - r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); - checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1)); - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); - checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); - - checkCudaErrors( - cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1)); - checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); - - r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); - - checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); - - a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); - - checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); - - checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), - cudaMemcpyDeviceToDevice, stream1)); - checkCudaErrors(cudaMemsetAsync(d_r1, 0, sizeof(float), stream1)); - - checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); - - checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float), - cudaMemcpyDeviceToHost, stream1)); - - checkCudaErrors(cudaStreamEndCapture(stream1, &initGraph)); - cudaGraphExec_t graphExec; - checkCudaErrors(cudaGraphInstantiate(&graphExec, initGraph, NULL, NULL, 0)); -#endif - - checkCudaErrors(cublasSetStream(cublasHandle, stream1)); - checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); - - while (r1 > tol * tol && k <= max_iter) { -#if WITH_GRAPH - checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); -#else - r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); - checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1)); - - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); - checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); - - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); @@ -344,81 +288,173 @@ int main(int argc, char **argv) { checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); - checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), - cudaMemcpyDeviceToDevice, stream1)); + checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), cudaMemcpyDeviceToDevice, stream1)); checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); - checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float), - cudaMemcpyDeviceToHost, stream1)); + + checkCudaErrors(cudaMemcpyAsync(&r1, d_r1, sizeof(float), cudaMemcpyDeviceToHost, stream1)); + checkCudaErrors(cudaStreamSynchronize(stream1)); + printf("iteration = %3d, residual = 
%e\n", k, sqrt(r1)); + // First Iteration when k=1 ends + k++; + +#if WITH_GRAPH + cudaGraph_t initGraph; + checkCudaErrors(cudaStreamCreate(&streamForGraph)); + checkCudaErrors(cublasSetStream(cublasHandle, stream1)); + checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); + checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal)); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1)); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + + checkCudaErrors(cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecp, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1)); + checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); + + a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); + + checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), cudaMemcpyDeviceToDevice, stream1)); + checkCudaErrors(cudaMemsetAsync(d_r1, 0, sizeof(float), stream1)); + + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); + + checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float), cudaMemcpyDeviceToHost, stream1)); + + checkCudaErrors(cudaStreamEndCapture(stream1, &initGraph)); + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, initGraph, NULL, NULL, 0)); +#endif + + checkCudaErrors(cublasSetStream(cublasHandle, stream1)); + checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); + + while (r1 > tol * tol && k <= max_iter) { +#if WITH_GRAPH + checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); +#else + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1)); + + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); + + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecp, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); + checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); + + r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a); + + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1)); + + a_minus<<<1, 1, 0, stream1>>>(d_a, d_na); + checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1)); + + checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float), cudaMemcpyDeviceToDevice, stream1)); + + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1)); + checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float), cudaMemcpyDeviceToHost, stream1)); + checkCudaErrors(cudaStreamSynchronize(stream1)); +#endif + printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; + } + +#if 
WITH_GRAPH + checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost, streamForGraph)); + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); +#else + checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost, stream1)); checkCudaErrors(cudaStreamSynchronize(stream1)); #endif - printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); - k++; - } -#if WITH_GRAPH - checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float), - cudaMemcpyDeviceToHost, streamForGraph)); - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); -#else - checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float), - cudaMemcpyDeviceToHost, stream1)); - checkCudaErrors(cudaStreamSynchronize(stream1)); -#endif + float rsum, diff, err = 0.0; - float rsum, diff, err = 0.0; + for (int i = 0; i < N; i++) { + rsum = 0.0; - for (int i = 0; i < N; i++) { - rsum = 0.0; + for (int j = I[i]; j < I[i + 1]; j++) { + rsum += val[j] * x[J[j]]; + } - for (int j = I[i]; j < I[i + 1]; j++) { - rsum += val[j] * x[J[j]]; + diff = fabs(rsum - rhs[i]); + + if (diff > err) { + err = diff; + } } - diff = fabs(rsum - rhs[i]); - - if (diff > err) { - err = diff; - } - } - #if WITH_GRAPH - checkCudaErrors(cudaGraphExecDestroy(graphExec)); - checkCudaErrors(cudaGraphDestroy(initGraph)); - checkCudaErrors(cudaStreamDestroy(streamForGraph)); + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphDestroy(initGraph)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); #endif - checkCudaErrors(cudaStreamDestroy(stream1)); - cusparseDestroy(cusparseHandle); - cublasDestroy(cublasHandle); + checkCudaErrors(cudaStreamDestroy(stream1)); + cusparseDestroy(cusparseHandle); + cublasDestroy(cublasHandle); - if (matA) { - checkCudaErrors(cusparseDestroySpMat(matA)); - } - if (vecx) { - checkCudaErrors(cusparseDestroyDnVec(vecx)); - } - if (vecAx) { - checkCudaErrors(cusparseDestroyDnVec(vecAx)); - } - if (vecp) { - checkCudaErrors(cusparseDestroyDnVec(vecp)); - } + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + if (vecp) { + checkCudaErrors(cusparseDestroyDnVec(vecp)); + } - checkCudaErrors(cudaFreeHost(I)); - checkCudaErrors(cudaFreeHost(J)); - checkCudaErrors(cudaFreeHost(val)); - checkCudaErrors(cudaFreeHost(x)); - free(rhs); - checkCudaErrors(cudaFree(d_col)); - checkCudaErrors(cudaFree(d_row)); - checkCudaErrors(cudaFree(d_val)); - checkCudaErrors(cudaFree(d_x)); - checkCudaErrors(cudaFree(d_r)); - checkCudaErrors(cudaFree(d_p)); - checkCudaErrors(cudaFree(d_Ax)); + checkCudaErrors(cudaFreeHost(I)); + checkCudaErrors(cudaFreeHost(J)); + checkCudaErrors(cudaFreeHost(val)); + checkCudaErrors(cudaFreeHost(x)); + free(rhs); + checkCudaErrors(cudaFree(d_col)); + checkCudaErrors(cudaFree(d_row)); + checkCudaErrors(cudaFree(d_val)); + checkCudaErrors(cudaFree(d_x)); + checkCudaErrors(cudaFree(d_r)); + checkCudaErrors(cudaFree(d_p)); + checkCudaErrors(cudaFree(d_Ax)); - printf("Test Summary: Error amount = %f\n", err); - exit((k <= max_iter) ? 0 : 1); + printf("Test Summary: Error amount = %f\n", err); + exit((k <= max_iter) ? 
0 : 1); } diff --git a/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu b/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu index 9d495245..879dfecf 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu +++ b/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu @@ -32,57 +32,57 @@ */ // includes, system +#include #include #include #include -#include - // Utilities and system includes -#include <helper_cuda.h> // helper function CUDA error checking and initialization -#include <helper_functions.h> // helper for shared functions common to CUDA Samples - #include <cooperative_groups.h> #include <cooperative_groups/reduce.h> +#include <helper_cuda.h> // helper function CUDA error checking and initialization +#include <helper_functions.h> // helper for shared functions common to CUDA Samples namespace cg = cooperative_groups; const char *sSDKname = "conjugateGradientMultiBlockCG"; #define ENABLE_CPU_DEBUG_CODE 0 -#define THREADS_PER_BLOCK 512 +#define THREADS_PER_BLOCK 512 /* genTridiag: generate a random tridiagonal symmetric matrix */ -void genTridiag(int *I, int *J, float *val, int N, int nz) { - I[0] = 0, J[0] = 0, J[1] = 1; - val[0] = static_cast<float>(rand()) / RAND_MAX + 10.0f; - val[1] = static_cast<float>(rand()) / RAND_MAX; - int start; +void genTridiag(int *I, int *J, float *val, int N, int nz) +{ + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = static_cast<float>(rand()) / RAND_MAX + 10.0f; + val[1] = static_cast<float>(rand()) / RAND_MAX; + int start; - for (int i = 1; i < N; i++) { - if (i > 1) { - I[i] = I[i - 1] + 3; - } else { - I[1] = 2; + for (int i = 1; i < N; i++) { + if (i > 1) { + I[i] = I[i - 1] + 3; + } + else { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = static_cast<float>(rand()) / RAND_MAX + 10.0f; + + if (i < N - 1) { + val[start + 2] = static_cast<float>(rand()) / RAND_MAX; + } } - start = (i - 1) * 3 + 2; - J[start] = i - 1; - J[start + 1] = i; - - if (i < N - 1) { - J[start + 2] = i + 1; - } - - val[start] = val[start - 1]; - val[start + 1] = static_cast<float>(rand()) / RAND_MAX + 10.0f; - - if (i < N - 1) { - val[start + 2] = static_cast<float>(rand()) / RAND_MAX; - } - } - - I[N] = nz; + I[N] = nz; } // I - contains location of the given non-zero element in the row of the matrix @@ -90,213 +90,199 @@ void genTridiag(int *I, int *J, float *val, int N, int nz) { // matrix val - contains values of the given non-zero elements of the matrix // inputVecX - input vector to be multiplied // outputVecY - resultant vector -void cpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha, - float *inputVecX, float *outputVecY) { - for (int i = 0; i < num_rows; i++) { - int num_elems_this_row = I[i + 1] - I[i]; +void cpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha, float *inputVecX, float *outputVecY) +{ + for (int i = 0; i < num_rows; i++) { + int num_elems_this_row = I[i + 1] - I[i]; - float output = 0.0; - for (int j = 0; j < num_elems_this_row; j++) { - output += alpha * val[I[i] + j] * inputVecX[J[I[i] + j]]; - } - outputVecY[i] = output; - } - - return; -} - -double dotProduct(float *vecA, float *vecB, int size) { - double result = 0.0; - - for (int i = 0; i < size; i++) { - result = result + (vecA[i] * vecB[i]); - } - - return result; -} - -void scaleVector(float *vec, float alpha, int size) { - for (int i = 0; i < size; i++) { - vec[i] = alpha * vec[i]; - } -} - -void saxpy(float *x, float *y, float a, int size) { - for
(int i = 0; i < size; i++) { - y[i] = a * x[i] + y[i]; - } -} - -void cpuConjugateGrad(int *I, int *J, float *val, float *x, float *Ax, float *p, - float *r, int nnz, int N, float tol) { - int max_iter = 10000; - - float alpha = 1.0; - float alpham1 = -1.0; - float r0 = 0.0, b, a, na; - - cpuSpMV(I, J, val, nnz, N, alpha, x, Ax); - saxpy(Ax, r, alpham1, N); - - float r1 = dotProduct(r, r, N); - - int k = 1; - - while (r1 > tol * tol && k <= max_iter) { - if (k > 1) { - b = r1 / r0; - scaleVector(p, b, N); - - saxpy(r, p, alpha, N); - } else { - for (int i = 0; i < N; i++) p[i] = r[i]; + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) { + output += alpha * val[I[i] + j] * inputVecX[J[I[i] + j]]; + } + outputVecY[i] = output; } - cpuSpMV(I, J, val, nnz, N, alpha, p, Ax); - - float dot = dotProduct(p, Ax, N); - a = r1 / dot; - - saxpy(p, x, a, N); - na = -a; - saxpy(Ax, r, na, N); - - r0 = r1; - r1 = dotProduct(r, r, N); - - printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1)); - k++; - } + return; } -__device__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, - float alpha, float *inputVecX, float *outputVecY, - cg::thread_block &cta, const cg::grid_group &grid) { - for (int i = grid.thread_rank(); i < num_rows; i += grid.size()) { - int row_elem = I[i]; - int next_row_elem = I[i + 1]; - int num_elems_this_row = next_row_elem - row_elem; +double dotProduct(float *vecA, float *vecB, int size) +{ + double result = 0.0; - float output = 0.0; - for (int j = 0; j < num_elems_this_row; j++) { - // I or J or val arrays - can be put in shared memory - // as the access is random and reused in next calls of gpuSpMV function. - output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]]; + for (int i = 0; i < size; i++) { + result = result + (vecA[i] * vecB[i]); } - outputVecY[i] = output; - } + return result; } -__device__ void gpuSaxpy(float *x, float *y, float a, int size, - const cg::grid_group &grid) { - for (int i = grid.thread_rank(); i < size; i += grid.size()) { - y[i] = a * x[i] + y[i]; - } +void scaleVector(float *vec, float alpha, int size) +{ + for (int i = 0; i < size; i++) { + vec[i] = alpha * vec[i]; + } } -__device__ void gpuDotProduct(float *vecA, float *vecB, double *result, - int size, const cg::thread_block &cta, - const cg::grid_group &grid) { - extern __shared__ double tmp[]; +void saxpy(float *x, float *y, float a, int size) +{ + for (int i = 0; i < size; i++) { + y[i] = a * x[i] + y[i]; + } +} - double temp_sum = 0.0; - for (int i = grid.thread_rank(); i < size; i += grid.size()) { - temp_sum += static_cast(vecA[i] * vecB[i]); - } +void cpuConjugateGrad(int *I, int *J, float *val, float *x, float *Ax, float *p, float *r, int nnz, int N, float tol) +{ + int max_iter = 10000; - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + float alpha = 1.0; + float alpham1 = -1.0; + float r0 = 0.0, b, a, na; - temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + cpuSpMV(I, J, val, nnz, N, alpha, x, Ax); + saxpy(Ax, r, alpham1, N); - if (tile32.thread_rank() == 0) { - tmp[tile32.meta_group_rank()] = temp_sum; - } + float r1 = dotProduct(r, r, N); - cg::sync(cta); + int k = 1; - if (tile32.meta_group_rank() == 0) { - temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? 
tmp[tile32.thread_rank()] : 0.0; - temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); + while (r1 > tol * tol && k <= max_iter) { + if (k > 1) { + b = r1 / r0; + scaleVector(p, b, N); + + saxpy(r, p, alpha, N); + } + else { + for (int i = 0; i < N; i++) + p[i] = r[i]; + } + + cpuSpMV(I, J, val, nnz, N, alpha, p, Ax); + + float dot = dotProduct(p, Ax, N); + a = r1 / dot; + + saxpy(p, x, a, N); + na = -a; + saxpy(Ax, r, na, N); + + r0 = r1; + r1 = dotProduct(r, r, N); + + printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; + } +} + +__device__ void gpuSpMV(int *I, + int *J, + float *val, + int nnz, + int num_rows, + float alpha, + float *inputVecX, + float *outputVecY, + cg::thread_block &cta, + const cg::grid_group &grid) +{ + for (int i = grid.thread_rank(); i < num_rows; i += grid.size()) { + int row_elem = I[i]; + int next_row_elem = I[i + 1]; + int num_elems_this_row = next_row_elem - row_elem; + + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) { + // I or J or val arrays - can be put in shared memory + // as the access is random and reused in next calls of gpuSpMV function. + output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]]; + } + + outputVecY[i] = output; + } +} + +__device__ void gpuSaxpy(float *x, float *y, float a, int size, const cg::grid_group &grid) +{ + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + y[i] = a * x[i] + y[i]; + } +} + +__device__ void gpuDotProduct(float *vecA, + float *vecB, + double *result, + int size, + const cg::thread_block &cta, + const cg::grid_group &grid) +{ + extern __shared__ double tmp[]; + + double temp_sum = 0.0; + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + temp_sum += static_cast(vecA[i] * vecB[i]); + } + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); if (tile32.thread_rank() == 0) { - atomicAdd(result, temp_sum); - } - } -} - -__device__ void gpuCopyVector(float *srcA, float *destB, int size, - const cg::grid_group &grid) { - for (int i = grid.thread_rank(); i < size; i += grid.size()) { - destB[i] = srcA[i]; - } -} - -__device__ void gpuScaleVectorAndSaxpy(const float *x, float *y, float a, float scale, int size, - const cg::grid_group &grid) { - for (int i = grid.thread_rank(); i < size; i += grid.size()) { - y[i] = a * x[i] + scale * y[i]; - } -} - -extern "C" __global__ void gpuConjugateGradient(int *I, int *J, float *val, - float *x, float *Ax, float *p, - float *r, double *dot_result, - int nnz, int N, float tol) { - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid(); - - int max_iter = 10000; - - float alpha = 1.0; - float alpham1 = -1.0; - float r0 = 0.0, r1, b, a, na; - - gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, cta, grid); - - cg::sync(grid); - - gpuSaxpy(Ax, r, alpham1, N, grid); - - cg::sync(grid); - - gpuDotProduct(r, r, dot_result, N, cta, grid); - - cg::sync(grid); - - r1 = *dot_result; - - int k = 1; - while (r1 > tol * tol && k <= max_iter) { - if (k > 1) { - b = r1 / r0; - gpuScaleVectorAndSaxpy(r, p, alpha, b, N, grid); - } else { - gpuCopyVector(r, p, N, grid); + tmp[tile32.meta_group_rank()] = temp_sum; } - cg::sync(grid); + cg::sync(cta); - gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, cta, grid); + if (tile32.meta_group_rank() == 0) { + temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? 
tmp[tile32.thread_rank()] : 0.0; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus()); - if (threadIdx.x == 0 && blockIdx.x == 0) *dot_result = 0.0; + if (tile32.thread_rank() == 0) { + atomicAdd(result, temp_sum); + } + } +} + +__device__ void gpuCopyVector(float *srcA, float *destB, int size, const cg::grid_group &grid) +{ + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + destB[i] = srcA[i]; + } +} + +__device__ void +gpuScaleVectorAndSaxpy(const float *x, float *y, float a, float scale, int size, const cg::grid_group &grid) +{ + for (int i = grid.thread_rank(); i < size; i += grid.size()) { + y[i] = a * x[i] + scale * y[i]; + } +} + +extern "C" __global__ void gpuConjugateGradient(int *I, + int *J, + float *val, + float *x, + float *Ax, + float *p, + float *r, + double *dot_result, + int nnz, + int N, + float tol) +{ + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + + int max_iter = 10000; + + float alpha = 1.0; + float alpham1 = -1.0; + float r0 = 0.0, r1, b, a, na; + + gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, cta, grid); cg::sync(grid); - gpuDotProduct(p, Ax, dot_result, N, cta, grid); - - cg::sync(grid); - - a = r1 / *dot_result; - - gpuSaxpy(p, x, a, N, grid); - na = -a; - gpuSaxpy(Ax, r, na, N, grid); - - r0 = r1; - - cg::sync(grid); - if (threadIdx.x == 0 && blockIdx.x == 0) *dot_result = 0.0; + gpuSaxpy(Ax, r, alpham1, N, grid); cg::sync(grid); @@ -305,188 +291,237 @@ extern "C" __global__ void gpuConjugateGradient(int *I, int *J, float *val, cg::sync(grid); r1 = *dot_result; - k++; - } + + int k = 1; + while (r1 > tol * tol && k <= max_iter) { + if (k > 1) { + b = r1 / r0; + gpuScaleVectorAndSaxpy(r, p, alpha, b, N, grid); + } + else { + gpuCopyVector(r, p, N, grid); + } + + cg::sync(grid); + + gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, cta, grid); + + if (threadIdx.x == 0 && blockIdx.x == 0) + *dot_result = 0.0; + + cg::sync(grid); + + gpuDotProduct(p, Ax, dot_result, N, cta, grid); + + cg::sync(grid); + + a = r1 / *dot_result; + + gpuSaxpy(p, x, a, N, grid); + na = -a; + gpuSaxpy(Ax, r, na, N, grid); + + r0 = r1; + + cg::sync(grid); + if (threadIdx.x == 0 && blockIdx.x == 0) + *dot_result = 0.0; + + cg::sync(grid); + + gpuDotProduct(r, r, dot_result, N, cta, grid); + + cg::sync(grid); + + r1 = *dot_result; + k++; + } } -bool areAlmostEqual(float a, float b, float maxRelDiff) { - float diff = fabsf(a - b); - float abs_a = fabsf(a); - float abs_b = fabsf(b); - float largest = abs_a > abs_b ? abs_a : abs_b; +bool areAlmostEqual(float a, float b, float maxRelDiff) +{ + float diff = fabsf(a - b); + float abs_a = fabsf(a); + float abs_b = fabsf(b); + float largest = abs_a > abs_b ? 
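
NOTE: gpuDotProduct above is a two-level reduction: cg::reduce collapses each 32-thread tile, one partial per tile is staged in dynamic shared memory, tile 0 then reduces the partials, and a single atomicAdd per block publishes into the grid-wide accumulator (atomicAdd on double requires sm_60 or newer). Callers grid-sync before reading the result and zero it between uses. A condensed, hedged restatement, assuming namespace cg = cooperative_groups and the reduce header as in this file:

    __device__ void blockDotAdd(const float *a, const float *b, double *result,
                                int n, cg::thread_block cta, cg::grid_group grid)
    {
        extern __shared__ double tmp[]; // one slot per 32-thread tile
        double s = 0.0;
        for (int i = grid.thread_rank(); i < n; i += grid.size())
            s += static_cast<double>(a[i]) * b[i];

        cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);
        s = cg::reduce(tile, s, cg::plus<double>()); // intra-tile reduction
        if (tile.thread_rank() == 0)
            tmp[tile.meta_group_rank()] = s;
        cg::sync(cta);

        if (tile.meta_group_rank() == 0) {           // tile 0 combines partials
            s = tile.thread_rank() < tile.meta_group_size()
                    ? tmp[tile.thread_rank()] : 0.0;
            s = cg::reduce(tile, s, cg::plus<double>());
            if (tile.thread_rank() == 0)
                atomicAdd(result, s);
        }
    }
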
abs_a : abs_b; - if (diff <= largest * maxRelDiff) { - return true; - } else { - printf("maxRelDiff = %.8e\n", maxRelDiff); - printf( - "diff %.8e > largest * maxRelDiff %.8e therefore %.8e and %.8e are not " - "same\n", - diff, largest * maxRelDiff, a, b); - return false; - } + if (diff <= largest * maxRelDiff) { + return true; + } + else { + printf("maxRelDiff = %.8e\n", maxRelDiff); + printf("diff %.8e > largest * maxRelDiff %.8e therefore %.8e and %.8e are not " + "same\n", + diff, + largest * maxRelDiff, + a, + b); + return false; + } } -int main(int argc, char **argv) { - int N = 0, nz = 0, *I = NULL, *J = NULL; - float *val = NULL; - const float tol = 1e-5f; - float *x; - float *rhs; - float r1; - float *r, *p, *Ax; - cudaEvent_t start, stop; +int main(int argc, char **argv) +{ + int N = 0, nz = 0, *I = NULL, *J = NULL; + float *val = NULL; + const float tol = 1e-5f; + float *x; + float *rhs; + float r1; + float *r, *p, *Ax; + cudaEvent_t start, stop; - printf("Starting [%s]...\n", sSDKname); + printf("Starting [%s]...\n", sSDKname); - // This will pick the best possible CUDA capable device - cudaDeviceProp deviceProp; - int devID = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + // This will pick the best possible CUDA capable device + cudaDeviceProp deviceProp; + int devID = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - if (!deviceProp.managedMemory) { - // This sample requires being run on a device that supports Unified Memory - fprintf(stderr, "Unified Memory not supported on this device\n"); - exit(EXIT_WAIVED); - } - - // This sample requires being run on a device that supports Cooperative Kernel - // Launch - if (!deviceProp.cooperativeLaunch) { - printf( - "\nSelected GPU (%d) does not support Cooperative Kernel Launch, " - "Waiving the run\n", - devID); - exit(EXIT_WAIVED); - } - - // Statistics about the GPU device - printf( - "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", - deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); - - /* Generate a random tridiagonal symmetric matrix in CSR format */ - N = 1048576; - nz = (N - 2) * 3 + 4; - - cudaMallocManaged(reinterpret_cast<void **>(&I), sizeof(int) * (N + 1)); - cudaMallocManaged(reinterpret_cast<void **>(&J), sizeof(int) * nz); - cudaMallocManaged(reinterpret_cast<void **>(&val), sizeof(float) * nz); - - genTridiag(I, J, val, N, nz); - - cudaMallocManaged(reinterpret_cast<void **>(&x), sizeof(float) * N); - cudaMallocManaged(reinterpret_cast<void **>(&rhs), sizeof(float) * N); - - double *dot_result; - - cudaMallocManaged(reinterpret_cast<void **>(&dot_result), sizeof(double)); - - *dot_result = 0.0; - - // temp memory for CG - checkCudaErrors( - cudaMallocManaged(reinterpret_cast<void **>(&r), N * sizeof(float))); - checkCudaErrors( - cudaMallocManaged(reinterpret_cast<void **>(&p), N * sizeof(float))); - checkCudaErrors( - cudaMallocManaged(reinterpret_cast<void **>(&Ax), N * sizeof(float))); - - cudaDeviceSynchronize(); - - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - -#if ENABLE_CPU_DEBUG_CODE - float *Ax_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); - float *r_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); - float *p_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); - float *x_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); - - for (int i = 0; i < N; i++) { - r_cpu[i] = 1.0; - Ax_cpu[i] = x_cpu[i] = 0.0; - } - -#endif - - for (int i = 0; i < N; i++) { - r[i] = rhs[i] = 1.0; -
x[i] = 0.0; - } - - void *kernelArgs[] = { - (void *)&I, (void *)&J, (void *)&val, (void *)&x, - (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, - (void *)&nz, (void *)&N, (void *)&tol, - }; - - int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK/32) + 1); - int numBlocksPerSm = 0; - int numThreads = THREADS_PER_BLOCK; - - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, gpuConjugateGradient, numThreads, sMemSize)); - - int numSms = deviceProp.multiProcessorCount; - dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), - dimBlock(THREADS_PER_BLOCK, 1, 1); - checkCudaErrors(cudaEventRecord(start, 0)); - checkCudaErrors(cudaLaunchCooperativeKernel((void *)gpuConjugateGradient, - dimGrid, dimBlock, kernelArgs, - sMemSize, NULL)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaDeviceSynchronize()); - - float time; - checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); - - r1 = *dot_result; - - printf("GPU Final, residual = %e, kernel execution time = %f ms\n", sqrt(r1), - time); - -#if ENABLE_CPU_DEBUG_CODE - cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol); -#endif - - float rsum, diff, err = 0.0; - - for (int i = 0; i < N; i++) { - rsum = 0.0; - - for (int j = I[i]; j < I[i + 1]; j++) { - rsum += val[j] * x[J[j]]; + if (!deviceProp.managedMemory) { + // This sample requires being run on a device that supports Unified Memory + fprintf(stderr, "Unified Memory not supported on this device\n"); + exit(EXIT_WAIVED); } - diff = fabs(rsum - rhs[i]); - - if (diff > err) { - err = diff; + // This sample requires being run on a device that supports Cooperative Kernel + // Launch + if (!deviceProp.cooperativeLaunch) { + printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, " + "Waiving the run\n", + devID); + exit(EXIT_WAIVED); } - } - checkCudaErrors(cudaFree(I)); - checkCudaErrors(cudaFree(J)); - checkCudaErrors(cudaFree(val)); - checkCudaErrors(cudaFree(x)); - checkCudaErrors(cudaFree(rhs)); - checkCudaErrors(cudaFree(r)); - checkCudaErrors(cudaFree(p)); - checkCudaErrors(cudaFree(Ax)); - checkCudaErrors(cudaFree(dot_result)); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); + // Statistics about the GPU device + printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", + deviceProp.multiProcessorCount, + deviceProp.major, + deviceProp.minor); + + /* Generate a random tridiagonal symmetric matrix in CSR format */ + N = 1048576; + nz = (N - 2) * 3 + 4; + + cudaMallocManaged(reinterpret_cast<void **>(&I), sizeof(int) * (N + 1)); + cudaMallocManaged(reinterpret_cast<void **>(&J), sizeof(int) * nz); + cudaMallocManaged(reinterpret_cast<void **>(&val), sizeof(float) * nz); + + genTridiag(I, J, val, N, nz); + + cudaMallocManaged(reinterpret_cast<void **>(&x), sizeof(float) * N); + cudaMallocManaged(reinterpret_cast<void **>(&rhs), sizeof(float) * N); + + double *dot_result; + + cudaMallocManaged(reinterpret_cast<void **>(&dot_result), sizeof(double)); + + *dot_result = 0.0; + + // temp memory for CG + checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&r), N * sizeof(float))); + checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&p), N * sizeof(float))); + checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&Ax), N * sizeof(float))); + + cudaDeviceSynchronize(); + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); #if ENABLE_CPU_DEBUG_CODE - free(Ax_cpu); - free(r_cpu); - free(p_cpu); - free(x_cpu); + float *Ax_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); +
float *r_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); + float *p_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); + float *x_cpu = reinterpret_cast<float *>(malloc(sizeof(float) * N)); + + for (int i = 0; i < N; i++) { + r_cpu[i] = 1.0; + Ax_cpu[i] = x_cpu[i] = 0.0; + } + #endif - printf("Test Summary: Error amount = %f \n", err); - fprintf(stdout, "&&&& conjugateGradientMultiBlockCG %s\n", - (sqrt(r1) < tol) ? "PASSED" : "FAILED"); - exit((sqrt(r1) < tol) ? EXIT_SUCCESS : EXIT_FAILURE); + for (int i = 0; i < N; i++) { + r[i] = rhs[i] = 1.0; + x[i] = 0.0; + } + + void *kernelArgs[] = { + (void *)&I, + (void *)&J, + (void *)&val, + (void *)&x, + (void *)&Ax, + (void *)&p, + (void *)&r, + (void *)&dot_result, + (void *)&nz, + (void *)&N, + (void *)&tol, + }; + + int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK / 32) + 1); + int numBlocksPerSm = 0; + int numThreads = THREADS_PER_BLOCK; + + checkCudaErrors( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, gpuConjugateGradient, numThreads, sMemSize)); + + int numSms = deviceProp.multiProcessorCount; + dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1); + checkCudaErrors(cudaEventRecord(start, 0)); + checkCudaErrors( + cudaLaunchCooperativeKernel((void *)gpuConjugateGradient, dimGrid, dimBlock, kernelArgs, sMemSize, NULL)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + + float time; + checkCudaErrors(cudaEventElapsedTime(&time, start, stop)); + + r1 = *dot_result; + + printf("GPU Final, residual = %e, kernel execution time = %f ms\n", sqrt(r1), time); + +#if ENABLE_CPU_DEBUG_CODE + cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol); +#endif + + float rsum, diff, err = 0.0; + + for (int i = 0; i < N; i++) { + rsum = 0.0; + + for (int j = I[i]; j < I[i + 1]; j++) { + rsum += val[j] * x[J[j]]; + } + + diff = fabs(rsum - rhs[i]); + + if (diff > err) { + err = diff; + } + } + + checkCudaErrors(cudaFree(I)); + checkCudaErrors(cudaFree(J)); + checkCudaErrors(cudaFree(val)); + checkCudaErrors(cudaFree(x)); + checkCudaErrors(cudaFree(rhs)); + checkCudaErrors(cudaFree(r)); + checkCudaErrors(cudaFree(p)); + checkCudaErrors(cudaFree(Ax)); + checkCudaErrors(cudaFree(dot_result)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + +#if ENABLE_CPU_DEBUG_CODE + free(Ax_cpu); + free(r_cpu); + free(p_cpu); + free(x_cpu); +#endif + + printf("Test Summary: Error amount = %f \n", err); + fprintf(stdout, "&&&& conjugateGradientMultiBlockCG %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED"); + exit((sqrt(r1) < tol) ?
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index a83195b9..3b20c063 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -32,63 +32,63 @@ */ // includes, system +#include +#include +#include +#include #include #include #include -#include -#include -#include #include -#include - // Utilities and system includes -#include <helper_cuda.h> // helper function CUDA error checking and initialization -#include <helper_functions.h> // helper for shared functions common to CUDA Samples - #include <cooperative_groups.h> #include <cooperative_groups/reduce.h> +#include <helper_cuda.h> // helper function CUDA error checking and initialization +#include <helper_functions.h> // helper for shared functions common to CUDA Samples namespace cg = cooperative_groups; const char *sSDKname = "conjugateGradientMultiDeviceCG"; #define ENABLE_CPU_DEBUG_CODE 0 -#define THREADS_PER_BLOCK 512 +#define THREADS_PER_BLOCK 512 __device__ double grid_dot_result = 0.0; /* genTridiag: generate a random tridiagonal symmetric matrix */ -void genTridiag(int *I, int *J, float *val, int N, int nz) { - I[0] = 0, J[0] = 0, J[1] = 1; - val[0] = (float)rand() / RAND_MAX + 10.0f; - val[1] = (float)rand() / RAND_MAX; - int start; +void genTridiag(int *I, int *J, float *val, int N, int nz) +{ + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = (float)rand() / RAND_MAX + 10.0f; + val[1] = (float)rand() / RAND_MAX; + int start; - for (int i = 1; i < N; i++) { - if (i > 1) { - I[i] = I[i - 1] + 3; - } else { - I[1] = 2; + for (int i = 1; i < N; i++) { + if (i > 1) { + I[i] = I[i - 1] + 3; + } + else { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = (float)rand() / RAND_MAX + 10.0f; + + if (i < N - 1) { + val[start + 2] = (float)rand() / RAND_MAX; + } } - start = (i - 1) * 3 + 2; - J[start] = i - 1; - J[start + 1] = i; - - if (i < N - 1) { - J[start + 2] = i + 1; - } - - val[start] = val[start - 1]; - val[start + 1] = (float)rand() / RAND_MAX + 10.0f; - - if (i < N - 1) { - val[start + 2] = (float)rand() / RAND_MAX; - } - } - - I[N] = nz; + I[N] = nz; } // I - contains location of the given non-zero element in the row of the matrix @@ -96,698 +96,703 @@ void genTridiag(int *I, int *J, float *val, int N, int nz) { // matrix val - contains values of the given non-zero elements of the matrix // inputVecX - input vector to be multiplied // outputVecY - resultant vector -void cpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha, - float *inputVecX, float *outputVecY) { - for (int i = 0; i < num_rows; i++) { - int num_elems_this_row = I[i + 1] - I[i]; +void cpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha, float *inputVecX, float *outputVecY) +{ + for (int i = 0; i < num_rows; i++) { + int num_elems_this_row = I[i + 1] - I[i]; - float output = 0.0; - for (int j = 0; j < num_elems_this_row; j++) { - output += alpha * val[I[i] + j] * inputVecX[J[I[i] + j]]; - } - outputVecY[i] = output; - } - - return; -} - -float dotProduct(float *vecA, float *vecB, int size) { - float result = 0.0; - - for (int i = 0; i < size; i++) { - result = result + (vecA[i] * vecB[i]); - } - - return result; -} - -void scaleVector(float *vec, float alpha, int size) { - for (int i = 0; i < size; i++) { -
vec[i] = alpha * vec[i]; - } -} - -void saxpy(float *x, float *y, float a, int size) { - for (int i = 0; i < size; i++) { - y[i] = a * x[i] + y[i]; - } -} - -void cpuConjugateGrad(int *I, int *J, float *val, float *x, float *Ax, float *p, - float *r, int nnz, int N, float tol) { - int max_iter = 10000; - - float alpha = 1.0; - float alpham1 = -1.0; - float r0 = 0.0, b, a, na; - - cpuSpMV(I, J, val, nnz, N, alpha, x, Ax); - saxpy(Ax, r, alpham1, N); - - float r1 = dotProduct(r, r, N); - - int k = 1; - - while (r1 > tol * tol && k <= max_iter) { - if (k > 1) { - b = r1 / r0; - scaleVector(p, b, N); - - saxpy(r, p, alpha, N); - } else { - for (int i = 0; i < N; i++) p[i] = r[i]; + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) { + output += alpha * val[I[i] + j] * inputVecX[J[I[i] + j]]; + } + outputVecY[i] = output; } - cpuSpMV(I, J, val, nnz, N, alpha, p, Ax); + return; +} - float dot = dotProduct(p, Ax, N); - a = r1 / dot; +float dotProduct(float *vecA, float *vecB, int size) +{ + float result = 0.0; - saxpy(p, x, a, N); - na = -a; - saxpy(Ax, r, na, N); + for (int i = 0; i < size; i++) { + result = result + (vecA[i] * vecB[i]); + } - r0 = r1; - r1 = dotProduct(r, r, N); + return result; +} - printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1)); - k++; - } +void scaleVector(float *vec, float alpha, int size) +{ + for (int i = 0; i < size; i++) { + vec[i] = alpha * vec[i]; + } +} + +void saxpy(float *x, float *y, float a, int size) +{ + for (int i = 0; i < size; i++) { + y[i] = a * x[i] + y[i]; + } +} + +void cpuConjugateGrad(int *I, int *J, float *val, float *x, float *Ax, float *p, float *r, int nnz, int N, float tol) +{ + int max_iter = 10000; + + float alpha = 1.0; + float alpham1 = -1.0; + float r0 = 0.0, b, a, na; + + cpuSpMV(I, J, val, nnz, N, alpha, x, Ax); + saxpy(Ax, r, alpham1, N); + + float r1 = dotProduct(r, r, N); + + int k = 1; + + while (r1 > tol * tol && k <= max_iter) { + if (k > 1) { + b = r1 / r0; + scaleVector(p, b, N); + + saxpy(r, p, alpha, N); + } + else { + for (int i = 0; i < N; i++) + p[i] = r[i]; + } + + cpuSpMV(I, J, val, nnz, N, alpha, p, Ax); + + float dot = dotProduct(p, Ax, N); + a = r1 / dot; + + saxpy(p, x, a, N); + na = -a; + saxpy(Ax, r, na, N); + + r0 = r1; + r1 = dotProduct(r, r, N); + + printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; + } } // Data filled on CPU needed for MultiGPU operations. -struct MultiDeviceData { - unsigned char *hostMemoryArrivedList; - unsigned int numDevices; - unsigned int deviceRank; +struct MultiDeviceData +{ + unsigned char *hostMemoryArrivedList; + unsigned int numDevices; + unsigned int deviceRank; }; // Class used for coordination of multiple devices. 
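A note before the class: cg::grid_group::sync() only orders threads within one GPU's grid, so cross-device synchronization needs the flag handshake this class implements over a shared list of arrival bytes. As a minimal sketch of that handshake for two ranks, with illustrative names only (arrived stands in for one byte of the arrival list, and the acquire/release helpers defined below are simplified to volatile accesses):

__device__ void two_rank_handshake(volatile unsigned char *arrived, int rank)
{
    if (rank == 0) {            // leader: wait for the peer, then release it
        while (*arrived == 0)   // spin until the peer announces itself
            ;
        *arrived = 0;           // clear the flag to release the peer
        __threadfence_system(); // make the release visible system-wide
    }
    else {                      // peer: announce arrival, wait for release
        *arrived = 1;
        while (*arrived == 1)   // spin until the leader clears the flag
            ;
    }
}

PeerGroup::sync() below generalizes this to numDevices ranks and brackets the handshake with grid.sync() on every GPU.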
-class PeerGroup { - const MultiDeviceData &data; - const cg::grid_group &grid; +class PeerGroup +{ + const MultiDeviceData &data; + const cg::grid_group &grid; - __device__ unsigned char load_arrived(unsigned char *arrived) const { + __device__ unsigned char load_arrived(unsigned char *arrived) const + { #if __CUDA_ARCH__ < 700 - return *(volatile unsigned char *)arrived; + return *(volatile unsigned char *)arrived; #else - unsigned int result; - asm volatile("ld.acquire.sys.global.u8 %0, [%1];" - : "=r"(result) - : "l"(arrived) - : "memory"); - return result; + unsigned int result; + asm volatile("ld.acquire.sys.global.u8 %0, [%1];" : "=r"(result) : "l"(arrived) : "memory"); + return result; #endif - } - - __device__ void store_arrived(unsigned char *arrived, - unsigned char val) const { -#if __CUDA_ARCH__ < 700 - *(volatile unsigned char *)arrived = val; -#else - unsigned int reg_val = val; - asm volatile( - "st.release.sys.global.u8 [%1], %0;" ::"r"(reg_val), "l"(arrived) - : "memory"); - - // Avoids compiler warnings from unused variable val. - (void)(reg_val = reg_val); -#endif - } - - public: - __device__ PeerGroup(const MultiDeviceData &data, const cg::grid_group &grid) - : data(data), grid(grid){}; - - __device__ unsigned int size() const { return data.numDevices * grid.size(); } - - __device__ unsigned int thread_rank() const { - return data.deviceRank * grid.size() + grid.thread_rank(); - } - - __device__ void sync() const { - grid.sync(); - - // One thread from each grid participates in the sync. - if (grid.thread_rank() == 0) { - if (data.deviceRank == 0) { - // Leader grid waits for others to join and then releases them. - // Other GPUs can arrive in any order, so the leader has to wait for - // all others. - for (int i = 0; i < data.numDevices - 1; i++) { - while (load_arrived(&data.hostMemoryArrivedList[i]) == 0) - ; - } - for (int i = 0; i < data.numDevices - 1; i++) { - store_arrived(&data.hostMemoryArrivedList[i], 0); - } - __threadfence_system(); - } else { - // Other grids note their arrival and wait to be released. - store_arrived(&data.hostMemoryArrivedList[data.deviceRank - 1], 1); - while (load_arrived(&data.hostMemoryArrivedList[data.deviceRank - 1]) == - 1) - ; - } } - grid.sync(); - } + __device__ void store_arrived(unsigned char *arrived, unsigned char val) const + { +#if __CUDA_ARCH__ < 700 + *(volatile unsigned char *)arrived = val; +#else + unsigned int reg_val = val; + asm volatile("st.release.sys.global.u8 [%1], %0;" ::"r"(reg_val), "l"(arrived) : "memory"); + + // Avoids compiler warnings from unused variable val. + (void)(reg_val = reg_val); +#endif + } + +public: + __device__ PeerGroup(const MultiDeviceData &data, const cg::grid_group &grid) + : data(data) + , grid(grid) {}; + + __device__ unsigned int size() const { return data.numDevices * grid.size(); } + + __device__ unsigned int thread_rank() const { return data.deviceRank * grid.size() + grid.thread_rank(); } + + __device__ void sync() const + { + grid.sync(); + + // One thread from each grid participates in the sync. + if (grid.thread_rank() == 0) { + if (data.deviceRank == 0) { + // Leader grid waits for others to join and then releases them. + // Other GPUs can arrive in any order, so the leader has to wait for + // all others.
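+                // The arrival flags live in pinned, portable host memory that
+                // every participating GPU maps; load_arrived()/store_arrived()
+                // above give these spin loops acquire/release semantics on
+                // sm_70+ (plain volatile accesses otherwise), so a grid's
+                // writes made before it arrives are visible to the leader once
+                // its flag reads as set.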
+ for (int i = 0; i < data.numDevices - 1; i++) { + while (load_arrived(&data.hostMemoryArrivedList[i]) == 0) + ; + } + for (int i = 0; i < data.numDevices - 1; i++) { + store_arrived(&data.hostMemoryArrivedList[i], 0); + } + __threadfence_system(); + } + else { + // Other grids note their arrival and wait to be released. + store_arrived(&data.hostMemoryArrivedList[data.deviceRank - 1], 1); + while (load_arrived(&data.hostMemoryArrivedList[data.deviceRank - 1]) == 1) + ; + } + } + + grid.sync(); + } }; -__device__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, - float alpha, float *inputVecX, float *outputVecY, - const PeerGroup &peer_group) { - for (int i = peer_group.thread_rank(); i < num_rows; i += peer_group.size()) { - int row_elem = I[i]; - int next_row_elem = I[i + 1]; - int num_elems_this_row = next_row_elem - row_elem; +__device__ void gpuSpMV(int *I, + int *J, + float *val, + int nnz, + int num_rows, + float alpha, + float *inputVecX, + float *outputVecY, + const PeerGroup &peer_group) +{ + for (int i = peer_group.thread_rank(); i < num_rows; i += peer_group.size()) { + int row_elem = I[i]; + int next_row_elem = I[i + 1]; + int num_elems_this_row = next_row_elem - row_elem; - float output = 0.0; - for (int j = 0; j < num_elems_this_row; j++) { - output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]]; + float output = 0.0; + for (int j = 0; j < num_elems_this_row; j++) { + output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]]; + } + + outputVecY[i] = output; + } +} + +__device__ void gpuSaxpy(float *x, float *y, float a, int size, const PeerGroup &peer_group) +{ + for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { + y[i] = a * x[i] + y[i]; + } +} + +__device__ void +gpuDotProduct(float *vecA, float *vecB, int size, const cg::thread_block &cta, const PeerGroup &peer_group) +{ + extern __shared__ double tmp[]; + + double temp_sum = 0.0; + + for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { + temp_sum += (double)(vecA[i] * vecB[i]); } - outputVecY[i] = output; - } -} + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); -__device__ void gpuSaxpy(float *x, float *y, float a, int size, - const PeerGroup &peer_group) { - for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { - y[i] = a * x[i] + y[i]; - } -} - -__device__ void gpuDotProduct(float *vecA, float *vecB, int size, - const cg::thread_block &cta, - const PeerGroup &peer_group) { - extern __shared__ double tmp[]; - - double temp_sum = 0.0; - - for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { - temp_sum += (double)(vecA[i] * vecB[i]); - } - - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - - temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>()); - - if (tile32.thread_rank() == 0) { - tmp[tile32.meta_group_rank()] = temp_sum; - } - - cg::sync(cta); - - if (tile32.meta_group_rank() == 0) { - temp_sum = tile32.thread_rank() < tile32.meta_group_size() - ?
tmp[tile32.thread_rank()] - : 0.0; temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>()); if (tile32.thread_rank() == 0) { - atomicAdd(&grid_dot_result, temp_sum); - } - } -} - -__device__ void gpuCopyVector(float *srcA, float *destB, int size, - const PeerGroup &peer_group) { - for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { - destB[i] = srcA[i]; - } -} - -__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, - int size, const PeerGroup &peer_group) { - for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { - y[i] = a * x[i] + scale * y[i]; - } -} - -extern "C" __global__ void multiGpuConjugateGradient( - int *I, int *J, float *val, float *x, float *Ax, float *p, float *r, - double *dot_result, int nnz, int N, float tol, - MultiDeviceData multi_device_data) { - cg::thread_block cta = cg::this_thread_block(); - cg::grid_group grid = cg::this_grid(); - PeerGroup peer_group(multi_device_data, grid); - - const int max_iter = 10000; - - float alpha = 1.0; - float alpham1 = -1.0; - float r0 = 0.0, r1, b, a, na; - - for (int i = peer_group.thread_rank(); i < N; i += peer_group.size()) { - r[i] = 1.0; - x[i] = 0.0; - } - - cg::sync(grid); - - gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, peer_group); - - cg::sync(grid); - - gpuSaxpy(Ax, r, alpham1, N, peer_group); - - cg::sync(grid); - - gpuDotProduct(r, r, N, cta, peer_group); - - cg::sync(grid); - - if (grid.thread_rank() == 0) { - atomicAdd_system(dot_result, grid_dot_result); - grid_dot_result = 0.0; - } - peer_group.sync(); - - r1 = *dot_result; - - int k = 1; - while (r1 > tol * tol && k <= max_iter) { - if (k > 1) { - b = r1 / r0; - gpuScaleVectorAndSaxpy(r, p, alpha, b, N, peer_group); - } else { - gpuCopyVector(r, p, N, peer_group); + tmp[tile32.meta_group_rank()] = temp_sum; } - peer_group.sync(); + cg::sync(cta); - gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, peer_group); + if (tile32.meta_group_rank() == 0) { + temp_sum = tile32.thread_rank() < tile32.meta_group_size() ?
tmp[tile32.thread_rank()] : 0.0; + temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>()); - if (peer_group.thread_rank() == 0) { - *dot_result = 0.0; + if (tile32.thread_rank() == 0) { + atomicAdd(&grid_dot_result, temp_sum); + } } - peer_group.sync(); +} - gpuDotProduct(p, Ax, N, cta, peer_group); +__device__ void gpuCopyVector(float *srcA, float *destB, int size, const PeerGroup &peer_group) +{ + for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { + destB[i] = srcA[i]; + } +} + +__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, int size, const PeerGroup &peer_group) +{ + for (int i = peer_group.thread_rank(); i < size; i += peer_group.size()) { + y[i] = a * x[i] + scale * y[i]; + } +} + +extern "C" __global__ void multiGpuConjugateGradient(int *I, + int *J, + float *val, + float *x, + float *Ax, + float *p, + float *r, + double *dot_result, + int nnz, + int N, + float tol, + MultiDeviceData multi_device_data) +{ + cg::thread_block cta = cg::this_thread_block(); + cg::grid_group grid = cg::this_grid(); + PeerGroup peer_group(multi_device_data, grid); + + const int max_iter = 10000; + + float alpha = 1.0; + float alpham1 = -1.0; + float r0 = 0.0, r1, b, a, na; + + for (int i = peer_group.thread_rank(); i < N; i += peer_group.size()) { + r[i] = 1.0; + x[i] = 0.0; + } cg::sync(grid); - if (grid.thread_rank() == 0) { - atomicAdd_system(dot_result, grid_dot_result); - grid_dot_result = 0.0; - } - peer_group.sync(); + gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, peer_group); - a = r1 / *dot_result; + cg::sync(grid); - gpuSaxpy(p, x, a, N, peer_group); + gpuSaxpy(Ax, r, alpham1, N, peer_group); - na = -a; - - gpuSaxpy(Ax, r, na, N, peer_group); - - r0 = r1; - - peer_group.sync(); - - if (peer_group.thread_rank() == 0) { - *dot_result = 0.0; - } - - peer_group.sync(); + cg::sync(grid); gpuDotProduct(r, r, N, cta, peer_group); cg::sync(grid); if (grid.thread_rank() == 0) { - atomicAdd_system(dot_result, grid_dot_result); - grid_dot_result = 0.0; + atomicAdd_system(dot_result, grid_dot_result); + grid_dot_result = 0.0; } peer_group.sync(); r1 = *dot_result; - k++; - } + + int k = 1; + while (r1 > tol * tol && k <= max_iter) { + if (k > 1) { + b = r1 / r0; + gpuScaleVectorAndSaxpy(r, p, alpha, b, N, peer_group); + } + else { + gpuCopyVector(r, p, N, peer_group); + } + + peer_group.sync(); + + gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, peer_group); + + if (peer_group.thread_rank() == 0) { + *dot_result = 0.0; + } + peer_group.sync(); + + gpuDotProduct(p, Ax, N, cta, peer_group); + + cg::sync(grid); + + if (grid.thread_rank() == 0) { + atomicAdd_system(dot_result, grid_dot_result); + grid_dot_result = 0.0; + } + peer_group.sync(); + + a = r1 / *dot_result; + + gpuSaxpy(p, x, a, N, peer_group); + + na = -a; + + gpuSaxpy(Ax, r, na, N, peer_group); + + r0 = r1; + + peer_group.sync(); + + if (peer_group.thread_rank() == 0) { + *dot_result = 0.0; + } + + peer_group.sync(); + + gpuDotProduct(r, r, N, cta, peer_group); + + cg::sync(grid); + + if (grid.thread_rank() == 0) { + atomicAdd_system(dot_result, grid_dot_result); + grid_dot_result = 0.0; + } + peer_group.sync(); + + r1 = *dot_result; + k++; + } } // Map of device version to device number -std::multimap<std::pair<int, int>, int> getIdenticalGPUs() { - int numGpus = 0; - checkCudaErrors(cudaGetDeviceCount(&numGpus)); +std::multimap<std::pair<int, int>, int> getIdenticalGPUs() +{ + int numGpus = 0; + checkCudaErrors(cudaGetDeviceCount(&numGpus)); - std::multimap<std::pair<int, int>, int> identicalGpus; + std::multimap<std::pair<int, int>, int> identicalGpus; - for (int i = 0; i <
numGpus; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); + for (int i = 0; i < numGpus; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); - // Filter unsupported devices - if (deviceProp.cooperativeLaunch && deviceProp.concurrentManagedAccess) { - identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), - i); + // Filter unsupported devices + if (deviceProp.cooperativeLaunch && deviceProp.concurrentManagedAccess) { + identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), i); + } + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", + i, + deviceProp.name, + deviceProp.major, + deviceProp.minor); } - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, - deviceProp.name, deviceProp.major, deviceProp.minor); - } - return identicalGpus; + return identicalGpus; } -int main(int argc, char **argv) { - constexpr size_t kNumGpusRequired = 2; - int N = 0, nz = 0, *I = NULL, *J = NULL; - float *val = NULL; - const float tol = 1e-5f; - float *x; - float rhs = 1.0; - float r1; - float *r, *p, *Ax; +int main(int argc, char **argv) +{ + constexpr size_t kNumGpusRequired = 2; + int N = 0, nz = 0, *I = NULL, *J = NULL; + float *val = NULL; + const float tol = 1e-5f; + float *x; + float rhs = 1.0; + float r1; + float *r, *p, *Ax; - printf("Starting [%s]...\n", sSDKname); + auto gpusByArch = getIdenticalGPUs(); + printf("Starting [%s]...\n", sSDKname); + auto gpusByArch = getIdenticalGPUs(); - auto it = gpusByArch.begin(); - auto end = gpusByArch.end(); + auto it = gpusByArch.begin(); + auto end = gpusByArch.end(); - auto bestFit = std::make_pair(it, it); - // use std::distance to find the largest number of GPUs amongst architectures - auto distance = [](decltype(bestFit) p) { - return std::distance(p.first, p.second); - }; + auto bestFit = std::make_pair(it, it); + // use std::distance to find the largest number of GPUs amongst architectures + auto distance = [](decltype(bestFit) p) { return std::distance(p.first, p.second); }; - // Read each unique key/pair element in order - for (; it != end; it = gpusByArch.upper_bound(it->first)) { - // first and second are iterators bounded within the architecture group - auto testFit = gpusByArch.equal_range(it->first); - // Always use devices with highest architecture version or whichever has the - // most devices available - if (distance(bestFit) <= distance(testFit)) bestFit = testFit; - } + // Read each unique key/pair element in order + for (; it != end; it = gpusByArch.upper_bound(it->first)) { + // first and second are iterators bounded within the architecture group + auto testFit = gpusByArch.equal_range(it->first); + // Always use devices with highest architecture version or whichever has the + // most devices available + if (distance(bestFit) <= distance(testFit)) + bestFit = testFit; + } - if (distance(bestFit) < kNumGpusRequired) { - printf( - "No two or more GPUs with same architecture capable of " - "concurrentManagedAccess found. " - "\nWaiving the sample\n"); - exit(EXIT_WAIVED); - } + if (distance(bestFit) < kNumGpusRequired) { + printf("No two or more GPUs with same architecture capable of " + "concurrentManagedAccess found. " + "\nWaiving the sample\n"); + exit(EXIT_WAIVED); + } - std::set<int> bestFitDeviceIds; + std::set<int> bestFitDeviceIds; - // Check & select peer-to-peer access capable GPU devices as enabling p2p - // access between participating GPUs gives better performance.
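For reference, the selection logic that follows boils down to this check-and-enable pattern for each device pair. This is a sketch with hypothetical ids devA and devB; the sample takes its ids from the bestFit range instead:

    int devA = 0, devB = 1; // hypothetical ids, assumed for illustration
    int accessAB = 0, accessBA = 0;
    checkCudaErrors(cudaDeviceCanAccessPeer(&accessAB, devA, devB));
    checkCudaErrors(cudaDeviceCanAccessPeer(&accessBA, devB, devA));
    if (accessAB && accessBA) {
        checkCudaErrors(cudaSetDevice(devA));
        checkCudaErrors(cudaDeviceEnablePeerAccess(devB, 0)); // devA may now map devB's memory
        checkCudaErrors(cudaSetDevice(devB));
        checkCudaErrors(cudaDeviceEnablePeerAccess(devA, 0)); // and vice versa
    }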
- for (auto itr = bestFit.first; itr != bestFit.second; itr++) { - int deviceId = itr->second; - checkCudaErrors(cudaSetDevice(deviceId)); + // Check & select peer-to-peer access capable GPU devices as enabling p2p + // access between participating GPUs gives better performance. + for (auto itr = bestFit.first; itr != bestFit.second; itr++) { + int deviceId = itr->second; + checkCudaErrors(cudaSetDevice(deviceId)); - std::for_each( - itr, bestFit.second, - [&deviceId, &bestFitDeviceIds, - &kNumGpusRequired](decltype(*itr) mapPair) { - if (deviceId != mapPair.second) { - int access = 0; - checkCudaErrors( - cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); - printf("Device=%d %s Access Peer Device=%d\n", deviceId, - access ? "CAN" : "CANNOT", mapPair.second); - if (access && bestFitDeviceIds.size() < kNumGpusRequired) { - bestFitDeviceIds.emplace(deviceId); - bestFitDeviceIds.emplace(mapPair.second); - } else { - printf("Ignoring device %i (max devices exceeded)\n", - mapPair.second); + std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds, &kNumGpusRequired](decltype(*itr) mapPair) { + if (deviceId != mapPair.second) { + int access = 0; + checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); + printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second); + if (access && bestFitDeviceIds.size() < kNumGpusRequired) { + bestFitDeviceIds.emplace(deviceId); + bestFitDeviceIds.emplace(mapPair.second); + } + else { + printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); + } } - } }); - if (bestFitDeviceIds.size() >= kNumGpusRequired) { - printf("Selected p2p capable devices - "); - for (auto devicesItr = bestFitDeviceIds.begin(); - devicesItr != bestFitDeviceIds.end(); devicesItr++) { - printf("deviceId = %d ", *devicesItr); - } - printf("\n"); - break; - } - } - - // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p - // capable, hence we add it without p2p capability check. - if (!bestFitDeviceIds.size()) { - printf("Devices involved are not p2p capable.. selecting %zu of them\n", - kNumGpusRequired); - std::for_each(bestFit.first, bestFit.second, - [&bestFitDeviceIds, - &kNumGpusRequired](decltype(*bestFit.first) mapPair) { - if (bestFitDeviceIds.size() < kNumGpusRequired) { - bestFitDeviceIds.emplace(mapPair.second); - } else { - printf("Ignoring device %i (max devices exceeded)\n", - mapPair.second); - } - // Insert the sequence into the deviceIds set - }); - } else { - // perform cudaDeviceEnablePeerAccess in both directions for all - // participating devices. 
- for (auto p1_itr = bestFitDeviceIds.begin(); - p1_itr != bestFitDeviceIds.end(); p1_itr++) { - checkCudaErrors(cudaSetDevice(*p1_itr)); - for (auto p2_itr = bestFitDeviceIds.begin(); - p2_itr != bestFitDeviceIds.end(); p2_itr++) { - if (*p1_itr != *p2_itr) { - checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0)); - checkCudaErrors(cudaSetDevice(*p1_itr)); + if (bestFitDeviceIds.size() >= kNumGpusRequired) { + printf("Selected p2p capable devices - "); + for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++) { + printf("deviceId = %d ", *devicesItr); + } + printf("\n"); + break; } - } } - } - /* Generate a random tridiagonal symmetric matrix in CSR format */ - N = 10485760 * 2; - nz = (N - 2) * 3 + 4; - - checkCudaErrors(cudaMallocManaged((void **)&I, sizeof(int) * (N + 1))); - checkCudaErrors(cudaMallocManaged((void **)&J, sizeof(int) * nz)); - checkCudaErrors(cudaMallocManaged((void **)&val, sizeof(float) * nz)); - - float *val_cpu = (float *)malloc(sizeof(float) * nz); - - genTridiag(I, J, val_cpu, N, nz); - - memcpy(val, val_cpu, sizeof(float) * nz); - checkCudaErrors( - cudaMemAdvise(I, sizeof(int) * (N + 1), cudaMemAdviseSetReadMostly, 0)); - checkCudaErrors( - cudaMemAdvise(J, sizeof(int) * nz, cudaMemAdviseSetReadMostly, 0)); - checkCudaErrors( - cudaMemAdvise(val, sizeof(float) * nz, cudaMemAdviseSetReadMostly, 0)); - - checkCudaErrors(cudaMallocManaged((void **)&x, sizeof(float) * N)); - - double *dot_result; - checkCudaErrors(cudaMallocManaged((void **)&dot_result, sizeof(double))); - - checkCudaErrors(cudaMemset(dot_result, 0, sizeof(double))); - - // temp memory for ConjugateGradient - checkCudaErrors(cudaMallocManaged((void **)&r, N * sizeof(float))); - checkCudaErrors(cudaMallocManaged((void **)&p, N * sizeof(float))); - checkCudaErrors(cudaMallocManaged((void **)&Ax, N * sizeof(float))); - - std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; - cudaStream_t nStreams[kNumGpusRequired]; - - int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK / 32) + 1); - int numBlocksPerSm = INT_MAX; - int numThreads = THREADS_PER_BLOCK; - int numSms = INT_MAX; - auto deviceId = bestFitDeviceIds.begin(); - - // set numSms & numBlocksPerSm to be lowest of 2 devices - while (deviceId != bestFitDeviceIds.end()) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaSetDevice(*deviceId)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); - - int numBlocksPerSm_current = 0; - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, - sMemSize)); - - if (numBlocksPerSm > numBlocksPerSm_current) { - numBlocksPerSm = numBlocksPerSm_current; + // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p + // capable, hence we add it without p2p capability check. + if (!bestFitDeviceIds.size()) { + printf("Devices involved are not p2p capable.. 
selecting %zu of them\n", kNumGpusRequired); + std::for_each( + bestFit.first, bestFit.second, [&bestFitDeviceIds, &kNumGpusRequired](decltype(*bestFit.first) mapPair) { + if (bestFitDeviceIds.size() < kNumGpusRequired) { + bestFitDeviceIds.emplace(mapPair.second); + } + else { + printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); + } + // Insert the sequence into the deviceIds set + }); } - if (numSms > deviceProp.multiProcessorCount) { - numSms = deviceProp.multiProcessorCount; - } - deviceId++; - } - - if (!numBlocksPerSm) { - printf( - "Max active blocks per SM is returned as 0.\n Hence, Waiving the " - "sample\n"); - exit(EXIT_WAIVED); - } - - int device_count = 0; - int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; - deviceId = bestFitDeviceIds.begin(); - while (deviceId != bestFitDeviceIds.end()) { - checkCudaErrors(cudaSetDevice(*deviceId)); - checkCudaErrors(cudaStreamCreate(&nStreams[device_count])); - - int perGPUIter = N / (totalThreadsPerGPU * kNumGpusRequired); - int offset_Ax = device_count * totalThreadsPerGPU; - int offset_r = device_count * totalThreadsPerGPU; - int offset_p = device_count * totalThreadsPerGPU; - int offset_x = device_count * totalThreadsPerGPU; - - checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, *deviceId, - nStreams[device_count])); - checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, *deviceId, - nStreams[device_count])); - checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(float) * nz, *deviceId, - nStreams[device_count])); - - if (offset_Ax <= N) { - for (int i = 0; i < perGPUIter; i++) { - cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetPreferredLocation, *deviceId); - - cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, - cudaMemAdviseSetAccessedBy, *deviceId); - - offset_Ax += totalThreadsPerGPU * kNumGpusRequired; - offset_r += totalThreadsPerGPU * kNumGpusRequired; - offset_p += totalThreadsPerGPU * kNumGpusRequired; - offset_x += totalThreadsPerGPU * kNumGpusRequired; - - if (offset_Ax >= N) { - break; + else { + // perform cudaDeviceEnablePeerAccess in both directions for all + // participating devices. 
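+        // Peer access is one-directional: cudaDeviceEnablePeerAccess() only
+        // grants the *current* device access to the named peer, which is why
+        // the nested loop below makes each participant current in turn and
+        // enables the mapping to every other device.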
+ for (auto p1_itr = bestFitDeviceIds.begin(); p1_itr != bestFitDeviceIds.end(); p1_itr++) { + checkCudaErrors(cudaSetDevice(*p1_itr)); + for (auto p2_itr = bestFitDeviceIds.begin(); p2_itr != bestFitDeviceIds.end(); p2_itr++) { + if (*p1_itr != *p2_itr) { + checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0)); + checkCudaErrors(cudaSetDevice(*p1_itr)); + } + } } - } } - device_count++; - deviceId++; - } + /* Generate a random tridiagonal symmetric matrix in CSR format */ + N = 10485760 * 2; + nz = (N - 2) * 3 + 4; -#if ENABLE_CPU_DEBUG_CODE - float *Ax_cpu = (float *)malloc(sizeof(float) * N); - float *r_cpu = (float *)malloc(sizeof(float) * N); - float *p_cpu = (float *)malloc(sizeof(float) * N); - float *x_cpu = (float *)malloc(sizeof(float) * N); + checkCudaErrors(cudaMallocManaged((void **)&I, sizeof(int) * (N + 1))); + checkCudaErrors(cudaMallocManaged((void **)&J, sizeof(int) * nz)); + checkCudaErrors(cudaMallocManaged((void **)&val, sizeof(float) * nz)); - for (int i = 0; i < N; i++) { - r_cpu[i] = 1.0; - Ax_cpu[i] = x_cpu[i] = 0.0; - } -#endif + float *val_cpu = (float *)malloc(sizeof(float) * nz); - printf("Total threads per GPU = %d numBlocksPerSm = %d\n", - numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm); - dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), - dimBlock(THREADS_PER_BLOCK, 1, 1); + genTridiag(I, J, val_cpu, N, nz); - // Structure used for cross-grid synchronization. - MultiDeviceData multi_device_data; - checkCudaErrors(cudaHostAlloc( - &multi_device_data.hostMemoryArrivedList, - (kNumGpusRequired - 1) * sizeof(*multi_device_data.hostMemoryArrivedList), - cudaHostAllocPortable)); - memset(multi_device_data.hostMemoryArrivedList, 0, - (kNumGpusRequired - 1) * - sizeof(*multi_device_data.hostMemoryArrivedList)); - multi_device_data.numDevices = kNumGpusRequired; - multi_device_data.deviceRank = 0; + memcpy(val, val_cpu, sizeof(float) * nz); + checkCudaErrors(cudaMemAdvise(I, sizeof(int) * (N + 1), cudaMemAdviseSetReadMostly, 0)); + checkCudaErrors(cudaMemAdvise(J, sizeof(int) * nz, cudaMemAdviseSetReadMostly, 0)); + checkCudaErrors(cudaMemAdvise(val, sizeof(float) * nz, cudaMemAdviseSetReadMostly, 0)); - void *kernelArgs[] = { - (void *)&I, (void *)&J, (void *)&val, (void *)&x, - (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, - (void *)&nz, (void *)&N, (void *)&tol, (void *)&multi_device_data, - }; + checkCudaErrors(cudaMallocManaged((void **)&x, sizeof(float) * N)); - printf("Launching kernel\n"); + double *dot_result; + checkCudaErrors(cudaMallocManaged((void **)&dot_result, sizeof(double))); - deviceId = bestFitDeviceIds.begin(); - device_count = 0; - while (deviceId != bestFitDeviceIds.end()) { - checkCudaErrors(cudaSetDevice(*deviceId)); - checkCudaErrors(cudaLaunchCooperativeKernel( - (void *)multiGpuConjugateGradient, dimGrid, dimBlock, kernelArgs, - sMemSize, nStreams[device_count++])); - multi_device_data.deviceRank++; - deviceId++; - } + checkCudaErrors(cudaMemset(dot_result, 0, sizeof(double))); - checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); - checkCudaErrors( - cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + // temp memory for ConjugateGradient + checkCudaErrors(cudaMallocManaged((void **)&r, N * sizeof(float))); + checkCudaErrors(cudaMallocManaged((void **)&p, N * sizeof(float))); + checkCudaErrors(cudaMallocManaged((void **)&Ax, N * sizeof(float))); - deviceId = bestFitDeviceIds.begin(); - device_count = 0; - while (deviceId != bestFitDeviceIds.end()) { - 
checkCudaErrors(cudaSetDevice(*deviceId)); - checkCudaErrors(cudaStreamSynchronize(nStreams[device_count++])); - deviceId++; - } + std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; + cudaStream_t nStreams[kNumGpusRequired]; - r1 = (float)*dot_result; + int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK / 32) + 1); + int numBlocksPerSm = INT_MAX; + int numThreads = THREADS_PER_BLOCK; + int numSms = INT_MAX; + auto deviceId = bestFitDeviceIds.begin(); - printf("GPU Final, residual = %e \n ", sqrt(r1)); + // set numSms & numBlocksPerSm to be lowest of 2 devices + while (deviceId != bestFitDeviceIds.end()) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaSetDevice(*deviceId)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); -#if ENABLE_CPU_DEBUG_CODE - cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol); -#endif + int numBlocksPerSm_current = 0; + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, sMemSize)); - float rsum, diff, err = 0.0; - - for (int i = 0; i < N; i++) { - rsum = 0.0; - - for (int j = I[i]; j < I[i + 1]; j++) { - rsum += val_cpu[j] * x[J[j]]; + if (numBlocksPerSm > numBlocksPerSm_current) { + numBlocksPerSm = numBlocksPerSm_current; + } + if (numSms > deviceProp.multiProcessorCount) { + numSms = deviceProp.multiProcessorCount; + } + deviceId++; } - diff = fabs(rsum - rhs); - - if (diff > err) { - err = diff; + if (!numBlocksPerSm) { + printf("Max active blocks per SM is returned as 0.\n Hence, Waiving the " + "sample\n"); + exit(EXIT_WAIVED); } - } - checkCudaErrors(cudaFreeHost(multi_device_data.hostMemoryArrivedList)); - checkCudaErrors(cudaFree(I)); - checkCudaErrors(cudaFree(J)); - checkCudaErrors(cudaFree(val)); - checkCudaErrors(cudaFree(x)); - checkCudaErrors(cudaFree(r)); - checkCudaErrors(cudaFree(p)); - checkCudaErrors(cudaFree(Ax)); - checkCudaErrors(cudaFree(dot_result)); - free(val_cpu); + int device_count = 0; + int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; + deviceId = bestFitDeviceIds.begin(); + while (deviceId != bestFitDeviceIds.end()) { + checkCudaErrors(cudaSetDevice(*deviceId)); + checkCudaErrors(cudaStreamCreate(&nStreams[device_count])); + + int perGPUIter = N / (totalThreadsPerGPU * kNumGpusRequired); + int offset_Ax = device_count * totalThreadsPerGPU; + int offset_r = device_count * totalThreadsPerGPU; + int offset_p = device_count * totalThreadsPerGPU; + int offset_x = device_count * totalThreadsPerGPU; + + checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, *deviceId, nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, *deviceId, nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(float) * nz, *deviceId, nStreams[device_count])); + + if (offset_Ax <= N) { + for (int i = 0; i < perGPUIter; i++) { + cudaMemAdvise( + Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + cudaMemAdvise( + r + offset_r, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + cudaMemAdvise( + x + offset_x, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + cudaMemAdvise( + p + offset_p, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + + cudaMemAdvise( + Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(r + offset_r, sizeof(float) * 
totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); + cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); + + offset_Ax += totalThreadsPerGPU * kNumGpusRequired; + offset_r += totalThreadsPerGPU * kNumGpusRequired; + offset_p += totalThreadsPerGPU * kNumGpusRequired; + offset_x += totalThreadsPerGPU * kNumGpusRequired; + + if (offset_Ax >= N) { + break; + } + } + } + + device_count++; + deviceId++; + } #if ENABLE_CPU_DEBUG_CODE - free(Ax_cpu); - free(r_cpu); - free(p_cpu); - free(x_cpu); + float *Ax_cpu = (float *)malloc(sizeof(float) * N); + float *r_cpu = (float *)malloc(sizeof(float) * N); + float *p_cpu = (float *)malloc(sizeof(float) * N); + float *x_cpu = (float *)malloc(sizeof(float) * N); + + for (int i = 0; i < N; i++) { + r_cpu[i] = 1.0; + Ax_cpu[i] = x_cpu[i] = 0.0; + } #endif - printf("Test Summary: Error amount = %f \n", err); - fprintf(stdout, "&&&& conjugateGradientMultiDeviceCG %s\n", - (sqrt(r1) < tol) ? "PASSED" : "FAILED"); - exit((sqrt(r1) < tol) ? EXIT_SUCCESS : EXIT_FAILURE); + printf("Total threads per GPU = %d numBlocksPerSm = %d\n", + numSms * numBlocksPerSm * THREADS_PER_BLOCK, + numBlocksPerSm); + dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1); + + // Structure used for cross-grid synchronization. + MultiDeviceData multi_device_data; + checkCudaErrors(cudaHostAlloc(&multi_device_data.hostMemoryArrivedList, + (kNumGpusRequired - 1) * sizeof(*multi_device_data.hostMemoryArrivedList), + cudaHostAllocPortable)); + memset(multi_device_data.hostMemoryArrivedList, + 0, + (kNumGpusRequired - 1) * sizeof(*multi_device_data.hostMemoryArrivedList)); + multi_device_data.numDevices = kNumGpusRequired; + multi_device_data.deviceRank = 0; + + void *kernelArgs[] = { + (void *)&I, + (void *)&J, + (void *)&val, + (void *)&x, + (void *)&Ax, + (void *)&p, + (void *)&r, + (void *)&dot_result, + (void *)&nz, + (void *)&N, + (void *)&tol, + (void *)&multi_device_data, + }; + + printf("Launching kernel\n"); + + deviceId = bestFitDeviceIds.begin(); + device_count = 0; + while (deviceId != bestFitDeviceIds.end()) { + checkCudaErrors(cudaSetDevice(*deviceId)); + checkCudaErrors(cudaLaunchCooperativeKernel( + (void *)multiGpuConjugateGradient, dimGrid, dimBlock, kernelArgs, sMemSize, nStreams[device_count++])); + multi_device_data.deviceRank++; + deviceId++; + } + + checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + + deviceId = bestFitDeviceIds.begin(); + device_count = 0; + while (deviceId != bestFitDeviceIds.end()) { + checkCudaErrors(cudaSetDevice(*deviceId)); + checkCudaErrors(cudaStreamSynchronize(nStreams[device_count++])); + deviceId++; + } + + r1 = (float)*dot_result; + + printf("GPU Final, residual = %e \n ", sqrt(r1)); + +#if ENABLE_CPU_DEBUG_CODE + cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol); +#endif + + float rsum, diff, err = 0.0; + + for (int i = 0; i < N; i++) { + rsum = 0.0; + + for (int j = I[i]; j < I[i + 1]; j++) { + rsum += val_cpu[j] * x[J[j]]; + } + + diff = fabs(rsum - rhs); + + if (diff > err) { + err = diff; + } + } + + checkCudaErrors(cudaFreeHost(multi_device_data.hostMemoryArrivedList)); + checkCudaErrors(cudaFree(I)); + checkCudaErrors(cudaFree(J)); + checkCudaErrors(cudaFree(val)); + checkCudaErrors(cudaFree(x)); + 
checkCudaErrors(cudaFree(r)); + checkCudaErrors(cudaFree(p)); + checkCudaErrors(cudaFree(Ax)); + checkCudaErrors(cudaFree(dot_result)); + free(val_cpu); + +#if ENABLE_CPU_DEBUG_CODE + free(Ax_cpu); + free(r_cpu); + free(p_cpu); + free(x_cpu); +#endif + + printf("Test Summary: Error amount = %f \n", err); + fprintf(stdout, "&&&& conjugateGradientMultiDeviceCG %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED"); + exit((sqrt(r1) < tol) ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/4_CUDA_Libraries/conjugateGradientPrecond/main.cpp b/Samples/4_CUDA_Libraries/conjugateGradientPrecond/main.cpp index f8d1d29b..1d1ea9a9 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientPrecond/main.cpp +++ b/Samples/4_CUDA_Libraries/conjugateGradientPrecond/main.cpp @@ -44,99 +44,90 @@ // includes, system -#include <stdio.h> -#include <stdlib.h> -#include <string.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // CUDA Runtime #include <cuda_runtime.h> // Using updated (v2) interfaces for CUBLAS and CUSPARSE -#include <cusparse.h> #include <cublas_v2.h> +#include <cusparse.h> // Utilities and system includes -#include <helper_functions.h> // shared functions common to CUDA Samples -#include <helper_cuda.h> // CUDA error checking +#include <helper_cuda.h> // CUDA error checking +#include <helper_functions.h> // shared functions common to CUDA Samples -const char *sSDKname = "conjugateGradientPrecond"; +const char *sSDKname = "conjugateGradientPrecond"; /* * Generate a matrix representing a second order regular Laplacian operator * on a 2D domain in Compressed Sparse Row format. */ -void genLaplace(int *row_ptr, int *col_ind, float *val, int M, int N, int nz, - float *rhs) +void genLaplace(int *row_ptr, int *col_ind, float *val, int M, int N, int nz, float *rhs) { - assert(M==N); - int n=(int)sqrt((double)N); - assert(n*n==N); + assert(M == N); + int n = (int)sqrt((double)N); + assert(n * n == N); printf("laplace dimension = %d\n", n); int idx = 0; // loop over degrees of freedom - for (int i = 0; i < N; i++) - { + for (int i = 0; i < N; i++) { int ix = i % n; int iy = i / n; row_ptr[i] = idx; // up - if (iy > 0) - { - val[idx] = 1.0; + if (iy > 0) { + val[idx] = 1.0; col_ind[idx] = i - n; idx++; } - else - { + else { rhs[i] -= 1.0; } // left if (ix > 0) { - val[idx] = 1.0; + val[idx] = 1.0; col_ind[idx] = i - 1; idx++; - } else { + } + else { rhs[i] -= 0.0; } // center - val[idx] = -4.0; + val[idx] = -4.0; col_ind[idx] = i; idx++; - //right - if (ix < n - 1) - { - val[idx] = 1.0; + // right + if (ix < n - 1) { + val[idx] = 1.0; col_ind[idx] = i + 1; idx++; } - else - { + else { rhs[i] -= 0.0; } // down - if (iy < n - 1) - { - val[idx] = 1.0; + if (iy < n - 1) { + val[idx] = 1.0; col_ind[idx] = i + n; idx++; } - else - { + else { rhs[i] -= 0.0; } - } row_ptr[N] = idx; - } /* @@ -145,41 +136,41 @@ void genLaplace(int *row_ptr, int *col_ind, float *val, int M, int N, int nz, * b) using an Incomplete Cholesky preconditioner, and * c) using an ILU0 preconditioner.
*/ -int main(int argc, char **argv){ - const int max_iter = 1000; - int k, M = 0, N = 0, nz = 0, *I = NULL, *J = NULL; - int *d_col, *d_row; - int qatest = 0; - const float tol = 1e-12f; - float *x, *rhs; - float r0, r1, alpha, beta; - float *d_val, *d_x; - float *d_zm1, *d_zm2, *d_rm2; - float *d_r, *d_p, *d_omega, *d_y; - float *val = NULL; - float *d_valsILU0; - float rsum, diff, err = 0.0; - float qaerr1, qaerr2 = 0.0; - float dot, numerator, denominator, nalpha; - const float floatone = 1.0; +int main(int argc, char **argv) +{ + const int max_iter = 1000; + int k, M = 0, N = 0, nz = 0, *I = NULL, *J = NULL; + int *d_col, *d_row; + int qatest = 0; + const float tol = 1e-12f; + float *x, *rhs; + float r0, r1, alpha, beta; + float *d_val, *d_x; + float *d_zm1, *d_zm2, *d_rm2; + float *d_r, *d_p, *d_omega, *d_y; + float *val = NULL; + float *d_valsILU0; + float rsum, diff, err = 0.0; + float qaerr1, qaerr2 = 0.0; + float dot, numerator, denominator, nalpha; + const float floatone = 1.0; const float floatzero = 0.0; int nErrors = 0; printf("conjugateGradientPrecond starting...\n"); - /* QA testing mode */ - if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { - qatest = 1; - } + /* QA testing mode */ + if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + qatest = 1; + } /* This will pick the best possible CUDA capable device */ cudaDeviceProp deviceProp; - int devID = findCudaDevice(argc, (const char **)argv); + int devID = findCudaDevice(argc, (const char **)argv); printf("GPU selected Device ID = %d \n", devID); - if (devID < 0) - { + if (devID < 0) { printf("Invalid GPU device %d selected, exiting...\n", devID); exit(EXIT_SUCCESS); } @@ -188,21 +179,22 @@ int main(int argc, char **argv){ /* Statistics about the GPU device */ printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", - deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); + deviceProp.multiProcessorCount, + deviceProp.major, + deviceProp.minor); /* Generate a Laplace matrix in CSR (Compressed Sparse Row) format */ M = N = 16384; - nz = 5 * N - 4 * (int)sqrt((double)N); - I = (int *)malloc(sizeof(int) * (N + 1)); // csr row pointers for matrix A - J = (int *)malloc(sizeof(int) * nz); // csr column indices for matrix A - val = (float *)malloc(sizeof(float) * nz); // csr values for matrix A - x = (float *)malloc(sizeof(float) * N); - rhs = (float *)malloc(sizeof(float) * N); + nz = 5 * N - 4 * (int)sqrt((double)N); + I = (int *)malloc(sizeof(int) * (N + 1)); // csr row pointers for matrix A + J = (int *)malloc(sizeof(int) * nz); // csr column indices for matrix A + val = (float *)malloc(sizeof(float) * nz); // csr values for matrix A + x = (float *)malloc(sizeof(float) * N); + rhs = (float *)malloc(sizeof(float) * N); - for (int i = 0; i < N; i++) - { - rhs[i] = 0.0; // Initialize RHS - x[i] = 0.0; // Initial solution approximation + for (int i = 0; i < N; i++) { + rhs[i] = 0.0; // Initialize RHS + x[i] = 0.0; // Initial solution approximation } genLaplace(I, J, val, M, N, nz, rhs); @@ -236,7 +228,7 @@ int main(int argc, char **argv){ checkCudaErrors(cudaMalloc((void **)&d_rm2, (N) * sizeof(float))); /* Wrap raw data into cuSPARSE generic API objects */ - cusparseDnVecDescr_t vecp = NULL, vecX=NULL, vecY = NULL, vecR = NULL, vecZM1=NULL; + cusparseDnVecDescr_t vecp = NULL, vecX = NULL, vecY = NULL, vecR = NULL, vecZM1 = NULL; checkCudaErrors(cusparseCreateDnVec(&vecp, N, d_p, CUDA_R_32F)); checkCudaErrors(cusparseCreateDnVec(&vecX, N, d_x, CUDA_R_32F)); 
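    // Each dense vector gets a generic-API descriptor wrapping its raw device
    // pointer; the cusparseSpMV()/cusparseSpSV_solve() calls further down take
    // these descriptors, plus an explicitly sized workspace buffer, instead of
    // bare arrays. That is the cuSPARSE generic-API pattern this sample follows.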
checkCudaErrors(cusparseCreateDnVec(&vecY, N, d_y, CUDA_R_32F)); @@ -246,18 +238,12 @@ int main(int argc, char **argv){ checkCudaErrors(cusparseCreateDnVec(&vecomega, N, d_omega, CUDA_R_32F)); /* Initialize problem data */ - checkCudaErrors(cudaMemcpy( - d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_x, x, N*sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice)); cusparseSpMatDescr_t matA = NULL; cusparseSpMatDescr_t matM_lower, matM_upper; @@ -266,73 +252,108 @@ int main(int argc, char **argv){ cusparseFillMode_t fill_upper = CUSPARSE_FILL_MODE_UPPER; cusparseDiagType_t diag_non_unit = CUSPARSE_DIAG_TYPE_NON_UNIT; - checkCudaErrors(cusparseCreateCsr( - &matA, N, N, nz, d_row, d_col, d_val, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + checkCudaErrors(cusparseCreateCsr(&matA, + N, + N, + nz, + d_row, + d_col, + d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); /* Copy A data to ILU(0) vals as input*/ - checkCudaErrors(cudaMemcpy( - d_valsILU0, d_val, nz*sizeof(float), cudaMemcpyDeviceToDevice)); - - //Lower Part - checkCudaErrors( cusparseCreateCsr(&matM_lower, N, N, nz, d_row, d_col, d_valsILU0, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) ); + checkCudaErrors(cudaMemcpy(d_valsILU0, d_val, nz * sizeof(float), cudaMemcpyDeviceToDevice)); - checkCudaErrors( cusparseSpMatSetAttribute(matM_lower, - CUSPARSE_SPMAT_FILL_MODE, - &fill_lower, sizeof(fill_lower)) ); - checkCudaErrors( cusparseSpMatSetAttribute(matM_lower, - CUSPARSE_SPMAT_DIAG_TYPE, - &diag_unit, sizeof(diag_unit)) ); + // Lower Part + checkCudaErrors(cusparseCreateCsr(&matM_lower, + N, + N, + nz, + d_row, + d_col, + d_valsILU0, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + + checkCudaErrors(cusparseSpMatSetAttribute(matM_lower, CUSPARSE_SPMAT_FILL_MODE, &fill_lower, sizeof(fill_lower))); + checkCudaErrors(cusparseSpMatSetAttribute(matM_lower, CUSPARSE_SPMAT_DIAG_TYPE, &diag_unit, sizeof(diag_unit))); // M_upper - checkCudaErrors( cusparseCreateCsr(&matM_upper, N, N, nz, d_row, d_col, d_valsILU0, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) ); - checkCudaErrors( cusparseSpMatSetAttribute(matM_upper, - CUSPARSE_SPMAT_FILL_MODE, - &fill_upper, sizeof(fill_upper)) ); - checkCudaErrors( cusparseSpMatSetAttribute(matM_upper, - CUSPARSE_SPMAT_DIAG_TYPE, - &diag_non_unit, - sizeof(diag_non_unit)) ); + checkCudaErrors(cusparseCreateCsr(&matM_upper, + N, + N, + nz, + d_row, + d_col, + d_valsILU0, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + 
CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + checkCudaErrors(cusparseSpMatSetAttribute(matM_upper, CUSPARSE_SPMAT_FILL_MODE, &fill_upper, sizeof(fill_upper))); + checkCudaErrors( + cusparseSpMatSetAttribute(matM_upper, CUSPARSE_SPMAT_DIAG_TYPE, &diag_non_unit, sizeof(diag_non_unit))); /* Create ILU(0) info object */ int bufferSizeLU = 0; size_t bufferSizeMV, bufferSizeL, bufferSizeU; - void* d_bufferLU, *d_bufferMV, *d_bufferL, *d_bufferU; + void *d_bufferLU, *d_bufferMV, *d_bufferL, *d_bufferU; cusparseSpSVDescr_t spsvDescrL, spsvDescrU; - cusparseMatDescr_t matLU; + cusparseMatDescr_t matLU; csrilu02Info_t infoILU = NULL; checkCudaErrors(cusparseCreateCsrilu02Info(&infoILU)); - checkCudaErrors( cusparseCreateMatDescr(&matLU) ); - checkCudaErrors( cusparseSetMatType(matLU, CUSPARSE_MATRIX_TYPE_GENERAL) ); - checkCudaErrors( cusparseSetMatIndexBase(matLU, CUSPARSE_INDEX_BASE_ZERO) ); + checkCudaErrors(cusparseCreateMatDescr(&matLU)); + checkCudaErrors(cusparseSetMatType(matLU, CUSPARSE_MATRIX_TYPE_GENERAL)); + checkCudaErrors(cusparseSetMatIndexBase(matLU, CUSPARSE_INDEX_BASE_ZERO)); /* Allocate workspace for cuSPARSE */ - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, matA, - vecp, &floatzero, vecomega, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSizeMV)); - checkCudaErrors( cudaMalloc(&d_bufferMV, bufferSizeMV) ); + checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matA, + vecp, + &floatzero, + vecomega, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSizeMV)); + checkCudaErrors(cudaMalloc(&d_bufferMV, bufferSizeMV)); - checkCudaErrors(cusparseScsrilu02_bufferSize( - cusparseHandle, N, nz, matLU, d_val, d_row, d_col, infoILU, &bufferSizeLU)); - checkCudaErrors( cudaMalloc(&d_bufferLU, bufferSizeLU) ); + checkCudaErrors( + cusparseScsrilu02_bufferSize(cusparseHandle, N, nz, matLU, d_val, d_row, d_col, infoILU, &bufferSizeLU)); + checkCudaErrors(cudaMalloc(&d_bufferLU, bufferSizeLU)); - checkCudaErrors( cusparseSpSV_createDescr(&spsvDescrL) ); - checkCudaErrors(cusparseSpSV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, matM_lower, vecR, vecX, CUDA_R_32F, - CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL, &bufferSizeL)); - checkCudaErrors( cudaMalloc(&d_bufferL, bufferSizeL) ); + checkCudaErrors(cusparseSpSV_createDescr(&spsvDescrL)); + checkCudaErrors(cusparseSpSV_bufferSize(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matM_lower, + vecR, + vecX, + CUDA_R_32F, + CUSPARSE_SPSV_ALG_DEFAULT, + spsvDescrL, + &bufferSizeL)); + checkCudaErrors(cudaMalloc(&d_bufferL, bufferSizeL)); - checkCudaErrors( cusparseSpSV_createDescr(&spsvDescrU) ); - checkCudaErrors( cusparseSpSV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, matM_upper, vecR, vecX, CUDA_R_32F, - CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU, &bufferSizeU)); - checkCudaErrors( cudaMalloc(&d_bufferU, bufferSizeU) ); + checkCudaErrors(cusparseSpSV_createDescr(&spsvDescrU)); + checkCudaErrors(cusparseSpSV_bufferSize(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matM_upper, + vecR, + vecX, + CUDA_R_32F, + CUSPARSE_SPSV_ALG_DEFAULT, + spsvDescrU, + &bufferSizeU)); + checkCudaErrors(cudaMalloc(&d_bufferU, bufferSizeU)); /* Conjugate gradient without preconditioning. 
------------------------------------------ @@ -341,61 +362,58 @@ int main(int argc, char **argv){ "Matrix Computations 3rd ed.", Section 10.2.6 */ printf("Convergence of CG without preconditioning: \n"); - k = 0; + k = 0; r0 = 0; checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1)); - while (r1 > tol * tol && k <= max_iter) - { + while (r1 > tol * tol && k <= max_iter) { k++; - if (k == 1) - { + if (k == 1) { checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); } - else - { + else { beta = r1 / r0; checkCudaErrors(cublasSscal(cublasHandle, N, &beta, d_p, 1)); - checkCudaErrors(cublasSaxpy( - cublasHandle, N, &floatone, d_r, 1, d_p, 1)); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &floatone, d_r, 1, d_p, 1)); } - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, matA, - vecp, &floatzero, vecomega, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, - d_bufferMV)); + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matA, + vecp, + &floatzero, + vecomega, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + d_bufferMV)); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &dot)); alpha = r1 / dot; checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1)); nalpha = -alpha; - checkCudaErrors(cublasSaxpy( - cublasHandle, N, &nalpha, d_omega, 1, d_r, 1)); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1)); r0 = r1; checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1)); } printf(" iteration = %3d, residual = %e \n", k, sqrt(r1)); - checkCudaErrors(cudaMemcpy( - x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost)); /* check result */ err = 0.0; - for (int i = 0; i < N; i++) - { + for (int i = 0; i < N; i++) { rsum = 0.0; - for (int j = I[i]; j < I[i + 1]; j++) - { + for (int j = I[i]; j < I[i + 1]; j++) { rsum += val[j] * x[J[j]]; } diff = fabs(rsum - rhs[i]); - if (diff > err) - { + if (diff > err) { err = diff; } } @@ -404,21 +422,17 @@ int main(int argc, char **argv){ nErrors += (k > max_iter) ? 
1 : 0; qaerr1 = err; - if (0) - { + if (0) { // output result in matlab-style array int n = (int)sqrt((double)N); printf("a = [ "); - for (int iy = 0; iy < n; iy++) - { - for (int ix = 0; ix < n; ix++) - { + for (int iy = 0; iy < n; iy++) { + for (int ix = 0; ix < n; ix++) { printf(" %f ", x[iy * n + ix]); } - if (iy == n - 1) - { + if (iy == n - 1) { printf(" ]"); } @@ -436,110 +450,117 @@ int main(int argc, char **argv){ /* Perform analysis for ILU(0) */ checkCudaErrors(cusparseScsrilu02_analysis( - cusparseHandle, N, nz, descr, d_valsILU0, d_row, d_col, infoILU, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_bufferLU)); + cusparseHandle, N, nz, descr, d_valsILU0, d_row, d_col, infoILU, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_bufferLU)); /* generate the ILU(0) factors */ checkCudaErrors(cusparseScsrilu02( - cusparseHandle, N, nz, matLU, d_valsILU0, d_row, d_col, infoILU, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_bufferLU)); + cusparseHandle, N, nz, matLU, d_valsILU0, d_row, d_col, infoILU, CUSPARSE_SOLVE_POLICY_USE_LEVEL, d_bufferLU)); /* perform triangular solve analysis */ - checkCudaErrors(cusparseSpSV_analysis( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, - matM_lower, vecR, vecX, CUDA_R_32F, - CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL, d_bufferL)); + checkCudaErrors(cusparseSpSV_analysis(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matM_lower, + vecR, + vecX, + CUDA_R_32F, + CUSPARSE_SPSV_ALG_DEFAULT, + spsvDescrL, + d_bufferL)); - checkCudaErrors(cusparseSpSV_analysis( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, - matM_upper, vecR, vecX, CUDA_R_32F, - CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU, d_bufferU)); + checkCudaErrors(cusparseSpSV_analysis(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matM_upper, + vecR, + vecX, + CUDA_R_32F, + CUSPARSE_SPSV_ALG_DEFAULT, + spsvDescrU, + d_bufferU)); /* reset the initial guess of the solution to zero */ - for (int i = 0; i < N; i++) - { + for (int i = 0; i < N; i++) { x[i] = 0.0; } - checkCudaErrors(cudaMemcpy( - d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_x, x, N * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice)); k = 0; checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1)); - while (r1 > tol * tol && k <= max_iter) - { + while (r1 > tol * tol && k <= max_iter) { // preconditioner application: d_zm1 = U^-1 L^-1 d_r checkCudaErrors(cusparseSpSV_solve(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, - matM_lower, vecR, vecY, CUDA_R_32F, - CUSPARSE_SPSV_ALG_DEFAULT, - spsvDescrL) ); - + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matM_lower, + vecR, + vecY, + CUDA_R_32F, + CUSPARSE_SPSV_ALG_DEFAULT, + spsvDescrL)); + checkCudaErrors(cusparseSpSV_solve(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, matM_upper, - vecY, vecZM1, - CUDA_R_32F, - CUSPARSE_SPSV_ALG_DEFAULT, - spsvDescrU)); + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matM_upper, + vecY, + vecZM1, + CUDA_R_32F, + CUSPARSE_SPSV_ALG_DEFAULT, + spsvDescrU)); k++; - if (k == 1) - { + if (k == 1) { checkCudaErrors(cublasScopy(cublasHandle, N, d_zm1, 1, d_p, 1)); } - else - { - checkCudaErrors(cublasSdot( - cublasHandle, N, d_r, 1, d_zm1, 1, &numerator)); - checkCudaErrors(cublasSdot( - cublasHandle, N, d_rm2, 1, d_zm2, 1, &denominator)); + else { + checkCudaErrors(cublasSdot(cublasHandle, 
N, d_r, 1, d_zm1, 1, &numerator)); + checkCudaErrors(cublasSdot(cublasHandle, N, d_rm2, 1, d_zm2, 1, &denominator)); beta = numerator / denominator; checkCudaErrors(cublasSscal(cublasHandle, N, &beta, d_p, 1)); - checkCudaErrors(cublasSaxpy( - cublasHandle, N, &floatone, d_zm1, 1, d_p, 1)); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &floatone, d_zm1, 1, d_p, 1)); } - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &floatone, matA, - vecp, &floatzero, vecomega, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, - d_bufferMV)); - checkCudaErrors(cublasSdot( - cublasHandle, N, d_r, 1, d_zm1, 1, &numerator)); - checkCudaErrors(cublasSdot( - cublasHandle, N, d_p, 1, d_omega, 1, &denominator)); + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &floatone, + matA, + vecp, + &floatzero, + vecomega, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + d_bufferMV)); + checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator)); + checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &denominator)); alpha = numerator / denominator; checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1)); checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_rm2, 1)); checkCudaErrors(cublasScopy(cublasHandle, N, d_zm1, 1, d_zm2, 1)); nalpha = -alpha; - checkCudaErrors(cublasSaxpy( - cublasHandle, N, &nalpha, d_omega, 1, d_r, 1)); + checkCudaErrors(cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1)); checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1)); } printf(" iteration = %3d, residual = %e \n", k, sqrt(r1)); - checkCudaErrors(cudaMemcpy( - x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(x, d_x, N * sizeof(float), cudaMemcpyDeviceToHost)); /* check result */ err = 0.0; - for (int i = 0; i < N; i++) - { + for (int i = 0; i < N; i++) { rsum = 0.0; - for (int j = I[i]; j < I[i + 1]; j++) - { + for (int j = I[i]; j < I[i + 1]; j++) { rsum += val[j] * x[J[j]]; } diff = fabs(rsum - rhs[i]); - if (diff > err) - { + if (diff > err) { err = diff; } } @@ -601,8 +622,5 @@ int main(int argc, char **argv){ printf("Test Summary:\n"); printf(" Counted total of %d errors\n", nErrors); printf(" qaerr1 = %f qaerr2 = %f\n\n", fabs(qaerr1), fabs(qaerr2)); - exit((nErrors == 0 &&fabs(qaerr1) < 1e-5 && fabs(qaerr2) < 1e-5 - ? EXIT_SUCCESS - : EXIT_FAILURE)); + exit((nErrors == 0 && fabs(qaerr1) < 1e-5 && fabs(qaerr2) < 1e-5 ? 
EXIT_SUCCESS : EXIT_FAILURE)); } - diff --git a/Samples/4_CUDA_Libraries/conjugateGradientUM/main.cpp b/Samples/4_CUDA_Libraries/conjugateGradientUM/main.cpp index 83ce84ef..baa74997 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientUM/main.cpp +++ b/Samples/4_CUDA_Libraries/conjugateGradientUM/main.cpp @@ -42,232 +42,255 @@ #include // Utilities and system includes -#include <helper_cuda.h> // helper function CUDA error checking and initialization -#include <helper_functions.h> // helper for shared functions common to CUDA Samples +#include <helper_cuda.h> // helper function CUDA error checking and initialization +#include <helper_functions.h> // helper for shared functions common to CUDA Samples const char *sSDKname = "conjugateGradientUM"; /* genTridiag: generate a random tridiagonal symmetric matrix */ -void genTridiag(int *I, int *J, float *val, int N, int nz) { - I[0] = 0, J[0] = 0, J[1] = 1; - val[0] = (float)rand() / RAND_MAX + 10.0f; - val[1] = (float)rand() / RAND_MAX; - int start; +void genTridiag(int *I, int *J, float *val, int N, int nz) +{ + I[0] = 0, J[0] = 0, J[1] = 1; + val[0] = (float)rand() / RAND_MAX + 10.0f; + val[1] = (float)rand() / RAND_MAX; + int start; - for (int i = 1; i < N; i++) { - if (i > 1) { - I[i] = I[i - 1] + 3; - } else { - I[1] = 2; + for (int i = 1; i < N; i++) { + if (i > 1) { + I[i] = I[i - 1] + 3; + } + else { + I[1] = 2; + } + + start = (i - 1) * 3 + 2; + J[start] = i - 1; + J[start + 1] = i; + + if (i < N - 1) { + J[start + 2] = i + 1; + } + + val[start] = val[start - 1]; + val[start + 1] = (float)rand() / RAND_MAX + 10.0f; + + if (i < N - 1) { + val[start + 2] = (float)rand() / RAND_MAX; + } } - start = (i - 1) * 3 + 2; - J[start] = i - 1; - J[start + 1] = i; - - if (i < N - 1) { - J[start + 2] = i + 1; - } - - val[start] = val[start - 1]; - val[start + 1] = (float)rand() / RAND_MAX + 10.0f; - - if (i < N - 1) { - val[start + 2] = (float)rand() / RAND_MAX; - } - } - - I[N] = nz; + I[N] = nz; } -int main(int argc, char **argv) { - int N = 0, nz = 0, *I = NULL, *J = NULL; - float *val = NULL; - const float tol = 1e-5f; - const int max_iter = 10000; - float *x; - float *rhs; - float a, b, na, r0, r1; - float dot; - float *r, *p, *Ax; - int k; - float alpha, beta, alpham1; +int main(int argc, char **argv) +{ + int N = 0, nz = 0, *I = NULL, *J = NULL; + float *val = NULL; + const float tol = 1e-5f; + const int max_iter = 10000; + float *x; + float *rhs; + float a, b, na, r0, r1; + float dot; + float *r, *p, *Ax; + int k; + float alpha, beta, alpham1; - printf("Starting [%s]...\n", sSDKname); + printf("Starting [%s]...\n", sSDKname); - // This will pick the best possible CUDA capable device - cudaDeviceProp deviceProp; - int devID = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + // This will pick the best possible CUDA capable device + cudaDeviceProp deviceProp; + int devID = findCudaDevice(argc, (const char **)argv); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - if (!deviceProp.managedMemory) { - // This sample requires being run on a device that supports Unified Memory - fprintf(stderr, "Unified Memory not supported on this device\n"); - exit(EXIT_WAIVED); - } - - // Statistics about the GPU device - printf( - "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", - deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); - - /* Generate a random tridiagonal symmetric matrix in CSR format */ - N = 1048576; - nz = (N - 2) * 3 + 4; - - cudaMallocManaged((void **)&I, sizeof(int) * (N + 1)); - 
cudaMallocManaged((void **)&J, sizeof(int) * nz); - cudaMallocManaged((void **)&val, sizeof(float) * nz); - - genTridiag(I, J, val, N, nz); - - cudaMallocManaged((void **)&x, sizeof(float) * N); - cudaMallocManaged((void **)&rhs, sizeof(float) * N); - - for (int i = 0; i < N; i++) { - rhs[i] = 1.0; - x[i] = 0.0; - } - - /* Get handle to the CUBLAS context */ - cublasHandle_t cublasHandle = 0; - cublasStatus_t cublasStatus; - cublasStatus = cublasCreate(&cublasHandle); - - checkCudaErrors(cublasStatus); - - /* Get handle to the CUSPARSE context */ - cusparseHandle_t cusparseHandle = 0; - cusparseStatus_t cusparseStatus; - cusparseStatus = cusparseCreate(&cusparseHandle); - - checkCudaErrors(cusparseStatus); - - cusparseMatDescr_t descr = 0; - cusparseStatus = cusparseCreateMatDescr(&descr); - - checkCudaErrors(cusparseStatus); - - cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); - - // temp memory for CG - checkCudaErrors(cudaMallocManaged((void **)&r, N * sizeof(float))); - checkCudaErrors(cudaMallocManaged((void **)&p, N * sizeof(float))); - checkCudaErrors(cudaMallocManaged((void **)&Ax, N * sizeof(float))); - - /* Wrap raw data into cuSPARSE generic API objects */ - cusparseSpMatDescr_t matA = NULL; - checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, I, J, val, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - cusparseDnVecDescr_t vecx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecx, N, x, CUDA_R_32F)); - cusparseDnVecDescr_t vecp = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecp, N, p, CUDA_R_32F)); - cusparseDnVecDescr_t vecAx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecAx, N, Ax, CUDA_R_32F)); - - cudaDeviceSynchronize(); - - for (int i = 0; i < N; i++) { - r[i] = rhs[i]; - } - - alpha = 1.0; - alpham1 = -1.0; - beta = 0.0; - r0 = 0.; - - /* Allocate workspace for cuSPARSE */ - size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, - &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1); - cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); - - k = 1; - - while (r1 > tol * tol && k <= max_iter) { - if (k > 1) { - b = r1 / r0; - cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1); - cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1); - } else { - cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1); + if (!deviceProp.managedMemory) { + // This sample requires being run on a device that supports Unified Memory + fprintf(stderr, "Unified Memory not supported on this device\n"); + exit(EXIT_WAIVED); } - checkCudaErrors(cusparseSpMV( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, - &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot); - a = r1 / dot; + // Statistics about the GPU device + printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", + deviceProp.multiProcessorCount, + deviceProp.major, + deviceProp.minor); - cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1); - na = -a; - cublasStatus = cublasSaxpy(cublasHandle, 
N, &na, Ax, 1, r, 1); + /* Generate a random tridiagonal symmetric matrix in CSR format */ + N = 1048576; + nz = (N - 2) * 3 + 4; + + cudaMallocManaged((void **)&I, sizeof(int) * (N + 1)); + cudaMallocManaged((void **)&J, sizeof(int) * nz); + cudaMallocManaged((void **)&val, sizeof(float) * nz); + + genTridiag(I, J, val, N, nz); + + cudaMallocManaged((void **)&x, sizeof(float) * N); + cudaMallocManaged((void **)&rhs, sizeof(float) * N); + + for (int i = 0; i < N; i++) { + rhs[i] = 1.0; + x[i] = 0.0; + } + + /* Get handle to the CUBLAS context */ + cublasHandle_t cublasHandle = 0; + cublasStatus_t cublasStatus; + cublasStatus = cublasCreate(&cublasHandle); + + checkCudaErrors(cublasStatus); + + /* Get handle to the CUSPARSE context */ + cusparseHandle_t cusparseHandle = 0; + cusparseStatus_t cusparseStatus; + cusparseStatus = cusparseCreate(&cusparseHandle); + + checkCudaErrors(cusparseStatus); + + cusparseMatDescr_t descr = 0; + cusparseStatus = cusparseCreateMatDescr(&descr); + + checkCudaErrors(cusparseStatus); + + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); + + // temp memory for CG + checkCudaErrors(cudaMallocManaged((void **)&r, N * sizeof(float))); + checkCudaErrors(cudaMallocManaged((void **)&p, N * sizeof(float))); + checkCudaErrors(cudaMallocManaged((void **)&Ax, N * sizeof(float))); + + /* Wrap raw data into cuSPARSE generic API objects */ + cusparseSpMatDescr_t matA = NULL; + checkCudaErrors(cusparseCreateCsr( + &matA, N, N, nz, I, J, val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + cusparseDnVecDescr_t vecx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecx, N, x, CUDA_R_32F)); + cusparseDnVecDescr_t vecp = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecp, N, p, CUDA_R_32F)); + cusparseDnVecDescr_t vecAx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecAx, N, Ax, CUDA_R_32F)); - r0 = r1; - cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); cudaDeviceSynchronize(); - printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); - k++; - } - printf("Final residual: %e\n", sqrt(r1)); - - fprintf(stdout, "&&&& conjugateGradientUM %s\n", - (sqrt(r1) < tol) ? 
"PASSED" : "FAILED"); - - float rsum, diff, err = 0.0; - - for (int i = 0; i < N; i++) { - rsum = 0.0; - - for (int j = I[i]; j < I[i + 1]; j++) { - rsum += val[j] * x[J[j]]; + for (int i = 0; i < N; i++) { + r[i] = rhs[i]; } - diff = fabs(rsum - rhs[i]); + alpha = 1.0; + alpham1 = -1.0; + beta = 0.0; + r0 = 0.; - if (diff > err) { - err = diff; + /* Allocate workspace for cuSPARSE */ + size_t bufferSize = 0; + checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecx, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + void *buffer = NULL; + checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecx, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1); + cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); + + k = 1; + + while (r1 > tol * tol && k <= max_iter) { + if (k > 1) { + b = r1 / r0; + cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1); + cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1); + } + else { + cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1); + } + + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + vecp, + &beta, + vecAx, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot); + a = r1 / dot; + + cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1); + na = -a; + cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1); + + r0 = r1; + cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); + cudaDeviceSynchronize(); + printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); + k++; } - } - cusparseDestroy(cusparseHandle); - cublasDestroy(cublasHandle); - if (matA) { - checkCudaErrors(cusparseDestroySpMat(matA)); - } - if (vecx) { - checkCudaErrors(cusparseDestroyDnVec(vecx)); - } - if (vecAx) { - checkCudaErrors(cusparseDestroyDnVec(vecAx)); - } - if (vecp) { - checkCudaErrors(cusparseDestroyDnVec(vecp)); - } + printf("Final residual: %e\n", sqrt(r1)); - cudaFree(I); - cudaFree(J); - cudaFree(val); - cudaFree(x); - cudaFree(rhs); - cudaFree(r); - cudaFree(p); - cudaFree(Ax); + fprintf(stdout, "&&&& conjugateGradientUM %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED"); - printf("Test Summary: Error amount = %f, result = %s\n", err, - (k <= max_iter) ? "SUCCESS" : "FAILURE"); - exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE); + float rsum, diff, err = 0.0; + + for (int i = 0; i < N; i++) { + rsum = 0.0; + + for (int j = I[i]; j < I[i + 1]; j++) { + rsum += val[j] * x[J[j]]; + } + + diff = fabs(rsum - rhs[i]); + + if (diff > err) { + err = diff; + } + } + + cusparseDestroy(cusparseHandle); + cublasDestroy(cublasHandle); + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + if (vecp) { + checkCudaErrors(cusparseDestroyDnVec(vecp)); + } + + cudaFree(I); + cudaFree(J); + cudaFree(val); + cudaFree(x); + cudaFree(rhs); + cudaFree(r); + cudaFree(p); + cudaFree(Ax); + + printf("Test Summary: Error amount = %f, result = %s\n", err, (k <= max_iter) ? "SUCCESS" : "FAILURE"); + exit((k <= max_iter) ? 
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp index 156e7ec2..4153cb4f 100644 --- a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp @@ -56,529 +56,532 @@ #include <assert.h> #include <ctype.h> +#include <cuda_runtime.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <cuda_runtime.h> - #include "cublas_v2.h" #include "cusolverDn.h" #include "helper_cuda.h" - #include "helper_cusolver.h" template <typename T_ELEM> -int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m, - int *n, int *nnz, T_ELEM **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -void UsageDN(void) { - printf("\n"); - printf("-h : display this help\n"); - printf("-R=<name> : choose a linear solver\n"); - printf(" chol (cholesky factorization), this is default\n"); - printf(" qr (QR factorization)\n"); - printf(" lu (LU factorization)\n"); - printf("-lda=<int> : leading dimension of A , m by default\n"); - printf("-file=<filename>: filename containing a matrix in MM format\n"); - printf("-device=<device_id> : if want to run on specific GPU\n"); +void UsageDN(void) +{ + printf("\n"); + printf("-h : display this help\n"); + printf("-R=<name> : choose a linear solver\n"); + printf(" chol (Cholesky factorization), this is the default\n"); + printf(" qr (QR factorization)\n"); + printf(" lu (LU factorization)\n"); + printf("-lda=<int> : leading dimension of A , m by default\n"); + printf("-file=<filename>: filename containing a matrix in MM format\n"); + printf("-device=<device_id> : if you want to run on a specific GPU\n"); - exit(0); + exit(0); } /* * solve A*x = b by Cholesky factorization * */ -int linearSolverCHOL(cusolverDnHandle_t handle, int n, const double *Acopy, - int lda, const double *b, double *x) { - int bufferSize = 0; - int *info = NULL; - double *buffer = NULL; - double *A = NULL; - int h_info = 0; - double start, stop; - double time_solve; - cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; +int linearSolverCHOL(cusolverDnHandle_t handle, int n, const double *Acopy, int lda, const double *b, double *x) +{ + int bufferSize = 0; + int *info = NULL; + double *buffer = NULL; + double *A = NULL; + int h_info = 0; + double start, stop; + double time_solve; + cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; - checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double *)Acopy, - lda, &bufferSize)); + checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double *)Acopy, lda, &bufferSize)); - checkCudaErrors(cudaMalloc(&info, sizeof(int))); - checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize)); - checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n)); + checkCudaErrors(cudaMalloc(&info, sizeof(int))); + checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize)); + checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n)); - // prepare a copy of A because potrf will overwrite A with L - checkCudaErrors( - cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaMemset(info, 0, sizeof(int))); + // prepare a copy of A because potrf will overwrite A with L + checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemset(info, 0, sizeof(int))); - start = second(); - start 
= second(); + start = second(); + start = second(); - checkCudaErrors( - cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info)); + checkCudaErrors(cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info)); - checkCudaErrors( - cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost)); - if (0 != h_info) { - fprintf(stderr, "Error: Cholesky factorization failed\n"); - } + if (0 != h_info) { + fprintf(stderr, "Error: Cholesky factorization failed\n"); + } - checkCudaErrors( - cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice)); - checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info)); + checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info)); - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); - time_solve = stop - start; - fprintf(stdout, "timing: cholesky = %10.6f sec\n", time_solve); + time_solve = stop - start; + fprintf(stdout, "timing: cholesky = %10.6f sec\n", time_solve); - if (info) { - checkCudaErrors(cudaFree(info)); - } - if (buffer) { - checkCudaErrors(cudaFree(buffer)); - } - if (A) { - checkCudaErrors(cudaFree(A)); - } + if (info) { + checkCudaErrors(cudaFree(info)); + } + if (buffer) { + checkCudaErrors(cudaFree(buffer)); + } + if (A) { + checkCudaErrors(cudaFree(A)); + } - return 0; + return 0; } /* * solve A*x = b by LU with partial pivoting * */ -int linearSolverLU(cusolverDnHandle_t handle, int n, const double *Acopy, - int lda, const double *b, double *x) { - int bufferSize = 0; - int *info = NULL; - double *buffer = NULL; - double *A = NULL; - int *ipiv = NULL; // pivoting sequence - int h_info = 0; - double start, stop; - double time_solve; +int linearSolverLU(cusolverDnHandle_t handle, int n, const double *Acopy, int lda, const double *b, double *x) +{ + int bufferSize = 0; + int *info = NULL; + double *buffer = NULL; + double *A = NULL; + int *ipiv = NULL; // pivoting sequence + int h_info = 0; + double start, stop; + double time_solve; - checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double *)Acopy, - lda, &bufferSize)); + checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double *)Acopy, lda, &bufferSize)); - checkCudaErrors(cudaMalloc(&info, sizeof(int))); - checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize)); - checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n)); - checkCudaErrors(cudaMalloc(&ipiv, sizeof(int) * n)); + checkCudaErrors(cudaMalloc(&info, sizeof(int))); + checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize)); + checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n)); + checkCudaErrors(cudaMalloc(&ipiv, sizeof(int) * n)); - // prepare a copy of A because getrf will overwrite A with L - checkCudaErrors( - cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaMemset(info, 0, sizeof(int))); + // prepare a copy of A because getrf will overwrite A with L + checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemset(info, 0, sizeof(int))); - start = second(); - start = second(); + start = second(); + start = second(); - checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info)); - checkCudaErrors( - cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost)); + 
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info)); + checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost)); - if (0 != h_info) { - fprintf(stderr, "Error: LU factorization failed\n"); - } + if (0 != h_info) { + fprintf(stderr, "Error: LU factorization failed\n"); + } - checkCudaErrors( - cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice)); - checkCudaErrors( - cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info)); - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); + checkCudaErrors(cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info)); + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); - time_solve = stop - start; - fprintf(stdout, "timing: LU = %10.6f sec\n", time_solve); + time_solve = stop - start; + fprintf(stdout, "timing: LU = %10.6f sec\n", time_solve); - if (info) { - checkCudaErrors(cudaFree(info)); - } - if (buffer) { - checkCudaErrors(cudaFree(buffer)); - } - if (A) { - checkCudaErrors(cudaFree(A)); - } - if (ipiv) { - checkCudaErrors(cudaFree(ipiv)); - } + if (info) { + checkCudaErrors(cudaFree(info)); + } + if (buffer) { + checkCudaErrors(cudaFree(buffer)); + } + if (A) { + checkCudaErrors(cudaFree(A)); + } + if (ipiv) { + checkCudaErrors(cudaFree(ipiv)); + } - return 0; + return 0; } /* * solve A*x = b by QR * */ -int linearSolverQR(cusolverDnHandle_t handle, int n, const double *Acopy, - int lda, const double *b, double *x) { - cublasHandle_t cublasHandle = NULL; // used in residual evaluation - int bufferSize = 0; - int bufferSize_geqrf = 0; - int bufferSize_ormqr = 0; - int *info = NULL; - double *buffer = NULL; - double *A = NULL; - double *tau = NULL; - int h_info = 0; - double start, stop; - double time_solve; - const double one = 1.0; +int linearSolverQR(cusolverDnHandle_t handle, int n, const double *Acopy, int lda, const double *b, double *x) +{ + cublasHandle_t cublasHandle = NULL; // used in residual evaluation + int bufferSize = 0; + int bufferSize_geqrf = 0; + int bufferSize_ormqr = 0; + int *info = NULL; + double *buffer = NULL; + double *A = NULL; + double *tau = NULL; + int h_info = 0; + double start, stop; + double time_solve; + const double one = 1.0; - checkCudaErrors(cublasCreate(&cublasHandle)); + checkCudaErrors(cublasCreate(&cublasHandle)); - checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double *)Acopy, - lda, &bufferSize_geqrf)); - checkCudaErrors(cusolverDnDormqr_bufferSize(handle, CUBLAS_SIDE_LEFT, - CUBLAS_OP_T, n, 1, n, A, lda, - NULL, x, n, &bufferSize_ormqr)); + checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double *)Acopy, lda, &bufferSize_geqrf)); + checkCudaErrors(cusolverDnDormqr_bufferSize( + handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1, n, A, lda, NULL, x, n, &bufferSize_ormqr)); - printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf, - bufferSize_ormqr); + printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf, bufferSize_ormqr); - bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf - : bufferSize_ormqr; + bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? 
bufferSize_geqrf : bufferSize_ormqr; - checkCudaErrors(cudaMalloc(&info, sizeof(int))); - checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize)); - checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n)); - checkCudaErrors(cudaMalloc((void **)&tau, sizeof(double) * n)); + checkCudaErrors(cudaMalloc(&info, sizeof(int))); + checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize)); + checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n)); + checkCudaErrors(cudaMalloc((void **)&tau, sizeof(double) * n)); - // prepare a copy of A because getrf will overwrite A with L - checkCudaErrors( - cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice)); + // prepare a copy of A because geqrf will overwrite A + checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaMemset(info, 0, sizeof(int))); + checkCudaErrors(cudaMemset(info, 0, sizeof(int))); - start = second(); - start = second(); + start = second(); + start = second(); - // compute QR factorization - checkCudaErrors( - cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info)); + // compute QR factorization + checkCudaErrors(cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info)); - checkCudaErrors( - cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost)); - if (0 != h_info) { - fprintf(stderr, "Error: LU factorization failed\n"); - } + if (0 != h_info) { + fprintf(stderr, "Error: QR factorization failed\n"); + } - checkCudaErrors( - cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice)); - // compute Q^T*b - checkCudaErrors(cusolverDnDormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1, - n, A, lda, tau, x, n, buffer, bufferSize, - info)); + // compute Q^T*b + checkCudaErrors( + cusolverDnDormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1, n, A, lda, tau, x, n, buffer, bufferSize, info)); - // x = R \ Q^T*b - checkCudaErrors(cublasDtrsm(cublasHandle, CUBLAS_SIDE_LEFT, - CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, - CUBLAS_DIAG_NON_UNIT, n, 1, &one, A, lda, x, n)); - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); + // x = R \ Q^T*b + checkCudaErrors(cublasDtrsm(cublasHandle, + CUBLAS_SIDE_LEFT, + CUBLAS_FILL_MODE_UPPER, + CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, + n, + 1, + &one, + A, + lda, + x, + n)); + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); - time_solve = stop - start; - fprintf(stdout, "timing: QR = %10.6f sec\n", time_solve); + time_solve = stop - start; + fprintf(stdout, "timing: QR = %10.6f sec\n", time_solve); - if (cublasHandle) { - checkCudaErrors(cublasDestroy(cublasHandle)); - } - if (info) { - checkCudaErrors(cudaFree(info)); - } - if (buffer) { - checkCudaErrors(cudaFree(buffer)); - } - if (A) { - checkCudaErrors(cudaFree(A)); - } - if (tau) { - checkCudaErrors(cudaFree(tau)); - } + if (cublasHandle) { + checkCudaErrors(cublasDestroy(cublasHandle)); + } + if (info) { + checkCudaErrors(cudaFree(info)); + } + if (buffer) { + checkCudaErrors(cudaFree(buffer)); + } + if (A) { + checkCudaErrors(cudaFree(A)); + } + if (tau) { + checkCudaErrors(cudaFree(tau)); + } - return 0; + return 0; } -void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) { - memset(&opts, 0, sizeof(opts)); +void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) +{ + memset(&opts, 0, 
sizeof(opts)); - if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { - UsageDN(); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "R")) { - char *solverType = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType); - - if (solverType) { - if ((STRCASECMP(solverType, "chol") != 0) && - (STRCASECMP(solverType, "lu") != 0) && - (STRCASECMP(solverType, "qr") != 0)) { - printf("\nIncorrect argument passed to -R option\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { UsageDN(); - } else { - opts.testFunc = solverType; - } } - } - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - char *fileName = 0; - getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); + if (checkCmdLineFlag(argc, (const char **)argv, "R")) { + char *solverType = NULL; + getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType); - if (fileName) { - opts.sparse_mat_filename = fileName; - } else { - printf("\nIncorrect filename passed to -file \n "); - UsageDN(); - } - } - - if (checkCmdLineFlag(argc, (const char **)argv, "lda")) { - opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda"); - } -} - -int main(int argc, char *argv[]) { - struct testOpts opts; - cusolverDnHandle_t handle = NULL; - cublasHandle_t cublasHandle = NULL; // used in residual evaluation - cudaStream_t stream = NULL; - - int rowsA = 0; // number of rows of A - int colsA = 0; // number of columns of A - int nnzA = 0; // number of nonzeros of A - int baseA = 0; // base index in CSR format - int lda = 0; // leading dimension in dense matrix - - // CSR(A) from I/O - int *h_csrRowPtrA = NULL; - int *h_csrColIndA = NULL; - double *h_csrValA = NULL; - - double *h_A = NULL; // dense matrix from CSR(A) - double *h_x = NULL; // a copy of d_x - double *h_b = NULL; // b = ones(m,1) - double *h_r = NULL; // r = b - A*x, a copy of d_r - - double *d_A = NULL; // a copy of h_A - double *d_x = NULL; // x = A \ b - double *d_b = NULL; // a copy of h_b - double *d_r = NULL; // r = b - A*x - - // the constants are used in residual evaluation, r = b - A*x - const double minus_one = -1.0; - const double one = 1.0; - - double x_inf = 0.0; - double r_inf = 0.0; - double A_inf = 0.0; - int errors = 0; - - parseCommandLineArguments(argc, argv, opts); - - if (NULL == opts.testFunc) { - opts.testFunc = "chol"; // By default running Cholesky as NO solver - // selected with -R option. - } - - findCudaDevice(argc, (const char **)argv); - - printf("step 1: read matrix market format\n"); - - if (opts.sparse_mat_filename == NULL) { - opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]); - if (opts.sparse_mat_filename != NULL) - printf("Using default input file [%s]\n", opts.sparse_mat_filename); - else - printf("Could not find gr_900_900_crg.mtx\n"); - } else { - printf("Using input file [%s]\n", opts.sparse_mat_filename); - } - - if (opts.sparse_mat_filename == NULL) { - fprintf(stderr, "Error: input matrix is not provided\n"); - return EXIT_FAILURE; - } - - if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true, &rowsA, - &colsA, &nnzA, &h_csrValA, &h_csrRowPtrA, - &h_csrColIndA, true)) { - exit(EXIT_FAILURE); - } - baseA = h_csrRowPtrA[0]; // baseA = {0,1} - - printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, - nnzA, baseA); - - if (rowsA != colsA) { - fprintf(stderr, "Error: only support square matrix\n"); - exit(EXIT_FAILURE); - } - - printf("step 2: convert CSR(A) to dense matrix\n"); - - lda = opts.lda ? 
opts.lda : rowsA; - if (lda < rowsA) { - fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n"); - exit(EXIT_FAILURE); - } - - h_A = (double *)malloc(sizeof(double) * lda * colsA); - h_x = (double *)malloc(sizeof(double) * colsA); - h_b = (double *)malloc(sizeof(double) * rowsA); - h_r = (double *)malloc(sizeof(double) * rowsA); - assert(NULL != h_A); - assert(NULL != h_x); - assert(NULL != h_b); - assert(NULL != h_r); - - memset(h_A, 0, sizeof(double) * lda * colsA); - - for (int row = 0; row < rowsA; row++) { - const int start = h_csrRowPtrA[row] - baseA; - const int end = h_csrRowPtrA[row + 1] - baseA; - for (int colidx = start; colidx < end; colidx++) { - const int col = h_csrColIndA[colidx] - baseA; - const double Areg = h_csrValA[colidx]; - h_A[row + col * lda] = Areg; - } - } - - printf("step 3: set right hand side vector (b) to 1\n"); - for (int row = 0; row < rowsA; row++) { - h_b[row] = 1.0; - } - - // verify if A is symmetric or not. - if (0 == strcmp(opts.testFunc, "chol")) { - int issym = 1; - for (int j = 0; j < colsA; j++) { - for (int i = j; i < rowsA; i++) { - double Aij = h_A[i + j * lda]; - double Aji = h_A[j + i * lda]; - if (Aij != Aji) { - issym = 0; - break; + if (solverType) { + if ((STRCASECMP(solverType, "chol") != 0) && (STRCASECMP(solverType, "lu") != 0) + && (STRCASECMP(solverType, "qr") != 0)) { + printf("\nIncorrect argument passed to -R option\n"); + UsageDN(); + } + else { + opts.testFunc = solverType; + } } - } } - if (!issym) { - printf("Error: A has no symmetric pattern, please use LU or QR \n"); - exit(EXIT_FAILURE); + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + char *fileName = 0; + getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); + + if (fileName) { + opts.sparse_mat_filename = fileName; + } + else { + printf("\nIncorrect filename passed to -file \n "); + UsageDN(); + } } - } - checkCudaErrors(cusolverDnCreate(&handle)); - checkCudaErrors(cublasCreate(&cublasHandle)); - checkCudaErrors(cudaStreamCreate(&stream)); - - checkCudaErrors(cusolverDnSetStream(handle, stream)); - checkCudaErrors(cublasSetStream(cublasHandle, stream)); - - checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double) * lda * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); - - printf("step 4: prepare data on device\n"); - checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double) * lda * colsA, - cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - - printf("step 5: solve A*x = b \n"); - // d_A and d_b are read-only - if (0 == strcmp(opts.testFunc, "chol")) { - linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x); - } else if (0 == strcmp(opts.testFunc, "lu")) { - linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x); - } else if (0 == strcmp(opts.testFunc, "qr")) { - linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x); - } else { - fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc); - exit(EXIT_FAILURE); - } - printf("step 6: evaluate residual\n"); - checkCudaErrors( - cudaMemcpy(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice)); - - // r = b - A*x - checkCudaErrors(cublasDgemm_v2(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, rowsA, - 1, colsA, &minus_one, d_A, lda, d_x, rowsA, - &one, d_r, rowsA)); - - checkCudaErrors( - cudaMemcpy(h_x, d_x, sizeof(double) * colsA, 
cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); - - x_inf = vec_norminf(colsA, h_x); - r_inf = vec_norminf(rowsA, h_r); - A_inf = mat_norminf(rowsA, colsA, h_A, lda); - - printf("|b - A*x| = %E \n", r_inf); - printf("|A| = %E \n", A_inf); - printf("|x| = %E \n", x_inf); - printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); - - if (handle) { - checkCudaErrors(cusolverDnDestroy(handle)); - } - if (cublasHandle) { - checkCudaErrors(cublasDestroy(cublasHandle)); - } - if (stream) { - checkCudaErrors(cudaStreamDestroy(stream)); - } - - if (h_csrValA) { - free(h_csrValA); - } - if (h_csrRowPtrA) { - free(h_csrRowPtrA); - } - if (h_csrColIndA) { - free(h_csrColIndA); - } - - if (h_A) { - free(h_A); - } - if (h_x) { - free(h_x); - } - if (h_b) { - free(h_b); - } - if (h_r) { - free(h_r); - } - - if (d_A) { - checkCudaErrors(cudaFree(d_A)); - } - if (d_x) { - checkCudaErrors(cudaFree(d_x)); - } - if (d_b) { - checkCudaErrors(cudaFree(d_b)); - } - if (d_r) { - checkCudaErrors(cudaFree(d_r)); - } - - return 0; + if (checkCmdLineFlag(argc, (const char **)argv, "lda")) { + opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda"); + } +} + +int main(int argc, char *argv[]) +{ + struct testOpts opts; + cusolverDnHandle_t handle = NULL; + cublasHandle_t cublasHandle = NULL; // used in residual evaluation + cudaStream_t stream = NULL; + + int rowsA = 0; // number of rows of A + int colsA = 0; // number of columns of A + int nnzA = 0; // number of nonzeros of A + int baseA = 0; // base index in CSR format + int lda = 0; // leading dimension in dense matrix + + // CSR(A) from I/O + int *h_csrRowPtrA = NULL; + int *h_csrColIndA = NULL; + double *h_csrValA = NULL; + + double *h_A = NULL; // dense matrix from CSR(A) + double *h_x = NULL; // a copy of d_x + double *h_b = NULL; // b = ones(m,1) + double *h_r = NULL; // r = b - A*x, a copy of d_r + + double *d_A = NULL; // a copy of h_A + double *d_x = NULL; // x = A \ b + double *d_b = NULL; // a copy of h_b + double *d_r = NULL; // r = b - A*x + + // the constants are used in residual evaluation, r = b - A*x + const double minus_one = -1.0; + const double one = 1.0; + + double x_inf = 0.0; + double r_inf = 0.0; + double A_inf = 0.0; + int errors = 0; + + parseCommandLineArguments(argc, argv, opts); + + if (NULL == opts.testFunc) { + opts.testFunc = "chol"; // By default running Cholesky as NO solver + // selected with -R option. 
+ } + + findCudaDevice(argc, (const char **)argv); + + printf("step 1: read Matrix Market format\n"); + + if (opts.sparse_mat_filename == NULL) { + opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]); + if (opts.sparse_mat_filename != NULL) + printf("Using default input file [%s]\n", opts.sparse_mat_filename); + else + printf("Could not find gr_900_900_crg.mtx\n"); + } + else { + printf("Using input file [%s]\n", opts.sparse_mat_filename); + } + + if (opts.sparse_mat_filename == NULL) { + fprintf(stderr, "Error: input matrix is not provided\n"); + return EXIT_FAILURE; + } + + if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, + 'd', + true, + &rowsA, + &colsA, + &nnzA, + &h_csrValA, + &h_csrRowPtrA, + &h_csrColIndA, + true)) { + exit(EXIT_FAILURE); + } + baseA = h_csrRowPtrA[0]; // baseA = {0,1} + + printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA); + + if (rowsA != colsA) { + fprintf(stderr, "Error: only square matrices are supported\n"); + exit(EXIT_FAILURE); + } + + printf("step 2: convert CSR(A) to dense matrix\n"); + + lda = opts.lda ? opts.lda : rowsA; + if (lda < rowsA) { + fprintf(stderr, "Error: lda must be greater than or equal to the dimension of A\n"); + exit(EXIT_FAILURE); + } + + h_A = (double *)malloc(sizeof(double) * lda * colsA); + h_x = (double *)malloc(sizeof(double) * colsA); + h_b = (double *)malloc(sizeof(double) * rowsA); + h_r = (double *)malloc(sizeof(double) * rowsA); + assert(NULL != h_A); + assert(NULL != h_x); + assert(NULL != h_b); + assert(NULL != h_r); + + memset(h_A, 0, sizeof(double) * lda * colsA); + + for (int row = 0; row < rowsA; row++) { + const int start = h_csrRowPtrA[row] - baseA; + const int end = h_csrRowPtrA[row + 1] - baseA; + for (int colidx = start; colidx < end; colidx++) { + const int col = h_csrColIndA[colidx] - baseA; + const double Areg = h_csrValA[colidx]; + h_A[row + col * lda] = Areg; + } + } + + printf("step 3: set right hand side vector (b) to 1\n"); + for (int row = 0; row < rowsA; row++) { + h_b[row] = 1.0; + } + + // verify if A is symmetric or not. 
+ if (0 == strcmp(opts.testFunc, "chol")) { + int issym = 1; + for (int j = 0; j < colsA; j++) { + for (int i = j; i < rowsA; i++) { + double Aij = h_A[i + j * lda]; + double Aji = h_A[j + i * lda]; + if (Aij != Aji) { + issym = 0; + break; + } + } + } + if (!issym) { + printf("Error: A has no symmetric pattern, please use LU or QR \n"); + exit(EXIT_FAILURE); + } + } + + checkCudaErrors(cusolverDnCreate(&handle)); + checkCudaErrors(cublasCreate(&cublasHandle)); + checkCudaErrors(cudaStreamCreate(&stream)); + + checkCudaErrors(cusolverDnSetStream(handle, stream)); + checkCudaErrors(cublasSetStream(cublasHandle, stream)); + + checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double) * lda * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); + + printf("step 4: prepare data on device\n"); + checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double) * lda * colsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + + printf("step 5: solve A*x = b \n"); + // d_A and d_b are read-only + if (0 == strcmp(opts.testFunc, "chol")) { + linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x); + } + else if (0 == strcmp(opts.testFunc, "lu")) { + linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x); + } + else if (0 == strcmp(opts.testFunc, "qr")) { + linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x); + } + else { + fprintf(stderr, "Error: %s is an unknown function\n", opts.testFunc); + exit(EXIT_FAILURE); + } + printf("step 6: evaluate residual\n"); + checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice)); + + // r = b - A*x + checkCudaErrors(cublasDgemm_v2( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, rowsA, 1, colsA, &minus_one, d_A, lda, d_x, rowsA, &one, d_r, rowsA)); + + checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); + + x_inf = vec_norminf(colsA, h_x); + r_inf = vec_norminf(rowsA, h_r); + A_inf = mat_norminf(rowsA, colsA, h_A, lda); + + printf("|b - A*x| = %E \n", r_inf); + printf("|A| = %E \n", A_inf); + printf("|x| = %E \n", x_inf); + printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); + + if (handle) { + checkCudaErrors(cusolverDnDestroy(handle)); + } + if (cublasHandle) { + checkCudaErrors(cublasDestroy(cublasHandle)); + } + if (stream) { + checkCudaErrors(cudaStreamDestroy(stream)); + } + + if (h_csrValA) { + free(h_csrValA); + } + if (h_csrRowPtrA) { + free(h_csrRowPtrA); + } + if (h_csrColIndA) { + free(h_csrColIndA); + } + + if (h_A) { + free(h_A); + } + if (h_x) { + free(h_x); + } + if (h_b) { + free(h_b); + } + if (h_r) { + free(h_r); + } + + if (d_A) { + checkCudaErrors(cudaFree(d_A)); + } + if (d_x) { + checkCudaErrors(cudaFree(d_x)); + } + if (d_b) { + checkCudaErrors(cudaFree(d_b)); + } + if (d_r) { + checkCudaErrors(cudaFree(d_r)); + } + + return 0; } diff --git a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio.c b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio.c index 299625e1..10852428 100644 --- a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio.c +++ b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio.c @@ -1,128 +1,129 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. 
-* -* -*/ +/* + * Matrix Market I/O library for ANSI C + * + * See http://math.nist.gov/MatrixMarket for details. + * + * + */ /* avoid Windows warnings (for example: strcpy, fscanf, etc.) */ -#if defined(_WIN32) +#if defined(_WIN32) #define _CRT_SECURE_NO_WARNINGS #endif -#include <stdio.h> -#include <string.h> -#include <stdlib.h> +// System includes #include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +// Project includes #include "mmio.h" -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_) + +int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_) { - FILE *f; + FILE *f; MM_typecode matcode; - int M, N, nz; - int i; - double *val; - int *I, *J; - + int M, N, nz; + int i; + double *val; + int *I, *J; + if ((f = fopen(fname, "r")) == NULL) - return -1; - - - if (mm_read_banner(f, &matcode) != 0) - { + return -1; + + + if (mm_read_banner(f, &matcode) != 0) { printf("mm_read_unsymmetric: Could not process Matrix Market banner "); printf(" in file [%s]\n", fname); return -1; } - - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { + + + if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) { fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); + fprintf(stderr, "Matrix Market type: [%s]\n", mm_typecode_to_str(matcode)); return -1; } - + /* find out size of sparse matrix: M, N, nz .... */ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) { fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); return -1; } - - *M_ = M; - *N_ = N; + + *M_ = M; + *N_ = N; *nz_ = nz; - + /* reserve memory for matrices */ - - I = (int *) malloc(nz * sizeof(int)); - J = (int *) malloc(nz * sizeof(int)); - val = (double *) malloc(nz * sizeof(double)); - + + I = (int *)malloc(nz * sizeof(int)); + J = (int *)malloc(nz * sizeof(int)); + val = (double *)malloc(nz * sizeof(double)); + *val_ = val; - *I_ = I; - *J_ = J; - + *I_ = I; + *J_ = J; + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ - - for (i=0; i + #if defined(__cplusplus) -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -#define MM_MAX_LINE_LENGTH 1025 -#define MatrixMarketBanner "%%MatrixMarket" +#define MM_MAX_LINE_LENGTH 1025 +#define MatrixMarketBanner "%%MatrixMarket" #define MM_MAX_TOKEN_LENGTH 64 -typedef char MM_typecode[4]; + typedef char MM_typecode[4]; -char *mm_typecode_to_str(MM_typecode matcode); + char *mm_typecode_to_str(MM_typecode matcode); -int mm_read_banner(FILE *f, MM_typecode *matcode); -int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); -int mm_read_mtx_array_size(FILE *f, int *M, int *N); + int mm_read_banner(FILE *f, MM_typecode *matcode); + int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); + int mm_read_mtx_array_size(FILE *f, int *M, int *N); -int mm_write_banner(FILE *f, MM_typecode matcode); -int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); -int mm_write_mtx_array_size(FILE *f, int M, int N); + int mm_write_banner(FILE *f, MM_typecode matcode); + int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); + int mm_write_mtx_array_size(FILE *f, int M, int N); -/********************* MM_typecode query fucntions ***************************/ + /********************* MM_typecode query functions ***************************/ -#define mm_is_matrix(typecode) ((typecode)[0]=='M') +#define mm_is_matrix(typecode) ((typecode)[0] == 'M') -#define mm_is_sparse(typecode) ((typecode)[1]=='C') -#define mm_is_coordinate(typecode)((typecode)[1]=='C') -#define mm_is_dense(typecode) ((typecode)[1]=='A') -#define mm_is_array(typecode) ((typecode)[1]=='A') +#define mm_is_sparse(typecode) ((typecode)[1] == 'C') +#define mm_is_coordinate(typecode) ((typecode)[1] == 'C') +#define mm_is_dense(typecode) ((typecode)[1] == 'A') +#define mm_is_array(typecode) ((typecode)[1] == 'A') -#define mm_is_complex(typecode) ((typecode)[2]=='C') -#define mm_is_real(typecode) ((typecode)[2]=='R') -#define mm_is_pattern(typecode) ((typecode)[2]=='P') -#define mm_is_integer(typecode) ((typecode)[2]=='I') +#define mm_is_complex(typecode) ((typecode)[2] == 'C') +#define mm_is_real(typecode) ((typecode)[2] == 'R') +#define mm_is_pattern(typecode) ((typecode)[2] == 'P') +#define mm_is_integer(typecode) ((typecode)[2] == 'I') -#define mm_is_symmetric(typecode)((typecode)[3]=='S') -#define mm_is_general(typecode) ((typecode)[3]=='G') -#define mm_is_skew(typecode) ((typecode)[3]=='K') -#define mm_is_hermitian(typecode)((typecode)[3]=='H') +#define mm_is_symmetric(typecode) ((typecode)[3] == 'S') +#define mm_is_general(typecode) ((typecode)[3] == 'G') +#define mm_is_skew(typecode) ((typecode)[3] == 'K') +#define mm_is_hermitian(typecode) ((typecode)[3] == 'H') -int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ + int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ -/********************* MM_typecode modify fucntions ***************************/ + /********************* MM_typecode modify functions ***************************/ -#define mm_set_matrix(typecode) ((*typecode)[0]='M') -#define mm_set_coordinate(typecode) ((*typecode)[1]='C') -#define mm_set_array(typecode) ((*typecode)[1]='A') -#define mm_set_dense(typecode) mm_set_array(typecode) -#define mm_set_sparse(typecode) mm_set_coordinate(typecode) +#define mm_set_matrix(typecode) ((*typecode)[0] = 'M') +#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C') +#define mm_set_array(typecode) ((*typecode)[1] = 'A') +#define mm_set_dense(typecode) mm_set_array(typecode) +#define mm_set_sparse(typecode) 
mm_set_coordinate(typecode) -#define mm_set_complex(typecode)((*typecode)[2]='C') -#define mm_set_real(typecode) ((*typecode)[2]='R') -#define mm_set_pattern(typecode)((*typecode)[2]='P') -#define mm_set_integer(typecode)((*typecode)[2]='I') +#define mm_set_complex(typecode) ((*typecode)[2] = 'C') +#define mm_set_real(typecode) ((*typecode)[2] = 'R') +#define mm_set_pattern(typecode) ((*typecode)[2] = 'P') +#define mm_set_integer(typecode) ((*typecode)[2] = 'I') -#define mm_set_symmetric(typecode)((*typecode)[3]='S') -#define mm_set_general(typecode)((*typecode)[3]='G') -#define mm_set_skew(typecode) ((*typecode)[3]='K') -#define mm_set_hermitian(typecode)((*typecode)[3]='H') +#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') +#define mm_set_general(typecode) ((*typecode)[3] = 'G') +#define mm_set_skew(typecode) ((*typecode)[3] = 'K') +#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') -#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \ - (*typecode)[2]=' ',(*typecode)[3]='G') +#define mm_clear_typecode(typecode) ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) -/********************* Matrix Market error codes ***************************/ + /********************* Matrix Market error codes ***************************/ -#define MM_COULD_NOT_READ_FILE 11 -#define MM_PREMATURE_EOF 12 -#define MM_NOT_MTX 13 -#define MM_NO_HEADER 14 -#define MM_UNSUPPORTED_TYPE 15 -#define MM_LINE_TOO_LONG 16 -#define MM_COULD_NOT_WRITE_FILE 17 +#define MM_COULD_NOT_READ_FILE 11 +#define MM_PREMATURE_EOF 12 +#define MM_NOT_MTX 13 +#define MM_NO_HEADER 14 +#define MM_UNSUPPORTED_TYPE 15 +#define MM_LINE_TOO_LONG 16 +#define MM_COULD_NOT_WRITE_FILE 17 -/******************** Matrix Market internal definitions ******************** + /******************** Matrix Market internal definitions ******************** - MM_matrix_typecode: 4-character sequence + MM_matrix_typecode: 4-character sequence - ojbect sparse/ data storage - dense type scheme + object sparse/ data storage + dense type scheme - string position: [0] [1] [2] [3] + string position: [0] [1] [2] [3] - Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) - A(array) C(omplex) H(ermitian) - P(attern) S(ymmetric) - I(nteger) K(kew) + Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) + A(array) C(omplex) H(ermitian) + P(attern) S(ymmetric) + I(nteger) K(kew) - ***********************************************************************/ + ***********************************************************************/ -#define MM_MTX_STR "matrix" -#define MM_ARRAY_STR "array" -#define MM_DENSE_STR "array" -#define MM_COORDINATE_STR "coordinate" -#define MM_SPARSE_STR "coordinate" -#define MM_COMPLEX_STR "complex" -#define MM_REAL_STR "real" -#define MM_INT_STR "integer" -#define MM_GENERAL_STR "general" -#define MM_SYMM_STR "symmetric" -#define MM_HERM_STR "hermitian" -#define MM_SKEW_STR "skew-symmetric" -#define MM_PATTERN_STR "pattern" +#define MM_MTX_STR "matrix" +#define MM_ARRAY_STR "array" +#define MM_DENSE_STR "array" +#define MM_COORDINATE_STR "coordinate" +#define MM_SPARSE_STR "coordinate" +#define MM_COMPLEX_STR "complex" +#define MM_REAL_STR "real" +#define MM_INT_STR "integer" +#define MM_GENERAL_STR "general" +#define MM_SYMM_STR "symmetric" +#define MM_HERM_STR "hermitian" +#define MM_SKEW_STR "skew-symmetric" +#define MM_PATTERN_STR "pattern" -/* high level routines */ -int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, 
-                    double **val, MM_typecode *matcode);
+/* high level routines */
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, double **val, MM_typecode *matcode);

-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-                     double val[], MM_typecode matcode);
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-                         double val[], MM_typecode matcode);
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
-                          MM_typecode matcode);
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode);
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode);
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, MM_typecode matcode);

-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                               double **val_, int **I_, int **J_);
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_);

 #if defined(__cplusplus)
 }
-#endif /* __cplusplus */
+#endif /* __cplusplus */

 #endif
diff --git a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio_wrapper.cpp b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio_wrapper.cpp
index 6a69af55..f6f6c8e4 100644
--- a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio_wrapper.cpp
+++ b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/mmio_wrapper.cpp
@@ -1,427 +1,375 @@
+#include
+#include
 #include
 #include
-#include
 #include "mmio.h"
-#include
-
 /* avoid Windows warnings (for example: strcpy, fscanf, etc.) */
-#if defined(_WIN32)
+#if defined(_WIN32)
 #define _CRT_SECURE_NO_WARNINGS
 #endif

 /* various __inline__ __device__ function to initialize a T_ELEM */
-template <typename T_ELEM> __inline__ T_ELEM cuGet (int );
-template <> __inline__ float cuGet<float>(int x)
-{
-    return float(x);
-}
+template <typename T_ELEM> __inline__ T_ELEM cuGet(int);
+template <> __inline__ float cuGet<float>(int x) { return float(x); }

-template <> __inline__ double cuGet<double>(int x)
-{
-    return double(x);
-}
+template <> __inline__ double cuGet<double>(int x) { return double(x); }

-template <> __inline__ cuComplex cuGet<cuComplex>(int x)
-{
-    return (make_cuComplex( float(x), 0.0f ));
-}
+template <> __inline__ cuComplex cuGet<cuComplex>(int x) { return (make_cuComplex(float(x), 0.0f)); }

-template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x)
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x)
 {
     return (make_cuDoubleComplex(double(x), 0.0));
 }
+
+
+template <typename T_ELEM> __inline__ T_ELEM cuGet(int, int);
+template <> __inline__ float cuGet<float>(int x, int y) { return float(x); }
+
+template <> __inline__ double cuGet<double>(int x, int y) { return double(x); }
+
+template <> __inline__ cuComplex cuGet<cuComplex>(int x, int y) { return make_cuComplex(float(x), float(y)); }
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x, int y)
 {
-    return (make_cuDoubleComplex( double(x), 0.0 ));
+    return (make_cuDoubleComplex(double(x), double(y)));
 }

-template <typename T_ELEM> __inline__ T_ELEM cuGet (int , int );
-template <> __inline__ float cuGet<float>(int x, int y)
-{
-    return float(x);
-}
+template <typename T_ELEM> __inline__ T_ELEM cuGet(float);
+template <> __inline__ float cuGet<float>(float x) { return float(x); }

-template <> __inline__ double cuGet<double>(int x, int y)
-{
-    return double(x);
-}
+template <> __inline__ double cuGet<double>(float x) { return double(x); }

-template <> __inline__ cuComplex cuGet<cuComplex>(int x, int y)
-{
-    return make_cuComplex( float(x), float(y) );
-}
+template <> __inline__ cuComplex cuGet<cuComplex>(float x) { return (make_cuComplex(float(x), 0.0f)); }
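For orientation while reading the reformatted wrapper: the cuGet<T_ELEM> specializations above and below are the glue that converts the double values parsed out of a Matrix Market file into the four element types the cuSOLVER samples use. A minimal usage sketch, illustrative only and not part of the patch:

    #include <cuComplex.h>

    void cuGetExamples()
    {
        float           f = cuGet<float>(2);                  // 2.0f
        double          d = cuGet<double>(2.5f);              // 2.5
        cuComplex       c = cuGet<cuComplex>(1.0, -1.0);      // 1 - 1i
        cuDoubleComplex z = cuGet<cuDoubleComplex>(3.0, 4.0); // 3 + 4i
    }

The explicit <T_ELEM> argument is required because the element type appears only in the return type, so it cannot be deduced from the call arguments.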

-template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x, int y)
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x)
 {
-    return (make_cuDoubleComplex( double(x), double(y) ));
+    return (make_cuDoubleComplex(double(x), 0.0));
 }

-template <typename T_ELEM> __inline__ T_ELEM cuGet (float );
-template <> __inline__ float cuGet<float>(float x)
-{
-    return float(x);
-}
+template <typename T_ELEM> __inline__ T_ELEM cuGet(float, float);
+template <> __inline__ float cuGet<float>(float x, float y) { return float(x); }

-template <> __inline__ double cuGet<double>(float x)
-{
-    return double(x);
-}
+template <> __inline__ double cuGet<double>(float x, float y) { return double(x); }

-template <> __inline__ cuComplex cuGet<cuComplex>(float x)
-{
-    return (make_cuComplex( float(x), 0.0f ));
-}
+template <> __inline__ cuComplex cuGet<cuComplex>(float x, float y) { return (make_cuComplex(float(x), float(y))); }

-template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x)
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x, float y)
 {
-    return (make_cuDoubleComplex( double(x), 0.0 ));
+    return (make_cuDoubleComplex(double(x), double(y)));
 }

-template <typename T_ELEM> __inline__ T_ELEM cuGet (float, float );
-template <> __inline__ float cuGet<float>(float x, float y)
-{
-    return float(x);
-}
+template <typename T_ELEM> __inline__ T_ELEM cuGet(double);
+template <> __inline__ float cuGet<float>(double x) { return float(x); }

-template <> __inline__ double cuGet<double>(float x, float y)
-{
-    return double(x);
-}
+template <> __inline__ double cuGet<double>(double x) { return double(x); }

-template <> __inline__ cuComplex cuGet<cuComplex>(float x, float y)
-{
-    return (make_cuComplex( float(x), float(y) ));
-}
+template <> __inline__ cuComplex cuGet<cuComplex>(double x) { return (make_cuComplex(float(x), 0.0f)); }

-template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x, float y)
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x)
 {
-    return (make_cuDoubleComplex( double(x), double(y) ));
+    return (make_cuDoubleComplex(double(x), 0.0));
 }

-template <typename T_ELEM> __inline__ T_ELEM cuGet (double );
-template <> __inline__ float cuGet<float>(double x)
-{
-    return float(x);
-}
+template <typename T_ELEM> __inline__ T_ELEM cuGet(double, double);
+template <> __inline__ float cuGet<float>(double x, double y) { return float(x); }

-template <> __inline__ double cuGet<double>(double x)
-{
-    return double(x);
-}
+template <> __inline__ double cuGet<double>(double x, double y) { return double(x); }

-template <> __inline__ cuComplex cuGet<cuComplex>(double x)
-{
-    return (make_cuComplex( float(x), 0.0f ));
-}
+template <> __inline__ cuComplex cuGet<cuComplex>(double x, double y) { return (make_cuComplex(float(x), float(y))); }

-template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x)
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x, double y)
 {
-    return (make_cuDoubleComplex( double(x), 0.0 ));
+    return (make_cuDoubleComplex(double(x), double(y)));
 }

-template <typename T_ELEM> __inline__ T_ELEM cuGet (double, double );
-template <> __inline__ float cuGet<float>(double x, double y)
-{
-    return float(x);
-}
-
-template <> __inline__ double cuGet<double>(double x, double y)
-{
-    return double(x);
-}
-
-template <> __inline__ cuComplex cuGet<cuComplex>(double x, double y)
-{
-    return (make_cuComplex( float(x), float(y) ));
-}
-
-template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x, double y)
-{
-    return (make_cuDoubleComplex( double(x), double(y) ));
-}
-
-
-
-
-static void compress_index(
-    const int *Ind,
-    int nnz,
-    int m,
-    int *Ptr,
-    int base)
+static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base)
 {
     int i;

     /* initialize everything to zero */
-    for(i=0; i<m+1; i++){
-        Ptr[i]=0;
-    }
+    for (i = 0; i < m + 1; i++) {
+        Ptr[i] = 0;
+    }
     /* count the number of elements in each compressed dimension */
-    for(i=0; i<nnz; i++){
-        Ptr[Ind[i]-base+1]++;
-    }
+    for (i = 0; i < nnz; i++) {
+        Ptr[Ind[i] - base + 1]++;
+    }
     /* prefix sum to build the pointer array */
-    for(i=0; i<m; i++){
-        Ptr[i+1] += Ptr[i];
-    }
+    for (i = 0; i < m; i++) {
+        Ptr[i + 1] += Ptr[i];
+    }
     /* shift according to the index base */
-    for(i=0; i<m+1; i++){
-        Ptr[i] += base;
-    }
+    for (i = 0; i < m + 1; i++) {
+        Ptr[i] += base;
+    }
 }

 struct cooFormat {
     int i;
     int j;
     int p; // permutation
 };

-int cmp_cooFormat_csr( struct cooFormat *s, struct cooFormat *t)
+int cmp_cooFormat_csr(struct cooFormat *s, struct cooFormat *t)
 {
-    if ( s->i < t->i ){
-        return -1 ;
+    if (s->i < t->i) {
+        return -1;
     }
-    else if ( s->i > t->i ){
-        return 1 ;
+    else if (s->i > t->i) {
+        return 1;
     }
-    else{
-        return s->j - t->j ;
+    else {
+        return s->j - t->j;
     }
 }

-int cmp_cooFormat_csc( struct cooFormat *s, struct cooFormat *t)
+int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t)
 {
-    if ( s->j < t->j ){
-        return -1 ;
+    if (s->j < t->j) {
+        return -1;
     }
-    else if ( s->j > t->j ){
-        return 1 ;
+    else if (s->j > t->j) {
+        return 1;
     }
-    else{
-        return s->i - t->i ;
+    else {
+        return s->i - t->i;
     }
 }

-typedef int (*FUNPTR) (const void*, const void*) ;
-typedef int (*FUNPTR2) ( struct cooFormat *s, struct cooFormat *t) ;
+typedef int (*FUNPTR)(const void *, const void *);
+typedef int (*FUNPTR2)(struct cooFormat *s, struct cooFormat *t);

-static FUNPTR2 fptr_array[2] = {
+static FUNPTR2 fptr_array[2] = {
     cmp_cooFormat_csr,
     cmp_cooFormat_csc,
 };

-static int verify_pattern(
-    int m,
-    int nnz,
-    int *csrRowPtr,
-    int *csrColInd)
+static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd)
 {
     int i, col, start, end, base_index;
     int error_found = 0;

-    if (nnz != (csrRowPtr[m] - csrRowPtr[0])){
-        fprintf(stderr, "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n", 0, csrRowPtr[0], m, csrRowPtr[m], nnz);
+    if (nnz != (csrRowPtr[m] - csrRowPtr[0])) {
+        fprintf(stderr,
+                "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n",
+                0,
+                csrRowPtr[0],
+                m,
+                csrRowPtr[m],
+                nnz);
         error_found = 1;
     }

     base_index = csrRowPtr[0];
-    if ((0 != base_index) && (1 != base_index)){
+    if ((0 != base_index) && (1 != base_index)) {
         fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index);
         error_found = 1;
     }

-    for (i=0; (!error_found) && (i<m); i++){
-        start = csrRowPtr[i] - base_index;
-        end = csrRowPtr[i+1] - base_index;
-        if (start > end){
-            fprintf(stderr, "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", i, start+base_index, i+1, end+base_index);
+    for (i = 0; (!error_found) && (i < m); i++) {
+        start = csrRowPtr[i] - base_index;
+        end   = csrRowPtr[i + 1] - base_index;
+        if (start > end) {
+            fprintf(stderr,
+                    "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n",
+                    i,
+                    start + base_index,
+                    i + 1,
+                    end + base_index);
             error_found = 1;
         }
-        for (col=start; col<end; col++){
-            if ((col < (end-1)) && (csrColInd[col] >= csrColInd[col+1])){
-                fprintf(stderr, "Error (sorting of the column indecis check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", col, csrColInd[col], col+1, csrColInd[col+1]);
+        for (col = start; col < end; col++) {
+            if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) {
+                fprintf(
+                    stderr,
+                    "Error (sorting of the column indices check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n",
+                    col,
+                    csrColInd[col],
+                    col + 1,
+                    csrColInd[col + 1]);
                 error_found = 1;
             }
         }
     }

-    return error_found ;
+    return error_found;
 }

 template <typename T_ELEM>
-int loadMMSparseMatrix(
-    char *filename,
-    char elem_type,
-    bool csrFormat,
-    int *m,
-    int *n,
-    int *nnz,
-    T_ELEM **aVal,
-    int **aRowInd,
-    int **aColInd,
-    int extendSymMatrix)
+int loadMMSparseMatrix(char *filename,
+                       char elem_type,
+                       bool csrFormat,
+                       int *m,
+                       int *n,
+                       int *nnz,
+                       T_ELEM **aVal,
+                       int **aRowInd,
+                       int **aColInd,
+                       int extendSymMatrix)
 {
-    MM_typecode matcode;
-    double *tempVal;
-    int *tempRowInd,*tempColInd;
-    double *tval;
-    int *trow,*tcol;
-    int *csrRowPtr, *cscColPtr;
-    int i,j,error,base,count;
+    MM_typecode matcode;
+    double *tempVal;
+    int *tempRowInd, *tempColInd;
+    double *tval;
+    int *trow, *tcol;
+    int *csrRowPtr, *cscColPtr;
+    int i, j, error, base, count;
     struct cooFormat *work;

-    /* read the matrix */
+    /* read the matrix */
     error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode);
     if (error) {
         fprintf(stderr, "!!!!
can not open file: '%s'\n", filename); - return 1; + return 1; } /* start error checking */ if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n"); - return 1; + return 1; } - if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/){ + if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { fprintf(stderr, "!!!! dense, array, pattern and integer matrices are not supported\n"); - return 1; + return 1; } /* if necessary symmetrize the pattern (transform from triangular to full) */ - if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))){ - //count number of non-diagonal elements - count=0; - for(i=0; i<(*nnz); i++){ - if (trow[i] != tcol[i]){ + if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))) { + // count number of non-diagonal elements + count = 0; + for (i = 0; i < (*nnz); i++) { + if (trow[i] != tcol[i]) { count++; } } - //allocate space for the symmetrized matrix - tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); - tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); - if (mm_is_real(matcode) || mm_is_integer(matcode)){ + // allocate space for the symmetrized matrix + tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); + tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { tempVal = (double *)malloc((*nnz + count) * sizeof(double)); } - else{ + else { tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); } - //copy the elements regular and transposed locations - for(j=0, i=0; i<(*nnz); i++){ - tempRowInd[j]=trow[i]; - tempColInd[j]=tcol[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - tempVal[j]=tval[i]; + // copy the elements regular and transposed locations + for (j = 0, i = 0; i < (*nnz); i++) { + tempRowInd[j] = trow[i]; + tempColInd[j] = tcol[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal[j] = tval[i]; } - else{ - tempVal[2*j] =tval[2*i]; - tempVal[2*j+1]=tval[2*i+1]; + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; } j++; - if (trow[i] != tcol[i]){ - tempRowInd[j]=tcol[i]; - tempColInd[j]=trow[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - if (mm_is_skew(matcode)){ - tempVal[j]=-tval[i]; + if (trow[i] != tcol[i]) { + tempRowInd[j] = tcol[i]; + tempColInd[j] = trow[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + if (mm_is_skew(matcode)) { + tempVal[j] = -tval[i]; } - else{ - tempVal[j]= tval[i]; + else { + tempVal[j] = tval[i]; } } - else{ - if(mm_is_hermitian(matcode)){ - tempVal[2*j] = tval[2*i]; - tempVal[2*j+1]=-tval[2*i+1]; + else { + if (mm_is_hermitian(matcode)) { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = -tval[2 * i + 1]; } - else{ - tempVal[2*j] = tval[2*i]; - tempVal[2*j+1]= tval[2*i+1]; + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; } } j++; } } - (*nnz)+=count; - //free temporary storage + (*nnz) += count; + // free temporary storage free(trow); free(tcol); - free(tval); + free(tval); } - else{ - tempRowInd=trow; - tempColInd=tcol; - tempVal =tval; + else { + tempRowInd = trow; + tempColInd = tcol; + tempVal = tval; } // life time of (trow, tcol, tval) is over. 
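The symmetrization pass above expands a triangular Matrix Market file into a full COO pattern by writing every off-diagonal entry a second time at its transposed location. Restated as a stand-alone sketch (hypothetical helpers, not code from the sample):

    /* Value stored at the mirrored position (col,row) for an
     * off-diagonal entry v at (row,col). */
    static double mirror_real(double v, int is_skew)
    {
        return is_skew ? -v : v; /* skew-symmetric negates, symmetric copies */
    }

    /* Complex values are kept as interleaved (re,im) pairs, as in the
     * tempVal array above. */
    static void mirror_complex(const double v[2], double out[2], int is_hermitian)
    {
        out[0] = v[0];                        /* real part is copied */
        out[1] = is_hermitian ? -v[1] : v[1]; /* hermitian conjugates */
    }

Diagonal entries are emitted once and never mirrored, which is why `count` above tallies only the off-diagonal entries.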
// please use COO format (tempRowInd, tempColInd, tempVal) -// use qsort to sort COO format - work = (struct cooFormat *)malloc(sizeof(struct cooFormat)*(*nnz)); - if (NULL == work){ + // use qsort to sort COO format + work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz)); + if (NULL == work) { fprintf(stderr, "!!!! allocation error, malloc failed\n"); return 1; } - for(i=0; i<(*nnz); i++){ + for (i = 0; i < (*nnz); i++) { work[i].i = tempRowInd[i]; work[i].j = tempColInd[i]; work[i].p = i; // permutation is identity } - - if (csrFormat){ - /* create row-major ordering of indices (sorted by row and within each row by column) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0] ); - }else{ - /* create column-major ordering of indices (sorted by column and within each column by row) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1] ); + if (csrFormat) { + /* create row-major ordering of indices (sorted by row and within each row by column) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]); + } + else { + /* create column-major ordering of indices (sorted by column and within each column by row) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]); } // (tempRowInd, tempColInd) is sorted either by row-major or by col-major - for(i=0; i<(*nnz); i++){ + for (i = 0; i < (*nnz); i++) { tempRowInd[i] = work[i].i; tempColInd[i] = work[i].j; } - // setup base + // setup base // check if there is any row/col 0, if so base-0 // check if there is any row/col equal to matrix dimension m/n, if so base-1 int base0 = 0; int base1 = 0; - for(i=0; i<(*nnz); i++){ + for (i = 0; i < (*nnz); i++) { const int row = tempRowInd[i]; const int col = tempColInd[i]; - if ( (0 == row) || (0 == col) ){ + if ((0 == row) || (0 == col)) { base0 = 1; } - if ( (*m == row) || (*n == col) ){ + if ((*m == row) || (*n == col)) { base1 = 1; } } - if ( base0 && base1 ){ + if (base0 && base1) { printf("Error: input matrix is base-0 and base-1 \n"); return 1; } base = 0; - if (base1){ + if (base1) { base = 1; } /* compress the appropriate indices */ - if (csrFormat){ + if (csrFormat) { /* CSR format (assuming row-major format) */ - csrRowPtr = (int *)malloc(((*m)+1) * sizeof(csrRowPtr[0])); - if (!csrRowPtr) return 1; + csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0])); + if (!csrRowPtr) + return 1; compress_index(tempRowInd, *nnz, *m, csrRowPtr, base); *aRowInd = csrRowPtr; @@ -429,101 +377,97 @@ int loadMMSparseMatrix( } else { /* CSC format (assuming column-major format) */ - cscColPtr = (int *)malloc(((*n)+1) * sizeof(cscColPtr[0])); - if (!cscColPtr) return 1; + cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0])); + if (!cscColPtr) + return 1; compress_index(tempColInd, *nnz, *n, cscColPtr, base); *aColInd = cscColPtr; *aRowInd = (int *)malloc((*nnz) * sizeof(int)); - } + } - /* transfrom the matrix values of type double into one of the cusparse library types */ + /* transfrom the matrix values of type double into one of the cusparse library types */ *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM)); - - for (i=0; i<(*nnz); i++) { - if (csrFormat){ + + for (i = 0; i < (*nnz); i++) { + if (csrFormat) { (*aColInd)[i] = tempColInd[i]; } - else{ + else { (*aRowInd)[i] = tempRowInd[i]; } - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - (*aVal)[i] = cuGet( tempVal[ work[i].p ] ); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + (*aVal)[i] = cuGet(tempVal[work[i].p]); } - else{ - (*aVal)[i] = 
cuGet(tempVal[2*work[i].p], tempVal[2*work[i].p+1]); + else { + (*aVal)[i] = cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); } } /* check for corruption */ int error_found; - if (csrFormat){ + if (csrFormat) { error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); - }else{ + } + else { error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); } - if (error_found){ + if (error_found) { fprintf(stderr, "!!!! verify_pattern failed\n"); return 1; } /* cleanup and exit */ free(work); - free(tempVal); + free(tempVal); free(tempColInd); free(tempRowInd); return 0; -} +} /* specific instantiation */ -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - float **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + float **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - double **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); - -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - cuComplex **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); - -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - cuDoubleComplex **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + double **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuDoubleComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); diff --git a/Samples/4_CUDA_Libraries/cuSolverRf/cuSolverRf.cpp b/Samples/4_CUDA_Libraries/cuSolverRf/cuSolverRf.cpp index e9759bde..8794d231 100644 --- a/Samples/4_CUDA_Libraries/cuSolverRf/cuSolverRf.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverRf/cuSolverRf.cpp @@ -58,790 +58,842 @@ #include "helper_string.h" template -int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m, - int *n, int *nnz, T_ELEM **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -void UsageRF(void) { - printf("\n"); - printf("-h : display this help\n"); - printf("-P= : choose a reordering\n"); - printf(" symrcm (Reverse Cuthill-McKee)\n"); - printf(" symamd (Approximate Minimum Degree)\n"); - printf("-file= : filename containing a matrix in MM format\n"); - printf("-device= : if want to run on specific GPU\n"); +void UsageRF(void) +{ + printf("\n"); + printf("-h : display this help\n"); + printf("-P= : choose a reordering\n"); + printf(" symrcm (Reverse Cuthill-McKee)\n"); + printf(" symamd (Approximate Minimum Degree)\n"); + printf("-file= : filename containing a matrix in MM format\n"); + printf("-device= : if want to run on specific GPU\n"); - exit(0); 
+ exit(0); } -void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) { - memset(&opts, 0, sizeof(opts)); +void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) +{ + memset(&opts, 0, sizeof(opts)); - if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { - UsageRF(); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "P")) { - char *reorderType = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "P", &reorderType); - - if (reorderType) { - if ((STRCASECMP(reorderType, "symrcm") != 0) && - (STRCASECMP(reorderType, "symamd") != 0)) { - printf("\nIncorrect argument passed to -P option\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { UsageRF(); - } else { - opts.reorder = reorderType; - } } - } - if (!opts.reorder) { - opts.reorder = "symrcm"; // Setting default reordering to be symrcm. - } + if (checkCmdLineFlag(argc, (const char **)argv, "P")) { + char *reorderType = NULL; + getCmdLineArgumentString(argc, (const char **)argv, "P", &reorderType); - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - char *fileName = 0; - getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); - - if (fileName) { - opts.sparse_mat_filename = fileName; - } else { - printf("\nIncorrect filename passed to -file \n "); - UsageRF(); + if (reorderType) { + if ((STRCASECMP(reorderType, "symrcm") != 0) && (STRCASECMP(reorderType, "symamd") != 0)) { + printf("\nIncorrect argument passed to -P option\n"); + UsageRF(); + } + else { + opts.reorder = reorderType; + } + } + } + + if (!opts.reorder) { + opts.reorder = "symrcm"; // Setting default reordering to be symrcm. + } + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + char *fileName = 0; + getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); + + if (fileName) { + opts.sparse_mat_filename = fileName; + } + else { + printf("\nIncorrect filename passed to -file \n "); + UsageRF(); + } } - } } -int main(int argc, char *argv[]) { - struct testOpts opts; - cusolverRfHandle_t cusolverRfH = NULL; // refactorization - cusolverSpHandle_t cusolverSpH = - NULL; // reordering, permutation and 1st LU factorization - cusparseHandle_t cusparseH = NULL; // residual evaluation - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix +int main(int argc, char *argv[]) +{ + struct testOpts opts; + cusolverRfHandle_t cusolverRfH = NULL; // refactorization + cusolverSpHandle_t cusolverSpH = NULL; // reordering, permutation and 1st LU factorization + cusparseHandle_t cusparseH = NULL; // residual evaluation + cudaStream_t stream = NULL; + cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix - csrluInfoHost_t info = - NULL; // opaque info structure for LU with parital pivoting + csrluInfoHost_t info = NULL; // opaque info structure for LU with parital pivoting - int rowsA = 0; // number of rows of A - int colsA = 0; // number of columns of A - int nnzA = 0; // number of nonzeros of A - int baseA = 0; // base index in CSR format - // cusolverRf only works for base-0 + int rowsA = 0; // number of rows of A + int colsA = 0; // number of columns of A + int nnzA = 0; // number of nonzeros of A + int baseA = 0; // base index in CSR format + // cusolverRf only works for base-0 - // cusolverRf only works for square matrix, - // assume n = rowsA = colsA + // cusolverRf only works for square matrix, + // assume n = rowsA = colsA - // CSR(A) from I/O - int *h_csrRowPtrA = NULL; // n+1 - int *h_csrColIndA = NULL; // 
nnzA
-  double *h_csrValA = NULL; // nnzA
+    // CSR(A) from I/O
+    int *h_csrRowPtrA = NULL; // n+1
+    int *h_csrColIndA = NULL; // nnzA
+    double *h_csrValA = NULL; // nnzA

-  int *h_Qreorder = NULL; // n
-  // reorder to reduce zero fill-in
-  // Qreorder = symrcm(A) or Qreroder = symamd(A)
-  // B = Q*A*Q^T
-  int *h_csrRowPtrB = NULL; // n+1
-  int *h_csrColIndB = NULL; // nnzA
-  double *h_csrValB = NULL; // nnzA
-  int *h_mapBfromA = NULL; // nnzA
+    int *h_Qreorder = NULL; // n
+    // reorder to reduce zero fill-in
+    // Qreorder = symrcm(A) or Qreorder = symamd(A)
+    // B = Q*A*Q^T
+    int *h_csrRowPtrB = NULL; // n+1
+    int *h_csrColIndB = NULL; // nnzA
+    double *h_csrValB = NULL; // nnzA
+    int *h_mapBfromA = NULL; // nnzA

-  double *h_x = NULL; // n, x = A \ b
-  double *h_b = NULL; // n, b = ones(m,1)
-  double *h_r = NULL; // n, r = b - A*x
+    double *h_x = NULL; // n, x = A \ b
+    double *h_b = NULL; // n, b = ones(m,1)
+    double *h_r = NULL; // n, r = b - A*x

-  // solve B*(Qx) = Q*b
-  double *h_xhat = NULL; // n, Q*x_hat = x
-  double *h_bhat = NULL; // n, b_hat = Q*b
+    // solve B*(Qx) = Q*b
+    double *h_xhat = NULL; // n, Q*x_hat = x
+    double *h_bhat = NULL; // n, b_hat = Q*b

-  size_t size_perm = 0;
-  size_t size_internal = 0;
-  size_t size_lu = 0; // size of working space for csrlu
-  void *buffer_cpu = NULL; // working space for
-                           // - permutation: B = Q*A*Q^T
-                           // - LU with partial pivoting in cusolverSp
+    size_t size_perm = 0;
+    size_t size_internal = 0;
+    size_t size_lu = 0;      // size of working space for csrlu
+    void *buffer_cpu = NULL; // working space for
+                             // - permutation: B = Q*A*Q^T
+                             // - LU with partial pivoting in cusolverSp

-  // cusolverSp computes LU with partial pivoting
-  // Plu*B*Qlu^T = L*U
-  // where B = Q*A*Q^T
-  //
-  // nnzL and nnzU are not known until factorization is done.
-  // However upper bound of L+U is known after symbolic analysis of LU.
-  int *h_Plu = NULL; // n
-  int *h_Qlu = NULL; // n
+    // cusolverSp computes LU with partial pivoting
+    // Plu*B*Qlu^T = L*U
+    // where B = Q*A*Q^T
+    //
+    // nnzL and nnzU are not known until factorization is done.
+    // However upper bound of L+U is known after symbolic analysis of LU.
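A reading aid for the declarations in this block, restating the relation the sample itself derives in step 6: the two factorizations compose as

    // B = Qreorder * A * Qreorder^T                  (fill-reducing reordering)
    // Plu * B * Qlu^T = L * U                        (LU of B with partial pivoting)
    // => (Plu * Qreorder) * A * (Qlu * Qreorder)^T = L * U
    // => with P = Plu * Qreorder and Q = Qlu * Qreorder:  P * A * Q^T = L * U

which is exactly how h_P and h_Q are assembled before being handed to cusolverRf.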
+    int *h_Plu = NULL; // n
+    int *h_Qlu = NULL; // n

-  int nnzL = 0;
-  int *h_csrRowPtrL = NULL; // n+1
-  int *h_csrColIndL = NULL; // nnzL
-  double *h_csrValL = NULL; // nnzL
+    int nnzL = 0;
+    int *h_csrRowPtrL = NULL; // n+1
+    int *h_csrColIndL = NULL; // nnzL
+    double *h_csrValL = NULL; // nnzL

-  int nnzU = 0;
-  int *h_csrRowPtrU = NULL; // n+1
-  int *h_csrColIndU = NULL; // nnzU
-  double *h_csrValU = NULL; // nnzU
+    int nnzU = 0;
+    int *h_csrRowPtrU = NULL; // n+1
+    int *h_csrColIndU = NULL; // nnzU
+    double *h_csrValU = NULL; // nnzU

-  int *h_P = NULL; // n, P = Plu * Qreorder
-  int *h_Q = NULL; // n, Q = Qlu * Qreorder
+    int *h_P = NULL; // n, P = Plu * Qreorder
+    int *h_Q = NULL; // n, Q = Qlu * Qreorder

-  int *d_csrRowPtrA = NULL; // n+1
-  int *d_csrColIndA = NULL; // nnzA
-  double *d_csrValA = NULL; // nnzA
-  double *d_x = NULL; // n, x = A \ b
-  double *d_b = NULL; // n, a copy of h_b
-  double *d_r = NULL; // n, r = b - A*x
+    int *d_csrRowPtrA = NULL; // n+1
+    int *d_csrColIndA = NULL; // nnzA
+    double *d_csrValA = NULL; // nnzA
+    double *d_x = NULL; // n, x = A \ b
+    double *d_b = NULL; // n, a copy of h_b
+    double *d_r = NULL; // n, r = b - A*x

-  int *d_P = NULL; // n, P*A*Q^T = L*U
-  int *d_Q = NULL; // n
+    int *d_P = NULL; // n, P*A*Q^T = L*U
+    int *d_Q = NULL; // n

-  double *d_T = NULL; // working space in cusolverRfSolve
-                      // |d_T| = n * nrhs
+    double *d_T = NULL; // working space in cusolverRfSolve
+                        // |d_T| = n * nrhs

-  // the constants used in residual evaluation, r = b - A*x
-  const double minus_one = -1.0;
-  const double one = 1.0;
-  // the constants used in cusolverRf
-  // nzero is the value below which zero pivot is flagged.
-  // nboost is the value which is substitured for zero pivot.
-  double nzero = 0.0;
-  double nboost = 0.0;
-  // the constant used in cusolverSp
-  // singularity is -1 if A is invertible under tol
-  // tol determines the condition of singularity
-  // pivot_threshold decides pivoting strategy
-  int singularity = 0;
-  const double tol = 1.e-14;
-  const double pivot_threshold = 1.0;
-  // the constants used in cusolverRf
-  const cusolverRfFactorization_t fact_alg =
-      CUSOLVERRF_FACTORIZATION_ALG0; // default
-  const cusolverRfTriangularSolve_t solve_alg =
-      CUSOLVERRF_TRIANGULAR_SOLVE_ALG1; // default
+    // the constants used in residual evaluation, r = b - A*x
+    const double minus_one = -1.0;
+    const double one = 1.0;
+    // the constants used in cusolverRf
+    // nzero is the value below which zero pivot is flagged.
+    // nboost is the value which is substituted for zero pivot.
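On the nzero/nboost pair described in the comment above and declared just below (a hedged note paraphrasing the cuSOLVER documentation, not this sample): during refactorization a pivot whose magnitude falls at or below nzero is flagged as a zero pivot, and nboost is substituted for it, roughly

    // if (fabs(u_ii) <= nzero) u_ii = nboost;   /* sketch of the documented behavior */

so the 0.0/0.0 defaults used here flag only exact zeros and apply no boosting.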
+ double nzero = 0.0; + double nboost = 0.0; + // the constant used in cusolverSp + // singularity is -1 if A is invertible under tol + // tol determines the condition of singularity + // pivot_threshold decides pivoting strategy + int singularity = 0; + const double tol = 1.e-14; + const double pivot_threshold = 1.0; + // the constants used in cusolverRf + const cusolverRfFactorization_t fact_alg = CUSOLVERRF_FACTORIZATION_ALG0; // default + const cusolverRfTriangularSolve_t solve_alg = CUSOLVERRF_TRIANGULAR_SOLVE_ALG1; // default - double x_inf = 0.0; // |x| - double r_inf = 0.0; // |r| - double A_inf = 0.0; // |A| - int errors = 0; + double x_inf = 0.0; // |x| + double r_inf = 0.0; // |r| + double A_inf = 0.0; // |A| + int errors = 0; - double start, stop; - double time_reorder; - double time_perm; - double time_sp_analysis; - double time_sp_factor; - double time_sp_solve; - double time_sp_extract; - double time_rf_assemble; - double time_rf_reset; - double time_rf_refactor; - double time_rf_solve; + double start, stop; + double time_reorder; + double time_perm; + double time_sp_analysis; + double time_sp_factor; + double time_sp_solve; + double time_sp_extract; + double time_rf_assemble; + double time_rf_reset; + double time_rf_refactor; + double time_rf_solve; - parseCommandLineArguments(argc, argv, opts); + parseCommandLineArguments(argc, argv, opts); - printf("step 1.1: preparation\n"); - printf("step 1.1: read matrix market format\n"); + printf("step 1.1: preparation\n"); + printf("step 1.1: read matrix market format\n"); - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - if (opts.sparse_mat_filename == NULL) { - opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]); - printf("Using default input file [%s]\n", opts.sparse_mat_filename); - } else { - printf("Using input file [%s]\n", opts.sparse_mat_filename); - } - - if (opts.sparse_mat_filename) { - if (loadMMSparseMatrix(opts.sparse_mat_filename, 'd', true, &rowsA, - &colsA, &nnzA, &h_csrValA, &h_csrRowPtrA, - &h_csrColIndA, true)) { - return 1; + if (opts.sparse_mat_filename == NULL) { + opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]); + printf("Using default input file [%s]\n", opts.sparse_mat_filename); } - baseA = h_csrRowPtrA[0]; // baseA = {0,1} - } - - if (rowsA != colsA) { - fprintf(stderr, "Error: only support square matrix\n"); - return 1; - } - - printf("WARNING: cusolverRf only works for base-0 \n"); - if (baseA) { - for (int i = 0; i <= rowsA; i++) { - h_csrRowPtrA[i]--; + else { + printf("Using input file [%s]\n", opts.sparse_mat_filename); } - for (int i = 0; i < nnzA; i++) { - h_csrColIndA[i]--; + + if (opts.sparse_mat_filename) { + if (loadMMSparseMatrix(opts.sparse_mat_filename, + 'd', + true, + &rowsA, + &colsA, + &nnzA, + &h_csrValA, + &h_csrRowPtrA, + &h_csrColIndA, + true)) { + return 1; + } + baseA = h_csrRowPtrA[0]; // baseA = {0,1} } - baseA = 0; - } - - printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, - nnzA, baseA); - - checkCudaErrors(cusolverSpCreate(&cusolverSpH)); - checkCudaErrors(cusparseCreate(&cusparseH)); - checkCudaErrors(cudaStreamCreate(&stream)); - - checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream)); - checkCudaErrors(cusparseSetStream(cusparseH, stream)); - - checkCudaErrors(cusparseCreateMatDescr(&descrA)); - checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - - if (baseA) { - checkCudaErrors(cusparseSetMatIndexBase(descrA, 
CUSPARSE_INDEX_BASE_ONE)); - } else { - checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); - } - - h_Qreorder = (int *)malloc(sizeof(int) * colsA); - - h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1)); - h_csrColIndB = (int *)malloc(sizeof(int) * nnzA); - h_csrValB = (double *)malloc(sizeof(double) * nnzA); - h_mapBfromA = (int *)malloc(sizeof(int) * nnzA); - - h_x = (double *)malloc(sizeof(double) * colsA); - h_b = (double *)malloc(sizeof(double) * rowsA); - h_r = (double *)malloc(sizeof(double) * rowsA); - h_xhat = (double *)malloc(sizeof(double) * colsA); - h_bhat = (double *)malloc(sizeof(double) * rowsA); - - assert(NULL != h_Qreorder); - - assert(NULL != h_csrRowPtrB); - assert(NULL != h_csrColIndB); - assert(NULL != h_csrValB); - assert(NULL != h_mapBfromA); - - assert(NULL != h_x); - assert(NULL != h_b); - assert(NULL != h_r); - assert(NULL != h_xhat); - assert(NULL != h_bhat); - - checkCudaErrors( - cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); - checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_P, sizeof(int) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_Q, sizeof(int) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_T, sizeof(double) * rowsA * 1)); - - printf("step 1.2: set right hand side vector (b) to 1\n"); - for (int row = 0; row < rowsA; row++) { - h_b[row] = 1.0; - } - - printf("step 2: reorder the matrix to reduce zero fill-in\n"); - printf(" Q = symrcm(A) or Q = symamd(A) \n"); - start = second(); - start = second(); - - if (0 == strcmp(opts.reorder, "symrcm")) { - checkCudaErrors(cusolverSpXcsrsymrcmHost(cusolverSpH, rowsA, nnzA, descrA, - h_csrRowPtrA, h_csrColIndA, - h_Qreorder)); - } else if (0 == strcmp(opts.reorder, "symamd")) { - checkCudaErrors(cusolverSpXcsrsymamdHost(cusolverSpH, rowsA, nnzA, descrA, - h_csrRowPtrA, h_csrColIndA, - h_Qreorder)); - } else { - fprintf(stderr, "Error: %s is unknow reordering\n", opts.reorder); - return 1; - } - - stop = second(); - time_reorder = stop - start; - - printf("step 3: B = Q*A*Q^T\n"); - memcpy(h_csrRowPtrB, h_csrRowPtrA, sizeof(int) * (rowsA + 1)); - memcpy(h_csrColIndB, h_csrColIndA, sizeof(int) * nnzA); - - start = second(); - start = second(); - - checkCudaErrors(cusolverSpXcsrperm_bufferSizeHost( - cusolverSpH, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, - h_Qreorder, h_Qreorder, &size_perm)); - - if (buffer_cpu) { - free(buffer_cpu); - } - buffer_cpu = (void *)malloc(sizeof(char) * size_perm); - assert(NULL != buffer_cpu); - - // h_mapBfromA = Identity - for (int j = 0; j < nnzA; j++) { - h_mapBfromA[j] = j; - } - checkCudaErrors(cusolverSpXcsrpermHost( - cusolverSpH, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, - h_Qreorder, h_Qreorder, h_mapBfromA, buffer_cpu)); - - // B = A( mapBfromA ) - for (int j = 0; j < nnzA; j++) { - h_csrValB[j] = h_csrValA[h_mapBfromA[j]]; - } - - stop = second(); - time_perm = stop - start; - - printf("step 4: solve A*x = b by LU(B) in cusolverSp\n"); - - printf("step 4.1: create opaque info structure\n"); - checkCudaErrors(cusolverSpCreateCsrluInfoHost(&info)); - - printf( - "step 4.2: analyze LU(B) to know structure of Q and R, and upper bound " - "for 
nnz(L+U)\n"); - start = second(); - start = second(); - - checkCudaErrors(cusolverSpXcsrluAnalysisHost( - cusolverSpH, rowsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, info)); - - stop = second(); - time_sp_analysis = stop - start; - - printf("step 4.3: workspace for LU(B)\n"); - checkCudaErrors(cusolverSpDcsrluBufferInfoHost( - cusolverSpH, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, - info, &size_internal, &size_lu)); - - if (buffer_cpu) { - free(buffer_cpu); - } - buffer_cpu = (void *)malloc(sizeof(char) * size_lu); - assert(NULL != buffer_cpu); - - printf("step 4.4: compute Ppivot*B = L*U \n"); - start = second(); - start = second(); - - checkCudaErrors(cusolverSpDcsrluFactorHost( - cusolverSpH, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, - info, pivot_threshold, buffer_cpu)); - - stop = second(); - time_sp_factor = stop - start; - - // TODO: check singularity by tol - printf("step 4.5: check if the matrix is singular \n"); - checkCudaErrors( - cusolverSpDcsrluZeroPivotHost(cusolverSpH, info, tol, &singularity)); - - if (0 <= singularity) { - fprintf(stderr, "Error: A is not invertible, singularity=%d\n", - singularity); - return 1; - } - - printf("step 4.6: solve A*x = b \n"); - printf(" i.e. solve B*(Qx) = Q*b \n"); - start = second(); - start = second(); - - // b_hat = Q*b - for (int j = 0; j < rowsA; j++) { - h_bhat[j] = h_b[h_Qreorder[j]]; - } - // B*x_hat = b_hat - checkCudaErrors(cusolverSpDcsrluSolveHost(cusolverSpH, rowsA, h_bhat, h_xhat, - info, buffer_cpu)); - - // x = Q^T * x_hat - for (int j = 0; j < rowsA; j++) { - h_x[h_Qreorder[j]] = h_xhat[j]; - } - - stop = second(); - time_sp_solve = stop - start; - - printf("step 4.7: evaluate residual r = b - A*x (result on CPU)\n"); - // use GPU gemv to compute r = b - A*x - checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, - sizeof(int) * (rowsA + 1), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, - cudaMemcpyHostToDevice)); - - checkCudaErrors( - cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice)); - - /* Wrap raw data into cuSPARSE generic API objects */ - cusparseSpMatDescr_t matA = NULL; - if (baseA) { - checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, - d_csrColIndA, d_csrValA, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)); - } else { - checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, - d_csrColIndA, d_csrValA, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); - } - - cusparseDnVecDescr_t vecx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F)); - cusparseDnVecDescr_t vecAx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F)); - - /* Allocate workspace for cuSPARSE */ - size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, - vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - - checkCudaErrors(cusparseSpMV(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, - &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors( - cudaMemcpy(h_r, d_r, 
sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); - - x_inf = vec_norminf(colsA, h_x); - r_inf = vec_norminf(rowsA, h_r); - A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, - h_csrColIndA); - - printf("(CPU) |b - A*x| = %E \n", r_inf); - printf("(CPU) |A| = %E \n", A_inf); - printf("(CPU) |x| = %E \n", x_inf); - printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); - - printf("step 5: extract P, Q, L and U from P*B*Q^T = L*U \n"); - printf(" L has implicit unit diagonal\n"); - start = second(); - start = second(); - - checkCudaErrors(cusolverSpXcsrluNnzHost(cusolverSpH, &nnzL, &nnzU, info)); - - h_Plu = (int *)malloc(sizeof(int) * rowsA); - h_Qlu = (int *)malloc(sizeof(int) * colsA); - - h_csrValL = (double *)malloc(sizeof(double) * nnzL); - h_csrRowPtrL = (int *)malloc(sizeof(int) * (rowsA + 1)); - h_csrColIndL = (int *)malloc(sizeof(int) * nnzL); - - h_csrValU = (double *)malloc(sizeof(double) * nnzU); - h_csrRowPtrU = (int *)malloc(sizeof(int) * (rowsA + 1)); - h_csrColIndU = (int *)malloc(sizeof(int) * nnzU); - - assert(NULL != h_Plu); - assert(NULL != h_Qlu); - - assert(NULL != h_csrValL); - assert(NULL != h_csrRowPtrL); - assert(NULL != h_csrColIndL); - - assert(NULL != h_csrValU); - assert(NULL != h_csrRowPtrU); - assert(NULL != h_csrColIndU); - - checkCudaErrors(cusolverSpDcsrluExtractHost( - cusolverSpH, h_Plu, h_Qlu, descrA, h_csrValL, h_csrRowPtrL, h_csrColIndL, - descrA, h_csrValU, h_csrRowPtrU, h_csrColIndU, info, buffer_cpu)); - - stop = second(); - time_sp_extract = stop - start; - - printf("nnzL = %d, nnzU = %d\n", nnzL, nnzU); - - /* B = Qreorder*A*Qreorder^T - * Plu*B*Qlu^T = L*U - * - * (Plu*Qreorder)*A*(Qlu*Qreorder)^T = L*U - * - * Let P = Plu*Qreroder, Q = Qlu*Qreorder, - * then we have - * P*A*Q^T = L*U - * which is the fundamental relation in cusolverRf. - */ - printf("step 6: form P*A*Q^T = L*U\n"); - - h_P = (int *)malloc(sizeof(int) * rowsA); - h_Q = (int *)malloc(sizeof(int) * colsA); - assert(NULL != h_P); - assert(NULL != h_Q); - - printf("step 6.1: P = Plu*Qreroder\n"); - // gather operation, P = Qreorder(Plu) - for (int j = 0; j < rowsA; j++) { - h_P[j] = h_Qreorder[h_Plu[j]]; - } - - printf("step 6.2: Q = Qlu*Qreorder \n"); - // gather operation, Q = Qreorder(Qlu) - for (int j = 0; j < colsA; j++) { - h_Q[j] = h_Qreorder[h_Qlu[j]]; - } - - printf("step 7: create cusolverRf handle\n"); - checkCudaErrors(cusolverRfCreate(&cusolverRfH)); - - printf("step 8: set parameters for cusolverRf \n"); - // numerical values for checking "zeros" and for boosting. 
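Context for steps 9 through 13 below (illustrative, not part of the patch): the payoff of cusolverRf is that the expensive host-side LU of steps 2 to 5 happens once, while refactorization and solve can be repeated cheaply for every new matrix that shares A's sparsity pattern. A hypothetical re-use loop built from the same calls this sample makes, where update_values and num_systems are placeholders for application code:

    for (int k = 0; k < num_systems; k++) {
        update_values(d_csrValA); // new numerical values, identical pattern
        checkCudaErrors(cusolverRfResetValues(rowsA, nnzA, d_csrRowPtrA, d_csrColIndA,
                                              d_csrValA, d_P, d_Q, cusolverRfH));
        checkCudaErrors(cusolverRfRefactor(cusolverRfH));
        checkCudaErrors(cusolverRfSolve(cusolverRfH, d_P, d_Q, 1, d_T, rowsA, d_x, rowsA));
    }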
- checkCudaErrors(cusolverRfSetNumericProperties(cusolverRfH, nzero, nboost)); - - // choose algorithm for refactorization and solve - checkCudaErrors(cusolverRfSetAlgs(cusolverRfH, fact_alg, solve_alg)); - - // matrix mode: L and U are CSR format, and L has implicit unit diagonal - checkCudaErrors( - cusolverRfSetMatrixFormat(cusolverRfH, CUSOLVERRF_MATRIX_FORMAT_CSR, - CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L)); - - // fast mode for matrix assembling - checkCudaErrors(cusolverRfSetResetValuesFastMode( - cusolverRfH, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON)); - - printf("step 9: assemble P*A*Q = L*U \n"); - start = second(); - start = second(); - - checkCudaErrors(cusolverRfSetupHost( - rowsA, nnzA, h_csrRowPtrA, h_csrColIndA, h_csrValA, nnzL, h_csrRowPtrL, - h_csrColIndL, h_csrValL, nnzU, h_csrRowPtrU, h_csrColIndU, h_csrValU, h_P, - h_Q, cusolverRfH)); - - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); - time_rf_assemble = stop - start; - - printf("step 10: analyze to extract parallelism \n"); - checkCudaErrors(cusolverRfAnalyze(cusolverRfH)); - - printf("step 11: import A to cusolverRf \n"); - checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, - sizeof(int) * (rowsA + 1), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, - cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_P, h_P, sizeof(int) * rowsA, cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_Q, h_Q, sizeof(int) * colsA, cudaMemcpyHostToDevice)); - - start = second(); - start = second(); - - checkCudaErrors(cusolverRfResetValues(rowsA, nnzA, d_csrRowPtrA, d_csrColIndA, - d_csrValA, d_P, d_Q, cusolverRfH)); - - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); - time_rf_reset = stop - start; - - printf("step 12: refactorization \n"); - start = second(); - start = second(); - - checkCudaErrors(cusolverRfRefactor(cusolverRfH)); - - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); - time_rf_refactor = stop - start; - - printf("step 13: solve A*x = b \n"); - checkCudaErrors( - cudaMemcpy(d_x, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - - start = second(); - start = second(); - - checkCudaErrors( - cusolverRfSolve(cusolverRfH, d_P, d_Q, 1, d_T, rowsA, d_x, rowsA)); - - checkCudaErrors(cudaDeviceSynchronize()); - stop = second(); - time_rf_solve = stop - start; - - printf("step 14: evaluate residual r = b - A*x (result on GPU)\n"); - checkCudaErrors( - cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - - checkCudaErrors(cusparseSpMV(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, - &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors( - cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); - - x_inf = vec_norminf(colsA, h_x); - r_inf = vec_norminf(rowsA, h_r); - printf("(GPU) |b - A*x| = %E \n", r_inf); - printf("(GPU) |A| = %E \n", A_inf); - printf("(GPU) |x| = %E \n", x_inf); - printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); - - printf("===== statistics \n"); - printf(" nnz(A) = %d, nnz(L+U) = %d, zero fill-in ratio = %f\n", nnzA, - nnzL + nnzU, ((double)(nnzL + nnzU)) / (double)nnzA); - printf("\n"); - printf("===== timing profile \n"); - printf(" reorder A : %f sec\n", time_reorder); - printf(" B = Q*A*Q^T : %f 
sec\n", time_perm); - printf("\n"); - printf(" cusolverSp LU analysis: %f sec\n", time_sp_analysis); - printf(" cusolverSp LU factor : %f sec\n", time_sp_factor); - printf(" cusolverSp LU solve : %f sec\n", time_sp_solve); - printf(" cusolverSp LU extract : %f sec\n", time_sp_extract); - printf("\n"); - printf(" cusolverRf assemble : %f sec\n", time_rf_assemble); - printf(" cusolverRf reset : %f sec\n", time_rf_reset); - printf(" cusolverRf refactor : %f sec\n", time_rf_refactor); - printf(" cusolverRf solve : %f sec\n", time_rf_solve); - - if (cusolverRfH) { - checkCudaErrors(cusolverRfDestroy(cusolverRfH)); - } - if (cusolverSpH) { - checkCudaErrors(cusolverSpDestroy(cusolverSpH)); - } - if (cusparseH) { - checkCudaErrors(cusparseDestroy(cusparseH)); - } - if (stream) { - checkCudaErrors(cudaStreamDestroy(stream)); - } - if (descrA) { - checkCudaErrors(cusparseDestroyMatDescr(descrA)); - } - if (info) { - checkCudaErrors(cusolverSpDestroyCsrluInfoHost(info)); - } - - if (matA) { - checkCudaErrors(cusparseDestroySpMat(matA)); - } - if (vecx) { - checkCudaErrors(cusparseDestroyDnVec(vecx)); - } - if (vecAx) { - checkCudaErrors(cusparseDestroyDnVec(vecAx)); - } - - if (h_csrValA) { - free(h_csrValA); - } - if (h_csrRowPtrA) { - free(h_csrRowPtrA); - } - if (h_csrColIndA) { - free(h_csrColIndA); - } - - if (h_Qreorder) { - free(h_Qreorder); - } - - if (h_csrRowPtrB) { - free(h_csrRowPtrB); - } - if (h_csrColIndB) { - free(h_csrColIndB); - } - if (h_csrValB) { - free(h_csrValB); - } - if (h_mapBfromA) { - free(h_mapBfromA); - } - - if (h_x) { - free(h_x); - } - if (h_b) { - free(h_b); - } - if (h_r) { - free(h_r); - } - if (h_xhat) { - free(h_xhat); - } - if (h_bhat) { - free(h_bhat); - } - - if (buffer_cpu) { - free(buffer_cpu); - } - - if (h_Plu) { - free(h_Plu); - } - if (h_Qlu) { - free(h_Qlu); - } - if (h_csrRowPtrL) { - free(h_csrRowPtrL); - } - if (h_csrColIndL) { - free(h_csrColIndL); - } - if (h_csrValL) { - free(h_csrValL); - } - if (h_csrRowPtrU) { - free(h_csrRowPtrU); - } - if (h_csrColIndU) { - free(h_csrColIndU); - } - if (h_csrValU) { - free(h_csrValU); - } - - if (h_P) { - free(h_P); - } - if (h_Q) { - free(h_Q); - } - - if (d_csrValA) { - checkCudaErrors(cudaFree(d_csrValA)); - } - if (d_csrRowPtrA) { - checkCudaErrors(cudaFree(d_csrRowPtrA)); - } - if (d_csrColIndA) { - checkCudaErrors(cudaFree(d_csrColIndA)); - } - if (d_x) { - checkCudaErrors(cudaFree(d_x)); - } - if (d_b) { - checkCudaErrors(cudaFree(d_b)); - } - if (d_r) { - checkCudaErrors(cudaFree(d_r)); - } - if (d_P) { - checkCudaErrors(cudaFree(d_P)); - } - if (d_Q) { - checkCudaErrors(cudaFree(d_Q)); - } - if (d_T) { - checkCudaErrors(cudaFree(d_T)); - } - - return 0; + + if (rowsA != colsA) { + fprintf(stderr, "Error: only support square matrix\n"); + return 1; + } + + printf("WARNING: cusolverRf only works for base-0 \n"); + if (baseA) { + for (int i = 0; i <= rowsA; i++) { + h_csrRowPtrA[i]--; + } + for (int i = 0; i < nnzA; i++) { + h_csrColIndA[i]--; + } + baseA = 0; + } + + printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA); + + checkCudaErrors(cusolverSpCreate(&cusolverSpH)); + checkCudaErrors(cusparseCreate(&cusparseH)); + checkCudaErrors(cudaStreamCreate(&stream)); + + checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream)); + checkCudaErrors(cusparseSetStream(cusparseH, stream)); + + checkCudaErrors(cusparseCreateMatDescr(&descrA)); + checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + + if (baseA) { + 
checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); + } + else { + checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + } + + h_Qreorder = (int *)malloc(sizeof(int) * colsA); + + h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1)); + h_csrColIndB = (int *)malloc(sizeof(int) * nnzA); + h_csrValB = (double *)malloc(sizeof(double) * nnzA); + h_mapBfromA = (int *)malloc(sizeof(int) * nnzA); + + h_x = (double *)malloc(sizeof(double) * colsA); + h_b = (double *)malloc(sizeof(double) * rowsA); + h_r = (double *)malloc(sizeof(double) * rowsA); + h_xhat = (double *)malloc(sizeof(double) * colsA); + h_bhat = (double *)malloc(sizeof(double) * rowsA); + + assert(NULL != h_Qreorder); + + assert(NULL != h_csrRowPtrB); + assert(NULL != h_csrColIndB); + assert(NULL != h_csrValB); + assert(NULL != h_mapBfromA); + + assert(NULL != h_x); + assert(NULL != h_b); + assert(NULL != h_r); + assert(NULL != h_xhat); + assert(NULL != h_bhat); + + checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); + checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_P, sizeof(int) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_Q, sizeof(int) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_T, sizeof(double) * rowsA * 1)); + + printf("step 1.2: set right hand side vector (b) to 1\n"); + for (int row = 0; row < rowsA; row++) { + h_b[row] = 1.0; + } + + printf("step 2: reorder the matrix to reduce zero fill-in\n"); + printf(" Q = symrcm(A) or Q = symamd(A) \n"); + start = second(); + start = second(); + + if (0 == strcmp(opts.reorder, "symrcm")) { + checkCudaErrors( + cusolverSpXcsrsymrcmHost(cusolverSpH, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Qreorder)); + } + else if (0 == strcmp(opts.reorder, "symamd")) { + checkCudaErrors( + cusolverSpXcsrsymamdHost(cusolverSpH, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Qreorder)); + } + else { + fprintf(stderr, "Error: %s is unknow reordering\n", opts.reorder); + return 1; + } + + stop = second(); + time_reorder = stop - start; + + printf("step 3: B = Q*A*Q^T\n"); + memcpy(h_csrRowPtrB, h_csrRowPtrA, sizeof(int) * (rowsA + 1)); + memcpy(h_csrColIndB, h_csrColIndA, sizeof(int) * nnzA); + + start = second(); + start = second(); + + checkCudaErrors(cusolverSpXcsrperm_bufferSizeHost( + cusolverSpH, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, h_Qreorder, h_Qreorder, &size_perm)); + + if (buffer_cpu) { + free(buffer_cpu); + } + buffer_cpu = (void *)malloc(sizeof(char) * size_perm); + assert(NULL != buffer_cpu); + + // h_mapBfromA = Identity + for (int j = 0; j < nnzA; j++) { + h_mapBfromA[j] = j; + } + checkCudaErrors(cusolverSpXcsrpermHost(cusolverSpH, + rowsA, + colsA, + nnzA, + descrA, + h_csrRowPtrB, + h_csrColIndB, + h_Qreorder, + h_Qreorder, + h_mapBfromA, + buffer_cpu)); + + // B = A( mapBfromA ) + for (int j = 0; j < nnzA; j++) { + h_csrValB[j] = h_csrValA[h_mapBfromA[j]]; + } + + stop = second(); + time_perm = stop - start; + + printf("step 4: solve A*x = b by LU(B) in cusolverSp\n"); + + printf("step 4.1: create opaque info structure\n"); + checkCudaErrors(cusolverSpCreateCsrluInfoHost(&info)); + + printf("step 4.2: analyze LU(B) to 
know structure of Q and R, and upper bound " + "for nnz(L+U)\n"); + start = second(); + start = second(); + + checkCudaErrors(cusolverSpXcsrluAnalysisHost(cusolverSpH, rowsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, info)); + + stop = second(); + time_sp_analysis = stop - start; + + printf("step 4.3: workspace for LU(B)\n"); + checkCudaErrors(cusolverSpDcsrluBufferInfoHost( + cusolverSpH, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, info, &size_internal, &size_lu)); + + if (buffer_cpu) { + free(buffer_cpu); + } + buffer_cpu = (void *)malloc(sizeof(char) * size_lu); + assert(NULL != buffer_cpu); + + printf("step 4.4: compute Ppivot*B = L*U \n"); + start = second(); + start = second(); + + checkCudaErrors(cusolverSpDcsrluFactorHost( + cusolverSpH, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, info, pivot_threshold, buffer_cpu)); + + stop = second(); + time_sp_factor = stop - start; + + // TODO: check singularity by tol + printf("step 4.5: check if the matrix is singular \n"); + checkCudaErrors(cusolverSpDcsrluZeroPivotHost(cusolverSpH, info, tol, &singularity)); + + if (0 <= singularity) { + fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity); + return 1; + } + + printf("step 4.6: solve A*x = b \n"); + printf(" i.e. solve B*(Qx) = Q*b \n"); + start = second(); + start = second(); + + // b_hat = Q*b + for (int j = 0; j < rowsA; j++) { + h_bhat[j] = h_b[h_Qreorder[j]]; + } + // B*x_hat = b_hat + checkCudaErrors(cusolverSpDcsrluSolveHost(cusolverSpH, rowsA, h_bhat, h_xhat, info, buffer_cpu)); + + // x = Q^T * x_hat + for (int j = 0; j < rowsA; j++) { + h_x[h_Qreorder[j]] = h_xhat[j]; + } + + stop = second(); + time_sp_solve = stop - start; + + printf("step 4.7: evaluate residual r = b - A*x (result on CPU)\n"); + // use GPU gemv to compute r = b - A*x + checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice)); + + /* Wrap raw data into cuSPARSE generic API objects */ + cusparseSpMatDescr_t matA = NULL; + if (baseA) { + checkCudaErrors(cusparseCreateCsr(&matA, + rowsA, + colsA, + nnzA, + d_csrRowPtrA, + d_csrColIndA, + d_csrValA, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ONE, + CUDA_R_64F)); + } + else { + checkCudaErrors(cusparseCreateCsr(&matA, + rowsA, + colsA, + nnzA, + d_csrRowPtrA, + d_csrColIndA, + d_csrValA, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + } + + cusparseDnVecDescr_t vecx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F)); + cusparseDnVecDescr_t vecAx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F)); + + /* Allocate workspace for cuSPARSE */ + size_t bufferSize = 0; + checkCudaErrors(cusparseSpMV_bufferSize(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + void *buffer = NULL; + checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + + checkCudaErrors(cusparseSpMV(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + 
CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); + + x_inf = vec_norminf(colsA, h_x); + r_inf = vec_norminf(rowsA, h_r); + A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA); + + printf("(CPU) |b - A*x| = %E \n", r_inf); + printf("(CPU) |A| = %E \n", A_inf); + printf("(CPU) |x| = %E \n", x_inf); + printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); + + printf("step 5: extract P, Q, L and U from P*B*Q^T = L*U \n"); + printf(" L has implicit unit diagonal\n"); + start = second(); + start = second(); + + checkCudaErrors(cusolverSpXcsrluNnzHost(cusolverSpH, &nnzL, &nnzU, info)); + + h_Plu = (int *)malloc(sizeof(int) * rowsA); + h_Qlu = (int *)malloc(sizeof(int) * colsA); + + h_csrValL = (double *)malloc(sizeof(double) * nnzL); + h_csrRowPtrL = (int *)malloc(sizeof(int) * (rowsA + 1)); + h_csrColIndL = (int *)malloc(sizeof(int) * nnzL); + + h_csrValU = (double *)malloc(sizeof(double) * nnzU); + h_csrRowPtrU = (int *)malloc(sizeof(int) * (rowsA + 1)); + h_csrColIndU = (int *)malloc(sizeof(int) * nnzU); + + assert(NULL != h_Plu); + assert(NULL != h_Qlu); + + assert(NULL != h_csrValL); + assert(NULL != h_csrRowPtrL); + assert(NULL != h_csrColIndL); + + assert(NULL != h_csrValU); + assert(NULL != h_csrRowPtrU); + assert(NULL != h_csrColIndU); + + checkCudaErrors(cusolverSpDcsrluExtractHost(cusolverSpH, + h_Plu, + h_Qlu, + descrA, + h_csrValL, + h_csrRowPtrL, + h_csrColIndL, + descrA, + h_csrValU, + h_csrRowPtrU, + h_csrColIndU, + info, + buffer_cpu)); + + stop = second(); + time_sp_extract = stop - start; + + printf("nnzL = %d, nnzU = %d\n", nnzL, nnzU); + + /* B = Qreorder*A*Qreorder^T + * Plu*B*Qlu^T = L*U + * + * (Plu*Qreorder)*A*(Qlu*Qreorder)^T = L*U + * + * Let P = Plu*Qreroder, Q = Qlu*Qreorder, + * then we have + * P*A*Q^T = L*U + * which is the fundamental relation in cusolverRf. + */ + printf("step 6: form P*A*Q^T = L*U\n"); + + h_P = (int *)malloc(sizeof(int) * rowsA); + h_Q = (int *)malloc(sizeof(int) * colsA); + assert(NULL != h_P); + assert(NULL != h_Q); + + printf("step 6.1: P = Plu*Qreroder\n"); + // gather operation, P = Qreorder(Plu) + for (int j = 0; j < rowsA; j++) { + h_P[j] = h_Qreorder[h_Plu[j]]; + } + + printf("step 6.2: Q = Qlu*Qreorder \n"); + // gather operation, Q = Qreorder(Qlu) + for (int j = 0; j < colsA; j++) { + h_Q[j] = h_Qreorder[h_Qlu[j]]; + } + + printf("step 7: create cusolverRf handle\n"); + checkCudaErrors(cusolverRfCreate(&cusolverRfH)); + + printf("step 8: set parameters for cusolverRf \n"); + // numerical values for checking "zeros" and for boosting. 
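A note on the residual checks in steps 4.7 and 14 (not part of the patch): r = b - A*x is obtained from a single generic SpMV, y = alpha*op(A)*x + beta*y, by preloading the result vector with b and passing alpha = -1, beta = 1, exactly as the sample does:

    checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); // r = b
    checkCudaErrors(cusparseSpMV(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &minus_one, matA, vecx, &one, vecAx, // r = b - A*x
                                 CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));

The quantity printed afterwards, |b - A*x| / (|A| * |x|), is the normwise relative residual in the infinity norm.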
+ checkCudaErrors(cusolverRfSetNumericProperties(cusolverRfH, nzero, nboost)); + + // choose algorithm for refactorization and solve + checkCudaErrors(cusolverRfSetAlgs(cusolverRfH, fact_alg, solve_alg)); + + // matrix mode: L and U are CSR format, and L has implicit unit diagonal + checkCudaErrors( + cusolverRfSetMatrixFormat(cusolverRfH, CUSOLVERRF_MATRIX_FORMAT_CSR, CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L)); + + // fast mode for matrix assembling + checkCudaErrors(cusolverRfSetResetValuesFastMode(cusolverRfH, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON)); + + printf("step 9: assemble P*A*Q = L*U \n"); + start = second(); + start = second(); + + checkCudaErrors(cusolverRfSetupHost(rowsA, + nnzA, + h_csrRowPtrA, + h_csrColIndA, + h_csrValA, + nnzL, + h_csrRowPtrL, + h_csrColIndL, + h_csrValL, + nnzU, + h_csrRowPtrU, + h_csrColIndU, + h_csrValU, + h_P, + h_Q, + cusolverRfH)); + + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); + time_rf_assemble = stop - start; + + printf("step 10: analyze to extract parallelism \n"); + checkCudaErrors(cusolverRfAnalyze(cusolverRfH)); + + printf("step 11: import A to cusolverRf \n"); + checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_P, h_P, sizeof(int) * rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Q, h_Q, sizeof(int) * colsA, cudaMemcpyHostToDevice)); + + start = second(); + start = second(); + + checkCudaErrors(cusolverRfResetValues(rowsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, d_P, d_Q, cusolverRfH)); + + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); + time_rf_reset = stop - start; + + printf("step 12: refactorization \n"); + start = second(); + start = second(); + + checkCudaErrors(cusolverRfRefactor(cusolverRfH)); + + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); + time_rf_refactor = stop - start; + + printf("step 13: solve A*x = b \n"); + checkCudaErrors(cudaMemcpy(d_x, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + + start = second(); + start = second(); + + checkCudaErrors(cusolverRfSolve(cusolverRfH, d_P, d_Q, 1, d_T, rowsA, d_x, rowsA)); + + checkCudaErrors(cudaDeviceSynchronize()); + stop = second(); + time_rf_solve = stop - start; + + printf("step 14: evaluate residual r = b - A*x (result on GPU)\n"); + checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + + checkCudaErrors(cusparseSpMV(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); + + x_inf = vec_norminf(colsA, h_x); + r_inf = vec_norminf(rowsA, h_r); + printf("(GPU) |b - A*x| = %E \n", r_inf); + printf("(GPU) |A| = %E \n", A_inf); + printf("(GPU) |x| = %E \n", x_inf); + printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); + + printf("===== statistics \n"); + printf(" nnz(A) = %d, nnz(L+U) = %d, zero fill-in ratio = %f\n", + nnzA, + nnzL + nnzU, + ((double)(nnzL + nnzU)) / (double)nnzA); + printf("\n"); + printf("===== timing profile \n"); + printf(" reorder A : %f sec\n", time_reorder); + printf(" B = Q*A*Q^T : 
%f sec\n", time_perm); + printf("\n"); + printf(" cusolverSp LU analysis: %f sec\n", time_sp_analysis); + printf(" cusolverSp LU factor : %f sec\n", time_sp_factor); + printf(" cusolverSp LU solve : %f sec\n", time_sp_solve); + printf(" cusolverSp LU extract : %f sec\n", time_sp_extract); + printf("\n"); + printf(" cusolverRf assemble : %f sec\n", time_rf_assemble); + printf(" cusolverRf reset : %f sec\n", time_rf_reset); + printf(" cusolverRf refactor : %f sec\n", time_rf_refactor); + printf(" cusolverRf solve : %f sec\n", time_rf_solve); + + if (cusolverRfH) { + checkCudaErrors(cusolverRfDestroy(cusolverRfH)); + } + if (cusolverSpH) { + checkCudaErrors(cusolverSpDestroy(cusolverSpH)); + } + if (cusparseH) { + checkCudaErrors(cusparseDestroy(cusparseH)); + } + if (stream) { + checkCudaErrors(cudaStreamDestroy(stream)); + } + if (descrA) { + checkCudaErrors(cusparseDestroyMatDescr(descrA)); + } + if (info) { + checkCudaErrors(cusolverSpDestroyCsrluInfoHost(info)); + } + + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + + if (h_csrValA) { + free(h_csrValA); + } + if (h_csrRowPtrA) { + free(h_csrRowPtrA); + } + if (h_csrColIndA) { + free(h_csrColIndA); + } + + if (h_Qreorder) { + free(h_Qreorder); + } + + if (h_csrRowPtrB) { + free(h_csrRowPtrB); + } + if (h_csrColIndB) { + free(h_csrColIndB); + } + if (h_csrValB) { + free(h_csrValB); + } + if (h_mapBfromA) { + free(h_mapBfromA); + } + + if (h_x) { + free(h_x); + } + if (h_b) { + free(h_b); + } + if (h_r) { + free(h_r); + } + if (h_xhat) { + free(h_xhat); + } + if (h_bhat) { + free(h_bhat); + } + + if (buffer_cpu) { + free(buffer_cpu); + } + + if (h_Plu) { + free(h_Plu); + } + if (h_Qlu) { + free(h_Qlu); + } + if (h_csrRowPtrL) { + free(h_csrRowPtrL); + } + if (h_csrColIndL) { + free(h_csrColIndL); + } + if (h_csrValL) { + free(h_csrValL); + } + if (h_csrRowPtrU) { + free(h_csrRowPtrU); + } + if (h_csrColIndU) { + free(h_csrColIndU); + } + if (h_csrValU) { + free(h_csrValU); + } + + if (h_P) { + free(h_P); + } + if (h_Q) { + free(h_Q); + } + + if (d_csrValA) { + checkCudaErrors(cudaFree(d_csrValA)); + } + if (d_csrRowPtrA) { + checkCudaErrors(cudaFree(d_csrRowPtrA)); + } + if (d_csrColIndA) { + checkCudaErrors(cudaFree(d_csrColIndA)); + } + if (d_x) { + checkCudaErrors(cudaFree(d_x)); + } + if (d_b) { + checkCudaErrors(cudaFree(d_b)); + } + if (d_r) { + checkCudaErrors(cudaFree(d_r)); + } + if (d_P) { + checkCudaErrors(cudaFree(d_P)); + } + if (d_Q) { + checkCudaErrors(cudaFree(d_Q)); + } + if (d_T) { + checkCudaErrors(cudaFree(d_T)); + } + + return 0; } diff --git a/Samples/4_CUDA_Libraries/cuSolverRf/mmio.c b/Samples/4_CUDA_Libraries/cuSolverRf/mmio.c index 8d710fd2..de5528e8 100644 --- a/Samples/4_CUDA_Libraries/cuSolverRf/mmio.c +++ b/Samples/4_CUDA_Libraries/cuSolverRf/mmio.c @@ -1,112 +1,107 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. -* -* -*/ +/* + * Matrix Market I/O library for ANSI C + * + * See http://math.nist.gov/MatrixMarket for details. + * + * + */ /* avoid Windows warnings (for example: strcpy, fscanf, etc.) 
*/ -#if defined(_WIN32) +#if defined(_WIN32) #define _CRT_SECURE_NO_WARNINGS #endif -#include -#include -#include -#include - #include "mmio.h" -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_) +#include +#include +#include +#include + +int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_) { - FILE *f; + FILE *f; MM_typecode matcode; - int M, N, nz; - int i; - double *val; - int *I, *J; - + int M, N, nz; + int i; + double *val; + int *I, *J; + if ((f = fopen(fname, "r")) == NULL) - return -1; - - - if (mm_read_banner(f, &matcode) != 0) - { + return -1; + + + if (mm_read_banner(f, &matcode) != 0) { printf("mm_read_unsymetric: Could not process Matrix Market banner "); printf(" in file [%s]\n", fname); return -1; } - - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { + + + if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) { fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); + fprintf(stderr, "Market Market type: [%s]\n", mm_typecode_to_str(matcode)); return -1; } - + /* find out size of sparse matrix: M, N, nz .... */ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) { fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); return -1; } - - *M_ = M; - *N_ = N; + + *M_ = M; + *N_ = N; *nz_ = nz; - + /* reserve memory for matrices */ - - I = (int *) malloc(nz * sizeof(int)); - J = (int *) malloc(nz * sizeof(int)); - val = (double *) malloc(nz * sizeof(double)); - + + I = (int *)malloc(nz * sizeof(int)); + J = (int *)malloc(nz * sizeof(int)); + val = (double *)malloc(nz * sizeof(double)); + *val_ = val; - *I_ = I; - *J_ = J; - + *I_ = I; + *J_ = J; + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ - - for (i=0; i + #if defined(__cplusplus) -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -#define MM_MAX_LINE_LENGTH 1025 -#define MatrixMarketBanner "%%MatrixMarket" +#define MM_MAX_LINE_LENGTH 1025 +#define MatrixMarketBanner "%%MatrixMarket" #define MM_MAX_TOKEN_LENGTH 64 -typedef char MM_typecode[4]; + typedef char MM_typecode[4]; -char *mm_typecode_to_str(MM_typecode matcode); + char *mm_typecode_to_str(MM_typecode matcode); -int mm_read_banner(FILE *f, MM_typecode *matcode); -int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); -int mm_read_mtx_array_size(FILE *f, int *M, int *N); + int mm_read_banner(FILE *f, MM_typecode *matcode); + int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); + int mm_read_mtx_array_size(FILE *f, int *M, int *N); -int mm_write_banner(FILE *f, MM_typecode matcode); -int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); -int mm_write_mtx_array_size(FILE *f, int M, int N); + int mm_write_banner(FILE *f, MM_typecode matcode); + int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); + int mm_write_mtx_array_size(FILE *f, int M, int N); -/********************* MM_typecode query fucntions ***************************/ + /********************* MM_typecode query fucntions ***************************/ -#define mm_is_matrix(typecode) ((typecode)[0]=='M') +#define mm_is_matrix(typecode) ((typecode)[0] == 'M') -#define mm_is_sparse(typecode) ((typecode)[1]=='C') -#define mm_is_coordinate(typecode)((typecode)[1]=='C') -#define mm_is_dense(typecode) ((typecode)[1]=='A') -#define mm_is_array(typecode) ((typecode)[1]=='A') +#define mm_is_sparse(typecode) ((typecode)[1] == 'C') +#define mm_is_coordinate(typecode) ((typecode)[1] == 'C') +#define mm_is_dense(typecode) ((typecode)[1] == 'A') +#define mm_is_array(typecode) ((typecode)[1] == 'A') -#define mm_is_complex(typecode) ((typecode)[2]=='C') -#define mm_is_real(typecode) ((typecode)[2]=='R') -#define mm_is_pattern(typecode) ((typecode)[2]=='P') -#define mm_is_integer(typecode) ((typecode)[2]=='I') +#define mm_is_complex(typecode) ((typecode)[2] == 'C') +#define mm_is_real(typecode) ((typecode)[2] == 'R') +#define mm_is_pattern(typecode) ((typecode)[2] == 'P') +#define mm_is_integer(typecode) ((typecode)[2] == 'I') -#define mm_is_symmetric(typecode)((typecode)[3]=='S') -#define mm_is_general(typecode) ((typecode)[3]=='G') -#define mm_is_skew(typecode) ((typecode)[3]=='K') -#define mm_is_hermitian(typecode)((typecode)[3]=='H') +#define mm_is_symmetric(typecode) ((typecode)[3] == 'S') +#define mm_is_general(typecode) ((typecode)[3] == 'G') +#define mm_is_skew(typecode) ((typecode)[3] == 'K') +#define mm_is_hermitian(typecode) ((typecode)[3] == 'H') -int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ + int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ -/********************* MM_typecode modify fucntions ***************************/ + /********************* MM_typecode modify fucntions ***************************/ -#define mm_set_matrix(typecode) ((*typecode)[0]='M') -#define mm_set_coordinate(typecode) ((*typecode)[1]='C') -#define mm_set_array(typecode) ((*typecode)[1]='A') -#define mm_set_dense(typecode) mm_set_array(typecode) -#define mm_set_sparse(typecode) mm_set_coordinate(typecode) +#define mm_set_matrix(typecode) ((*typecode)[0] = 'M') +#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C') +#define mm_set_array(typecode) ((*typecode)[1] = 'A') +#define mm_set_dense(typecode) mm_set_array(typecode) +#define mm_set_sparse(typecode) 
mm_set_coordinate(typecode) -#define mm_set_complex(typecode)((*typecode)[2]='C') -#define mm_set_real(typecode) ((*typecode)[2]='R') -#define mm_set_pattern(typecode)((*typecode)[2]='P') -#define mm_set_integer(typecode)((*typecode)[2]='I') +#define mm_set_complex(typecode) ((*typecode)[2] = 'C') +#define mm_set_real(typecode) ((*typecode)[2] = 'R') +#define mm_set_pattern(typecode) ((*typecode)[2] = 'P') +#define mm_set_integer(typecode) ((*typecode)[2] = 'I') -#define mm_set_symmetric(typecode)((*typecode)[3]='S') -#define mm_set_general(typecode)((*typecode)[3]='G') -#define mm_set_skew(typecode) ((*typecode)[3]='K') -#define mm_set_hermitian(typecode)((*typecode)[3]='H') +#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') +#define mm_set_general(typecode) ((*typecode)[3] = 'G') +#define mm_set_skew(typecode) ((*typecode)[3] = 'K') +#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') -#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \ - (*typecode)[2]=' ',(*typecode)[3]='G') +#define mm_clear_typecode(typecode) ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) -/********************* Matrix Market error codes ***************************/ + /********************* Matrix Market error codes ***************************/ -#define MM_COULD_NOT_READ_FILE 11 -#define MM_PREMATURE_EOF 12 -#define MM_NOT_MTX 13 -#define MM_NO_HEADER 14 -#define MM_UNSUPPORTED_TYPE 15 -#define MM_LINE_TOO_LONG 16 -#define MM_COULD_NOT_WRITE_FILE 17 +#define MM_COULD_NOT_READ_FILE 11 +#define MM_PREMATURE_EOF 12 +#define MM_NOT_MTX 13 +#define MM_NO_HEADER 14 +#define MM_UNSUPPORTED_TYPE 15 +#define MM_LINE_TOO_LONG 16 +#define MM_COULD_NOT_WRITE_FILE 17 -/******************** Matrix Market internal definitions ******************** + /******************** Matrix Market internal definitions ******************** - MM_matrix_typecode: 4-character sequence + MM_matrix_typecode: 4-character sequence - ojbect sparse/ data storage - dense type scheme + ojbect sparse/ data storage + dense type scheme - string position: [0] [1] [2] [3] + string position: [0] [1] [2] [3] - Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) - A(array) C(omplex) H(ermitian) - P(attern) S(ymmetric) - I(nteger) K(kew) + Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) + A(array) C(omplex) H(ermitian) + P(attern) S(ymmetric) + I(nteger) K(kew) - ***********************************************************************/ + ***********************************************************************/ -#define MM_MTX_STR "matrix" -#define MM_ARRAY_STR "array" -#define MM_DENSE_STR "array" -#define MM_COORDINATE_STR "coordinate" -#define MM_SPARSE_STR "coordinate" -#define MM_COMPLEX_STR "complex" -#define MM_REAL_STR "real" -#define MM_INT_STR "integer" -#define MM_GENERAL_STR "general" -#define MM_SYMM_STR "symmetric" -#define MM_HERM_STR "hermitian" -#define MM_SKEW_STR "skew-symmetric" -#define MM_PATTERN_STR "pattern" +#define MM_MTX_STR "matrix" +#define MM_ARRAY_STR "array" +#define MM_DENSE_STR "array" +#define MM_COORDINATE_STR "coordinate" +#define MM_SPARSE_STR "coordinate" +#define MM_COMPLEX_STR "complex" +#define MM_REAL_STR "real" +#define MM_INT_STR "integer" +#define MM_GENERAL_STR "general" +#define MM_SYMM_STR "symmetric" +#define MM_HERM_STR "hermitian" +#define MM_SKEW_STR "skew-symmetric" +#define MM_PATTERN_STR "pattern" -/* high level routines */ -int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, 
int **I, int **J, - double **val, MM_typecode *matcode); + /* high level routines */ + int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, double **val, MM_typecode *matcode); -int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, - MM_typecode matcode); + int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, MM_typecode matcode); -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_); + int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_); #if defined(__cplusplus) } -#endif /* __cplusplus */ +#endif /* __cplusplus */ #endif diff --git a/Samples/4_CUDA_Libraries/cuSolverRf/mmio_wrapper.cpp b/Samples/4_CUDA_Libraries/cuSolverRf/mmio_wrapper.cpp index 04e680a9..3f798d53 100644 --- a/Samples/4_CUDA_Libraries/cuSolverRf/mmio_wrapper.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverRf/mmio_wrapper.cpp @@ -38,180 +38,125 @@ #endif /* various __inline__ __device__ function to initialize a T_ELEM */ -template -__inline__ T_ELEM cuGet(int); -template <> -__inline__ float cuGet(int x) { - return float(x); +template __inline__ T_ELEM cuGet(int); +template <> __inline__ float cuGet(int x) { return float(x); } + +template <> __inline__ double cuGet(int x) { return double(x); } + +template <> __inline__ cuComplex cuGet(int x) { return (make_cuComplex(float(x), 0.0f)); } + +template <> __inline__ cuDoubleComplex cuGet(int x) { return (make_cuDoubleComplex(double(x), 0.0)); } + +template __inline__ T_ELEM cuGet(int, int); +template <> __inline__ float cuGet(int x, int y) { return float(x); } + +template <> __inline__ double cuGet(int x, int y) { return double(x); } + +template <> __inline__ cuComplex cuGet(int x, int y) { return make_cuComplex(float(x), float(y)); } + +template <> __inline__ cuDoubleComplex cuGet(int x, int y) +{ + return (make_cuDoubleComplex(double(x), double(y))); } -template <> -__inline__ double cuGet(int x) { - return double(x); +template __inline__ T_ELEM cuGet(float); +template <> __inline__ float cuGet(float x) { return float(x); } + +template <> __inline__ double cuGet(float x) { return double(x); } + +template <> __inline__ cuComplex cuGet(float x) { return (make_cuComplex(float(x), 0.0f)); } + +template <> __inline__ cuDoubleComplex cuGet(float x) +{ + return (make_cuDoubleComplex(double(x), 0.0)); } -template <> -__inline__ cuComplex cuGet(int x) { - return (make_cuComplex(float(x), 0.0f)); +template __inline__ T_ELEM cuGet(float, float); +template <> __inline__ float cuGet(float x, float y) { return float(x); } + +template <> __inline__ double cuGet(float x, float y) { return double(x); } + +template <> __inline__ cuComplex cuGet(float x, float y) { return (make_cuComplex(float(x), float(y))); } + +template <> __inline__ cuDoubleComplex cuGet(float x, float y) +{ + return (make_cuDoubleComplex(double(x), double(y))); } -template <> -__inline__ cuDoubleComplex cuGet(int x) { - return (make_cuDoubleComplex(double(x), 0.0)); +template __inline__ 
T_ELEM cuGet(double); +template <> __inline__ float cuGet(double x) { return float(x); } + +template <> __inline__ double cuGet(double x) { return double(x); } + +template <> __inline__ cuComplex cuGet(double x) { return (make_cuComplex(float(x), 0.0f)); } + +template <> __inline__ cuDoubleComplex cuGet(double x) +{ + return (make_cuDoubleComplex(double(x), 0.0)); } -template -__inline__ T_ELEM cuGet(int, int); -template <> -__inline__ float cuGet(int x, int y) { - return float(x); +template __inline__ T_ELEM cuGet(double, double); +template <> __inline__ float cuGet(double x, double y) { return float(x); } + +template <> __inline__ double cuGet(double x, double y) { return double(x); } + +template <> __inline__ cuComplex cuGet(double x, double y) { return (make_cuComplex(float(x), float(y))); } + +template <> __inline__ cuDoubleComplex cuGet(double x, double y) +{ + return (make_cuDoubleComplex(double(x), double(y))); } -template <> -__inline__ double cuGet(int x, int y) { - return double(x); +static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base) +{ + int i; + + /* initialize everything to zero */ + for (i = 0; i < m + 1; i++) { + Ptr[i] = 0; + } + /* count elements in every row */ + Ptr[0] = base; + for (i = 0; i < nnz; i++) { + Ptr[Ind[i] + (1 - base)]++; + } + /* add all the values */ + for (i = 0; i < m; i++) { + Ptr[i + 1] += Ptr[i]; + } } -template <> -__inline__ cuComplex cuGet(int x, int y) { - return make_cuComplex(float(x), float(y)); -} - -template <> -__inline__ cuDoubleComplex cuGet(int x, int y) { - return (make_cuDoubleComplex(double(x), double(y))); -} - -template -__inline__ T_ELEM cuGet(float); -template <> -__inline__ float cuGet(float x) { - return float(x); -} - -template <> -__inline__ double cuGet(float x) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(float x) { - return (make_cuComplex(float(x), 0.0f)); -} - -template <> -__inline__ cuDoubleComplex cuGet(float x) { - return (make_cuDoubleComplex(double(x), 0.0)); -} - -template -__inline__ T_ELEM cuGet(float, float); -template <> -__inline__ float cuGet(float x, float y) { - return float(x); -} - -template <> -__inline__ double cuGet(float x, float y) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(float x, float y) { - return (make_cuComplex(float(x), float(y))); -} - -template <> -__inline__ cuDoubleComplex cuGet(float x, float y) { - return (make_cuDoubleComplex(double(x), double(y))); -} - -template -__inline__ T_ELEM cuGet(double); -template <> -__inline__ float cuGet(double x) { - return float(x); -} - -template <> -__inline__ double cuGet(double x) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(double x) { - return (make_cuComplex(float(x), 0.0f)); -} - -template <> -__inline__ cuDoubleComplex cuGet(double x) { - return (make_cuDoubleComplex(double(x), 0.0)); -} - -template -__inline__ T_ELEM cuGet(double, double); -template <> -__inline__ float cuGet(double x, double y) { - return float(x); -} - -template <> -__inline__ double cuGet(double x, double y) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(double x, double y) { - return (make_cuComplex(float(x), float(y))); -} - -template <> -__inline__ cuDoubleComplex cuGet(double x, double y) { - return (make_cuDoubleComplex(double(x), double(y))); -} - -static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base) { - int i; - - /* initialize everything to zero */ - for (i = 0; i < m + 1; i++) { - Ptr[i] = 0; - } - /* count elements in 
every row */ - Ptr[0] = base; - for (i = 0; i < nnz; i++) { - Ptr[Ind[i] + (1 - base)]++; - } - /* add all the values */ - for (i = 0; i < m; i++) { - Ptr[i + 1] += Ptr[i]; - } -} - -struct cooFormat { - int i; - int j; - int p; // permutation +struct cooFormat +{ + int i; + int j; + int p; // permutation }; -int cmp_cooFormat_csr(struct cooFormat *s, struct cooFormat *t) { - if (s->i < t->i) { - return -1; - } else if (s->i > t->i) { - return 1; - } else { - return s->j - t->j; - } +int cmp_cooFormat_csr(struct cooFormat *s, struct cooFormat *t) +{ + if (s->i < t->i) { + return -1; + } + else if (s->i > t->i) { + return 1; + } + else { + return s->j - t->j; + } } -int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t) { - if (s->j < t->j) { - return -1; - } else if (s->j > t->j) { - return 1; - } else { - return s->i - t->i; - } +int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t) +{ + if (s->j < t->j) { + return -1; + } + else if (s->j > t->j) { + return 1; + } + else { + return s->i - t->i; + } } typedef int (*FUNPTR)(const void *, const void *); @@ -222,280 +167,326 @@ static FUNPTR2 fptr_array[2] = { cmp_cooFormat_csc, }; -static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd) { - int i, col, start, end, base_index; - int error_found = 0; +static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd) +{ + int i, col, start, end, base_index; + int error_found = 0; - if (nnz != (csrRowPtr[m] - csrRowPtr[0])) { - fprintf(stderr, - "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) " - "!= (nnz=%d)\n", - 0, csrRowPtr[0], m, csrRowPtr[m], nnz); - error_found = 1; - } - - base_index = csrRowPtr[0]; - if ((0 != base_index) && (1 != base_index)) { - fprintf(stderr, "Error (base index check failed): base index = %d\n", - base_index); - error_found = 1; - } - - for (i = 0; (!error_found) && (i < m); i++) { - start = csrRowPtr[i] - base_index; - end = csrRowPtr[i + 1] - base_index; - if (start > end) { - fprintf( - stderr, - "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", - i, start + base_index, i + 1, end + base_index); - error_found = 1; - } - for (col = start; col < end; col++) { - if (csrColInd[col] < base_index) { - fprintf( - stderr, - "Error (column vs. base index check failed): csrColInd[%d] < %d\n", - col, base_index); - error_found = 1; - } - if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) { + if (nnz != (csrRowPtr[m] - csrRowPtr[0])) { fprintf(stderr, - "Error (sorting of the column indecis check failed): " - "(csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", - col, csrColInd[col], col + 1, csrColInd[col + 1]); + "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) " + "!= (nnz=%d)\n", + 0, + csrRowPtr[0], + m, + csrRowPtr[m], + nnz); error_found = 1; - } } - } - return error_found; + + base_index = csrRowPtr[0]; + if ((0 != base_index) && (1 != base_index)) { + fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index); + error_found = 1; + } + + for (i = 0; (!error_found) && (i < m); i++) { + start = csrRowPtr[i] - base_index; + end = csrRowPtr[i + 1] - base_index; + if (start > end) { + fprintf(stderr, + "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", + i, + start + base_index, + i + 1, + end + base_index); + error_found = 1; + } + for (col = start; col < end; col++) { + if (csrColInd[col] < base_index) { + fprintf(stderr, "Error (column vs. 
base index check failed): csrColInd[%d] < %d\n", col, base_index); + error_found = 1; + } + if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) { + fprintf(stderr, + "Error (sorting of the column indecis check failed): " + "(csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", + col, + csrColInd[col], + col + 1, + csrColInd[col + 1]); + error_found = 1; + } + } + } + return error_found; } template -int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m, - int *n, int *nnz, T_ELEM **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix) { - MM_typecode matcode; - double *tempVal; - int *tempRowInd, *tempColInd; - double *tval; - int *trow, *tcol; - int *csrRowPtr, *cscColPtr; - int i, j, error, base, count; - struct cooFormat *work; +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix) +{ + MM_typecode matcode; + double *tempVal; + int *tempRowInd, *tempColInd; + double *tval; + int *trow, *tcol; + int *csrRowPtr, *cscColPtr; + int i, j, error, base, count; + struct cooFormat *work; - /* read the matrix */ - error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode); - if (error) { - fprintf(stderr, "!!!! can not open file: '%s'\n", filename); - return 1; - } - - /* start error checking */ - if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { - fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n"); - return 1; - } - - if (mm_is_dense(matcode) || mm_is_array(matcode) || - mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { - fprintf( - stderr, - "!!!! dense, array, pattern and integer matrices are not supported\n"); - return 1; - } - - /* if necessary symmetrize the pattern (transform from triangular to full) */ - if ((extendSymMatrix) && (mm_is_symmetric(matcode) || - mm_is_hermitian(matcode) || mm_is_skew(matcode))) { - // count number of non-diagonal elements - count = 0; - for (i = 0; i < (*nnz); i++) { - if (trow[i] != tcol[i]) { - count++; - } + /* read the matrix */ + error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode); + if (error) { + fprintf(stderr, "!!!! can not open file: '%s'\n", filename); + return 1; } - // allocate space for the symmetrized matrix - tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); - tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - tempVal = (double *)malloc((*nnz + count) * sizeof(double)); - } else { - tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); + + /* start error checking */ + if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { + fprintf(stderr, "!!!! 
complex matrix requires type 'z' or 'c'\n"); + return 1; } - // copy the elements regular and transposed locations - for (j = 0, i = 0; i < (*nnz); i++) { - tempRowInd[j] = trow[i]; - tempColInd[j] = tcol[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - tempVal[j] = tval[i]; - } else { - tempVal[2 * j] = tval[2 * i]; - tempVal[2 * j + 1] = tval[2 * i + 1]; - } - j++; - if (trow[i] != tcol[i]) { - tempRowInd[j] = tcol[i]; - tempColInd[j] = trow[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - if (mm_is_skew(matcode)) { - tempVal[j] = -tval[i]; - } else { - tempVal[j] = tval[i]; - } - } else { - if (mm_is_hermitian(matcode)) { - tempVal[2 * j] = tval[2 * i]; - tempVal[2 * j + 1] = -tval[2 * i + 1]; - } else { - tempVal[2 * j] = tval[2 * i]; - tempVal[2 * j + 1] = tval[2 * i + 1]; - } + + if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { + fprintf(stderr, "!!!! dense, array, pattern and integer matrices are not supported\n"); + return 1; + } + + /* if necessary symmetrize the pattern (transform from triangular to full) */ + if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))) { + // count number of non-diagonal elements + count = 0; + for (i = 0; i < (*nnz); i++) { + if (trow[i] != tcol[i]) { + count++; + } } - j++; - } + // allocate space for the symmetrized matrix + tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); + tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal = (double *)malloc((*nnz + count) * sizeof(double)); + } + else { + tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); + } + // copy the elements regular and transposed locations + for (j = 0, i = 0; i < (*nnz); i++) { + tempRowInd[j] = trow[i]; + tempColInd[j] = tcol[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal[j] = tval[i]; + } + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; + } + j++; + if (trow[i] != tcol[i]) { + tempRowInd[j] = tcol[i]; + tempColInd[j] = trow[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + if (mm_is_skew(matcode)) { + tempVal[j] = -tval[i]; + } + else { + tempVal[j] = tval[i]; + } + } + else { + if (mm_is_hermitian(matcode)) { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = -tval[2 * i + 1]; + } + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; + } + } + j++; + } + } + (*nnz) += count; + // free temporary storage + free(trow); + free(tcol); + free(tval); } - (*nnz) += count; - // free temporary storage - free(trow); - free(tcol); - free(tval); - } else { - tempRowInd = trow; - tempColInd = tcol; - tempVal = tval; - } - // life time of (trow, tcol, tval) is over. - // please use COO format (tempRowInd, tempColInd, tempVal) - - // use qsort to sort COO format - work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz)); - if (NULL == work) { - fprintf(stderr, "!!!! 
allocation error, malloc failed\n"); - return 1; - } - for (i = 0; i < (*nnz); i++) { - work[i].i = tempRowInd[i]; - work[i].j = tempColInd[i]; - work[i].p = i; // permutation is identity - } - - if (csrFormat) { - /* create row-major ordering of indices (sorted by row and within each row - * by column) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]); - } else { - /* create column-major ordering of indices (sorted by column and within each - * column by row) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]); - } - - // (tempRowInd, tempColInd) is sorted either by row-major or by col-major - for (i = 0; i < (*nnz); i++) { - tempRowInd[i] = work[i].i; - tempColInd[i] = work[i].j; - } - - // setup base - // check if there is any row/col 0, if so base-0 - // check if there is any row/col equal to matrix dimension m/n, if so base-1 - int base0 = 0; - int base1 = 0; - for (i = 0; i < (*nnz); i++) { - const int row = tempRowInd[i]; - const int col = tempColInd[i]; - if ((0 == row) || (0 == col)) { - base0 = 1; + else { + tempRowInd = trow; + tempColInd = tcol; + tempVal = tval; } - if ((*m == row) || (*n == col)) { - base1 = 1; + // life time of (trow, tcol, tval) is over. + // please use COO format (tempRowInd, tempColInd, tempVal) + + // use qsort to sort COO format + work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz)); + if (NULL == work) { + fprintf(stderr, "!!!! allocation error, malloc failed\n"); + return 1; + } + for (i = 0; i < (*nnz); i++) { + work[i].i = tempRowInd[i]; + work[i].j = tempColInd[i]; + work[i].p = i; // permutation is identity } - } - if (base0 && base1) { - printf("Error: input matrix is base-0 and base-1 \n"); - return 1; - } - base = 0; - if (base1) { - base = 1; - } - - /* compress the appropriate indices */ - if (csrFormat) { - /* CSR format (assuming row-major format) */ - csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0])); - if (!csrRowPtr) return 1; - compress_index(tempRowInd, *nnz, *m, csrRowPtr, base); - - *aRowInd = csrRowPtr; - *aColInd = (int *)malloc((*nnz) * sizeof(int)); - } else { - /* CSC format (assuming column-major format) */ - cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0])); - if (!cscColPtr) return 1; - compress_index(tempColInd, *nnz, *n, cscColPtr, base); - - *aColInd = cscColPtr; - *aRowInd = (int *)malloc((*nnz) * sizeof(int)); - } - - /* transfrom the matrix values of type double into one of the cusparse library - * types */ - *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM)); - - for (i = 0; i < (*nnz); i++) { if (csrFormat) { - (*aColInd)[i] = tempColInd[i]; - } else { - (*aRowInd)[i] = tempRowInd[i]; + /* create row-major ordering of indices (sorted by row and within each row + * by column) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]); } - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - (*aVal)[i] = cuGet(tempVal[work[i].p]); - } else { - (*aVal)[i] = - cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); + else { + /* create column-major ordering of indices (sorted by column and within each + * column by row) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]); } - } - /* check for corruption */ - int error_found; - if (csrFormat) { - error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); - } else { - error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); - } - if (error_found) { - fprintf(stderr, "!!!! 
verify_pattern failed\n"); - return 1; - } + // (tempRowInd, tempColInd) is sorted either by row-major or by col-major + for (i = 0; i < (*nnz); i++) { + tempRowInd[i] = work[i].i; + tempColInd[i] = work[i].j; + } - /* cleanup and exit */ - free(work); - free(tempVal); - free(tempColInd); - free(tempRowInd); + // setup base + // check if there is any row/col 0, if so base-0 + // check if there is any row/col equal to matrix dimension m/n, if so base-1 + int base0 = 0; + int base1 = 0; + for (i = 0; i < (*nnz); i++) { + const int row = tempRowInd[i]; + const int col = tempColInd[i]; + if ((0 == row) || (0 == col)) { + base0 = 1; + } + if ((*m == row) || (*n == col)) { + base1 = 1; + } + } + if (base0 && base1) { + printf("Error: input matrix is base-0 and base-1 \n"); + return 1; + } - return 0; + base = 0; + if (base1) { + base = 1; + } + + /* compress the appropriate indices */ + if (csrFormat) { + /* CSR format (assuming row-major format) */ + csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0])); + if (!csrRowPtr) + return 1; + compress_index(tempRowInd, *nnz, *m, csrRowPtr, base); + + *aRowInd = csrRowPtr; + *aColInd = (int *)malloc((*nnz) * sizeof(int)); + } + else { + /* CSC format (assuming column-major format) */ + cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0])); + if (!cscColPtr) + return 1; + compress_index(tempColInd, *nnz, *n, cscColPtr, base); + + *aColInd = cscColPtr; + *aRowInd = (int *)malloc((*nnz) * sizeof(int)); + } + + /* transfrom the matrix values of type double into one of the cusparse library + * types */ + *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM)); + + for (i = 0; i < (*nnz); i++) { + if (csrFormat) { + (*aColInd)[i] = tempColInd[i]; + } + else { + (*aRowInd)[i] = tempRowInd[i]; + } + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + (*aVal)[i] = cuGet(tempVal[work[i].p]); + } + else { + (*aVal)[i] = cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); + } + } + + /* check for corruption */ + int error_found; + if (csrFormat) { + error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); + } + else { + error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); + } + if (error_found) { + fprintf(stderr, "!!!! 
verify_pattern failed\n"); + return 1; + } + + /* cleanup and exit */ + free(work); + free(tempVal); + free(tempColInd); + free(tempRowInd); + + return 0; } /* specific instantiation */ -template int loadMMSparseMatrix(char *filename, char elem_type, - bool csrFormat, int *m, int *n, int *nnz, - float **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + float **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix(char *filename, char elem_type, - bool csrFormat, int *m, int *n, - int *nnz, double **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + double **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix(char *filename, char elem_type, - bool csrFormat, int *m, int *n, - int *nnz, cuComplex **aVal, - int **aRowInd, int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix( - char *filename, char elem_type, bool csrFormat, int *m, int *n, int *nnz, - cuDoubleComplex **aVal, int **aRowInd, int **aColInd, int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuDoubleComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp index d42ac64c..dde0734a 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/cuSolverSp_LinearSolver.cpp @@ -73,638 +73,666 @@ #include #include +#include #include #include #include -#include - #include "cusolverSp.h" #include "cusparse.h" - #include "helper_cuda.h" #include "helper_cusolver.h" template -int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m, - int *n, int *nnz, T_ELEM **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -void UsageSP(void) { - printf("\n"); - printf("-h : display this help\n"); - printf("-R= : choose a linear solver\n"); - printf(" chol (cholesky factorization), this is default\n"); - printf(" qr (QR factorization)\n"); - printf(" lu (LU factorization)\n"); - printf("-P= : choose a reordering\n"); - printf(" symrcm (Reverse Cuthill-McKee)\n"); - printf(" symamd (Approximate Minimum Degree)\n"); - printf(" metis (nested dissection)\n"); - printf("-file= : filename containing a matrix in MM format\n"); - printf("-device= : if want to run on specific GPU\n"); +void UsageSP(void) +{ + printf("\n"); + printf("-h : display this help\n"); + printf("-R= : choose a linear solver\n"); + printf(" chol (cholesky factorization), this is default\n"); + printf(" qr (QR factorization)\n"); + printf(" lu (LU factorization)\n"); + printf("-P= : choose a reordering\n"); + printf(" symrcm (Reverse Cuthill-McKee)\n"); + printf(" symamd 
(Approximate Minimum Degree)\n"); + printf(" metis (nested dissection)\n"); + printf("-file= : filename containing a matrix in MM format\n"); + printf("-device= : if want to run on specific GPU\n"); - exit(0); + exit(0); } -void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) { - memset(&opts, 0, sizeof(opts)); +void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) +{ + memset(&opts, 0, sizeof(opts)); - if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { - UsageSP(); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "R")) { - char *solverType = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType); - - if (solverType) { - if ((STRCASECMP(solverType, "chol") != 0) && - (STRCASECMP(solverType, "lu") != 0) && - (STRCASECMP(solverType, "qr") != 0)) { - printf("\nIncorrect argument passed to -R option\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { UsageSP(); - } else { - opts.testFunc = solverType; - } } - } - if (checkCmdLineFlag(argc, (const char **)argv, "P")) { - char *reorderType = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "P", &reorderType); + if (checkCmdLineFlag(argc, (const char **)argv, "R")) { + char *solverType = NULL; + getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType); - if (reorderType) { - if ((STRCASECMP(reorderType, "symrcm") != 0) && - (STRCASECMP(reorderType, "symamd") != 0) && - (STRCASECMP(reorderType, "metis") != 0)) { - printf("\nIncorrect argument passed to -P option\n"); - UsageSP(); - } else { - opts.reorder = reorderType; - } + if (solverType) { + if ((STRCASECMP(solverType, "chol") != 0) && (STRCASECMP(solverType, "lu") != 0) + && (STRCASECMP(solverType, "qr") != 0)) { + printf("\nIncorrect argument passed to -R option\n"); + UsageSP(); + } + else { + opts.testFunc = solverType; + } + } } - } - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - char *fileName = 0; - getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); + if (checkCmdLineFlag(argc, (const char **)argv, "P")) { + char *reorderType = NULL; + getCmdLineArgumentString(argc, (const char **)argv, "P", &reorderType); - if (fileName) { - opts.sparse_mat_filename = fileName; - } else { - printf("\nIncorrect filename passed to -file \n "); - UsageSP(); + if (reorderType) { + if ((STRCASECMP(reorderType, "symrcm") != 0) && (STRCASECMP(reorderType, "symamd") != 0) + && (STRCASECMP(reorderType, "metis") != 0)) { + printf("\nIncorrect argument passed to -P option\n"); + UsageSP(); + } + else { + opts.reorder = reorderType; + } + } + } + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + char *fileName = 0; + getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); + + if (fileName) { + opts.sparse_mat_filename = fileName; + } + else { + printf("\nIncorrect filename passed to -file \n "); + UsageSP(); + } } - } } -int main(int argc, char *argv[]) { - struct testOpts opts; - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; /* used in residual evaluation */ - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; +int main(int argc, char *argv[]) +{ + struct testOpts opts; + cusolverSpHandle_t handle = NULL; + cusparseHandle_t cusparseHandle = NULL; /* used in residual evaluation */ + cudaStream_t stream = NULL; + cusparseMatDescr_t descrA = NULL; - int rowsA = 0; /* number of rows of A */ - int colsA = 0; /* number of columns of A */ - int nnzA = 0; /* number of nonzeros of A */ - int baseA 
= 0; /* base index in CSR format */ + int rowsA = 0; /* number of rows of A */ + int colsA = 0; /* number of columns of A */ + int nnzA = 0; /* number of nonzeros of A */ + int baseA = 0; /* base index in CSR format */ - /* CSR(A) from I/O */ - int *h_csrRowPtrA = NULL; - int *h_csrColIndA = NULL; - double *h_csrValA = NULL; + /* CSR(A) from I/O */ + int *h_csrRowPtrA = NULL; + int *h_csrColIndA = NULL; + double *h_csrValA = NULL; - double *h_z = NULL; /* z = B \ (Q*b) */ - double *h_x = NULL; /* x = A \ b */ - double *h_b = NULL; /* b = ones(n,1) */ - double *h_Qb = NULL; /* Q*b */ - double *h_r = NULL; /* r = b - A*x */ + double *h_z = NULL; /* z = B \ (Q*b) */ + double *h_x = NULL; /* x = A \ b */ + double *h_b = NULL; /* b = ones(n,1) */ + double *h_Qb = NULL; /* Q*b */ + double *h_r = NULL; /* r = b - A*x */ - int *h_Q = NULL; /* n */ - /* reorder to reduce zero fill-in */ - /* Q = symrcm(A) or Q = symamd(A) */ - /* B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */ - int *h_csrRowPtrB = NULL; /* n+1 */ - int *h_csrColIndB = NULL; /* nnzA */ - double *h_csrValB = NULL; /* nnzA */ - int *h_mapBfromA = NULL; /* nnzA */ + int *h_Q = NULL; /* n */ + /* reorder to reduce zero fill-in */ + /* Q = symrcm(A) or Q = symamd(A) */ + /* B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */ + int *h_csrRowPtrB = NULL; /* n+1 */ + int *h_csrColIndB = NULL; /* nnzA */ + double *h_csrValB = NULL; /* nnzA */ + int *h_mapBfromA = NULL; /* nnzA */ - size_t size_perm = 0; - void *buffer_cpu = NULL; /* working space for permutation: B = Q*A*Q^T */ + size_t size_perm = 0; + void *buffer_cpu = NULL; /* working space for permutation: B = Q*A*Q^T */ - /* device copy of A: used in residual evaluation */ - int *d_csrRowPtrA = NULL; - int *d_csrColIndA = NULL; - double *d_csrValA = NULL; + /* device copy of A: used in residual evaluation */ + int *d_csrRowPtrA = NULL; + int *d_csrColIndA = NULL; + double *d_csrValA = NULL; - /* device copy of B: used in B*z = Q*b */ - int *d_csrRowPtrB = NULL; - int *d_csrColIndB = NULL; - double *d_csrValB = NULL; + /* device copy of B: used in B*z = Q*b */ + int *d_csrRowPtrB = NULL; + int *d_csrColIndB = NULL; + double *d_csrValB = NULL; - int *d_Q = NULL; /* device copy of h_Q */ - double *d_z = NULL; /* z = B \ Q*b */ - double *d_x = NULL; /* x = A \ b */ - double *d_b = NULL; /* a copy of h_b */ - double *d_Qb = NULL; /* a copy of h_Qb */ - double *d_r = NULL; /* r = b - A*x */ + int *d_Q = NULL; /* device copy of h_Q */ + double *d_z = NULL; /* z = B \ Q*b */ + double *d_x = NULL; /* x = A \ b */ + double *d_b = NULL; /* a copy of h_b */ + double *d_Qb = NULL; /* a copy of h_Qb */ + double *d_r = NULL; /* r = b - A*x */ - double tol = 1.e-12; - const int reorder = 0; /* no reordering */ - int singularity = 0; /* -1 if A is invertible under tol. */ + double tol = 1.e-12; + const int reorder = 0; /* no reordering */ + int singularity = 0; /* -1 if A is invertible under tol. 
*/ - /* the constants are used in residual evaluation, r = b - A*x */ - const double minus_one = -1.0; - const double one = 1.0; + /* the constants are used in residual evaluation, r = b - A*x */ + const double minus_one = -1.0; + const double one = 1.0; - double b_inf = 0.0; - double x_inf = 0.0; - double r_inf = 0.0; - double A_inf = 0.0; - int errors = 0; - int issym = 0; + double b_inf = 0.0; + double x_inf = 0.0; + double r_inf = 0.0; + double A_inf = 0.0; + int errors = 0; + int issym = 0; - double start, stop; - double time_solve_cpu; - double time_solve_gpu; + double start, stop; + double time_solve_cpu; + double time_solve_gpu; - parseCommandLineArguments(argc, argv, opts); + parseCommandLineArguments(argc, argv, opts); - if (NULL == opts.testFunc) { - opts.testFunc = - "chol"; /* By default running Cholesky as NO solver selected with -R - option. */ - } - - findCudaDevice(argc, (const char **)argv); - - if (opts.sparse_mat_filename == NULL) { - opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]); - if (opts.sparse_mat_filename != NULL) - printf("Using default input file [%s]\n", opts.sparse_mat_filename); - else - printf("Could not find lap2D_5pt_n100.mtx\n"); - } else { - printf("Using input file [%s]\n", opts.sparse_mat_filename); - } - - printf("step 1: read matrix market format\n"); - - if (opts.sparse_mat_filename == NULL) { - fprintf(stderr, "Error: input matrix is not provided\n"); - return EXIT_FAILURE; - } - - if (loadMMSparseMatrix(opts.sparse_mat_filename, 'd', true, &rowsA, - &colsA, &nnzA, &h_csrValA, &h_csrRowPtrA, - &h_csrColIndA, true)) { - exit(EXIT_FAILURE); - } - baseA = h_csrRowPtrA[0]; // baseA = {0,1} - printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, - nnzA, baseA); - - if (rowsA != colsA) { - fprintf(stderr, "Error: only support square matrix\n"); - return 1; - } - - checkCudaErrors(cusolverSpCreate(&handle)); - checkCudaErrors(cusparseCreate(&cusparseHandle)); - - checkCudaErrors(cudaStreamCreate(&stream)); - /* bind stream to cusparse and cusolver*/ - checkCudaErrors(cusolverSpSetStream(handle, stream)); - checkCudaErrors(cusparseSetStream(cusparseHandle, stream)); - - /* configure matrix descriptor*/ - checkCudaErrors(cusparseCreateMatDescr(&descrA)); - checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - if (baseA) { - checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); - } else { - checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); - } - - h_z = (double *)malloc(sizeof(double) * colsA); - h_x = (double *)malloc(sizeof(double) * colsA); - h_b = (double *)malloc(sizeof(double) * rowsA); - h_Qb = (double *)malloc(sizeof(double) * rowsA); - h_r = (double *)malloc(sizeof(double) * rowsA); - - h_Q = (int *)malloc(sizeof(int) * colsA); - h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1)); - h_csrColIndB = (int *)malloc(sizeof(int) * nnzA); - h_csrValB = (double *)malloc(sizeof(double) * nnzA); - h_mapBfromA = (int *)malloc(sizeof(int) * nnzA); - - assert(NULL != h_z); - assert(NULL != h_x); - assert(NULL != h_b); - assert(NULL != h_Qb); - assert(NULL != h_r); - assert(NULL != h_Q); - assert(NULL != h_csrRowPtrB); - assert(NULL != h_csrColIndB); - assert(NULL != h_csrValB); - assert(NULL != h_mapBfromA); - - checkCudaErrors( - cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); - checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * 
nnzA)); - checkCudaErrors( - cudaMalloc((void **)&d_csrRowPtrB, sizeof(int) * (rowsA + 1))); - checkCudaErrors(cudaMalloc((void **)&d_csrColIndB, sizeof(int) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_csrValB, sizeof(double) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_Q, sizeof(int) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_z, sizeof(double) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_Qb, sizeof(double) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); - - /* verify if A has symmetric pattern or not */ - checkCudaErrors(cusolverSpXcsrissymHost(handle, rowsA, nnzA, descrA, - h_csrRowPtrA, h_csrRowPtrA + 1, - h_csrColIndA, &issym)); - - if (0 == strcmp(opts.testFunc, "chol")) { - if (!issym) { - printf("Error: A has no symmetric pattern, please use LU or QR \n"); - exit(EXIT_FAILURE); + if (NULL == opts.testFunc) { + opts.testFunc = "chol"; /* By default running Cholesky as NO solver selected with -R + option. */ } - } - printf("step 2: reorder the matrix A to minimize zero fill-in\n"); - printf( - " if the user choose a reordering by -P=symrcm, -P=symamd or " - "-P=metis\n"); + findCudaDevice(argc, (const char **)argv); - if (NULL != opts.reorder) { - if (0 == strcmp(opts.reorder, "symrcm")) { - printf("step 2.1: Q = symrcm(A) \n"); - checkCudaErrors(cusolverSpXcsrsymrcmHost( - handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Q)); - } else if (0 == strcmp(opts.reorder, "symamd")) { - printf("step 2.1: Q = symamd(A) \n"); - checkCudaErrors(cusolverSpXcsrsymamdHost( - handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Q)); - } else if (0 == strcmp(opts.reorder, "metis")) { - printf("step 2.1: Q = metis(A) \n"); - checkCudaErrors(cusolverSpXcsrmetisndHost(handle, rowsA, nnzA, descrA, - h_csrRowPtrA, h_csrColIndA, - NULL, /* default setting. 
*/ - h_Q)); - } else { - fprintf(stderr, "Error: %s is unknown reordering\n", opts.reorder); - return 1; + if (opts.sparse_mat_filename == NULL) { + opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]); + if (opts.sparse_mat_filename != NULL) + printf("Using default input file [%s]\n", opts.sparse_mat_filename); + else + printf("Could not find lap2D_5pt_n100.mtx\n"); } - } else { - printf("step 2.1: no reordering is chosen, Q = 0:n-1 \n"); - for (int j = 0; j < rowsA; j++) { - h_Q[j] = j; + else { + printf("Using input file [%s]\n", opts.sparse_mat_filename); } - } - printf("step 2.2: B = A(Q,Q) \n"); + printf("step 1: read matrix market format\n"); - memcpy(h_csrRowPtrB, h_csrRowPtrA, sizeof(int) * (rowsA + 1)); - memcpy(h_csrColIndB, h_csrColIndA, sizeof(int) * nnzA); - - checkCudaErrors(cusolverSpXcsrperm_bufferSizeHost( - handle, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, h_Q, h_Q, - &size_perm)); - - if (buffer_cpu) { - free(buffer_cpu); - } - buffer_cpu = (void *)malloc(sizeof(char) * size_perm); - assert(NULL != buffer_cpu); - - /* h_mapBfromA = Identity */ - for (int j = 0; j < nnzA; j++) { - h_mapBfromA[j] = j; - } - checkCudaErrors(cusolverSpXcsrpermHost(handle, rowsA, colsA, nnzA, descrA, - h_csrRowPtrB, h_csrColIndB, h_Q, h_Q, - h_mapBfromA, buffer_cpu)); - - /* B = A( mapBfromA ) */ - for (int j = 0; j < nnzA; j++) { - h_csrValB[j] = h_csrValA[h_mapBfromA[j]]; - } - - printf("step 3: b(j) = 1 + j/n \n"); - for (int row = 0; row < rowsA; row++) { - h_b[row] = 1.0 + ((double)row) / ((double)rowsA); - } - - /* h_Qb = b(Q) */ - for (int row = 0; row < rowsA; row++) { - h_Qb[row] = h_b[h_Q[row]]; - } - - printf("step 4: prepare data on device\n"); - checkCudaErrors(cudaMemcpyAsync(d_csrRowPtrA, h_csrRowPtrA, - sizeof(int) * (rowsA + 1), - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_csrColIndA, h_csrColIndA, - sizeof(int) * nnzA, cudaMemcpyHostToDevice, - stream)); - checkCudaErrors(cudaMemcpyAsync(d_csrValA, h_csrValA, sizeof(double) * nnzA, - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_csrRowPtrB, h_csrRowPtrB, - sizeof(int) * (rowsA + 1), - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_csrColIndB, h_csrColIndB, - sizeof(int) * nnzA, cudaMemcpyHostToDevice, - stream)); - checkCudaErrors(cudaMemcpyAsync(d_csrValB, h_csrValB, sizeof(double) * nnzA, - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_b, h_b, sizeof(double) * rowsA, - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_Qb, h_Qb, sizeof(double) * rowsA, - cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_Q, h_Q, sizeof(int) * rowsA, - cudaMemcpyHostToDevice, stream)); - - printf("step 5: solve A*x = b on CPU \n"); - start = second(); - - /* solve B*z = Q*b */ - if (0 == strcmp(opts.testFunc, "chol")) { - checkCudaErrors(cusolverSpDcsrlsvcholHost( - handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, - h_Qb, tol, reorder, h_z, &singularity)); - } else if (0 == strcmp(opts.testFunc, "lu")) { - checkCudaErrors(cusolverSpDcsrlsvluHost( - handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, - h_Qb, tol, reorder, h_z, &singularity)); - - } else if (0 == strcmp(opts.testFunc, "qr")) { - checkCudaErrors(cusolverSpDcsrlsvqrHost( - handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, - h_Qb, tol, reorder, h_z, &singularity)); - } else { - fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc); - return 1; - 
} - - /* Q*x = z */ - for (int row = 0; row < rowsA; row++) { - h_x[h_Q[row]] = h_z[row]; - } - - if (0 <= singularity) { - printf("WARNING: the matrix is singular at row %d under tol (%E)\n", - singularity, tol); - } - - stop = second(); - time_solve_cpu = stop - start; - - printf("step 6: evaluate residual r = b - A*x (result on CPU)\n"); - checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, - cudaMemcpyDeviceToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_x, h_x, sizeof(double) * colsA, - cudaMemcpyHostToDevice, stream)); - - /* Wrap raw data into cuSPARSE generic API objects */ - cusparseSpMatDescr_t matA = NULL; - if (baseA) { - checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, - d_csrColIndA, d_csrValA, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)); - } else { - checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, - d_csrColIndA, d_csrValA, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); - } - - cusparseDnVecDescr_t vecx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F)); - cusparseDnVecDescr_t vecAx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F)); - - /* Allocate workspace for cuSPARSE */ - size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, - cudaMemcpyDeviceToHost, stream)); - /* wait until h_r is ready */ - checkCudaErrors(cudaDeviceSynchronize()); - - b_inf = vec_norminf(rowsA, h_b); - x_inf = vec_norminf(colsA, h_x); - r_inf = vec_norminf(rowsA, h_r); - A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, - h_csrColIndA); - - printf("(CPU) |b - A*x| = %E \n", r_inf); - printf("(CPU) |A| = %E \n", A_inf); - printf("(CPU) |x| = %E \n", x_inf); - printf("(CPU) |b| = %E \n", b_inf); - printf("(CPU) |b - A*x|/(|A|*|x| + |b|) = %E \n", - r_inf / (A_inf * x_inf + b_inf)); - - printf("step 7: solve A*x = b on GPU\n"); - start = second(); - - /* solve B*z = Q*b */ - if (0 == strcmp(opts.testFunc, "chol")) { - checkCudaErrors(cusolverSpDcsrlsvchol( - handle, rowsA, nnzA, descrA, d_csrValB, d_csrRowPtrB, d_csrColIndB, - d_Qb, tol, reorder, d_z, &singularity)); - - } else if (0 == strcmp(opts.testFunc, "lu")) { - printf("WARNING: no LU available on GPU \n"); - } else if (0 == strcmp(opts.testFunc, "qr")) { - checkCudaErrors(cusolverSpDcsrlsvqr(handle, rowsA, nnzA, descrA, d_csrValB, - d_csrRowPtrB, d_csrColIndB, d_Qb, tol, - reorder, d_z, &singularity)); - } else { - fprintf(stderr, "Error: %s is unknow function\n", opts.testFunc); - return 1; - } - checkCudaErrors(cudaDeviceSynchronize()); - if (0 <= singularity) { - printf("WARNING: the matrix is singular at row %d under tol (%E)\n", - singularity, tol); - } - /* Q*x = z */ - cusparseSpVecDescr_t vecz = NULL; - checkCudaErrors(cusparseCreateSpVec(&vecz, colsA, rowsA, d_Q, d_z, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); - checkCudaErrors(cusparseScatter(cusparseHandle, vecz, vecx)); - checkCudaErrors(cusparseDestroySpVec(vecz)); - - 
- checkCudaErrors(cudaDeviceSynchronize()); - - stop = second(); - time_solve_gpu = stop - start; - - printf("step 8: evaluate residual r = b - A*x (result on GPU)\n"); - checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, - cudaMemcpyDeviceToDevice, stream)); - - checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, - cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, - cudaMemcpyDeviceToHost, stream)); - /* wait until h_x and h_r are ready */ - checkCudaErrors(cudaDeviceSynchronize()); - - b_inf = vec_norminf(rowsA, h_b); - x_inf = vec_norminf(colsA, h_x); - r_inf = vec_norminf(rowsA, h_r); - - if (0 != strcmp(opts.testFunc, "lu")) { - // only cholesky and qr have GPU version - printf("(GPU) |b - A*x| = %E \n", r_inf); - printf("(GPU) |A| = %E \n", A_inf); - printf("(GPU) |x| = %E \n", x_inf); - printf("(GPU) |b| = %E \n", b_inf); - printf("(GPU) |b - A*x|/(|A|*|x| + |b|) = %E \n", - r_inf / (A_inf * x_inf + b_inf)); - } - - fprintf(stdout, "timing %s: CPU = %10.6f sec , GPU = %10.6f sec\n", - opts.testFunc, time_solve_cpu, time_solve_gpu); - - if (0 != strcmp(opts.testFunc, "lu")) { - printf("show last 10 elements of solution vector (GPU) \n"); - printf("consistent result for different reordering and solver \n"); - for (int j = rowsA - 10; j < rowsA; j++) { - printf("x[%d] = %E\n", j, h_x[j]); + if (opts.sparse_mat_filename == NULL) { + fprintf(stderr, "Error: input matrix is not provided\n"); + return EXIT_FAILURE; } - } - if (handle) { - checkCudaErrors(cusolverSpDestroy(handle)); - } - if (cusparseHandle) { - checkCudaErrors(cusparseDestroy(cusparseHandle)); - } - if (stream) { - checkCudaErrors(cudaStreamDestroy(stream)); - } - if (descrA) { - checkCudaErrors(cusparseDestroyMatDescr(descrA)); - } - if (matA) { - checkCudaErrors(cusparseDestroySpMat(matA)); - } - if (vecx) { - checkCudaErrors(cusparseDestroyDnVec(vecx)); - } - if (vecAx) { - checkCudaErrors(cusparseDestroyDnVec(vecAx)); - } + if (loadMMSparseMatrix(opts.sparse_mat_filename, + 'd', + true, + &rowsA, + &colsA, + &nnzA, + &h_csrValA, + &h_csrRowPtrA, + &h_csrColIndA, + true)) { + exit(EXIT_FAILURE); + } + baseA = h_csrRowPtrA[0]; // baseA = {0,1} + printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA); - if (h_csrValA) { - free(h_csrValA); - } - if (h_csrRowPtrA) { - free(h_csrRowPtrA); - } - if (h_csrColIndA) { - free(h_csrColIndA); - } - if (h_z) { - free(h_z); - } - if (h_x) { - free(h_x); - } - if (h_b) { - free(h_b); - } - if (h_Qb) { - free(h_Qb); - } - if (h_r) { - free(h_r); - } + if (rowsA != colsA) { + fprintf(stderr, "Error: only support square matrix\n"); + return 1; + } - if (h_Q) { - free(h_Q); - } + checkCudaErrors(cusolverSpCreate(&handle)); + checkCudaErrors(cusparseCreate(&cusparseHandle)); - if (h_csrRowPtrB) { - free(h_csrRowPtrB); - } - if (h_csrColIndB) { - free(h_csrColIndB); - } - if (h_csrValB) { - free(h_csrValB); - } - if (h_mapBfromA) { - free(h_mapBfromA); - } + checkCudaErrors(cudaStreamCreate(&stream)); + /* bind stream to cusparse and cusolver*/ + checkCudaErrors(cusolverSpSetStream(handle, stream)); + checkCudaErrors(cusparseSetStream(cusparseHandle, stream)); - if (buffer_cpu) { - free(buffer_cpu); - } + /* configure matrix descriptor*/ + checkCudaErrors(cusparseCreateMatDescr(&descrA)); + 
checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + if (baseA) { + checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); + } + else { + checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + } - if (buffer) { - checkCudaErrors(cudaFree(buffer)); - } - if (d_csrValA) { - checkCudaErrors(cudaFree(d_csrValA)); - } - if (d_csrRowPtrA) { - checkCudaErrors(cudaFree(d_csrRowPtrA)); - } - if (d_csrColIndA) { - checkCudaErrors(cudaFree(d_csrColIndA)); - } - if (d_csrValB) { - checkCudaErrors(cudaFree(d_csrValB)); - } - if (d_csrRowPtrB) { - checkCudaErrors(cudaFree(d_csrRowPtrB)); - } - if (d_csrColIndB) { - checkCudaErrors(cudaFree(d_csrColIndB)); - } - if (d_Q) { - checkCudaErrors(cudaFree(d_Q)); - } - if (d_z) { - checkCudaErrors(cudaFree(d_z)); - } - if (d_x) { - checkCudaErrors(cudaFree(d_x)); - } - if (d_b) { - checkCudaErrors(cudaFree(d_b)); - } - if (d_Qb) { - checkCudaErrors(cudaFree(d_Qb)); - } - if (d_r) { - checkCudaErrors(cudaFree(d_r)); - } + h_z = (double *)malloc(sizeof(double) * colsA); + h_x = (double *)malloc(sizeof(double) * colsA); + h_b = (double *)malloc(sizeof(double) * rowsA); + h_Qb = (double *)malloc(sizeof(double) * rowsA); + h_r = (double *)malloc(sizeof(double) * rowsA); - return 0; + h_Q = (int *)malloc(sizeof(int) * colsA); + h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1)); + h_csrColIndB = (int *)malloc(sizeof(int) * nnzA); + h_csrValB = (double *)malloc(sizeof(double) * nnzA); + h_mapBfromA = (int *)malloc(sizeof(int) * nnzA); + + assert(NULL != h_z); + assert(NULL != h_x); + assert(NULL != h_b); + assert(NULL != h_Qb); + assert(NULL != h_r); + assert(NULL != h_Q); + assert(NULL != h_csrRowPtrB); + assert(NULL != h_csrColIndB); + assert(NULL != h_csrValB); + assert(NULL != h_mapBfromA); + + checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); + checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrB, sizeof(int) * (rowsA + 1))); + checkCudaErrors(cudaMalloc((void **)&d_csrColIndB, sizeof(int) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_csrValB, sizeof(double) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_Q, sizeof(int) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_z, sizeof(double) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_Qb, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); + + /* verify if A has symmetric pattern or not */ + checkCudaErrors( + cusolverSpXcsrissymHost(handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrRowPtrA + 1, h_csrColIndA, &issym)); + + if (0 == strcmp(opts.testFunc, "chol")) { + if (!issym) { + printf("Error: A has no symmetric pattern, please use LU or QR \n"); + exit(EXIT_FAILURE); + } + } + + printf("step 2: reorder the matrix A to minimize zero fill-in\n"); + printf(" if the user choose a reordering by -P=symrcm, -P=symamd or " + "-P=metis\n"); + + if (NULL != opts.reorder) { + if (0 == strcmp(opts.reorder, "symrcm")) { + printf("step 2.1: Q = symrcm(A) \n"); + checkCudaErrors(cusolverSpXcsrsymrcmHost(handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Q)); + } + else if (0 == strcmp(opts.reorder, "symamd")) { + printf("step 2.1: Q = symamd(A) \n"); + 
checkCudaErrors(cusolverSpXcsrsymamdHost(handle, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_Q)); + } + else if (0 == strcmp(opts.reorder, "metis")) { + printf("step 2.1: Q = metis(A) \n"); + checkCudaErrors(cusolverSpXcsrmetisndHost(handle, + rowsA, + nnzA, + descrA, + h_csrRowPtrA, + h_csrColIndA, + NULL, /* default setting. */ + h_Q)); + } + else { + fprintf(stderr, "Error: %s is unknown reordering\n", opts.reorder); + return 1; + } + } + else { + printf("step 2.1: no reordering is chosen, Q = 0:n-1 \n"); + for (int j = 0; j < rowsA; j++) { + h_Q[j] = j; + } + } + + printf("step 2.2: B = A(Q,Q) \n"); + + memcpy(h_csrRowPtrB, h_csrRowPtrA, sizeof(int) * (rowsA + 1)); + memcpy(h_csrColIndB, h_csrColIndA, sizeof(int) * nnzA); + + checkCudaErrors(cusolverSpXcsrperm_bufferSizeHost( + handle, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, h_Q, h_Q, &size_perm)); + + if (buffer_cpu) { + free(buffer_cpu); + } + buffer_cpu = (void *)malloc(sizeof(char) * size_perm); + assert(NULL != buffer_cpu); + + /* h_mapBfromA = Identity */ + for (int j = 0; j < nnzA; j++) { + h_mapBfromA[j] = j; + } + checkCudaErrors(cusolverSpXcsrpermHost( + handle, rowsA, colsA, nnzA, descrA, h_csrRowPtrB, h_csrColIndB, h_Q, h_Q, h_mapBfromA, buffer_cpu)); + + /* B = A( mapBfromA ) */ + for (int j = 0; j < nnzA; j++) { + h_csrValB[j] = h_csrValA[h_mapBfromA[j]]; + } + + printf("step 3: b(j) = 1 + j/n \n"); + for (int row = 0; row < rowsA; row++) { + h_b[row] = 1.0 + ((double)row) / ((double)rowsA); + } + + /* h_Qb = b(Q) */ + for (int row = 0; row < rowsA; row++) { + h_Qb[row] = h_b[h_Q[row]]; + } + + printf("step 4: prepare data on device\n"); + checkCudaErrors( + cudaMemcpyAsync(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice, stream)); + checkCudaErrors( + cudaMemcpyAsync(d_csrRowPtrB, h_csrRowPtrB, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csrColIndB, h_csrColIndB, sizeof(int) * nnzA, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csrValB, h_csrValB, sizeof(double) * nnzA, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_Qb, h_Qb, sizeof(double) * rowsA, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_Q, h_Q, sizeof(int) * rowsA, cudaMemcpyHostToDevice, stream)); + + printf("step 5: solve A*x = b on CPU \n"); + start = second(); + + /* solve B*z = Q*b */ + if (0 == strcmp(opts.testFunc, "chol")) { + checkCudaErrors(cusolverSpDcsrlsvcholHost( + handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, h_Qb, tol, reorder, h_z, &singularity)); + } + else if (0 == strcmp(opts.testFunc, "lu")) { + checkCudaErrors(cusolverSpDcsrlsvluHost( + handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, h_Qb, tol, reorder, h_z, &singularity)); + } + else if (0 == strcmp(opts.testFunc, "qr")) { + checkCudaErrors(cusolverSpDcsrlsvqrHost( + handle, rowsA, nnzA, descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB, h_Qb, tol, reorder, h_z, &singularity)); + } + else { + fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc); + return 1; + } + + /* Q*x = z */ + for (int row = 0; row < rowsA; row++) { + h_x[h_Q[row]] = 
h_z[row];
+    }
+
+    if (0 <= singularity) {
+        printf("WARNING: the matrix is singular at row %d under tol (%E)\n", singularity, tol);
+    }
+
+    stop           = second();
+    time_solve_cpu = stop - start;
+
+    printf("step 6: evaluate residual r = b - A*x (result on CPU)\n");
+    checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice, stream));
+
+    /* Wrap raw data into cuSPARSE generic API objects */
+    cusparseSpMatDescr_t matA = NULL;
+    if (baseA) {
+        checkCudaErrors(cusparseCreateCsr(&matA,
+                                          rowsA,
+                                          colsA,
+                                          nnzA,
+                                          d_csrRowPtrA,
+                                          d_csrColIndA,
+                                          d_csrValA,
+                                          CUSPARSE_INDEX_32I,
+                                          CUSPARSE_INDEX_32I,
+                                          CUSPARSE_INDEX_BASE_ONE,
+                                          CUDA_R_64F));
+    }
+    else {
+        checkCudaErrors(cusparseCreateCsr(&matA,
+                                          rowsA,
+                                          colsA,
+                                          nnzA,
+                                          d_csrRowPtrA,
+                                          d_csrColIndA,
+                                          d_csrValA,
+                                          CUSPARSE_INDEX_32I,
+                                          CUSPARSE_INDEX_32I,
+                                          CUSPARSE_INDEX_BASE_ZERO,
+                                          CUDA_R_64F));
+    }
+
+    cusparseDnVecDescr_t vecx = NULL;
+    checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F));
+    cusparseDnVecDescr_t vecAx = NULL;
+    checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F));
+
+    /* Allocate workspace for cuSPARSE */
+    size_t bufferSize = 0;
+    checkCudaErrors(cusparseSpMV_bufferSize(cusparseHandle,
+                                            CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                            &minus_one,
+                                            matA,
+                                            vecx,
+                                            &one,
+                                            vecAx,
+                                            CUDA_R_64F,
+                                            CUSPARSE_SPMV_ALG_DEFAULT,
+                                            &bufferSize));
+    void *buffer = NULL;
+    checkCudaErrors(cudaMalloc(&buffer, bufferSize));
+
+    checkCudaErrors(cusparseSpMV(cusparseHandle,
+                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                 &minus_one,
+                                 matA,
+                                 vecx,
+                                 &one,
+                                 vecAx,
+                                 CUDA_R_64F,
+                                 CUSPARSE_SPMV_ALG_DEFAULT,
+                                 buffer));
+
+    checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream));
+    /* wait until h_r is ready */
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    b_inf = vec_norminf(rowsA, h_b);
+    x_inf = vec_norminf(colsA, h_x);
+    r_inf = vec_norminf(rowsA, h_r);
+    A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA);
+
+    printf("(CPU) |b - A*x| = %E \n", r_inf);
+    printf("(CPU) |A| = %E \n", A_inf);
+    printf("(CPU) |x| = %E \n", x_inf);
+    printf("(CPU) |b| = %E \n", b_inf);
+    printf("(CPU) |b - A*x|/(|A|*|x| + |b|) = %E \n", r_inf / (A_inf * x_inf + b_inf));
+
+    printf("step 7: solve A*x = b on GPU\n");
+    start = second();
+
+    /* solve B*z = Q*b */
+    if (0 == strcmp(opts.testFunc, "chol")) {
+        checkCudaErrors(cusolverSpDcsrlsvchol(
+            handle, rowsA, nnzA, descrA, d_csrValB, d_csrRowPtrB, d_csrColIndB, d_Qb, tol, reorder, d_z, &singularity));
+    }
+    else if (0 == strcmp(opts.testFunc, "lu")) {
+        printf("WARNING: no LU available on GPU \n");
+    }
+    else if (0 == strcmp(opts.testFunc, "qr")) {
+        checkCudaErrors(cusolverSpDcsrlsvqr(
+            handle, rowsA, nnzA, descrA, d_csrValB, d_csrRowPtrB, d_csrColIndB, d_Qb, tol, reorder, d_z, &singularity));
+    }
+    else {
+        fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
+        return 1;
+    }
+    checkCudaErrors(cudaDeviceSynchronize());
+    if (0 <= singularity) {
+        printf("WARNING: the matrix is singular at row %d under tol (%E)\n", singularity, tol);
+    }
+    /* Q*x = z */
+    cusparseSpVecDescr_t vecz = NULL;
+    checkCudaErrors(
+        cusparseCreateSpVec(&vecz, colsA, rowsA, d_Q, d_z, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
+    checkCudaErrors(cusparseScatter(cusparseHandle, vecz, vecx));
+    checkCudaErrors(cusparseDestroySpVec(vecz));
+
+
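+    /* note: the cusparseScatter call above performs x(Q) = z on the device,
+       i.e. the same permutation step the host path applies in step 5 via
+       h_x[h_Q[row]] = h_z[row] */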
checkCudaErrors(cudaDeviceSynchronize()); + + stop = second(); + time_solve_gpu = stop - start; + + printf("step 8: evaluate residual r = b - A*x (result on GPU)\n"); + checkCudaErrors(cudaMemcpyAsync(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice, stream)); + + checkCudaErrors(cusparseSpMV(cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost, stream)); + /* wait until h_x and h_r are ready */ + checkCudaErrors(cudaDeviceSynchronize()); + + b_inf = vec_norminf(rowsA, h_b); + x_inf = vec_norminf(colsA, h_x); + r_inf = vec_norminf(rowsA, h_r); + + if (0 != strcmp(opts.testFunc, "lu")) { + // only cholesky and qr have GPU version + printf("(GPU) |b - A*x| = %E \n", r_inf); + printf("(GPU) |A| = %E \n", A_inf); + printf("(GPU) |x| = %E \n", x_inf); + printf("(GPU) |b| = %E \n", b_inf); + printf("(GPU) |b - A*x|/(|A|*|x| + |b|) = %E \n", r_inf / (A_inf * x_inf + b_inf)); + } + + fprintf(stdout, "timing %s: CPU = %10.6f sec , GPU = %10.6f sec\n", opts.testFunc, time_solve_cpu, time_solve_gpu); + + if (0 != strcmp(opts.testFunc, "lu")) { + printf("show last 10 elements of solution vector (GPU) \n"); + printf("consistent result for different reordering and solver \n"); + for (int j = rowsA - 10; j < rowsA; j++) { + printf("x[%d] = %E\n", j, h_x[j]); + } + } + + if (handle) { + checkCudaErrors(cusolverSpDestroy(handle)); + } + if (cusparseHandle) { + checkCudaErrors(cusparseDestroy(cusparseHandle)); + } + if (stream) { + checkCudaErrors(cudaStreamDestroy(stream)); + } + if (descrA) { + checkCudaErrors(cusparseDestroyMatDescr(descrA)); + } + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + + if (h_csrValA) { + free(h_csrValA); + } + if (h_csrRowPtrA) { + free(h_csrRowPtrA); + } + if (h_csrColIndA) { + free(h_csrColIndA); + } + if (h_z) { + free(h_z); + } + if (h_x) { + free(h_x); + } + if (h_b) { + free(h_b); + } + if (h_Qb) { + free(h_Qb); + } + if (h_r) { + free(h_r); + } + + if (h_Q) { + free(h_Q); + } + + if (h_csrRowPtrB) { + free(h_csrRowPtrB); + } + if (h_csrColIndB) { + free(h_csrColIndB); + } + if (h_csrValB) { + free(h_csrValB); + } + if (h_mapBfromA) { + free(h_mapBfromA); + } + + if (buffer_cpu) { + free(buffer_cpu); + } + + if (buffer) { + checkCudaErrors(cudaFree(buffer)); + } + if (d_csrValA) { + checkCudaErrors(cudaFree(d_csrValA)); + } + if (d_csrRowPtrA) { + checkCudaErrors(cudaFree(d_csrRowPtrA)); + } + if (d_csrColIndA) { + checkCudaErrors(cudaFree(d_csrColIndA)); + } + if (d_csrValB) { + checkCudaErrors(cudaFree(d_csrValB)); + } + if (d_csrRowPtrB) { + checkCudaErrors(cudaFree(d_csrRowPtrB)); + } + if (d_csrColIndB) { + checkCudaErrors(cudaFree(d_csrColIndB)); + } + if (d_Q) { + checkCudaErrors(cudaFree(d_Q)); + } + if (d_z) { + checkCudaErrors(cudaFree(d_z)); + } + if (d_x) { + checkCudaErrors(cudaFree(d_x)); + } + if (d_b) { + checkCudaErrors(cudaFree(d_b)); + } + if (d_Qb) { + checkCudaErrors(cudaFree(d_Qb)); + } + if (d_r) { + checkCudaErrors(cudaFree(d_r)); + } + + return 0; } diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio.c b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio.c index 66e90770..7ecf9c2c 
100644
--- a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio.c
+++ b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio.c
@@ -1,128 +1,126 @@
-/*
-* Matrix Market I/O library for ANSI C
-*
-* See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
+/*
+ * Matrix Market I/O library for ANSI C
+ *
+ * See http://math.nist.gov/MatrixMarket for details.
+ *
+ *
+ */

/* avoid Windows warnings (for example: strcpy, fscanf, etc.) */
-#if defined(_WIN32)
+#if defined(_WIN32)
#define _CRT_SECURE_NO_WARNINGS
#endif

-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-
#include "mmio.h"

-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                               double **val_, int **I_, int **J_)
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_)
{
-    FILE *f;
+    FILE       *f;
    MM_typecode matcode;
-    int M, N, nz;
-    int i;
-    double *val;
-    int *I, *J;
-
+    int         M, N, nz;
+    int         i;
+    double     *val;
+    int        *I, *J;
+
    if ((f = fopen(fname, "r")) == NULL)
-        return -1;
-
-
-    if (mm_read_banner(f, &matcode) != 0)
-    {
+        return -1;
+
+
+    if (mm_read_banner(f, &matcode) != 0) {
        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
        printf(" in file [%s]\n", fname);
        return -1;
    }
-
-
-
-    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
-           mm_is_sparse(matcode)))
-    {
+
+
+    if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) {
        fprintf(stderr, "Sorry, this application does not support ");
-        fprintf(stderr, "Market Market type: [%s]\n",
-                mm_typecode_to_str(matcode));
+        fprintf(stderr, "Matrix Market type: [%s]\n", mm_typecode_to_str(matcode));
        return -1;
    }
-
    /* find out size of sparse matrix: M, N, nz .... */
-
-    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
-    {
+
+    if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) {
        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
        return -1;
    }
-
-    *M_ = M;
-    *N_ = N;
+
+    *M_  = M;
+    *N_  = N;
    *nz_ = nz;
-
    /* reserve memory for matrices */
-
-    I = (int *) malloc(nz * sizeof(int));
-    J = (int *) malloc(nz * sizeof(int));
-    val = (double *) malloc(nz * sizeof(double));
-
+
+    I   = (int *)malloc(nz * sizeof(int));
+    J   = (int *)malloc(nz * sizeof(int));
+    val = (double *)malloc(nz * sizeof(double));
+
    *val_ = val;
-    *I_ = I;
-    *J_ = J;
-
+    *I_   = I;
+    *J_   = J;
+
    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
    /*   (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)           */
-
-    for (i=0; i
 
+#if defined(__cplusplus)
-extern "C" {
+extern "C"
+{
#endif /* __cplusplus */

-#define MM_MAX_LINE_LENGTH 1025
-#define MatrixMarketBanner "%%MatrixMarket"
+#define MM_MAX_LINE_LENGTH  1025
+#define MatrixMarketBanner  "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64

-typedef char MM_typecode[4];
+    typedef char MM_typecode[4];

-char *mm_typecode_to_str(MM_typecode matcode);
+    char *mm_typecode_to_str(MM_typecode matcode);

-int mm_read_banner(FILE *f, MM_typecode *matcode);
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
-int mm_read_mtx_array_size(FILE *f, int *M, int *N);
+    int mm_read_banner(FILE *f, MM_typecode *matcode);
+    int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
+    int mm_read_mtx_array_size(FILE *f, int *M, int *N);

-int mm_write_banner(FILE *f, MM_typecode matcode);
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
-int mm_write_mtx_array_size(FILE *f, int M, int N);
+    int mm_write_banner(FILE *f, MM_typecode matcode);
+    int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
+    int mm_write_mtx_array_size(FILE *f, int M, int N);

-/********************* MM_typecode query fucntions ***************************/
+    /********************* MM_typecode query functions ***************************/

-#define mm_is_matrix(typecode) ((typecode)[0]=='M')
+#define mm_is_matrix(typecode) ((typecode)[0] == 'M')

-#define mm_is_sparse(typecode) ((typecode)[1]=='C')
-#define mm_is_coordinate(typecode)((typecode)[1]=='C')
-#define mm_is_dense(typecode) ((typecode)[1]=='A')
-#define mm_is_array(typecode) ((typecode)[1]=='A')
+#define mm_is_sparse(typecode) ((typecode)[1] == 'C')
+#define mm_is_coordinate(typecode) ((typecode)[1] == 'C')
+#define mm_is_dense(typecode) ((typecode)[1] == 'A')
+#define mm_is_array(typecode) ((typecode)[1] == 'A')

-#define mm_is_complex(typecode) ((typecode)[2]=='C')
-#define mm_is_real(typecode) ((typecode)[2]=='R')
-#define mm_is_pattern(typecode) ((typecode)[2]=='P')
-#define mm_is_integer(typecode) ((typecode)[2]=='I')
+#define mm_is_complex(typecode) ((typecode)[2] == 'C')
+#define mm_is_real(typecode) ((typecode)[2] == 'R')
+#define mm_is_pattern(typecode) ((typecode)[2] == 'P')
+#define mm_is_integer(typecode) ((typecode)[2] == 'I')

-#define mm_is_symmetric(typecode)((typecode)[3]=='S')
-#define mm_is_general(typecode) ((typecode)[3]=='G')
-#define mm_is_skew(typecode) ((typecode)[3]=='K')
-#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+#define mm_is_symmetric(typecode) ((typecode)[3] == 'S')
+#define mm_is_general(typecode) ((typecode)[3] == 'G')
+#define mm_is_skew(typecode) ((typecode)[3] == 'K')
+#define mm_is_hermitian(typecode) ((typecode)[3] == 'H')

-int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
+    int mm_is_valid(MM_typecode matcode); /* too complex for a macro */

-/********************* MM_typecode modify fucntions ***************************/
+    /********************* MM_typecode modify functions ***************************/

-#define mm_set_matrix(typecode) ((*typecode)[0]='M')
-#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
-#define mm_set_array(typecode) ((*typecode)[1]='A')
-#define mm_set_dense(typecode) mm_set_array(typecode)
-#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
+#define mm_set_matrix(typecode) ((*typecode)[0] = 'M')
+#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C')
+#define mm_set_array(typecode) ((*typecode)[1] = 'A')
+#define mm_set_dense(typecode) mm_set_array(typecode)
+#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
-#define mm_set_complex(typecode)((*typecode)[2]='C')
-#define mm_set_real(typecode) ((*typecode)[2]='R')
-#define mm_set_pattern(typecode)((*typecode)[2]='P')
-#define mm_set_integer(typecode)((*typecode)[2]='I')
+#define mm_set_complex(typecode) ((*typecode)[2] = 'C')
+#define mm_set_real(typecode) ((*typecode)[2] = 'R')
+#define mm_set_pattern(typecode) ((*typecode)[2] = 'P')
+#define mm_set_integer(typecode) ((*typecode)[2] = 'I')

-#define mm_set_symmetric(typecode)((*typecode)[3]='S')
-#define mm_set_general(typecode)((*typecode)[3]='G')
-#define mm_set_skew(typecode) ((*typecode)[3]='K')
-#define mm_set_hermitian(typecode)((*typecode)[3]='H')
+#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S')
+#define mm_set_general(typecode) ((*typecode)[3] = 'G')
+#define mm_set_skew(typecode) ((*typecode)[3] = 'K')
+#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H')

-#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
-    (*typecode)[2]=' ',(*typecode)[3]='G')
+#define mm_clear_typecode(typecode) ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G')

#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)

-/********************* Matrix Market error codes ***************************/
+    /********************* Matrix Market error codes ***************************/

-#define MM_COULD_NOT_READ_FILE 11
-#define MM_PREMATURE_EOF 12
-#define MM_NOT_MTX 13
-#define MM_NO_HEADER 14
-#define MM_UNSUPPORTED_TYPE 15
-#define MM_LINE_TOO_LONG 16
-#define MM_COULD_NOT_WRITE_FILE 17
+#define MM_COULD_NOT_READ_FILE  11
+#define MM_PREMATURE_EOF        12
+#define MM_NOT_MTX              13
+#define MM_NO_HEADER            14
+#define MM_UNSUPPORTED_TYPE     15
+#define MM_LINE_TOO_LONG        16
+#define MM_COULD_NOT_WRITE_FILE 17

-/******************** Matrix Market internal definitions ********************
-
-   MM_matrix_typecode: 4-character sequence
-
-      ojbect      sparse/      data         storage
-                  dense        type         scheme
-
-   string position: [0]        [1]          [2]          [3]
-
-   Matrix typecode: M(atrix)   C(oord)      R(eal)       G(eneral)
-                               A(array)     C(omplex)    H(ermitian)
-                                            P(attern)    S(ymmetric)
-                                            I(nteger)    K(kew)
-
- ***********************************************************************/
+    /******************** Matrix Market internal definitions ********************
+
+       MM_matrix_typecode: 4-character sequence
+
+          object      sparse/      data         storage
+                      dense        type         scheme
+
+       string position: [0]        [1]          [2]          [3]
+
+       Matrix typecode: M(atrix)   C(oord)      R(eal)       G(eneral)
+                                   A(array)     C(omplex)    H(ermitian)
+                                                P(attern)    S(ymmetric)
+                                                I(nteger)    K(kew)
+
+     ***********************************************************************/

-#define MM_MTX_STR "matrix"
-#define MM_ARRAY_STR "array"
-#define MM_DENSE_STR "array"
-#define MM_COORDINATE_STR "coordinate"
-#define MM_SPARSE_STR "coordinate"
-#define MM_COMPLEX_STR "complex"
-#define MM_REAL_STR "real"
-#define MM_INT_STR "integer"
-#define MM_GENERAL_STR "general"
-#define MM_SYMM_STR "symmetric"
-#define MM_HERM_STR "hermitian"
-#define MM_SKEW_STR "skew-symmetric"
-#define MM_PATTERN_STR "pattern"
+#define MM_MTX_STR        "matrix"
+#define MM_ARRAY_STR      "array"
+#define MM_DENSE_STR      "array"
+#define MM_COORDINATE_STR "coordinate"
+#define MM_SPARSE_STR     "coordinate"
+#define MM_COMPLEX_STR    "complex"
+#define MM_REAL_STR       "real"
+#define MM_INT_STR        "integer"
+#define MM_GENERAL_STR    "general"
+#define MM_SYMM_STR       "symmetric"
+#define MM_HERM_STR       "hermitian"
+#define MM_SKEW_STR       "skew-symmetric"
+#define MM_PATTERN_STR    "pattern"

-/* high level routines */
-int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz,
int **I, int **J, - double **val, MM_typecode *matcode); + /* high level routines */ + int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, double **val, MM_typecode *matcode); -int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, - MM_typecode matcode); + int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, MM_typecode matcode); -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_); + int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_); #if defined(__cplusplus) } -#endif /* __cplusplus */ +#endif /* __cplusplus */ #endif diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio_wrapper.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio_wrapper.cpp index 6a69af55..f6f6c8e4 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio_wrapper.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/mmio_wrapper.cpp @@ -1,427 +1,375 @@ +#include +#include #include #include -#include #include "mmio.h" -#include - /* avoid Windows warnings (for example: strcpy, fscanf, etc.) */ -#if defined(_WIN32) +#if defined(_WIN32) #define _CRT_SECURE_NO_WARNINGS #endif /* various __inline__ __device__ function to initialize a T_ELEM */ -template __inline__ T_ELEM cuGet (int ); -template <> __inline__ float cuGet(int x) -{ - return float(x); -} +template __inline__ T_ELEM cuGet(int); +template <> __inline__ float cuGet(int x) { return float(x); } -template <> __inline__ double cuGet(int x) -{ - return double(x); -} +template <> __inline__ double cuGet(int x) { return double(x); } -template <> __inline__ cuComplex cuGet(int x) -{ - return (make_cuComplex( float(x), 0.0f )); -} +template <> __inline__ cuComplex cuGet(int x) { return (make_cuComplex(float(x), 0.0f)); } -template <> __inline__ cuDoubleComplex cuGet(int x) +template <> __inline__ cuDoubleComplex cuGet(int x) { return (make_cuDoubleComplex(double(x), 0.0)); } + + +template __inline__ T_ELEM cuGet(int, int); +template <> __inline__ float cuGet(int x, int y) { return float(x); } + +template <> __inline__ double cuGet(int x, int y) { return double(x); } + +template <> __inline__ cuComplex cuGet(int x, int y) { return make_cuComplex(float(x), float(y)); } + +template <> __inline__ cuDoubleComplex cuGet(int x, int y) { - return (make_cuDoubleComplex( double(x), 0.0 )); + return (make_cuDoubleComplex(double(x), double(y))); } -template __inline__ T_ELEM cuGet (int , int ); -template <> __inline__ float cuGet(int x, int y) -{ - return float(x); -} +template __inline__ T_ELEM cuGet(float); +template <> __inline__ float cuGet(float x) { return float(x); } -template <> __inline__ double cuGet(int x, int y) -{ - return double(x); -} +template <> __inline__ double cuGet(float x) { return double(x); } -template <> __inline__ cuComplex cuGet(int x, int y) -{ - return make_cuComplex( float(x), float(y) ); -} +template <> __inline__ cuComplex cuGet(float x) { return (make_cuComplex(float(x), 0.0f)); } 
-template <> __inline__ cuDoubleComplex cuGet(int x, int y) +template <> __inline__ cuDoubleComplex cuGet(float x) { - return (make_cuDoubleComplex( double(x), double(y) )); + return (make_cuDoubleComplex(double(x), 0.0)); } -template __inline__ T_ELEM cuGet (float ); -template <> __inline__ float cuGet(float x) -{ - return float(x); -} +template __inline__ T_ELEM cuGet(float, float); +template <> __inline__ float cuGet(float x, float y) { return float(x); } -template <> __inline__ double cuGet(float x) -{ - return double(x); -} +template <> __inline__ double cuGet(float x, float y) { return double(x); } -template <> __inline__ cuComplex cuGet(float x) -{ - return (make_cuComplex( float(x), 0.0f )); -} +template <> __inline__ cuComplex cuGet(float x, float y) { return (make_cuComplex(float(x), float(y))); } -template <> __inline__ cuDoubleComplex cuGet(float x) +template <> __inline__ cuDoubleComplex cuGet(float x, float y) { - return (make_cuDoubleComplex( double(x), 0.0 )); + return (make_cuDoubleComplex(double(x), double(y))); } -template __inline__ T_ELEM cuGet (float, float ); -template <> __inline__ float cuGet(float x, float y) -{ - return float(x); -} +template __inline__ T_ELEM cuGet(double); +template <> __inline__ float cuGet(double x) { return float(x); } -template <> __inline__ double cuGet(float x, float y) -{ - return double(x); -} +template <> __inline__ double cuGet(double x) { return double(x); } -template <> __inline__ cuComplex cuGet(float x, float y) -{ - return (make_cuComplex( float(x), float(y) )); -} +template <> __inline__ cuComplex cuGet(double x) { return (make_cuComplex(float(x), 0.0f)); } -template <> __inline__ cuDoubleComplex cuGet(float x, float y) +template <> __inline__ cuDoubleComplex cuGet(double x) { - return (make_cuDoubleComplex( double(x), double(y) )); + return (make_cuDoubleComplex(double(x), 0.0)); } -template __inline__ T_ELEM cuGet (double ); -template <> __inline__ float cuGet(double x) -{ - return float(x); -} +template __inline__ T_ELEM cuGet(double, double); +template <> __inline__ float cuGet(double x, double y) { return float(x); } -template <> __inline__ double cuGet(double x) -{ - return double(x); -} +template <> __inline__ double cuGet(double x, double y) { return double(x); } -template <> __inline__ cuComplex cuGet(double x) -{ - return (make_cuComplex( float(x), 0.0f )); -} +template <> __inline__ cuComplex cuGet(double x, double y) { return (make_cuComplex(float(x), float(y))); } -template <> __inline__ cuDoubleComplex cuGet(double x) +template <> __inline__ cuDoubleComplex cuGet(double x, double y) { - return (make_cuDoubleComplex( double(x), 0.0 )); + return (make_cuDoubleComplex(double(x), double(y))); } -template __inline__ T_ELEM cuGet (double, double ); -template <> __inline__ float cuGet(double x, double y) -{ - return float(x); -} - -template <> __inline__ double cuGet(double x, double y) -{ - return double(x); -} - -template <> __inline__ cuComplex cuGet(double x, double y) -{ - return (make_cuComplex( float(x), float(y) )); -} - -template <> __inline__ cuDoubleComplex cuGet(double x, double y) -{ - return (make_cuDoubleComplex( double(x), double(y) )); -} - - - - - -static void compress_index( - const int *Ind, - int nnz, - int m, - int *Ptr, - int base) +static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base) { int i; /* initialize everything to zero */ - for(i=0; ii < t->i ){ - return -1 ; + if (s->i < t->i) { + return -1; } - else if ( s->i > t->i ){ - return 1 ; + else if (s->i > t->i) { + 
return 1;
    }
-    else{
-        return s->j - t->j ;
+    else {
+        return s->j - t->j;
    }
}

-int cmp_cooFormat_csc( struct cooFormat *s, struct cooFormat *t)
+int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t)
{
-    if ( s->j < t->j ){
-        return -1 ;
+    if (s->j < t->j) {
+        return -1;
    }
-    else if ( s->j > t->j ){
-        return 1 ;
+    else if (s->j > t->j) {
+        return 1;
    }
-    else{
-        return s->i - t->i ;
+    else {
+        return s->i - t->i;
    }
}

-typedef int (*FUNPTR) (const void*, const void*) ;
-typedef int (*FUNPTR2) ( struct cooFormat *s, struct cooFormat *t) ;
+typedef int (*FUNPTR)(const void *, const void *);
+typedef int (*FUNPTR2)(struct cooFormat *s, struct cooFormat *t);

-static FUNPTR2 fptr_array[2] = {
+static FUNPTR2 fptr_array[2] = {
    cmp_cooFormat_csr,
    cmp_cooFormat_csc,
};

-static int verify_pattern(
-    int m,
-    int nnz,
-    int *csrRowPtr,
-    int *csrColInd)
+static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd)
{
    int i, col, start, end, base_index;
    int error_found = 0;

-    if (nnz != (csrRowPtr[m] - csrRowPtr[0])){
-        fprintf(stderr, "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n", 0, csrRowPtr[0], m, csrRowPtr[m], nnz);
+    if (nnz != (csrRowPtr[m] - csrRowPtr[0])) {
+        fprintf(stderr,
+                "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n",
+                0,
+                csrRowPtr[0],
+                m,
+                csrRowPtr[m],
+                nnz);
        error_found = 1;
    }

    base_index = csrRowPtr[0];
-    if ((0 != base_index) && (1 != base_index)){
+    if ((0 != base_index) && (1 != base_index)) {
        fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index);
        error_found = 1;
    }

-    for (i=0; (!error_found) && (i<m); i++){
-        start = csrRowPtr[i]-base_index;
-        end = csrRowPtr[i+1]-base_index;
-        if (start > end){
-            fprintf(stderr, "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", i, start+base_index, i+1, end+base_index);
+    for (i = 0; (!error_found) && (i < m); i++) {
+        start = csrRowPtr[i] - base_index;
+        end   = csrRowPtr[i + 1] - base_index;
+        if (start > end) {
+            fprintf(stderr,
+                    "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n",
+                    i,
+                    start + base_index,
+                    i + 1,
+                    end + base_index);
            error_found = 1;
        }
-        for (col=start; col<end; col++){
-            if ((col < (end-1)) && (csrColInd[col] >= csrColInd[col+1])){
-                fprintf(stderr, "Error (sorting of the column indecis check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", col, csrColInd[col], col+1, csrColInd[col+1]);
+        for (col = start; col < end; col++) {
+            if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) {
+                fprintf(
+                    stderr,
+                    "Error (sorting of the column indices check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n",
+                    col,
+                    csrColInd[col],
+                    col + 1,
+                    csrColInd[col + 1]);
                error_found = 1;
            }
        }
    }

-    return error_found ;
+    return error_found;
}

template <typename T_ELEM>
-int loadMMSparseMatrix(
-    char *filename,
-    char elem_type,
-    bool csrFormat,
-    int *m,
-    int *n,
-    int *nnz,
-    T_ELEM **aVal,
-    int **aRowInd,
-    int **aColInd,
-    int extendSymMatrix)
+int loadMMSparseMatrix(char *filename,
+                       char elem_type,
+                       bool csrFormat,
+                       int *m,
+                       int *n,
+                       int *nnz,
+                       T_ELEM **aVal,
+                       int **aRowInd,
+                       int **aColInd,
+                       int extendSymMatrix)
{
-    MM_typecode matcode;
-    double *tempVal;
-    int *tempRowInd,*tempColInd;
-    double *tval;
-    int *trow,*tcol;
-    int *csrRowPtr, *cscColPtr;
-    int i,j,error,base,count;
+    MM_typecode       matcode;
+    double           *tempVal;
+    int              *tempRowInd, *tempColInd;
+    double           *tval;
+    int              *trow, *tcol;
+    int              *csrRowPtr, *cscColPtr;
+    int               i, j, error, base, count;
    struct cooFormat *work;

-    /* read the matrix */
+    /* read the matrix */
    error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode);
    if (error) {
        fprintf(stderr, "!!!!
can not open file: '%s'\n", filename); - return 1; + return 1; } /* start error checking */ if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n"); - return 1; + return 1; } - if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/){ + if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { fprintf(stderr, "!!!! dense, array, pattern and integer matrices are not supported\n"); - return 1; + return 1; } /* if necessary symmetrize the pattern (transform from triangular to full) */ - if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))){ - //count number of non-diagonal elements - count=0; - for(i=0; i<(*nnz); i++){ - if (trow[i] != tcol[i]){ + if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))) { + // count number of non-diagonal elements + count = 0; + for (i = 0; i < (*nnz); i++) { + if (trow[i] != tcol[i]) { count++; } } - //allocate space for the symmetrized matrix - tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); - tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); - if (mm_is_real(matcode) || mm_is_integer(matcode)){ + // allocate space for the symmetrized matrix + tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); + tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { tempVal = (double *)malloc((*nnz + count) * sizeof(double)); } - else{ + else { tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); } - //copy the elements regular and transposed locations - for(j=0, i=0; i<(*nnz); i++){ - tempRowInd[j]=trow[i]; - tempColInd[j]=tcol[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - tempVal[j]=tval[i]; + // copy the elements regular and transposed locations + for (j = 0, i = 0; i < (*nnz); i++) { + tempRowInd[j] = trow[i]; + tempColInd[j] = tcol[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal[j] = tval[i]; } - else{ - tempVal[2*j] =tval[2*i]; - tempVal[2*j+1]=tval[2*i+1]; + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; } j++; - if (trow[i] != tcol[i]){ - tempRowInd[j]=tcol[i]; - tempColInd[j]=trow[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - if (mm_is_skew(matcode)){ - tempVal[j]=-tval[i]; + if (trow[i] != tcol[i]) { + tempRowInd[j] = tcol[i]; + tempColInd[j] = trow[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + if (mm_is_skew(matcode)) { + tempVal[j] = -tval[i]; } - else{ - tempVal[j]= tval[i]; + else { + tempVal[j] = tval[i]; } } - else{ - if(mm_is_hermitian(matcode)){ - tempVal[2*j] = tval[2*i]; - tempVal[2*j+1]=-tval[2*i+1]; + else { + if (mm_is_hermitian(matcode)) { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = -tval[2 * i + 1]; } - else{ - tempVal[2*j] = tval[2*i]; - tempVal[2*j+1]= tval[2*i+1]; + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; } } j++; } } - (*nnz)+=count; - //free temporary storage + (*nnz) += count; + // free temporary storage free(trow); free(tcol); - free(tval); + free(tval); } - else{ - tempRowInd=trow; - tempColInd=tcol; - tempVal =tval; + else { + tempRowInd = trow; + tempColInd = tcol; + tempVal = tval; } // life time of (trow, tcol, tval) is over. 
// please use COO format (tempRowInd, tempColInd, tempVal)
-// use qsort to sort COO format
-    work = (struct cooFormat *)malloc(sizeof(struct cooFormat)*(*nnz));
-    if (NULL == work){
+    // use qsort to sort COO format
+    work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz));
+    if (NULL == work) {
        fprintf(stderr, "!!!! allocation error, malloc failed\n");
        return 1;
    }
-    for(i=0; i<(*nnz); i++){
+    for (i = 0; i < (*nnz); i++) {
        work[i].i = tempRowInd[i];
        work[i].j = tempColInd[i];
        work[i].p = i; // permutation is identity
    }
-
-    if (csrFormat){
-        /* create row-major ordering of indices (sorted by row and within each row by column) */
-        qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0] );
-    }else{
-        /* create column-major ordering of indices (sorted by column and within each column by row) */
-        qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1] );
+    if (csrFormat) {
+        /* create row-major ordering of indices (sorted by row and within each row by column) */
+        qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]);
+    }
+    else {
+        /* create column-major ordering of indices (sorted by column and within each column by row) */
+        qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]);
    }

    // (tempRowInd, tempColInd) is sorted either by row-major or by col-major
-    for(i=0; i<(*nnz); i++){
+    for (i = 0; i < (*nnz); i++) {
        tempRowInd[i] = work[i].i;
        tempColInd[i] = work[i].j;
    }

-    // setup base
+    // setup base
    // check if there is any row/col 0, if so base-0
    // check if there is any row/col equal to matrix dimension m/n, if so base-1
    int base0 = 0;
    int base1 = 0;
-    for(i=0; i<(*nnz); i++){
+    for (i = 0; i < (*nnz); i++) {
        const int row = tempRowInd[i];
        const int col = tempColInd[i];
-        if ( (0 == row) || (0 == col) ){
+        if ((0 == row) || (0 == col)) {
            base0 = 1;
        }
-        if ( (*m == row) || (*n == col) ){
+        if ((*m == row) || (*n == col)) {
            base1 = 1;
        }
    }
-    if ( base0 && base1 ){
+    if (base0 && base1) {
        printf("Error: input matrix is base-0 and base-1 \n");
        return 1;
    }

    base = 0;
-    if (base1){
+    if (base1) {
        base = 1;
    }

    /* compress the appropriate indices */
-    if (csrFormat){
+    if (csrFormat) {
        /* CSR format (assuming row-major format) */
-        csrRowPtr = (int *)malloc(((*m)+1) * sizeof(csrRowPtr[0]));
-        if (!csrRowPtr) return 1;
+        csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0]));
+        if (!csrRowPtr)
+            return 1;
        compress_index(tempRowInd, *nnz, *m, csrRowPtr, base);

        *aRowInd = csrRowPtr;
@@ -429,101 +377,97 @@ int loadMMSparseMatrix(
    }
    else {
        /* CSC format (assuming column-major format) */
-        cscColPtr = (int *)malloc(((*n)+1) * sizeof(cscColPtr[0]));
-        if (!cscColPtr) return 1;
+        cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0]));
+        if (!cscColPtr)
+            return 1;
        compress_index(tempColInd, *nnz, *n, cscColPtr, base);

        *aColInd = cscColPtr;
        *aRowInd = (int *)malloc((*nnz) * sizeof(int));
-    }
+    }

-    /* transfrom the matrix values of type double into one of the cusparse library types */
+    /* transform the matrix values of type double into one of the cusparse library types */
    *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM));
-
-    for (i=0; i<(*nnz); i++) {
-        if (csrFormat){
+
+    for (i = 0; i < (*nnz); i++) {
+        if (csrFormat) {
            (*aColInd)[i] = tempColInd[i];
        }
-        else{
+        else {
            (*aRowInd)[i] = tempRowInd[i];
        }
-        if (mm_is_real(matcode) || mm_is_integer(matcode)){
-            (*aVal)[i] = cuGet<T_ELEM>( tempVal[ work[i].p ] );
+        if (mm_is_real(matcode) || mm_is_integer(matcode)) {
+            (*aVal)[i] = cuGet<T_ELEM>(tempVal[work[i].p]);
        }
-        else{
-            (*aVal)[i] =
cuGet(tempVal[2*work[i].p], tempVal[2*work[i].p+1]); + else { + (*aVal)[i] = cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); } } /* check for corruption */ int error_found; - if (csrFormat){ + if (csrFormat) { error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); - }else{ + } + else { error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); } - if (error_found){ + if (error_found) { fprintf(stderr, "!!!! verify_pattern failed\n"); return 1; } /* cleanup and exit */ free(work); - free(tempVal); + free(tempVal); free(tempColInd); free(tempRowInd); return 0; -} +} /* specific instantiation */ -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - float **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + float **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - double **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); - -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - cuComplex **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); - -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - cuDoubleComplex **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + double **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuDoubleComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp index fd4cd8d8..baee4342 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp @@ -9,64 +9,56 @@ * */ +#include +#include +#include #include #include #include -#include -#include #include "cusolverSp.h" - #include "cusolverSp_LOWLEVEL_PREVIEW.h" - -#include - #include "helper_cuda.h" #include "helper_cusolver.h" template -int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - T_ELEM **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); void UsageSP(void) { - printf( "\n"); - printf( "-h : display this help\n"); - printf( "-file= : filename containing a matrix in MM format\n"); - printf( "-device= : if want to run on specific GPU\n"); + printf("\n"); + printf("-h : display this help\n"); + printf("-file= : filename containing a matrix in MM format\n"); + printf("-device= : if want 
to run on specific GPU\n");

-    exit( 0 );
+    exit(0);
}

void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts)
{
    memset(&opts, 0, sizeof(opts));

-    if (checkCmdLineFlag(argc, (const char **)argv, "-h"))
-    {
+    if (checkCmdLineFlag(argc, (const char **)argv, "-h")) {
        UsageSP();
    }

-    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
-    {
+    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        char *fileName = 0;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);

-        if (fileName)
-        {
+        if (fileName) {
            opts.sparse_mat_filename = fileName;
        }
-        else
-        {
+        else {
            printf("\nIncorrect filename passed to -file \n ");
            UsageSP();
        }
@@ -74,16 +66,16 @@ void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts)
}

-int main (int argc, char *argv[])
+int main(int argc, char *argv[])
{
-    struct testOpts opts;
+    struct testOpts    opts;
    cusolverSpHandle_t cusolverSpH = NULL; // reordering, permutation and 1st LU factorization
-    cusparseHandle_t cusparseH = NULL; // residual evaluation
-    cudaStream_t stream = NULL;
-    cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix
+    cusparseHandle_t   cusparseH   = NULL; // residual evaluation
+    cudaStream_t       stream      = NULL;
+    cusparseMatDescr_t descrA      = NULL; // A is a base-0 general matrix
    csrcholInfoHost_t h_info = NULL; // opaque info structure for LU with parital pivoting
-    csrcholInfo_t d_info = NULL; // opaque info structure for LU with parital pivoting
+    csrcholInfo_t      d_info      = NULL; // opaque info structure for LU with partial pivoting

    int rowsA = 0; // number of rows of A
    int colsA = 0; // number of columns of A
@@ -91,76 +83,78 @@ int main (int argc, char *argv[])
    int baseA = 0; // base index in CSR format

    // CSR(A) from I/O
-    int *h_csrRowPtrA = NULL; // n+1
-    int *h_csrColIndA = NULL; // nnzA
-    double *h_csrValA = NULL; // nnzA
+    int    *h_csrRowPtrA = NULL; // n+1
+    int    *h_csrColIndA = NULL; // nnzA
+    double *h_csrValA    = NULL; // nnzA

    double *h_x = NULL; // n, x = A \ b
    double *h_b = NULL; // n, b = ones(m,1)
    double *h_r = NULL; // n, r = b - A*x

-    size_t size_internal = 0;
-    size_t size_chol = 0; // size of working space for csrlu
-    void *buffer_cpu = NULL; // working space for Cholesky
-    void *buffer_gpu = NULL; // working space for Cholesky
+    size_t size_internal = 0;
+    size_t size_chol     = 0;    // size of working space for csrlu
+    void  *buffer_cpu    = NULL; // working space for Cholesky
+    void  *buffer_gpu    = NULL; // working space for Cholesky

-    int *d_csrRowPtrA = NULL; // n+1
-    int *d_csrColIndA = NULL; // nnzA
-    double *d_csrValA = NULL; // nnzA
-    double *d_x = NULL; // n, x = A \ b
-    double *d_b = NULL; // n, a copy of h_b
-    double *d_r = NULL; // n, r = b - A*x
+    int    *d_csrRowPtrA = NULL; // n+1
+    int    *d_csrColIndA = NULL; // nnzA
+    double *d_csrValA    = NULL; // nnzA
+    double *d_x          = NULL; // n, x = A \ b
+    double *d_b          = NULL; // n, a copy of h_b
+    double *d_r          = NULL; // n, r = b - A*x

    // the constants used in residual evaluation, r = b - A*x
    const double minus_one = -1.0;
-    const double one = 1.0;
+    const double one       = 1.0;

    // the constant used in cusolverSp
    // singularity is -1 if A is invertible under tol
    // tol determines the condition of singularity
-    int singularity = 0;
-    const double tol = 1.e-14;
+    int          singularity = 0;
+    const double tol         = 1.e-14;

-    double x_inf = 0.0; // |x|
-    double r_inf = 0.0; // |r|
-    double A_inf = 0.0; // |A|
-    int errors = 0;
+    double x_inf  = 0.0; // |x|
+    double r_inf  = 0.0; // |r|
+    double A_inf  = 0.0; // |A|
+    int    errors = 0;
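+
+    // x_inf, r_inf and A_inf receive the infinity norms |x|, |b - A*x| and |A|;
+    // they are used below when the residual r = b - A*x is evaluated to judge
+    // the quality of the computed solution
+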
parseCommandLineArguments(argc, argv, opts); findCudaDevice(argc, (const char **)argv); - if (opts.sparse_mat_filename == NULL) - { - opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]); + if (opts.sparse_mat_filename == NULL) { + opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]); if (opts.sparse_mat_filename != NULL) printf("Using default input file [%s]\n", opts.sparse_mat_filename); else printf("Could not find lap2D_5pt_n100.mtx\n"); } - else - { + else { printf("Using input file [%s]\n", opts.sparse_mat_filename); } printf("step 1: read matrix market format\n"); - if (opts.sparse_mat_filename) - { - if (loadMMSparseMatrix(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA, - &nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true)) - { + if (opts.sparse_mat_filename) { + if (loadMMSparseMatrix(opts.sparse_mat_filename, + 'd', + true, + &rowsA, + &colsA, + &nnzA, + &h_csrValA, + &h_csrRowPtrA, + &h_csrColIndA, + true)) { return 1; } baseA = h_csrRowPtrA[0]; // baseA = {0,1} } - else - { + else { fprintf(stderr, "Error: input matrix is not provided\n"); return 1; } - if ( rowsA != colsA ) - { + if (rowsA != colsA) { fprintf(stderr, "Error: only support square matrix\n"); return 1; } @@ -180,32 +174,29 @@ int main (int argc, char *argv[]) checkCudaErrors(cusparseCreateMatDescr(&descrA)); checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - if (baseA) - { + if (baseA) { checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); } - else - { + else { checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); } - h_x = (double*)malloc(sizeof(double)*colsA); - h_b = (double*)malloc(sizeof(double)*rowsA); - h_r = (double*)malloc(sizeof(double)*rowsA); + h_x = (double *)malloc(sizeof(double) * colsA); + h_b = (double *)malloc(sizeof(double) * rowsA); + h_r = (double *)malloc(sizeof(double) * rowsA); assert(NULL != h_x); assert(NULL != h_b); assert(NULL != h_r); - checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int)*(rowsA+1))); - checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int)*nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_csrValA , sizeof(double)*nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA)); - checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); + checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); - for(int row = 0 ; row < rowsA ; row++) - { + for (int row = 0; row < rowsA; row++) { h_b[row] = 1.0; } @@ -213,69 +204,70 @@ int main (int argc, char *argv[]) checkCudaErrors(cusolverSpCreateCsrcholInfoHost(&h_info)); printf("step 3: analyze chol(A) to know structure of L\n"); - checkCudaErrors(cusolverSpXcsrcholAnalysisHost( - cusolverSpH, rowsA, nnzA, - descrA, h_csrRowPtrA, h_csrColIndA, - h_info)); + checkCudaErrors( + cusolverSpXcsrcholAnalysisHost(cusolverSpH, rowsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_info)); printf("step 4: workspace for chol(A)\n"); checkCudaErrors(cusolverSpDcsrcholBufferInfoHost( - cusolverSpH, rowsA, nnzA, - descrA, h_csrValA, h_csrRowPtrA, 
h_csrColIndA, - h_info, - &size_internal, - &size_chol)); + cusolverSpH, rowsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA, h_info, &size_internal, &size_chol)); - if (buffer_cpu) - { - free(buffer_cpu); + if (buffer_cpu) { + free(buffer_cpu); } - buffer_cpu = (void*)malloc(sizeof(char)*size_chol); + buffer_cpu = (void *)malloc(sizeof(char) * size_chol); assert(NULL != buffer_cpu); printf("step 5: compute A = L*L^T \n"); checkCudaErrors(cusolverSpDcsrcholFactorHost( - cusolverSpH, rowsA, nnzA, - descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA, - h_info, - buffer_cpu)); + cusolverSpH, rowsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA, h_info, buffer_cpu)); printf("step 6: check if the matrix is singular \n"); - checkCudaErrors(cusolverSpDcsrcholZeroPivotHost( - cusolverSpH, h_info, tol, &singularity)); + checkCudaErrors(cusolverSpDcsrcholZeroPivotHost(cusolverSpH, h_info, tol, &singularity)); - if ( 0 <= singularity) - { + if (0 <= singularity) { fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity); return 1; } printf("step 7: solve A*x = b \n"); - checkCudaErrors(cusolverSpDcsrcholSolveHost( - cusolverSpH, rowsA, h_b, h_x, h_info, buffer_cpu)); + checkCudaErrors(cusolverSpDcsrcholSolveHost(cusolverSpH, rowsA, h_b, h_x, h_info, buffer_cpu)); printf("step 8: evaluate residual r = b - A*x (result on CPU)\n"); // use GPU gemv to compute r = b - A*x - checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA , cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrValA , h_csrValA , sizeof(double)*nnzA , cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double)*colsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice)); /* Wrap raw data into cuSPARSE generic API objects */ cusparseSpMatDescr_t matA = NULL; - if (baseA) - { - checkCudaErrors(cusparseCreateCsr( - &matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)); + if (baseA) { + checkCudaErrors(cusparseCreateCsr(&matA, + rowsA, + colsA, + nnzA, + d_csrRowPtrA, + d_csrColIndA, + d_csrValA, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ONE, + CUDA_R_64F)); } - else - { - checkCudaErrors(cusparseCreateCsr( - &matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + else { + checkCudaErrors(cusparseCreateCsr(&matA, + rowsA, + colsA, + nnzA, + d_csrRowPtrA, + d_csrColIndA, + d_csrValA, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); } cusparseDnVecDescr_t vecx = NULL; @@ -285,17 +277,31 @@ int main (int argc, char *argv[]) /* Allocate workspace for cuSPARSE */ size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, 
matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); + checkCudaErrors(cusparseSpMV_bufferSize(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); void *buffer = NULL; checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - checkCudaErrors(cusparseSpMV( - cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); + checkCudaErrors(cusparseSpMV(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); - checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); x_inf = vec_norminf(colsA, h_x); r_inf = vec_norminf(rowsA, h_r); @@ -304,97 +310,137 @@ int main (int argc, char *argv[]) printf("(CPU) |b - A*x| = %E \n", r_inf); printf("(CPU) |A| = %E \n", A_inf); printf("(CPU) |x| = %E \n", x_inf); - printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf)); + printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); printf("step 9: create opaque info structure\n"); checkCudaErrors(cusolverSpCreateCsrcholInfo(&d_info)); - checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA , cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrValA , h_csrValA , sizeof(double)*nnzA , cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); printf("step 10: analyze chol(A) to know structure of L\n"); - checkCudaErrors(cusolverSpXcsrcholAnalysis( - cusolverSpH, rowsA, nnzA, - descrA, d_csrRowPtrA, d_csrColIndA, - d_info)); + checkCudaErrors(cusolverSpXcsrcholAnalysis(cusolverSpH, rowsA, nnzA, descrA, d_csrRowPtrA, d_csrColIndA, d_info)); printf("step 11: workspace for chol(A)\n"); checkCudaErrors(cusolverSpDcsrcholBufferInfo( - cusolverSpH, rowsA, nnzA, - descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, - d_info, - &size_internal, - &size_chol)); + cusolverSpH, rowsA, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, d_info, &size_internal, &size_chol)); if (buffer_gpu) { checkCudaErrors(cudaFree(buffer_gpu)); } - checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char)*size_chol)); + checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char) * size_chol)); printf("step 12: compute A = L*L^T \n"); checkCudaErrors(cusolverSpDcsrcholFactor( - cusolverSpH, rowsA, nnzA, - descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, - d_info, - buffer_gpu)); + cusolverSpH, rowsA, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, d_info, buffer_gpu)); printf("step 13: check if the matrix is singular \n"); - checkCudaErrors(cusolverSpDcsrcholZeroPivot( - cusolverSpH, d_info, tol, &singularity)); + checkCudaErrors(cusolverSpDcsrcholZeroPivot(cusolverSpH, d_info, tol, &singularity)); - if ( 0 <= singularity){ + if (0 <= singularity) { 
fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity); return 1; } printf("step 14: solve A*x = b \n"); - checkCudaErrors(cusolverSpDcsrcholSolve( - cusolverSpH, rowsA, d_b, d_x, d_info, buffer_gpu)); + checkCudaErrors(cusolverSpDcsrcholSolve(cusolverSpH, rowsA, d_b, d_x, d_info, buffer_gpu)); - checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - checkCudaErrors(cusparseSpMV( - cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer)); + checkCudaErrors(cusparseSpMV(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); - checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); r_inf = vec_norminf(rowsA, h_r); printf("(GPU) |b - A*x| = %E \n", r_inf); - printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf)); + printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); - if (cusolverSpH) { checkCudaErrors(cusolverSpDestroy(cusolverSpH)); } - if (cusparseH ) { checkCudaErrors(cusparseDestroy(cusparseH)); } - if (stream ) { checkCudaErrors(cudaStreamDestroy(stream)); } - if (descrA ) { checkCudaErrors(cusparseDestroyMatDescr(descrA)); } - if (h_info ) { checkCudaErrors(cusolverSpDestroyCsrcholInfoHost(h_info)); } - if (d_info ) { checkCudaErrors(cusolverSpDestroyCsrcholInfo(d_info)); } - if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); } - if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); } - if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); } + if (cusolverSpH) { + checkCudaErrors(cusolverSpDestroy(cusolverSpH)); + } + if (cusparseH) { + checkCudaErrors(cusparseDestroy(cusparseH)); + } + if (stream) { + checkCudaErrors(cudaStreamDestroy(stream)); + } + if (descrA) { + checkCudaErrors(cusparseDestroyMatDescr(descrA)); + } + if (h_info) { + checkCudaErrors(cusolverSpDestroyCsrcholInfoHost(h_info)); + } + if (d_info) { + checkCudaErrors(cusolverSpDestroyCsrcholInfo(d_info)); + } + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } - if (h_csrValA ) { free(h_csrValA); } - if (h_csrRowPtrA) { free(h_csrRowPtrA); } - if (h_csrColIndA) { free(h_csrColIndA); } + if (h_csrValA) { + free(h_csrValA); + } + if (h_csrRowPtrA) { + free(h_csrRowPtrA); + } + if (h_csrColIndA) { + free(h_csrColIndA); + } - if (h_x ) { free(h_x); } - if (h_b ) { free(h_b); } - if (h_r ) { free(h_r); } + if (h_x) { + free(h_x); + } + if (h_b) { + free(h_b); + } + if (h_r) { + free(h_r); + } - if (buffer_cpu) { free(buffer_cpu); } - if (buffer_gpu) { checkCudaErrors(cudaFree(buffer_gpu)); } + if (buffer_cpu) { + free(buffer_cpu); + } + if (buffer_gpu) { + checkCudaErrors(cudaFree(buffer_gpu)); + } - if (d_csrValA ) { checkCudaErrors(cudaFree(d_csrValA)); } - if (d_csrRowPtrA) { checkCudaErrors(cudaFree(d_csrRowPtrA)); } - if (d_csrColIndA) { checkCudaErrors(cudaFree(d_csrColIndA)); } - if (d_x) { checkCudaErrors(cudaFree(d_x)); } - if (d_b) { checkCudaErrors(cudaFree(d_b)); } - if (d_r) { checkCudaErrors(cudaFree(d_r)); } + if (d_csrValA) { + checkCudaErrors(cudaFree(d_csrValA)); + } + if (d_csrRowPtrA) { + 
checkCudaErrors(cudaFree(d_csrRowPtrA)); + } + if (d_csrColIndA) { + checkCudaErrors(cudaFree(d_csrColIndA)); + } + if (d_x) { + checkCudaErrors(cudaFree(d_x)); + } + if (d_b) { + checkCudaErrors(cudaFree(d_b)); + } + if (d_r) { + checkCudaErrors(cudaFree(d_r)); + } return 0; } - diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio.c b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio.c index 66e90770..7ecf9c2c 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio.c +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio.c @@ -1,128 +1,126 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. -* -* -*/ +/* + * Matrix Market I/O library for ANSI C + * + * See http://math.nist.gov/MatrixMarket for details. + * + * + */ /* avoid Windows warnings (for example: strcpy, fscanf, etc.) */ -#if defined(_WIN32) +#if defined(_WIN32) #define _CRT_SECURE_NO_WARNINGS #endif -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include <ctype.h> - #include "mmio.h" -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_) +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_) { - FILE *f; + FILE *f; MM_typecode matcode; - int M, N, nz; - int i; - double *val; - int *I, *J; - + int M, N, nz; + int i; + double *val; + int *I, *J; + if ((f = fopen(fname, "r")) == NULL) - return -1; - - - if (mm_read_banner(f, &matcode) != 0) - { + return -1; + + + if (mm_read_banner(f, &matcode) != 0) { printf("mm_read_unsymetric: Could not process Matrix Market banner "); printf(" in file [%s]\n", fname); return -1; } - - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { + + + if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) { fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); + fprintf(stderr, "Market Market type: [%s]\n", mm_typecode_to_str(matcode)); return -1; } - + /* find out size of sparse matrix: M, N, nz .... */ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) { fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); return -1; } - - *M_ = M; - *N_ = N; + + *M_ = M; + *N_ = N; *nz_ = nz; - + /* reserve memory for matrices */ - - I = (int *) malloc(nz * sizeof(int)); - J = (int *) malloc(nz * sizeof(int)); - val = (double *) malloc(nz * sizeof(double)); - + + I = (int *)malloc(nz * sizeof(int)); + J = (int *)malloc(nz * sizeof(int)); + val = (double *)malloc(nz * sizeof(double)); + *val_ = val; - *I_ = I; - *J_ = J; - + *I_ = I; + *J_ = J; + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p.
136 lines 13-15) */ - - for (i=0; i + #if defined(__cplusplus) -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -#define MM_MAX_LINE_LENGTH 1025 -#define MatrixMarketBanner "%%MatrixMarket" +#define MM_MAX_LINE_LENGTH 1025 +#define MatrixMarketBanner "%%MatrixMarket" #define MM_MAX_TOKEN_LENGTH 64 -typedef char MM_typecode[4]; + typedef char MM_typecode[4]; -char *mm_typecode_to_str(MM_typecode matcode); + char *mm_typecode_to_str(MM_typecode matcode); -int mm_read_banner(FILE *f, MM_typecode *matcode); -int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); -int mm_read_mtx_array_size(FILE *f, int *M, int *N); + int mm_read_banner(FILE *f, MM_typecode *matcode); + int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); + int mm_read_mtx_array_size(FILE *f, int *M, int *N); -int mm_write_banner(FILE *f, MM_typecode matcode); -int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); -int mm_write_mtx_array_size(FILE *f, int M, int N); + int mm_write_banner(FILE *f, MM_typecode matcode); + int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); + int mm_write_mtx_array_size(FILE *f, int M, int N); -/********************* MM_typecode query fucntions ***************************/ + /********************* MM_typecode query fucntions ***************************/ -#define mm_is_matrix(typecode) ((typecode)[0]=='M') +#define mm_is_matrix(typecode) ((typecode)[0] == 'M') -#define mm_is_sparse(typecode) ((typecode)[1]=='C') -#define mm_is_coordinate(typecode)((typecode)[1]=='C') -#define mm_is_dense(typecode) ((typecode)[1]=='A') -#define mm_is_array(typecode) ((typecode)[1]=='A') +#define mm_is_sparse(typecode) ((typecode)[1] == 'C') +#define mm_is_coordinate(typecode) ((typecode)[1] == 'C') +#define mm_is_dense(typecode) ((typecode)[1] == 'A') +#define mm_is_array(typecode) ((typecode)[1] == 'A') -#define mm_is_complex(typecode) ((typecode)[2]=='C') -#define mm_is_real(typecode) ((typecode)[2]=='R') -#define mm_is_pattern(typecode) ((typecode)[2]=='P') -#define mm_is_integer(typecode) ((typecode)[2]=='I') +#define mm_is_complex(typecode) ((typecode)[2] == 'C') +#define mm_is_real(typecode) ((typecode)[2] == 'R') +#define mm_is_pattern(typecode) ((typecode)[2] == 'P') +#define mm_is_integer(typecode) ((typecode)[2] == 'I') -#define mm_is_symmetric(typecode)((typecode)[3]=='S') -#define mm_is_general(typecode) ((typecode)[3]=='G') -#define mm_is_skew(typecode) ((typecode)[3]=='K') -#define mm_is_hermitian(typecode)((typecode)[3]=='H') +#define mm_is_symmetric(typecode) ((typecode)[3] == 'S') +#define mm_is_general(typecode) ((typecode)[3] == 'G') +#define mm_is_skew(typecode) ((typecode)[3] == 'K') +#define mm_is_hermitian(typecode) ((typecode)[3] == 'H') -int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ + int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ -/********************* MM_typecode modify fucntions ***************************/ + /********************* MM_typecode modify fucntions ***************************/ -#define mm_set_matrix(typecode) ((*typecode)[0]='M') -#define mm_set_coordinate(typecode) ((*typecode)[1]='C') -#define mm_set_array(typecode) ((*typecode)[1]='A') -#define mm_set_dense(typecode) mm_set_array(typecode) -#define mm_set_sparse(typecode) mm_set_coordinate(typecode) +#define mm_set_matrix(typecode) ((*typecode)[0] = 'M') +#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C') +#define mm_set_array(typecode) ((*typecode)[1] = 'A') +#define mm_set_dense(typecode) mm_set_array(typecode) +#define mm_set_sparse(typecode) 
mm_set_coordinate(typecode) -#define mm_set_complex(typecode)((*typecode)[2]='C') -#define mm_set_real(typecode) ((*typecode)[2]='R') -#define mm_set_pattern(typecode)((*typecode)[2]='P') -#define mm_set_integer(typecode)((*typecode)[2]='I') +#define mm_set_complex(typecode) ((*typecode)[2] = 'C') +#define mm_set_real(typecode) ((*typecode)[2] = 'R') +#define mm_set_pattern(typecode) ((*typecode)[2] = 'P') +#define mm_set_integer(typecode) ((*typecode)[2] = 'I') -#define mm_set_symmetric(typecode)((*typecode)[3]='S') -#define mm_set_general(typecode)((*typecode)[3]='G') -#define mm_set_skew(typecode) ((*typecode)[3]='K') -#define mm_set_hermitian(typecode)((*typecode)[3]='H') +#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') +#define mm_set_general(typecode) ((*typecode)[3] = 'G') +#define mm_set_skew(typecode) ((*typecode)[3] = 'K') +#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') -#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \ - (*typecode)[2]=' ',(*typecode)[3]='G') +#define mm_clear_typecode(typecode) ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) -/********************* Matrix Market error codes ***************************/ + /********************* Matrix Market error codes ***************************/ -#define MM_COULD_NOT_READ_FILE 11 -#define MM_PREMATURE_EOF 12 -#define MM_NOT_MTX 13 -#define MM_NO_HEADER 14 -#define MM_UNSUPPORTED_TYPE 15 -#define MM_LINE_TOO_LONG 16 -#define MM_COULD_NOT_WRITE_FILE 17 +#define MM_COULD_NOT_READ_FILE 11 +#define MM_PREMATURE_EOF 12 +#define MM_NOT_MTX 13 +#define MM_NO_HEADER 14 +#define MM_UNSUPPORTED_TYPE 15 +#define MM_LINE_TOO_LONG 16 +#define MM_COULD_NOT_WRITE_FILE 17 -/******************** Matrix Market internal definitions ******************** + /******************** Matrix Market internal definitions ******************** - MM_matrix_typecode: 4-character sequence + MM_matrix_typecode: 4-character sequence - ojbect sparse/ data storage - dense type scheme + ojbect sparse/ data storage + dense type scheme - string position: [0] [1] [2] [3] + string position: [0] [1] [2] [3] - Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) - A(array) C(omplex) H(ermitian) - P(attern) S(ymmetric) - I(nteger) K(kew) + Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) + A(array) C(omplex) H(ermitian) + P(attern) S(ymmetric) + I(nteger) K(kew) - ***********************************************************************/ + ***********************************************************************/ -#define MM_MTX_STR "matrix" -#define MM_ARRAY_STR "array" -#define MM_DENSE_STR "array" -#define MM_COORDINATE_STR "coordinate" -#define MM_SPARSE_STR "coordinate" -#define MM_COMPLEX_STR "complex" -#define MM_REAL_STR "real" -#define MM_INT_STR "integer" -#define MM_GENERAL_STR "general" -#define MM_SYMM_STR "symmetric" -#define MM_HERM_STR "hermitian" -#define MM_SKEW_STR "skew-symmetric" -#define MM_PATTERN_STR "pattern" +#define MM_MTX_STR "matrix" +#define MM_ARRAY_STR "array" +#define MM_DENSE_STR "array" +#define MM_COORDINATE_STR "coordinate" +#define MM_SPARSE_STR "coordinate" +#define MM_COMPLEX_STR "complex" +#define MM_REAL_STR "real" +#define MM_INT_STR "integer" +#define MM_GENERAL_STR "general" +#define MM_SYMM_STR "symmetric" +#define MM_HERM_STR "hermitian" +#define MM_SKEW_STR "skew-symmetric" +#define MM_PATTERN_STR "pattern" -/* high level routines */ -int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, 
int **I, int **J, - double **val, MM_typecode *matcode); + /* high level routines */ + int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, double **val, MM_typecode *matcode); -int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, - MM_typecode matcode); + int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, MM_typecode matcode); -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_); + int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_); #if defined(__cplusplus) } -#endif /* __cplusplus */ +#endif /* __cplusplus */ #endif diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio_wrapper.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio_wrapper.cpp index 6a69af55..f6f6c8e4 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio_wrapper.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/mmio_wrapper.cpp @@ -1,427 +1,375 @@ +#include +#include #include #include -#include #include "mmio.h" -#include - /* avoid Windows warnings (for example: strcpy, fscanf, etc.) */ -#if defined(_WIN32) +#if defined(_WIN32) #define _CRT_SECURE_NO_WARNINGS #endif /* various __inline__ __device__ function to initialize a T_ELEM */ -template <typename T_ELEM> __inline__ T_ELEM cuGet (int ); -template <> __inline__ float cuGet<float>(int x) -{ - return float(x); -} +template <typename T_ELEM> __inline__ T_ELEM cuGet(int); +template <> __inline__ float cuGet<float>(int x) { return float(x); } -template <> __inline__ double cuGet<double>(int x) -{ - return double(x); -} +template <> __inline__ double cuGet<double>(int x) { return double(x); } -template <> __inline__ cuComplex cuGet<cuComplex>(int x) -{ - return (make_cuComplex( float(x), 0.0f )); -} +template <> __inline__ cuComplex cuGet<cuComplex>(int x) { return (make_cuComplex(float(x), 0.0f)); } -template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x) +template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x) { return (make_cuDoubleComplex(double(x), 0.0)); } + + +template <typename T_ELEM> __inline__ T_ELEM cuGet(int, int); +template <> __inline__ float cuGet<float>(int x, int y) { return float(x); } + +template <> __inline__ double cuGet<double>(int x, int y) { return double(x); } + +template <> __inline__ cuComplex cuGet<cuComplex>(int x, int y) { return make_cuComplex(float(x), float(y)); } + +template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x, int y) { - return (make_cuDoubleComplex( double(x), 0.0 )); + return (make_cuDoubleComplex(double(x), double(y))); } -template <typename T_ELEM> __inline__ T_ELEM cuGet (int , int ); -template <> __inline__ float cuGet<float>(int x, int y) -{ - return float(x); -} +template <typename T_ELEM> __inline__ T_ELEM cuGet(float); +template <> __inline__ float cuGet<float>(float x) { return float(x); } -template <> __inline__ double cuGet<double>(int x, int y) -{ - return double(x); -} +template <> __inline__ double cuGet<double>(float x) { return double(x); } -template <> __inline__ cuComplex cuGet<cuComplex>(int x, int y) -{ - return make_cuComplex( float(x), float(y) ); -} +template <> __inline__ cuComplex cuGet<cuComplex>(float x) { return
(make_cuComplex(float(x), 0.0f)); } -template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x, int y) +template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x) { - return (make_cuDoubleComplex( double(x), double(y) )); + return (make_cuDoubleComplex(double(x), 0.0)); } -template <typename T_ELEM> __inline__ T_ELEM cuGet (float ); -template <> __inline__ float cuGet<float>(float x) -{ - return float(x); -} +template <typename T_ELEM> __inline__ T_ELEM cuGet(float, float); +template <> __inline__ float cuGet<float>(float x, float y) { return float(x); } -template <> __inline__ double cuGet<double>(float x) -{ - return double(x); -} +template <> __inline__ double cuGet<double>(float x, float y) { return double(x); } -template <> __inline__ cuComplex cuGet<cuComplex>(float x) -{ - return (make_cuComplex( float(x), 0.0f )); -} +template <> __inline__ cuComplex cuGet<cuComplex>(float x, float y) { return (make_cuComplex(float(x), float(y))); } -template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x) +template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x, float y) { - return (make_cuDoubleComplex( double(x), 0.0 )); + return (make_cuDoubleComplex(double(x), double(y))); } -template <typename T_ELEM> __inline__ T_ELEM cuGet (float, float ); -template <> __inline__ float cuGet<float>(float x, float y) -{ - return float(x); -} +template <typename T_ELEM> __inline__ T_ELEM cuGet(double); +template <> __inline__ float cuGet<float>(double x) { return float(x); } -template <> __inline__ double cuGet<double>(float x, float y) -{ - return double(x); -} +template <> __inline__ double cuGet<double>(double x) { return double(x); } -template <> __inline__ cuComplex cuGet<cuComplex>(float x, float y) -{ - return (make_cuComplex( float(x), float(y) )); -} +template <> __inline__ cuComplex cuGet<cuComplex>(double x) { return (make_cuComplex(float(x), 0.0f)); } -template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x, float y) +template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x) { - return (make_cuDoubleComplex( double(x), double(y) )); + return (make_cuDoubleComplex(double(x), 0.0)); } -template <typename T_ELEM> __inline__ T_ELEM cuGet (double ); -template <> __inline__ float cuGet<float>(double x) -{ - return float(x); -} +template <typename T_ELEM> __inline__ T_ELEM cuGet(double, double); +template <> __inline__ float cuGet<float>(double x, double y) { return float(x); } -template <> __inline__ double cuGet<double>(double x) -{ - return double(x); -} +template <> __inline__ double cuGet<double>(double x, double y) { return double(x); } -template <> __inline__ cuComplex cuGet<cuComplex>(double x) -{ - return (make_cuComplex( float(x), 0.0f )); -} +template <> __inline__ cuComplex cuGet<cuComplex>(double x, double y) { return (make_cuComplex(float(x), float(y))); } -template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x) +template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x, double y) { - return (make_cuDoubleComplex( double(x), 0.0 )); + return (make_cuDoubleComplex(double(x), double(y))); } -template <typename T_ELEM> __inline__ T_ELEM cuGet (double, double ); -template <> __inline__ float cuGet<float>(double x, double y) -{ - return float(x); -} - -template <> __inline__ double cuGet<double>(double x, double y) -{ - return double(x); -} - -template <> __inline__ cuComplex cuGet<cuComplex>(double x, double y) -{ - return (make_cuComplex( float(x), float(y) )); -} - -template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x, double y) -{ - return (make_cuDoubleComplex( double(x), double(y) )); -} - - - - - -static void compress_index( - const int *Ind, - int nnz, - int m, - int *Ptr, - int base) +static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base) { int i; /* initialize everything to zero */ - for(i=0; ii < t->i ){ - return -1 ; + if (s->i < t->i) { + return -1; } - else if ( s->i > t->i ){ - return 1
; + else if (s->i > t->i) { + return 1; } - else{ - return s->j - t->j ; + else { + return s->j - t->j; } } -int cmp_cooFormat_csc( struct cooFormat *s, struct cooFormat *t) +int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t) { - if ( s->j < t->j ){ - return -1 ; + if (s->j < t->j) { + return -1; } - else if ( s->j > t->j ){ - return 1 ; + else if (s->j > t->j) { + return 1; } - else{ - return s->i - t->i ; + else { + return s->i - t->i; } } -typedef int (*FUNPTR) (const void*, const void*) ; -typedef int (*FUNPTR2) ( struct cooFormat *s, struct cooFormat *t) ; +typedef int (*FUNPTR)(const void *, const void *); +typedef int (*FUNPTR2)(struct cooFormat *s, struct cooFormat *t); -static FUNPTR2 fptr_array[2] = { +static FUNPTR2 fptr_array[2] = { cmp_cooFormat_csr, cmp_cooFormat_csc, }; -static int verify_pattern( - int m, - int nnz, - int *csrRowPtr, - int *csrColInd) +static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd) { int i, col, start, end, base_index; int error_found = 0; - if (nnz != (csrRowPtr[m] - csrRowPtr[0])){ - fprintf(stderr, "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n", 0, csrRowPtr[0], m, csrRowPtr[m], nnz); + if (nnz != (csrRowPtr[m] - csrRowPtr[0])) { + fprintf(stderr, + "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n", + 0, + csrRowPtr[0], + m, + csrRowPtr[m], + nnz); error_found = 1; } base_index = csrRowPtr[0]; - if ((0 != base_index) && (1 != base_index)){ + if ((0 != base_index) && (1 != base_index)) { fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index); error_found = 1; } - for (i=0; (!error_found) && (i end){ - fprintf(stderr, "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", i, start+base_index, i+1, end+base_index); + for (i = 0; (!error_found) && (i < m); i++) { + start = csrRowPtr[i] - base_index; + end = csrRowPtr[i + 1] - base_index; + if (start > end) { + fprintf(stderr, + "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", + i, + start + base_index, + i + 1, + end + base_index); error_found = 1; } - for (col=start; col= csrColInd[col+1])){ - fprintf(stderr, "Error (sorting of the column indecis check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", col, csrColInd[col], col+1, csrColInd[col+1]); + if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) { + fprintf( + stderr, + "Error (sorting of the column indecis check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", + col, + csrColInd[col], + col + 1, + csrColInd[col + 1]); error_found = 1; } } } - return error_found ; + return error_found; } template -int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - T_ELEM **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix) +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix) { - MM_typecode matcode; - double *tempVal; - int *tempRowInd,*tempColInd; - double *tval; - int *trow,*tcol; - int *csrRowPtr, *cscColPtr; - int i,j,error,base,count; + MM_typecode matcode; + double *tempVal; + int *tempRowInd, *tempColInd; + double *tval; + int *trow, *tcol; + int *csrRowPtr, *cscColPtr; + int i, j, error, base, count; struct cooFormat *work; - /* read the matrix */ + /* read the matrix */ error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode); if (error) { fprintf(stderr, 
"!!!! can not open file: '%s'\n", filename); - return 1; + return 1; } /* start error checking */ if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n"); - return 1; + return 1; } - if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/){ + if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { fprintf(stderr, "!!!! dense, array, pattern and integer matrices are not supported\n"); - return 1; + return 1; } /* if necessary symmetrize the pattern (transform from triangular to full) */ - if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))){ - //count number of non-diagonal elements - count=0; - for(i=0; i<(*nnz); i++){ - if (trow[i] != tcol[i]){ + if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))) { + // count number of non-diagonal elements + count = 0; + for (i = 0; i < (*nnz); i++) { + if (trow[i] != tcol[i]) { count++; } } - //allocate space for the symmetrized matrix - tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); - tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); - if (mm_is_real(matcode) || mm_is_integer(matcode)){ + // allocate space for the symmetrized matrix + tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); + tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { tempVal = (double *)malloc((*nnz + count) * sizeof(double)); } - else{ + else { tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); } - //copy the elements regular and transposed locations - for(j=0, i=0; i<(*nnz); i++){ - tempRowInd[j]=trow[i]; - tempColInd[j]=tcol[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - tempVal[j]=tval[i]; + // copy the elements regular and transposed locations + for (j = 0, i = 0; i < (*nnz); i++) { + tempRowInd[j] = trow[i]; + tempColInd[j] = tcol[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal[j] = tval[i]; } - else{ - tempVal[2*j] =tval[2*i]; - tempVal[2*j+1]=tval[2*i+1]; + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; } j++; - if (trow[i] != tcol[i]){ - tempRowInd[j]=tcol[i]; - tempColInd[j]=trow[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - if (mm_is_skew(matcode)){ - tempVal[j]=-tval[i]; + if (trow[i] != tcol[i]) { + tempRowInd[j] = tcol[i]; + tempColInd[j] = trow[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + if (mm_is_skew(matcode)) { + tempVal[j] = -tval[i]; } - else{ - tempVal[j]= tval[i]; + else { + tempVal[j] = tval[i]; } } - else{ - if(mm_is_hermitian(matcode)){ - tempVal[2*j] = tval[2*i]; - tempVal[2*j+1]=-tval[2*i+1]; + else { + if (mm_is_hermitian(matcode)) { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = -tval[2 * i + 1]; } - else{ - tempVal[2*j] = tval[2*i]; - tempVal[2*j+1]= tval[2*i+1]; + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; } } j++; } } - (*nnz)+=count; - //free temporary storage + (*nnz) += count; + // free temporary storage free(trow); free(tcol); - free(tval); + free(tval); } - else{ - tempRowInd=trow; - tempColInd=tcol; - tempVal =tval; + else { + tempRowInd = trow; + tempColInd = tcol; + tempVal = tval; } // life time of (trow, tcol, tval) is over. 
// please use COO format (tempRowInd, tempColInd, tempVal) -// use qsort to sort COO format - work = (struct cooFormat *)malloc(sizeof(struct cooFormat)*(*nnz)); - if (NULL == work){ + // use qsort to sort COO format + work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz)); + if (NULL == work) { fprintf(stderr, "!!!! allocation error, malloc failed\n"); return 1; } - for(i=0; i<(*nnz); i++){ + for (i = 0; i < (*nnz); i++) { work[i].i = tempRowInd[i]; work[i].j = tempColInd[i]; work[i].p = i; // permutation is identity } - - if (csrFormat){ - /* create row-major ordering of indices (sorted by row and within each row by column) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0] ); - }else{ - /* create column-major ordering of indices (sorted by column and within each column by row) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1] ); + if (csrFormat) { + /* create row-major ordering of indices (sorted by row and within each row by column) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]); + } + else { + /* create column-major ordering of indices (sorted by column and within each column by row) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]); } // (tempRowInd, tempColInd) is sorted either by row-major or by col-major - for(i=0; i<(*nnz); i++){ + for (i = 0; i < (*nnz); i++) { tempRowInd[i] = work[i].i; tempColInd[i] = work[i].j; } - // setup base + // setup base // check if there is any row/col 0, if so base-0 // check if there is any row/col equal to matrix dimension m/n, if so base-1 int base0 = 0; int base1 = 0; - for(i=0; i<(*nnz); i++){ + for (i = 0; i < (*nnz); i++) { const int row = tempRowInd[i]; const int col = tempColInd[i]; - if ( (0 == row) || (0 == col) ){ + if ((0 == row) || (0 == col)) { base0 = 1; } - if ( (*m == row) || (*n == col) ){ + if ((*m == row) || (*n == col)) { base1 = 1; } } - if ( base0 && base1 ){ + if (base0 && base1) { printf("Error: input matrix is base-0 and base-1 \n"); return 1; } base = 0; - if (base1){ + if (base1) { base = 1; } /* compress the appropriate indices */ - if (csrFormat){ + if (csrFormat) { /* CSR format (assuming row-major format) */ - csrRowPtr = (int *)malloc(((*m)+1) * sizeof(csrRowPtr[0])); - if (!csrRowPtr) return 1; + csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0])); + if (!csrRowPtr) + return 1; compress_index(tempRowInd, *nnz, *m, csrRowPtr, base); *aRowInd = csrRowPtr; @@ -429,101 +377,97 @@ int loadMMSparseMatrix( } else { /* CSC format (assuming column-major format) */ - cscColPtr = (int *)malloc(((*n)+1) * sizeof(cscColPtr[0])); - if (!cscColPtr) return 1; + cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0])); + if (!cscColPtr) + return 1; compress_index(tempColInd, *nnz, *n, cscColPtr, base); *aColInd = cscColPtr; *aRowInd = (int *)malloc((*nnz) * sizeof(int)); - } + } - /* transfrom the matrix values of type double into one of the cusparse library types */ + /* transfrom the matrix values of type double into one of the cusparse library types */ *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM)); - - for (i=0; i<(*nnz); i++) { - if (csrFormat){ + + for (i = 0; i < (*nnz); i++) { + if (csrFormat) { (*aColInd)[i] = tempColInd[i]; } - else{ + else { (*aRowInd)[i] = tempRowInd[i]; } - if (mm_is_real(matcode) || mm_is_integer(matcode)){ - (*aVal)[i] = cuGet( tempVal[ work[i].p ] ); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + (*aVal)[i] = cuGet(tempVal[work[i].p]); } - else{ - (*aVal)[i] = 
cuGet(tempVal[2*work[i].p], tempVal[2*work[i].p+1]); + else { + (*aVal)[i] = cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); } } /* check for corruption */ int error_found; - if (csrFormat){ + if (csrFormat) { error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); - }else{ + } + else { error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); } - if (error_found){ + if (error_found) { fprintf(stderr, "!!!! verify_pattern failed\n"); return 1; } /* cleanup and exit */ free(work); - free(tempVal); + free(tempVal); free(tempColInd); free(tempRowInd); return 0; -} +} /* specific instantiation */ -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - float **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + float **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - double **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); - -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - cuComplex **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); - -template int loadMMSparseMatrix( - char *filename, - char elem_type, - bool csrFormat, - int *m, - int *n, - int *nnz, - cuDoubleComplex **aVal, - int **aRowInd, - int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + double **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuDoubleComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/cuSolverSp_LowlevelQR.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/cuSolverSp_LowlevelQR.cpp index e238ea2d..b1e336b9 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/cuSolverSp_LowlevelQR.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/cuSolverSp_LowlevelQR.cpp @@ -38,412 +38,453 @@ #include "helper_cusolver.h" template -int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m, - int *n, int *nnz, T_ELEM **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -void UsageSP(void) { - printf("\n"); - printf("-h : display this help\n"); - printf("-file= : filename containing a matrix in MM format\n"); - printf("-device= : if want to run on specific GPU\n"); +void UsageSP(void) +{ + printf("\n"); + printf("-h : display this help\n"); + printf("-file= : filename containing a matrix in MM format\n"); + printf("-device= : if want to run on specific GPU\n"); - exit(0); + exit(0); } -void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) { - memset(&opts, 0, sizeof(opts)); +void parseCommandLineArguments(int argc, char *argv[], struct 
testOpts &opts) +{ + memset(&opts, 0, sizeof(opts)); - if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { - UsageSP(); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - char *fileName = 0; - getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); - - if (fileName) { - opts.sparse_mat_filename = fileName; - } else { - printf("\nIncorrect filename passed to -file \n "); - UsageSP(); + if (checkCmdLineFlag(argc, (const char **)argv, "-h")) { + UsageSP(); } - } -} -int main(int argc, char *argv[]) { - struct testOpts opts; - cusolverSpHandle_t cusolverSpH = - NULL; // reordering, permutation and 1st LU factorization - cusparseHandle_t cusparseH = NULL; // residual evaluation - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + char *fileName = 0; + getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName); - csrqrInfoHost_t h_info = - NULL; // opaque info structure for LU with parital pivoting - csrqrInfo_t d_info = - NULL; // opaque info structure for LU with parital pivoting - - int rowsA = 0; // number of rows of A - int colsA = 0; // number of columns of A - int nnzA = 0; // number of nonzeros of A - int baseA = 0; // base index in CSR format - - // CSR(A) from I/O - int *h_csrRowPtrA = NULL; // n+1 - int *h_csrColIndA = NULL; // nnzA - double *h_csrValA = NULL; // nnzA - - double *h_x = NULL; // n, x = A \ b - double *h_b = NULL; // n, b = ones(m,1) - double *h_bcopy = NULL; // n, b = ones(m,1) - double *h_r = NULL; // n, r = b - A*x - - size_t size_internal = 0; - size_t size_chol = 0; // size of working space for csrlu - void *buffer_cpu = NULL; // working space for Cholesky - void *buffer_gpu = NULL; // working space for Cholesky - - int *d_csrRowPtrA = NULL; // n+1 - int *d_csrColIndA = NULL; // nnzA - double *d_csrValA = NULL; // nnzA - double *d_x = NULL; // n, x = A \ b - double *d_b = NULL; // n, a copy of h_b - double *d_r = NULL; // n, r = b - A*x - - // the constants used in residual evaluation, r = b - A*x - const double minus_one = -1.0; - const double one = 1.0; - const double zero = 0.0; - // the constant used in cusolverSp - // singularity is -1 if A is invertible under tol - // tol determines the condition of singularity - int singularity = 0; - const double tol = 1.e-14; - - double x_inf = 0.0; // |x| - double r_inf = 0.0; // |r| - double A_inf = 0.0; // |A| - - parseCommandLineArguments(argc, argv, opts); - - findCudaDevice(argc, (const char **)argv); - - if (opts.sparse_mat_filename == NULL) { - opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n32.mtx", argv[0]); - if (opts.sparse_mat_filename != NULL) - printf("Using default input file [%s]\n", opts.sparse_mat_filename); - else - printf("Could not find lap2D_5pt_n32.mtx\n"); - } else { - printf("Using input file [%s]\n", opts.sparse_mat_filename); - } - - printf("step 1: read matrix market format\n"); - - if (opts.sparse_mat_filename) { - if (loadMMSparseMatrix(opts.sparse_mat_filename, 'd', true, &rowsA, - &colsA, &nnzA, &h_csrValA, &h_csrRowPtrA, - &h_csrColIndA, true)) { - return 1; + if (fileName) { + opts.sparse_mat_filename = fileName; + } + else { + printf("\nIncorrect filename passed to -file \n "); + UsageSP(); + } } - baseA = h_csrRowPtrA[0]; // baseA = {0,1} - } else { - fprintf(stderr, "Error: input matrix is not provided\n"); - return 1; - } - - if (rowsA != colsA) { - fprintf(stderr, "Error: only support square matrix\n"); - return 1; - } - 
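The -file flag parsed above follows the key=value convention of the sample's helper_string.h utilities; a short sketch of how an invocation maps onto the two calls (the command line shown is illustrative, not from the source):

    // Illustrative invocation:
    //   ./cuSolverSp_LowlevelQR -file=lap2D_5pt_n32.mtx -device=0
    //
    // checkCmdLineFlag() only reports whether the key is present;
    // getCmdLineArgumentString() yields the text after '='.
    char *fileName = 0;
    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);
    }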
- printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, - nnzA, baseA); - - checkCudaErrors(cusolverSpCreate(&cusolverSpH)); - checkCudaErrors(cusparseCreate(&cusparseH)); - checkCudaErrors(cudaStreamCreate(&stream)); - checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream)); - checkCudaErrors(cusparseSetStream(cusparseH, stream)); - - checkCudaErrors(cusparseCreateMatDescr(&descrA)); - - checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - - if (baseA) { - checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); - } else { - checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); - } - - h_x = (double *)malloc(sizeof(double) * colsA); - h_b = (double *)malloc(sizeof(double) * rowsA); - h_bcopy = (double *)malloc(sizeof(double) * rowsA); - h_r = (double *)malloc(sizeof(double) * rowsA); - - assert(NULL != h_x); - assert(NULL != h_b); - assert(NULL != h_bcopy); - assert(NULL != h_r); - - checkCudaErrors( - cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); - checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA)); - checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); - checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); - checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); - - for (int row = 0; row < rowsA; row++) { - h_b[row] = 1.0; - } - - memcpy(h_bcopy, h_b, sizeof(double) * rowsA); - - printf("step 2: create opaque info structure\n"); - checkCudaErrors(cusolverSpCreateCsrqrInfoHost(&h_info)); - - printf("step 3: analyze qr(A) to know structure of L\n"); - checkCudaErrors(cusolverSpXcsrqrAnalysisHost(cusolverSpH, rowsA, colsA, nnzA, - descrA, h_csrRowPtrA, - h_csrColIndA, h_info)); - - printf("step 4: workspace for qr(A)\n"); - checkCudaErrors(cusolverSpDcsrqrBufferInfoHost( - cusolverSpH, rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, - h_csrColIndA, h_info, &size_internal, &size_chol)); - - if (buffer_cpu) { - free(buffer_cpu); - } - buffer_cpu = (void *)malloc(sizeof(char) * size_chol); - assert(NULL != buffer_cpu); - - printf("step 5: compute A = L*L^T \n"); - checkCudaErrors(cusolverSpDcsrqrSetupHost(cusolverSpH, rowsA, colsA, nnzA, - descrA, h_csrValA, h_csrRowPtrA, - h_csrColIndA, zero, h_info)); - - checkCudaErrors(cusolverSpDcsrqrFactorHost(cusolverSpH, rowsA, colsA, nnzA, - NULL, NULL, h_info, buffer_cpu)); - - printf("step 6: check if the matrix is singular \n"); - checkCudaErrors( - cusolverSpDcsrqrZeroPivotHost(cusolverSpH, h_info, tol, &singularity)); - - if (0 <= singularity) { - fprintf(stderr, "Error: A is not invertible, singularity=%d\n", - singularity); - return 1; - } - - printf("step 7: solve A*x = b \n"); - checkCudaErrors(cusolverSpDcsrqrSolveHost(cusolverSpH, rowsA, colsA, h_b, h_x, - h_info, buffer_cpu)); - - printf("step 8: evaluate residual r = b - A*x (result on CPU)\n"); - // use GPU gemv to compute r = b - A*x - checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, - sizeof(int) * (rowsA + 1), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, - cudaMemcpyHostToDevice)); - - checkCudaErrors( - cudaMemcpy(d_r, h_bcopy, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice)); - 
- /* Wrap raw data into cuSPARSE generic API objects */ - cusparseSpMatDescr_t matA = NULL; - if (baseA) { - checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, - d_csrColIndA, d_csrValA, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)); - } else { - checkCudaErrors(cusparseCreateCsr(&matA, rowsA, colsA, nnzA, d_csrRowPtrA, - d_csrColIndA, d_csrValA, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); - } - - cusparseDnVecDescr_t vecx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F)); - cusparseDnVecDescr_t vecAx = NULL; - checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F)); - - /* Allocate workspace for cuSPARSE */ - size_t bufferSize = 0; - checkCudaErrors(cusparseSpMV_bufferSize( - cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, - vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); - - checkCudaErrors(cusparseSpMV(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, - &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors( - cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); - - x_inf = vec_norminf(colsA, h_x); - r_inf = vec_norminf(rowsA, h_r); - A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, - h_csrColIndA); - - printf("(CPU) |b - A*x| = %E \n", r_inf); - printf("(CPU) |A| = %E \n", A_inf); - printf("(CPU) |x| = %E \n", x_inf); - printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); - - printf("step 9: create opaque info structure\n"); - checkCudaErrors(cusolverSpCreateCsrqrInfo(&d_info)); - - checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, - sizeof(int) * (rowsA + 1), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, - cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_b, h_bcopy, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - - printf("step 10: analyze qr(A) to know structure of L\n"); - checkCudaErrors(cusolverSpXcsrqrAnalysis(cusolverSpH, rowsA, colsA, nnzA, - descrA, d_csrRowPtrA, d_csrColIndA, - d_info)); - - printf("step 11: workspace for qr(A)\n"); - checkCudaErrors(cusolverSpDcsrqrBufferInfo( - cusolverSpH, rowsA, colsA, nnzA, descrA, d_csrValA, d_csrRowPtrA, - d_csrColIndA, d_info, &size_internal, &size_chol)); - - printf("GPU buffer size = %lld bytes\n", (signed long long)size_chol); - if (buffer_gpu) { - checkCudaErrors(cudaFree(buffer_gpu)); - } - checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char) * size_chol)); - - printf("step 12: compute A = L*L^T \n"); - checkCudaErrors(cusolverSpDcsrqrSetup(cusolverSpH, rowsA, colsA, nnzA, descrA, - d_csrValA, d_csrRowPtrA, d_csrColIndA, - zero, d_info)); - - checkCudaErrors(cusolverSpDcsrqrFactor(cusolverSpH, rowsA, colsA, nnzA, NULL, - NULL, d_info, buffer_gpu)); - - printf("step 13: check if the matrix is singular \n"); - checkCudaErrors( - cusolverSpDcsrqrZeroPivot(cusolverSpH, d_info, tol, &singularity)); - - if (0 <= singularity) { - fprintf(stderr, "Error: A is not invertible, singularity=%d\n", - singularity); - return 1; - } - - printf("step 14: solve A*x = b \n"); - checkCudaErrors(cusolverSpDcsrqrSolve(cusolverSpH, rowsA, colsA, d_b, d_x, - d_info, buffer_gpu)); - - checkCudaErrors( - 
cudaMemcpy(d_r, h_bcopy, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); - - checkCudaErrors(cusparseSpMV(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, - &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT, buffer)); - - checkCudaErrors( - cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); - - r_inf = vec_norminf(rowsA, h_r); - - printf("(GPU) |b - A*x| = %E \n", r_inf); - printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); - - if (cusolverSpH) { - checkCudaErrors(cusolverSpDestroy(cusolverSpH)); - } - if (cusparseH) { - checkCudaErrors(cusparseDestroy(cusparseH)); - } - if (stream) { - checkCudaErrors(cudaStreamDestroy(stream)); - } - if (descrA) { - checkCudaErrors(cusparseDestroyMatDescr(descrA)); - } - if (h_info) { - checkCudaErrors(cusolverSpDestroyCsrqrInfoHost(h_info)); - } - if (d_info) { - checkCudaErrors(cusolverSpDestroyCsrqrInfo(d_info)); - } - - if (matA) { - checkCudaErrors(cusparseDestroySpMat(matA)); - } - if (vecx) { - checkCudaErrors(cusparseDestroyDnVec(vecx)); - } - if (vecAx) { - checkCudaErrors(cusparseDestroyDnVec(vecAx)); - } - - if (h_csrValA) { - free(h_csrValA); - } - if (h_csrRowPtrA) { - free(h_csrRowPtrA); - } - if (h_csrColIndA) { - free(h_csrColIndA); - } - - if (h_x) { - free(h_x); - } - if (h_b) { - free(h_b); - } - if (h_bcopy) { - free(h_bcopy); - } - if (h_r) { - free(h_r); - } - - if (buffer_cpu) { - free(buffer_cpu); - } - if (buffer_gpu) { - checkCudaErrors(cudaFree(buffer_gpu)); - } - - if (d_csrValA) { - checkCudaErrors(cudaFree(d_csrValA)); - } - if (d_csrRowPtrA) { - checkCudaErrors(cudaFree(d_csrRowPtrA)); - } - if (d_csrColIndA) { - checkCudaErrors(cudaFree(d_csrColIndA)); - } - if (d_x) { - checkCudaErrors(cudaFree(d_x)); - } - if (d_b) { - checkCudaErrors(cudaFree(d_b)); - } - if (d_r) { - checkCudaErrors(cudaFree(d_r)); - } - - return 0; +} + +int main(int argc, char *argv[]) +{ + struct testOpts opts; + cusolverSpHandle_t cusolverSpH = NULL; // reordering, permutation and 1st LU factorization + cusparseHandle_t cusparseH = NULL; // residual evaluation + cudaStream_t stream = NULL; + cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix + + csrqrInfoHost_t h_info = NULL; // opaque info structure for LU with parital pivoting + csrqrInfo_t d_info = NULL; // opaque info structure for LU with parital pivoting + + int rowsA = 0; // number of rows of A + int colsA = 0; // number of columns of A + int nnzA = 0; // number of nonzeros of A + int baseA = 0; // base index in CSR format + + // CSR(A) from I/O + int *h_csrRowPtrA = NULL; // n+1 + int *h_csrColIndA = NULL; // nnzA + double *h_csrValA = NULL; // nnzA + + double *h_x = NULL; // n, x = A \ b + double *h_b = NULL; // n, b = ones(m,1) + double *h_bcopy = NULL; // n, b = ones(m,1) + double *h_r = NULL; // n, r = b - A*x + + size_t size_internal = 0; + size_t size_chol = 0; // size of working space for csrlu + void *buffer_cpu = NULL; // working space for Cholesky + void *buffer_gpu = NULL; // working space for Cholesky + + int *d_csrRowPtrA = NULL; // n+1 + int *d_csrColIndA = NULL; // nnzA + double *d_csrValA = NULL; // nnzA + double *d_x = NULL; // n, x = A \ b + double *d_b = NULL; // n, a copy of h_b + double *d_r = NULL; // n, r = b - A*x + + // the constants used in residual evaluation, r = b - A*x + const double minus_one = -1.0; + const double one = 1.0; + const double zero = 0.0; + // the constant used in cusolverSp + // singularity is -1 if A is invertible under tol + // tol determines the condition of 
singularity + int singularity = 0; + const double tol = 1.e-14; + + double x_inf = 0.0; // |x| + double r_inf = 0.0; // |r| + double A_inf = 0.0; // |A| + + parseCommandLineArguments(argc, argv, opts); + + findCudaDevice(argc, (const char **)argv); + + if (opts.sparse_mat_filename == NULL) { + opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n32.mtx", argv[0]); + if (opts.sparse_mat_filename != NULL) + printf("Using default input file [%s]\n", opts.sparse_mat_filename); + else + printf("Could not find lap2D_5pt_n32.mtx\n"); + } + else { + printf("Using input file [%s]\n", opts.sparse_mat_filename); + } + + printf("step 1: read matrix market format\n"); + + if (opts.sparse_mat_filename) { + if (loadMMSparseMatrix(opts.sparse_mat_filename, + 'd', + true, + &rowsA, + &colsA, + &nnzA, + &h_csrValA, + &h_csrRowPtrA, + &h_csrColIndA, + true)) { + return 1; + } + baseA = h_csrRowPtrA[0]; // baseA = {0,1} + } + else { + fprintf(stderr, "Error: input matrix is not provided\n"); + return 1; + } + + if (rowsA != colsA) { + fprintf(stderr, "Error: only support square matrix\n"); + return 1; + } + + printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA); + + checkCudaErrors(cusolverSpCreate(&cusolverSpH)); + checkCudaErrors(cusparseCreate(&cusparseH)); + checkCudaErrors(cudaStreamCreate(&stream)); + checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream)); + checkCudaErrors(cusparseSetStream(cusparseH, stream)); + + checkCudaErrors(cusparseCreateMatDescr(&descrA)); + + checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + + if (baseA) { + checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); + } + else { + checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + } + + h_x = (double *)malloc(sizeof(double) * colsA); + h_b = (double *)malloc(sizeof(double) * rowsA); + h_bcopy = (double *)malloc(sizeof(double) * rowsA); + h_r = (double *)malloc(sizeof(double) * rowsA); + + assert(NULL != h_x); + assert(NULL != h_b); + assert(NULL != h_bcopy); + assert(NULL != h_r); + + checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (rowsA + 1))); + checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA)); + checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA)); + checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA)); + checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA)); + + for (int row = 0; row < rowsA; row++) { + h_b[row] = 1.0; + } + + memcpy(h_bcopy, h_b, sizeof(double) * rowsA); + + printf("step 2: create opaque info structure\n"); + checkCudaErrors(cusolverSpCreateCsrqrInfoHost(&h_info)); + + printf("step 3: analyze qr(A) to know structure of L\n"); + checkCudaErrors( + cusolverSpXcsrqrAnalysisHost(cusolverSpH, rowsA, colsA, nnzA, descrA, h_csrRowPtrA, h_csrColIndA, h_info)); + + printf("step 4: workspace for qr(A)\n"); + checkCudaErrors(cusolverSpDcsrqrBufferInfoHost(cusolverSpH, + rowsA, + colsA, + nnzA, + descrA, + h_csrValA, + h_csrRowPtrA, + h_csrColIndA, + h_info, + &size_internal, + &size_chol)); + + if (buffer_cpu) { + free(buffer_cpu); + } + buffer_cpu = (void *)malloc(sizeof(char) * size_chol); + assert(NULL != buffer_cpu); + + printf("step 5: compute A = L*L^T \n"); + checkCudaErrors(cusolverSpDcsrqrSetupHost( + cusolverSpH, rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA, zero, h_info)); + + 
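Worth noting ahead of step 8 below: the residual never needs a kernel of its own, because cusparseSpMV computes y = alpha*op(A)*x + beta*y in place. Preloading y (the vecAx descriptor, backed by d_r) with b and passing alpha = -1, beta = 1 therefore yields r = b - A*x in one call, which is exactly what the sample's minus_one/one constants do:

    // r = b - A*x in a single SpMV: d_r is preloaded with b,
    // then overwritten with (-1)*A*x + (1)*b.
    const double alpha = -1.0, beta = 1.0;
    checkCudaErrors(cusparseSpMV(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, vecx, &beta, vecAx,
                                 CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));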
checkCudaErrors(cusolverSpDcsrqrFactorHost(cusolverSpH, rowsA, colsA, nnzA, NULL, NULL, h_info, buffer_cpu)); + + printf("step 6: check if the matrix is singular \n"); + checkCudaErrors(cusolverSpDcsrqrZeroPivotHost(cusolverSpH, h_info, tol, &singularity)); + + if (0 <= singularity) { + fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity); + return 1; + } + + printf("step 7: solve A*x = b \n"); + checkCudaErrors(cusolverSpDcsrqrSolveHost(cusolverSpH, rowsA, colsA, h_b, h_x, h_info, buffer_cpu)); + + printf("step 8: evaluate residual r = b - A*x (result on CPU)\n"); + // use GPU gemv to compute r = b - A*x + checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemcpy(d_r, h_bcopy, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double) * colsA, cudaMemcpyHostToDevice)); + + /* Wrap raw data into cuSPARSE generic API objects */ + cusparseSpMatDescr_t matA = NULL; + if (baseA) { + checkCudaErrors(cusparseCreateCsr(&matA, + rowsA, + colsA, + nnzA, + d_csrRowPtrA, + d_csrColIndA, + d_csrValA, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ONE, + CUDA_R_64F)); + } + else { + checkCudaErrors(cusparseCreateCsr(&matA, + rowsA, + colsA, + nnzA, + d_csrRowPtrA, + d_csrColIndA, + d_csrValA, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + } + + cusparseDnVecDescr_t vecx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F)); + cusparseDnVecDescr_t vecAx = NULL; + checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F)); + + /* Allocate workspace for cuSPARSE */ + size_t bufferSize = 0; + checkCudaErrors(cusparseSpMV_bufferSize(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + void *buffer = NULL; + checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + + checkCudaErrors(cusparseSpMV(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); + + x_inf = vec_norminf(colsA, h_x); + r_inf = vec_norminf(rowsA, h_r); + A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA); + + printf("(CPU) |b - A*x| = %E \n", r_inf); + printf("(CPU) |A| = %E \n", A_inf); + printf("(CPU) |x| = %E \n", x_inf); + printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); + + printf("step 9: create opaque info structure\n"); + checkCudaErrors(cusolverSpCreateCsrqrInfo(&d_info)); + + checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (rowsA + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_csrValA, h_csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_b, h_bcopy, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + + printf("step 10: analyze qr(A) to know structure of L\n"); + checkCudaErrors( + cusolverSpXcsrqrAnalysis(cusolverSpH, rowsA, colsA, nnzA, descrA, d_csrRowPtrA, d_csrColIndA, 
d_info)); + + printf("step 11: workspace for qr(A)\n"); + checkCudaErrors(cusolverSpDcsrqrBufferInfo(cusolverSpH, + rowsA, + colsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_info, + &size_internal, + &size_chol)); + + printf("GPU buffer size = %lld bytes\n", (signed long long)size_chol); + if (buffer_gpu) { + checkCudaErrors(cudaFree(buffer_gpu)); + } + checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char) * size_chol)); + + printf("step 12: compute A = L*L^T \n"); + checkCudaErrors(cusolverSpDcsrqrSetup( + cusolverSpH, rowsA, colsA, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, zero, d_info)); + + checkCudaErrors(cusolverSpDcsrqrFactor(cusolverSpH, rowsA, colsA, nnzA, NULL, NULL, d_info, buffer_gpu)); + + printf("step 13: check if the matrix is singular \n"); + checkCudaErrors(cusolverSpDcsrqrZeroPivot(cusolverSpH, d_info, tol, &singularity)); + + if (0 <= singularity) { + fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity); + return 1; + } + + printf("step 14: solve A*x = b \n"); + checkCudaErrors(cusolverSpDcsrqrSolve(cusolverSpH, rowsA, colsA, d_b, d_x, d_info, buffer_gpu)); + + checkCudaErrors(cudaMemcpy(d_r, h_bcopy, sizeof(double) * rowsA, cudaMemcpyHostToDevice)); + + checkCudaErrors(cusparseSpMV(cusparseH, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minus_one, + matA, + vecx, + &one, + vecAx, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost)); + + r_inf = vec_norminf(rowsA, h_r); + + printf("(GPU) |b - A*x| = %E \n", r_inf); + printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf)); + + if (cusolverSpH) { + checkCudaErrors(cusolverSpDestroy(cusolverSpH)); + } + if (cusparseH) { + checkCudaErrors(cusparseDestroy(cusparseH)); + } + if (stream) { + checkCudaErrors(cudaStreamDestroy(stream)); + } + if (descrA) { + checkCudaErrors(cusparseDestroyMatDescr(descrA)); + } + if (h_info) { + checkCudaErrors(cusolverSpDestroyCsrqrInfoHost(h_info)); + } + if (d_info) { + checkCudaErrors(cusolverSpDestroyCsrqrInfo(d_info)); + } + + if (matA) { + checkCudaErrors(cusparseDestroySpMat(matA)); + } + if (vecx) { + checkCudaErrors(cusparseDestroyDnVec(vecx)); + } + if (vecAx) { + checkCudaErrors(cusparseDestroyDnVec(vecAx)); + } + + if (h_csrValA) { + free(h_csrValA); + } + if (h_csrRowPtrA) { + free(h_csrRowPtrA); + } + if (h_csrColIndA) { + free(h_csrColIndA); + } + + if (h_x) { + free(h_x); + } + if (h_b) { + free(h_b); + } + if (h_bcopy) { + free(h_bcopy); + } + if (h_r) { + free(h_r); + } + + if (buffer_cpu) { + free(buffer_cpu); + } + if (buffer_gpu) { + checkCudaErrors(cudaFree(buffer_gpu)); + } + + if (d_csrValA) { + checkCudaErrors(cudaFree(d_csrValA)); + } + if (d_csrRowPtrA) { + checkCudaErrors(cudaFree(d_csrRowPtrA)); + } + if (d_csrColIndA) { + checkCudaErrors(cudaFree(d_csrColIndA)); + } + if (d_x) { + checkCudaErrors(cudaFree(d_x)); + } + if (d_b) { + checkCudaErrors(cudaFree(d_b)); + } + if (d_r) { + checkCudaErrors(cudaFree(d_r)); + } + + return 0; } diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio.c b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio.c index 66e90770..7ecf9c2c 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio.c +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio.c @@ -1,128 +1,126 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. 
-* -* -*/ +/* + * Matrix Market I/O library for ANSI C + * + * See http://math.nist.gov/MatrixMarket for details. + * + * + */ /* avoid Windows warnings (for example: strcpy, fscanf, etc.) */ -#if defined(_WIN32) +#if defined(_WIN32) #define _CRT_SECURE_NO_WARNINGS #endif -#include -#include -#include -#include - #include "mmio.h" -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_) +#include +#include +#include +#include + +int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_) { - FILE *f; + FILE *f; MM_typecode matcode; - int M, N, nz; - int i; - double *val; - int *I, *J; - + int M, N, nz; + int i; + double *val; + int *I, *J; + if ((f = fopen(fname, "r")) == NULL) - return -1; - - - if (mm_read_banner(f, &matcode) != 0) - { + return -1; + + + if (mm_read_banner(f, &matcode) != 0) { printf("mm_read_unsymetric: Could not process Matrix Market banner "); printf(" in file [%s]\n", fname); return -1; } - - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { + + + if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) { fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); + fprintf(stderr, "Market Market type: [%s]\n", mm_typecode_to_str(matcode)); return -1; } - + /* find out size of sparse matrix: M, N, nz .... */ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) { fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); return -1; } - - *M_ = M; - *N_ = N; + + *M_ = M; + *N_ = N; *nz_ = nz; - + /* reserve memory for matrices */ - - I = (int *) malloc(nz * sizeof(int)); - J = (int *) malloc(nz * sizeof(int)); - val = (double *) malloc(nz * sizeof(double)); - + + I = (int *)malloc(nz * sizeof(int)); + J = (int *)malloc(nz * sizeof(int)); + val = (double *)malloc(nz * sizeof(double)); + *val_ = val; - *I_ = I; - *J_ = J; - + *I_ = I; + *J_ = J; + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ - - for (i=0; i + #if defined(__cplusplus) -extern "C" { +extern "C" +{ #endif /* __cplusplus */ -#define MM_MAX_LINE_LENGTH 1025 -#define MatrixMarketBanner "%%MatrixMarket" +#define MM_MAX_LINE_LENGTH 1025 +#define MatrixMarketBanner "%%MatrixMarket" #define MM_MAX_TOKEN_LENGTH 64 -typedef char MM_typecode[4]; + typedef char MM_typecode[4]; -char *mm_typecode_to_str(MM_typecode matcode); + char *mm_typecode_to_str(MM_typecode matcode); -int mm_read_banner(FILE *f, MM_typecode *matcode); -int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); -int mm_read_mtx_array_size(FILE *f, int *M, int *N); + int mm_read_banner(FILE *f, MM_typecode *matcode); + int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); + int mm_read_mtx_array_size(FILE *f, int *M, int *N); -int mm_write_banner(FILE *f, MM_typecode matcode); -int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); -int mm_write_mtx_array_size(FILE *f, int M, int N); + int mm_write_banner(FILE *f, MM_typecode matcode); + int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); + int mm_write_mtx_array_size(FILE *f, int M, int N); -/********************* MM_typecode query fucntions ***************************/ + /********************* MM_typecode query fucntions ***************************/ -#define mm_is_matrix(typecode) ((typecode)[0]=='M') +#define mm_is_matrix(typecode) ((typecode)[0] == 'M') -#define mm_is_sparse(typecode) ((typecode)[1]=='C') -#define mm_is_coordinate(typecode)((typecode)[1]=='C') -#define mm_is_dense(typecode) ((typecode)[1]=='A') -#define mm_is_array(typecode) ((typecode)[1]=='A') +#define mm_is_sparse(typecode) ((typecode)[1] == 'C') +#define mm_is_coordinate(typecode) ((typecode)[1] == 'C') +#define mm_is_dense(typecode) ((typecode)[1] == 'A') +#define mm_is_array(typecode) ((typecode)[1] == 'A') -#define mm_is_complex(typecode) ((typecode)[2]=='C') -#define mm_is_real(typecode) ((typecode)[2]=='R') -#define mm_is_pattern(typecode) ((typecode)[2]=='P') -#define mm_is_integer(typecode) ((typecode)[2]=='I') +#define mm_is_complex(typecode) ((typecode)[2] == 'C') +#define mm_is_real(typecode) ((typecode)[2] == 'R') +#define mm_is_pattern(typecode) ((typecode)[2] == 'P') +#define mm_is_integer(typecode) ((typecode)[2] == 'I') -#define mm_is_symmetric(typecode)((typecode)[3]=='S') -#define mm_is_general(typecode) ((typecode)[3]=='G') -#define mm_is_skew(typecode) ((typecode)[3]=='K') -#define mm_is_hermitian(typecode)((typecode)[3]=='H') +#define mm_is_symmetric(typecode) ((typecode)[3] == 'S') +#define mm_is_general(typecode) ((typecode)[3] == 'G') +#define mm_is_skew(typecode) ((typecode)[3] == 'K') +#define mm_is_hermitian(typecode) ((typecode)[3] == 'H') -int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ + int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ -/********************* MM_typecode modify fucntions ***************************/ + /********************* MM_typecode modify fucntions ***************************/ -#define mm_set_matrix(typecode) ((*typecode)[0]='M') -#define mm_set_coordinate(typecode) ((*typecode)[1]='C') -#define mm_set_array(typecode) ((*typecode)[1]='A') -#define mm_set_dense(typecode) mm_set_array(typecode) -#define mm_set_sparse(typecode) mm_set_coordinate(typecode) +#define mm_set_matrix(typecode) ((*typecode)[0] = 'M') +#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C') +#define mm_set_array(typecode) ((*typecode)[1] = 'A') +#define mm_set_dense(typecode) mm_set_array(typecode) +#define mm_set_sparse(typecode) 
mm_set_coordinate(typecode) -#define mm_set_complex(typecode)((*typecode)[2]='C') -#define mm_set_real(typecode) ((*typecode)[2]='R') -#define mm_set_pattern(typecode)((*typecode)[2]='P') -#define mm_set_integer(typecode)((*typecode)[2]='I') +#define mm_set_complex(typecode) ((*typecode)[2] = 'C') +#define mm_set_real(typecode) ((*typecode)[2] = 'R') +#define mm_set_pattern(typecode) ((*typecode)[2] = 'P') +#define mm_set_integer(typecode) ((*typecode)[2] = 'I') -#define mm_set_symmetric(typecode)((*typecode)[3]='S') -#define mm_set_general(typecode)((*typecode)[3]='G') -#define mm_set_skew(typecode) ((*typecode)[3]='K') -#define mm_set_hermitian(typecode)((*typecode)[3]='H') +#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') +#define mm_set_general(typecode) ((*typecode)[3] = 'G') +#define mm_set_skew(typecode) ((*typecode)[3] = 'K') +#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') -#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \ - (*typecode)[2]=' ',(*typecode)[3]='G') +#define mm_clear_typecode(typecode) ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) -/********************* Matrix Market error codes ***************************/ + /********************* Matrix Market error codes ***************************/ -#define MM_COULD_NOT_READ_FILE 11 -#define MM_PREMATURE_EOF 12 -#define MM_NOT_MTX 13 -#define MM_NO_HEADER 14 -#define MM_UNSUPPORTED_TYPE 15 -#define MM_LINE_TOO_LONG 16 -#define MM_COULD_NOT_WRITE_FILE 17 +#define MM_COULD_NOT_READ_FILE 11 +#define MM_PREMATURE_EOF 12 +#define MM_NOT_MTX 13 +#define MM_NO_HEADER 14 +#define MM_UNSUPPORTED_TYPE 15 +#define MM_LINE_TOO_LONG 16 +#define MM_COULD_NOT_WRITE_FILE 17 -/******************** Matrix Market internal definitions ******************** + /******************** Matrix Market internal definitions ******************** - MM_matrix_typecode: 4-character sequence + MM_matrix_typecode: 4-character sequence - ojbect sparse/ data storage - dense type scheme + ojbect sparse/ data storage + dense type scheme - string position: [0] [1] [2] [3] + string position: [0] [1] [2] [3] - Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) - A(array) C(omplex) H(ermitian) - P(attern) S(ymmetric) - I(nteger) K(kew) + Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) + A(array) C(omplex) H(ermitian) + P(attern) S(ymmetric) + I(nteger) K(kew) - ***********************************************************************/ + ***********************************************************************/ -#define MM_MTX_STR "matrix" -#define MM_ARRAY_STR "array" -#define MM_DENSE_STR "array" -#define MM_COORDINATE_STR "coordinate" -#define MM_SPARSE_STR "coordinate" -#define MM_COMPLEX_STR "complex" -#define MM_REAL_STR "real" -#define MM_INT_STR "integer" -#define MM_GENERAL_STR "general" -#define MM_SYMM_STR "symmetric" -#define MM_HERM_STR "hermitian" -#define MM_SKEW_STR "skew-symmetric" -#define MM_PATTERN_STR "pattern" +#define MM_MTX_STR "matrix" +#define MM_ARRAY_STR "array" +#define MM_DENSE_STR "array" +#define MM_COORDINATE_STR "coordinate" +#define MM_SPARSE_STR "coordinate" +#define MM_COMPLEX_STR "complex" +#define MM_REAL_STR "real" +#define MM_INT_STR "integer" +#define MM_GENERAL_STR "general" +#define MM_SYMM_STR "symmetric" +#define MM_HERM_STR "hermitian" +#define MM_SKEW_STR "skew-symmetric" +#define MM_PATTERN_STR "pattern" -/* high level routines */ -int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, 
int **I, int **J, - double **val, MM_typecode *matcode); + /* high level routines */ + int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, double **val, MM_typecode *matcode); -int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode); -int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, - MM_typecode matcode); + int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], double val[], MM_typecode matcode); + int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, MM_typecode matcode); -int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, - double **val_, int **I_, int **J_); + int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, double **val_, int **I_, int **J_); #if defined(__cplusplus) } -#endif /* __cplusplus */ +#endif /* __cplusplus */ #endif diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio_wrapper.cpp b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio_wrapper.cpp index 04e680a9..3f798d53 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio_wrapper.cpp +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/mmio_wrapper.cpp @@ -38,180 +38,125 @@ #endif /* various __inline__ __device__ function to initialize a T_ELEM */ -template -__inline__ T_ELEM cuGet(int); -template <> -__inline__ float cuGet(int x) { - return float(x); +template __inline__ T_ELEM cuGet(int); +template <> __inline__ float cuGet(int x) { return float(x); } + +template <> __inline__ double cuGet(int x) { return double(x); } + +template <> __inline__ cuComplex cuGet(int x) { return (make_cuComplex(float(x), 0.0f)); } + +template <> __inline__ cuDoubleComplex cuGet(int x) { return (make_cuDoubleComplex(double(x), 0.0)); } + +template __inline__ T_ELEM cuGet(int, int); +template <> __inline__ float cuGet(int x, int y) { return float(x); } + +template <> __inline__ double cuGet(int x, int y) { return double(x); } + +template <> __inline__ cuComplex cuGet(int x, int y) { return make_cuComplex(float(x), float(y)); } + +template <> __inline__ cuDoubleComplex cuGet(int x, int y) +{ + return (make_cuDoubleComplex(double(x), double(y))); } -template <> -__inline__ double cuGet(int x) { - return double(x); +template __inline__ T_ELEM cuGet(float); +template <> __inline__ float cuGet(float x) { return float(x); } + +template <> __inline__ double cuGet(float x) { return double(x); } + +template <> __inline__ cuComplex cuGet(float x) { return (make_cuComplex(float(x), 0.0f)); } + +template <> __inline__ cuDoubleComplex cuGet(float x) +{ + return (make_cuDoubleComplex(double(x), 0.0)); } -template <> -__inline__ cuComplex cuGet(int x) { - return (make_cuComplex(float(x), 0.0f)); +template __inline__ T_ELEM cuGet(float, float); +template <> __inline__ float cuGet(float x, float y) { return float(x); } + +template <> __inline__ double cuGet(float x, float y) { return double(x); } + +template <> __inline__ cuComplex cuGet(float x, float y) { return (make_cuComplex(float(x), float(y))); } + +template <> __inline__ cuDoubleComplex cuGet(float x, float y) +{ + return (make_cuDoubleComplex(double(x), double(y))); } -template <> -__inline__ cuDoubleComplex cuGet(int x) { - return 
(make_cuDoubleComplex(double(x), 0.0)); +template __inline__ T_ELEM cuGet(double); +template <> __inline__ float cuGet(double x) { return float(x); } + +template <> __inline__ double cuGet(double x) { return double(x); } + +template <> __inline__ cuComplex cuGet(double x) { return (make_cuComplex(float(x), 0.0f)); } + +template <> __inline__ cuDoubleComplex cuGet(double x) +{ + return (make_cuDoubleComplex(double(x), 0.0)); } -template -__inline__ T_ELEM cuGet(int, int); -template <> -__inline__ float cuGet(int x, int y) { - return float(x); +template __inline__ T_ELEM cuGet(double, double); +template <> __inline__ float cuGet(double x, double y) { return float(x); } + +template <> __inline__ double cuGet(double x, double y) { return double(x); } + +template <> __inline__ cuComplex cuGet(double x, double y) { return (make_cuComplex(float(x), float(y))); } + +template <> __inline__ cuDoubleComplex cuGet(double x, double y) +{ + return (make_cuDoubleComplex(double(x), double(y))); } -template <> -__inline__ double cuGet(int x, int y) { - return double(x); +static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base) +{ + int i; + + /* initialize everything to zero */ + for (i = 0; i < m + 1; i++) { + Ptr[i] = 0; + } + /* count elements in every row */ + Ptr[0] = base; + for (i = 0; i < nnz; i++) { + Ptr[Ind[i] + (1 - base)]++; + } + /* add all the values */ + for (i = 0; i < m; i++) { + Ptr[i + 1] += Ptr[i]; + } } -template <> -__inline__ cuComplex cuGet(int x, int y) { - return make_cuComplex(float(x), float(y)); -} - -template <> -__inline__ cuDoubleComplex cuGet(int x, int y) { - return (make_cuDoubleComplex(double(x), double(y))); -} - -template -__inline__ T_ELEM cuGet(float); -template <> -__inline__ float cuGet(float x) { - return float(x); -} - -template <> -__inline__ double cuGet(float x) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(float x) { - return (make_cuComplex(float(x), 0.0f)); -} - -template <> -__inline__ cuDoubleComplex cuGet(float x) { - return (make_cuDoubleComplex(double(x), 0.0)); -} - -template -__inline__ T_ELEM cuGet(float, float); -template <> -__inline__ float cuGet(float x, float y) { - return float(x); -} - -template <> -__inline__ double cuGet(float x, float y) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(float x, float y) { - return (make_cuComplex(float(x), float(y))); -} - -template <> -__inline__ cuDoubleComplex cuGet(float x, float y) { - return (make_cuDoubleComplex(double(x), double(y))); -} - -template -__inline__ T_ELEM cuGet(double); -template <> -__inline__ float cuGet(double x) { - return float(x); -} - -template <> -__inline__ double cuGet(double x) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(double x) { - return (make_cuComplex(float(x), 0.0f)); -} - -template <> -__inline__ cuDoubleComplex cuGet(double x) { - return (make_cuDoubleComplex(double(x), 0.0)); -} - -template -__inline__ T_ELEM cuGet(double, double); -template <> -__inline__ float cuGet(double x, double y) { - return float(x); -} - -template <> -__inline__ double cuGet(double x, double y) { - return double(x); -} - -template <> -__inline__ cuComplex cuGet(double x, double y) { - return (make_cuComplex(float(x), float(y))); -} - -template <> -__inline__ cuDoubleComplex cuGet(double x, double y) { - return (make_cuDoubleComplex(double(x), double(y))); -} - -static void compress_index(const int *Ind, int nnz, int m, int *Ptr, int base) { - int i; - - /* initialize everything to zero */ - for (i = 0; 
i < m + 1; i++) { - Ptr[i] = 0; - } - /* count elements in every row */ - Ptr[0] = base; - for (i = 0; i < nnz; i++) { - Ptr[Ind[i] + (1 - base)]++; - } - /* add all the values */ - for (i = 0; i < m; i++) { - Ptr[i + 1] += Ptr[i]; - } -} - -struct cooFormat { - int i; - int j; - int p; // permutation +struct cooFormat +{ + int i; + int j; + int p; // permutation }; -int cmp_cooFormat_csr(struct cooFormat *s, struct cooFormat *t) { - if (s->i < t->i) { - return -1; - } else if (s->i > t->i) { - return 1; - } else { - return s->j - t->j; - } +int cmp_cooFormat_csr(struct cooFormat *s, struct cooFormat *t) +{ + if (s->i < t->i) { + return -1; + } + else if (s->i > t->i) { + return 1; + } + else { + return s->j - t->j; + } } -int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t) { - if (s->j < t->j) { - return -1; - } else if (s->j > t->j) { - return 1; - } else { - return s->i - t->i; - } +int cmp_cooFormat_csc(struct cooFormat *s, struct cooFormat *t) +{ + if (s->j < t->j) { + return -1; + } + else if (s->j > t->j) { + return 1; + } + else { + return s->i - t->i; + } } typedef int (*FUNPTR)(const void *, const void *); @@ -222,280 +167,326 @@ static FUNPTR2 fptr_array[2] = { cmp_cooFormat_csc, }; -static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd) { - int i, col, start, end, base_index; - int error_found = 0; +static int verify_pattern(int m, int nnz, int *csrRowPtr, int *csrColInd) +{ + int i, col, start, end, base_index; + int error_found = 0; - if (nnz != (csrRowPtr[m] - csrRowPtr[0])) { - fprintf(stderr, - "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) " - "!= (nnz=%d)\n", - 0, csrRowPtr[0], m, csrRowPtr[m], nnz); - error_found = 1; - } - - base_index = csrRowPtr[0]; - if ((0 != base_index) && (1 != base_index)) { - fprintf(stderr, "Error (base index check failed): base index = %d\n", - base_index); - error_found = 1; - } - - for (i = 0; (!error_found) && (i < m); i++) { - start = csrRowPtr[i] - base_index; - end = csrRowPtr[i + 1] - base_index; - if (start > end) { - fprintf( - stderr, - "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", - i, start + base_index, i + 1, end + base_index); - error_found = 1; - } - for (col = start; col < end; col++) { - if (csrColInd[col] < base_index) { - fprintf( - stderr, - "Error (column vs. base index check failed): csrColInd[%d] < %d\n", - col, base_index); - error_found = 1; - } - if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) { + if (nnz != (csrRowPtr[m] - csrRowPtr[0])) { fprintf(stderr, - "Error (sorting of the column indecis check failed): " - "(csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", - col, csrColInd[col], col + 1, csrColInd[col + 1]); + "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) " + "!= (nnz=%d)\n", + 0, + csrRowPtr[0], + m, + csrRowPtr[m], + nnz); error_found = 1; - } } - } - return error_found; + + base_index = csrRowPtr[0]; + if ((0 != base_index) && (1 != base_index)) { + fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index); + error_found = 1; + } + + for (i = 0; (!error_found) && (i < m); i++) { + start = csrRowPtr[i] - base_index; + end = csrRowPtr[i + 1] - base_index; + if (start > end) { + fprintf(stderr, + "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", + i, + start + base_index, + i + 1, + end + base_index); + error_found = 1; + } + for (col = start; col < end; col++) { + if (csrColInd[col] < base_index) { + fprintf(stderr, "Error (column vs. 
base index check failed): csrColInd[%d] < %d\n", col, base_index); + error_found = 1; + } + if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])) { + fprintf(stderr, + "Error (sorting of the column indecis check failed): " + "(csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", + col, + csrColInd[col], + col + 1, + csrColInd[col + 1]); + error_found = 1; + } + } + } + return error_found; } template -int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m, - int *n, int *nnz, T_ELEM **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix) { - MM_typecode matcode; - double *tempVal; - int *tempRowInd, *tempColInd; - double *tval; - int *trow, *tcol; - int *csrRowPtr, *cscColPtr; - int i, j, error, base, count; - struct cooFormat *work; +int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + T_ELEM **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix) +{ + MM_typecode matcode; + double *tempVal; + int *tempRowInd, *tempColInd; + double *tval; + int *trow, *tcol; + int *csrRowPtr, *cscColPtr; + int i, j, error, base, count; + struct cooFormat *work; - /* read the matrix */ - error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode); - if (error) { - fprintf(stderr, "!!!! can not open file: '%s'\n", filename); - return 1; - } - - /* start error checking */ - if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { - fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n"); - return 1; - } - - if (mm_is_dense(matcode) || mm_is_array(matcode) || - mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { - fprintf( - stderr, - "!!!! dense, array, pattern and integer matrices are not supported\n"); - return 1; - } - - /* if necessary symmetrize the pattern (transform from triangular to full) */ - if ((extendSymMatrix) && (mm_is_symmetric(matcode) || - mm_is_hermitian(matcode) || mm_is_skew(matcode))) { - // count number of non-diagonal elements - count = 0; - for (i = 0; i < (*nnz); i++) { - if (trow[i] != tcol[i]) { - count++; - } + /* read the matrix */ + error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode); + if (error) { + fprintf(stderr, "!!!! can not open file: '%s'\n", filename); + return 1; } - // allocate space for the symmetrized matrix - tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); - tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - tempVal = (double *)malloc((*nnz + count) * sizeof(double)); - } else { - tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); + + /* start error checking */ + if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) { + fprintf(stderr, "!!!! 
complex matrix requires type 'z' or 'c'\n"); + return 1; } - // copy the elements regular and transposed locations - for (j = 0, i = 0; i < (*nnz); i++) { - tempRowInd[j] = trow[i]; - tempColInd[j] = tcol[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - tempVal[j] = tval[i]; - } else { - tempVal[2 * j] = tval[2 * i]; - tempVal[2 * j + 1] = tval[2 * i + 1]; - } - j++; - if (trow[i] != tcol[i]) { - tempRowInd[j] = tcol[i]; - tempColInd[j] = trow[i]; - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - if (mm_is_skew(matcode)) { - tempVal[j] = -tval[i]; - } else { - tempVal[j] = tval[i]; - } - } else { - if (mm_is_hermitian(matcode)) { - tempVal[2 * j] = tval[2 * i]; - tempVal[2 * j + 1] = -tval[2 * i + 1]; - } else { - tempVal[2 * j] = tval[2 * i]; - tempVal[2 * j + 1] = tval[2 * i + 1]; - } + + if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/) { + fprintf(stderr, "!!!! dense, array, pattern and integer matrices are not supported\n"); + return 1; + } + + /* if necessary symmetrize the pattern (transform from triangular to full) */ + if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))) { + // count number of non-diagonal elements + count = 0; + for (i = 0; i < (*nnz); i++) { + if (trow[i] != tcol[i]) { + count++; + } } - j++; - } + // allocate space for the symmetrized matrix + tempRowInd = (int *)malloc((*nnz + count) * sizeof(int)); + tempColInd = (int *)malloc((*nnz + count) * sizeof(int)); + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal = (double *)malloc((*nnz + count) * sizeof(double)); + } + else { + tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double)); + } + // copy the elements regular and transposed locations + for (j = 0, i = 0; i < (*nnz); i++) { + tempRowInd[j] = trow[i]; + tempColInd[j] = tcol[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + tempVal[j] = tval[i]; + } + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; + } + j++; + if (trow[i] != tcol[i]) { + tempRowInd[j] = tcol[i]; + tempColInd[j] = trow[i]; + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + if (mm_is_skew(matcode)) { + tempVal[j] = -tval[i]; + } + else { + tempVal[j] = tval[i]; + } + } + else { + if (mm_is_hermitian(matcode)) { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = -tval[2 * i + 1]; + } + else { + tempVal[2 * j] = tval[2 * i]; + tempVal[2 * j + 1] = tval[2 * i + 1]; + } + } + j++; + } + } + (*nnz) += count; + // free temporary storage + free(trow); + free(tcol); + free(tval); } - (*nnz) += count; - // free temporary storage - free(trow); - free(tcol); - free(tval); - } else { - tempRowInd = trow; - tempColInd = tcol; - tempVal = tval; - } - // life time of (trow, tcol, tval) is over. - // please use COO format (tempRowInd, tempColInd, tempVal) - - // use qsort to sort COO format - work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz)); - if (NULL == work) { - fprintf(stderr, "!!!! 
allocation error, malloc failed\n"); - return 1; - } - for (i = 0; i < (*nnz); i++) { - work[i].i = tempRowInd[i]; - work[i].j = tempColInd[i]; - work[i].p = i; // permutation is identity - } - - if (csrFormat) { - /* create row-major ordering of indices (sorted by row and within each row - * by column) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]); - } else { - /* create column-major ordering of indices (sorted by column and within each - * column by row) */ - qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]); - } - - // (tempRowInd, tempColInd) is sorted either by row-major or by col-major - for (i = 0; i < (*nnz); i++) { - tempRowInd[i] = work[i].i; - tempColInd[i] = work[i].j; - } - - // setup base - // check if there is any row/col 0, if so base-0 - // check if there is any row/col equal to matrix dimension m/n, if so base-1 - int base0 = 0; - int base1 = 0; - for (i = 0; i < (*nnz); i++) { - const int row = tempRowInd[i]; - const int col = tempColInd[i]; - if ((0 == row) || (0 == col)) { - base0 = 1; + else { + tempRowInd = trow; + tempColInd = tcol; + tempVal = tval; } - if ((*m == row) || (*n == col)) { - base1 = 1; + // life time of (trow, tcol, tval) is over. + // please use COO format (tempRowInd, tempColInd, tempVal) + + // use qsort to sort COO format + work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz)); + if (NULL == work) { + fprintf(stderr, "!!!! allocation error, malloc failed\n"); + return 1; + } + for (i = 0; i < (*nnz); i++) { + work[i].i = tempRowInd[i]; + work[i].j = tempColInd[i]; + work[i].p = i; // permutation is identity } - } - if (base0 && base1) { - printf("Error: input matrix is base-0 and base-1 \n"); - return 1; - } - base = 0; - if (base1) { - base = 1; - } - - /* compress the appropriate indices */ - if (csrFormat) { - /* CSR format (assuming row-major format) */ - csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0])); - if (!csrRowPtr) return 1; - compress_index(tempRowInd, *nnz, *m, csrRowPtr, base); - - *aRowInd = csrRowPtr; - *aColInd = (int *)malloc((*nnz) * sizeof(int)); - } else { - /* CSC format (assuming column-major format) */ - cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0])); - if (!cscColPtr) return 1; - compress_index(tempColInd, *nnz, *n, cscColPtr, base); - - *aColInd = cscColPtr; - *aRowInd = (int *)malloc((*nnz) * sizeof(int)); - } - - /* transfrom the matrix values of type double into one of the cusparse library - * types */ - *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM)); - - for (i = 0; i < (*nnz); i++) { if (csrFormat) { - (*aColInd)[i] = tempColInd[i]; - } else { - (*aRowInd)[i] = tempRowInd[i]; + /* create row-major ordering of indices (sorted by row and within each row + * by column) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]); } - if (mm_is_real(matcode) || mm_is_integer(matcode)) { - (*aVal)[i] = cuGet(tempVal[work[i].p]); - } else { - (*aVal)[i] = - cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); + else { + /* create column-major ordering of indices (sorted by column and within each + * column by row) */ + qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]); } - } - /* check for corruption */ - int error_found; - if (csrFormat) { - error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); - } else { - error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); - } - if (error_found) { - fprintf(stderr, "!!!! 
verify_pattern failed\n"); - return 1; - } + // (tempRowInd, tempColInd) is sorted either by row-major or by col-major + for (i = 0; i < (*nnz); i++) { + tempRowInd[i] = work[i].i; + tempColInd[i] = work[i].j; + } - /* cleanup and exit */ - free(work); - free(tempVal); - free(tempColInd); - free(tempRowInd); + // setup base + // check if there is any row/col 0, if so base-0 + // check if there is any row/col equal to matrix dimension m/n, if so base-1 + int base0 = 0; + int base1 = 0; + for (i = 0; i < (*nnz); i++) { + const int row = tempRowInd[i]; + const int col = tempColInd[i]; + if ((0 == row) || (0 == col)) { + base0 = 1; + } + if ((*m == row) || (*n == col)) { + base1 = 1; + } + } + if (base0 && base1) { + printf("Error: input matrix is base-0 and base-1 \n"); + return 1; + } - return 0; + base = 0; + if (base1) { + base = 1; + } + + /* compress the appropriate indices */ + if (csrFormat) { + /* CSR format (assuming row-major format) */ + csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0])); + if (!csrRowPtr) + return 1; + compress_index(tempRowInd, *nnz, *m, csrRowPtr, base); + + *aRowInd = csrRowPtr; + *aColInd = (int *)malloc((*nnz) * sizeof(int)); + } + else { + /* CSC format (assuming column-major format) */ + cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0])); + if (!cscColPtr) + return 1; + compress_index(tempColInd, *nnz, *n, cscColPtr, base); + + *aColInd = cscColPtr; + *aRowInd = (int *)malloc((*nnz) * sizeof(int)); + } + + /* transfrom the matrix values of type double into one of the cusparse library + * types */ + *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM)); + + for (i = 0; i < (*nnz); i++) { + if (csrFormat) { + (*aColInd)[i] = tempColInd[i]; + } + else { + (*aRowInd)[i] = tempRowInd[i]; + } + if (mm_is_real(matcode) || mm_is_integer(matcode)) { + (*aVal)[i] = cuGet(tempVal[work[i].p]); + } + else { + (*aVal)[i] = cuGet(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]); + } + } + + /* check for corruption */ + int error_found; + if (csrFormat) { + error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd); + } + else { + error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd); + } + if (error_found) { + fprintf(stderr, "!!!! 
verify_pattern failed\n"); + return 1; + } + + /* cleanup and exit */ + free(work); + free(tempVal); + free(tempColInd); + free(tempRowInd); + + return 0; } /* specific instantiation */ -template int loadMMSparseMatrix(char *filename, char elem_type, - bool csrFormat, int *m, int *n, int *nnz, - float **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + float **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix(char *filename, char elem_type, - bool csrFormat, int *m, int *n, - int *nnz, double **aVal, int **aRowInd, - int **aColInd, int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + double **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix(char *filename, char elem_type, - bool csrFormat, int *m, int *n, - int *nnz, cuComplex **aVal, - int **aRowInd, int **aColInd, - int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); -template int loadMMSparseMatrix( - char *filename, char elem_type, bool csrFormat, int *m, int *n, int *nnz, - cuDoubleComplex **aVal, int **aRowInd, int **aColInd, int extendSymMatrix); +template int loadMMSparseMatrix(char *filename, + char elem_type, + bool csrFormat, + int *m, + int *n, + int *nnz, + cuDoubleComplex **aVal, + int **aRowInd, + int **aColInd, + int extendSymMatrix); diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/README.md b/Samples/4_CUDA_Libraries/cudaNvSci/README.md index cda2ac49..d2f102dc 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSci/README.md +++ b/Samples/4_CUDA_Libraries/cudaNvSci/README.md @@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.cpp b/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.cpp index 988df3d4..7cb3a530 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.cpp +++ b/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.cpp @@ -26,585 +26,595 @@ */ #include "cudaNvSci.h" -#include + #include +#include #include #include -std::mutex m_mutex; +std::mutex m_mutex; std::condition_variable m_condVar; -bool workSubmitted = false; +bool workSubmitted = false; -class cudaNvSciSignal { - private: - NvSciSyncModule m_syncModule; - NvSciBufModule m_bufModule; +class cudaNvSciSignal +{ +private: + NvSciSyncModule m_syncModule; + NvSciBufModule m_bufModule; - NvSciSyncAttrList m_syncAttrList; - NvSciSyncFence *m_fence; + NvSciSyncAttrList m_syncAttrList; + NvSciSyncFence *m_fence; - NvSciBufAttrList m_rawBufAttrList; - NvSciBufAttrList m_imageBufAttrList; - NvSciBufAttrList m_buffAttrListOut[2]; - NvSciBufAttrKeyValuePair pairArrayOut[10]; + NvSciBufAttrList m_rawBufAttrList; + NvSciBufAttrList m_imageBufAttrList; + NvSciBufAttrList m_buffAttrListOut[2]; + NvSciBufAttrKeyValuePair pairArrayOut[10]; - cudaExternalMemory_t extMemRawBuf, extMemImageBuf; - cudaMipmappedArray_t d_mipmapArray; - cudaArray_t d_mipLevelArray; - cudaTextureObject_t texObject; - cudaExternalSemaphore_t signalSem; + cudaExternalMemory_t extMemRawBuf, extMemImageBuf; + cudaMipmappedArray_t d_mipmapArray; + cudaArray_t d_mipLevelArray; + cudaTextureObject_t texObject; + cudaExternalSemaphore_t signalSem; - cudaStream_t streamToRun; - int m_cudaDeviceId; - CUuuid m_devUUID; - uint64_t m_imageWidth; - uint64_t m_imageHeight; - void *d_outputBuf; - size_t m_bufSize; + cudaStream_t streamToRun; + int m_cudaDeviceId; + CUuuid m_devUUID; + uint64_t m_imageWidth; + uint64_t m_imageHeight; + void *d_outputBuf; + size_t m_bufSize; - public: - cudaNvSciSignal(NvSciBufModule bufModule, NvSciSyncModule syncModule, - int cudaDeviceId, int bufSize, uint64_t imageWidth, - uint64_t imageHeight, NvSciSyncFence *fence) - : m_syncModule(syncModule), - m_bufModule(bufModule), - m_cudaDeviceId(cudaDeviceId), - d_outputBuf(NULL), - m_bufSize(bufSize), - m_imageWidth(imageWidth), - m_imageHeight(imageHeight), - m_fence(fence) { - initCuda(); +public: + cudaNvSciSignal(NvSciBufModule bufModule, + NvSciSyncModule syncModule, + int cudaDeviceId, + int bufSize, + uint64_t imageWidth, + uint64_t imageHeight, + NvSciSyncFence *fence) + : m_syncModule(syncModule) + , m_bufModule(bufModule) + , m_cudaDeviceId(cudaDeviceId) + , d_outputBuf(NULL) + , m_bufSize(bufSize) + , m_imageWidth(imageWidth) + , m_imageHeight(imageHeight) + , m_fence(fence) + { + initCuda(); - checkNvSciErrors(NvSciSyncAttrListCreate(m_syncModule, &m_syncAttrList)); - checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_rawBufAttrList)); - checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_imageBufAttrList)); + checkNvSciErrors(NvSciSyncAttrListCreate(m_syncModule, &m_syncAttrList)); + checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_rawBufAttrList)); + checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_imageBufAttrList)); - setRawBufAttrList(m_bufSize); - setImageBufAttrList(m_imageWidth, m_imageHeight); + setRawBufAttrList(m_bufSize); + setImageBufAttrList(m_imageWidth, m_imageHeight); - checkCudaErrors(cudaDeviceGetNvSciSyncAttributes( - m_syncAttrList, m_cudaDeviceId, cudaNvSciSyncAttrSignal)); - } - - ~cudaNvSciSignal() { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - 
checkCudaErrors(cudaFreeMipmappedArray(d_mipmapArray)); - checkCudaErrors(cudaFree(d_outputBuf)); - checkCudaErrors(cudaDestroyExternalSemaphore(signalSem)); - checkCudaErrors(cudaDestroyExternalMemory(extMemRawBuf)); - checkCudaErrors(cudaDestroyExternalMemory(extMemImageBuf)); - checkCudaErrors(cudaDestroyTextureObject(texObject)); - checkCudaErrors(cudaStreamDestroy(streamToRun)); - } - - void initCuda() { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - checkCudaErrors( - cudaStreamCreateWithFlags(&streamToRun, cudaStreamNonBlocking)); - - int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute( - &major, cudaDevAttrComputeCapabilityMajor, m_cudaDeviceId)); - checkCudaErrors(cudaDeviceGetAttribute( - &minor, cudaDevAttrComputeCapabilityMinor, m_cudaDeviceId)); - printf( - "[cudaNvSciSignal] GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - m_cudaDeviceId, _ConvertSMVer2ArchName(major, minor), major, minor); - -#ifdef cuDeviceGetUuid_v2 - CUresult res = cuDeviceGetUuid_v2(&m_devUUID, m_cudaDeviceId); -#else - CUresult res = cuDeviceGetUuid(&m_devUUID, m_cudaDeviceId); -#endif - - if (res != CUDA_SUCCESS) { - fprintf(stderr, "Driver API error = %04d \n", res); - exit(EXIT_FAILURE); - } - } - - void setRawBufAttrList(uint64_t size) { - NvSciBufType bufType = NvSciBufType_RawBuffer; - bool cpuAccess = false; - NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - NvSciBufAttrKeyValuePair rawBufAttrs[] = { - {NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)}, - {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)}, - {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)}, - {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)}, - {NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)}, - }; - - checkNvSciErrors(NvSciBufAttrListSetAttrs( - m_rawBufAttrList, rawBufAttrs, - sizeof(rawBufAttrs) / sizeof(NvSciBufAttrKeyValuePair))); - } - - void setImageBufAttrList(uint32_t width, uint32_t height) { - NvSciBufType bufType = NvSciBufType_Image; - NvSciBufAttrValImageLayoutType layout = NvSciBufImage_BlockLinearType; - NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - - uint32_t planeCount = 1; - uint32_t planeWidths[] = {width}; - uint32_t planeHeights[] = {height}; - uint64_t lrpad = 0, tbpad = 100; - - bool cpuAccessFlag = false; - - NvSciBufAttrValColorFmt planecolorfmts[] = {NvSciColor_B8G8R8A8}; - NvSciBufAttrValColorStd planecolorstds[] = {NvSciColorStd_SRGB}; - NvSciBufAttrValImageScanType planescantype[] = {NvSciBufScan_InterlaceType}; - - NvSciBufAttrKeyValuePair imgBufAttrs[] = { - {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)}, - {NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount)}, - {NvSciBufImageAttrKey_Layout, &layout, sizeof(layout)}, - {NvSciBufImageAttrKey_TopPadding, &tbpad, sizeof(tbpad)}, - {NvSciBufImageAttrKey_BottomPadding, &tbpad, sizeof(tbpad)}, - {NvSciBufImageAttrKey_LeftPadding, &lrpad, sizeof(lrpad)}, - {NvSciBufImageAttrKey_RightPadding, &lrpad, sizeof(lrpad)}, - {NvSciBufImageAttrKey_PlaneColorFormat, planecolorfmts, - sizeof(planecolorfmts)}, - {NvSciBufImageAttrKey_PlaneColorStd, planecolorstds, - sizeof(planecolorstds)}, - {NvSciBufImageAttrKey_PlaneWidth, planeWidths, sizeof(planeWidths)}, - {NvSciBufImageAttrKey_PlaneHeight, planeHeights, sizeof(planeHeights)}, - {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccessFlag, - sizeof(cpuAccessFlag)}, - {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)}, - {NvSciBufImageAttrKey_PlaneScanType, 
planescantype, - sizeof(planescantype)}, - {NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)}, - }; - - checkNvSciErrors(NvSciBufAttrListSetAttrs( - m_imageBufAttrList, imgBufAttrs, - sizeof(imgBufAttrs) / sizeof(NvSciBufAttrKeyValuePair))); - } - - NvSciSyncAttrList getNvSciSyncAttrList() { return m_syncAttrList; } - - NvSciBufAttrList getNvSciRawBufAttrList() { return m_rawBufAttrList; } - - NvSciBufAttrList getNvSciImageBufAttrList() { return m_imageBufAttrList; } - - void runRotateImageAndSignal(unsigned char *imageData) { - int numOfGPUs = 0; - checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - - copyDataToImageArray(imageData); - createTexture(); - - float angle = 0.5f; // angle to rotate image by (in radians) - rotateKernel(texObject, angle, (unsigned int *)d_outputBuf, m_imageWidth, - m_imageHeight, streamToRun); - - signalExternalSemaphore(); - } - - void cudaImportNvSciSemaphore(NvSciSyncObj syncObj) { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - - cudaExternalSemaphoreHandleDesc extSemDesc; - memset(&extSemDesc, 0, sizeof(extSemDesc)); - extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; - extSemDesc.handle.nvSciSyncObj = (void *)syncObj; - - checkCudaErrors(cudaImportExternalSemaphore(&signalSem, &extSemDesc)); - } - - void signalExternalSemaphore() { - cudaExternalSemaphoreSignalParams signalParams; - memset(&signalParams, 0, sizeof(signalParams)); - // For cross-process signaler-waiter applications need to use NvSciIpc - // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence - // across process. This step is optional in single-process. - signalParams.params.nvSciSync.fence = (void *)m_fence; - signalParams.flags = 0; - - checkCudaErrors(cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams, - 1, streamToRun)); - } - - void cudaImportNvSciRawBuf(NvSciBufObj inputBufObj) { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - checkNvSciErrors( - NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[0])); - - memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10); - pairArrayOut[0].key = NvSciBufRawBufferAttrKey_Size; - - checkNvSciErrors( - NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], pairArrayOut, 1)); - - uint64_t size = *(uint64_t *)pairArrayOut[0].value; - - cudaExternalMemoryHandleDesc memHandleDesc; - memset(&memHandleDesc, 0, sizeof(memHandleDesc)); - memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; - memHandleDesc.handle.nvSciBufObject = inputBufObj; - memHandleDesc.size = size; - checkCudaErrors(cudaImportExternalMemory(&extMemRawBuf, &memHandleDesc)); - - cudaExternalMemoryBufferDesc bufferDesc; - memset(&bufferDesc, 0, sizeof(bufferDesc)); - bufferDesc.offset = 0; - bufferDesc.size = size; - m_bufSize = size; - checkCudaErrors(cudaExternalMemoryGetMappedBuffer( - &d_outputBuf, extMemRawBuf, &bufferDesc)); - } - - void cudaImportNvSciImage(NvSciBufObj inputBufObj) { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - checkNvSciErrors( - NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[1])); - - memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10); - pairArrayOut[0].key = NvSciBufImageAttrKey_Size; - pairArrayOut[1].key = NvSciBufImageAttrKey_Alignment; - pairArrayOut[2].key = NvSciBufImageAttrKey_PlaneCount; - pairArrayOut[3].key = NvSciBufImageAttrKey_PlaneWidth; - pairArrayOut[4].key = NvSciBufImageAttrKey_PlaneHeight; - - checkNvSciErrors( - NvSciBufAttrListGetAttrs(m_buffAttrListOut[1], pairArrayOut, 5)); - - 
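The import path in this class follows one chain regardless of buffer type: query the NvSciBufObj attribute list for the allocation's geometry, import the object as CUDA external memory of type cudaExternalMemoryHandleTypeNvSciBuf, then map it either as a linear buffer or, as in this method, as a one-level mipmapped array whose level 0 backs the texture. A minimal sketch of the image case, assuming bufObj is a fully allocated NvSciBufObj and that desc and extent carry the 8-bit RGBA channel format and the queried plane width/height:

    // Query the allocation size from the NvSciBuf attribute list.
    NvSciBufAttrList attrs = NULL;
    NvSciBufAttrKeyValuePair pair[1] = {};
    pair[0].key = NvSciBufImageAttrKey_Size;
    checkNvSciErrors(NvSciBufObjGetAttrList(bufObj, &attrs));
    checkNvSciErrors(NvSciBufAttrListGetAttrs(attrs, pair, 1));

    // Import the NvSciBuf object as CUDA external memory.
    cudaExternalMemoryHandleDesc memDesc = {};
    memDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
    memDesc.handle.nvSciBufObject = bufObj;
    memDesc.size = *(uint64_t *)pair[0].value;
    cudaExternalMemory_t extMem;
    checkCudaErrors(cudaImportExternalMemory(&extMem, &memDesc));

    // Map the imported memory as a one-level mipmapped array and take level 0.
    cudaExternalMemoryMipmappedArrayDesc mmDesc = {};
    mmDesc.formatDesc = desc; // channel format built by the caller
    mmDesc.extent = extent;   // width/height from the queried plane attributes
    mmDesc.numLevels = 1;
    cudaMipmappedArray_t mipArray;
    checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&mipArray, extMem, &mmDesc));
    cudaArray_t level0;
    checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, mipArray, 0));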
uint64_t size = *(uint64_t *)pairArrayOut[0].value; - uint64_t alignment = *(uint64_t *)pairArrayOut[1].value; - uint64_t planeCount = *(uint64_t *)pairArrayOut[2].value; - uint64_t imageWidth = *(uint64_t *)pairArrayOut[3].value; - uint64_t imageHeight = *(uint64_t *)pairArrayOut[4].value; - - cudaExternalMemoryHandleDesc memHandleDesc; - memset(&memHandleDesc, 0, sizeof(memHandleDesc)); - memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; - memHandleDesc.handle.nvSciBufObject = inputBufObj; - memHandleDesc.size = size; - checkCudaErrors(cudaImportExternalMemory(&extMemImageBuf, &memHandleDesc)); - - cudaExtent extent = {}; - memset(&extent, 0, sizeof(extent)); - extent.width = imageWidth; - extent.height = imageHeight; - extent.depth = 0; - - cudaChannelFormatDesc desc; - desc.x = 8; - desc.y = 8; - desc.z = 8; - desc.w = 8; - desc.f = cudaChannelFormatKindUnsigned; - - cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0}; - mipmapDesc.offset = 0; - mipmapDesc.formatDesc = desc; - mipmapDesc.extent = extent; - mipmapDesc.flags = 0; - - mipmapDesc.numLevels = 1; - checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray( - &d_mipmapArray, extMemImageBuf, &mipmapDesc)); - } - - void copyDataToImageArray(unsigned char *imageData) { - uint32_t mipLevelId = 0; - checkCudaErrors(cudaGetMipmappedArrayLevel(&d_mipLevelArray, d_mipmapArray, - mipLevelId)); - - checkCudaErrors(cudaMemcpy2DToArrayAsync( - d_mipLevelArray, 0, 0, imageData, m_imageWidth * sizeof(unsigned int), - m_imageWidth * sizeof(unsigned int), m_imageHeight, - cudaMemcpyHostToDevice, streamToRun)); - } - - void createTexture() { - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_mipLevelArray; - - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeNormalizedFloat; - - checkCudaErrors( - cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL)); - } -}; - -class cudaNvSciWait { - private: - NvSciSyncModule m_syncModule; - NvSciBufModule m_bufModule; - - NvSciSyncAttrList m_syncAttrList; - NvSciBufAttrList m_rawBufAttrList; - NvSciBufAttrList m_buffAttrListOut; - NvSciBufAttrKeyValuePair pairArrayOut[10]; - NvSciSyncFence *m_fence; - - cudaExternalMemory_t extMemRawBuf; - cudaExternalSemaphore_t waitSem; - cudaStream_t streamToRun; - int m_cudaDeviceId; - CUuuid m_devUUID; - void *d_outputBuf; - size_t m_bufSize; - size_t imageWidth; - size_t imageHeight; - - public: - cudaNvSciWait(NvSciBufModule bufModule, NvSciSyncModule syncModule, - int cudaDeviceId, int bufSize, NvSciSyncFence *fence) - : m_bufModule(bufModule), - m_syncModule(syncModule), - m_cudaDeviceId(cudaDeviceId), - m_bufSize(bufSize), - m_fence(fence) { - initCuda(); - checkNvSciErrors(NvSciSyncAttrListCreate(m_syncModule, &m_syncAttrList)); - checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_rawBufAttrList)); - - setRawBufAttrList(m_bufSize); - checkCudaErrors(cudaDeviceGetNvSciSyncAttributes( - m_syncAttrList, m_cudaDeviceId, cudaNvSciSyncAttrWait)); - } - - ~cudaNvSciWait() { - checkCudaErrors(cudaStreamDestroy(streamToRun)); - checkCudaErrors(cudaDestroyExternalSemaphore(waitSem)); - checkCudaErrors(cudaDestroyExternalMemory(extMemRawBuf)); - checkCudaErrors(cudaFree(d_outputBuf)); - } - - void initCuda() { - 
checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - checkCudaErrors( - cudaStreamCreateWithFlags(&streamToRun, cudaStreamNonBlocking)); -#ifdef cuDeviceGetUuid_v2 - CUresult res = cuDeviceGetUuid_v2(&m_devUUID, m_cudaDeviceId); -#else - CUresult res = cuDeviceGetUuid(&m_devUUID, m_cudaDeviceId); -#endif - if (res != CUDA_SUCCESS) { - fprintf(stderr, "Driver API error = %04d \n", res); - exit(EXIT_FAILURE); + checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(m_syncAttrList, m_cudaDeviceId, cudaNvSciSyncAttrSignal)); } - int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute( - &major, cudaDevAttrComputeCapabilityMajor, m_cudaDeviceId)); - checkCudaErrors(cudaDeviceGetAttribute( - &minor, cudaDevAttrComputeCapabilityMinor, m_cudaDeviceId)); - printf( - "[cudaNvSciWait] GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - m_cudaDeviceId, _ConvertSMVer2ArchName(major, minor), major, minor); - } + ~cudaNvSciSignal() + { + checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + checkCudaErrors(cudaFreeMipmappedArray(d_mipmapArray)); + checkCudaErrors(cudaFree(d_outputBuf)); + checkCudaErrors(cudaDestroyExternalSemaphore(signalSem)); + checkCudaErrors(cudaDestroyExternalMemory(extMemRawBuf)); + checkCudaErrors(cudaDestroyExternalMemory(extMemImageBuf)); + checkCudaErrors(cudaDestroyTextureObject(texObject)); + checkCudaErrors(cudaStreamDestroy(streamToRun)); + } - void setRawBufAttrList(uint64_t size) { - NvSciBufType bufType = NvSciBufType_RawBuffer; - bool cpuAccess = false; - NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - NvSciBufAttrKeyValuePair rawBufAttrs[] = { - {NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)}, - {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)}, - {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)}, - {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)}, - {NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)}, - }; + void initCuda() + { + checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + checkCudaErrors(cudaStreamCreateWithFlags(&streamToRun, cudaStreamNonBlocking)); - checkNvSciErrors(NvSciBufAttrListSetAttrs( - m_rawBufAttrList, rawBufAttrs, - sizeof(rawBufAttrs) / sizeof(NvSciBufAttrKeyValuePair))); - } + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, m_cudaDeviceId)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, m_cudaDeviceId)); + printf("[cudaNvSciSignal] GPU Device %d: \"%s\" with compute capability " + "%d.%d\n\n", + m_cudaDeviceId, + _ConvertSMVer2ArchName(major, minor), + major, + minor); - NvSciSyncAttrList getNvSciSyncAttrList() { return m_syncAttrList; } +#ifdef cuDeviceGetUuid_v2 + CUresult res = cuDeviceGetUuid_v2(&m_devUUID, m_cudaDeviceId); +#else + CUresult res = cuDeviceGetUuid(&m_devUUID, m_cudaDeviceId); +#endif - NvSciBufAttrList getNvSciRawBufAttrList() { return m_rawBufAttrList; } + if (res != CUDA_SUCCESS) { + fprintf(stderr, "Driver API error = %04d \n", res); + exit(EXIT_FAILURE); + } + } - void runImageGrayscale(std::string image_filename, size_t imageWidth, - size_t imageHeight) { - int numOfGPUs = 0; - checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + void setRawBufAttrList(uint64_t size) + { + NvSciBufType bufType = NvSciBufType_RawBuffer; + bool cpuAccess = false; + NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; + NvSciBufAttrKeyValuePair rawBufAttrs[] = { + 
{NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)}, + {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)}, + {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)}, + {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)}, + {NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)}, + }; - waitExternalSemaphore(); - launchGrayScaleKernel((unsigned int *)d_outputBuf, image_filename, - imageWidth, imageHeight, streamToRun); - } + checkNvSciErrors(NvSciBufAttrListSetAttrs( + m_rawBufAttrList, rawBufAttrs, sizeof(rawBufAttrs) / sizeof(NvSciBufAttrKeyValuePair))); + } - void cudaImportNvSciSemaphore(NvSciSyncObj syncObj) { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + void setImageBufAttrList(uint32_t width, uint32_t height) + { + NvSciBufType bufType = NvSciBufType_Image; + NvSciBufAttrValImageLayoutType layout = NvSciBufImage_BlockLinearType; + NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - cudaExternalSemaphoreHandleDesc extSemDesc; - memset(&extSemDesc, 0, sizeof(extSemDesc)); - extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; - extSemDesc.handle.nvSciSyncObj = (void *)syncObj; + uint32_t planeCount = 1; + uint32_t planeWidths[] = {width}; + uint32_t planeHeights[] = {height}; + uint64_t lrpad = 0, tbpad = 100; - checkCudaErrors(cudaImportExternalSemaphore(&waitSem, &extSemDesc)); - } + bool cpuAccessFlag = false; - void waitExternalSemaphore() { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + NvSciBufAttrValColorFmt planecolorfmts[] = {NvSciColor_B8G8R8A8}; + NvSciBufAttrValColorStd planecolorstds[] = {NvSciColorStd_SRGB}; + NvSciBufAttrValImageScanType planescantype[] = {NvSciBufScan_InterlaceType}; - cudaExternalSemaphoreWaitParams waitParams; - memset(&waitParams, 0, sizeof(waitParams)); - // For cross-process signaler-waiter applications need to use NvSciIpc - // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence - // across process. This step is optional in single-process. 
- waitParams.params.nvSciSync.fence = (void *)m_fence; - waitParams.flags = 0; + NvSciBufAttrKeyValuePair imgBufAttrs[] = { + {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)}, + {NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount)}, + {NvSciBufImageAttrKey_Layout, &layout, sizeof(layout)}, + {NvSciBufImageAttrKey_TopPadding, &tbpad, sizeof(tbpad)}, + {NvSciBufImageAttrKey_BottomPadding, &tbpad, sizeof(tbpad)}, + {NvSciBufImageAttrKey_LeftPadding, &lrpad, sizeof(lrpad)}, + {NvSciBufImageAttrKey_RightPadding, &lrpad, sizeof(lrpad)}, + {NvSciBufImageAttrKey_PlaneColorFormat, planecolorfmts, sizeof(planecolorfmts)}, + {NvSciBufImageAttrKey_PlaneColorStd, planecolorstds, sizeof(planecolorstds)}, + {NvSciBufImageAttrKey_PlaneWidth, planeWidths, sizeof(planeWidths)}, + {NvSciBufImageAttrKey_PlaneHeight, planeHeights, sizeof(planeHeights)}, + {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccessFlag, sizeof(cpuAccessFlag)}, + {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)}, + {NvSciBufImageAttrKey_PlaneScanType, planescantype, sizeof(planescantype)}, + {NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)}, + }; - checkCudaErrors( - cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, streamToRun)); - } + checkNvSciErrors(NvSciBufAttrListSetAttrs( + m_imageBufAttrList, imgBufAttrs, sizeof(imgBufAttrs) / sizeof(NvSciBufAttrKeyValuePair))); + } - void cudaImportNvSciRawBuf(NvSciBufObj inputBufObj) { - checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + NvSciSyncAttrList getNvSciSyncAttrList() { return m_syncAttrList; } - checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut)); + NvSciBufAttrList getNvSciRawBufAttrList() { return m_rawBufAttrList; } - memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10); - pairArrayOut[0].key = NvSciBufRawBufferAttrKey_Size; + NvSciBufAttrList getNvSciImageBufAttrList() { return m_imageBufAttrList; } - checkNvSciErrors( - NvSciBufAttrListGetAttrs(m_buffAttrListOut, pairArrayOut, 1)); + void runRotateImageAndSignal(unsigned char *imageData) + { + int numOfGPUs = 0; + checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose + checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); - uint64_t size = *(uint64_t *)pairArrayOut[0].value; + copyDataToImageArray(imageData); + createTexture(); - cudaExternalMemoryHandleDesc memHandleDesc; - memset(&memHandleDesc, 0, sizeof(memHandleDesc)); - memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; - memHandleDesc.handle.nvSciBufObject = inputBufObj; - memHandleDesc.size = size; - checkCudaErrors(cudaImportExternalMemory(&extMemRawBuf, &memHandleDesc)); + float angle = 0.5f; // angle to rotate image by (in radians) + rotateKernel(texObject, angle, (unsigned int *)d_outputBuf, m_imageWidth, m_imageHeight, streamToRun); - cudaExternalMemoryBufferDesc bufferDesc; - memset(&bufferDesc, 0, sizeof(bufferDesc)); - bufferDesc.offset = 0; - bufferDesc.size = size; - m_bufSize = size; + signalExternalSemaphore(); + } - checkCudaErrors(cudaExternalMemoryGetMappedBuffer( - &d_outputBuf, extMemRawBuf, &bufferDesc)); - } + void cudaImportNvSciSemaphore(NvSciSyncObj syncObj) + { + checkCudaErrors(cudaSetDevice(m_cudaDeviceId)); + + cudaExternalSemaphoreHandleDesc extSemDesc; + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = (void *)syncObj; + + checkCudaErrors(cudaImportExternalSemaphore(&signalSem, &extSemDesc)); + } + + void signalExternalSemaphore() + { + 
cudaExternalSemaphoreSignalParams signalParams;
+        memset(&signalParams, 0, sizeof(signalParams));
+        // Cross-process signaler-waiter applications need to use the NvSciIpc
+        // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence
+        // across processes. This step is optional in a single-process setup.
+        signalParams.params.nvSciSync.fence = (void *)m_fence;
+        signalParams.flags = 0;
+
+        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams, 1, streamToRun));
+    }
+
+    void cudaImportNvSciRawBuf(NvSciBufObj inputBufObj)
+    {
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+        checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[0]));
+
+        memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
+        pairArrayOut[0].key = NvSciBufRawBufferAttrKey_Size;
+
+        checkNvSciErrors(NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], pairArrayOut, 1));
+
+        uint64_t size = *(uint64_t *)pairArrayOut[0].value;
+
+        cudaExternalMemoryHandleDesc memHandleDesc;
+        memset(&memHandleDesc, 0, sizeof(memHandleDesc));
+        memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
+        memHandleDesc.handle.nvSciBufObject = inputBufObj;
+        memHandleDesc.size = size;
+        checkCudaErrors(cudaImportExternalMemory(&extMemRawBuf, &memHandleDesc));
+
+        cudaExternalMemoryBufferDesc bufferDesc;
+        memset(&bufferDesc, 0, sizeof(bufferDesc));
+        bufferDesc.offset = 0;
+        bufferDesc.size = size;
+        m_bufSize = size;
+        checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&d_outputBuf, extMemRawBuf, &bufferDesc));
+    }
+
+    void cudaImportNvSciImage(NvSciBufObj inputBufObj)
+    {
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+        checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[1]));
+
+        memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
+        pairArrayOut[0].key = NvSciBufImageAttrKey_Size;
+        pairArrayOut[1].key = NvSciBufImageAttrKey_Alignment;
+        pairArrayOut[2].key = NvSciBufImageAttrKey_PlaneCount;
+        pairArrayOut[3].key = NvSciBufImageAttrKey_PlaneWidth;
+        pairArrayOut[4].key = NvSciBufImageAttrKey_PlaneHeight;
+
+        checkNvSciErrors(NvSciBufAttrListGetAttrs(m_buffAttrListOut[1], pairArrayOut, 5));
+
+        uint64_t size = *(uint64_t *)pairArrayOut[0].value;
+        uint64_t alignment = *(uint64_t *)pairArrayOut[1].value;
+        uint64_t planeCount = *(uint64_t *)pairArrayOut[2].value;
+        uint64_t imageWidth = *(uint64_t *)pairArrayOut[3].value;
+        uint64_t imageHeight = *(uint64_t *)pairArrayOut[4].value;
+
+        cudaExternalMemoryHandleDesc memHandleDesc;
+        memset(&memHandleDesc, 0, sizeof(memHandleDesc));
+        memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
+        memHandleDesc.handle.nvSciBufObject = inputBufObj;
+        memHandleDesc.size = size;
+        checkCudaErrors(cudaImportExternalMemory(&extMemImageBuf, &memHandleDesc));
+
+        cudaExtent extent = {};
+        memset(&extent, 0, sizeof(extent));
+        extent.width = imageWidth;
+        extent.height = imageHeight;
+        extent.depth = 0;
+
+        cudaChannelFormatDesc desc;
+        desc.x = 8;
+        desc.y = 8;
+        desc.z = 8;
+        desc.w = 8;
+        desc.f = cudaChannelFormatKindUnsigned;
+
+        cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0};
+        mipmapDesc.offset = 0;
+        mipmapDesc.formatDesc = desc;
+        mipmapDesc.extent = extent;
+        mipmapDesc.flags = 0;
+
+        mipmapDesc.numLevels = 1;
+        checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&d_mipmapArray, extMemImageBuf, &mipmapDesc));
+    }
+
+    void copyDataToImageArray(unsigned char *imageData)
+    {
+        uint32_t mipLevelId = 0;
+        checkCudaErrors(cudaGetMipmappedArrayLevel(&d_mipLevelArray, d_mipmapArray, mipLevelId));
+
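/*
 * [editorial aside - not part of the patch] copyDataToImageArray() works on the
 * chain built by cudaImportNvSciImage() above: the NvSciBuf image is imported
 * with cudaImportExternalMemory(), mapped as a one-level mipmapped array, and
 * level 0 is fetched so plain array copies can target it. A minimal sketch of
 * that chain, assuming a single-plane RGBA8 image (variable names hypothetical):
 *
 *     cudaMipmappedArray_t mipmap;   // from cudaExternalMemoryGetMappedMipmappedArray()
 *     cudaArray_t          level0;
 *     checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, mipmap, 0));
 *     size_t rowBytes = width * sizeof(unsigned int);  // 4 bytes per RGBA8 texel
 *     checkCudaErrors(cudaMemcpy2DToArrayAsync(level0, 0, 0, hostPixels,
 *                                              rowBytes, rowBytes, height,
 *                                              cudaMemcpyHostToDevice, stream));
 */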
+        checkCudaErrors(cudaMemcpy2DToArrayAsync(d_mipLevelArray,
+                                                 0,
+                                                 0,
+                                                 imageData,
+                                                 m_imageWidth * sizeof(unsigned int),
+                                                 m_imageWidth * sizeof(unsigned int),
+                                                 m_imageHeight,
+                                                 cudaMemcpyHostToDevice,
+                                                 streamToRun));
+    }
+
+    void createTexture()
+    {
+        cudaResourceDesc texRes;
+        memset(&texRes, 0, sizeof(cudaResourceDesc));
+
+        texRes.resType = cudaResourceTypeArray;
+        texRes.res.array.array = d_mipLevelArray;
+
+        cudaTextureDesc texDescr;
+        memset(&texDescr, 0, sizeof(cudaTextureDesc));
+
+        texDescr.normalizedCoords = true;
+        texDescr.filterMode = cudaFilterModeLinear;
+        texDescr.addressMode[0] = cudaAddressModeWrap;
+        texDescr.addressMode[1] = cudaAddressModeWrap;
+        texDescr.readMode = cudaReadModeNormalizedFloat;
+
+        checkCudaErrors(cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL));
+    }
 };

-void thread_rotateAndSignal(cudaNvSciSignal *cudaNvSciSignalObj,
-                            unsigned char *imageData) {
-  std::lock_guard<std::mutex> guard(m_mutex);
-  cudaNvSciSignalObj->runRotateImageAndSignal(imageData);
-  workSubmitted = true;
-  m_condVar.notify_one();
+class cudaNvSciWait
+{
+private:
+    NvSciSyncModule m_syncModule;
+    NvSciBufModule m_bufModule;
+
+    NvSciSyncAttrList m_syncAttrList;
+    NvSciBufAttrList m_rawBufAttrList;
+    NvSciBufAttrList m_buffAttrListOut;
+    NvSciBufAttrKeyValuePair pairArrayOut[10];
+    NvSciSyncFence *m_fence;
+
+    cudaExternalMemory_t extMemRawBuf;
+    cudaExternalSemaphore_t waitSem;
+    cudaStream_t streamToRun;
+    int m_cudaDeviceId;
+    CUuuid m_devUUID;
+    void *d_outputBuf;
+    size_t m_bufSize;
+    size_t imageWidth;
+    size_t imageHeight;
+
+public:
+    cudaNvSciWait(NvSciBufModule bufModule,
+                  NvSciSyncModule syncModule,
+                  int cudaDeviceId,
+                  int bufSize,
+                  NvSciSyncFence *fence)
+        : m_bufModule(bufModule)
+        , m_syncModule(syncModule)
+        , m_cudaDeviceId(cudaDeviceId)
+        , m_bufSize(bufSize)
+        , m_fence(fence)
+    {
+        initCuda();
+        checkNvSciErrors(NvSciSyncAttrListCreate(m_syncModule, &m_syncAttrList));
+        checkNvSciErrors(NvSciBufAttrListCreate(m_bufModule, &m_rawBufAttrList));
+
+        setRawBufAttrList(m_bufSize);
+        checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(m_syncAttrList, m_cudaDeviceId, cudaNvSciSyncAttrWait));
+    }
+
+    ~cudaNvSciWait()
+    {
+        checkCudaErrors(cudaStreamDestroy(streamToRun));
+        checkCudaErrors(cudaDestroyExternalSemaphore(waitSem));
+        checkCudaErrors(cudaDestroyExternalMemory(extMemRawBuf));
+        checkCudaErrors(cudaFree(d_outputBuf));
+    }
+
+    void initCuda()
+    {
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+        checkCudaErrors(cudaStreamCreateWithFlags(&streamToRun, cudaStreamNonBlocking));
+#ifdef cuDeviceGetUuid_v2
+        CUresult res = cuDeviceGetUuid_v2(&m_devUUID, m_cudaDeviceId);
+#else
+        CUresult res = cuDeviceGetUuid(&m_devUUID, m_cudaDeviceId);
+#endif
+        if (res != CUDA_SUCCESS) {
+            fprintf(stderr, "Driver API error = %04d \n", res);
+            exit(EXIT_FAILURE);
+        }
+
+        int major = 0, minor = 0;
+        checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, m_cudaDeviceId));
+        checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, m_cudaDeviceId));
+        printf("[cudaNvSciWait] GPU Device %d: \"%s\" with compute capability "
+               "%d.%d\n\n",
+               m_cudaDeviceId,
+               _ConvertSMVer2ArchName(major, minor),
+               major,
+               minor);
+    }
+
+    void setRawBufAttrList(uint64_t size)
+    {
+        NvSciBufType bufType = NvSciBufType_RawBuffer;
+        bool cpuAccess = false;
+        NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
+        NvSciBufAttrKeyValuePair rawBufAttrs[] = {
+            {NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)},
+            {NvSciBufGeneralAttrKey_Types,
&bufType, sizeof(bufType)},
+            {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)},
+            {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
+            {NvSciBufGeneralAttrKey_GpuId, &m_devUUID, sizeof(m_devUUID)},
+        };
+
+        checkNvSciErrors(NvSciBufAttrListSetAttrs(
+            m_rawBufAttrList, rawBufAttrs, sizeof(rawBufAttrs) / sizeof(NvSciBufAttrKeyValuePair)));
+    }
+
+    NvSciSyncAttrList getNvSciSyncAttrList() { return m_syncAttrList; }
+
+    NvSciBufAttrList getNvSciRawBufAttrList() { return m_rawBufAttrList; }
+
+    void runImageGrayscale(std::string image_filename, size_t imageWidth, size_t imageHeight)
+    {
+        int numOfGPUs = 0;
+        checkCudaErrors(cudaGetDeviceCount(&numOfGPUs)); // For cuda init purpose
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+
+        waitExternalSemaphore();
+        launchGrayScaleKernel((unsigned int *)d_outputBuf, image_filename, imageWidth, imageHeight, streamToRun);
+    }
+
+    void cudaImportNvSciSemaphore(NvSciSyncObj syncObj)
+    {
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+
+        cudaExternalSemaphoreHandleDesc extSemDesc;
+        memset(&extSemDesc, 0, sizeof(extSemDesc));
+        extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
+        extSemDesc.handle.nvSciSyncObj = (void *)syncObj;
+
+        checkCudaErrors(cudaImportExternalSemaphore(&waitSem, &extSemDesc));
+    }
+
+    void waitExternalSemaphore()
+    {
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+
+        cudaExternalSemaphoreWaitParams waitParams;
+        memset(&waitParams, 0, sizeof(waitParams));
+        // Cross-process signaler-waiter applications need to use the NvSciIpc
+        // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence
+        // across processes. This step is optional in a single-process setup.
+        waitParams.params.nvSciSync.fence = (void *)m_fence;
+        waitParams.flags = 0;
+
+        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, streamToRun));
+    }
+
+    void cudaImportNvSciRawBuf(NvSciBufObj inputBufObj)
+    {
+        checkCudaErrors(cudaSetDevice(m_cudaDeviceId));
+
+        checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut));
+
+        memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10);
+        pairArrayOut[0].key = NvSciBufRawBufferAttrKey_Size;
+
+        checkNvSciErrors(NvSciBufAttrListGetAttrs(m_buffAttrListOut, pairArrayOut, 1));
+
+        uint64_t size = *(uint64_t *)pairArrayOut[0].value;
+
+        cudaExternalMemoryHandleDesc memHandleDesc;
+        memset(&memHandleDesc, 0, sizeof(memHandleDesc));
+        memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
+        memHandleDesc.handle.nvSciBufObject = inputBufObj;
+        memHandleDesc.size = size;
+        checkCudaErrors(cudaImportExternalMemory(&extMemRawBuf, &memHandleDesc));
+
+        cudaExternalMemoryBufferDesc bufferDesc;
+        memset(&bufferDesc, 0, sizeof(bufferDesc));
+        bufferDesc.offset = 0;
+        bufferDesc.size = size;
+        m_bufSize = size;
+
+        checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&d_outputBuf, extMemRawBuf, &bufferDesc));
+    }
+};
+
+void thread_rotateAndSignal(cudaNvSciSignal *cudaNvSciSignalObj, unsigned char *imageData)
+{
+    std::lock_guard<std::mutex> guard(m_mutex);
+    cudaNvSciSignalObj->runRotateImageAndSignal(imageData);
+    workSubmitted = true;
+    m_condVar.notify_one();
 }

 void thread_waitAndGrayscale(cudaNvSciWait *cudaNvSciWaitObj,
-                             std::string image_filename, size_t imageWidth,
-                             size_t imageHeight) {
-  // Acquire the lock
-  std::unique_lock<std::mutex> mlock(m_mutex);
-  m_condVar.wait(mlock, [] { return workSubmitted; });
-  cudaNvSciWaitObj->runImageGrayscale(image_filename, imageWidth, imageHeight);
+                             std::string image_filename,
+                             size_t imageWidth,
+                             size_t imageHeight)
+{
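/*
 * [editorial aside - not part of the patch] The two thread entry points above
 * and below hand off work through m_mutex/m_condVar: thread_rotateAndSignal()
 * sets workSubmitted under the lock and notifies; thread_waitAndGrayscale()
 * blocks in wait() until the predicate holds, then proceeds. A standalone
 * sketch of the same handshake (names hypothetical):
 *
 *     std::mutex m;
 *     std::condition_variable cv;
 *     bool ready = false;
 *     // producer thread:
 *     { std::lock_guard<std::mutex> g(m); ready = true; }
 *     cv.notify_one();
 *     // consumer thread:
 *     std::unique_lock<std::mutex> lk(m);
 *     cv.wait(lk, [&] { return ready; });   // returns once ready == true
 */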
+    // Acquire the lock
+    std::unique_lock<std::mutex> mlock(m_mutex);
+    m_condVar.wait(mlock, [] { return workSubmitted; });
+    cudaNvSciWaitObj->runImageGrayscale(image_filename, imageWidth, imageHeight);
 }

-cudaNvSci::cudaNvSci(int isMultiGPU, std::vector<int> &deviceIds,
-                     unsigned char *imageData, size_t width, size_t height)
-    : m_isMultiGPU(isMultiGPU),
-      image_data(imageData),
-      imageWidth(width),
-      imageHeight(height) {
-  if (isMultiGPU) {
-    m_cudaNvSciSignalDeviceId = deviceIds[0];
-    m_cudaNvSciWaitDeviceId = deviceIds[1];
-  } else {
-    m_cudaNvSciSignalDeviceId = m_cudaNvSciWaitDeviceId = deviceIds[0];
-  }
+cudaNvSci::cudaNvSci(int isMultiGPU, std::vector<int> &deviceIds, unsigned char *imageData, size_t width, size_t height)
+    : m_isMultiGPU(isMultiGPU)
+    , image_data(imageData)
+    , imageWidth(width)
+    , imageHeight(height)
+{
+    if (isMultiGPU) {
+        m_cudaNvSciSignalDeviceId = deviceIds[0];
+        m_cudaNvSciWaitDeviceId = deviceIds[1];
+    }
+    else {
+        m_cudaNvSciSignalDeviceId = m_cudaNvSciWaitDeviceId = deviceIds[0];
+    }

-  m_bufSize = imageWidth * imageHeight * sizeof(unsigned int);
+    m_bufSize = imageWidth * imageHeight * sizeof(unsigned int);
 }

-void cudaNvSci::initNvSci() {
-  checkNvSciErrors(NvSciSyncModuleOpen(&syncModule));
-  checkNvSciErrors(NvSciBufModuleOpen(&buffModule));
-  fence = (NvSciSyncFence *)calloc(1, sizeof(NvSciSyncFence));
+void cudaNvSci::initNvSci()
+{
+    checkNvSciErrors(NvSciSyncModuleOpen(&syncModule));
+    checkNvSciErrors(NvSciBufModuleOpen(&buffModule));
+    fence = (NvSciSyncFence *)calloc(1, sizeof(NvSciSyncFence));
 }

-void cudaNvSci::runCudaNvSci(std::string &image_filename) {
-  initNvSci();
+void cudaNvSci::runCudaNvSci(std::string &image_filename)
+{
+    initNvSci();

-  cudaNvSciSignal rotateAndSignal(buffModule, syncModule,
-                                  m_cudaNvSciSignalDeviceId, m_bufSize,
-                                  imageWidth, imageHeight, fence);
-  cudaNvSciWait waitAndGrayscale(buffModule, syncModule,
-                                 m_cudaNvSciWaitDeviceId, m_bufSize, fence);
+    cudaNvSciSignal rotateAndSignal(
+        buffModule, syncModule, m_cudaNvSciSignalDeviceId, m_bufSize, imageWidth, imageHeight, fence);
+    cudaNvSciWait waitAndGrayscale(buffModule, syncModule, m_cudaNvSciWaitDeviceId, m_bufSize, fence);

-  rawBufUnreconciledList[0] = rotateAndSignal.getNvSciRawBufAttrList();
-  rawBufUnreconciledList[1] = waitAndGrayscale.getNvSciRawBufAttrList();
+    rawBufUnreconciledList[0] = rotateAndSignal.getNvSciRawBufAttrList();
+    rawBufUnreconciledList[1] = waitAndGrayscale.getNvSciRawBufAttrList();

-  createNvSciRawBufObj();
+    createNvSciRawBufObj();

-  imageBufUnreconciledList[0] = rotateAndSignal.getNvSciImageBufAttrList();
+    imageBufUnreconciledList[0] = rotateAndSignal.getNvSciImageBufAttrList();

-  createNvSciBufImageObj();
+    createNvSciBufImageObj();

-  rotateAndSignal.cudaImportNvSciRawBuf(rawBufObj);
-  rotateAndSignal.cudaImportNvSciImage(imageBufObj);
+    rotateAndSignal.cudaImportNvSciRawBuf(rawBufObj);
+    rotateAndSignal.cudaImportNvSciImage(imageBufObj);

-  waitAndGrayscale.cudaImportNvSciRawBuf(rawBufObj);
+    waitAndGrayscale.cudaImportNvSciRawBuf(rawBufObj);

-  syncUnreconciledList[0] = rotateAndSignal.getNvSciSyncAttrList();
-  syncUnreconciledList[1] = waitAndGrayscale.getNvSciSyncAttrList();
+    syncUnreconciledList[0] = rotateAndSignal.getNvSciSyncAttrList();
+    syncUnreconciledList[1] = waitAndGrayscale.getNvSciSyncAttrList();

-  createNvSciSyncObj();
+    createNvSciSyncObj();

-  rotateAndSignal.cudaImportNvSciSemaphore(syncObj);
-  waitAndGrayscale.cudaImportNvSciSemaphore(syncObj);
+    rotateAndSignal.cudaImportNvSciSemaphore(syncObj);
+
waitAndGrayscale.cudaImportNvSciSemaphore(syncObj); - std::thread rotateThread(&thread_rotateAndSignal, &rotateAndSignal, - image_data); + std::thread rotateThread(&thread_rotateAndSignal, &rotateAndSignal, image_data); - std::thread grayscaleThread(&thread_waitAndGrayscale, &waitAndGrayscale, - image_filename, imageWidth, imageHeight); + std::thread grayscaleThread(&thread_waitAndGrayscale, &waitAndGrayscale, image_filename, imageWidth, imageHeight); - rotateThread.join(); - grayscaleThread.join(); + rotateThread.join(); + grayscaleThread.join(); } -void cudaNvSci::createNvSciRawBufObj() { - int numAttrList = 2; - checkNvSciErrors(NvSciBufAttrListReconcile(rawBufUnreconciledList, - numAttrList, &rawBufReconciledList, - &buffConflictList)); - checkNvSciErrors(NvSciBufObjAlloc(rawBufReconciledList, &rawBufObj)); - printf("created NvSciBufObj\n"); +void cudaNvSci::createNvSciRawBufObj() +{ + int numAttrList = 2; + checkNvSciErrors( + NvSciBufAttrListReconcile(rawBufUnreconciledList, numAttrList, &rawBufReconciledList, &buffConflictList)); + checkNvSciErrors(NvSciBufObjAlloc(rawBufReconciledList, &rawBufObj)); + printf("created NvSciBufObj\n"); } -void cudaNvSci::createNvSciBufImageObj() { - int numAttrList = 1; - checkNvSciErrors(NvSciBufAttrListReconcile( - imageBufUnreconciledList, numAttrList, &imageBufReconciledList, - &imageBufConflictList)); - checkNvSciErrors(NvSciBufObjAlloc(imageBufReconciledList, &imageBufObj)); - printf("created NvSciBufImageObj\n"); +void cudaNvSci::createNvSciBufImageObj() +{ + int numAttrList = 1; + checkNvSciErrors(NvSciBufAttrListReconcile( + imageBufUnreconciledList, numAttrList, &imageBufReconciledList, &imageBufConflictList)); + checkNvSciErrors(NvSciBufObjAlloc(imageBufReconciledList, &imageBufObj)); + printf("created NvSciBufImageObj\n"); } -void cudaNvSci::createNvSciSyncObj() { - int numAttrList = 2; - checkNvSciErrors(NvSciSyncAttrListReconcile(syncUnreconciledList, numAttrList, - &syncReconciledList, - &syncConflictList)); - checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj)); - printf("created NvSciSyncObj\n"); +void cudaNvSci::createNvSciSyncObj() +{ + int numAttrList = 2; + checkNvSciErrors( + NvSciSyncAttrListReconcile(syncUnreconciledList, numAttrList, &syncReconciledList, &syncConflictList)); + checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj)); + printf("created NvSciSyncObj\n"); } diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.h b/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.h index c91030f2..3b8547dd 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.h +++ b/Samples/4_CUDA_Libraries/cudaNvSci/cudaNvSci.h @@ -34,68 +34,79 @@ #include #include -#define checkNvSciErrors(call) \ - do { \ - NvSciError _status = call; \ - if (NvSciError_Success != _status) { \ - printf( \ - "NVSCI call in file '%s' in line %i returned" \ - " %d, expected %d\n", \ - __FILE__, __LINE__, _status, NvSciError_Success); \ - fflush(stdout); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define checkNvSciErrors(call) \ + do { \ + NvSciError _status = call; \ + if (NvSciError_Success != _status) { \ + printf("NVSCI call in file '%s' in line %i returned" \ + " %d, expected %d\n", \ + __FILE__, \ + __LINE__, \ + _status, \ + NvSciError_Success); \ + fflush(stdout); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -extern void rotateKernel(cudaTextureObject_t &texObj, const float angle, - unsigned int *d_outputData, const int imageWidth, - const int imageHeight, cudaStream_t stream); +extern void rotateKernel(cudaTextureObject_t 
&texObj,
+                         const float angle,
+                         unsigned int *d_outputData,
+                         const int imageWidth,
+                         const int imageHeight,
+                         cudaStream_t stream);

 extern void launchGrayScaleKernel(unsigned int *d_rgbaImage,
-                                  std::string image_filename, size_t imageWidth,
-                                  size_t imageHeight, cudaStream_t stream);
+                                  std::string image_filename,
+                                  size_t imageWidth,
+                                  size_t imageHeight,
+                                  cudaStream_t stream);

-class cudaNvSci {
- private:
-  int m_isMultiGPU;
-  int m_cudaNvSciSignalDeviceId;
-  int m_cudaNvSciWaitDeviceId;
-  unsigned char *image_data;
-  size_t m_bufSize;
-  size_t imageWidth;
-  size_t imageHeight;
+class cudaNvSci
+{
+private:
+    int m_isMultiGPU;
+    int m_cudaNvSciSignalDeviceId;
+    int m_cudaNvSciWaitDeviceId;
+    unsigned char *image_data;
+    size_t m_bufSize;
+    size_t imageWidth;
+    size_t imageHeight;

- public:
-  NvSciSyncModule syncModule;
-  NvSciBufModule buffModule;
-  NvSciSyncAttrList syncUnreconciledList[2];
-  NvSciSyncAttrList syncReconciledList;
-  NvSciSyncAttrList syncConflictList;
+public:
+    NvSciSyncModule syncModule;
+    NvSciBufModule buffModule;
+    NvSciSyncAttrList syncUnreconciledList[2];
+    NvSciSyncAttrList syncReconciledList;
+    NvSciSyncAttrList syncConflictList;

-  NvSciBufAttrList rawBufUnreconciledList[2];
-  NvSciBufAttrList imageBufUnreconciledList[2];
-  NvSciBufAttrList rawBufReconciledList;
-  NvSciBufAttrList buffConflictList;
-  NvSciBufAttrList imageBufReconciledList;
-  NvSciBufAttrList imageBufConflictList;
-  NvSciBufAttrList buffAttrListOut;
+    NvSciBufAttrList rawBufUnreconciledList[2];
+    NvSciBufAttrList imageBufUnreconciledList[2];
+    NvSciBufAttrList rawBufReconciledList;
+    NvSciBufAttrList buffConflictList;
+    NvSciBufAttrList imageBufReconciledList;
+    NvSciBufAttrList imageBufConflictList;
+    NvSciBufAttrList buffAttrListOut;

-  NvSciSyncObj syncObj;
-  NvSciBufObj rawBufObj;
-  NvSciBufObj imageBufObj;
-  NvSciSyncFence *fence;
+    NvSciSyncObj syncObj;
+    NvSciBufObj rawBufObj;
+    NvSciBufObj imageBufObj;
+    NvSciSyncFence *fence;

-  cudaNvSci(int isMultiGPU, std::vector<int> &deviceIds,
-            unsigned char *image_data, size_t imageWidth, size_t imageHeight);
+    cudaNvSci(int isMultiGPU,
+              std::vector<int> &deviceIds,
+              unsigned char *image_data,
+              size_t imageWidth,
+              size_t imageHeight);

-  void initNvSci();
+    void initNvSci();

-  void runCudaNvSci(std::string &image_filename);
+    void runCudaNvSci(std::string &image_filename);

-  void createNvSciRawBufObj();
+    void createNvSciRawBufObj();

-  void createNvSciSyncObj();
+    void createNvSciSyncObj();

-  void createNvSciBufImageObj();
+    void createNvSciBufImageObj();
 };

-#endif  // CUDANVSCI_H
+#endif // CUDANVSCI_H
diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/imageKernels.cu b/Samples/4_CUDA_Libraries/cudaNvSci/imageKernels.cu
index 11075eb1..1d86d23b 100644
--- a/Samples/4_CUDA_Libraries/cudaNvSci/imageKernels.cu
+++ b/Samples/4_CUDA_Libraries/cudaNvSci/imageKernels.cu
@@ -30,91 +30,90 @@
 #include
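/*
 * [editorial aside - not part of the patch] rgbaFloatToInt() below clamps each
 * float channel to [0, 1] with __saturatef() and packs the four 8-bit results
 * into one 32-bit word, w in the high byte and x in the low byte. The same
 * packing written out as a small device helper (name hypothetical):
 *
 *     __device__ unsigned int packRgba8(float4 c)
 *     {
 *         unsigned int r = (unsigned int)(__saturatef(c.x) * 255.0f);
 *         unsigned int g = (unsigned int)(__saturatef(c.y) * 255.0f);
 *         unsigned int b = (unsigned int)(__saturatef(c.z) * 255.0f);
 *         unsigned int a = (unsigned int)(__saturatef(c.w) * 255.0f);
 *         return (a << 24) | (b << 16) | (g << 8) | r;
 *     }
 */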
 // convert floating point rgba color to 32-bit integer
-__device__ unsigned int rgbaFloatToInt(float4 rgba) {
-  rgba.x = __saturatef(rgba.x);  // clamp to [0.0, 1.0]
-  rgba.y = __saturatef(rgba.y);
-  rgba.z = __saturatef(rgba.z);
-  rgba.w = __saturatef(rgba.w);
-  return ((unsigned int)(rgba.w * 255.0f) << 24) |
-         ((unsigned int)(rgba.z * 255.0f) << 16) |
-         ((unsigned int)(rgba.y * 255.0f) << 8) |
-         ((unsigned int)(rgba.x * 255.0f));
+__device__ unsigned int rgbaFloatToInt(float4 rgba)
+{
+    rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0]
+    rgba.y = __saturatef(rgba.y);
+    rgba.z = __saturatef(rgba.z);
+    rgba.w = __saturatef(rgba.w);
+    return ((unsigned int)(rgba.w * 255.0f) << 24) | ((unsigned int)(rgba.z * 255.0f) << 16)
+           | ((unsigned int)(rgba.y * 255.0f) << 8) | ((unsigned int)(rgba.x * 255.0f));
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Rotate an image using texture lookups
 //! @param outputData output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-static __global__ void transformKernel(unsigned int *outputData, int width,
-                                       int height, float theta,
-                                       cudaTextureObject_t tex) {
-  // calculate normalized texture coordinates
-  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
+static __global__ void
+transformKernel(unsigned int *outputData, int width, int height, float theta, cudaTextureObject_t tex)
+{
+    // calculate normalized texture coordinates
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

-  float u = (float)x - (float)width / 2;
-  float v = (float)y - (float)height / 2;
-  float tu = u * cosf(theta) - v * sinf(theta);
-  float tv = v * cosf(theta) + u * sinf(theta);
+    float u = (float)x - (float)width / 2;
+    float v = (float)y - (float)height / 2;
+    float tu = u * cosf(theta) - v * sinf(theta);
+    float tv = v * cosf(theta) + u * sinf(theta);

-  tu /= (float)width;
-  tv /= (float)height;
+    tu /= (float)width;
+    tv /= (float)height;

-  // read from texture and write to global memory
-  float4 pix = tex2D<float4>(tex, tu + 0.5f, tv + 0.5f);
-  unsigned int pixelInt = rgbaFloatToInt(pix);
-  outputData[y * width + x] = pixelInt;
+    // read from texture and write to global memory
+    float4 pix = tex2D<float4>(tex, tu + 0.5f, tv + 0.5f);
+    unsigned int pixelInt = rgbaFloatToInt(pix);
+    outputData[y * width + x] = pixelInt;
 }

-static __global__ void rgbToGrayscaleKernel(unsigned int *rgbaImage,
-                                            size_t imageWidth,
-                                            size_t imageHeight) {
-  size_t gidX = blockDim.x * blockIdx.x + threadIdx.x;
+static __global__ void rgbToGrayscaleKernel(unsigned int *rgbaImage, size_t imageWidth, size_t imageHeight)
+{
+    size_t gidX = blockDim.x * blockIdx.x + threadIdx.x;

-  uchar4 *pixArray = (uchar4 *)rgbaImage;
+    uchar4 *pixArray = (uchar4 *)rgbaImage;

-  for (int pixId = gidX; pixId < imageWidth * imageHeight;
-       pixId += gridDim.x * blockDim.x) {
-    uchar4 dataA = pixArray[pixId];
-    unsigned char grayscale =
-        (unsigned char)(dataA.x * 0.3 + dataA.y * 0.59 + dataA.z * 0.11);
-    uchar4 dataB = make_uchar4(grayscale, grayscale, grayscale, 0);
-    pixArray[pixId] = dataB;
-  }
+    for (int pixId = gidX; pixId < imageWidth * imageHeight; pixId += gridDim.x * blockDim.x) {
+        uchar4 dataA = pixArray[pixId];
+        unsigned char grayscale = (unsigned char)(dataA.x * 0.3 + dataA.y * 0.59 + dataA.z * 0.11);
+        uchar4 dataB = make_uchar4(grayscale, grayscale, grayscale, 0);
+        pixArray[pixId] = dataB;
+    }
 }

 void launchGrayScaleKernel(unsigned int *d_rgbaImage,
-                           std::string image_filename, size_t imageWidth,
-                           size_t imageHeight, cudaStream_t stream) {
-  int numThreadsPerBlock = 1024;
-  int numOfBlocks = (imageWidth * imageHeight) / numThreadsPerBlock;
+                           std::string image_filename,
+                           size_t imageWidth,
+                           size_t imageHeight,
+                           cudaStream_t stream)
+{
+    int numThreadsPerBlock = 1024;
+    int numOfBlocks = (imageWidth * imageHeight) / numThreadsPerBlock;

-  rgbToGrayscaleKernel<<<numOfBlocks, numThreadsPerBlock, 0, stream>>>(
-      d_rgbaImage, imageWidth, imageHeight);
+    rgbToGrayscaleKernel<<<numOfBlocks, numThreadsPerBlock, 0, stream>>>(d_rgbaImage, imageWidth, imageHeight);
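/*
 * [editorial aside - not part of the patch] numOfBlocks above is computed with
 * truncating integer division, so for image sizes that are not a multiple of
 * 1024 the launch covers fewer threads than pixels; correctness relies on the
 * grid-stride loop inside rgbToGrayscaleKernel (pixId += gridDim.x * blockDim.x).
 * A common alternative is to round the block count up instead:
 *
 *     int numOfBlocks =
 *         (int)((imageWidth * imageHeight + numThreadsPerBlock - 1) / numThreadsPerBlock);
 */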
-  unsigned int *outputData;
-  checkCudaErrors(cudaMallocHost((void **)&outputData, sizeof(unsigned int) * imageWidth * imageHeight));
-  checkCudaErrors(cudaMemcpyAsync(
-      outputData, d_rgbaImage, sizeof(unsigned int) * imageWidth * imageHeight,
-      cudaMemcpyDeviceToHost, stream));
-  checkCudaErrors(cudaStreamSynchronize(stream));
+    unsigned int *outputData;
+    checkCudaErrors(cudaMallocHost((void **)&outputData, sizeof(unsigned int) * imageWidth * imageHeight));
+    checkCudaErrors(cudaMemcpyAsync(
+        outputData, d_rgbaImage, sizeof(unsigned int) * imageWidth * imageHeight, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaStreamSynchronize(stream));

-  char outputFilename[1024];
-  strcpy(outputFilename, image_filename.c_str());
-  strcpy(outputFilename + image_filename.length() - 4, "_out.ppm");
-  sdkSavePPM4ub(outputFilename, (unsigned char *)outputData, imageWidth,
-                imageHeight);
-  printf("Wrote '%s'\n", outputFilename);
+    char outputFilename[1024];
+    strcpy(outputFilename, image_filename.c_str());
+    strcpy(outputFilename + image_filename.length() - 4, "_out.ppm");
+    sdkSavePPM4ub(outputFilename, (unsigned char *)outputData, imageWidth, imageHeight);
+    printf("Wrote '%s'\n", outputFilename);

-  checkCudaErrors(cudaFreeHost(outputData));
+    checkCudaErrors(cudaFreeHost(outputData));
 }

-void rotateKernel(cudaTextureObject_t &texObj, const float angle,
-                  unsigned int *d_outputData, const int imageWidth,
-                  const int imageHeight, cudaStream_t stream) {
-  dim3 dimBlock(8, 8, 1);
-  dim3 dimGrid(imageWidth / dimBlock.x, imageHeight / dimBlock.y, 1);
+void rotateKernel(cudaTextureObject_t &texObj,
+                  const float angle,
+                  unsigned int *d_outputData,
+                  const int imageWidth,
+                  const int imageHeight,
+                  cudaStream_t stream)
+{
+    dim3 dimBlock(8, 8, 1);
+    dim3 dimGrid(imageWidth / dimBlock.x, imageHeight / dimBlock.y, 1);

-  transformKernel<<<dimGrid, dimBlock, 0, stream>>>(d_outputData, imageWidth,
-                                                    imageHeight, angle, texObj);
+    transformKernel<<<dimGrid, dimBlock, 0, stream>>>(d_outputData, imageWidth, imageHeight, angle, texObj);
 }
diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp b/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp
index 04282383..c929bae8 100644
--- a/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp
+++ b/Samples/4_CUDA_Libraries/cudaNvSci/main.cpp
@@ -26,76 +26,77 @@
 */
 #include
-#include
-#include "cudaNvSci.h"
 #include
 #include
+#include

-void loadImageData(const std::string &filename, const char **argv,
-                   unsigned char **image_data, uint32_t &imageWidth,
-                   uint32_t &imageHeight) {
-  // load image (needed so we can get the width and height before we create
-  // the window
-  char *image_path = sdkFindFilePath(filename.c_str(), argv[0]);
+#include "cudaNvSci.h"

-  if (image_path == 0) {
-    printf("Error finding image file '%s'\n", filename.c_str());
-    exit(EXIT_FAILURE);
-  }
+void loadImageData(const std::string &filename,
+                   const char **argv,
+                   unsigned char **image_data,
+                   uint32_t &imageWidth,
+                   uint32_t &imageHeight)
+{
+    // load image (needed so we can get the width and height before we create
+    // the window
+    char *image_path = sdkFindFilePath(filename.c_str(), argv[0]);

-  sdkLoadPPM4(image_path, image_data, &imageWidth, &imageHeight);
+    if (image_path == 0) {
+        printf("Error finding image file '%s'\n", filename.c_str());
+        exit(EXIT_FAILURE);
+    }

-  if (!image_data) {
-    printf("Error opening file '%s'\n", image_path);
-    exit(EXIT_FAILURE);
-  }
+    sdkLoadPPM4(image_path, image_data, &imageWidth, &imageHeight);

-  printf("Loaded '%s', %d x %d pixels\n", image_path, imageWidth, imageHeight);
+    if (!image_data) {
+        printf("Error opening file '%s'\n", image_path);
+        exit(EXIT_FAILURE);
+    }
+
+    printf("Loaded '%s', %d x %d pixels\n", image_path, imageWidth, imageHeight);
 }
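/*
 * [editorial aside - not part of the patch] main() below drives the whole
 * sample: it collects capable devices, loads the PPM, and hands everything to
 * cudaNvSci::runCudaNvSci(), which reconciles the NvSciBuf/NvSciSync attribute
 * lists, imports the shared objects on both endpoints, and joins the two worker
 * threads. The synchronization pairing, schematically (stream names hypothetical):
 *
 *     // signaler, on stream sA:
 *     cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams, 1, sA);
 *     // waiter, on stream sB, with the same NvSciSyncFence in waitParams:
 *     cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, sB);
 */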
-int main(int argc, const char **argv) {
-  int numOfGPUs = 0;
-  std::vector<int> deviceIds;
-  checkCudaErrors(cudaGetDeviceCount(&numOfGPUs));
+int main(int argc, const char **argv)
+{
+    int numOfGPUs = 0;
+    std::vector<int> deviceIds;
+    checkCudaErrors(cudaGetDeviceCount(&numOfGPUs));

-  printf("%d GPUs found\n", numOfGPUs);
-  if (!numOfGPUs) {
-    exit(EXIT_WAIVED);
-  } else {
-    for (int devID = 0; devID < numOfGPUs; devID++) {
-      int major = 0, minor = 0;
-      checkCudaErrors(cudaDeviceGetAttribute(
-          &major, cudaDevAttrComputeCapabilityMajor, devID));
-      checkCudaErrors(cudaDeviceGetAttribute(
-          &minor, cudaDevAttrComputeCapabilityMinor, devID));
-      if (major >= 6) {
-        deviceIds.push_back(devID);
-      }
+    printf("%d GPUs found\n", numOfGPUs);
+    if (!numOfGPUs) {
+        exit(EXIT_WAIVED);
     }
-    if (deviceIds.size() == 0) {
-      printf(
-          "cudaNvSci requires one or more GPUs of Pascal(SM 6.0) or higher "
-          "archs\nWaiving..\n");
-      exit(EXIT_WAIVED);
+    else {
+        for (int devID = 0; devID < numOfGPUs; devID++) {
+            int major = 0, minor = 0;
+            checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+            checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+            if (major >= 6) {
+                deviceIds.push_back(devID);
+            }
+        }
+        if (deviceIds.size() == 0) {
+            printf("cudaNvSci requires one or more GPUs of Pascal(SM 6.0) or higher "
+                   "archs\nWaiving..\n");
+            exit(EXIT_WAIVED);
+        }
     }
-  }

-  std::string image_filename = "teapot1024.ppm";
+    std::string image_filename = "teapot1024.ppm";

-  if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
-    getCmdLineArgumentString(argc, (const char **)argv, "file",
-                             (char **)&image_filename);
-  }
+    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
+        getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&image_filename);
+    }

-  uint32_t imageWidth = 0;
-  uint32_t imageHeight = 0;
-  unsigned char *image_data = NULL;
+    uint32_t imageWidth = 0;
+    uint32_t imageHeight = 0;
+    unsigned char *image_data = NULL;

-  loadImageData(image_filename, argv, &image_data, imageWidth, imageHeight);
+    loadImageData(image_filename, argv, &image_data, imageWidth, imageHeight);

-  cudaNvSci cudaNvSciApp(deviceIds.size() > 1, deviceIds, image_data,
-                         imageWidth, imageHeight);
-  cudaNvSciApp.runCudaNvSci(image_filename);
+    cudaNvSci cudaNvSciApp(deviceIds.size() > 1, deviceIds, image_data, imageWidth, imageHeight);
+    cudaNvSciApp.runCudaNvSci(image_filename);

-  return EXIT_SUCCESS;
-} \ No newline at end of file
+    return EXIT_SUCCESS;
+}
diff --git a/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp b/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp
index 43aad888..36987695 100644
--- a/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp
+++ b/Samples/4_CUDA_Libraries/freeImageInteropNPP/freeImageInteropNPP.cpp
@@ -36,285 +36,290 @@
 #include
 #endif

-#include "FreeImage.h"
-#include "Exceptions.h"
+// Common Helpers

-#include
+#include "Exceptions.h"
+#include "FreeImage.h"
+
+// Other Includes
+#include
 #include
 #include
+#include // CUDA NPP Definitions
+#include

-#include
-#include // CUDA NPP Definitions
+// Other helpers
+#include // helper for CUDA Error handling and initialization
+#include // helper for string parsing

-#include // helper for CUDA Error handling and initialization
-#include // helper for string parsing
+inline int cudaDeviceInit(int argc, const char **argv)
+{
+    int deviceCount;
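/*
 * [editorial aside - not part of the patch] This helper follows the pattern the
 * NPP samples share: bail out when no CUDA device exists, otherwise let
 * findCudaDevice() from helper_cuda.h pick a device (it appears to honor a
 * "device=N" command-line flag when one is present) and make it current:
 *
 *     int dev = findCudaDevice(argc, argv);
 *     checkCudaErrors(cudaSetDevice(dev));
 */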
+    checkCudaErrors(cudaGetDeviceCount(&deviceCount));

-inline int cudaDeviceInit(int argc, const char **argv) {
-  int deviceCount;
-  checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+    if (deviceCount == 0) {
+        std::cerr << "CUDA error: no devices supporting CUDA." << std::endl;
+        exit(EXIT_FAILURE);
+    }

-  if (deviceCount == 0) {
-    std::cerr << "CUDA error: no devices supporting CUDA." << std::endl;
-    exit(EXIT_FAILURE);
-  }
+    int dev = findCudaDevice(argc, argv);

-  int dev = findCudaDevice(argc, argv);
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, dev);
+    std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name << std::endl;

-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, dev);
-  std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name
-            << std::endl;
+    checkCudaErrors(cudaSetDevice(dev));

-  checkCudaErrors(cudaSetDevice(dev));
-
-  return dev;
+    return dev;
 }

 // Error handler for FreeImage library.
 // In case this handler is invoked, it throws an NPP exception.
-extern "C" void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif,
-                                      const char *zMessage) {
-  throw npp::Exception(zMessage);
-}
+extern "C" void FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char *zMessage) { throw npp::Exception(zMessage); }

-std::ostream &operator<<(std::ostream &rOutputStream, const FIBITMAP &rBitmap) {
-  unsigned int nImageWidth =
-      FreeImage_GetWidth(const_cast<FIBITMAP *>(&rBitmap));
-  unsigned int nImageHeight =
-      FreeImage_GetHeight(const_cast<FIBITMAP *>(&rBitmap));
-  unsigned int nPitch = FreeImage_GetPitch(const_cast<FIBITMAP *>(&rBitmap));
-  unsigned int nBPP = FreeImage_GetBPP(const_cast<FIBITMAP *>(&rBitmap));
+std::ostream &operator<<(std::ostream &rOutputStream, const FIBITMAP &rBitmap)
+{
+    unsigned int nImageWidth = FreeImage_GetWidth(const_cast<FIBITMAP *>(&rBitmap));
+    unsigned int nImageHeight = FreeImage_GetHeight(const_cast<FIBITMAP *>(&rBitmap));
+    unsigned int nPitch = FreeImage_GetPitch(const_cast<FIBITMAP *>(&rBitmap));
+    unsigned int nBPP = FreeImage_GetBPP(const_cast<FIBITMAP *>(&rBitmap));

-  FREE_IMAGE_COLOR_TYPE eType =
-      FreeImage_GetColorType(const_cast<FIBITMAP *>(&rBitmap));
+    FREE_IMAGE_COLOR_TYPE eType = FreeImage_GetColorType(const_cast<FIBITMAP *>(&rBitmap));

-  rOutputStream << "Size (" << nImageWidth << ", " << nImageHeight << ")\n";
-  rOutputStream << "Pitch " << nPitch << "\n";
-  rOutputStream << "Type ";
+    rOutputStream << "Size (" << nImageWidth << ", " << nImageHeight << ")\n";
+    rOutputStream << "Pitch " << nPitch << "\n";
+    rOutputStream << "Type ";

-  switch (eType) {
+    switch (eType) {
     case FIC_MINISWHITE:
-      rOutputStream << "FIC_MINISWHITE\n";
-      break;
+        rOutputStream << "FIC_MINISWHITE\n";
+        break;
     case FIC_MINISBLACK:
-      rOutputStream << "FIC_MINISBLACK\n";
-      break;
+        rOutputStream << "FIC_MINISBLACK\n";
+        break;
     case FIC_RGB:
-      rOutputStream << "FIC_RGB\n";
-      break;
+        rOutputStream << "FIC_RGB\n";
+        break;
     case FIC_PALETTE:
-      rOutputStream << "FIC_PALETTE\n";
-      break;
+        rOutputStream << "FIC_PALETTE\n";
+        break;
     case FIC_RGBALPHA:
-      rOutputStream << "FIC_RGBALPHA\n";
-      break;
+        rOutputStream << "FIC_RGBALPHA\n";
+        break;
     case FIC_CMYK:
-      rOutputStream << "FIC_CMYK\n";
-      break;
+        rOutputStream << "FIC_CMYK\n";
+        break;
     default:
-      rOutputStream << "Unknown pixel format.\n";
-  }
+        rOutputStream << "Unknown pixel format.\n";
+    }

-  rOutputStream << "BPP " << nBPP << std::endl;
+    rOutputStream << "BPP " << nBPP << std::endl;

-  return rOutputStream;
+    return rOutputStream;
 }

-int main(int argc, char *argv[]) {
-  printf("%s Starting...\n\n", argv[0]);
+int main(int argc, char *argv[])
+{
+    printf("%s Starting...\n\n",
argv[0]); - try { - std::string sFilename; - char *filePath; + try { + std::string sFilename; + char *filePath; - // set your own FreeImage error handler - FreeImage_SetOutputMessage(FreeImageErrorHandler); + // set your own FreeImage error handler + FreeImage_SetOutputMessage(FreeImageErrorHandler); - cudaDeviceInit(argc, (const char **)argv); + cudaDeviceInit(argc, (const char **)argv); - NppStreamContext nppStreamCtx; - nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. + NppStreamContext nppStreamCtx; + nppStreamCtx.hStream = + 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. - cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - { - printf("CUDA error: no devices supporting CUDA.\n"); - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) { + printf("CUDA error: no devices supporting CUDA.\n"); + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + } + + const NppLibraryVersion *libVer = nppGetLibVersion(); + + printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); + + int driverVersion, runtimeVersion; + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); + + printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, + cudaDevAttrComputeCapabilityMajor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, + cudaDevAttrComputeCapabilityMinor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + + cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); + + cudaDeviceProp oDeviceProperties; + + cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); + + nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; + nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; + nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; + nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; + + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); + } + else { + filePath = sdkFindFilePath("teapot512.pgm", argv[0]); + } + + if (filePath) { + sFilename = filePath; + } + else { + sFilename = "teapot512.pgm"; + } + + // if we specify the filename at the command line, then we only test + // sFilename otherwise we will check both sFilename[0,1] + int file_errors = 0; + std::ifstream infile(sFilename.data(), std::ifstream::in); + + if (infile.good()) { + std::cout << "freeImageInteropNPP opened: <" << sFilename.data() << "> successfully!" 
<< std::endl;
+            file_errors = 0;
+            infile.close();
+        }
+        else {
+            std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() << ">" << std::endl;
+            file_errors++;
+            infile.close();
+        }
+
+        if (file_errors > 0) {
+            exit(EXIT_FAILURE);
+        }
+
+        std::string sResultFilename = sFilename;
+
+        std::string::size_type dot = sResultFilename.rfind('.');
+
+        if (dot != std::string::npos) {
+            sResultFilename = sResultFilename.substr(0, dot);
+        }
+
+        sResultFilename += "_boxFilterFII.pgm";
+
+        if (checkCmdLineFlag(argc, (const char **)argv, "output")) {
+            char *outputFilePath;
+            getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath);
+            sResultFilename = outputFilePath;
+        }
+
+        FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str());
+
+        // no signature? try to guess the file format from the file extension
+        if (eFormat == FIF_UNKNOWN) {
+            eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str());
+        }
+
+        NPP_ASSERT(eFormat != FIF_UNKNOWN);
+        // check that the plugin has reading capabilities ...
+        FIBITMAP *pBitmap;
+
+        if (FreeImage_FIFSupportsReading(eFormat)) {
+            pBitmap = FreeImage_Load(eFormat, sFilename.c_str());
+        }
+
+        NPP_ASSERT(pBitmap != 0);
+        // Dump the bitmap information to the console
+        std::cout << (*pBitmap) << std::endl;
+        // make sure this is an 8-bit single channel image
+        NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK);
+        NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8);
+
+        unsigned int nImageWidth = FreeImage_GetWidth(pBitmap);
+        unsigned int nImageHeight = FreeImage_GetHeight(pBitmap);
+        unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap);
+        unsigned char *pSrcData = FreeImage_GetBits(pBitmap);
+
+        int nSrcPitchCUDA;
+        Npp8u *pSrcImageCUDA = nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA);
+        NPP_ASSERT_NOT_NULL(pSrcImageCUDA);
+        // copy image loaded via FreeImage into CUDA device memory, i.e.
+ // transfer the image-data up to the GPU's video-memory + NPP_CHECK_CUDA(cudaMemcpy2D( + pSrcImageCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch, nImageWidth, nImageHeight, cudaMemcpyHostToDevice)); + + // define size of the box filter + const NppiSize oMaskSize = {7, 7}; + const NppiPoint oMaskAchnor = {0, 0}; + // compute maximal result image size + const NppiSize oSizeROI = {(int)nImageWidth - (oMaskSize.width - 1), + (int)nImageHeight - (oMaskSize.height - 1)}; + // allocate result image memory + int nDstPitchCUDA; + Npp8u *pDstImageCUDA = nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); + NPP_ASSERT_NOT_NULL(pDstImageCUDA); + NPP_CHECK_NPP(nppiFilterBox_8u_C1R_Ctx(pSrcImageCUDA, + nSrcPitchCUDA, + pDstImageCUDA, + nDstPitchCUDA, + oSizeROI, + oMaskSize, + oMaskAchnor, + nppStreamCtx)); + // create the result image storage using FreeImage so we can easily + // save + FIBITMAP *pResultBitmap = FreeImage_Allocate(oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */); + NPP_ASSERT_NOT_NULL(pResultBitmap); + unsigned int nResultPitch = FreeImage_GetPitch(pResultBitmap); + unsigned char *pResultData = FreeImage_GetBits(pResultBitmap); + + NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, + nResultPitch, + pDstImageCUDA, + nDstPitchCUDA, + oSizeROI.width, + oSizeROI.height, + cudaMemcpyDeviceToHost)); + // now save the result image + bool bSuccess; + bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), 0) == TRUE; + NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); + + // free nppiImage + nppiFree(pSrcImageCUDA); + nppiFree(pDstImageCUDA); + + exit(EXIT_SUCCESS); } - - const NppLibraryVersion *libVer = nppGetLibVersion(); - - printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); - - int driverVersion, runtimeVersion; - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); - - printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); - printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10); - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, - cudaDevAttrComputeCapabilityMajor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, - cudaDevAttrComputeCapabilityMinor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - - cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); - - cudaDeviceProp oDeviceProperties; - - cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); - - nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; - nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; - nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; - nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; - - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); - } else { - filePath = sdkFindFilePath("teapot512.pgm", argv[0]); + catch (npp::Exception &rException) { + std::cerr << "Program error! The following exception occurred: \n"; + std::cerr << rException << std::endl; + std::cerr << "Aborting." 
<< std::endl; + exit(EXIT_FAILURE); } - - if (filePath) { - sFilename = filePath; - } else { - sFilename = "teapot512.pgm"; + catch (...) { + std::cerr << "Program error! An unknow type of exception occurred. \n"; + std::cerr << "Aborting." << std::endl; + exit(EXIT_FAILURE); } - // if we specify the filename at the command line, then we only test - // sFilename otherwise we will check both sFilename[0,1] - int file_errors = 0; - std::ifstream infile(sFilename.data(), std::ifstream::in); - - if (infile.good()) { - std::cout << "freeImageInteropNPP opened: <" << sFilename.data() - << "> successfully!" << std::endl; - file_errors = 0; - infile.close(); - } else { - std::cout << "freeImageInteropNPP unable to open: <" << sFilename.data() - << ">" << std::endl; - file_errors++; - infile.close(); - } - - if (file_errors > 0) { - exit(EXIT_FAILURE); - } - - std::string sResultFilename = sFilename; - - std::string::size_type dot = sResultFilename.rfind('.'); - - if (dot != std::string::npos) { - sResultFilename = sResultFilename.substr(0, dot); - } - - sResultFilename += "_boxFilterFII.pgm"; - - if (checkCmdLineFlag(argc, (const char **)argv, "output")) { - char *outputFilePath; - getCmdLineArgumentString(argc, (const char **)argv, "output", - &outputFilePath); - sResultFilename = outputFilePath; - } - - FREE_IMAGE_FORMAT eFormat = FreeImage_GetFileType(sFilename.c_str()); - - // no signature? try to guess the file format from the file extension - if (eFormat == FIF_UNKNOWN) { - eFormat = FreeImage_GetFIFFromFilename(sFilename.c_str()); - } - - NPP_ASSERT(eFormat != FIF_UNKNOWN); - // check that the plugin has reading capabilities ... - FIBITMAP *pBitmap; - - if (FreeImage_FIFSupportsReading(eFormat)) { - pBitmap = FreeImage_Load(eFormat, sFilename.c_str()); - } - - NPP_ASSERT(pBitmap != 0); - // Dump the bitmap information to the console - std::cout << (*pBitmap) << std::endl; - // make sure this is an 8-bit single channel image - NPP_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK); - NPP_ASSERT(FreeImage_GetBPP(pBitmap) == 8); - - unsigned int nImageWidth = FreeImage_GetWidth(pBitmap); - unsigned int nImageHeight = FreeImage_GetHeight(pBitmap); - unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap); - unsigned char *pSrcData = FreeImage_GetBits(pBitmap); - - int nSrcPitchCUDA; - Npp8u *pSrcImageCUDA = - nppiMalloc_8u_C1(nImageWidth, nImageHeight, &nSrcPitchCUDA); - NPP_ASSERT_NOT_NULL(pSrcImageCUDA); - // copy image loaded via FreeImage to into CUDA device memory, i.e. 
- // transfer the image-data up to the GPU's video-memory - NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImageCUDA, nSrcPitchCUDA, pSrcData, - nSrcPitch, nImageWidth, nImageHeight, - cudaMemcpyHostToDevice)); - - // define size of the box filter - const NppiSize oMaskSize = {7, 7}; - const NppiPoint oMaskAchnor = {0, 0}; - // compute maximal result image size - const NppiSize oSizeROI = {(int)nImageWidth - (oMaskSize.width - 1), - (int)nImageHeight - (oMaskSize.height - 1)}; - // allocate result image memory - int nDstPitchCUDA; - Npp8u *pDstImageCUDA = - nppiMalloc_8u_C1(oSizeROI.width, oSizeROI.height, &nDstPitchCUDA); - NPP_ASSERT_NOT_NULL(pDstImageCUDA); - NPP_CHECK_NPP(nppiFilterBox_8u_C1R_Ctx(pSrcImageCUDA, nSrcPitchCUDA, - pDstImageCUDA, nDstPitchCUDA, oSizeROI, - oMaskSize, oMaskAchnor, nppStreamCtx)); - // create the result image storage using FreeImage so we can easily - // save - FIBITMAP *pResultBitmap = FreeImage_Allocate( - oSizeROI.width, oSizeROI.height, 8 /* bits per pixel */); - NPP_ASSERT_NOT_NULL(pResultBitmap); - unsigned int nResultPitch = FreeImage_GetPitch(pResultBitmap); - unsigned char *pResultData = FreeImage_GetBits(pResultBitmap); - - NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, pDstImageCUDA, - nDstPitchCUDA, oSizeROI.width, oSizeROI.height, - cudaMemcpyDeviceToHost)); - // now save the result image - bool bSuccess; - bSuccess = FreeImage_Save(FIF_PGM, pResultBitmap, sResultFilename.c_str(), - 0) == TRUE; - NPP_ASSERT_MSG(bSuccess, "Failed to save result image."); - - // free nppiImage - nppiFree(pSrcImageCUDA); - nppiFree(pDstImageCUDA); - exit(EXIT_SUCCESS); - } catch (npp::Exception &rException) { - std::cerr << "Program error! The following exception occurred: \n"; - std::cerr << rException << std::endl; - std::cerr << "Aborting." << std::endl; - exit(EXIT_FAILURE); - } catch (...) { - std::cerr << "Program error! An unknow type of exception occurred. \n"; - std::cerr << "Aborting." << std::endl; - exit(EXIT_FAILURE); - } - - exit(EXIT_SUCCESS); } diff --git a/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp b/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp index 1869feb7..a0a43412 100644 --- a/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp +++ b/Samples/4_CUDA_Libraries/histEqualizationNPP/histEqualizationNPP.cpp @@ -36,288 +36,291 @@ #include #include #include +#include #include +#include #include #include - -#include -#include #include #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#define STRCASECMP _stricmp +#define STRCASECMP _stricmp #define STRNCASECMP _strnicmp #else -#define STRCASECMP strcasecmp +#define STRCASECMP strcasecmp #define STRNCASECMP strncasecmp #endif -inline int cudaDeviceInit(int argc, const char **argv) { - int deviceCount; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); +inline int cudaDeviceInit(int argc, const char **argv) +{ + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - if (deviceCount == 0) { - std::cerr << "CUDA error: no devices supporting CUDA." << std::endl; - exit(EXIT_FAILURE); - } + if (deviceCount == 0) { + std::cerr << "CUDA error: no devices supporting CUDA." 
<< std::endl; + exit(EXIT_FAILURE); + } - int dev = findCudaDevice(argc, argv); + int dev = findCudaDevice(argc, argv); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name - << std::endl; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name << std::endl; - checkCudaErrors(cudaSetDevice(dev)); + checkCudaErrors(cudaSetDevice(dev)); - return dev; + return dev; } -int main(int argc, char *argv[]) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char *argv[]) +{ + printf("%s Starting...\n\n", argv[0]); - try { - std::string sFilename; - char *filePath; + try { + std::string sFilename; + char *filePath; - cudaDeviceInit(argc, (const char **)argv); + cudaDeviceInit(argc, (const char **)argv); - NppStreamContext nppStreamCtx; - nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. + NppStreamContext nppStreamCtx; + nppStreamCtx.hStream = + 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. - cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - { - printf("CUDA error: no devices supporting CUDA.\n"); - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - } + cudaError_t cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) { + printf("CUDA error: no devices supporting CUDA.\n"); + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + } - const NppLibraryVersion *libVer = nppGetLibVersion(); + const NppLibraryVersion *libVer = nppGetLibVersion(); - printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); + printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); - int driverVersion, runtimeVersion; - cudaDriverGetVersion(&driverVersion); - cudaRuntimeGetVersion(&runtimeVersion); + int driverVersion, runtimeVersion; + cudaDriverGetVersion(&driverVersion); + cudaRuntimeGetVersion(&runtimeVersion); - printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); - printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10); + printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, - cudaDevAttrComputeCapabilityMajor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, + cudaDevAttrComputeCapabilityMajor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, - cudaDevAttrComputeCapabilityMinor, - nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, + cudaDevAttrComputeCapabilityMinor, + nppStreamCtx.nCudaDeviceId); + if (cudaError != cudaSuccess) + return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); + 
cudaError = cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags); - cudaDeviceProp oDeviceProperties; + cudaDeviceProp oDeviceProperties; - cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); + cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); - nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; - nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; - nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; - nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; + nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; + nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; + nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; + nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); - } else { - filePath = sdkFindFilePath("teapot512.pgm", argv[0]); - } + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", &filePath); + } + else { + filePath = sdkFindFilePath("teapot512.pgm", argv[0]); + } - if (filePath) { - sFilename = filePath; - } else { - sFilename = "teapot512.pgm"; - } + if (filePath) { + sFilename = filePath; + } + else { + sFilename = "teapot512.pgm"; + } - // if we specify the filename at the command line, then we only test - // sFilename. - int file_errors = 0; - std::ifstream infile(sFilename.data(), std::ifstream::in); + // if we specify the filename at the command line, then we only test + // sFilename. + int file_errors = 0; + std::ifstream infile(sFilename.data(), std::ifstream::in); - if (infile.good()) { - std::cout << "histEqualizationNPP opened: <" << sFilename.data() - << "> successfully!" << std::endl; - file_errors = 0; - infile.close(); - } else { - std::cout << "histEqualizationNPP unable to open: <" << sFilename.data() - << ">" << std::endl; - file_errors++; - infile.close(); - } + if (infile.good()) { + std::cout << "histEqualizationNPP opened: <" << sFilename.data() << "> successfully!" 
<< std::endl; + file_errors = 0; + infile.close(); + } + else { + std::cout << "histEqualizationNPP unable to open: <" << sFilename.data() << ">" << std::endl; + file_errors++; + infile.close(); + } - if (file_errors > 0) { - exit(EXIT_FAILURE); - } + if (file_errors > 0) { + exit(EXIT_FAILURE); + } - std::string dstFileName = sFilename; + std::string dstFileName = sFilename; - std::string::size_type dot = dstFileName.rfind('.'); + std::string::size_type dot = dstFileName.rfind('.'); - if (dot != std::string::npos) { - dstFileName = dstFileName.substr(0, dot); - } + if (dot != std::string::npos) { + dstFileName = dstFileName.substr(0, dot); + } - dstFileName += "_histEqualization.pgm"; + dstFileName += "_histEqualization.pgm"; - if (checkCmdLineFlag(argc, (const char **)argv, "output")) { - char *outputFilePath; - getCmdLineArgumentString(argc, (const char **)argv, "output", - &outputFilePath); - dstFileName = outputFilePath; - } + if (checkCmdLineFlag(argc, (const char **)argv, "output")) { + char *outputFilePath; + getCmdLineArgumentString(argc, (const char **)argv, "output", &outputFilePath); + dstFileName = outputFilePath; + } - npp::ImageCPU_8u_C1 oHostSrc; - npp::loadImage(sFilename, oHostSrc); - npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); + npp::ImageCPU_8u_C1 oHostSrc; + npp::loadImage(sFilename, oHostSrc); + npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); - // - // allocate arrays for histogram and levels - // + // + // allocate arrays for histogram and levels + // - const int binCount = 255; - const int levelCount = binCount + 1; // levels array has one more element + const int binCount = 255; + const int levelCount = binCount + 1; // levels array has one more element - Npp32s *histDevice = 0; - Npp32s *levelsDevice = 0; + Npp32s *histDevice = 0; + Npp32s *levelsDevice = 0; - NPP_CHECK_CUDA(cudaMalloc((void **)&histDevice, binCount * sizeof(Npp32s))); - NPP_CHECK_CUDA( - cudaMalloc((void **)&levelsDevice, levelCount * sizeof(Npp32s))); + NPP_CHECK_CUDA(cudaMalloc((void **)&histDevice, binCount * sizeof(Npp32s))); + NPP_CHECK_CUDA(cudaMalloc((void **)&levelsDevice, levelCount * sizeof(Npp32s))); - // - // compute histogram - // + // + // compute histogram + // - NppiSize oSizeROI = {(int)oDeviceSrc.width(), - (int)oDeviceSrc.height()}; // full image - // create device scratch buffer for nppiHistogram - size_t nDeviceBufferSize; - nppiHistogramEvenGetBufferSize_8u_C1R_Ctx(oSizeROI, levelCount, - &nDeviceBufferSize, - nppStreamCtx); - Npp8u *pDeviceBuffer; - NPP_CHECK_CUDA(cudaMalloc((void **)&pDeviceBuffer, nDeviceBufferSize)); + NppiSize oSizeROI = {(int)oDeviceSrc.width(), (int)oDeviceSrc.height()}; // full image + // create device scratch buffer for nppiHistogram + size_t nDeviceBufferSize; + nppiHistogramEvenGetBufferSize_8u_C1R_Ctx(oSizeROI, levelCount, &nDeviceBufferSize, nppStreamCtx); + Npp8u *pDeviceBuffer; + NPP_CHECK_CUDA(cudaMalloc((void **)&pDeviceBuffer, nDeviceBufferSize)); - // compute levels values on host - Npp32s levelsHost[levelCount]; - NPP_CHECK_NPP(nppiEvenLevelsHost_32s(levelsHost, levelCount, 0, binCount)); - // compute the histogram - NPP_CHECK_NPP(nppiHistogramEven_8u_C1R_Ctx( - oDeviceSrc.data(), oDeviceSrc.pitch(), oSizeROI, histDevice, levelCount, - 0, binCount, pDeviceBuffer, nppStreamCtx)); - // copy histogram and levels to host memory - Npp32s histHost[binCount]; - NPP_CHECK_CUDA(cudaMemcpy(histHost, histDevice, binCount * sizeof(Npp32s), - cudaMemcpyDeviceToHost)); + // compute levels values on host + Npp32s levelsHost[levelCount]; + 
NPP_CHECK_NPP(nppiEvenLevelsHost_32s(levelsHost, levelCount, 0, binCount)); + // compute the histogram + NPP_CHECK_NPP(nppiHistogramEven_8u_C1R_Ctx(oDeviceSrc.data(), + oDeviceSrc.pitch(), + oSizeROI, + histDevice, + levelCount, + 0, + binCount, + pDeviceBuffer, + nppStreamCtx)); + // copy histogram and levels to host memory + Npp32s histHost[binCount]; + NPP_CHECK_CUDA(cudaMemcpy(histHost, histDevice, binCount * sizeof(Npp32s), cudaMemcpyDeviceToHost)); - Npp32s lutHost[levelCount]; + Npp32s lutHost[levelCount]; - // fill LUT - { - Npp32s *pHostHistogram = histHost; - Npp32s totalSum = 0; + // fill LUT + { + Npp32s *pHostHistogram = histHost; + Npp32s totalSum = 0; - for (; pHostHistogram < histHost + binCount; ++pHostHistogram) { - totalSum += *pHostHistogram; - } + for (; pHostHistogram < histHost + binCount; ++pHostHistogram) { + totalSum += *pHostHistogram; + } - NPP_ASSERT(totalSum <= oSizeROI.width * oSizeROI.height); + NPP_ASSERT(totalSum <= oSizeROI.width * oSizeROI.height); - if (totalSum == 0) { - totalSum = 1; - } + if (totalSum == 0) { + totalSum = 1; + } - float multiplier = 1.0f / float(oSizeROI.width * oSizeROI.height) * 0xFF; + float multiplier = 1.0f / float(oSizeROI.width * oSizeROI.height) * 0xFF; - Npp32s runningSum = 0; - Npp32s *pLookupTable = lutHost; + Npp32s runningSum = 0; + Npp32s *pLookupTable = lutHost; - for (pHostHistogram = histHost; pHostHistogram < histHost + binCount; - ++pHostHistogram) { - *pLookupTable = (Npp32s)(runningSum * multiplier + 0.5f); - pLookupTable++; - runningSum += *pHostHistogram; - } + for (pHostHistogram = histHost; pHostHistogram < histHost + binCount; ++pHostHistogram) { + *pLookupTable = (Npp32s)(runningSum * multiplier + 0.5f); + pLookupTable++; + runningSum += *pHostHistogram; + } - lutHost[binCount] = 0xFF; // last element is always 1 - } + lutHost[binCount] = 0xFF; // last element is always 1 + } - // - // apply LUT transformation to the image - // - // Create a device image for the result. - npp::ImageNPP_8u_C1 oDeviceDst(oDeviceSrc.size()); + // + // apply LUT transformation to the image + // + // Create a device image for the result. 
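The LUT fill above is standard histogram equalization: each entry is the exclusive cumulative histogram count, rescaled so the full pixel population spans 0..255, with the top level pinned to 0xFF. A minimal host-side sketch of the same computation, using plain int in place of the Npp32s typedef:

// Build an 8-bit equalization LUT: lut[i] maps input level i to the
// cumulative share of pixels in bins [0, i), scaled to [0, 255].
// lut must have room for binCount + 1 entries.
void buildEqualizationLUT(const int *hist, int binCount, int totalPixels, int *lut)
{
    if (totalPixels == 0)
        totalPixels = 1; // guard against an empty ROI, as the sample does
    float scale      = 255.0f / (float)totalPixels;
    int   runningSum = 0; // exclusive prefix sum of the histogram
    for (int i = 0; i < binCount; ++i) {
        lut[i] = (int)(runningSum * scale + 0.5f);
        runningSum += hist[i];
    }
    lut[binCount] = 0xFF; // top level always maps to full intensity
}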
+ npp::ImageNPP_8u_C1 oDeviceDst(oDeviceSrc.size()); #if CUDART_VERSION >= 5000 - // Note for CUDA 5.0, that nppiLUT_Linear_8u_C1R requires these pointers to - // be in GPU device memory - Npp32s *lutDevice = 0; - Npp32s *lvlsDevice = 0; + // Note for CUDA 5.0, that nppiLUT_Linear_8u_C1R requires these pointers to + // be in GPU device memory + Npp32s *lutDevice = 0; + Npp32s *lvlsDevice = 0; - NPP_CHECK_CUDA( - cudaMalloc((void **)&lutDevice, sizeof(Npp32s) * (levelCount))); - NPP_CHECK_CUDA( - cudaMalloc((void **)&lvlsDevice, sizeof(Npp32s) * (levelCount))); + NPP_CHECK_CUDA(cudaMalloc((void **)&lutDevice, sizeof(Npp32s) * (levelCount))); + NPP_CHECK_CUDA(cudaMalloc((void **)&lvlsDevice, sizeof(Npp32s) * (levelCount))); - NPP_CHECK_CUDA(cudaMemcpy(lutDevice, lutHost, sizeof(Npp32s) * (levelCount), - cudaMemcpyHostToDevice)); - NPP_CHECK_CUDA(cudaMemcpy(lvlsDevice, levelsHost, - sizeof(Npp32s) * (levelCount), - cudaMemcpyHostToDevice)); + NPP_CHECK_CUDA(cudaMemcpy(lutDevice, lutHost, sizeof(Npp32s) * (levelCount), cudaMemcpyHostToDevice)); + NPP_CHECK_CUDA(cudaMemcpy(lvlsDevice, levelsHost, sizeof(Npp32s) * (levelCount), cudaMemcpyHostToDevice)); - NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx( - oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), - oDeviceDst.pitch(), oSizeROI, - lutDevice, // value and level arrays are in host memory - lvlsDevice, levelCount, - nppStreamCtx)); + NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx(oDeviceSrc.data(), + oDeviceSrc.pitch(), + oDeviceDst.data(), + oDeviceDst.pitch(), + oSizeROI, + lutDevice, // value and level arrays are in device memory + lvlsDevice, + levelCount, + nppStreamCtx)); - NPP_CHECK_CUDA(cudaFree(lutDevice)); - NPP_CHECK_CUDA(cudaFree(lvlsDevice)); + NPP_CHECK_CUDA(cudaFree(lutDevice)); + NPP_CHECK_CUDA(cudaFree(lvlsDevice)); #else - NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx( - oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), - oDeviceDst.pitch(), oSizeROI, - lutHost, // value and level arrays are in host memory - levelsHost, levelCount, - nppStreamCtx)); + NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R_Ctx(oDeviceSrc.data(), + oDeviceSrc.pitch(), + oDeviceDst.data(), + oDeviceDst.pitch(), + oSizeROI, + lutHost, // value and level arrays are in host memory + levelsHost, + levelCount, + nppStreamCtx)); #endif - // copy the result image back into the storage that contained the - // input image - npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); - oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); + // copy the result image back into the storage that contained the + // input image + npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); + oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); - cudaFree(histDevice); - cudaFree(levelsDevice); - cudaFree(pDeviceBuffer); - nppiFree(oDeviceSrc.data()); - nppiFree(oDeviceDst.data()); + cudaFree(histDevice); + cudaFree(levelsDevice); + cudaFree(pDeviceBuffer); + nppiFree(oDeviceSrc.data()); + nppiFree(oDeviceDst.data()); - // save the result - npp::saveImage(dstFileName.c_str(), oHostDst); - std::cout << "Saved image file " << dstFileName << std::endl; - exit(EXIT_SUCCESS); - } catch (npp::Exception &rException) { - std::cerr << "Program error! The following exception occurred: \n"; - std::cerr << rException << std::endl; - std::cerr << "Aborting." << std::endl; - exit(EXIT_FAILURE); - } catch (...) { - std::cerr << "Program error! An unknow type of exception occurred. \n"; - std::cerr << "Aborting."
<< std::endl; - exit(EXIT_FAILURE); - } + // save the result + npp::saveImage(dstFileName.c_str(), oHostDst); + std::cout << "Saved image file " << dstFileName << std::endl; + exit(EXIT_SUCCESS); + } + catch (npp::Exception &rException) { + std::cerr << "Program error! The following exception occurred: \n"; + std::cerr << rException << std::endl; + std::cerr << "Aborting." << std::endl; + exit(EXIT_FAILURE); + } + catch (...) { + std::cerr << "Program error! An unknown type of exception occurred. \n"; + std::cerr << "Aborting." << std::endl; + exit(EXIT_FAILURE); + } - return 0; + return 0; } diff --git a/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp b/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp index 7b6ae4a8..0bf1d732 100644 --- a/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp +++ b/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp @@ -25,54 +25,51 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include +#include #include #include -#include -#include #define NUM_THREADS 128 -#define NUM_BLOCKS 32 +#define NUM_BLOCKS 32 -#define NVRTC_SAFE_CALL(x) \ - do { \ - nvrtcResult result = x; \ - if (result != NVRTC_SUCCESS) { \ - std::cerr << "\nerror: " #x " failed with error " \ - << nvrtcGetErrorString(result) << '\n'; \ - exit(1); \ - } \ - } while(0) -#define CUDA_SAFE_CALL(x) \ - do { \ - CUresult result = x; \ - if (result != CUDA_SUCCESS) { \ - const char *msg; \ - cuGetErrorName(result, &msg); \ - std::cerr << "\nerror: " #x " failed with error " \ - << msg << '\n'; \ - exit(1); \ - } \ - } while(0) -#define NVJITLINK_SAFE_CALL(h,x) \ - do { \ - nvJitLinkResult result = x; \ - if (result != NVJITLINK_SUCCESS) { \ - std::cerr << "\nerror: " #x " failed with error " \ - << result << '\n'; \ - size_t lsize; \ - result = nvJitLinkGetErrorLogSize(h, &lsize); \ - if (result == NVJITLINK_SUCCESS && lsize > 0) { \ - char *log = (char*)malloc(lsize); \ - result = nvJitLinkGetErrorLog(h, log); \ - if (result == NVJITLINK_SUCCESS) { \ - std::cerr << "error log: " << log << '\n'; \ - free(log); \ - } \ - } \ - exit(1); \ - } \ - } while(0) +#define NVRTC_SAFE_CALL(x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " << nvrtcGetErrorString(result) << '\n'; \ + exit(1); \ + } \ + } while (0) +#define CUDA_SAFE_CALL(x) \ + do { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS) { \ + const char *msg; \ + cuGetErrorName(result, &msg); \ + std::cerr << "\nerror: " #x " failed with error " << msg << '\n'; \ + exit(1); \ + } \ + } while (0) +#define NVJITLINK_SAFE_CALL(h, x) \ + do { \ + nvJitLinkResult result = x; \ + if (result != NVJITLINK_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " << result << '\n'; \ + size_t lsize; \ + result = nvJitLinkGetErrorLogSize(h, &lsize); \ + if (result == NVJITLINK_SUCCESS && lsize > 0) { \ + char *log = (char *)malloc(lsize); \ + result = nvJitLinkGetErrorLog(h, log); \ + if (result == NVJITLINK_SUCCESS) { \ + std::cerr << "error log: " << log << '\n'; \ + free(log); \ + } \ + } \ + exit(1); \ + } \ + } while (0) const char *lto_saxpy = " \n\ extern __device__ float compute(float a, float x, float y); \n\ @@ -92,170 +89,171 @@ __device__ float compute(float a, float x, float y) { \n\ } \n"; // compile code into LTOIR, returning the IR and its size -static void getLTOIR (const char *code, const char *name, - char **ltoIR, size_t *ltoIRSize) +static void getLTOIR(const char *code, const char *name, char **ltoIR, size_t *ltoIRSize) { - // Create an
instance of nvrtcProgram with the code string. - nvrtcProgram prog; - NVRTC_SAFE_CALL( - nvrtcCreateProgram(&prog, // prog - code, // buffer - name, // name - 0, // numHeaders - NULL, // headers - NULL)); // includeNames - - // specify that LTO IR should be generated for LTO operation - const char *opts[] = {"-dlto", - "--relocatable-device-code=true"}; - nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog - 2, // numOptions - opts); // options - // Obtain compilation log from the program. - size_t logSize; - NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); - char *log = new char[logSize]; - NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); - std::cout << log << '\n'; - delete[] log; - if (compileResult != NVRTC_SUCCESS) { - exit(1); - } - // Obtain generated LTO IR from the program. - NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize)); - *ltoIR = new char[*ltoIRSize]; - NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR)); - // Destroy the program. - NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); + // Create an instance of nvrtcProgram with the code string. + nvrtcProgram prog; + NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog + code, // buffer + name, // name + 0, // numHeaders + NULL, // headers + NULL)); // includeNames + + // specify that LTO IR should be generated for LTO operation + const char *opts[] = {"-dlto", "--relocatable-device-code=true"}; + nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog + 2, // numOptions + opts); // options + // Obtain compilation log from the program. + size_t logSize; + NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); + char *log = new char[logSize]; + NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); + std::cout << log << '\n'; + delete[] log; + if (compileResult != NVRTC_SUCCESS) { + exit(1); + } + // Obtain generated LTO IR from the program. + NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize)); + *ltoIR = new char[*ltoIRSize]; + NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR)); + // Destroy the program. + NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); } int main(int argc, char *argv[]) { - unsigned int cuda_major = 0; - unsigned int cuda_minor = 0; - nvJitLinkResult res = nvJitLinkVersion(&cuda_major, &cuda_minor); - if (res != NVJITLINK_SUCCESS) { - std::cerr << "Version check failed" << '\n'; - } else { - std::cout << "CUDA " << cuda_major << "." << cuda_minor << '\n'; - } + unsigned int cuda_major = 0; + unsigned int cuda_minor = 0; + nvJitLinkResult res = nvJitLinkVersion(&cuda_major, &cuda_minor); + if (res != NVJITLINK_SUCCESS) { + std::cerr << "Version check failed" << '\n'; + } + else { + std::cout << "CUDA " << cuda_major << "." << cuda_minor << '\n'; + } - char *ltoIR1; - char *ltoIR2; - size_t ltoIR1Size; - size_t ltoIR2Size; - // getLTOIR uses nvrtc to get the LTOIR. - // We could also use nvcc offline with -dlto -fatbin - // to generate the IR, but using nvrtc keeps the build simpler. - getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size); - getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size); + char *ltoIR1; + char *ltoIR2; + size_t ltoIR1Size; + size_t ltoIR2Size; + // getLTOIR uses nvrtc to get the LTOIR. + // We could also use nvcc offline with -dlto -fatbin + // to generate the IR, but using nvrtc keeps the build simpler.
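As the comments above note, the LTO IR could equally be produced offline with nvcc (-dlto -fatbin) rather than through NVRTC. In that case the link stage could ingest the files directly through nvJitLinkAddFile; a sketch, where the file names and the fixed -arch value are illustrative stand-ins for real paths and the dynamically detected architecture used below:

// Hypothetical offline variant: link fatbins produced by
// `nvcc -dlto -rdc=true -fatbin` instead of in-memory LTO IR blobs.
nvJitLinkHandle fileHandle;
const char *fileOpts[] = {"-lto", "-arch=sm_90"}; // arch value is illustrative
NVJITLINK_SAFE_CALL(fileHandle, nvJitLinkCreate(&fileHandle, 2, fileOpts));
NVJITLINK_SAFE_CALL(fileHandle, nvJitLinkAddFile(fileHandle, NVJITLINK_INPUT_FATBIN, "lto_saxpy.fatbin"));
NVJITLINK_SAFE_CALL(fileHandle, nvJitLinkAddFile(fileHandle, NVJITLINK_INPUT_FATBIN, "lto_compute.fatbin"));
NVJITLINK_SAFE_CALL(fileHandle, nvJitLinkComplete(fileHandle));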
+ getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size); + getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size); - CUdevice cuDevice; - CUcontext context; - CUmodule module; - CUfunction kernel; - CUDA_SAFE_CALL(cuInit(0)); - CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); - CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); + CUdevice cuDevice; + CUcontext context; + CUmodule module; + CUfunction kernel; + CUDA_SAFE_CALL(cuInit(0)); + CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); + CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); - // Dynamically determine the arch to link for - int major = 0; - int minor = 0; - CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - int arch = major*10 + minor; - char smbuf[16]; - memset(smbuf,0,16); - sprintf(smbuf, "-arch=sm_%d", arch); + // Dynamically determine the arch to link for + int major = 0; + int minor = 0; + CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + int arch = major * 10 + minor; + char smbuf[16]; + memset(smbuf, 0, 16); + sprintf(smbuf, "-arch=sm_%d", arch); - // Load the generated LTO IR and link them together - nvJitLinkHandle handle; - const char *lopts[] = {"-lto", smbuf}; - NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts)); + // Load the generated LTO IR and link them together + nvJitLinkHandle handle; + const char *lopts[] = {"-lto", smbuf}; + NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts)); - NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, - (void *)ltoIR1, ltoIR1Size, "lto_saxpy")); - NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, - (void *)ltoIR2, ltoIR2Size, "lto_compute")); + NVJITLINK_SAFE_CALL(handle, + nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)ltoIR1, ltoIR1Size, "lto_saxpy")); + NVJITLINK_SAFE_CALL(handle, + nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, (void *)ltoIR2, ltoIR2Size, "lto_compute")); - // The call to nvJitLinkComplete causes linker to link together the two - // LTO IR modules, do optimization on the linked LTO IR, - // and generate cubin from it. + // The call to nvJitLinkComplete causes linker to link together the two + // LTO IR modules, do optimization on the linked LTO IR, + // and generate cubin from it.
+ NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle)); - // check error log - size_t logSize; - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize)); - if (logSize > 0) { - char *log = (char*)malloc(logSize+1); - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log)); - std::cout << "Error log: " << log << std::endl; - free(log); - } + // check error log + size_t logSize; + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize)); + if (logSize > 0) { + char *log = (char *)malloc(logSize + 1); + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log)); + std::cout << "Error log: " << log << std::endl; + free(log); + } - // get linked cubin - size_t cubinSize; - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); - void *cubin = malloc(cubinSize); - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); + // get linked cubin + size_t cubinSize; + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); + void *cubin = malloc(cubinSize); + NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); - NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); - delete[] ltoIR1; - delete[] ltoIR2; + NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); + delete[] ltoIR1; + delete[] ltoIR2; - // cubin is linked, so now load it - CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin)); - CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy")); - - // Generate input for execution, and create output buffers. - size_t n = NUM_THREADS * NUM_BLOCKS; - size_t bufferSize = n * sizeof(float); - float a = 5.1f; - float *hX = new float[n], *hY = new float[n], *hOut = new float[n]; - for (size_t i = 0; i < n; ++i) { - hX[i] = static_cast<float>(i); - hY[i] = static_cast<float>(i * 2); - } - CUdeviceptr dX, dY, dOut; - CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize)); - CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize)); - CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize)); - CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize)); - CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize)); - // Execute SAXPY. - void *args[] = { &a, &dX, &dY, &dOut, &n }; - CUDA_SAFE_CALL( - cuLaunchKernel(kernel, - NUM_BLOCKS, 1, 1, // grid dim - NUM_THREADS, 1, 1, // block dim - 0, NULL, // shared mem and stream - args, 0)); // arguments - CUDA_SAFE_CALL(cuCtxSynchronize()); - // Retrieve and print output. - CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize)); - - for (size_t i = 0; i < n; ++i) { - std::cout << a << " * " << hX[i] << " + " << hY[i] - << " = " << hOut[i] << '\n'; - } - // check last value to verify - if (hOut[n-1] == 29074.5) { - std::cout << "PASSED!\n"; - } else { - std::cout << "values not expected?\n"; - } - // Release resources. - CUDA_SAFE_CALL(cuMemFree(dX)); - CUDA_SAFE_CALL(cuMemFree(dY)); - CUDA_SAFE_CALL(cuMemFree(dOut)); - CUDA_SAFE_CALL(cuModuleUnload(module)); - CUDA_SAFE_CALL(cuCtxDestroy(context)); - free(cubin); - delete[] hX; - delete[] hY; - delete[] hOut; - return 0; + // cubin is linked, so now load it + CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin)); + CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy")); + + // Generate input for execution, and create output buffers.
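For reference, the hard-coded pass value checked at the end of main follows directly from the initialization below: n = NUM_THREADS * NUM_BLOCKS = 128 * 32 = 4096, so the last element has x = 4095 and y = 2 * 4095 = 8190, and a * x + y = 5.1 * 4095 + 8190 = 20884.5 + 8190 = 29074.5, which is the constant compared against hOut[n - 1].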
+ size_t n = NUM_THREADS * NUM_BLOCKS; + size_t bufferSize = n * sizeof(float); + float a = 5.1f; + float *hX = new float[n], *hY = new float[n], *hOut = new float[n]; + for (size_t i = 0; i < n; ++i) { + hX[i] = static_cast<float>(i); + hY[i] = static_cast<float>(i * 2); + } + CUdeviceptr dX, dY, dOut; + CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize)); + CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize)); + CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize)); + CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize)); + CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize)); + // Execute SAXPY. + void *args[] = {&a, &dX, &dY, &dOut, &n}; + CUDA_SAFE_CALL(cuLaunchKernel(kernel, + NUM_BLOCKS, + 1, + 1, // grid dim + NUM_THREADS, + 1, + 1, // block dim + 0, + NULL, // shared mem and stream + args, + 0)); // arguments + CUDA_SAFE_CALL(cuCtxSynchronize()); + // Retrieve and print output. + CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize)); + + for (size_t i = 0; i < n; ++i) { + std::cout << a << " * " << hX[i] << " + " << hY[i] << " = " << hOut[i] << '\n'; + } + // check last value to verify + if (hOut[n - 1] == 29074.5) { + std::cout << "PASSED!\n"; + } + else { + std::cout << "values not expected?\n"; + } + // Release resources. + CUDA_SAFE_CALL(cuMemFree(dX)); + CUDA_SAFE_CALL(cuMemFree(dY)); + CUDA_SAFE_CALL(cuMemFree(dOut)); + CUDA_SAFE_CALL(cuModuleUnload(module)); + CUDA_SAFE_CALL(cuCtxDestroy(context)); + free(cubin); + delete[] hX; + delete[] hY; + delete[] hOut; + return 0; } diff --git a/Samples/4_CUDA_Libraries/lineOfSight/lineOfSight.cu b/Samples/4_CUDA_Libraries/lineOfSight/lineOfSight.cu index 7a49f020..0bb46993 100644 --- a/Samples/4_CUDA_Libraries/lineOfSight/lineOfSight.cu +++ b/Samples/4_CUDA_Libraries/lineOfSight/lineOfSight.cu @@ -38,22 +38,22 @@ #endif // includes, system -#include -#include -#include -#include #include +#include +#include +#include +#include // includes, project -#include #include +#include #include // includes, library +#include #include #include #include -#include //////////////////////////////////////////////////////////////////////////////// // declaration, types @@ -63,200 +63,203 @@ typedef unsigned char Bool; enum { False = 0, True = 1 }; // 2D height field -struct HeightField { - int width; - float *height; +struct HeightField +{ + int width; + float *height; }; // Ray -struct Ray { - float3 origin; - float2 dir; - int length; - float oneOverLength; +struct Ray +{ + float3 origin; + float2 dir; + int length; + float oneOverLength; }; //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // declaration, forward -int runTest(int argc, char **argv); -__global__ void computeAngles_kernel(const Ray, float *, cudaTextureObject_t); -__global__ void computeVisibilities_kernel(const float *, const float *, int, - Bool *); -void lineOfSight_gold(const HeightField, const Ray, Bool *); +int runTest(int argc, char **argv); +__global__ void computeAngles_kernel(const Ray, float *, cudaTextureObject_t); +__global__ void computeVisibilities_kernel(const float *, const float *, int, Bool *); +void lineOfSight_gold(const HeightField, const Ray, Bool *); __device__ __host__ float2 getLocation(const Ray, int); -__device__ __host__ float getAngle(const Ray, float2, float); +__device__ __host__ float getAngle(const Ray, float2, float); //////////////////////////////////////////////////////////////////////////////// // Program main
//////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - int res = runTest(argc, argv); +int main(int argc, char **argv) +{ + int res = runTest(argc, argv); - if (res != 1) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (res != 1) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// //! Run a line-of-sight test for CUDA //////////////////////////////////////////////////////////////////////////////// -int runTest(int argc, char **argv) { - //////////////////////////////////////////////////////////////////////////// - // Device initialization +int runTest(int argc, char **argv) +{ + //////////////////////////////////////////////////////////////////////////// + // Device initialization - printf("[%s] - Starting...\n", argv[0]); + printf("[%s] - Starting...\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - //////////////////////////////////////////////////////////////////////////// - // Timer + //////////////////////////////////////////////////////////////////////////// + // Timer - // Create - StopWatchInterface *timer; - sdkCreateTimer(&timer); + // Create + StopWatchInterface *timer; + sdkCreateTimer(&timer); - // Number of iterations to get accurate timing - uint numIterations = 100; + // Number of iterations to get accurate timing + uint numIterations = 100; - //////////////////////////////////////////////////////////////////////////// - // Height field + //////////////////////////////////////////////////////////////////////////// + // Height field - HeightField heightField; + HeightField heightField; - // Allocate in host memory - int2 dim = make_int2(10000, 100); - heightField.width = dim.x; - thrust::host_vector<float> height(dim.x * dim.y); - heightField.height = (float *)&height[0]; + // Allocate in host memory + int2 dim = make_int2(10000, 100); + heightField.width = dim.x; + thrust::host_vector<float> height(dim.x * dim.y); + heightField.height = (float *)&height[0]; - // - // Fill in with an arbitrary sine surface - for (int x = 0; x < dim.x; ++x) - for (int y = 0; y < dim.y; ++y) { - float amp = 0.1f * (x + y); - float period = 2.0f + amp; - *(heightField.height + dim.x * y + x) = - amp * (sinf(sqrtf((float)(x * x + y * y)) * 2.0f * 3.1416f / period) + - 1.0f); + // + // Fill in with an arbitrary sine surface + for (int x = 0; x < dim.x; ++x) + for (int y = 0; y < dim.y; ++y) { + float amp = 0.1f * (x + y); + float period = 2.0f + amp; + *(heightField.height + dim.x * y + x) = + amp * (sinf(sqrtf((float)(x * x + y * y)) * 2.0f * 3.1416f / period) + 1.0f); + } + + // Allocate CUDA array in device memory + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaArray *heightFieldArray; + checkCudaErrors(cudaMallocArray(&heightFieldArray, &channelDesc, dim.x, dim.y)); + + // Initialize device memory + checkCudaErrors(cudaMemcpy2DToArray(heightFieldArray, + 0, + 0, + heightField.height, + dim.x * sizeof(float), + dim.x * sizeof(float), + dim.y, + cudaMemcpyHostToDevice)); + + cudaTextureObject_t heightFieldTex; + cudaResourceDesc texRes; + memset(&texRes,
0, sizeof(cudaResourceDesc)); + + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = heightFieldArray; + + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeElementType; + + checkCudaErrors(cudaCreateTextureObject(&heightFieldTex, &texRes, &texDescr, NULL)); + + ////////////////////////////////////////////////////////////////////////////// + // Ray (starts at origin and traverses the height field diagonally) + + Ray ray; + ray.origin = make_float3(0, 0, 2.0f); + int2 dir = make_int2(dim.x - 1, dim.y - 1); + ray.dir = make_float2((float)dir.x, (float)dir.y); + ray.length = max(abs(dir.x), abs(dir.y)); + ray.oneOverLength = 1.0f / ray.length; + + ////////////////////////////////////////////////////////////////////////////// + // View angles + + // Allocate view angles for each point along the ray + thrust::device_vector<float> d_angles(ray.length); + + // Allocate result of max-scan operation on the array of view angles + thrust::device_vector<float> d_scannedAngles(ray.length); + + ////////////////////////////////////////////////////////////////////////////// + // Visibility results + + // Allocate visibility results for each point along the ray + thrust::device_vector<Bool> d_visibilities(ray.length); + thrust::host_vector<Bool> h_visibilities(ray.length); + thrust::host_vector<Bool> h_visibilitiesRef(ray.length); + + ////////////////////////////////////////////////////////////////////////////// + // Reference solution + lineOfSight_gold(heightField, ray, (Bool *)&h_visibilitiesRef[0]); + + ////////////////////////////////////////////////////////////////////////////// + // Device solution + + // Execution configuration + dim3 block(256); + dim3 grid((uint)ceil(ray.length / (double)block.x)); + + // Compute device solution + printf("Line of sight\n"); + sdkStartTimer(&timer); + + for (uint i = 0; i < numIterations; ++i) { + // Compute view angle for each point along the ray + computeAngles_kernel<<<grid, block>>>(ray, thrust::raw_pointer_cast(&d_angles[0]), heightFieldTex); + getLastCudaError("Kernel execution failed"); + + // Perform a max-scan operation on the array of view angles + thrust::inclusive_scan(d_angles.begin(), d_angles.end(), d_scannedAngles.begin(), thrust::maximum<float>()); + getLastCudaError("Kernel execution failed"); + + // Compute visibility results based on the array of view angles + // and its scanned version + computeVisibilities_kernel<<<grid, block>>>(thrust::raw_pointer_cast(&d_angles[0]), + thrust::raw_pointer_cast(&d_scannedAngles[0]), + ray.length, + thrust::raw_pointer_cast(&d_visibilities[0])); + getLastCudaError("Kernel execution failed"); } - // Allocate CUDA array in device memory - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); - cudaArray *heightFieldArray; - checkCudaErrors( - cudaMallocArray(&heightFieldArray, &channelDesc, dim.x, dim.y)); - - // Initialize device memory - checkCudaErrors(cudaMemcpy2DToArray( - heightFieldArray, 0, 0, heightField.height, dim.x * sizeof(float), - dim.x * sizeof(float), dim.y, cudaMemcpyHostToDevice)); - - cudaTextureObject_t heightFieldTex; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = heightFieldArray; - - cudaTextureDesc texDescr; - memset(&texDescr, 0,
sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeElementType; - - checkCudaErrors( - cudaCreateTextureObject(&heightFieldTex, &texRes, &texDescr, NULL)); - - ////////////////////////////////////////////////////////////////////////////// - // Ray (starts at origin and traverses the height field diagonally) - - Ray ray; - ray.origin = make_float3(0, 0, 2.0f); - int2 dir = make_int2(dim.x - 1, dim.y - 1); - ray.dir = make_float2((float)dir.x, (float)dir.y); - ray.length = max(abs(dir.x), abs(dir.y)); - ray.oneOverLength = 1.0f / ray.length; - - ////////////////////////////////////////////////////////////////////////////// - // View angles - - // Allocate view angles for each point along the ray - thrust::device_vector<float> d_angles(ray.length); - - // Allocate result of max-scan operation on the array of view angles - thrust::device_vector<float> d_scannedAngles(ray.length); - - ////////////////////////////////////////////////////////////////////////////// - // Visibility results - - // Allocate visibility results for each point along the ray - thrust::device_vector<Bool> d_visibilities(ray.length); - thrust::host_vector<Bool> h_visibilities(ray.length); - thrust::host_vector<Bool> h_visibilitiesRef(ray.length); - - ////////////////////////////////////////////////////////////////////////////// - // Reference solution - lineOfSight_gold(heightField, ray, (Bool *)&h_visibilitiesRef[0]); - - ////////////////////////////////////////////////////////////////////////////// - // Device solution - - // Execution configuration - dim3 block(256); - dim3 grid((uint)ceil(ray.length / (double)block.x)); - - // Compute device solution - printf("Line of sight\n"); - sdkStartTimer(&timer); - - for (uint i = 0; i < numIterations; ++i) { - // Compute view angle for each point along the ray - computeAngles_kernel<<<grid, block>>>( - ray, thrust::raw_pointer_cast(&d_angles[0]), heightFieldTex); + cudaDeviceSynchronize(); + sdkStopTimer(&timer); getLastCudaError("Kernel execution failed"); - // Perform a max-scan operation on the array of view angles - thrust::inclusive_scan(d_angles.begin(), d_angles.end(), - d_scannedAngles.begin(), thrust::maximum<float>()); - getLastCudaError("Kernel execution failed"); + // Copy visibility results back to the host + thrust::copy(d_visibilities.begin(), d_visibilities.end(), h_visibilities.begin()); - // Compute visibility results based on the array of view angles - // and its scanned version - computeVisibilities_kernel<<<grid, block>>>( - thrust::raw_pointer_cast(&d_angles[0]), - thrust::raw_pointer_cast(&d_scannedAngles[0]), ray.length, - thrust::raw_pointer_cast(&d_visibilities[0])); - getLastCudaError("Kernel execution failed"); - } + // Compare device visibility results against reference results + bool res = compareData(thrust::raw_pointer_cast(&h_visibilitiesRef[0]), + thrust::raw_pointer_cast(&h_visibilities[0]), + ray.length, + 0.0f, + 0.0f); + printf("Average time: %f ms\n\n", sdkGetTimerValue(&timer) / numIterations); + sdkResetTimer(&timer); - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - getLastCudaError("Kernel execution failed"); - - // Copy visibility results back to the host - thrust::copy(d_visibilities.begin(), d_visibilities.end(), - h_visibilities.begin()); - - // Compare device visibility results against reference results - bool res = compareData(thrust::raw_pointer_cast(&h_visibilitiesRef[0]), -
thrust::raw_pointer_cast(&h_visibilities[0]), - ray.length, 0.0f, 0.0f); - printf("Average time: %f ms\n\n", sdkGetTimerValue(&timer) / numIterations); - sdkResetTimer(&timer); - - // Cleanup memory - checkCudaErrors(cudaFreeArray(heightFieldArray)); - return res; + // Cleanup memory + checkCudaErrors(cudaFreeArray(heightFieldArray)); + return res; } //////////////////////////////////////////////////////////////////////////////// @@ -264,16 +267,16 @@ int runTest(int argc, char **argv) { //! @param ray ray //! @param angles view angles //////////////////////////////////////////////////////////////////////////////// -__global__ void computeAngles_kernel(const Ray ray, float *angles, - cudaTextureObject_t HeightFieldTex) { - uint i = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void computeAngles_kernel(const Ray ray, float *angles, cudaTextureObject_t HeightFieldTex) +{ + uint i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < ray.length) { - float2 location = getLocation(ray, i + 1); - float height = tex2D<float>(HeightFieldTex, location.x, location.y); - float angle = getAngle(ray, location, height); - angles[i] = angle; - } + if (i < ray.length) { + float2 location = getLocation(ray, i + 1); + float height = tex2D<float>(HeightFieldTex, location.x, location.y); + float angle = getAngle(ray, location, height); + angles[i] = angle; + } } //////////////////////////////////////////////////////////////////////////////// @@ -284,14 +287,14 @@ __global__ void computeAngles_kernel(const Ray ray, float *angles, //! @param visibilities boolean array indicating the visibility of each point //! along the ray //////////////////////////////////////////////////////////////////////////////// -__global__ void computeVisibilities_kernel(const float *angles, - const float *scannedAngles, - int numAngles, Bool *visibilities) { - uint i = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void +computeVisibilities_kernel(const float *angles, const float *scannedAngles, int numAngles, Bool *visibilities) +{ + uint i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < numAngles) { - visibilities[i] = scannedAngles[i] <= angles[i]; - } + if (i < numAngles) { + visibilities[i] = scannedAngles[i] <= angles[i]; + } } //////////////////////////////////////////////////////////////////////////////// @@ -301,24 +304,23 @@ __global__ void computeVisibilities_kernel(const float *angles, //! @param visibilities boolean array indicating the visibility of each point //!
along the ray //////////////////////////////////////////////////////////////////////////////// -void lineOfSight_gold(const HeightField heightField, const Ray ray, - Bool *visibilities) { - float angleMax = asinf(-1.0f); +void lineOfSight_gold(const HeightField heightField, const Ray ray, Bool *visibilities) +{ + float angleMax = asinf(-1.0f); - for (int i = 0; i < ray.length; ++i) { - float2 location = getLocation(ray, i + 1); - float height = - *(heightField.height + heightField.width * (int)floorf(location.y) + - (int)floorf(location.x)); - float angle = getAngle(ray, location, height); + for (int i = 0; i < ray.length; ++i) { + float2 location = getLocation(ray, i + 1); + float height = *(heightField.height + heightField.width * (int)floorf(location.y) + (int)floorf(location.x)); + float angle = getAngle(ray, location, height); - if (angle > angleMax) { - angleMax = angle; - visibilities[i] = True; - } else { - visibilities[i] = False; + if (angle > angleMax) { + angleMax = angle; + visibilities[i] = True; + } + else { + visibilities[i] = False; + } } - } } //////////////////////////////////////////////////////////////////////////////// @@ -327,9 +329,10 @@ void lineOfSight_gold(const HeightField heightField, const Ray ray, //! @param ray ray //! @param i integer offset along the ray //////////////////////////////////////////////////////////////////////////////// -__device__ __host__ float2 getLocation(const Ray ray, int i) { - float step = i * ray.oneOverLength; - return make_float2(ray.origin.x, ray.origin.y) + ray.dir * step; +__device__ __host__ float2 getLocation(const Ray ray, int i) +{ + float step = i * ray.oneOverLength; + return make_float2(ray.origin.x, ray.origin.y) + ray.dir * step; } //////////////////////////////////////////////////////////////////////////////// @@ -338,8 +341,8 @@ __device__ __host__ float2 getLocation(const Ray ray, int i) { //! @param location 2D coordinates of the input point //! @param height height of the input point //////////////////////////////////////////////////////////////////////////////// -__device__ __host__ float getAngle(const Ray ray, float2 location, - float height) { - float2 dir = location - make_float2(ray.origin.x, ray.origin.y); - return atanf((height - ray.origin.z) / length(dir)); +__device__ __host__ float getAngle(const Ray ray, float2 location, float height) +{ + float2 dir = location - make_float2(ray.origin.x, ray.origin.y); + return atanf((height - ray.origin.z) / length(dir)); } diff --git a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp index 0cd33127..0a5adb0b 100644 --- a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp +++ b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp @@ -65,15 +65,15 @@ // Utilities and system includes #include -#include // helper for shared functions common to CUDA Samples +#include // helper for shared functions common to CUDA Samples // CUDA runtime -#include #include +#include // CUDA and CUBLAS functions -#include #include +#include #ifndef min #define min(a, b) ((a < b) ? a : b) @@ -83,8 +83,9 @@ #endif // Optional Command-line multiplier for matrix sizes -typedef struct _matrixSize { - unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; +typedef struct _matrixSize +{ + unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; } sMatrixSize; //////////////////////////////////////////////////////////////////////////////// @@ -96,270 +97,292 @@ typedef struct _matrixSize { //! 
@param hA height of matrix A //! @param wB width of matrix B //////////////////////////////////////////////////////////////////////////////// -void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, - unsigned int wA, unsigned int wB) { - for (unsigned int i = 0; i < hA; ++i) - for (unsigned int j = 0; j < wB; ++j) { - double sum = 0; +void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB) +{ + for (unsigned int i = 0; i < hA; ++i) + for (unsigned int j = 0; j < wB; ++j) { + double sum = 0; - for (unsigned int k = 0; k < wA; ++k) { - double a = A[i * wA + k]; - double b = B[k * wB + j]; - sum += a * b; - } + for (unsigned int k = 0; k < wA; ++k) { + double a = A[i * wA + k]; + double b = B[k * wB + j]; + sum += a * b; + } - C[i * wB + j] = (float)sum; - } + C[i * wB + j] = (float)sum; + } } // Allocates a matrix with random float entries. -void randomInit(float *data, int size) { - for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX; +void randomInit(float *data, int size) +{ + for (int i = 0; i < size; ++i) + data[i] = rand() / (float)RAND_MAX; } -void printDiff(float *data1, float *data2, int width, int height, - int iListLength, float fListTol) { - printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol); - int i, j, k; - int error_count = 0; +void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol) +{ + printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol); + int i, j, k; + int error_count = 0; - for (j = 0; j < height; j++) { - if (error_count < iListLength) { - printf("\n Row %d:\n", j); - } - - for (i = 0; i < width; i++) { - k = j * width + i; - float fDiff = fabs(data1[k] - data2[k]); - - if (fDiff > fListTol) { + for (j = 0; j < height; j++) { if (error_count < iListLength) { - printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, - data1[k], data2[k], fDiff); + printf("\n Row %d:\n", j); } - error_count++; - } - } - } + for (i = 0; i < width; i++) { + k = j * width + i; + float fDiff = fabs(data1[k] - data2[k]); - printf(" \n Total Errors = %d\n", error_count); + if (fDiff > fListTol) { + if (error_count < iListLength) { + printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff); + } + + error_count++; + } + } + } + + printf(" \n Total Errors = %d\n", error_count); } -void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, - sMatrixSize &matrix_size) { - // By default, we use device 0, otherwise we override the device ID based on - // what is provided at the command line - cudaError_t error; - devID = 0; +void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size) +{ + // By default, we use device 0, otherwise we override the device ID based on + // what is provided at the command line + cudaError_t error; + devID = 0; - devID = findCudaDevice(argc, (const char **)argv); + devID = findCudaDevice(argc, (const char **)argv); - if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) { - iSizeMultiple = - getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) { + iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); + } - iSizeMultiple = min(iSizeMultiple, 10); - iSizeMultiple = max(iSizeMultiple, 1); + iSizeMultiple = min(iSizeMultiple, 10); + iSizeMultiple = max(iSizeMultiple, 1); - cudaDeviceProp deviceProp; + 
cudaDeviceProp deviceProp; - error = cudaGetDeviceProperties(&deviceProp, devID); + error = cudaGetDeviceProperties(&deviceProp, devID); - if (error != cudaSuccess) { - printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, - __LINE__); - exit(EXIT_FAILURE); - } + if (error != cudaSuccess) { + printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); + exit(EXIT_FAILURE); + } - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, - deviceProp.name, deviceProp.major, deviceProp.minor); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, + deviceProp.name, + deviceProp.major, + deviceProp.minor); - int block_size = 32; + int block_size = 32; - matrix_size.uiWA = 3 * block_size * iSizeMultiple; - matrix_size.uiHA = 4 * block_size * iSizeMultiple; - matrix_size.uiWB = 2 * block_size * iSizeMultiple; - matrix_size.uiHB = 3 * block_size * iSizeMultiple; - matrix_size.uiWC = 2 * block_size * iSizeMultiple; - matrix_size.uiHC = 4 * block_size * iSizeMultiple; + matrix_size.uiWA = 3 * block_size * iSizeMultiple; + matrix_size.uiHA = 4 * block_size * iSizeMultiple; + matrix_size.uiWB = 2 * block_size * iSizeMultiple; + matrix_size.uiHB = 3 * block_size * iSizeMultiple; + matrix_size.uiWC = 2 * block_size * iSizeMultiple; + matrix_size.uiHC = 4 * block_size * iSizeMultiple; - printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", matrix_size.uiHA, - matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, - matrix_size.uiWC); + printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", + matrix_size.uiHA, + matrix_size.uiWA, + matrix_size.uiHB, + matrix_size.uiWB, + matrix_size.uiHC, + matrix_size.uiWC); - if (matrix_size.uiWA != matrix_size.uiHB || - matrix_size.uiHA != matrix_size.uiHC || - matrix_size.uiWB != matrix_size.uiWC) { - printf("ERROR: Matrix sizes do not match!\n"); - exit(-1); - } + if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC + || matrix_size.uiWB != matrix_size.uiWC) { + printf("ERROR: Matrix sizes do not match!\n"); + exit(-1); + } } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// -int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { - cudaDeviceProp deviceProp; +int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) +{ + cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - int block_size = 32; + int block_size = 32; - // set seed for rand() - srand(2006); + // set seed for rand() + srand(2006); - // allocate host memory for matrices A and B - unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; - unsigned int mem_size_A = sizeof(float) * size_A; - float *h_A = (float *)malloc(mem_size_A); - unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; - unsigned int mem_size_B = sizeof(float) * size_B; - float *h_B = (float *)malloc(mem_size_B); + // allocate host memory for matrices A and B + unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = (float *)malloc(mem_size_A); + unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = (float *)malloc(mem_size_B); - // set seed for rand() - srand(2006); + // set seed for rand() + srand(2006); - // initialize host memory - randomInit(h_A, size_A); - randomInit(h_B, size_B); + // initialize host memory + randomInit(h_A, size_A); + randomInit(h_B, size_B); - // allocate device memory - float *d_A, *d_B, *d_C; - unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; - unsigned int mem_size_C = sizeof(float) * size_C; + // allocate device memory + float *d_A, *d_B, *d_C; + unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; + unsigned int mem_size_C = sizeof(float) * size_C; - // allocate host memory for the result - float *h_C = (float *)malloc(mem_size_C); - float *h_CUBLAS = (float *)malloc(mem_size_C); + // allocate host memory for the result + float *h_C = (float *)malloc(mem_size_C); + float *h_CUBLAS = (float *)malloc(mem_size_C); - checkCudaErrors(cudaMalloc((void **)&d_A, mem_size_A)); - checkCudaErrors(cudaMalloc((void **)&d_B, mem_size_B)); - checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMalloc((void **)&d_C, mem_size_C)); + checkCudaErrors(cudaMalloc((void **)&d_A, mem_size_A)); + checkCudaErrors(cudaMalloc((void **)&d_B, mem_size_B)); + checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)&d_C, mem_size_C)); - // setup execution parameters - dim3 threads(block_size, block_size); - dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); + // setup execution parameters + dim3 threads(block_size, block_size); + dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); - // create and start timer - printf("Computing result using CUBLAS..."); + // create and start timer + printf("Computing result using CUBLAS..."); - // execute the kernel - int nIter = 30; + // execute the kernel + int nIter = 30; - // CUBLAS version 2.0 - { - const float alpha = 1.0f; - const float beta = 0.0f; - cublasHandle_t handle; - cudaEvent_t start, stop; + // CUBLAS version 2.0 + { + const float alpha = 1.0f; + const 
float beta = 0.0f; + cublasHandle_t handle; + cudaEvent_t start, stop; - checkCudaErrors(cublasCreate(&handle)); + checkCudaErrors(cublasCreate(&handle)); - // Perform warmup operation with cublas - checkCudaErrors(cublasSgemm( - handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, - matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, - &beta, d_C, matrix_size.uiWB)); + // Perform warmup operation with cublas + checkCudaErrors(cublasSgemm(handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + matrix_size.uiWB, + matrix_size.uiHA, + matrix_size.uiWA, + &alpha, + d_B, + matrix_size.uiWB, + d_A, + matrix_size.uiWA, + &beta, + d_C, + matrix_size.uiWB)); - // Allocate CUDA events that we'll use for timing - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); + // Allocate CUDA events that we'll use for timing + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); - // Record the start event - checkCudaErrors(cudaEventRecord(start, NULL)); + // Record the start event + checkCudaErrors(cudaEventRecord(start, NULL)); - for (int j = 0; j < nIter; j++) { - // note cublas is column primary! - // need to transpose the order - checkCudaErrors(cublasSgemm( - handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, - matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, - matrix_size.uiWA, &beta, d_C, matrix_size.uiWB)); + for (int j = 0; j < nIter; j++) { + // note cublas is column primary! + // need to transpose the order + checkCudaErrors(cublasSgemm(handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + matrix_size.uiWB, + matrix_size.uiHA, + matrix_size.uiWA, + &alpha, + d_B, + matrix_size.uiWB, + d_A, + matrix_size.uiWA, + &beta, + d_C, + matrix_size.uiWB)); + } + + printf("done.\n"); + + // Record the stop event + checkCudaErrors(cudaEventRecord(stop, NULL)); + + // Wait for the stop event to complete + checkCudaErrors(cudaEventSynchronize(stop)); + + float msecTotal = 0.0f; + checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); + + // Compute and print the performance + float msecPerMatrixMul = msecTotal / nIter; + double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB; + double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); + printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", + gigaFlops, + msecPerMatrixMul, + flopsPerMatrixMul); + + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost)); + + // Destroy the handle + checkCudaErrors(cublasDestroy(handle)); } + // compute reference solution + printf("Computing result using host CPU..."); + float *reference = (float *)malloc(mem_size_C); + matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); printf("done.\n"); - // Record the stop event - checkCudaErrors(cudaEventRecord(stop, NULL)); + // check result (CUBLAS) + bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); - // Wait for the stop event to complete - checkCudaErrors(cudaEventSynchronize(stop)); + if (resCUBLAS != true) { + printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); + } - float msecTotal = 0.0f; - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); + printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? 
"PASS" : "FAIL"); - // Compute and print the performance - float msecPerMatrixMul = msecTotal / nIter; - double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * - (double)matrix_size.uiWC * - (double)matrix_size.uiHB; - double gigaFlops = - (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); - printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", - gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n"); - // copy result from device to host - checkCudaErrors( - cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost)); + // clean up memory + free(h_A); + free(h_B); + free(h_C); + free(reference); + checkCudaErrors(cudaFree(d_A)); + checkCudaErrors(cudaFree(d_B)); + checkCudaErrors(cudaFree(d_C)); - // Destroy the handle - checkCudaErrors(cublasDestroy(handle)); - } - - // compute reference solution - printf("Computing result using host CPU..."); - float *reference = (float *)malloc(mem_size_C); - matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, - matrix_size.uiWB); - printf("done.\n"); - - // check result (CUBLAS) - bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); - - if (resCUBLAS != true) { - printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, - 1.0e-5f); - } - - printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", - (true == resCUBLAS) ? "PASS" : "FAIL"); - - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); - - // clean up memory - free(h_A); - free(h_B); - free(h_C); - free(reference); - checkCudaErrors(cudaFree(d_A)); - checkCudaErrors(cudaFree(d_B)); - checkCudaErrors(cudaFree(d_C)); - - if (resCUBLAS == true) { - return EXIT_SUCCESS; // return value = 1 - } else { - return EXIT_FAILURE; // return value = 0 - } + if (resCUBLAS == true) { + return EXIT_SUCCESS; // return value = 1 + } + else { + return EXIT_FAILURE; // return value = 0 + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("[Matrix Multiply CUBLAS] - Starting...\n"); +int main(int argc, char **argv) +{ + printf("[Matrix Multiply CUBLAS] - Starting...\n"); - int devID = 0, sizeMult = 5; - sMatrixSize matrix_size; + int devID = 0, sizeMult = 5; + sMatrixSize matrix_size; - initializeCUDA(argc, argv, devID, sizeMult, matrix_size); + initializeCUDA(argc, argv, devID, sizeMult, matrix_size); - int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); + int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); - return matrix_result; + return matrix_result; } diff --git a/Samples/4_CUDA_Libraries/nvJPEG/nvJPEG.cpp b/Samples/4_CUDA_Libraries/nvJPEG/nvJPEG.cpp index 913fae64..ab3644bd 100644 --- a/Samples/4_CUDA_Libraries/nvJPEG/nvJPEG.cpp +++ b/Samples/4_CUDA_Libraries/nvJPEG/nvJPEG.cpp @@ -30,584 +30,611 @@ // images can be decoded using the API for batch mode #include + #include "helper_nvJPEG.hxx" int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } int dev_free(void *p) { return (int)cudaFree(p); } -int host_malloc(void** p, size_t s, unsigned int f) { return (int)cudaHostAlloc(p, s, f); } +int host_malloc(void **p, size_t s, unsigned int f) { return (int)cudaHostAlloc(p, s, f); } -int 
host_free(void* p) { return (int)cudaFreeHost(p); } +int host_free(void *p) { return (int)cudaFreeHost(p); } -typedef std::vector<std::string> FileNames; -typedef std::vector<std::vector<char> > FileData; +typedef std::vector<std::string> FileNames; +typedef std::vector<std::vector<char>> FileData; -struct decode_params_t { - std::string input_dir; - int batch_size; - int total_images; - int dev; - int warmup; +struct decode_params_t +{ + std::string input_dir; + int batch_size; + int total_images; + int dev; + int warmup; - nvjpegJpegState_t nvjpeg_state; - nvjpegHandle_t nvjpeg_handle; - cudaStream_t stream; + nvjpegJpegState_t nvjpeg_state; + nvjpegHandle_t nvjpeg_handle; + cudaStream_t stream; - // used with decoupled API - nvjpegJpegState_t nvjpeg_decoupled_state; - nvjpegBufferPinned_t pinned_buffers[2]; // 2 buffers for pipelining - nvjpegBufferDevice_t device_buffer; - nvjpegJpegStream_t jpeg_streams[2]; // 2 streams for pipelining - nvjpegDecodeParams_t nvjpeg_decode_params; - nvjpegJpegDecoder_t nvjpeg_decoder; + // used with decoupled API + nvjpegJpegState_t nvjpeg_decoupled_state; + nvjpegBufferPinned_t pinned_buffers[2]; // 2 buffers for pipelining + nvjpegBufferDevice_t device_buffer; + nvjpegJpegStream_t jpeg_streams[2]; // 2 streams for pipelining + nvjpegDecodeParams_t nvjpeg_decode_params; + nvjpegJpegDecoder_t nvjpeg_decoder; - nvjpegOutputFormat_t fmt; - bool write_decoded; - std::string output_dir; + nvjpegOutputFormat_t fmt; + bool write_decoded; + std::string output_dir; - bool pipelined; - bool batched; + bool pipelined; + bool batched; }; -int read_next_batch(FileNames &image_names, int batch_size, - FileNames::iterator &cur_iter, FileData &raw_data, - std::vector<size_t> &raw_len, FileNames &current_names) { - int counter = 0; +int read_next_batch(FileNames &image_names, + int batch_size, + FileNames::iterator &cur_iter, + FileData &raw_data, + std::vector<size_t> &raw_len, + FileNames &current_names) +{ + int counter = 0; - while (counter < batch_size) { - if (cur_iter == image_names.end()) { - std::cerr << "Image list is too short to fill the batch, adding files " - "from the beginning of the image list" - << std::endl; - cur_iter = image_names.begin(); + while (counter < batch_size) { + if (cur_iter == image_names.end()) { + std::cerr << "Image list is too short to fill the batch, adding files " "from the beginning of the image list" + << std::endl; + cur_iter = image_names.begin(); + } + + if (image_names.size() == 0) { + std::cerr << "No valid images left in the input list, exit" << std::endl; + return EXIT_FAILURE; + } + + // Read an image from disk. + std::ifstream input(cur_iter->c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if (!(input.is_open())) { + std::cerr << "Cannot open image: " << *cur_iter << ", removing it from image list" << std::endl; + image_names.erase(cur_iter); + continue; + } + + // Get the size + std::streamsize file_size = input.tellg(); + input.seekg(0, std::ios::beg); + // resize if buffer is too small + if (raw_data[counter].size() < file_size) { + raw_data[counter].resize(file_size); + } + if (!input.read(raw_data[counter].data(), file_size)) { + std::cerr << "Cannot read from file: " << *cur_iter << ", removing it from image list" << std::endl; + image_names.erase(cur_iter); + continue; + } + raw_len[counter] = file_size; + + current_names[counter] = *cur_iter; + + counter++; + cur_iter++; } - - if (image_names.size() == 0) { - std::cerr << "No valid images left in the input list, exit" << std::endl; - return EXIT_FAILURE; - } - - // Read an image from disk. 
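// A minimal standalone sketch (not part of this patch) of the read pattern used by
// read_next_batch() above: opening with std::ios::ate positions the stream at the end,
// so tellg() yields the file size; the buffer is only ever grown, never shrunk, so it
// can be reused across batches. The read_file() helper name is hypothetical.
//
//     #include <fstream>
//     #include <string>
//     #include <vector>
//
//     static bool read_file(const std::string &name, std::vector<char> &buf, std::streamsize &len)
//     {
//         std::ifstream in(name.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
//         if (!in.is_open())
//             return false;
//         len = in.tellg();           // opened at end: current position == file size
//         in.seekg(0, std::ios::beg); // rewind before the actual read
//         if (buf.size() < static_cast<size_t>(len))
//             buf.resize(len); // grow only, so repeated batches avoid reallocations
//         return static_cast<bool>(in.read(buf.data(), len));
//     }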
- std::ifstream input(cur_iter->c_str(), - std::ios::in | std::ios::binary | std::ios::ate); - if (!(input.is_open())) { - std::cerr << "Cannot open image: " << *cur_iter - << ", removing it from image list" << std::endl; - image_names.erase(cur_iter); - continue; - } - - // Get the size - std::streamsize file_size = input.tellg(); - input.seekg(0, std::ios::beg); - // resize if buffer is too small - if (raw_data[counter].size() < file_size) { - raw_data[counter].resize(file_size); - } - if (!input.read(raw_data[counter].data(), file_size)) { - std::cerr << "Cannot read from file: " << *cur_iter - << ", removing it from image list" << std::endl; - image_names.erase(cur_iter); - continue; - } - raw_len[counter] = file_size; - - current_names[counter] = *cur_iter; - - counter++; - cur_iter++; - } - return EXIT_SUCCESS; + return EXIT_SUCCESS; } // prepare buffers for RGBi output format -int prepare_buffers(FileData &file_data, std::vector<size_t> &file_len, - std::vector<int> &img_width, std::vector<int> &img_height, +int prepare_buffers(FileData &file_data, + std::vector<size_t> &file_len, + std::vector<int> &img_width, + std::vector<int> &img_height, std::vector<nvjpegImage_t> &ibuf, - std::vector<nvjpegImage_t> &isz, FileNames &current_names, - decode_params_t &params) { - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - int channels; - nvjpegChromaSubsampling_t subsampling; + std::vector<nvjpegImage_t> &isz, + FileNames &current_names, + decode_params_t &params) +{ + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + int channels; + nvjpegChromaSubsampling_t subsampling; - for (int i = 0; i < file_data.size(); i++) { - checkCudaErrors(nvjpegGetImageInfo( - params.nvjpeg_handle, (unsigned char *)file_data[i].data(), file_len[i], - &channels, &subsampling, widths, heights)); + for (int i = 0; i < file_data.size(); i++) { + checkCudaErrors(nvjpegGetImageInfo(params.nvjpeg_handle, + (unsigned char *)file_data[i].data(), + file_len[i], + &channels, + &subsampling, + widths, + heights)); - img_width[i] = widths[0]; - img_height[i] = heights[0]; + img_width[i] = widths[0]; + img_height[i] = heights[0]; - std::cout << "Processing: " << current_names[i] << std::endl; - std::cout << "Image is " << channels << " channels." 
<< std::endl; - for (int c = 0; c < channels; c++) { - std::cout << "Channel #" << c << " size: " << widths[c] << " x " - << heights[c] << std::endl; - } - - switch (subsampling) { - case NVJPEG_CSS_444: - std::cout << "YUV 4:4:4 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_440: - std::cout << "YUV 4:4:0 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_422: - std::cout << "YUV 4:2:2 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_420: - std::cout << "YUV 4:2:0 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_411: - std::cout << "YUV 4:1:1 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_410: - std::cout << "YUV 4:1:0 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_GRAY: - std::cout << "Grayscale JPEG " << std::endl; - break; - case NVJPEG_CSS_UNKNOWN: - std::cout << "Unknown chroma subsampling" << std::endl; - return EXIT_FAILURE; - } - - int mul = 1; - // in the case of interleaved RGB output, write only to single channel, but - // 3 samples at once - if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) { - channels = 1; - mul = 3; - } - // in the case of rgb create 3 buffers with sizes of original image - else if (params.fmt == NVJPEG_OUTPUT_RGB || - params.fmt == NVJPEG_OUTPUT_BGR) { - channels = 3; - widths[1] = widths[2] = widths[0]; - heights[1] = heights[2] = heights[0]; - } - - // realloc output buffer if required - for (int c = 0; c < channels; c++) { - int aw = mul * widths[c]; - int ah = heights[c]; - int sz = aw * ah; - ibuf[i].pitch[c] = aw; - if (sz > isz[i].pitch[c]) { - if (ibuf[i].channel[c]) { - checkCudaErrors(cudaFree(ibuf[i].channel[c])); + std::cout << "Processing: " << current_names[i] << std::endl; + std::cout << "Image is " << channels << " channels." 
<< std::endl; + for (int c = 0; c < channels; c++) { + std::cout << "Channel #" << c << " size: " << widths[c] << " x " << heights[c] << std::endl; + } + + switch (subsampling) { + case NVJPEG_CSS_444: + std::cout << "YUV 4:4:4 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_440: + std::cout << "YUV 4:4:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_422: + std::cout << "YUV 4:2:2 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_420: + std::cout << "YUV 4:2:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_411: + std::cout << "YUV 4:1:1 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_410: + std::cout << "YUV 4:1:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_GRAY: + std::cout << "Grayscale JPEG " << std::endl; + break; + case NVJPEG_CSS_UNKNOWN: + std::cout << "Unknown chroma subsampling" << std::endl; + return EXIT_FAILURE; + } + + int mul = 1; + // in the case of interleaved RGB output, write only to single channel, but + // 3 samples at once + if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) { + channels = 1; + mul = 3; + } + // in the case of rgb create 3 buffers with sizes of original image + else if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) { + channels = 3; + widths[1] = widths[2] = widths[0]; + heights[1] = heights[2] = heights[0]; + } + + // realloc output buffer if required + for (int c = 0; c < channels; c++) { + int aw = mul * widths[c]; + int ah = heights[c]; + int sz = aw * ah; + ibuf[i].pitch[c] = aw; + if (sz > isz[i].pitch[c]) { + if (ibuf[i].channel[c]) { + checkCudaErrors(cudaFree(ibuf[i].channel[c])); + } + checkCudaErrors(cudaMalloc(&ibuf[i].channel[c], sz)); + isz[i].pitch[c] = sz; + } } } - } - return EXIT_SUCCESS; + return EXIT_SUCCESS; } -void create_decoupled_api_handles(decode_params_t& params){ +void create_decoupled_api_handles(decode_params_t &params) +{ - checkCudaErrors(nvjpegDecoderCreate(params.nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &params.nvjpeg_decoder)); - checkCudaErrors(nvjpegDecoderStateCreate(params.nvjpeg_handle, params.nvjpeg_decoder, &params.nvjpeg_decoupled_state)); - - checkCudaErrors(nvjpegBufferPinnedCreate(params.nvjpeg_handle, NULL, &params.pinned_buffers[0])); - checkCudaErrors(nvjpegBufferPinnedCreate(params.nvjpeg_handle, NULL, &params.pinned_buffers[1])); - checkCudaErrors(nvjpegBufferDeviceCreate(params.nvjpeg_handle, NULL, &params.device_buffer)); - - checkCudaErrors(nvjpegJpegStreamCreate(params.nvjpeg_handle, &params.jpeg_streams[0])); - checkCudaErrors(nvjpegJpegStreamCreate(params.nvjpeg_handle, &params.jpeg_streams[1])); + checkCudaErrors(nvjpegDecoderCreate(params.nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &params.nvjpeg_decoder)); + checkCudaErrors( + nvjpegDecoderStateCreate(params.nvjpeg_handle, params.nvjpeg_decoder, &params.nvjpeg_decoupled_state)); - checkCudaErrors(nvjpegDecodeParamsCreate(params.nvjpeg_handle, &params.nvjpeg_decode_params)); + checkCudaErrors(nvjpegBufferPinnedCreate(params.nvjpeg_handle, NULL, &params.pinned_buffers[0])); + checkCudaErrors(nvjpegBufferPinnedCreate(params.nvjpeg_handle, NULL, &params.pinned_buffers[1])); + checkCudaErrors(nvjpegBufferDeviceCreate(params.nvjpeg_handle, NULL, &params.device_buffer)); + + checkCudaErrors(nvjpegJpegStreamCreate(params.nvjpeg_handle, &params.jpeg_streams[0])); + checkCudaErrors(nvjpegJpegStreamCreate(params.nvjpeg_handle, &params.jpeg_streams[1])); + + 
checkCudaErrors(nvjpegDecodeParamsCreate(params.nvjpeg_handle, &params.nvjpeg_decode_params)); } -void destroy_decoupled_api_handles(decode_params_t& params){ +void destroy_decoupled_api_handles(decode_params_t &params) +{ - checkCudaErrors(nvjpegDecodeParamsDestroy(params.nvjpeg_decode_params)); - checkCudaErrors(nvjpegJpegStreamDestroy(params.jpeg_streams[0])); - checkCudaErrors(nvjpegJpegStreamDestroy(params.jpeg_streams[1])); - checkCudaErrors(nvjpegBufferPinnedDestroy(params.pinned_buffers[0])); - checkCudaErrors(nvjpegBufferPinnedDestroy(params.pinned_buffers[1])); - checkCudaErrors(nvjpegBufferDeviceDestroy(params.device_buffer)); - checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_decoupled_state)); - checkCudaErrors(nvjpegDecoderDestroy(params.nvjpeg_decoder)); + checkCudaErrors(nvjpegDecodeParamsDestroy(params.nvjpeg_decode_params)); + checkCudaErrors(nvjpegJpegStreamDestroy(params.jpeg_streams[0])); + checkCudaErrors(nvjpegJpegStreamDestroy(params.jpeg_streams[1])); + checkCudaErrors(nvjpegBufferPinnedDestroy(params.pinned_buffers[0])); + checkCudaErrors(nvjpegBufferPinnedDestroy(params.pinned_buffers[1])); + checkCudaErrors(nvjpegBufferDeviceDestroy(params.device_buffer)); + checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_decoupled_state)); + checkCudaErrors(nvjpegDecoderDestroy(params.nvjpeg_decoder)); } -void release_buffers(std::vector<nvjpegImage_t> &ibuf) { - for (int i = 0; i < ibuf.size(); i++) { - for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) - if (ibuf[i].channel[c]) checkCudaErrors(cudaFree(ibuf[i].channel[c])); - } -} - -int decode_images(const FileData &img_data, const std::vector<size_t> &img_len, - std::vector<nvjpegImage_t> &out, decode_params_t &params, - double &time) { - checkCudaErrors(cudaStreamSynchronize(params.stream)); - cudaEvent_t startEvent = NULL, stopEvent = NULL; - float loopTime = 0; - - checkCudaErrors(cudaEventCreate(&startEvent, cudaEventBlockingSync)); - checkCudaErrors(cudaEventCreate(&stopEvent, cudaEventBlockingSync)); - - if (!params.batched) { - if (!params.pipelined) // decode one image at a time - { - checkCudaErrors(cudaEventRecord(startEvent, params.stream)); - for (int i = 0; i < params.batch_size; i++) { - checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, params.nvjpeg_state, - (const unsigned char *)img_data[i].data(), - img_len[i], params.fmt, &out[i], - params.stream)); - } - checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); - } else { - // use de-coupled API in pipelined mode - checkCudaErrors(cudaEventRecord(startEvent, params.stream)); - checkCudaErrors(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer)); - int buffer_index = 0; - checkCudaErrors(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt)); - for (int i = 0; i < params.batch_size; i++) { - checkCudaErrors( - nvjpegJpegStreamParse(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], - 0, 0, params.jpeg_streams[buffer_index])); - - checkCudaErrors(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state, - params.pinned_buffers[buffer_index])); - - checkCudaErrors(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, - params.nvjpeg_decode_params, params.jpeg_streams[buffer_index])); - - checkCudaErrors(cudaStreamSynchronize(params.stream)); - - checkCudaErrors(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, - params.jpeg_streams[buffer_index], params.stream)); - - buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync - - checkCudaErrors(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, - &out[i], params.stream)); - - } - checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); +void release_buffers(std::vector<nvjpegImage_t> &ibuf) +{ + for (int i = 0; i < ibuf.size(); i++) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) + if (ibuf[i].channel[c]) + checkCudaErrors(cudaFree(ibuf[i].channel[c])); } - } else { - std::vector<const unsigned char *> raw_inputs; +} + +int decode_images(const FileData &img_data, + const std::vector<size_t> &img_len, + std::vector<nvjpegImage_t> &out, + decode_params_t &params, + double &time) +{ + checkCudaErrors(cudaStreamSynchronize(params.stream)); + cudaEvent_t startEvent = NULL, stopEvent = NULL; + float loopTime = 0; + + checkCudaErrors(cudaEventCreate(&startEvent, cudaEventBlockingSync)); + checkCudaErrors(cudaEventCreate(&stopEvent, cudaEventBlockingSync)); + + if (!params.batched) { + if (!params.pipelined) // decode one image at a time + { + checkCudaErrors(cudaEventRecord(startEvent, params.stream)); + for (int i = 0; i < params.batch_size; i++) { + checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, + params.nvjpeg_state, + (const unsigned char *)img_data[i].data(), + img_len[i], + params.fmt, + &out[i], + params.stream)); + } + checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); + } + else { + // use de-coupled API in pipelined mode + checkCudaErrors(cudaEventRecord(startEvent, params.stream)); + checkCudaErrors(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer)); + int buffer_index = 0; + checkCudaErrors(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt)); + for (int i = 0; i < params.batch_size; i++) { + checkCudaErrors(nvjpegJpegStreamParse(params.nvjpeg_handle, + (const unsigned char *)img_data[i].data(), + img_len[i], + 0, + 0, + params.jpeg_streams[buffer_index])); + + checkCudaErrors( + nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state, params.pinned_buffers[buffer_index])); + + checkCudaErrors(nvjpegDecodeJpegHost(params.nvjpeg_handle, + params.nvjpeg_decoder, + params.nvjpeg_decoupled_state, + params.nvjpeg_decode_params, + params.jpeg_streams[buffer_index])); + + checkCudaErrors(cudaStreamSynchronize(params.stream)); + + checkCudaErrors(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, + params.nvjpeg_decoder, + params.nvjpeg_decoupled_state, + params.jpeg_streams[buffer_index], + params.stream)); + + buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync + + checkCudaErrors(nvjpegDecodeJpegDevice(params.nvjpeg_handle, + params.nvjpeg_decoder, + params.nvjpeg_decoupled_state, + &out[i], + params.stream)); + } + checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); + } + } + else { + std::vector<const unsigned char *> raw_inputs; + for (int i = 0; i < params.batch_size; i++) { + raw_inputs.push_back((const unsigned char *)img_data[i].data()); + } + + checkCudaErrors(cudaEventRecord(startEvent, params.stream)); + checkCudaErrors(nvjpegDecodeBatched( + params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(), img_len.data(), out.data(), params.stream)); + checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); + } + checkCudaErrors(cudaEventSynchronize(stopEvent)); + checkCudaErrors(cudaEventElapsedTime(&loopTime, startEvent, stopEvent)); + time = static_cast<double>(loopTime); + + return EXIT_SUCCESS; +} + +void write_images(std::vector<nvjpegImage_t> &iout, + std::vector<int> &widths, + std::vector<int> &heights, + decode_params_t &params, + 
FileNames &filenames) +{ for (int i = 0; i < params.batch_size; i++) { - raw_inputs.push_back((const unsigned char *)img_data[i].data()); + // Get the file name, without extension. + // This will be used to rename the output file. + size_t position = filenames[i].rfind("/"); + std::string sFileName = + (std::string::npos == position) ? filenames[i] : filenames[i].substr(position + 1, filenames[i].size()); + position = sFileName.rfind("."); + sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(0, position); + std::string fname(params.output_dir + "/" + sFileName + ".bmp"); + + int err; + if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) { + err = writeBMP(fname.c_str(), + iout[i].channel[0], + iout[i].pitch[0], + iout[i].channel[1], + iout[i].pitch[1], + iout[i].channel[2], + iout[i].pitch[2], + widths[i], + heights[i]); + } + else if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) { + // Write BMP from interleaved data + err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0], widths[i], heights[i]); + } + if (err) { + std::cout << "Cannot write output file: " << fname << std::endl; + return; + } + std::cout << "Done writing decoded image to file: " << fname << std::endl; } - - checkCudaErrors(cudaEventRecord(startEvent, params.stream)); - checkCudaErrors(nvjpegDecodeBatched( - params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(), - img_len.data(), out.data(), params.stream)); - checkCudaErrors(cudaEventRecord(stopEvent, params.stream)); - - } - checkCudaErrors(cudaEventSynchronize(stopEvent)); - checkCudaErrors(cudaEventElapsedTime(&loopTime, startEvent, stopEvent)); - time = static_cast<double>(loopTime); - - return EXIT_SUCCESS; -} -void write_images(std::vector<nvjpegImage_t> &iout, std::vector<int> &widths, - std::vector<int> &heights, decode_params_t &params, - FileNames &filenames) { - for (int i = 0; i < params.batch_size; i++) { - // Get the file name, without extension. - // This will be used to rename the output file. - size_t position = filenames[i].rfind("/"); - std::string sFileName = - (std::string::npos == position) - ? filenames[i] - : filenames[i].substr(position + 1, filenames[i].size()); - position = sFileName.rfind("."); - sFileName = (std::string::npos == position) ? 
sFileName - : sFileName.substr(0, position); - std::string fname(params.output_dir + "/" + sFileName + ".bmp"); +double process_images(FileNames &image_names, decode_params_t &params, double &total) +{ + // vector for storing raw files and file lengths + FileData file_data(params.batch_size); + std::vector<size_t> file_len(params.batch_size); + FileNames current_names(params.batch_size); + std::vector<int> widths(params.batch_size); + std::vector<int> heights(params.batch_size); + // we wrap over image files to process total_images of files + FileNames::iterator file_iter = image_names.begin(); - int err; - if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) { - err = writeBMP(fname.c_str(), iout[i].channel[0], iout[i].pitch[0], - iout[i].channel[1], iout[i].pitch[1], iout[i].channel[2], - iout[i].pitch[2], widths[i], heights[i]); - } else if (params.fmt == NVJPEG_OUTPUT_RGBI || - params.fmt == NVJPEG_OUTPUT_BGRI) { - // Write BMP from interleaved data - err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0], - widths[i], heights[i]); - } - if (err) { - std::cout << "Cannot write output file: " << fname << std::endl; - return; - } - std::cout << "Done writing decoded image to file: " << fname << std::endl; - } -} + // stream for decoding + checkCudaErrors(cudaStreamCreateWithFlags(&params.stream, cudaStreamNonBlocking)); -double process_images(FileNames &image_names, decode_params_t &params, - double &total) { - // vector for storing raw files and file lengths - FileData file_data(params.batch_size); - std::vector<size_t> file_len(params.batch_size); - FileNames current_names(params.batch_size); - std::vector<int> widths(params.batch_size); - std::vector<int> heights(params.batch_size); - // we wrap over image files to process total_images of files - FileNames::iterator file_iter = image_names.begin(); + int total_processed = 0; - // stream for decoding - checkCudaErrors( - cudaStreamCreateWithFlags(&params.stream, cudaStreamNonBlocking)); + // output buffers + std::vector<nvjpegImage_t> iout(params.batch_size); + // output buffer sizes, for convenience + std::vector<nvjpegImage_t> isz(params.batch_size); - int total_processed = 0; - - // output buffers - std::vector<nvjpegImage_t> iout(params.batch_size); - // output buffer sizes, for convenience - std::vector<nvjpegImage_t> isz(params.batch_size); - - for (int i = 0; i < iout.size(); i++) { - for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { - iout[i].channel[c] = NULL; - iout[i].pitch[c] = 0; - isz[i].pitch[c] = 0; - } - } - - double test_time = 0; - int warmup = 0; - while (total_processed < params.total_images) { - if (read_next_batch(image_names, params.batch_size, file_iter, file_data, - file_len, current_names)) - return EXIT_FAILURE; - - if (prepare_buffers(file_data, file_len, widths, heights, iout, isz, - current_names, params)) - return EXIT_FAILURE; - - double time; - if (decode_images(file_data, file_len, iout, params, time)) - return EXIT_FAILURE; - if (warmup < params.warmup) { - warmup++; - } else { - total_processed += params.batch_size; - test_time += time; + for (int i = 0; i < iout.size(); i++) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + iout[i].channel[c] = NULL; + iout[i].pitch[c] = 0; + isz[i].pitch[c] = 0; + } } - if (params.write_decoded) - write_images(iout, widths, heights, params, current_names); - } - total = test_time; + double test_time = 0; + int warmup = 0; + while (total_processed < params.total_images) { + if (read_next_batch(image_names, params.batch_size, file_iter, file_data, file_len, current_names)) + return EXIT_FAILURE; - release_buffers(iout); + if 
(prepare_buffers(file_data, file_len, widths, heights, iout, isz, current_names, params)) + return EXIT_FAILURE; - checkCudaErrors(cudaStreamDestroy(params.stream)); + double time; + if (decode_images(file_data, file_len, iout, params, time)) + return EXIT_FAILURE; + if (warmup < params.warmup) { + warmup++; + } + else { + total_processed += params.batch_size; + test_time += time; + } - return EXIT_SUCCESS; + if (params.write_decoded) + write_images(iout, widths, heights, params, current_names); + } + total = test_time; + + release_buffers(iout); + + checkCudaErrors(cudaStreamDestroy(params.stream)); + + return EXIT_SUCCESS; } // parse parameters -int findParamIndex(const char **argv, int argc, const char *parm) { - int count = 0; - int index = -1; +int findParamIndex(const char **argv, int argc, const char *parm) +{ + int count = 0; + int index = -1; - for (int i = 0; i < argc; i++) { - if (strncmp(argv[i], parm, 100) == 0) { - index = i; - count++; + for (int i = 0; i < argc; i++) { + if (strncmp(argv[i], parm, 100) == 0) { + index = i; + count++; + } } - } - if (count == 0 || count == 1) { - return index; - } else { - std::cout << "Error, parameter " << parm - << " has been specified more than once, exiting\n" - << std::endl; - return -1; - } - - return -1; -} - -int main(int argc, const char *argv[]) { - int pidx; - - if ((pidx = findParamIndex(argv, argc, "-h")) != -1 || - (pidx = findParamIndex(argv, argc, "--help")) != -1) { - std::cout << "Usage: " << argv[0] - << " -i images_dir [-b batch_size] [-t total_images] [-device= " - "device_id] [-w warmup_iterations] [-o output_dir] " - "[-pipelined] [-batched] [-fmt output_format]\n"; - std::cout << "Parameters: " << std::endl; - std::cout << "\timages_dir\t:\tPath to single image or directory of images" - << std::endl; - std::cout << "\tbatch_size\t:\tDecode images from input by batches of " - "specified size" - << std::endl; - std::cout << "\ttotal_images\t:\tDecode this much images, if there are " - "less images \n" - << "\t\t\t\t\tin the input than total images, decoder will loop " - "over the input" - << std::endl; - std::cout << "\tdevice_id\t:\tWhich device to use for decoding" - << std::endl; - std::cout << "\twarmup_iterations\t:\tRun this amount of batches first " - "without measuring performance" - << std::endl; - std::cout - << "\toutput_dir\t:\tWrite decoded images as BMPs to this directory" - << std::endl; - std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl; - std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl; - std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. One " - "of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]" - << std::endl; - return EXIT_SUCCESS; - } - - decode_params_t params; - - params.input_dir = "./"; - if ((pidx = findParamIndex(argv, argc, "-i")) != -1) { - params.input_dir = argv[pidx + 1]; - } else { - // Search in default paths for input images. 
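// A usage sketch (not part of this patch) for the findParamIndex() helper above:
// it returns the flag's index in argv, or -1 when the flag is absent or was given
// more than once; a flag that takes a value is read from argv[index + 1]. Like the
// sample itself, this assumes the value is actually present after the flag.
//
//     int pidx = findParamIndex(argv, argc, "-b");
//     int batch_size = (pidx != -1) ? std::atoi(argv[pidx + 1]) : 1; // default: batches of 1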
- int found = getInputDir(params.input_dir, argv[0]); - if (!found) - { - std::cout << "Please specify input directory with encoded images"<< std::endl; - return EXIT_WAIVED; + if (count == 0 || count == 1) { + return index; } - } - - params.batch_size = 1; - if ((pidx = findParamIndex(argv, argc, "-b")) != -1) { - params.batch_size = std::atoi(argv[pidx + 1]); - } - - params.total_images = -1; - if ((pidx = findParamIndex(argv, argc, "-t")) != -1) { - params.total_images = std::atoi(argv[pidx + 1]); - } - - params.dev = 0; - params.dev = findCudaDevice(argc, argv); - - params.warmup = 0; - if ((pidx = findParamIndex(argv, argc, "-w")) != -1) { - params.warmup = std::atoi(argv[pidx + 1]); - } - - params.batched = false; - if ((pidx = findParamIndex(argv, argc, "-batched")) != -1) { - params.batched = true; - } - - params.pipelined = false; - if ((pidx = findParamIndex(argv, argc, "-pipelined")) != -1) { - params.pipelined = true; - } - - params.fmt = NVJPEG_OUTPUT_RGB; - if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) { - std::string sfmt = argv[pidx + 1]; - if (sfmt == "rgb") - params.fmt = NVJPEG_OUTPUT_RGB; - else if (sfmt == "bgr") - params.fmt = NVJPEG_OUTPUT_BGR; - else if (sfmt == "rgbi") - params.fmt = NVJPEG_OUTPUT_RGBI; - else if (sfmt == "bgri") - params.fmt = NVJPEG_OUTPUT_BGRI; - else if (sfmt == "yuv") - params.fmt = NVJPEG_OUTPUT_YUV; - else if (sfmt == "y") - params.fmt = NVJPEG_OUTPUT_Y; - else if (sfmt == "unchanged") - params.fmt = NVJPEG_OUTPUT_UNCHANGED; else { - std::cout << "Unknown format: " << sfmt << std::endl; - return EXIT_FAILURE; + std::cout << "Error, parameter " << parm << " has been specified more than once, exiting\n" << std::endl; + return -1; } - } - params.write_decoded = false; - if ((pidx = findParamIndex(argv, argc, "-o")) != -1) { - params.output_dir = argv[pidx + 1]; - if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR && - params.fmt != NVJPEG_OUTPUT_RGBI && params.fmt != NVJPEG_OUTPUT_BGRI) { - std::cout << "We can write ony BMPs, which require output format be " - "either RGB/BGR or RGBi/BGRi" - << std::endl; - return EXIT_FAILURE; - } - params.write_decoded = true; - } - - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, params.dev)); - - printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n", - params.dev, props.name, props.multiProcessorCount, - props.maxThreadsPerMultiProcessor, props.major, props.minor, - props.ECCEnabled ? 
"on" : "off"); - - nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; - nvjpegPinnedAllocator_t pinned_allocator ={&host_malloc, &host_free}; - int flags = 0; - checkCudaErrors(nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &dev_allocator, - &pinned_allocator,flags, ¶ms.nvjpeg_handle)); - - checkCudaErrors( - nvjpegJpegStateCreate(params.nvjpeg_handle, ¶ms.nvjpeg_state)); - checkCudaErrors( - nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state, - params.batch_size, 1, params.fmt)); - - if(params.pipelined ){ - create_decoupled_api_handles(params); - } - // read source images - FileNames image_names; - readInput(params.input_dir, image_names); - - if (params.total_images == -1) { - params.total_images = image_names.size(); - } else if (params.total_images % params.batch_size) { - params.total_images = - ((params.total_images) / params.batch_size) * params.batch_size; - std::cout << "Changing total_images number to " << params.total_images - << " to be multiple of batch_size - " << params.batch_size - << std::endl; - } - - std::cout << "Decoding images in directory: " << params.input_dir - << ", total " << params.total_images << ", batchsize " - << params.batch_size << std::endl; - - double total; - if (process_images(image_names, params, total)) return EXIT_FAILURE; - std::cout << "Total decoding time: " << total << std::endl; - std::cout << "Avg decoding time per image: " << total / params.total_images - << std::endl; - std::cout << "Avg images per sec: " << params.total_images / total - << std::endl; - std::cout << "Avg decoding time per batch: " - << total / ((params.total_images + params.batch_size - 1) / - params.batch_size) - << std::endl; - - if(params.pipelined ){ - destroy_decoupled_api_handles(params); - } - - checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state)); - checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle)); - - return EXIT_SUCCESS; + return -1; +} + +int main(int argc, const char *argv[]) +{ + int pidx; + + if ((pidx = findParamIndex(argv, argc, "-h")) != -1 || (pidx = findParamIndex(argv, argc, "--help")) != -1) { + std::cout << "Usage: " << argv[0] + << " -i images_dir [-b batch_size] [-t total_images] [-device= " + "device_id] [-w warmup_iterations] [-o output_dir] " + "[-pipelined] [-batched] [-fmt output_format]\n"; + std::cout << "Parameters: " << std::endl; + std::cout << "\timages_dir\t:\tPath to single image or directory of images" << std::endl; + std::cout << "\tbatch_size\t:\tDecode images from input by batches of " + "specified size" + << std::endl; + std::cout << "\ttotal_images\t:\tDecode this much images, if there are " + "less images \n" + << "\t\t\t\t\tin the input than total images, decoder will loop " + "over the input" + << std::endl; + std::cout << "\tdevice_id\t:\tWhich device to use for decoding" << std::endl; + std::cout << "\twarmup_iterations\t:\tRun this amount of batches first " + "without measuring performance" + << std::endl; + std::cout << "\toutput_dir\t:\tWrite decoded images as BMPs to this directory" << std::endl; + std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl; + std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl; + std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. 
One " + "of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]" + << std::endl; + return EXIT_SUCCESS; + } + + decode_params_t params; + + params.input_dir = "./"; + if ((pidx = findParamIndex(argv, argc, "-i")) != -1) { + params.input_dir = argv[pidx + 1]; + } + else { + // Search in default paths for input images. + int found = getInputDir(params.input_dir, argv[0]); + if (!found) { + std::cout << "Please specify input directory with encoded images" << std::endl; + return EXIT_WAIVED; + } + } + + params.batch_size = 1; + if ((pidx = findParamIndex(argv, argc, "-b")) != -1) { + params.batch_size = std::atoi(argv[pidx + 1]); + } + + params.total_images = -1; + if ((pidx = findParamIndex(argv, argc, "-t")) != -1) { + params.total_images = std::atoi(argv[pidx + 1]); + } + + params.dev = 0; + params.dev = findCudaDevice(argc, argv); + + params.warmup = 0; + if ((pidx = findParamIndex(argv, argc, "-w")) != -1) { + params.warmup = std::atoi(argv[pidx + 1]); + } + + params.batched = false; + if ((pidx = findParamIndex(argv, argc, "-batched")) != -1) { + params.batched = true; + } + + params.pipelined = false; + if ((pidx = findParamIndex(argv, argc, "-pipelined")) != -1) { + params.pipelined = true; + } + + params.fmt = NVJPEG_OUTPUT_RGB; + if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) { + std::string sfmt = argv[pidx + 1]; + if (sfmt == "rgb") + params.fmt = NVJPEG_OUTPUT_RGB; + else if (sfmt == "bgr") + params.fmt = NVJPEG_OUTPUT_BGR; + else if (sfmt == "rgbi") + params.fmt = NVJPEG_OUTPUT_RGBI; + else if (sfmt == "bgri") + params.fmt = NVJPEG_OUTPUT_BGRI; + else if (sfmt == "yuv") + params.fmt = NVJPEG_OUTPUT_YUV; + else if (sfmt == "y") + params.fmt = NVJPEG_OUTPUT_Y; + else if (sfmt == "unchanged") + params.fmt = NVJPEG_OUTPUT_UNCHANGED; + else { + std::cout << "Unknown format: " << sfmt << std::endl; + return EXIT_FAILURE; + } + } + + params.write_decoded = false; + if ((pidx = findParamIndex(argv, argc, "-o")) != -1) { + params.output_dir = argv[pidx + 1]; + if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR && params.fmt != NVJPEG_OUTPUT_RGBI + && params.fmt != NVJPEG_OUTPUT_BGRI) { + std::cout << "We can write ony BMPs, which require output format be " + "either RGB/BGR or RGBi/BGRi" + << std::endl; + return EXIT_FAILURE; + } + params.write_decoded = true; + } + + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, params.dev)); + + printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n", + params.dev, + props.name, + props.multiProcessorCount, + props.maxThreadsPerMultiProcessor, + props.major, + props.minor, + props.ECCEnabled ? 
"on" : "off"); + + nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; + nvjpegPinnedAllocator_t pinned_allocator = {&host_malloc, &host_free}; + int flags = 0; + checkCudaErrors( + nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &dev_allocator, &pinned_allocator, flags, ¶ms.nvjpeg_handle)); + + checkCudaErrors(nvjpegJpegStateCreate(params.nvjpeg_handle, ¶ms.nvjpeg_state)); + checkCudaErrors( + nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state, params.batch_size, 1, params.fmt)); + + if (params.pipelined) { + create_decoupled_api_handles(params); + } + // read source images + FileNames image_names; + readInput(params.input_dir, image_names); + + if (params.total_images == -1) { + params.total_images = image_names.size(); + } + else if (params.total_images % params.batch_size) { + params.total_images = ((params.total_images) / params.batch_size) * params.batch_size; + std::cout << "Changing total_images number to " << params.total_images << " to be multiple of batch_size - " + << params.batch_size << std::endl; + } + + std::cout << "Decoding images in directory: " << params.input_dir << ", total " << params.total_images + << ", batchsize " << params.batch_size << std::endl; + + double total; + if (process_images(image_names, params, total)) + return EXIT_FAILURE; + std::cout << "Total decoding time: " << total << std::endl; + std::cout << "Avg decoding time per image: " << total / params.total_images << std::endl; + std::cout << "Avg images per sec: " << params.total_images / total << std::endl; + std::cout << "Avg decoding time per batch: " + << total / ((params.total_images + params.batch_size - 1) / params.batch_size) << std::endl; + + if (params.pipelined) { + destroy_decoupled_api_handles(params); + } + + checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state)); + checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle)); + + return EXIT_SUCCESS; } diff --git a/Samples/4_CUDA_Libraries/nvJPEG_encoder/nvJPEG_encoder.cpp b/Samples/4_CUDA_Libraries/nvJPEG_encoder/nvJPEG_encoder.cpp index 796f1a6d..0bf7aaa6 100644 --- a/Samples/4_CUDA_Libraries/nvJPEG_encoder/nvJPEG_encoder.cpp +++ b/Samples/4_CUDA_Libraries/nvJPEG_encoder/nvJPEG_encoder.cpp @@ -29,6 +29,7 @@ // library nvJPEG encoder supports single and multiple image encode. 
#include + #include "helper_nvJPEG.hxx" @@ -43,133 +44,127 @@ bool is_interleaved(nvjpegOutputFormat_t format) return false; } -struct encode_params_t { - std::string input_dir; - std::string output_dir; - std::string format; - std::string subsampling; - int quality; - int huf; - int dev; +struct encode_params_t +{ + std::string input_dir; + std::string output_dir; + std::string format; + std::string subsampling; + int quality; + int huf; + int dev; }; nvjpegEncoderParams_t encode_params; -nvjpegHandle_t nvjpeg_handle; -nvjpegJpegState_t jpeg_state; -nvjpegEncoderState_t encoder_state; +nvjpegHandle_t nvjpeg_handle; +nvjpegJpegState_t jpeg_state; +nvjpegEncoderState_t encoder_state; -int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double &time, nvjpegOutputFormat_t output_format, nvjpegInputFormat_t input_format) +int decodeEncodeOneImage(std::string sImagePath, + std::string sOutputPath, + double &time, + nvjpegOutputFormat_t output_format, + nvjpegInputFormat_t input_format) { - time = 0.; + time = 0.; cudaEvent_t startEvent = NULL, stopEvent = NULL; - float loopTime = 0; + float loopTime = 0; checkCudaErrors(cudaEventCreate(&startEvent, cudaEventBlockingSync)); checkCudaErrors(cudaEventCreate(&stopEvent, cudaEventBlockingSync)); // Get the file name, without extension. - // This will be used to rename the output file. - size_t position = sImagePath.rfind("/"); - std::string sFileName = (std::string::npos == position)? sImagePath : sImagePath.substr(position + 1, sImagePath.size()); - position = sFileName.rfind("."); - sFileName = (std::string::npos == position)? sFileName : sFileName.substr(0, position); - position = sFileName.rfind("/"); + // This will be used to rename the output file. + size_t position = sImagePath.rfind("/"); + std::string sFileName = + (std::string::npos == position) ? sImagePath : sImagePath.substr(position + 1, sImagePath.size()); + position = sFileName.rfind("."); + sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(0, position); + position = sFileName.rfind("/"); + sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position + 1, sFileName.length()); + position = sFileName.rfind("\\"); sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position + 1, sFileName.length()); - position = sFileName.rfind("\\"); - sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position+1, sFileName.length()); // Read an image from disk. std::ifstream oInputStream(sImagePath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(!(oInputStream.is_open())) - { + if (!(oInputStream.is_open())) { std::cerr << "Cannot open image: " << sImagePath << std::endl; return 1; } - + // Get the size. std::streamsize nSize = oInputStream.tellg(); oInputStream.seekg(0, std::ios::beg); - // Image buffers. - unsigned char * pBuffer = NULL; - double encoder_time = 0.; - + // Image buffers. + unsigned char *pBuffer = NULL; + double encoder_time = 0.; + std::vector<char> vBuffer(nSize); - - if (oInputStream.read(vBuffer.data(), nSize)) - { - unsigned char * dpImage = (unsigned char *)vBuffer.data(); - + + if (oInputStream.read(vBuffer.data(), nSize)) { + unsigned char *dpImage = (unsigned char *)vBuffer.data(); + + // Retrieve the component and size info. 
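// A minimal sketch (not part of this patch) of the header probe that follows:
// nvjpegGetImageInfo() parses only the JPEG header, reporting channel count, chroma
// subsampling, and per-channel dimensions without decoding any pixels. Here
// `jpeg_data` and `jpeg_size` are assumed to hold the raw bytes of a JPEG file.
//
//     int nComp = 0;
//     nvjpegChromaSubsampling_t css;
//     int w[NVJPEG_MAX_COMPONENT], h[NVJPEG_MAX_COMPONENT];
//     if (NVJPEG_STATUS_SUCCESS
//         != nvjpegGetImageInfo(nvjpeg_handle, jpeg_data, jpeg_size, &nComp, &css, w, h)) {
//         std::cerr << "Not a decodable JPEG stream" << std::endl;
//     }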
- int nComponent = 0; + int nComponent = 0; nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - if (NVJPEG_STATUS_SUCCESS != nvjpegGetImageInfo(nvjpeg_handle, dpImage, nSize, &nComponent, &subsampling, widths, heights)) - { + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + if (NVJPEG_STATUS_SUCCESS + != nvjpegGetImageInfo(nvjpeg_handle, dpImage, nSize, &nComponent, &subsampling, widths, heights)) { std::cerr << "Error decoding JPEG header: " << sImagePath << std::endl; return 1; } // image information std::cout << "Image is " << nComponent << " channels." << std::endl; - for (int i = 0; i < nComponent; i++) - { - std::cout << "Channel #" << i << " size: " << widths[i] << " x " << heights[i] << std::endl; + for (int i = 0; i < nComponent; i++) { + std::cout << "Channel #" << i << " size: " << widths[i] << " x " << heights[i] << std::endl; } - - switch (subsampling) - { - case NVJPEG_CSS_444: - std::cout << "YUV 4:4:4 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_440: - std::cout << "YUV 4:4:0 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_422: - std::cout << "YUV 4:2:2 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_420: - std::cout << "YUV 4:2:0 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_411: - std::cout << "YUV 4:1:1 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_410: - std::cout << "YUV 4:1:0 chroma subsampling" << std::endl; - break; - case NVJPEG_CSS_GRAY: - std::cout << "Grayscale JPEG " << std::endl; - break; - case NVJPEG_CSS_UNKNOWN: - std::cout << "Unknown chroma subsampling" << std::endl; - return 1; + + switch (subsampling) { + case NVJPEG_CSS_444: + std::cout << "YUV 4:4:4 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_440: + std::cout << "YUV 4:4:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_422: + std::cout << "YUV 4:2:2 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_420: + std::cout << "YUV 4:2:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_411: + std::cout << "YUV 4:1:1 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_410: + std::cout << "YUV 4:1:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_GRAY: + std::cout << "Grayscale JPEG " << std::endl; + break; + case NVJPEG_CSS_UNKNOWN: + std::cout << "Unknown chroma subsampling" << std::endl; + return 1; } { cudaError_t eCopy = cudaMalloc(&pBuffer, widths[0] * heights[0] * NVJPEG_MAX_COMPONENT); - if(cudaSuccess != eCopy) - { + if (cudaSuccess != eCopy) { std::cerr << "cudaMalloc failed for component Y: " << cudaGetErrorString(eCopy) << std::endl; return 1; } - nvjpegImage_t imgdesc = - { - { - pBuffer, - pBuffer + widths[0]*heights[0], - pBuffer + widths[0]*heights[0]*2, - pBuffer + widths[0]*heights[0]*3 - }, - { - (unsigned int)(is_interleaved(output_format) ? widths[0] * 3 : widths[0]), - (unsigned int)widths[0], - (unsigned int)widths[0], - (unsigned int)widths[0] - } - }; - + nvjpegImage_t imgdesc = {{pBuffer, + pBuffer + widths[0] * heights[0], + pBuffer + widths[0] * heights[0] * 2, + pBuffer + widths[0] * heights[0] * 3}, + {(unsigned int)(is_interleaved(output_format) ? 
widths[0] * 3 : widths[0]), + (unsigned int)widths[0], + (unsigned int)widths[0], + (unsigned int)widths[0]}}; + int nReturnCode = 0; cudaDeviceSynchronize(); @@ -182,52 +177,27 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double nReturnCode = nvjpegDecodeGPU(nvjpeg_handle, NULL);*/ cudaDeviceSynchronize(); - if(nReturnCode != 0) - { + if (nReturnCode != 0) { std::cerr << "Error in nvjpegDecode." << std::endl; return 1; } checkCudaErrors(cudaEventRecord(startEvent, NULL)); /////////////////////// encode //////////////////// - if (NVJPEG_OUTPUT_YUV == output_format) - { - checkCudaErrors(nvjpegEncodeYUV(nvjpeg_handle, - encoder_state, - encode_params, - &imgdesc, - subsampling, - widths[0], - heights[0], - NULL)); + if (NVJPEG_OUTPUT_YUV == output_format) { + checkCudaErrors(nvjpegEncodeYUV( + nvjpeg_handle, encoder_state, encode_params, &imgdesc, subsampling, widths[0], heights[0], NULL)); } - else - { - checkCudaErrors(nvjpegEncodeImage(nvjpeg_handle, - encoder_state, - encode_params, - &imgdesc, - input_format, - widths[0], - heights[0], - NULL)); + else { + checkCudaErrors(nvjpegEncodeImage( + nvjpeg_handle, encoder_state, encode_params, &imgdesc, input_format, widths[0], heights[0], NULL)); } std::vector<unsigned char> obuffer; - size_t length; - checkCudaErrors(nvjpegEncodeRetrieveBitstream( - nvjpeg_handle, - encoder_state, - NULL, - &length, - NULL)); + size_t length; + checkCudaErrors(nvjpegEncodeRetrieveBitstream(nvjpeg_handle, encoder_state, NULL, &length, NULL)); obuffer.resize(length); - checkCudaErrors(nvjpegEncodeRetrieveBitstream( - nvjpeg_handle, - encoder_state, - obuffer.data(), - &length, - NULL)); + checkCudaErrors(nvjpegEncodeRetrieveBitstream(nvjpeg_handle, encoder_state, obuffer.data(), &length, NULL)); checkCudaErrors(cudaEventRecord(stopEvent, NULL)); checkCudaErrors(cudaEventSynchronize(stopEvent)); @@ -235,10 +205,10 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double encoder_time = static_cast<double>(loopTime); std::string output_filename = sOutputPath + "/" + sFileName + ".jpg"; - char directory[120]; - char mkdir_cmd[256]; + char directory[120]; + char mkdir_cmd[256]; std::string folder = sOutputPath; - output_filename = folder + "/"+ sFileName +".jpg"; + output_filename = folder + "/" + sFileName + ".jpg"; #if !defined(_WIN32) sprintf(directory, "%s", folder.c_str()); sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory); @@ -252,7 +222,7 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double std::cout << "Writing JPEG file: " << output_filename << std::endl; std::ofstream outputFile(output_filename.c_str(), std::ios::out | std::ios::binary); outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<std::streamsize>(length)); - + // Free memory checkCudaErrors(cudaFree(pBuffer)); } @@ -265,75 +235,61 @@ int decodeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double int processArgs(encode_params_t param) { - std::string sInputPath(param.input_dir); - std::string sOutputPath(param.output_dir); - std::string sFormat(param.format); - std::string sSubsampling(param.subsampling); + std::string sInputPath(param.input_dir); + std::string sOutputPath(param.output_dir); + std::string sFormat(param.format); + std::string sSubsampling(param.subsampling); nvjpegOutputFormat_t oformat = NVJPEG_OUTPUT_RGB; - nvjpegInputFormat_t iformat = NVJPEG_INPUT_RGB; + nvjpegInputFormat_t iformat = NVJPEG_INPUT_RGB; int error_code = 1; - if (sFormat == "yuv") - { + if (sFormat == "yuv") { oformat = 
NVJPEG_OUTPUT_YUV; - } - else if (sFormat == "rgb") - { + } + else if (sFormat == "rgb") { oformat = NVJPEG_OUTPUT_RGB; iformat = NVJPEG_INPUT_RGB; } - else if (sFormat == "bgr") - { + else if (sFormat == "bgr") { oformat = NVJPEG_OUTPUT_BGR; iformat = NVJPEG_INPUT_BGR; } - else if (sFormat == "rgbi") - { + else if (sFormat == "rgbi") { oformat = NVJPEG_OUTPUT_RGBI; iformat = NVJPEG_INPUT_RGBI; } - else if (sFormat == "bgri") - { + else if (sFormat == "bgri") { oformat = NVJPEG_OUTPUT_BGRI; iformat = NVJPEG_INPUT_BGRI; } - else - { + else { std::cerr << "Unknown or unsupported output format: " << sFormat << std::endl; return error_code; } - if (sSubsampling == "444") - { + if (sSubsampling == "444") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_444, NULL)); } - else if (sSubsampling == "422") - { + else if (sSubsampling == "422") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_422, NULL)); } - else if (sSubsampling == "420") - { + else if (sSubsampling == "420") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_420, NULL)); } - else if (sSubsampling == "440") - { + else if (sSubsampling == "440") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_440, NULL)); } - else if (sSubsampling == "411") - { + else if (sSubsampling == "411") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_411, NULL)); } - else if (sSubsampling == "410") - { + else if (sSubsampling == "410") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_410, NULL)); } - else if (sSubsampling == "400") - { + else if (sSubsampling == "400") { checkCudaErrors(nvjpegEncoderParamsSetSamplingFactors(encode_params, NVJPEG_CSS_GRAY, NULL)); } - else - { + else { std::cerr << "Unknown or unsupported subsampling: " << sSubsampling << std::endl; return error_code; } @@ -355,144 +311,146 @@ int processArgs(encode_params_t param) }*/ std::vector inputFiles; - if (readInput(sInputPath, inputFiles)) - { + if (readInput(sInputPath, inputFiles)) { return error_code; } - - double total_time = 0., encoder_time = 0.; - int total_images = 0; - for (unsigned int i = 0; i < inputFiles.size(); i++) - { + double total_time = 0., encoder_time = 0.; + int total_images = 0; + + for (unsigned int i = 0; i < inputFiles.size(); i++) { std::string &sFileName = inputFiles[i]; std::cout << "Processing file: " << sFileName << std::endl; int image_error_code = decodeEncodeOneImage(sFileName, sOutputPath, encoder_time, oformat, iformat); - if (image_error_code) - { + if (image_error_code) { std::cerr << "Error processing file: " << sFileName << std::endl; - //return image_error_code; + // return image_error_code; } - else - { + else { total_images++; total_time += encoder_time; - } + } } std::cout << "Total images processed: " << total_images << std::endl; std::cout << "Total time spent on encoding: " << total_time << std::endl; - std::cout << "Avg time/image: " << total_time/total_images << std::endl; + std::cout << "Avg time/image: " << total_time / total_images << std::endl; return 0; } // parse parameters -int findParamIndex(const char **argv, int argc, const char *parm) { - int count = 0; - int index = -1; +int findParamIndex(const char **argv, int argc, const char *parm) +{ + int count = 0; + int index = -1; - for (int i = 0; i < argc; i++) { - if (strncmp(argv[i], parm, 100) == 0) { - index = i; - count++; + for (int i = 0; i < argc; i++) { + if (strncmp(argv[i], 
parm, 100) == 0) { + index = i; + count++; + } + } + + if (count == 0 || count == 1) { + return index; + } + else { + std::cout << "Error, parameter " << parm << " has been specified more than once, exiting\n" << std::endl; + return -1; } - } - if (count == 0 || count == 1) { - return index; - } else { - std::cout << "Error, parameter " << parm - << " has been specified more than once, exiting\n" - << std::endl; return -1; - } - - return -1; } -int main(int argc, const char *argv[]) +int main(int argc, const char *argv[]) { - int pidx; + int pidx; - if ((pidx = findParamIndex(argv, argc, "-h")) != -1 || - (pidx = findParamIndex(argv, argc, "--help")) != -1) { - std::cout << "Usage: " << argv[0] - << " -i images_dir [-o output_dir] [-device=device_id]" - "[-q quality][-s 420/444] [-fmt output_format] [-huf 0]\n"; - std::cout << "Parameters: " << std::endl; - std::cout << "\timages_dir\t:\tPath to single image or directory of images" << std::endl; - std::cout << "\toutput_dir\t:\tWrite encoded images as jpeg to this directory" << std::endl; - std::cout << "\tdevice_id\t:\tWhich device to use for encoding" << std::endl; - std::cout << "\tQuality\t:\tUse image quality [default 70]" << std::endl; - std::cout << "\tsubsampling\t:\tUse Subsampling [420, 444]" << std::endl; - std::cout << "\toutput_format\t:\tnvJPEG output format for encoding. One " - "of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]" - << std::endl; - std::cout << "\tHuffman Optimization\t:\tUse Huffman optimization [default 0]" << std::endl; - return EXIT_SUCCESS; - } - - encode_params_t params; - - params.input_dir = "./"; - if ((pidx = findParamIndex(argv, argc, "-i")) != -1) { - params.input_dir = argv[pidx + 1]; - } else { - // Search in default paths for input images. - int found = getInputDir(params.input_dir, argv[0]); - if (!found) - { - std::cout << "Please specify input directory with encoded images"<< std::endl; - return EXIT_WAIVED; + if ((pidx = findParamIndex(argv, argc, "-h")) != -1 || (pidx = findParamIndex(argv, argc, "--help")) != -1) { + std::cout << "Usage: " << argv[0] + << " -i images_dir [-o output_dir] [-device=device_id]" + "[-q quality][-s 420/444] [-fmt output_format] [-huf 0]\n"; + std::cout << "Parameters: " << std::endl; + std::cout << "\timages_dir\t:\tPath to single image or directory of images" << std::endl; + std::cout << "\toutput_dir\t:\tWrite encoded images as jpeg to this directory" << std::endl; + std::cout << "\tdevice_id\t:\tWhich device to use for encoding" << std::endl; + std::cout << "\tQuality\t:\tUse image quality [default 70]" << std::endl; + std::cout << "\tsubsampling\t:\tUse Subsampling [420, 444]" << std::endl; + std::cout << "\toutput_format\t:\tnvJPEG output format for encoding. 
-  if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
-    params.output_dir = argv[pidx + 1];
-  } else {
-    // by-default write the folder named "output" in cwd
-    params.output_dir = "encode_output";
-  }
-  params.dev = 0;
-  params.dev = findCudaDevice(argc, argv);
-  params.quality = 70;
-  if ((pidx = findParamIndex(argv, argc, "-q")) != -1) {
-    params.quality = std::atoi(argv[pidx + 1]);
-  }
+    encode_params_t params;

-  if ((pidx = findParamIndex(argv, argc, "-s")) != -1) {
-    params.subsampling = argv[pidx + 1];
-  } else {
-    // by-default use subsampling as 420
-    params.subsampling = "420";
-  }
-  if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) {
-    params.format = argv[pidx + 1];
-  } else {
-    // by-default use output format yuv
-    params.format = "yuv";
-  }
+    params.input_dir = "./";
+    if ((pidx = findParamIndex(argv, argc, "-i")) != -1) {
+        params.input_dir = argv[pidx + 1];
+    }
+    else {
+        // Search in default paths for input images.
+        int found = getInputDir(params.input_dir, argv[0]);
+        if (!found) {
+            std::cout << "Please specify input directory with encoded images" << std::endl;
+            return EXIT_WAIVED;
+        }
+    }
+    if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
+        params.output_dir = argv[pidx + 1];
+    }
+    else {
+        // by default, write to the "encode_output" folder in cwd
+        params.output_dir = "encode_output";
+    }
+    params.dev = 0;
+    params.dev = findCudaDevice(argc, argv);

-  params.huf = 0;
-  if ((pidx = findParamIndex(argv, argc, "-huf")) != -1) {
-    params.huf = std::atoi(argv[pidx + 1]);
-  }
+    params.quality = 70;
+    if ((pidx = findParamIndex(argv, argc, "-q")) != -1) {
+        params.quality = std::atoi(argv[pidx + 1]);
+    }
+
+    if ((pidx = findParamIndex(argv, argc, "-s")) != -1) {
+        params.subsampling = argv[pidx + 1];
+    }
+    else {
+        // by default, use 420 subsampling
+        params.subsampling = "420";
+    }
+    if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) {
+        params.format = argv[pidx + 1];
+    }
+    else {
+        // by default, use the yuv output format
+        params.format = "yuv";
+    }
+
+    params.huf = 0;
+    if ((pidx = findParamIndex(argv, argc, "-huf")) != -1) {
+        params.huf = std::atoi(argv[pidx + 1]);
+    }

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, params.dev));

    printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n",
-         params.dev, props.name, props.multiProcessorCount,
-         props.maxThreadsPerMultiProcessor, props.major, props.minor,
-         props.ECCEnabled ? "on" : "off");
+           params.dev,
+           props.name,
+           props.multiProcessorCount,
+           props.maxThreadsPerMultiProcessor,
+           props.major,
+           props.minor,
+           props.ECCEnabled ?
"on" : "off"); nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; checkCudaErrors(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator, &nvjpeg_handle)); checkCudaErrors(nvjpegJpegStateCreate(nvjpeg_handle, &jpeg_state)); checkCudaErrors(nvjpegEncoderStateCreate(nvjpeg_handle, &encoder_state, NULL)); checkCudaErrors(nvjpegEncoderParamsCreate(nvjpeg_handle, &encode_params, NULL)); - + // sample input parameters checkCudaErrors(nvjpegEncoderParamsSetQuality(encode_params, params.quality, NULL)); checkCudaErrors(nvjpegEncoderParamsSetOptimizedHuffman(encode_params, params.huf, NULL)); diff --git a/Samples/4_CUDA_Libraries/oceanFFT/README.md b/Samples/4_CUDA_Libraries/oceanFFT/README.md index de66ebbe..b5de0181 100644 --- a/Samples/4_CUDA_Libraries/oceanFFT/README.md +++ b/Samples/4_CUDA_Libraries/oceanFFT/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp b/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp index 68c5d97c..a4cc4890 100644 --- a/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp +++ b/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT.cpp @@ -46,19 +46,17 @@ #endif // includes -#include -#include -#include -#include -#include - -#include #include +#include #include - #include #include +#include +#include #include +#include +#include +#include #if defined(__APPLE__) || defined(MACOSX) #pragma clang diagnostic ignored "-Wdeprecated-declarations" @@ -71,89 +69,88 @@ const char *sSDKsample = "CUDA FFT Ocean Simulation"; -#define MAX_EPSILON 0.10f -#define THRESHOLD 0.15f -#define REFRESH_DELAY 10 // ms +#define MAX_EPSILON 0.10f +#define THRESHOLD 0.15f +#define REFRESH_DELAY 10 // ms //////////////////////////////////////////////////////////////////////////////// // constants unsigned int windowW = 512, windowH = 512; -const unsigned int meshSize = 256; +const unsigned int meshSize = 256; const unsigned int spectrumW = meshSize + 4; const unsigned int spectrumH = meshSize + 1; const int frameCompare = 4; // OpenGL vertex buffers -GLuint posVertexBuffer; -GLuint heightVertexBuffer, slopeVertexBuffer; +GLuint posVertexBuffer; +GLuint heightVertexBuffer, slopeVertexBuffer; struct cudaGraphicsResource *cuda_posVB_resource, *cuda_heightVB_resource, - *cuda_slopeVB_resource; // handles OpenGL-CUDA exchange + *cuda_slopeVB_resource; // handles OpenGL-CUDA exchange GLuint indexBuffer; GLuint shaderProg; -char *vertShaderPath = 0, *fragShaderPath = 0; +char *vertShaderPath = 0, *fragShaderPath = 0; // mouse controls -int mouseOldX, mouseOldY; -int mouseButtons = 0; +int mouseOldX, mouseOldY; +int mouseButtons = 0; float rotateX = 20.0f, rotateY = 0.0f; float translateX = 0.0f, translateY = 0.0f, translateZ = -2.0f; -bool animate = true; -bool drawPoints = false; -bool wireFrame = false; +bool animate = true; +bool drawPoints = false; +bool wireFrame = false; bool g_hasDouble = false; // FFT data cufftHandle fftPlan; -float2 *d_h0 = 0; // heightfield at time 0 -float2 *h_h0 = 0; -float2 *d_ht = 0; // heightfield at time t -float2 *d_slope = 0; +float2 *d_h0 = 0; // heightfield at time 0 +float2 *h_h0 = 0; +float2 *d_ht = 0; // heightfield at time t +float2 *d_slope = 0; // pointers to device object -float *g_hptr = NULL; +float *g_hptr = NULL; float2 *g_sptr = NULL; // simulation parameters -const float g = 9.81f; // gravitational constant -const float A = 
1e-7f; // wave scale factor -const float patchSize = 100; // patch size -float windSpeed = 100.0f; -float windDir = CUDART_PI_F / 3.0f; -float dirDepend = 0.07f; +const float g = 9.81f; // gravitational constant +const float A = 1e-7f; // wave scale factor +const float patchSize = 100; // patch size +float windSpeed = 100.0f; +float windDir = CUDART_PI_F / 3.0f; +float dirDepend = 0.07f; -StopWatchInterface *timer = NULL; -float animTime = 0.0f; -float prevTime = 0.0f; -float animationRate = -0.001f; +StopWatchInterface *timer = NULL; +float animTime = 0.0f; +float prevTime = 0.0f; +float animationRate = -0.001f; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; //////////////////////////////////////////////////////////////////////////////// // kernels -//#include +// #include -extern "C" void cudaGenerateSpectrumKernel(float2 *d_h0, float2 *d_ht, +extern "C" void cudaGenerateSpectrumKernel(float2 *d_h0, + float2 *d_ht, unsigned int in_width, unsigned int out_width, unsigned int out_height, - float animTime, float patchSize); + float animTime, + float patchSize); -extern "C" void cudaUpdateHeightmapKernel(float *d_heightMap, float2 *d_ht, - unsigned int width, - unsigned int height, bool autoTest); +extern "C" void +cudaUpdateHeightmapKernel(float *d_heightMap, float2 *d_ht, unsigned int width, unsigned int height, bool autoTest); -extern "C" void cudaCalculateSlopeKernel(float *h, float2 *slopeOut, - unsigned int width, - unsigned int height); +extern "C" void cudaCalculateSlopeKernel(float *h, float2 *slopeOut, unsigned int width, unsigned int height); //////////////////////////////////////////////////////////////////////////////// // forward declarations @@ -161,11 +158,11 @@ void runAutoTest(int argc, char **argv); void runGraphicsTest(int argc, char **argv); // GL functionality -bool initGL(int *argc, char **argv); -void createVBO(GLuint *vbo, int size); -void deleteVBO(GLuint *vbo); -void createMeshIndexBuffer(GLuint *id, int w, int h); -void createMeshPositionVBO(GLuint *id, int w, int h); +bool initGL(int *argc, char **argv); +void createVBO(GLuint *vbo, int size); +void deleteVBO(GLuint *vbo); +void createMeshIndexBuffer(GLuint *id, int w, int h); +void createMeshPositionVBO(GLuint *id, int w, int h); GLuint loadGLSLProgram(const char *vertFileName, const char *fragFileName); // rendering callbacks @@ -184,162 +181,163 @@ void generate_h0(float2 *h0); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf( - "NOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); +int main(int argc, char **argv) +{ + printf("NOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); - // check for command line arguments - if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { - animate = false; - fpsLimit = frameCheckNumber; - runAutoTest(argc, argv); - } else { - printf( - "[%s]\n\n" - "Left mouse button - rotate\n" - "Middle mouse button - pan\n" - "Right mouse button - zoom\n" - "'w' key - toggle wireframe\n", - sSDKsample); + // check for command line arguments + if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + animate = false; + fpsLimit = frameCheckNumber; + runAutoTest(argc, argv); + } + else { + printf("[%s]\n\n" + "Left mouse button - rotate\n" + "Middle mouse button - pan\n" + "Right mouse button - zoom\n" + "'w' key - toggle wireframe\n", + sSDKsample); - runGraphicsTest(argc, argv); - } - - exit(EXIT_SUCCESS); -} - -//////////////////////////////////////////////////////////////////////////////// -//! Run test -//////////////////////////////////////////////////////////////////////////////// -void runAutoTest(int argc, char **argv) { - printf("%s Starting...\n\n", argv[0]); - - // Cuda init - int dev = findCudaDevice(argc, (const char **)argv); - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor); - - // create FFT plan - checkCudaErrors(cufftPlan2d(&fftPlan, meshSize, meshSize, CUFFT_C2C)); - - // allocate memory - int spectrumSize = spectrumW * spectrumH * sizeof(float2); - checkCudaErrors(cudaMalloc((void **)&d_h0, spectrumSize)); - h_h0 = (float2 *)malloc(spectrumSize); - generate_h0(h_h0); - checkCudaErrors(cudaMemcpy(d_h0, h_h0, spectrumSize, cudaMemcpyHostToDevice)); - - int outputSize = meshSize * meshSize * sizeof(float2); - checkCudaErrors(cudaMalloc((void **)&d_ht, outputSize)); - checkCudaErrors(cudaMalloc((void **)&d_slope, outputSize)); - - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - prevTime = sdkGetTimerValue(&timer); - - runCudaTest(argv[0]); - - checkCudaErrors(cudaFree(d_ht)); - checkCudaErrors(cudaFree(d_slope)); - checkCudaErrors(cudaFree(d_h0)); - checkCudaErrors(cufftDestroy(fftPlan)); - free(h_h0); - - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); -} - -//////////////////////////////////////////////////////////////////////////////// -//! Run test -//////////////////////////////////////////////////////////////////////////////// -void runGraphicsTest(int argc, char **argv) { -#if defined(__linux__) - setenv("DISPLAY", ":0", 0); -#endif - - printf("[%s] ", sSDKsample); - printf("\n"); - - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n in OpenGL mode\n"); - printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); - printf(" > %s -device=n -qatest\n", argv[0]); - printf("exiting...\n"); + runGraphicsTest(argc, argv); + } exit(EXIT_SUCCESS); - } +} - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with OpenGL/CUDA - // interop. - if (false == initGL(&argc, argv)) { - return; - } +//////////////////////////////////////////////////////////////////////////////// +//! 
Run test +//////////////////////////////////////////////////////////////////////////////// +void runAutoTest(int argc, char **argv) +{ + printf("%s Starting...\n\n", argv[0]); - findCudaDevice(argc, (const char **)argv); + // Cuda init + int dev = findCudaDevice(argc, (const char **)argv); - // create FFT plan - checkCudaErrors(cufftPlan2d(&fftPlan, meshSize, meshSize, CUFFT_C2C)); + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor); - // allocate memory - int spectrumSize = spectrumW * spectrumH * sizeof(float2); - checkCudaErrors(cudaMalloc((void **)&d_h0, spectrumSize)); - h_h0 = (float2 *)malloc(spectrumSize); - generate_h0(h_h0); - checkCudaErrors(cudaMemcpy(d_h0, h_h0, spectrumSize, cudaMemcpyHostToDevice)); + // create FFT plan + checkCudaErrors(cufftPlan2d(&fftPlan, meshSize, meshSize, CUFFT_C2C)); - int outputSize = meshSize * meshSize * sizeof(float2); - checkCudaErrors(cudaMalloc((void **)&d_ht, outputSize)); - checkCudaErrors(cudaMalloc((void **)&d_slope, outputSize)); + // allocate memory + int spectrumSize = spectrumW * spectrumH * sizeof(float2); + checkCudaErrors(cudaMalloc((void **)&d_h0, spectrumSize)); + h_h0 = (float2 *)malloc(spectrumSize); + generate_h0(h_h0); + checkCudaErrors(cudaMemcpy(d_h0, h_h0, spectrumSize, cudaMemcpyHostToDevice)); - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - prevTime = sdkGetTimerValue(&timer); + int outputSize = meshSize * meshSize * sizeof(float2); + checkCudaErrors(cudaMalloc((void **)&d_ht, outputSize)); + checkCudaErrors(cudaMalloc((void **)&d_slope, outputSize)); - // create vertex buffers and register with CUDA - createVBO(&heightVertexBuffer, meshSize * meshSize * sizeof(float)); - checkCudaErrors( - cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, - cudaGraphicsMapFlagsWriteDiscard)); + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + prevTime = sdkGetTimerValue(&timer); - createVBO(&slopeVertexBuffer, outputSize); - checkCudaErrors( - cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, - cudaGraphicsMapFlagsWriteDiscard)); + runCudaTest(argv[0]); - // create vertex and index buffer for mesh - createMeshPositionVBO(&posVertexBuffer, meshSize, meshSize); - createMeshIndexBuffer(&indexBuffer, meshSize, meshSize); + checkCudaErrors(cudaFree(d_ht)); + checkCudaErrors(cudaFree(d_slope)); + checkCudaErrors(cudaFree(d_h0)); + checkCudaErrors(cufftDestroy(fftPlan)); + free(h_h0); - runCuda(); + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} - // register callbacks - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutMouseFunc(mouse); - glutMotionFunc(motion); - glutReshapeFunc(reshape); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); +//////////////////////////////////////////////////////////////////////////////// +//! 
Run test +//////////////////////////////////////////////////////////////////////////////// +void runGraphicsTest(int argc, char **argv) +{ +#if defined(__linux__) + setenv("DISPLAY", ":0", 0); +#endif - // start rendering mainloop - glutMainLoop(); + printf("[%s] ", sSDKsample); + printf("\n"); + + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n in OpenGL mode\n"); + printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); + printf(" > %s -device=n -qatest\n", argv[0]); + printf("exiting...\n"); + + exit(EXIT_SUCCESS); + } + + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA + // interop. + if (false == initGL(&argc, argv)) { + return; + } + + findCudaDevice(argc, (const char **)argv); + + // create FFT plan + checkCudaErrors(cufftPlan2d(&fftPlan, meshSize, meshSize, CUFFT_C2C)); + + // allocate memory + int spectrumSize = spectrumW * spectrumH * sizeof(float2); + checkCudaErrors(cudaMalloc((void **)&d_h0, spectrumSize)); + h_h0 = (float2 *)malloc(spectrumSize); + generate_h0(h_h0); + checkCudaErrors(cudaMemcpy(d_h0, h_h0, spectrumSize, cudaMemcpyHostToDevice)); + + int outputSize = meshSize * meshSize * sizeof(float2); + checkCudaErrors(cudaMalloc((void **)&d_ht, outputSize)); + checkCudaErrors(cudaMalloc((void **)&d_slope, outputSize)); + + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + prevTime = sdkGetTimerValue(&timer); + + // create vertex buffers and register with CUDA + createVBO(&heightVertexBuffer, meshSize * meshSize * sizeof(float)); + checkCudaErrors( + cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); + + createVBO(&slopeVertexBuffer, outputSize); + checkCudaErrors( + cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); + + // create vertex and index buffer for mesh + createMeshPositionVBO(&posVertexBuffer, meshSize, meshSize); + createMeshIndexBuffer(&indexBuffer, meshSize, meshSize); + + runCuda(); + + // register callbacks + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutReshapeFunc(reshape); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + + // start rendering mainloop + glutMainLoop(); } float urand() { return rand() / (float)RAND_MAX; } // Generates Gaussian random number with mean 0 and standard deviation 1. 
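// (The generator below is the Box-Muller transform: for independent
// u1, u2 ~ U(0,1), z = sqrt(-2 ln u1) * cos(2*pi*u2) is distributed N(0,1);
// clamping u1 away from zero guards against logf(0). The phillips() function
// further down then shapes these Gaussian draws with the Phillips spectrum
// P(k) = A * exp(-1/(k*L)^2) / k^4 * (k_hat . w_hat)^2, where L = V^2 / g.
// A self-contained host-side sketch of the same sampler, with an illustrative
// name that is not part of this sample:
//
//   #include <cmath>
//   #include <cstdlib>
//   float gaussian01()
//   {
//       float u1 = fmaxf(rand() / (float)RAND_MAX, 1e-6f); // avoid logf(0)
//       float u2 = rand() / (float)RAND_MAX;
//       return sqrtf(-2.0f * logf(u1)) * cosf(2.0f * 3.14159265f * u2);
//   }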
-float gauss() { - float u1 = urand(); - float u2 = urand(); +float gauss() +{ + float u1 = urand(); + float u2 = urand(); - if (u1 < 1e-6f) { - u1 = 1e-6f; - } + if (u1 < 1e-6f) { + u1 = 1e-6f; + } - return sqrtf(-2 * logf(u1)) * cosf(2 * CUDART_PI_F * u2); + return sqrtf(-2 * logf(u1)) * cosf(2 * CUDART_PI_F * u2); } // Phillips spectrum @@ -347,147 +345,142 @@ float gauss() { // Vdir - wind angle in radians // V - wind speed // A - constant -float phillips(float Kx, float Ky, float Vdir, float V, float A, - float dir_depend) { - float k_squared = Kx * Kx + Ky * Ky; +float phillips(float Kx, float Ky, float Vdir, float V, float A, float dir_depend) +{ + float k_squared = Kx * Kx + Ky * Ky; - if (k_squared == 0.0f) { - return 0.0f; - } + if (k_squared == 0.0f) { + return 0.0f; + } - // largest possible wave from constant wind of velocity v - float L = V * V / g; + // largest possible wave from constant wind of velocity v + float L = V * V / g; - float k_x = Kx / sqrtf(k_squared); - float k_y = Ky / sqrtf(k_squared); - float w_dot_k = k_x * cosf(Vdir) + k_y * sinf(Vdir); + float k_x = Kx / sqrtf(k_squared); + float k_y = Ky / sqrtf(k_squared); + float w_dot_k = k_x * cosf(Vdir) + k_y * sinf(Vdir); - float phillips = A * expf(-1.0f / (k_squared * L * L)) / - (k_squared * k_squared) * w_dot_k * w_dot_k; + float phillips = A * expf(-1.0f / (k_squared * L * L)) / (k_squared * k_squared) * w_dot_k * w_dot_k; - // filter out waves moving opposite to wind - if (w_dot_k < 0.0f) { - phillips *= dir_depend; - } + // filter out waves moving opposite to wind + if (w_dot_k < 0.0f) { + phillips *= dir_depend; + } - // damp out waves with very small length w << l - // float w = L / 10000; - // phillips *= expf(-k_squared * w * w); + // damp out waves with very small length w << l + // float w = L / 10000; + // phillips *= expf(-k_squared * w * w); - return phillips; + return phillips; } // Generate base heightfield in frequency space -void generate_h0(float2 *h0) { - for (unsigned int y = 0; y <= meshSize; y++) { - for (unsigned int x = 0; x <= meshSize; x++) { - float kx = (-(int)meshSize / 2.0f + x) * (2.0f * CUDART_PI_F / patchSize); - float ky = (-(int)meshSize / 2.0f + y) * (2.0f * CUDART_PI_F / patchSize); +void generate_h0(float2 *h0) +{ + for (unsigned int y = 0; y <= meshSize; y++) { + for (unsigned int x = 0; x <= meshSize; x++) { + float kx = (-(int)meshSize / 2.0f + x) * (2.0f * CUDART_PI_F / patchSize); + float ky = (-(int)meshSize / 2.0f + y) * (2.0f * CUDART_PI_F / patchSize); - float P = sqrtf(phillips(kx, ky, windDir, windSpeed, A, dirDepend)); + float P = sqrtf(phillips(kx, ky, windDir, windSpeed, A, dirDepend)); - if (kx == 0.0f && ky == 0.0f) { - P = 0.0f; - } + if (kx == 0.0f && ky == 0.0f) { + P = 0.0f; + } - // float Er = urand()*2.0f-1.0f; - // float Ei = urand()*2.0f-1.0f; - float Er = gauss(); - float Ei = gauss(); + // float Er = urand()*2.0f-1.0f; + // float Ei = urand()*2.0f-1.0f; + float Er = gauss(); + float Ei = gauss(); - float h0_re = Er * P * CUDART_SQRT_HALF_F; - float h0_im = Ei * P * CUDART_SQRT_HALF_F; + float h0_re = Er * P * CUDART_SQRT_HALF_F; + float h0_im = Ei * P * CUDART_SQRT_HALF_F; - int i = y * spectrumW + x; - h0[i].x = h0_re; - h0[i].y = h0_im; + int i = y * spectrumW + x; + h0[i].x = h0_re; + h0[i].y = h0_im; + } } - } } //////////////////////////////////////////////////////////////////////////////// //! 
Run the Cuda kernels //////////////////////////////////////////////////////////////////////////////// -void runCuda() { - size_t num_bytes; +void runCuda() +{ + size_t num_bytes; - // generate wave spectrum in frequency domain - cudaGenerateSpectrumKernel(d_h0, d_ht, spectrumW, meshSize, meshSize, - animTime, patchSize); + // generate wave spectrum in frequency domain + cudaGenerateSpectrumKernel(d_h0, d_ht, spectrumW, meshSize, meshSize, animTime, patchSize); - // execute inverse FFT to convert to spatial domain - checkCudaErrors(cufftExecC2C(fftPlan, d_ht, d_ht, CUFFT_INVERSE)); + // execute inverse FFT to convert to spatial domain + checkCudaErrors(cufftExecC2C(fftPlan, d_ht, d_ht, CUFFT_INVERSE)); - // update heightmap values in vertex buffer - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_heightVB_resource, 0)); - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&g_hptr, &num_bytes, cuda_heightVB_resource)); + // update heightmap values in vertex buffer + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_heightVB_resource, 0)); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&g_hptr, &num_bytes, cuda_heightVB_resource)); - cudaUpdateHeightmapKernel(g_hptr, d_ht, meshSize, meshSize, false); + cudaUpdateHeightmapKernel(g_hptr, d_ht, meshSize, meshSize, false); - // calculate slope for shading - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_slopeVB_resource, 0)); - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&g_sptr, &num_bytes, cuda_slopeVB_resource)); + // calculate slope for shading + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_slopeVB_resource, 0)); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&g_sptr, &num_bytes, cuda_slopeVB_resource)); - cudaCalculateSlopeKernel(g_hptr, g_sptr, meshSize, meshSize); + cudaCalculateSlopeKernel(g_hptr, g_sptr, meshSize, meshSize); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_heightVB_resource, 0)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_slopeVB_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_heightVB_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_slopeVB_resource, 0)); } -void runCudaTest(char *exec_path) { - checkCudaErrors( - cudaMalloc((void **)&g_hptr, meshSize * meshSize * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&g_sptr, meshSize * meshSize * sizeof(float2))); +void runCudaTest(char *exec_path) +{ + checkCudaErrors(cudaMalloc((void **)&g_hptr, meshSize * meshSize * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&g_sptr, meshSize * meshSize * sizeof(float2))); - // generate wave spectrum in frequency domain - cudaGenerateSpectrumKernel(d_h0, d_ht, spectrumW, meshSize, meshSize, - animTime, patchSize); + // generate wave spectrum in frequency domain + cudaGenerateSpectrumKernel(d_h0, d_ht, spectrumW, meshSize, meshSize, animTime, patchSize); - // execute inverse FFT to convert to spatial domain - checkCudaErrors(cufftExecC2C(fftPlan, d_ht, d_ht, CUFFT_INVERSE)); + // execute inverse FFT to convert to spatial domain + checkCudaErrors(cufftExecC2C(fftPlan, d_ht, d_ht, CUFFT_INVERSE)); - // update heightmap values - cudaUpdateHeightmapKernel(g_hptr, d_ht, meshSize, meshSize, true); + // update heightmap values + cudaUpdateHeightmapKernel(g_hptr, d_ht, meshSize, meshSize, true); - { - float *hptr = (float *)malloc(meshSize * meshSize * sizeof(float)); - cudaMemcpy((void *)hptr, (void *)g_hptr, - meshSize * meshSize * sizeof(float), cudaMemcpyDeviceToHost); - sdkDumpBin((void 
*)hptr, meshSize * meshSize * sizeof(float), - "spatialDomain.bin"); + { + float *hptr = (float *)malloc(meshSize * meshSize * sizeof(float)); + cudaMemcpy((void *)hptr, (void *)g_hptr, meshSize * meshSize * sizeof(float), cudaMemcpyDeviceToHost); + sdkDumpBin((void *)hptr, meshSize * meshSize * sizeof(float), "spatialDomain.bin"); - if (!sdkCompareBin2BinFloat("spatialDomain.bin", "ref_spatialDomain.bin", - meshSize * meshSize, MAX_EPSILON, THRESHOLD, - exec_path)) { - g_TotalErrors++; + if (!sdkCompareBin2BinFloat( + "spatialDomain.bin", "ref_spatialDomain.bin", meshSize * meshSize, MAX_EPSILON, THRESHOLD, exec_path)) { + g_TotalErrors++; + } + + free(hptr); } - free(hptr); - } + // calculate slope for shading + cudaCalculateSlopeKernel(g_hptr, g_sptr, meshSize, meshSize); - // calculate slope for shading - cudaCalculateSlopeKernel(g_hptr, g_sptr, meshSize, meshSize); + { + float2 *sptr = (float2 *)malloc(meshSize * meshSize * sizeof(float2)); + cudaMemcpy((void *)sptr, (void *)g_sptr, meshSize * meshSize * sizeof(float2), cudaMemcpyDeviceToHost); + sdkDumpBin(sptr, meshSize * meshSize * sizeof(float2), "slopeShading.bin"); - { - float2 *sptr = (float2 *)malloc(meshSize * meshSize * sizeof(float2)); - cudaMemcpy((void *)sptr, (void *)g_sptr, - meshSize * meshSize * sizeof(float2), cudaMemcpyDeviceToHost); - sdkDumpBin(sptr, meshSize * meshSize * sizeof(float2), "slopeShading.bin"); + if (!sdkCompareBin2BinFloat("slopeShading.bin", + "ref_slopeShading.bin", + meshSize * meshSize * 2, + MAX_EPSILON, + THRESHOLD, + exec_path)) { + g_TotalErrors++; + } - if (!sdkCompareBin2BinFloat("slopeShading.bin", "ref_slopeShading.bin", - meshSize * meshSize * 2, MAX_EPSILON, THRESHOLD, - exec_path)) { - g_TotalErrors++; + free(sptr); } - free(sptr); - } - - checkCudaErrors(cudaFree(g_hptr)); - checkCudaErrors(cudaFree(g_sptr)); + checkCudaErrors(cudaFree(g_hptr)); + checkCudaErrors(cudaFree(g_sptr)); } // void computeFPS() @@ -503,393 +496,406 @@ void runCudaTest(char *exec_path) { //////////////////////////////////////////////////////////////////////////////// //! 
Display callback //////////////////////////////////////////////////////////////////////////////// -void display() { - // run CUDA kernel to generate vertex positions - if (animate) { - runCuda(); - } +void display() +{ + // run CUDA kernel to generate vertex positions + if (animate) { + runCuda(); + } - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - // set view matrix - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - glTranslatef(translateX, translateY, translateZ); - glRotatef(rotateX, 1.0, 0.0, 0.0); - glRotatef(rotateY, 0.0, 1.0, 0.0); + // set view matrix + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + glTranslatef(translateX, translateY, translateZ); + glRotatef(rotateX, 1.0, 0.0, 0.0); + glRotatef(rotateY, 0.0, 1.0, 0.0); - // render from the vbo - glBindBuffer(GL_ARRAY_BUFFER, posVertexBuffer); - glVertexPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_VERTEX_ARRAY); + // render from the vbo + glBindBuffer(GL_ARRAY_BUFFER, posVertexBuffer); + glVertexPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_VERTEX_ARRAY); - glBindBuffer(GL_ARRAY_BUFFER, heightVertexBuffer); - glClientActiveTexture(GL_TEXTURE0); - glTexCoordPointer(1, GL_FLOAT, 0, 0); - glEnableClientState(GL_TEXTURE_COORD_ARRAY); + glBindBuffer(GL_ARRAY_BUFFER, heightVertexBuffer); + glClientActiveTexture(GL_TEXTURE0); + glTexCoordPointer(1, GL_FLOAT, 0, 0); + glEnableClientState(GL_TEXTURE_COORD_ARRAY); - glBindBuffer(GL_ARRAY_BUFFER, slopeVertexBuffer); - glClientActiveTexture(GL_TEXTURE1); - glTexCoordPointer(2, GL_FLOAT, 0, 0); - glEnableClientState(GL_TEXTURE_COORD_ARRAY); + glBindBuffer(GL_ARRAY_BUFFER, slopeVertexBuffer); + glClientActiveTexture(GL_TEXTURE1); + glTexCoordPointer(2, GL_FLOAT, 0, 0); + glEnableClientState(GL_TEXTURE_COORD_ARRAY); - glUseProgram(shaderProg); + glUseProgram(shaderProg); - // Set default uniform variables parameters for the vertex shader - GLuint uniHeightScale, uniChopiness, uniSize; + // Set default uniform variables parameters for the vertex shader + GLuint uniHeightScale, uniChopiness, uniSize; - uniHeightScale = glGetUniformLocation(shaderProg, "heightScale"); - glUniform1f(uniHeightScale, 0.5f); + uniHeightScale = glGetUniformLocation(shaderProg, "heightScale"); + glUniform1f(uniHeightScale, 0.5f); - uniChopiness = glGetUniformLocation(shaderProg, "chopiness"); - glUniform1f(uniChopiness, 1.0f); + uniChopiness = glGetUniformLocation(shaderProg, "chopiness"); + glUniform1f(uniChopiness, 1.0f); - uniSize = glGetUniformLocation(shaderProg, "size"); - glUniform2f(uniSize, (float)meshSize, (float)meshSize); + uniSize = glGetUniformLocation(shaderProg, "size"); + glUniform2f(uniSize, (float)meshSize, (float)meshSize); - // Set default uniform variables parameters for the pixel shader - GLuint uniDeepColor, uniShallowColor, uniSkyColor, uniLightDir; + // Set default uniform variables parameters for the pixel shader + GLuint uniDeepColor, uniShallowColor, uniSkyColor, uniLightDir; - uniDeepColor = glGetUniformLocation(shaderProg, "deepColor"); - glUniform4f(uniDeepColor, 0.0f, 0.1f, 0.4f, 1.0f); + uniDeepColor = glGetUniformLocation(shaderProg, "deepColor"); + glUniform4f(uniDeepColor, 0.0f, 0.1f, 0.4f, 1.0f); - uniShallowColor = glGetUniformLocation(shaderProg, "shallowColor"); - glUniform4f(uniShallowColor, 0.1f, 0.3f, 0.3f, 1.0f); + uniShallowColor = glGetUniformLocation(shaderProg, "shallowColor"); + glUniform4f(uniShallowColor, 0.1f, 0.3f, 0.3f, 1.0f); - uniSkyColor = glGetUniformLocation(shaderProg, "skyColor"); 
- glUniform4f(uniSkyColor, 1.0f, 1.0f, 1.0f, 1.0f); + uniSkyColor = glGetUniformLocation(shaderProg, "skyColor"); + glUniform4f(uniSkyColor, 1.0f, 1.0f, 1.0f, 1.0f); - uniLightDir = glGetUniformLocation(shaderProg, "lightDir"); - glUniform3f(uniLightDir, 0.0f, 1.0f, 0.0f); - // end of uniform settings + uniLightDir = glGetUniformLocation(shaderProg, "lightDir"); + glUniform3f(uniLightDir, 0.0f, 1.0f, 0.0f); + // end of uniform settings - glColor3f(1.0, 1.0, 1.0); + glColor3f(1.0, 1.0, 1.0); - if (drawPoints) { - glDrawArrays(GL_POINTS, 0, meshSize * meshSize); - } else { - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, indexBuffer); + if (drawPoints) { + glDrawArrays(GL_POINTS, 0, meshSize * meshSize); + } + else { + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, indexBuffer); - glPolygonMode(GL_FRONT_AND_BACK, wireFrame ? GL_LINE : GL_FILL); - glDrawElements(GL_TRIANGLE_STRIP, ((meshSize * 2) + 2) * (meshSize - 1), - GL_UNSIGNED_INT, 0); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + glPolygonMode(GL_FRONT_AND_BACK, wireFrame ? GL_LINE : GL_FILL); + glDrawElements(GL_TRIANGLE_STRIP, ((meshSize * 2) + 2) * (meshSize - 1), GL_UNSIGNED_INT, 0); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); - } + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + } - glDisableClientState(GL_VERTEX_ARRAY); - glClientActiveTexture(GL_TEXTURE0); - glDisableClientState(GL_TEXTURE_COORD_ARRAY); - glClientActiveTexture(GL_TEXTURE1); - glDisableClientState(GL_TEXTURE_COORD_ARRAY); + glDisableClientState(GL_VERTEX_ARRAY); + glClientActiveTexture(GL_TEXTURE0); + glDisableClientState(GL_TEXTURE_COORD_ARRAY); + glClientActiveTexture(GL_TEXTURE1); + glDisableClientState(GL_TEXTURE_COORD_ARRAY); - glUseProgram(0); + glUseProgram(0); - glutSwapBuffers(); + glutSwapBuffers(); - // computeFPS(); + // computeFPS(); } -void timerEvent(int value) { - float time = sdkGetTimerValue(&timer); +void timerEvent(int value) +{ + float time = sdkGetTimerValue(&timer); - if (animate) { - animTime += (time - prevTime) * animationRate; - } + if (animate) { + animTime += (time - prevTime) * animationRate; + } - glutPostRedisplay(); - prevTime = time; + glutPostRedisplay(); + prevTime = time; - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); } -void cleanup() { - sdkDeleteTimer(&timer); - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_heightVB_resource)); - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_slopeVB_resource)); +void cleanup() +{ + sdkDeleteTimer(&timer); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_heightVB_resource)); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_slopeVB_resource)); - deleteVBO(&posVertexBuffer); - deleteVBO(&heightVertexBuffer); - deleteVBO(&slopeVertexBuffer); + deleteVBO(&posVertexBuffer); + deleteVBO(&heightVertexBuffer); + deleteVBO(&slopeVertexBuffer); - checkCudaErrors(cudaFree(d_h0)); - checkCudaErrors(cudaFree(d_slope)); - checkCudaErrors(cudaFree(d_ht)); - free(h_h0); - cufftDestroy(fftPlan); + checkCudaErrors(cudaFree(d_h0)); + checkCudaErrors(cudaFree(d_slope)); + checkCudaErrors(cudaFree(d_ht)); + free(h_h0); + cufftDestroy(fftPlan); } //////////////////////////////////////////////////////////////////////////////// //! 
Keyboard events handler //////////////////////////////////////////////////////////////////////////////// -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case (27): - cleanup(); - exit(EXIT_SUCCESS); + cleanup(); + exit(EXIT_SUCCESS); case 'w': - wireFrame = !wireFrame; - break; + wireFrame = !wireFrame; + break; case 'p': - drawPoints = !drawPoints; - break; + drawPoints = !drawPoints; + break; case ' ': - animate = !animate; - break; - } + animate = !animate; + break; + } } //////////////////////////////////////////////////////////////////////////////// //! Mouse event handlers //////////////////////////////////////////////////////////////////////////////// -void mouse(int button, int state, int x, int y) { - if (state == GLUT_DOWN) { - mouseButtons |= 1 << button; - } else if (state == GLUT_UP) { - mouseButtons = 0; - } +void mouse(int button, int state, int x, int y) +{ + if (state == GLUT_DOWN) { + mouseButtons |= 1 << button; + } + else if (state == GLUT_UP) { + mouseButtons = 0; + } - mouseOldX = x; - mouseOldY = y; - glutPostRedisplay(); + mouseOldX = x; + mouseOldY = y; + glutPostRedisplay(); } -void motion(int x, int y) { - float dx, dy; - dx = (float)(x - mouseOldX); - dy = (float)(y - mouseOldY); +void motion(int x, int y) +{ + float dx, dy; + dx = (float)(x - mouseOldX); + dy = (float)(y - mouseOldY); - if (mouseButtons == 1) { - rotateX += dy * 0.2f; - rotateY += dx * 0.2f; - } else if (mouseButtons == 2) { - translateX += dx * 0.01f; - translateY -= dy * 0.01f; - } else if (mouseButtons == 4) { - translateZ += dy * 0.01f; - } + if (mouseButtons == 1) { + rotateX += dy * 0.2f; + rotateY += dx * 0.2f; + } + else if (mouseButtons == 2) { + translateX += dx * 0.01f; + translateY -= dy * 0.01f; + } + else if (mouseButtons == 4) { + translateZ += dy * 0.01f; + } - mouseOldX = x; - mouseOldY = y; + mouseOldX = x; + mouseOldY = y; } -void reshape(int w, int h) { - glViewport(0, 0, w, h); +void reshape(int w, int h) +{ + glViewport(0, 0, w, h); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (double)w / (double)h, 0.1, 10.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (double)w / (double)h, 0.1, 10.0); - windowW = w; - windowH = h; + windowW = w; + windowH = h; } //////////////////////////////////////////////////////////////////////////////// //! 
Initialize GL //////////////////////////////////////////////////////////////////////////////// -bool initGL(int *argc, char **argv) { - // Create GL context - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); - glutInitWindowSize(windowW, windowH); - glutCreateWindow("CUDA FFT Ocean Simulation"); +bool initGL(int *argc, char **argv) +{ + // Create GL context + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); + glutInitWindowSize(windowW, windowH); + glutCreateWindow("CUDA FFT Ocean Simulation"); - vertShaderPath = sdkFindFilePath("ocean.vert", argv[0]); - fragShaderPath = sdkFindFilePath("ocean.frag", argv[0]); + vertShaderPath = sdkFindFilePath("ocean.vert", argv[0]); + fragShaderPath = sdkFindFilePath("ocean.frag", argv[0]); - if (vertShaderPath == NULL || fragShaderPath == NULL) { - fprintf(stderr, "Error unable to find GLSL vertex and fragment shaders!\n"); - exit(EXIT_FAILURE); - } + if (vertShaderPath == NULL || fragShaderPath == NULL) { + fprintf(stderr, "Error unable to find GLSL vertex and fragment shaders!\n"); + exit(EXIT_FAILURE); + } - // initialize necessary OpenGL extensions + // initialize necessary OpenGL extensions - if (!isGLVersionSupported(2, 0)) { - fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); - fflush(stderr); - return false; - } + if (!isGLVersionSupported(2, 0)) { + fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); + fflush(stderr); + return false; + } - if (!areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); - fprintf(stderr, "This sample requires:\n"); - fprintf(stderr, " OpenGL version 1.5\n"); - fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); - fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); - cleanup(); - exit(EXIT_FAILURE); - } + if (!areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); + fprintf(stderr, "This sample requires:\n"); + fprintf(stderr, " OpenGL version 1.5\n"); + fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); + fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); + cleanup(); + exit(EXIT_FAILURE); + } - // default initialization - glClearColor(0.0, 0.0, 0.0, 1.0); - glEnable(GL_DEPTH_TEST); + // default initialization + glClearColor(0.0, 0.0, 0.0, 1.0); + glEnable(GL_DEPTH_TEST); - // load shader - shaderProg = loadGLSLProgram(vertShaderPath, fragShaderPath); + // load shader + shaderProg = loadGLSLProgram(vertShaderPath, fragShaderPath); - SDK_CHECK_ERROR_GL(); - return true; + SDK_CHECK_ERROR_GL(); + return true; } //////////////////////////////////////////////////////////////////////////////// //! Create VBO //////////////////////////////////////////////////////////////////////////////// -void createVBO(GLuint *vbo, int size) { - // create buffer object - glGenBuffers(1, vbo); - glBindBuffer(GL_ARRAY_BUFFER, *vbo); - glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); +void createVBO(GLuint *vbo, int size) +{ + // create buffer object + glGenBuffers(1, vbo); + glBindBuffer(GL_ARRAY_BUFFER, *vbo); + glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// //! 
Delete VBO //////////////////////////////////////////////////////////////////////////////// -void deleteVBO(GLuint *vbo) { - glDeleteBuffers(1, vbo); - *vbo = 0; +void deleteVBO(GLuint *vbo) +{ + glDeleteBuffers(1, vbo); + *vbo = 0; } // create index buffer for rendering quad mesh -void createMeshIndexBuffer(GLuint *id, int w, int h) { - int size = ((w * 2) + 2) * (h - 1) * sizeof(GLuint); +void createMeshIndexBuffer(GLuint *id, int w, int h) +{ + int size = ((w * 2) + 2) * (h - 1) * sizeof(GLuint); - // create index buffer - glGenBuffers(1, id); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, *id); - glBufferData(GL_ELEMENT_ARRAY_BUFFER, size, 0, GL_STATIC_DRAW); + // create index buffer + glGenBuffers(1, id); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, *id); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, size, 0, GL_STATIC_DRAW); - // fill with indices for rendering mesh as triangle strips - GLuint *indices = - (GLuint *)glMapBuffer(GL_ELEMENT_ARRAY_BUFFER, GL_WRITE_ONLY); + // fill with indices for rendering mesh as triangle strips + GLuint *indices = (GLuint *)glMapBuffer(GL_ELEMENT_ARRAY_BUFFER, GL_WRITE_ONLY); - if (!indices) { - return; - } - - for (int y = 0; y < h - 1; y++) { - for (int x = 0; x < w; x++) { - *indices++ = y * w + x; - *indices++ = (y + 1) * w + x; + if (!indices) { + return; } - // start new strip with degenerate triangle - *indices++ = (y + 1) * w + (w - 1); - *indices++ = (y + 1) * w; - } + for (int y = 0; y < h - 1; y++) { + for (int x = 0; x < w; x++) { + *indices++ = y * w + x; + *indices++ = (y + 1) * w + x; + } - glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + // start new strip with degenerate triangle + *indices++ = (y + 1) * w + (w - 1); + *indices++ = (y + 1) * w; + } + + glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); } // create fixed vertex buffer to store mesh vertices -void createMeshPositionVBO(GLuint *id, int w, int h) { - createVBO(id, w * h * 4 * sizeof(float)); +void createMeshPositionVBO(GLuint *id, int w, int h) +{ + createVBO(id, w * h * 4 * sizeof(float)); - glBindBuffer(GL_ARRAY_BUFFER, *id); - float *pos = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + glBindBuffer(GL_ARRAY_BUFFER, *id); + float *pos = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - if (!pos) { - return; - } - - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - float u = x / (float)(w - 1); - float v = y / (float)(h - 1); - *pos++ = u * 2.0f - 1.0f; - *pos++ = 0.0f; - *pos++ = v * 2.0f - 1.0f; - *pos++ = 1.0f; + if (!pos) { + return; } - } - glUnmapBuffer(GL_ARRAY_BUFFER); - glBindBuffer(GL_ARRAY_BUFFER, 0); + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + float u = x / (float)(w - 1); + float v = y / (float)(h - 1); + *pos++ = u * 2.0f - 1.0f; + *pos++ = 0.0f; + *pos++ = v * 2.0f - 1.0f; + *pos++ = 1.0f; + } + } + + glUnmapBuffer(GL_ARRAY_BUFFER); + glBindBuffer(GL_ARRAY_BUFFER, 0); } // Attach shader to a program -int attachShader(GLuint prg, GLenum type, const char *name) { - GLuint shader; - FILE *fp; - int size, compiled; - char *src; +int attachShader(GLuint prg, GLenum type, const char *name) +{ + GLuint shader; + FILE *fp; + int size, compiled; + char *src; - fp = fopen(name, "rb"); + fp = fopen(name, "rb"); - if (!fp) { - return 0; - } + if (!fp) { + return 0; + } - fseek(fp, 0, SEEK_END); - size = ftell(fp); - src = (char *)malloc(size); + fseek(fp, 0, SEEK_END); + size = ftell(fp); + src = (char *)malloc(size); - fseek(fp, 0, SEEK_SET); - fread(src, 
sizeof(char), size, fp); - fclose(fp); + fseek(fp, 0, SEEK_SET); + fread(src, sizeof(char), size, fp); + fclose(fp); - shader = glCreateShader(type); - glShaderSource(shader, 1, (const char **)&src, (const GLint *)&size); - glCompileShader(shader); - glGetShaderiv(shader, GL_COMPILE_STATUS, (GLint *)&compiled); + shader = glCreateShader(type); + glShaderSource(shader, 1, (const char **)&src, (const GLint *)&size); + glCompileShader(shader); + glGetShaderiv(shader, GL_COMPILE_STATUS, (GLint *)&compiled); - if (!compiled) { - char log[2048]; - int len; + if (!compiled) { + char log[2048]; + int len; - glGetShaderInfoLog(shader, 2048, (GLsizei *)&len, log); - printf("Info log: %s\n", log); + glGetShaderInfoLog(shader, 2048, (GLsizei *)&len, log); + printf("Info log: %s\n", log); + glDeleteShader(shader); + return 0; + } + + free(src); + + glAttachShader(prg, shader); glDeleteShader(shader); - return 0; - } - free(src); - - glAttachShader(prg, shader); - glDeleteShader(shader); - - return 1; + return 1; } // Create shader program from vertex shader and fragment shader files -GLuint loadGLSLProgram(const char *vertFileName, const char *fragFileName) { - GLint linked; - GLuint program; +GLuint loadGLSLProgram(const char *vertFileName, const char *fragFileName) +{ + GLint linked; + GLuint program; - program = glCreateProgram(); + program = glCreateProgram(); - if (!attachShader(program, GL_VERTEX_SHADER, vertFileName)) { - glDeleteProgram(program); - fprintf(stderr, "Couldn't attach vertex shader from file %s\n", - vertFileName); - return 0; - } + if (!attachShader(program, GL_VERTEX_SHADER, vertFileName)) { + glDeleteProgram(program); + fprintf(stderr, "Couldn't attach vertex shader from file %s\n", vertFileName); + return 0; + } - if (!attachShader(program, GL_FRAGMENT_SHADER, fragFileName)) { - glDeleteProgram(program); - fprintf(stderr, "Couldn't attach fragment shader from file %s\n", - fragFileName); - return 0; - } + if (!attachShader(program, GL_FRAGMENT_SHADER, fragFileName)) { + glDeleteProgram(program); + fprintf(stderr, "Couldn't attach fragment shader from file %s\n", fragFileName); + return 0; + } - glLinkProgram(program); - glGetProgramiv(program, GL_LINK_STATUS, &linked); + glLinkProgram(program); + glGetProgramiv(program, GL_LINK_STATUS, &linked); - if (!linked) { - glDeleteProgram(program); - char temp[256]; - glGetProgramInfoLog(program, 256, 0, temp); - fprintf(stderr, "Failed to link program: %s\n", temp); - return 0; - } + if (!linked) { + glDeleteProgram(program); + char temp[256]; + glGetProgramInfoLog(program, 256, 0, temp); + fprintf(stderr, "Failed to link program: %s\n", temp); + return 0; + } - return program; + return program; } diff --git a/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT_kernel.cu b/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT_kernel.cu index 8616da2e..e0d47ec4 100644 --- a/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT_kernel.cu +++ b/Samples/4_CUDA_Libraries/oceanFFT/oceanFFT_kernel.cu @@ -35,125 +35,124 @@ int cuda_iDivUp(int a, int b) { return (a + (b - 1)) / b; } // complex math functions __device__ float2 conjugate(float2 arg) { return make_float2(arg.x, -arg.y); } -__device__ float2 complex_exp(float arg) { - return make_float2(cosf(arg), sinf(arg)); -} +__device__ float2 complex_exp(float arg) { return make_float2(cosf(arg), sinf(arg)); } -__device__ float2 complex_add(float2 a, float2 b) { - return make_float2(a.x + b.x, a.y + b.y); -} +__device__ float2 complex_add(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); } -__device__ float2 
complex_mult(float2 ab, float2 cd) { - return make_float2(ab.x * cd.x - ab.y * cd.y, ab.x * cd.y + ab.y * cd.x); +__device__ float2 complex_mult(float2 ab, float2 cd) +{ + return make_float2(ab.x * cd.x - ab.y * cd.y, ab.x * cd.y + ab.y * cd.x); } // generate wave heightfield at time t based on initial heightfield and // dispersion relationship -__global__ void generateSpectrumKernel(float2 *h0, float2 *ht, +__global__ void generateSpectrumKernel(float2 *h0, + float2 *ht, unsigned int in_width, unsigned int out_width, - unsigned int out_height, float t, - float patchSize) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - unsigned int in_index = y * in_width + x; - unsigned int in_mindex = - (out_height - y) * in_width + (out_width - x); // mirrored - unsigned int out_index = y * out_width + x; + unsigned int out_height, + float t, + float patchSize) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int in_index = y * in_width + x; + unsigned int in_mindex = (out_height - y) * in_width + (out_width - x); // mirrored + unsigned int out_index = y * out_width + x; - // calculate wave vector - float2 k; - k.x = (-(int)out_width / 2.0f + x) * (2.0f * CUDART_PI_F / patchSize); - k.y = (-(int)out_width / 2.0f + y) * (2.0f * CUDART_PI_F / patchSize); + // calculate wave vector + float2 k; + k.x = (-(int)out_width / 2.0f + x) * (2.0f * CUDART_PI_F / patchSize); + k.y = (-(int)out_width / 2.0f + y) * (2.0f * CUDART_PI_F / patchSize); - // calculate dispersion w(k) - float k_len = sqrtf(k.x * k.x + k.y * k.y); - float w = sqrtf(9.81f * k_len); + // calculate dispersion w(k) + float k_len = sqrtf(k.x * k.x + k.y * k.y); + float w = sqrtf(9.81f * k_len); - if ((x < out_width) && (y < out_height)) { - float2 h0_k = h0[in_index]; - float2 h0_mk = h0[in_mindex]; + if ((x < out_width) && (y < out_height)) { + float2 h0_k = h0[in_index]; + float2 h0_mk = h0[in_mindex]; - // output frequency-space complex values - ht[out_index] = - complex_add(complex_mult(h0_k, complex_exp(w * t)), - complex_mult(conjugate(h0_mk), complex_exp(-w * t))); - // ht[out_index] = h0_k; - } + // output frequency-space complex values + ht[out_index] = + complex_add(complex_mult(h0_k, complex_exp(w * t)), complex_mult(conjugate(h0_mk), complex_exp(-w * t))); + // ht[out_index] = h0_k; + } } // update height map values based on output of FFT -__global__ void updateHeightmapKernel(float *heightMap, float2 *ht, - unsigned int width) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - unsigned int i = y * width + x; +__global__ void updateHeightmapKernel(float *heightMap, float2 *ht, unsigned int width) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int i = y * width + x; - // cos(pi * (m1 + m2)) - float sign_correction = ((x + y) & 0x01) ? -1.0f : 1.0f; + // cos(pi * (m1 + m2)) + float sign_correction = ((x + y) & 0x01) ? 
-1.0f : 1.0f;
+    // cos(pi * (m1 + m2))
+    float sign_correction = ((x + y) & 0x01) ? -1.0f : 1.0f;

-  heightMap[i] = ht[i].x * sign_correction;
+    heightMap[i] = ht[i].x * sign_correction;
 }

 // update height map values based on output of FFT
-__global__ void updateHeightmapKernel_y(float *heightMap, float2 *ht,
-                                        unsigned int width) {
-  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
-  unsigned int i = y * width + x;
+__global__ void updateHeightmapKernel_y(float *heightMap, float2 *ht, unsigned int width)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int i = y * width + x;

-  // cos(pi * (m1 + m2))
-  float sign_correction = ((x + y) & 0x01) ? -1.0f : 1.0f;
+    // cos(pi * (m1 + m2))
+    float sign_correction = ((x + y) & 0x01) ? -1.0f : 1.0f;

-  heightMap[i] = ht[i].y * sign_correction;
+    heightMap[i] = ht[i].y * sign_correction;
 }

 // generate slope by partial differences in spatial domain
-__global__ void calculateSlopeKernel(float *h, float2 *slopeOut,
-                                     unsigned int width, unsigned int height) {
-  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
-  unsigned int i = y * width + x;
+__global__ void calculateSlopeKernel(float *h, float2 *slopeOut, unsigned int width, unsigned int height)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int i = y * width + x;

-  float2 slope = make_float2(0.0f, 0.0f);
+    float2 slope = make_float2(0.0f, 0.0f);

-  if ((x > 0) && (y > 0) && (x < width - 1) && (y < height - 1)) {
-    slope.x = h[i + 1] - h[i - 1];
-    slope.y = h[i + width] - h[i - width];
-  }
+    if ((x > 0) && (y > 0) && (x < width - 1) && (y < height - 1)) {
+        slope.x = h[i + 1] - h[i - 1];
+        slope.y = h[i + width] - h[i - width];
+    }

-  slopeOut[i] = slope;
+    slopeOut[i] = slope;
 }

 // wrapper functions
-extern "C" void cudaGenerateSpectrumKernel(float2 *d_h0, float2 *d_ht,
+extern "C" void cudaGenerateSpectrumKernel(float2 *d_h0,
+                                           float2 *d_ht,
                                            unsigned int in_width,
                                            unsigned int out_width,
                                            unsigned int out_height,
-                                           float animTime, float patchSize) {
-  dim3 block(8, 8, 1);
-  dim3 grid(cuda_iDivUp(out_width, block.x), cuda_iDivUp(out_height, block.y),
-            1);
-  generateSpectrumKernel<<<grid, block>>>(d_h0, d_ht, in_width, out_width,
-                                          out_height, animTime, patchSize);
+                                           float animTime,
+                                           float patchSize)
+{
+    dim3 block(8, 8, 1);
+    dim3 grid(cuda_iDivUp(out_width, block.x), cuda_iDivUp(out_height, block.y), 1);
+    generateSpectrumKernel<<<grid, block>>>(d_h0, d_ht, in_width, out_width, out_height, animTime, patchSize);
 }

-extern "C" void cudaUpdateHeightmapKernel(float *d_heightMap, float2 *d_ht,
-                                          unsigned int width,
-                                          unsigned int height, bool autoTest) {
-  dim3 block(8, 8, 1);
-  dim3 grid(cuda_iDivUp(width, block.x), cuda_iDivUp(height, block.y), 1);
-  if (autoTest) {
-    updateHeightmapKernel_y<<<grid, block>>>(d_heightMap, d_ht, width);
-  } else {
-    updateHeightmapKernel<<<grid, block>>>(d_heightMap, d_ht, width);
-  }
+extern "C" void
+cudaUpdateHeightmapKernel(float *d_heightMap, float2 *d_ht, unsigned int width, unsigned int height, bool autoTest)
+{
+    dim3 block(8, 8, 1);
+    dim3 grid(cuda_iDivUp(width, block.x), cuda_iDivUp(height, block.y), 1);
+    if (autoTest) {
+        updateHeightmapKernel_y<<<grid, block>>>(d_heightMap, d_ht, width);
+    }
+    else {
+        updateHeightmapKernel<<<grid, block>>>(d_heightMap, d_ht, width);
+    }
 }
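// Aside on the launch configuration used by these wrappers: cuda_iDivUp is a
// ceiling division, so the grid always covers the full image even when the
// dimensions are not multiples of the 8x8 block. For example, width = 100 and
// block.x = 8 give grid.x = cuda_iDivUp(100, 8) = 13, i.e. 104 threads per
// row; kernels with a bounds check (such as generateSpectrumKernel above)
// simply skip the 4-thread overhang. A minimal sketch of the same pattern,
// with an illustrative kernel that is not part of this sample:
//
//   __global__ void clearImage(float *data, unsigned int w, unsigned int h)
//   {
//       unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
//       unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//       if (x < w && y < h) { // discard the overhang threads
//           data[y * w + x] = 0.0f;
//       }
//   }
//
//   dim3 block(8, 8, 1);
//   dim3 grid(cuda_iDivUp(w, block.x), cuda_iDivUp(h, block.y), 1);
//   clearImage<<<grid, block>>>(d_data, w, h);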
-extern "C" void cudaCalculateSlopeKernel(float *hptr, float2 *slopeOut,
-                                         unsigned int width,
-                                         unsigned int height) {
-  dim3 block(8, 8, 1);
-  dim3 grid2(cuda_iDivUp(width, block.x), cuda_iDivUp(height, block.y), 1);
-  calculateSlopeKernel<<<grid2, block>>>(hptr, slopeOut, width, height);
+extern "C" void cudaCalculateSlopeKernel(float *hptr, float2 *slopeOut, unsigned int width, unsigned int height)
+{
+    dim3 block(8, 8, 1);
+    dim3 grid2(cuda_iDivUp(width, block.x), cuda_iDivUp(height, block.y), 1);
+    calculateSlopeKernel<<<grid2, block>>>(hptr, slopeOut, width, height);
}
diff --git a/Samples/4_CUDA_Libraries/randomFog/README.md b/Samples/4_CUDA_Libraries/randomFog/README.md
index 9291a64d..54fdb710 100644
--- a/Samples/4_CUDA_Libraries/randomFog/README.md
+++ b/Samples/4_CUDA_Libraries/randomFog/README.md
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)
-
diff --git a/Samples/4_CUDA_Libraries/randomFog/randomFog.cpp b/Samples/4_CUDA_Libraries/randomFog/randomFog.cpp
index f89576fa..b4ce1edd 100644
--- a/Samples/4_CUDA_Libraries/randomFog/randomFog.cpp
+++ b/Samples/4_CUDA_Libraries/randomFog/randomFog.cpp
@@ -35,18 +35,18 @@
#endif

// CUDA Library Headers
-#include
#include
+#include

// CUDA utilities and system includes
#include
#include

// System includes
-#include
-#include
#include
#include
+#include
+#include

// Includes
#include "rng.h"
@@ -64,667 +64,689 @@ RNG *g_pRng = NULL;
CheckRender *g_pCheckRender = NULL;

// Simple struct which contains the position and color of a vertex
-struct SVertex {
-  GLfloat x, y, z;
-  GLfloat r, g, b;
+struct SVertex
+{
+    GLfloat x, y, z;
+    GLfloat r, g, b;
};

// Data for the vertices
SVertex *g_pVertices = NULL;
-int g_nVertices; // Size of the vertex array
-int g_nVerticesPopulated; // Number currently populated
+int g_nVertices;          // Size of the vertex array
+int g_nVerticesPopulated; // Number currently populated

// Control the randomness
-int nSkip1 = 0; // Number of samples to discard between x,y
-int nSkip2 = 0; // Number of samples to discard between y,z
-int nSkip3 = 0; // Number of samples to discard between z,x
+int nSkip1 = 0; // Number of samples to discard between x,y
+int nSkip2 = 0; // Number of samples to discard between y,z
+int nSkip3 = 0; // Number of samples to discard between z,x

// Control the display
enum Shape_t { Sphere, SphericalShell, Cube, Plane };
Shape_t g_currentShape = Sphere;
-bool g_bShowAxes = true;
-bool g_bTenXZoom = false;
-bool g_bAutoRotate = true;
-int g_lastShapeX = 1024;
-int g_lastShapeY = 1024;
-float g_xRotated = 0.0f;
-float g_yRotated = 0.0f;
+bool  g_bShowAxes   = true;
+bool  g_bTenXZoom   = false;
+bool  g_bAutoRotate = true;
+int   g_lastShapeX  = 1024;
+int   g_lastShapeY  = 1024;
+float g_xRotated    = 0.0f;
+float g_yRotated    = 0.0f;

const float PI = 3.14159265359f;

-void createCube(void) {
-  int startVertex = 0;
+void createCube(void)
+{
+    int startVertex = 0;

-  for (int i = startVertex; i < g_nVerticesPopulated; i++) {
-    g_pVertices[i].x = (g_pRng->getNextU01() - .5f) * 2;
+    for (int i = startVertex; i < g_nVerticesPopulated; i++) {
+        g_pVertices[i].x = (g_pRng->getNextU01() - .5f) * 2;

-    for (int j = 0; j < nSkip1; j++) {
-      g_pRng->getNextU01();
+        for (int j = 0; j < nSkip1; j++) {
+            g_pRng->getNextU01();
+        }
+
+        g_pVertices[i].y = (g_pRng->getNextU01() - .5f) * 2;
+
+        for (int j = 0; j < nSkip2; j++) {
+            g_pRng->getNextU01();
+        }
+
+        g_pVertices[i].z = (g_pRng->getNextU01() - .5f) * 2;
+
+        for (int j = 0; j < nSkip3; j++) {
+            g_pRng->getNextU01();
+        }
+
+        g_pVertices[i].r = 1.0f;
+        g_pVertices[i].g = 1.0f;
+        
g_pVertices[i].b = 1.0f; } - - g_pVertices[i].y = (g_pRng->getNextU01() - .5f) * 2; - - for (int j = 0; j < nSkip2; j++) { - g_pRng->getNextU01(); - } - - g_pVertices[i].z = (g_pRng->getNextU01() - .5f) * 2; - - for (int j = 0; j < nSkip3; j++) { - g_pRng->getNextU01(); - } - - g_pVertices[i].r = 1.0f; - g_pVertices[i].g = 1.0f; - g_pVertices[i].b = 1.0f; - } } -void createPlane(void) { - int startVertex = 0; +void createPlane(void) +{ + int startVertex = 0; - for (int i = startVertex; i < g_nVerticesPopulated; i++) { - g_pVertices[i].x = (g_pRng->getNextU01() - .5f) * 2; + for (int i = startVertex; i < g_nVerticesPopulated; i++) { + g_pVertices[i].x = (g_pRng->getNextU01() - .5f) * 2; - for (int j = 0; j < nSkip1; j++) { - g_pRng->getNextU01(); + for (int j = 0; j < nSkip1; j++) { + g_pRng->getNextU01(); + } + + g_pVertices[i].y = (g_pRng->getNextU01() - .5f) * 2; + + for (int j = 0; j < nSkip2; j++) { + g_pRng->getNextU01(); + } + + g_pVertices[i].z = 0.0f; + + g_pVertices[i].r = 1.0f; + g_pVertices[i].g = 1.0f; + g_pVertices[i].b = 1.0f; + } +} + +void createSphere(void) +{ + int startVertex = 0; + + for (int i = startVertex; i < g_nVerticesPopulated; i++) { + float r; + float rho; + float theta; + + if (g_currentShape == Sphere) { + r = g_pRng->getNextU01(); + r = powf(r, 1.f / 3.f); + + for (int j = 0; j < nSkip3; j++) { + g_pRng->getNextU01(); + } + } + else { + r = 1.0f; + } + + rho = g_pRng->getNextU01() * PI * 2.0f; + + for (int j = 0; j < nSkip1; j++) { + g_pRng->getNextU01(); + } + + theta = (g_pRng->getNextU01() * 2.0f) - 1.0f; + theta = asin(theta); + + for (int j = 0; j < nSkip2; j++) { + g_pRng->getNextU01(); + } + + g_pVertices[i].x = r * fabs(cos(theta)) * cos(rho); + g_pVertices[i].y = r * fabs(cos(theta)) * sin(rho); + g_pVertices[i].z = r * sin(theta); + + g_pVertices[i].r = 1.0f; + g_pVertices[i].g = 1.0f; + g_pVertices[i].b = 1.0f; + } +} + +void createAxes(void) +{ + // z axis: + g_pVertices[200000].x = 0.0f; + g_pVertices[200000].y = 0.0f; + g_pVertices[200000].z = -1.5f; + g_pVertices[200001].x = 0.0f; + g_pVertices[200001].y = 0.0f; + g_pVertices[200001].z = 1.5f; + g_pVertices[200000].r = 1.0f; + g_pVertices[200000].g = 0.0f; + g_pVertices[200000].b = 0.0f; + g_pVertices[200001].r = 0.0f; + g_pVertices[200001].g = 1.0f; + g_pVertices[200001].b = 1.0f; + // y axis: + g_pVertices[200002].x = 0.0f; + g_pVertices[200002].y = -1.5f; + g_pVertices[200002].z = 0.0f; + g_pVertices[200003].x = 0.0f; + g_pVertices[200003].y = 1.5f; + g_pVertices[200003].z = 0.0f; + g_pVertices[200002].r = 0.0f; + g_pVertices[200002].g = 1.0f; + g_pVertices[200002].b = 0.0f; + g_pVertices[200003].r = 1.0f; + g_pVertices[200003].g = 0.0f; + g_pVertices[200003].b = 1.0f; + // x axis: + g_pVertices[200004].x = -1.5f; + g_pVertices[200004].y = 0.0f; + g_pVertices[200004].z = 0.0f; + g_pVertices[200005].x = 1.5f; + g_pVertices[200005].y = 0.0f; + g_pVertices[200005].z = 0.0f; + g_pVertices[200004].r = 0.0f; + g_pVertices[200004].g = 0.0f; + g_pVertices[200004].b = 1.0f; + g_pVertices[200005].r = 1.0f; + g_pVertices[200005].g = 1.0f; + g_pVertices[200005].b = 0.0f; +} + +void drawPoints(void) +{ + if (g_bShowAxes) { + glDrawArrays(GL_LINE_STRIP, 200000, 2); + glDrawArrays(GL_LINE_STRIP, 200002, 2); + glDrawArrays(GL_LINE_STRIP, 200004, 2); } - g_pVertices[i].y = (g_pRng->getNextU01() - .5f) * 2; + glDrawArrays(GL_POINTS, 0, g_nVerticesPopulated); +} - for (int j = 0; j < nSkip2; j++) { - g_pRng->getNextU01(); +void drawText(void) +{ + using std::string; + using std::stringstream; + + 
glPushMatrix(); + glLoadIdentity(); + glRasterPos2f(-1.2f, 1.2f); + + string infoString; + stringstream ss; + g_pRng->getInfoString(infoString); + ss << " skip1=" << nSkip1; + ss << " skip2=" << nSkip2; + ss << " skip3=" << nSkip3; + ss << " points=" << g_nVerticesPopulated; + infoString.append(ss.str()); + + for (unsigned int i = 0; i < infoString.size(); i++) { + glutBitmapCharacter(GLUT_BITMAP_HELVETICA_12, infoString[i]); } - g_pVertices[i].z = 0.0f; - - g_pVertices[i].r = 1.0f; - g_pVertices[i].g = 1.0f; - g_pVertices[i].b = 1.0f; - } + glPopMatrix(); } -void createSphere(void) { - int startVertex = 0; +void reshape(int x, int y) +{ + float xScale; + float yScale; - for (int i = startVertex; i < g_nVerticesPopulated; i++) { - float r; - float rho; - float theta; + g_lastShapeX = x; + g_lastShapeY = y; - if (g_currentShape == Sphere) { - r = g_pRng->getNextU01(); - r = powf(r, 1.f / 3.f); - - for (int j = 0; j < nSkip3; j++) { - g_pRng->getNextU01(); - } - } else { - r = 1.0f; + // Check if shape is visible + if (x == 0 || y == 0) { + return; } - rho = g_pRng->getNextU01() * PI * 2.0f; + // Set a new projection matrix + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); - for (int j = 0; j < nSkip1; j++) { - g_pRng->getNextU01(); + // Adjust fit + if (y > x) { + xScale = 1.0f; + yScale = (float)y / x; + } + else { + xScale = (float)x / y; + yScale = 1.0f; } - theta = (g_pRng->getNextU01() * 2.0f) - 1.0f; - theta = asin(theta); - - for (int j = 0; j < nSkip2; j++) { - g_pRng->getNextU01(); + // Angle of view:40 degrees + // Near clipping plane distance: 10.0 (default) + // Far clipping plane distance: 10.0 (default) + if (g_bTenXZoom) { + glOrtho(-.15f * xScale, .15f * xScale, -.15f * yScale, .15f * yScale, -5.0f, 5.0f); + } + else { + glOrtho(-1.5f * xScale, 1.5f * xScale, -1.5f * yScale, 1.5f * yScale, -10.0f, 10.0f); } - g_pVertices[i].x = r * fabs(cos(theta)) * cos(rho); - g_pVertices[i].y = r * fabs(cos(theta)) * sin(rho); - g_pVertices[i].z = r * sin(theta); - - g_pVertices[i].r = 1.0f; - g_pVertices[i].g = 1.0f; - g_pVertices[i].b = 1.0f; - } + // Use the whole window for rendering + glViewport(0, 0, x, y); + glMatrixMode(GL_MODELVIEW); } -void createAxes(void) { - // z axis: - g_pVertices[200000].x = 0.0f; - g_pVertices[200000].y = 0.0f; - g_pVertices[200000].z = -1.5f; - g_pVertices[200001].x = 0.0f; - g_pVertices[200001].y = 0.0f; - g_pVertices[200001].z = 1.5f; - g_pVertices[200000].r = 1.0f; - g_pVertices[200000].g = 0.0f; - g_pVertices[200000].b = 0.0f; - g_pVertices[200001].r = 0.0f; - g_pVertices[200001].g = 1.0f; - g_pVertices[200001].b = 1.0f; - // y axis: - g_pVertices[200002].x = 0.0f; - g_pVertices[200002].y = -1.5f; - g_pVertices[200002].z = 0.0f; - g_pVertices[200003].x = 0.0f; - g_pVertices[200003].y = 1.5f; - g_pVertices[200003].z = 0.0f; - g_pVertices[200002].r = 0.0f; - g_pVertices[200002].g = 1.0f; - g_pVertices[200002].b = 0.0f; - g_pVertices[200003].r = 1.0f; - g_pVertices[200003].g = 0.0f; - g_pVertices[200003].b = 1.0f; - // x axis: - g_pVertices[200004].x = -1.5f; - g_pVertices[200004].y = 0.0f; - g_pVertices[200004].z = 0.0f; - g_pVertices[200005].x = 1.5f; - g_pVertices[200005].y = 0.0f; - g_pVertices[200005].z = 0.0f; - g_pVertices[200004].r = 0.0f; - g_pVertices[200004].g = 0.0f; - g_pVertices[200004].b = 1.0f; - g_pVertices[200005].r = 1.0f; - g_pVertices[200005].g = 1.0f; - g_pVertices[200005].b = 0.0f; +void display(void) +{ + glClear(GL_COLOR_BUFFER_BIT); + glLoadIdentity(); + glTranslatef(0.0f, 0.0f, -4.0f); + glRotatef(g_yRotated, 0.0f, 1.0f, 
0.0f); + glRotatef(g_xRotated, 1.0f, 0.0f, 0.0f); + drawPoints(); + drawText(); + glFlush(); + glutSwapBuffers(); } -void drawPoints(void) { - if (g_bShowAxes) { - glDrawArrays(GL_LINE_STRIP, 200000, 2); - glDrawArrays(GL_LINE_STRIP, 200002, 2); - glDrawArrays(GL_LINE_STRIP, 200004, 2); - } +void idle(void) +{ + if (g_bAutoRotate) { + g_yRotated += 0.1f; - glDrawArrays(GL_POINTS, 0, g_nVerticesPopulated); -} + if (g_yRotated >= 360.0f) { + g_yRotated -= 360.0f; + } -void drawText(void) { - using std::string; - using std::stringstream; + g_xRotated += 0.05f; - glPushMatrix(); - glLoadIdentity(); - glRasterPos2f(-1.2f, 1.2f); + if (g_xRotated >= 360.0f) { + g_xRotated -= 360.0f; + } - string infoString; - stringstream ss; - g_pRng->getInfoString(infoString); - ss << " skip1=" << nSkip1; - ss << " skip2=" << nSkip2; - ss << " skip3=" << nSkip3; - ss << " points=" << g_nVerticesPopulated; - infoString.append(ss.str()); - - for (unsigned int i = 0; i < infoString.size(); i++) { - glutBitmapCharacter(GLUT_BITMAP_HELVETICA_12, infoString[i]); - } - - glPopMatrix(); -} - -void reshape(int x, int y) { - float xScale; - float yScale; - - g_lastShapeX = x; - g_lastShapeY = y; - - // Check if shape is visible - if (x == 0 || y == 0) { - return; - } - - // Set a new projection matrix - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - - // Adjust fit - if (y > x) { - xScale = 1.0f; - yScale = (float)y / x; - } else { - xScale = (float)x / y; - yScale = 1.0f; - } - - // Angle of view:40 degrees - // Near clipping plane distance: 10.0 (default) - // Far clipping plane distance: 10.0 (default) - if (g_bTenXZoom) { - glOrtho(-.15f * xScale, .15f * xScale, -.15f * yScale, .15f * yScale, -5.0f, - 5.0f); - } else { - glOrtho(-1.5f * xScale, 1.5f * xScale, -1.5f * yScale, 1.5f * yScale, - -10.0f, 10.0f); - } - - // Use the whole window for rendering - glViewport(0, 0, x, y); - glMatrixMode(GL_MODELVIEW); -} - -void display(void) { - glClear(GL_COLOR_BUFFER_BIT); - glLoadIdentity(); - glTranslatef(0.0f, 0.0f, -4.0f); - glRotatef(g_yRotated, 0.0f, 1.0f, 0.0f); - glRotatef(g_xRotated, 1.0f, 0.0f, 0.0f); - drawPoints(); - drawText(); - glFlush(); - glutSwapBuffers(); -} - -void idle(void) { - if (g_bAutoRotate) { - g_yRotated += 0.1f; - - if (g_yRotated >= 360.0f) { - g_yRotated -= 360.0f; + display(); } +} - g_xRotated += 0.05f; +void reCreate(void) +{ + switch (g_currentShape) { + case Sphere: + case SphericalShell: + createSphere(); + break; - if (g_xRotated >= 360.0f) { - g_xRotated -= 360.0f; + case Cube: + createCube(); + break; + + default: + createPlane(); } display(); - } } -void reCreate(void) { - switch (g_currentShape) { - case Sphere: - case SphericalShell: - createSphere(); - break; +void cleanup(int code) +{ + if (g_pRng) { + delete g_pRng; + g_pRng = NULL; + } - case Cube: - createCube(); - break; + if (g_pVertices) { + delete[] g_pVertices; + g_pVertices = NULL; + } - default: - createPlane(); - } + if (g_pCheckRender) { + delete g_pCheckRender; + g_pCheckRender = NULL; + } - display(); -} - -void cleanup(int code) { - if (g_pRng) { - delete g_pRng; - g_pRng = NULL; - } - - if (g_pVertices) { - delete[] g_pVertices; - g_pVertices = NULL; - } - - if (g_pCheckRender) { - delete g_pCheckRender; - g_pCheckRender = NULL; - } - - exit(code); + exit(code); } void glutClose() { cleanup(EXIT_SUCCESS); } -void keyboard(unsigned char key, int x, int y) { - switch (key) { +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { // Select shape case 's': case 'S': - g_currentShape = Sphere; - 
createSphere(); - display(); - break; + g_currentShape = Sphere; + createSphere(); + display(); + break; case 'e': case 'E': - g_currentShape = SphericalShell; - createSphere(); - display(); - break; + g_currentShape = SphericalShell; + createSphere(); + display(); + break; case 'b': case 'B': - g_currentShape = Cube; - createCube(); - display(); - break; + g_currentShape = Cube; + createCube(); + display(); + break; case 'p': case 'P': - g_currentShape = Plane; - createPlane(); - display(); - break; + g_currentShape = Plane; + createPlane(); + display(); + break; // Rotation case 'a': case 'A': - g_bAutoRotate = !g_bAutoRotate; - break; + g_bAutoRotate = !g_bAutoRotate; + break; case 'i': case 'I': - g_xRotated -= 1.0f; + g_xRotated -= 1.0f; - if (g_xRotated <= 0.0f) { - g_xRotated += 360.0f; - } + if (g_xRotated <= 0.0f) { + g_xRotated += 360.0f; + } - display(); - break; + display(); + break; case ',': - g_xRotated += 1.0f; + g_xRotated += 1.0f; - if (g_xRotated >= 360.0f) { - g_xRotated -= 360.0f; - } + if (g_xRotated >= 360.0f) { + g_xRotated -= 360.0f; + } - display(); - break; + display(); + break; case 'j': case 'J': - g_yRotated -= 1.0f; + g_yRotated -= 1.0f; - if (g_yRotated <= 0.0f) { - g_yRotated += 360.0f; - } + if (g_yRotated <= 0.0f) { + g_yRotated += 360.0f; + } - display(); - break; + display(); + break; case 'l': case 'L': - g_yRotated += 1.0f; + g_yRotated += 1.0f; - if (g_yRotated >= 360.0f) { - g_yRotated -= 360.0f; - } + if (g_yRotated >= 360.0f) { + g_yRotated -= 360.0f; + } - display(); - break; + display(); + break; // Zoom case 't': case 'T': - g_bTenXZoom = !g_bTenXZoom; - reshape(g_lastShapeX, g_lastShapeY); - reCreate(); - break; + g_bTenXZoom = !g_bTenXZoom; + reshape(g_lastShapeX, g_lastShapeY); + reCreate(); + break; // Axes case 'z': case 'Z': - g_bShowAxes = !g_bShowAxes; - reCreate(); - break; + g_bShowAxes = !g_bShowAxes; + reCreate(); + break; // RNG case 'x': case 'X': - g_pRng->selectRng(RNG::Pseudo); - reCreate(); - break; + g_pRng->selectRng(RNG::Pseudo); + reCreate(); + break; case 'c': case 'C': - g_pRng->selectRng(RNG::Quasi); - reCreate(); - break; + g_pRng->selectRng(RNG::Quasi); + reCreate(); + break; case 'v': case 'V': - g_pRng->selectRng(RNG::ScrambledQuasi); - reCreate(); - break; + g_pRng->selectRng(RNG::ScrambledQuasi); + reCreate(); + break; case 'r': case 'R': - g_pRng->resetSeed(); - reCreate(); - break; + g_pRng->resetSeed(); + reCreate(); + break; case ']': - g_pRng->incrementDimensions(); - reCreate(); - break; + g_pRng->incrementDimensions(); + reCreate(); + break; case '[': - g_pRng->resetDimensions(); - reCreate(); - break; + g_pRng->resetDimensions(); + reCreate(); + break; case '1': - nSkip1++; - reCreate(); - break; + nSkip1++; + reCreate(); + break; case '2': - nSkip2++; - reCreate(); - break; + nSkip2++; + reCreate(); + break; case '3': - nSkip3++; - reCreate(); - break; + nSkip3++; + reCreate(); + break; case '!': - nSkip1 = 0; - nSkip2 = 0; - nSkip3 = 0; - reCreate(); - break; + nSkip1 = 0; + nSkip2 = 0; + nSkip3 = 0; + reCreate(); + break; // Number of vertices case '+': - g_nVerticesPopulated += 8000; + g_nVerticesPopulated += 8000; - if (g_nVerticesPopulated > g_nVertices) { - g_nVerticesPopulated = g_nVertices; - } + if (g_nVerticesPopulated > g_nVertices) { + g_nVerticesPopulated = g_nVertices; + } - reCreate(); - break; + reCreate(); + break; case '-': - g_nVerticesPopulated -= 8000; + g_nVerticesPopulated -= 8000; - if (g_nVerticesPopulated < 8000) { - g_nVerticesPopulated = 8000; - } + if (g_nVerticesPopulated < 
8000) { + g_nVerticesPopulated = 8000; + } - reCreate(); - break; + reCreate(); + break; // Quit case 27: case 'q': case 'Q': #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - } -} - -void showHelp(void) { - using std::left; - using std::setw; - using std::stringstream; - - stringstream ss; - - ss << "\nRandom number visualization\n\n"; - ss << "On creation, randomFog generates 200,000 random coordinates in " - "spherical coordinate space (radius, angle rho, angle theta) with " - "curand's XORWOW algorithm. The coordinates are normalized for a " - "uniform distribution through the sphere.\n\n"; - ss << "The X axis is drawn with blue in the negative direction and yellow " - "positive.\n" - << "The Y axis is drawn with green in the negative direction and magenta " - "positive.\n" - << "The Z axis is drawn with red in the negative direction and cyan " - "positive.\n\n"; - ss << "The following keys can be used to control the output:\n\n"; - ss << left; - ss << "\t" << setw(10) << "s" - << "Generate a new set of random numbers and display as spherical " - "coordinates (Sphere)\n"; - ss << "\t" << setw(10) << "e" - << "Generate a new set of random numbers and display on a spherical " - "surface (shEll)\n"; - ss << "\t" << setw(10) << "b" - << "Generate a new set of random numbers and display as cartesian " - "coordinates (cuBe/Box)\n"; - ss << "\t" << setw(10) << "p" - << "Generate a new set of random numbers and display on a cartesian plane " - "(Plane)\n\n"; - ss << "\t" << setw(10) << "i,l,j" - << "Rotate the negative Z-axis up, right, down and left respectively\n"; - ss << "\t" << setw(10) << "a" - << "Toggle auto-rotation\n"; - ss << "\t" << setw(10) << "t" - << "Toggle 10x zoom\n"; - ss << "\t" << setw(10) << "z" - << "Toggle axes display\n\n"; - ss << "\t" << setw(10) << "x" - << "Select XORWOW generator (default)\n"; - ss << "\t" << setw(10) << "c" - << "Select Sobol' generator\n"; - ss << "\t" << setw(10) << "v" - << "Select scrambled Sobol' generator\n"; - ss << "\t" << setw(10) << "r" - << "Reset XORWOW (i.e. reset to initial seed) and regenerate\n"; - ss << "\t" << setw(10) << "]" - << "Increment the number of Sobol' dimensions and regenerate\n"; - ss << "\t" << setw(10) << "[" - << "Reset the number of Sobol' dimensions to 1 and regenerate\n\n"; - ss << "\t" << setw(10) << "+" - << "Increment the number of displayed points by 8,000 (up to maximum " - "200,000)\n"; - ss << "\t" << setw(10) << "-" - << "Decrement the number of displayed points by 8,000 (down to minimum " - "8,000)\n\n"; - ss << "\t" << setw(10) << "q/[ESC]" - << "Quit the application.\n\n"; - puts(ss.str().c_str()); -} - -int main(int argc, char **argv) { - using std::runtime_error; - - try { - bool bQA = false; - - // Open the log file - printf("Random Fog\n"); - printf("==========\n\n"); - - // Check QA mode - if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { - bQA = true; - - findCudaDevice(argc, (const char **)argv); - - g_pCheckRender = - new CheckBackBuffer(g_lastShapeX, g_lastShapeY, 4, false); - } else { -#if defined(__linux__) - setenv("DISPLAY", ":0", 0); -#endif - // Initialize GL - glutInit(&argc, argv); - glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB); - // TODO use width/height? 
- glutInitWindowSize(1000, 1000); - // Create a window with rendering context and everything else we need - glutCreateWindow("Random Fog"); - - if (!isGLVersionSupported(2, 0)) { - fprintf(stderr, "This sample requires at least OpenGL 2.0\n"); - exit(EXIT_WAIVED); - } - - // Select CUDA device with OpenGL interoperability - findCudaDevice(argc, (const char **)argv); } +} - // Create vertices - g_nVertices = 200000; - g_nVerticesPopulated = 200000; - g_pVertices = new SVertex[g_nVertices + 6]; +void showHelp(void) +{ + using std::left; + using std::setw; + using std::stringstream; - // Setup the random number generators - g_pRng = new RNG(12345, 1, 100000); - printf("CURAND initialized\n"); + stringstream ss; - // Compute the initial vertices and indices, starting in spherical mode - createSphere(); - createAxes(); + ss << "\nRandom number visualization\n\n"; + ss << "On creation, randomFog generates 200,000 random coordinates in " + "spherical coordinate space (radius, angle rho, angle theta) with " + "curand's XORWOW algorithm. The coordinates are normalized for a " + "uniform distribution through the sphere.\n\n"; + ss << "The X axis is drawn with blue in the negative direction and yellow " + "positive.\n" + << "The Y axis is drawn with green in the negative direction and magenta " + "positive.\n" + << "The Z axis is drawn with red in the negative direction and cyan " + "positive.\n\n"; + ss << "The following keys can be used to control the output:\n\n"; + ss << left; + ss << "\t" << setw(10) << "s" + << "Generate a new set of random numbers and display as spherical " + "coordinates (Sphere)\n"; + ss << "\t" << setw(10) << "e" + << "Generate a new set of random numbers and display on a spherical " + "surface (shEll)\n"; + ss << "\t" << setw(10) << "b" + << "Generate a new set of random numbers and display as cartesian " + "coordinates (cuBe/Box)\n"; + ss << "\t" << setw(10) << "p" + << "Generate a new set of random numbers and display on a cartesian plane " + "(Plane)\n\n"; + ss << "\t" << setw(10) << "i,l,j" + << "Rotate the negative Z-axis up, right, down and left respectively\n"; + ss << "\t" << setw(10) << "a" + << "Toggle auto-rotation\n"; + ss << "\t" << setw(10) << "t" + << "Toggle 10x zoom\n"; + ss << "\t" << setw(10) << "z" + << "Toggle axes display\n\n"; + ss << "\t" << setw(10) << "x" + << "Select XORWOW generator (default)\n"; + ss << "\t" << setw(10) << "c" + << "Select Sobol' generator\n"; + ss << "\t" << setw(10) << "v" + << "Select scrambled Sobol' generator\n"; + ss << "\t" << setw(10) << "r" + << "Reset XORWOW (i.e. 
reset to initial seed) and regenerate\n"; + ss << "\t" << setw(10) << "]" + << "Increment the number of Sobol' dimensions and regenerate\n"; + ss << "\t" << setw(10) << "[" + << "Reset the number of Sobol' dimensions to 1 and regenerate\n\n"; + ss << "\t" << setw(10) << "+" + << "Increment the number of displayed points by 8,000 (up to maximum " + "200,000)\n"; + ss << "\t" << setw(10) << "-" + << "Decrement the number of displayed points by 8,000 (down to minimum " + "8,000)\n\n"; + ss << "\t" << setw(10) << "q/[ESC]" + << "Quit the application.\n\n"; + puts(ss.str().c_str()); +} - showHelp(); +int main(int argc, char **argv) +{ + using std::runtime_error; - if (bQA) { - g_pCheckRender->setExecPath(argv[0]); - g_pCheckRender->dumpBin( - g_pVertices, g_nVerticesPopulated * sizeof(SVertex), "randomFog.bin"); + try { + bool bQA = false; - if (g_pCheckRender->compareBin2BinFloat( - "randomFog.bin", "ref_randomFog.bin", - g_nVerticesPopulated * sizeof(SVertex) / sizeof(float), 0.25f, - 0.35f)) { - cleanup(EXIT_SUCCESS); - } else { - cleanup(EXIT_FAILURE); - } - } else { - // As we do not yet use a depth buffer, we cannot fill our sphere - glPolygonMode(GL_FRONT_AND_BACK, GL_LINE); - // Enable the vertex array functionality: - glEnableClientState(GL_VERTEX_ARRAY); - // Enable the color array functionality (so we can specify a color for - // each vertex) - glEnableClientState(GL_COLOR_ARRAY); - // Pass the vertex pointer: - glVertexPointer(3, // 3 components per vertex (x,y,z) - GL_FLOAT, sizeof(SVertex), g_pVertices); - // Pass the color pointer - glColorPointer(3, // 3 components per vertex (r,g,b) - GL_FLOAT, sizeof(SVertex), - &g_pVertices[0].r); // Pointer to the first color - // Point size for point mode - glPointSize(1.0f); - glLineWidth(2.0f); - glClearColor(0.0f, 0.0f, 0.0f, 0.0f); - // Notify glut which messages we require: - glutDisplayFunc(display); - glutReshapeFunc(reshape); - glutKeyboardFunc(keyboard); - glutIdleFunc(idle); + // Open the log file + printf("Random Fog\n"); + printf("==========\n\n"); + + // Check QA mode + if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + bQA = true; + + findCudaDevice(argc, (const char **)argv); + + g_pCheckRender = new CheckBackBuffer(g_lastShapeX, g_lastShapeY, 4, false); + } + else { +#if defined(__linux__) + setenv("DISPLAY", ":0", 0); +#endif + // Initialize GL + glutInit(&argc, argv); + glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB); + // TODO use width/height? 
+ glutInitWindowSize(1000, 1000); + // Create a window with rendering context and everything else we need + glutCreateWindow("Random Fog"); + + if (!isGLVersionSupported(2, 0)) { + fprintf(stderr, "This sample requires at least OpenGL 2.0\n"); + exit(EXIT_WAIVED); + } + + // Select CUDA device with OpenGL interoperability + findCudaDevice(argc, (const char **)argv); + } + + // Create vertices + g_nVertices = 200000; + g_nVerticesPopulated = 200000; + g_pVertices = new SVertex[g_nVertices + 6]; + + // Setup the random number generators + g_pRng = new RNG(12345, 1, 100000); + printf("CURAND initialized\n"); + + // Compute the initial vertices and indices, starting in spherical mode + createSphere(); + createAxes(); + + showHelp(); + + if (bQA) { + g_pCheckRender->setExecPath(argv[0]); + g_pCheckRender->dumpBin(g_pVertices, g_nVerticesPopulated * sizeof(SVertex), "randomFog.bin"); + + if (g_pCheckRender->compareBin2BinFloat("randomFog.bin", + "ref_randomFog.bin", + g_nVerticesPopulated * sizeof(SVertex) / sizeof(float), + 0.25f, + 0.35f)) { + cleanup(EXIT_SUCCESS); + } + else { + cleanup(EXIT_FAILURE); + } + } + else { + // As we do not yet use a depth buffer, we cannot fill our sphere + glPolygonMode(GL_FRONT_AND_BACK, GL_LINE); + // Enable the vertex array functionality: + glEnableClientState(GL_VERTEX_ARRAY); + // Enable the color array functionality (so we can specify a color for + // each vertex) + glEnableClientState(GL_COLOR_ARRAY); + // Pass the vertex pointer: + glVertexPointer(3, // 3 components per vertex (x,y,z) + GL_FLOAT, + sizeof(SVertex), + g_pVertices); + // Pass the color pointer + glColorPointer(3, // 3 components per vertex (r,g,b) + GL_FLOAT, + sizeof(SVertex), + &g_pVertices[0].r); // Pointer to the first color + // Point size for point mode + glPointSize(1.0f); + glLineWidth(2.0f); + glClearColor(0.0f, 0.0f, 0.0f, 0.0f); + // Notify glut which messages we require: + glutDisplayFunc(display); + glutReshapeFunc(reshape); + glutKeyboardFunc(keyboard); + glutIdleFunc(idle); #if defined(__APPLE__) || defined(MACOSX) - atexit(glutClose); + atexit(glutClose); #else - glutCloseFunc(glutClose); + glutCloseFunc(glutClose); #endif - // Let's get started! - glutMainLoop(); + // Let's get started! 
+ glutMainLoop(); + } + } + catch (runtime_error &e) { + printf("runtime error (%s)\n", e.what()); } - } catch (runtime_error &e) { - printf("runtime error (%s)\n", e.what()); - } - exit(EXIT_SUCCESS); -} \ No newline at end of file + exit(EXIT_SUCCESS); +} diff --git a/Samples/4_CUDA_Libraries/randomFog/rng.cpp b/Samples/4_CUDA_Libraries/randomFog/rng.cpp index be5c6067..c5aea320 100644 --- a/Samples/4_CUDA_Libraries/randomFog/rng.cpp +++ b/Samples/4_CUDA_Libraries/randomFog/rng.cpp @@ -28,282 +28,287 @@ // Utilities and System includes // Includes -#include -#include -#include #include "rng.h" +#include +#include +#include + // Shared Library Test Functions -#include #include +#include const unsigned int RNG::s_maxQrngDimensions = 20000; -RNG::RNG(unsigned long prngSeed, unsigned int qrngDimensions, - unsigned int nSamples) - : m_prngSeed(prngSeed), - m_qrngDimensions(qrngDimensions), - m_nSamplesBatchTarget(nSamples), - m_nSamplesRemaining(0) { - using std::invalid_argument; - using std::runtime_error; - using std::string; +RNG::RNG(unsigned long prngSeed, unsigned int qrngDimensions, unsigned int nSamples) + : m_prngSeed(prngSeed) + , m_qrngDimensions(qrngDimensions) + , m_nSamplesBatchTarget(nSamples) + , m_nSamplesRemaining(0) +{ + using std::invalid_argument; + using std::runtime_error; + using std::string; - if (m_prngSeed == 0) { - throw invalid_argument("PRNG seed must be non-zero"); - } + if (m_prngSeed == 0) { + throw invalid_argument("PRNG seed must be non-zero"); + } - if (m_qrngDimensions == 0) { - throw invalid_argument("QRNG dimensions must be non-zero"); - } + if (m_qrngDimensions == 0) { + throw invalid_argument("QRNG dimensions must be non-zero"); + } - if (m_nSamplesBatchTarget == 0) { - throw invalid_argument("RNG batch size must be non-zero"); - } + if (m_nSamplesBatchTarget == 0) { + throw invalid_argument("RNG batch size must be non-zero"); + } - if (m_nSamplesBatchTarget < s_maxQrngDimensions) { - throw invalid_argument( - "RNG batch size must be greater than RNG::s_maxQrngDimensions"); - } + if (m_nSamplesBatchTarget < s_maxQrngDimensions) { + throw invalid_argument("RNG batch size must be greater than RNG::s_maxQrngDimensions"); + } - curandStatus_t curandResult; - cudaError_t cudaResult; + curandStatus_t curandResult; + cudaError_t cudaResult; - // Allocate sample array in host mem - m_h_samples = (float *)malloc(m_nSamplesBatchTarget * sizeof(float)); + // Allocate sample array in host mem + m_h_samples = (float *)malloc(m_nSamplesBatchTarget * sizeof(float)); - if (m_h_samples == NULL) { - throw runtime_error("Could not allocate host memory for RNG::m_h_samples"); - } + if (m_h_samples == NULL) { + throw runtime_error("Could not allocate host memory for RNG::m_h_samples"); + } - // Allocate sample array in device mem - cudaResult = - cudaMalloc((void **)&m_d_samples, m_nSamplesBatchTarget * sizeof(float)); + // Allocate sample array in device mem + cudaResult = cudaMalloc((void **)&m_d_samples, m_nSamplesBatchTarget * sizeof(float)); - if (cudaResult != cudaSuccess) { - string msg("Could not allocate device memory for RNG::m_d_samples: "); - msg += cudaGetErrorString(cudaResult); - throw runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not allocate device memory for RNG::m_d_samples: "); + msg += cudaGetErrorString(cudaResult); + throw runtime_error(msg); + } - // Create the Random Number Generators - curandResult = curandCreateGenerator(&m_prng, CURAND_RNG_PSEUDO_XORWOW); + // Create the Random Number Generators + curandResult = 
curandCreateGenerator(&m_prng, CURAND_RNG_PSEUDO_XORWOW); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not create pseudo-random number generator: "); - msg += curandResult; - throw runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not create pseudo-random number generator: "); + msg += curandResult; + throw runtime_error(msg); + } - curandResult = curandCreateGenerator(&m_qrng, CURAND_RNG_QUASI_SOBOL32); + curandResult = curandCreateGenerator(&m_qrng, CURAND_RNG_QUASI_SOBOL32); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not create quasi-random number generator: "); - msg += curandResult; - throw runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not create quasi-random number generator: "); + msg += curandResult; + throw runtime_error(msg); + } - curandResult = - curandCreateGenerator(&m_sqrng, CURAND_RNG_QUASI_SCRAMBLED_SOBOL32); + curandResult = curandCreateGenerator(&m_sqrng, CURAND_RNG_QUASI_SCRAMBLED_SOBOL32); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not create scrambled quasi-random number generator: "); - msg += curandResult; - throw runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not create scrambled quasi-random number generator: "); + msg += curandResult; + throw runtime_error(msg); + } - // Setup initial parameters - resetSeed(); - updateDimensions(); - setBatchSize(); + // Setup initial parameters + resetSeed(); + updateDimensions(); + setBatchSize(); - // Set default RNG to be pseudo-random (XORWOW) - m_pCurrent = &m_prng; + // Set default RNG to be pseudo-random (XORWOW) + m_pCurrent = &m_prng; } -RNG::~RNG() { - curandDestroyGenerator(m_prng); - curandDestroyGenerator(m_qrng); - curandDestroyGenerator(m_sqrng); +RNG::~RNG() +{ + curandDestroyGenerator(m_prng); + curandDestroyGenerator(m_qrng); + curandDestroyGenerator(m_sqrng); - if (m_d_samples) { - cudaFree(m_d_samples); - } + if (m_d_samples) { + cudaFree(m_d_samples); + } - if (m_h_samples) { - free(m_h_samples); - } + if (m_h_samples) { + free(m_h_samples); + } } -void RNG::generateBatch(void) { - using std::runtime_error; - using std::string; +void RNG::generateBatch(void) +{ + using std::runtime_error; + using std::string; - cudaError_t cudaResult; - curandStatus_t curandResult; + cudaError_t cudaResult; + curandStatus_t curandResult; - // Generate random numbers - curandResult = - curandGenerateUniform(*m_pCurrent, m_d_samples, m_nSamplesBatchActual); + // Generate random numbers + curandResult = curandGenerateUniform(*m_pCurrent, m_d_samples, m_nSamplesBatchActual); - if (curandResult != CURAND_STATUS_SUCCESS) { - string msg("Could not generate random numbers: "); - msg += curandResult; - throw runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + string msg("Could not generate random numbers: "); + msg += curandResult; + throw runtime_error(msg); + } - // Copy random numbers to host - cudaResult = - cudaMemcpy(m_h_samples, m_d_samples, - m_nSamplesBatchActual * sizeof(float), cudaMemcpyDeviceToHost); + // Copy random numbers to host + cudaResult = cudaMemcpy(m_h_samples, m_d_samples, m_nSamplesBatchActual * sizeof(float), cudaMemcpyDeviceToHost); - if (cudaResult != cudaSuccess) { - string msg("Could not copy random numbers to host: "); - msg += cudaGetErrorString(cudaResult); - throw runtime_error(msg); - } + if (cudaResult != cudaSuccess) { + string msg("Could not copy random numbers to host: "); + msg 
+= cudaGetErrorString(cudaResult); + throw runtime_error(msg); + } } -float RNG::getNextU01(void) { - if (m_nSamplesRemaining == 0) { - generateBatch(); - m_nSamplesRemaining = m_nSamplesBatchActual; - } +float RNG::getNextU01(void) +{ + if (m_nSamplesRemaining == 0) { + generateBatch(); + m_nSamplesRemaining = m_nSamplesBatchActual; + } - if (m_pCurrent == &m_prng) { - return m_h_samples[m_nSamplesBatchActual - m_nSamplesRemaining--]; - } else { - unsigned int index = m_nSamplesBatchActual - m_nSamplesRemaining--; - unsigned int samplesPerDim = m_nSamplesBatchActual / m_qrngDimensions; - unsigned int dimOffset = (index % m_qrngDimensions) * samplesPerDim; - unsigned int drawOffset = index / m_qrngDimensions; - return m_h_samples[dimOffset + drawOffset]; - } + if (m_pCurrent == &m_prng) { + return m_h_samples[m_nSamplesBatchActual - m_nSamplesRemaining--]; + } + else { + unsigned int index = m_nSamplesBatchActual - m_nSamplesRemaining--; + unsigned int samplesPerDim = m_nSamplesBatchActual / m_qrngDimensions; + unsigned int dimOffset = (index % m_qrngDimensions) * samplesPerDim; + unsigned int drawOffset = index / m_qrngDimensions; + return m_h_samples[dimOffset + drawOffset]; + } } -void RNG::getInfoString(std::string &msg) { - using std::stringstream; +void RNG::getInfoString(std::string &msg) +{ + using std::stringstream; - stringstream ss; + stringstream ss; - if (m_pCurrent == &m_prng) { - ss << "XORWOW (seed=" << m_prngSeed << ")"; - } else if (m_pCurrent == &m_qrng) { - ss << "Sobol (dimensions=" << m_qrngDimensions << ")"; - } else if (m_pCurrent == &m_sqrng) { - ss << "Scrambled Sobol (dimensions=" << m_qrngDimensions << ")"; - } else { - ss << "Invalid RNG"; - } + if (m_pCurrent == &m_prng) { + ss << "XORWOW (seed=" << m_prngSeed << ")"; + } + else if (m_pCurrent == &m_qrng) { + ss << "Sobol (dimensions=" << m_qrngDimensions << ")"; + } + else if (m_pCurrent == &m_sqrng) { + ss << "Scrambled Sobol (dimensions=" << m_qrngDimensions << ")"; + } + else { + ss << "Invalid RNG"; + } - msg.assign(ss.str()); + msg.assign(ss.str()); } -void RNG::selectRng(RNG::RngType type) { - switch (type) { +void RNG::selectRng(RNG::RngType type) +{ + switch (type) { case Quasi: - m_pCurrent = &m_qrng; - break; + m_pCurrent = &m_qrng; + break; case ScrambledQuasi: - m_pCurrent = &m_sqrng; - break; + m_pCurrent = &m_sqrng; + break; case Pseudo: default: - m_pCurrent = &m_prng; - break; - } + m_pCurrent = &m_prng; + break; + } - setBatchSize(); + setBatchSize(); } -void RNG::resetSeed(void) { - using std::runtime_error; +void RNG::resetSeed(void) +{ + using std::runtime_error; - curandStatus_t curandResult; - curandResult = curandSetPseudoRandomGeneratorSeed(m_prng, m_prngSeed); + curandStatus_t curandResult; + curandResult = curandSetPseudoRandomGeneratorSeed(m_prng, m_prngSeed); - if (curandResult != CURAND_STATUS_SUCCESS) { - std::string msg("Could not set pseudo-random number generator seed: "); - msg += curandResult; - throw runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + std::string msg("Could not set pseudo-random number generator seed: "); + msg += curandResult; + throw runtime_error(msg); + } - curandResult = curandSetGeneratorOffset(m_prng, 0); + curandResult = curandSetGeneratorOffset(m_prng, 0); - if (curandResult != CURAND_STATUS_SUCCESS) { - std::string msg("Could not set pseudo-random number generator offset: "); - msg += curandResult; - throw runtime_error(msg); - } + if (curandResult != CURAND_STATUS_SUCCESS) { + std::string msg("Could not set pseudo-random 
number generator offset: "); + msg += curandResult; + throw runtime_error(msg); + } - setBatchSize(); + setBatchSize(); } -void RNG::resetDimensions(void) { - m_qrngDimensions = 1; - updateDimensions(); - setBatchSize(); -} - -void RNG::incrementDimensions(void) { - if (++m_qrngDimensions > s_maxQrngDimensions) { +void RNG::resetDimensions(void) +{ m_qrngDimensions = 1; - } - - updateDimensions(); - setBatchSize(); + updateDimensions(); + setBatchSize(); } -void RNG::updateDimensions(void) { - using std::runtime_error; +void RNG::incrementDimensions(void) +{ + if (++m_qrngDimensions > s_maxQrngDimensions) { + m_qrngDimensions = 1; + } - curandStatus_t curandResult; - curandResult = - curandSetQuasiRandomGeneratorDimensions(m_qrng, m_qrngDimensions); - - if (curandResult != CURAND_STATUS_SUCCESS) { - std::string msg("Could not set quasi-random number generator dimensions: "); - msg += curandResult; - throw runtime_error(msg); - } - - curandResult = curandSetGeneratorOffset(m_qrng, 0); - - if (curandResult != CURAND_STATUS_SUCCESS) { - std::string msg("Could not set quasi-random number generator offset: "); - msg += curandResult; - throw runtime_error(msg); - } - - curandResult = - curandSetQuasiRandomGeneratorDimensions(m_sqrng, m_qrngDimensions); - - if (curandResult != CURAND_STATUS_SUCCESS) { - std::string msg( - "Could not set scrambled quasi-random number generator dimensions: "); - msg += curandResult; - throw runtime_error(msg); - } - - curandResult = curandSetGeneratorOffset(m_sqrng, 0); - - if (curandResult != CURAND_STATUS_SUCCESS) { - std::string msg( - "Could not set scrambled quasi-random number generator offset: "); - msg += curandResult; - throw runtime_error(msg); - } + updateDimensions(); + setBatchSize(); } -void RNG::setBatchSize(void) { - if (m_pCurrent == &m_prng) { - m_nSamplesBatchActual = m_nSamplesBatchTarget; - } else { - m_nSamplesBatchActual = - (m_nSamplesBatchTarget / m_qrngDimensions) * m_qrngDimensions; - } +void RNG::updateDimensions(void) +{ + using std::runtime_error; - m_nSamplesRemaining = 0; + curandStatus_t curandResult; + curandResult = curandSetQuasiRandomGeneratorDimensions(m_qrng, m_qrngDimensions); + + if (curandResult != CURAND_STATUS_SUCCESS) { + std::string msg("Could not set quasi-random number generator dimensions: "); + msg += curandResult; + throw runtime_error(msg); + } + + curandResult = curandSetGeneratorOffset(m_qrng, 0); + + if (curandResult != CURAND_STATUS_SUCCESS) { + std::string msg("Could not set quasi-random number generator offset: "); + msg += curandResult; + throw runtime_error(msg); + } + + curandResult = curandSetQuasiRandomGeneratorDimensions(m_sqrng, m_qrngDimensions); + + if (curandResult != CURAND_STATUS_SUCCESS) { + std::string msg("Could not set scrambled quasi-random number generator dimensions: "); + msg += curandResult; + throw runtime_error(msg); + } + + curandResult = curandSetGeneratorOffset(m_sqrng, 0); + + if (curandResult != CURAND_STATUS_SUCCESS) { + std::string msg("Could not set scrambled quasi-random number generator offset: "); + msg += curandResult; + throw runtime_error(msg); + } +} + +void RNG::setBatchSize(void) +{ + if (m_pCurrent == &m_prng) { + m_nSamplesBatchActual = m_nSamplesBatchTarget; + } + else { + m_nSamplesBatchActual = (m_nSamplesBatchTarget / m_qrngDimensions) * m_qrngDimensions; + } + + m_nSamplesRemaining = 0; } diff --git a/Samples/4_CUDA_Libraries/randomFog/rng.h b/Samples/4_CUDA_Libraries/randomFog/rng.h index 2a01416e..06803801 100644 --- a/Samples/4_CUDA_Libraries/randomFog/rng.h 
+++ b/Samples/4_CUDA_Libraries/randomFog/rng.h @@ -31,42 +31,42 @@ // RNGs class RNG { - public: - enum RngType {Pseudo, Quasi, ScrambledQuasi}; - RNG(unsigned long prngSeed, unsigned int qrngDimensions, unsigned int nSamples); - virtual ~RNG(); +public: + enum RngType { Pseudo, Quasi, ScrambledQuasi }; + RNG(unsigned long prngSeed, unsigned int qrngDimensions, unsigned int nSamples); + virtual ~RNG(); - float getNextU01(void); - void getInfoString(std::string &msg); - void selectRng(RngType type); - void resetSeed(void); - void resetDimensions(void); - void incrementDimensions(void); + float getNextU01(void); + void getInfoString(std::string &msg); + void selectRng(RngType type); + void resetSeed(void); + void resetDimensions(void); + void incrementDimensions(void); - private: - // Generators - curandGenerator_t *m_pCurrent; - curandGenerator_t m_prng; - curandGenerator_t m_qrng; - curandGenerator_t m_sqrng; +private: + // Generators + curandGenerator_t *m_pCurrent; + curandGenerator_t m_prng; + curandGenerator_t m_qrng; + curandGenerator_t m_sqrng; - // Parameters - unsigned long m_prngSeed; - unsigned int m_qrngDimensions; + // Parameters + unsigned long m_prngSeed; + unsigned int m_qrngDimensions; - // Batches - const unsigned int m_nSamplesBatchTarget; - unsigned int m_nSamplesBatchActual; - unsigned int m_nSamplesRemaining; - void generateBatch(void); + // Batches + const unsigned int m_nSamplesBatchTarget; + unsigned int m_nSamplesBatchActual; + unsigned int m_nSamplesRemaining; + void generateBatch(void); - // Helpers - void updateDimensions(void); - void setBatchSize(void); + // Helpers + void updateDimensions(void); + void setBatchSize(void); - // Buffers - float *m_h_samples; - float *m_d_samples; + // Buffers + float *m_h_samples; + float *m_d_samples; - static const unsigned int s_maxQrngDimensions; + static const unsigned int s_maxQrngDimensions; }; diff --git a/Samples/4_CUDA_Libraries/simpleCUBLAS/simpleCUBLAS.cpp b/Samples/4_CUDA_Libraries/simpleCUBLAS/simpleCUBLAS.cpp index 001c8b49..6197a4ee 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLAS/simpleCUBLAS.cpp +++ b/Samples/4_CUDA_Libraries/simpleCUBLAS/simpleCUBLAS.cpp @@ -45,211 +45,209 @@ #define N (275) /* Host implementation of a simple version of sgemm */ -static void simple_sgemm(int n, float alpha, const float *A, const float *B, - float beta, float *C) { - int i; - int j; - int k; +static void simple_sgemm(int n, float alpha, const float *A, const float *B, float beta, float *C) +{ + int i; + int j; + int k; - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) { - float prod = 0; + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + float prod = 0; - for (k = 0; k < n; ++k) { - prod += A[k * n + i] * B[j * n + k]; - } + for (k = 0; k < n; ++k) { + prod += A[k * n + i] * B[j * n + k]; + } - C[j * n + i] = alpha * prod + beta * C[j * n + i]; + C[j * n + i] = alpha * prod + beta * C[j * n + i]; + } } - } } /* Main */ -int main(int argc, char **argv) { - cublasStatus_t status; - float *h_A; - float *h_B; - float *h_C; - float *h_C_ref; - float *d_A = 0; - float *d_B = 0; - float *d_C = 0; - float alpha = 1.0f; - float beta = 0.0f; - int n2 = N * N; - int i; - float error_norm; - float ref_norm; - float diff; - cublasHandle_t handle; +int main(int argc, char **argv) +{ + cublasStatus_t status; + float *h_A; + float *h_B; + float *h_C; + float *h_C_ref; + float *d_A = 0; + float *d_B = 0; + float *d_C = 0; + float alpha = 1.0f; + float beta = 0.0f; + int n2 = N * N; + int i; + float error_norm; + float ref_norm; + 
float diff;
+    cublasHandle_t handle;

-  int dev = findCudaDevice(argc, (const char **)argv);
+    int dev = findCudaDevice(argc, (const char **)argv);

-  if (dev == -1) {
-    return EXIT_FAILURE;
-  }
+    if (dev == -1) {
+        return EXIT_FAILURE;
+    }

-  /* Initialize CUBLAS */
-  printf("simpleCUBLAS test running..\n");
+    /* Initialize CUBLAS */
+    printf("simpleCUBLAS test running..\n");

-  status = cublasCreate(&handle);
+    status = cublasCreate(&handle);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! CUBLAS initialization error\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! CUBLAS initialization error\n");
+        return EXIT_FAILURE;
+    }

-  /* Allocate host memory for the matrices */
-  h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));
+    /* Allocate host memory for the matrices */
+    h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));

-  if (h_A == 0) {
-    fprintf(stderr, "!!!! host memory allocation error (A)\n");
-    return EXIT_FAILURE;
-  }
+    if (h_A == 0) {
+        fprintf(stderr, "!!!! host memory allocation error (A)\n");
+        return EXIT_FAILURE;
+    }

-  h_B = reinterpret_cast<float *>(malloc(n2 * sizeof(h_B[0])));
+    h_B = reinterpret_cast<float *>(malloc(n2 * sizeof(h_B[0])));

-  if (h_B == 0) {
-    fprintf(stderr, "!!!! host memory allocation error (B)\n");
-    return EXIT_FAILURE;
-  }
+    if (h_B == 0) {
+        fprintf(stderr, "!!!! host memory allocation error (B)\n");
+        return EXIT_FAILURE;
+    }

-  h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(h_C[0])));
+    h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(h_C[0])));

-  if (h_C == 0) {
-    fprintf(stderr, "!!!! host memory allocation error (C)\n");
-    return EXIT_FAILURE;
-  }
+    if (h_C == 0) {
+        fprintf(stderr, "!!!! host memory allocation error (C)\n");
+        return EXIT_FAILURE;
+    }

-  /* Fill the matrices with test data */
-  for (i = 0; i < n2; i++) {
-    h_A[i] = rand() / static_cast<float>(RAND_MAX);
-    h_B[i] = rand() / static_cast<float>(RAND_MAX);
-    h_C[i] = rand() / static_cast<float>(RAND_MAX);
-  }
+    /* Fill the matrices with test data */
+    for (i = 0; i < n2; i++) {
+        h_A[i] = rand() / static_cast<float>(RAND_MAX);
+        h_B[i] = rand() / static_cast<float>(RAND_MAX);
+        h_C[i] = rand() / static_cast<float>(RAND_MAX);
+    }
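cuBLAS assumes column-major storage, which is why the host reference simple_sgemm above indexes C[j * n + i]: element (row i, column j) of an n-by-n matrix sits at offset j * n + i. A tiny host-only sketch of that layout, with illustrative names not taken from the sample:

    #include <cstdio>

    int main()
    {
        // 2x2 column-major matrix M = [[1, 3], [2, 4]]: each column is contiguous.
        const int n = 2;
        float M[n * n] = {1.0f, 2.0f, 3.0f, 4.0f}; // column 0 = (1,2), column 1 = (3,4)

        int i = 0, j = 1;                      // row 0, column 1
        printf("M(0,1) = %g\n", M[j * n + i]); // prints 3
        return 0;
    }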
-  /* Allocate device memory for the matrices */
-  if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
-      cudaSuccess) {
-    fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
-    return EXIT_FAILURE;
-  }
+    /* Allocate device memory for the matrices */
+    if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) != cudaSuccess) {
+        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
+        return EXIT_FAILURE;
+    }

-  if (cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0])) !=
-      cudaSuccess) {
-    fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
-    return EXIT_FAILURE;
-  }
+    if (cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0])) != cudaSuccess) {
+        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
+        return EXIT_FAILURE;
+    }

-  if (cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0])) !=
-      cudaSuccess) {
-    fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
-    return EXIT_FAILURE;
-  }
+    if (cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0])) != cudaSuccess) {
+        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
+        return EXIT_FAILURE;
+    }

-  /* Initialize the device matrices with the host matrices */
-  status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
+    /* Initialize the device matrices with the host matrices */
+    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! device access error (write A)\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! device access error (write A)\n");
+        return EXIT_FAILURE;
+    }

-  status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
+    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! device access error (write B)\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! device access error (write B)\n");
+        return EXIT_FAILURE;
+    }

-  status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
+    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! device access error (write C)\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! device access error (write C)\n");
+        return EXIT_FAILURE;
+    }

-  /* Performs operation using plain C code */
-  simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
-  h_C_ref = h_C;
+    /* Performs operation using plain C code */
+    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
+    h_C_ref = h_C;

-  /* Performs operation using cublas */
-  status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A,
-                       N, d_B, N, &beta, d_C, N);
+    /* Performs operation using cublas */
+    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! kernel execution error.\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! kernel execution error.\n");
+        return EXIT_FAILURE;
+    }

-  /* Allocate host memory for reading back the result from device memory */
-  h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(h_C[0])));
+    /* Allocate host memory for reading back the result from device memory */
+    h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(h_C[0])));

-  if (h_C == 0) {
-    fprintf(stderr, "!!!! host memory allocation error (C)\n");
-    return EXIT_FAILURE;
-  }
+    if (h_C == 0) {
+        fprintf(stderr, "!!!! host memory allocation error (C)\n");
+        return EXIT_FAILURE;
+    }

-  /* Read the result back */
-  status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
+    /* Read the result back */
+    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! device access error (read C)\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! device access error (read C)\n");
+        return EXIT_FAILURE;
+    }
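The verification that follows computes a relative L2 error, sqrt(sum((C_ref - C)^2)) / sqrt(sum(C_ref^2)), and accepts the run when it is below 1e-6. A standalone sketch of the same test, with illustrative names not taken from the sample:

    #include <cmath>
    #include <cstdio>

    // Relative L2 error between a reference vector and a test vector.
    static float relativeL2Error(const float *ref, const float *test, int n)
    {
        double err = 0.0, norm = 0.0;
        for (int k = 0; k < n; ++k) {
            double d = static_cast<double>(ref[k]) - test[k];
            err += d * d;
            norm += static_cast<double>(ref[k]) * ref[k];
        }
        return static_cast<float>(std::sqrt(err / norm)); // caller must ensure norm != 0
    }

    int main()
    {
        float ref[]  = {1.0f, 2.0f, 3.0f};
        float test[] = {1.0f, 2.0f, 3.0000001f};
        printf("relative error = %g\n", relativeL2Error(ref, test, 3));
        return 0;
    }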
-  /* Check result against reference */
-  error_norm = 0;
-  ref_norm = 0;
+    /* Check result against reference */
+    error_norm = 0;
+    ref_norm   = 0;

-  for (i = 0; i < n2; ++i) {
-    diff = h_C_ref[i] - h_C[i];
-    error_norm += diff * diff;
-    ref_norm += h_C_ref[i] * h_C_ref[i];
-  }
+    for (i = 0; i < n2; ++i) {
+        diff = h_C_ref[i] - h_C[i];
+        error_norm += diff * diff;
+        ref_norm += h_C_ref[i] * h_C_ref[i];
+    }

-  error_norm = static_cast<float>(sqrt(static_cast<double>(error_norm)));
-  ref_norm = static_cast<float>(sqrt(static_cast<double>(ref_norm)));
+    error_norm = static_cast<float>(sqrt(static_cast<double>(error_norm)));
+    ref_norm   = static_cast<float>(sqrt(static_cast<double>(ref_norm)));

-  if (fabs(ref_norm) < 1e-7) {
-    fprintf(stderr, "!!!! reference norm is 0\n");
-    return EXIT_FAILURE;
-  }
+    if (fabs(ref_norm) < 1e-7) {
+        fprintf(stderr, "!!!! reference norm is 0\n");
+        return EXIT_FAILURE;
+    }

-  /* Memory clean up */
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  free(h_C_ref);
+    /* Memory clean up */
+    free(h_A);
+    free(h_B);
+    free(h_C);
+    free(h_C_ref);

-  if (cudaFree(d_A) != cudaSuccess) {
-    fprintf(stderr, "!!!! memory free error (A)\n");
-    return EXIT_FAILURE;
-  }
+    if (cudaFree(d_A) != cudaSuccess) {
+        fprintf(stderr, "!!!! memory free error (A)\n");
+        return EXIT_FAILURE;
+    }

-  if (cudaFree(d_B) != cudaSuccess) {
-    fprintf(stderr, "!!!! memory free error (B)\n");
-    return EXIT_FAILURE;
-  }
+    if (cudaFree(d_B) != cudaSuccess) {
+        fprintf(stderr, "!!!! memory free error (B)\n");
+        return EXIT_FAILURE;
+    }

-  if (cudaFree(d_C) != cudaSuccess) {
-    fprintf(stderr, "!!!! memory free error (C)\n");
-    return EXIT_FAILURE;
-  }
+    if (cudaFree(d_C) != cudaSuccess) {
+        fprintf(stderr, "!!!! memory free error (C)\n");
+        return EXIT_FAILURE;
+    }

-  /* Shutdown */
-  status = cublasDestroy(handle);
+    /* Shutdown */
+    status = cublasDestroy(handle);

-  if (status != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr, "!!!! shutdown error (A)\n");
-    return EXIT_FAILURE;
-  }
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "!!!! 
shutdown error (A)\n"); + return EXIT_FAILURE; + } - if (error_norm / ref_norm < 1e-6f) { - printf("simpleCUBLAS test passed.\n"); - exit(EXIT_SUCCESS); - } else { - printf("simpleCUBLAS test failed.\n"); - exit(EXIT_FAILURE); - } + if (error_norm / ref_norm < 1e-6f) { + printf("simpleCUBLAS test passed.\n"); + exit(EXIT_SUCCESS); + } + else { + printf("simpleCUBLAS test failed.\n"); + exit(EXIT_FAILURE); + } } diff --git a/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp b/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp index 23edaa64..44f85de9 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp +++ b/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp @@ -42,260 +42,263 @@ #include /* Matrix size */ -//#define N (275) +// #define N (275) #define N (1024) // Restricting the max used GPUs as input matrix is not so large #define MAX_NUM_OF_GPUS 2 /* Host implementation of a simple version of sgemm */ -static void simple_sgemm(int n, float alpha, const float *A, const float *B, - float beta, float *C) { - int i; - int j; - int k; +static void simple_sgemm(int n, float alpha, const float *A, const float *B, float beta, float *C) +{ + int i; + int j; + int k; - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) { - float prod = 0; + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + float prod = 0; - for (k = 0; k < n; ++k) { - prod += A[k * n + i] * B[j * n + k]; - } + for (k = 0; k < n; ++k) { + prod += A[k * n + i] * B[j * n + k]; + } - C[j * n + i] = alpha * prod + beta * C[j * n + i]; + C[j * n + i] = alpha * prod + beta * C[j * n + i]; + } } - } } -void findMultipleBestGPUs(int &num_of_devices, int *device_ids) { - // Find the best CUDA capable GPU device - int current_device = 0; +void findMultipleBestGPUs(int &num_of_devices, int *device_ids) +{ + // Find the best CUDA capable GPU device + int current_device = 0; - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - typedef struct gpu_perf_t { - uint64_t compute_perf; - int device_id; - } gpu_perf; + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + typedef struct gpu_perf_t + { + uint64_t compute_perf; + int device_id; + } gpu_perf; - gpu_perf *gpu_stats = (gpu_perf *)malloc(sizeof(gpu_perf) * device_count); + gpu_perf *gpu_stats = (gpu_perf *)malloc(sizeof(gpu_perf) * device_count); - cudaDeviceProp deviceProp; - int devices_prohibited = 0; - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); + cudaDeviceProp deviceProp; + int devices_prohibited = 0; + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); - // If this GPU is not running on Compute Mode prohibited, - // then we can add it to the list - int sm_per_multiproc; - if (deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major == 9999 && deviceProp.minor == 9999) { - sm_per_multiproc = 1; - } else { - sm_per_multiproc = - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); - } + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + int sm_per_multiproc; + if (deviceProp.computeMode != cudaComputeModeProhibited) { + if (deviceProp.major == 9999 && deviceProp.minor == 9999) { + sm_per_multiproc = 1; + } + else { + sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + } - gpu_stats[current_device].compute_perf = - (uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc * - deviceProp.clockRate; - 
gpu_stats[current_device].device_id = current_device; - - } else { - devices_prohibited++; - } - - ++current_device; - } - if (devices_prohibited == device_count) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceId() CUDA error:" - " all devices have compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } else { - gpu_perf temp_elem; - // Sort the GPUs by highest compute perf. - for (int i = 0; i < current_device - 1; i++) { - for (int j = 0; j < current_device - i - 1; j++) { - if (gpu_stats[j].compute_perf < gpu_stats[j + 1].compute_perf) { - temp_elem = gpu_stats[j]; - gpu_stats[j] = gpu_stats[j + 1]; - gpu_stats[j + 1] = temp_elem; + gpu_stats[current_device].compute_perf = + (uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; + gpu_stats[current_device].device_id = current_device; + } + else { + devices_prohibited++; } - } - } - for (int i = 0; i < num_of_devices; i++) { - device_ids[i] = gpu_stats[i].device_id; + ++current_device; } - } - free(gpu_stats); + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + else { + gpu_perf temp_elem; + // Sort the GPUs by highest compute perf. + for (int i = 0; i < current_device - 1; i++) { + for (int j = 0; j < current_device - i - 1; j++) { + if (gpu_stats[j].compute_perf < gpu_stats[j + 1].compute_perf) { + temp_elem = gpu_stats[j]; + gpu_stats[j] = gpu_stats[j + 1]; + gpu_stats[j + 1] = temp_elem; + } + } + } + + for (int i = 0; i < num_of_devices; i++) { + device_ids[i] = gpu_stats[i].device_id; + } + } + free(gpu_stats); } /* Main */ -int main(int argc, char **argv) { - cublasStatus_t status; - float *h_A; - float *h_B; - float *h_C; - float *h_C_ref; - float *d_A = 0; - float *d_B = 0; - float *d_C = 0; - float alpha = 1.0f; - float beta = 0.0f; - int n2 = N * N; - int i; - float error_norm; - float ref_norm; - float diff; - cublasXtHandle_t handle; - int *devices = NULL; +int main(int argc, char **argv) +{ + cublasStatus_t status; + float *h_A; + float *h_B; + float *h_C; + float *h_C_ref; + float *d_A = 0; + float *d_B = 0; + float *d_C = 0; + float alpha = 1.0f; + float beta = 0.0f; + int n2 = N * N; + int i; + float error_norm; + float ref_norm; + float diff; + cublasXtHandle_t handle; + int *devices = NULL; - int num_of_devices = 0; + int num_of_devices = 0; - checkCudaErrors(cudaGetDeviceCount(&num_of_devices)); + checkCudaErrors(cudaGetDeviceCount(&num_of_devices)); - if (num_of_devices > MAX_NUM_OF_GPUS) { - num_of_devices = MAX_NUM_OF_GPUS; - } - devices = (int *)malloc(sizeof(int) * num_of_devices); + if (num_of_devices > MAX_NUM_OF_GPUS) { + num_of_devices = MAX_NUM_OF_GPUS; + } + devices = (int *)malloc(sizeof(int) * num_of_devices); - findMultipleBestGPUs(num_of_devices, devices); - cudaDeviceProp deviceProp; - printf("Using %d GPUs\n", num_of_devices); - for (i = 0; i < num_of_devices; i++) { - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devices[i])); - printf("GPU ID = %d, Name = %s \n", devices[i], deviceProp.name); - } + findMultipleBestGPUs(num_of_devices, devices); + cudaDeviceProp deviceProp; + printf("Using %d GPUs\n", num_of_devices); + for (i = 0; i < num_of_devices; i++) { + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devices[i])); + printf("GPU ID = %d, Name = %s \n", devices[i], deviceProp.name); + } - /* Initialize CUBLAS */ - printf("simpleCUBLASXT test running..\n"); + /* Initialize CUBLAS */ + printf("simpleCUBLASXT test running..\n"); - 
status = cublasXtCreate(&handle); + status = cublasXtCreate(&handle); - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! CUBLASXT initialization error\n"); - return EXIT_FAILURE; - } + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! CUBLASXT initialization error\n"); + return EXIT_FAILURE; + } - /* Select devices for use in CUBLASXT math functions */ - status = cublasXtDeviceSelect(handle, num_of_devices, devices); + /* Select devices for use in CUBLASXT math functions */ + status = cublasXtDeviceSelect(handle, num_of_devices, devices); - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! CUBLASXT device selection error\n"); - return EXIT_FAILURE; - } + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! CUBLASXT device selection error\n"); + return EXIT_FAILURE; + } - /* Optional: Set a block size for CUBLASXT math functions */ - status = cublasXtSetBlockDim(handle, 64); + /* Optional: Set a block size for CUBLASXT math functions */ + status = cublasXtSetBlockDim(handle, 64); - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! CUBLASXT set block dimension error\n"); - return EXIT_FAILURE; - } + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! CUBLASXT set block dimension error\n"); + return EXIT_FAILURE; + } - /* Allocate host memory for the matrices */ - h_A = (float *)malloc(n2 * sizeof(h_A[0])); + /* Allocate host memory for the matrices */ + h_A = (float *)malloc(n2 * sizeof(h_A[0])); - if (h_A == 0) { - fprintf(stderr, "!!!! host memory allocation error (A)\n"); - return EXIT_FAILURE; - } + if (h_A == 0) { + fprintf(stderr, "!!!! host memory allocation error (A)\n"); + return EXIT_FAILURE; + } - h_B = (float *)malloc(n2 * sizeof(h_B[0])); + h_B = (float *)malloc(n2 * sizeof(h_B[0])); - if (h_B == 0) { - fprintf(stderr, "!!!! host memory allocation error (B)\n"); - return EXIT_FAILURE; - } + if (h_B == 0) { + fprintf(stderr, "!!!! host memory allocation error (B)\n"); + return EXIT_FAILURE; + } - h_C_ref = (float *)malloc(n2 * sizeof(h_C[0])); + h_C_ref = (float *)malloc(n2 * sizeof(h_C[0])); - if (h_C_ref == 0) { - fprintf(stderr, "!!!! host memory allocation error (C_ref)\n"); - return EXIT_FAILURE; - } + if (h_C_ref == 0) { + fprintf(stderr, "!!!! host memory allocation error (C_ref)\n"); + return EXIT_FAILURE; + } - h_C = (float *)malloc(n2 * sizeof(h_C[0])); + h_C = (float *)malloc(n2 * sizeof(h_C[0])); - if (h_C == 0) { - fprintf(stderr, "!!!! host memory allocation error (C)\n"); - return EXIT_FAILURE; - } + if (h_C == 0) { + fprintf(stderr, "!!!! 
host memory allocation error (C)\n"); + return EXIT_FAILURE; + } - /* Fill the matrices with test data */ - for (i = 0; i < n2; i++) { - h_A[i] = rand() / (float)RAND_MAX; - h_B[i] = rand() / (float)RAND_MAX; - h_C[i] = rand() / (float)RAND_MAX; - h_C_ref[i] = h_C[i]; - } + /* Fill the matrices with test data */ + for (i = 0; i < n2; i++) { + h_A[i] = rand() / (float)RAND_MAX; + h_B[i] = rand() / (float)RAND_MAX; + h_C[i] = rand() / (float)RAND_MAX; + h_C_ref[i] = h_C[i]; + } - /* Performs operation using plain C code */ - simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref); + /* Performs operation using plain C code */ + simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref); - /* Performs operation using cublas */ - status = cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, h_A, - N, h_B, N, &beta, h_C, N); + /* Performs operation using cublas */ + status = cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, h_A, N, h_B, N, &beta, h_C, N); - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); - return EXIT_FAILURE; - } + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! kernel execution error.\n"); + return EXIT_FAILURE; + } - /* Check result against reference */ - error_norm = 0; - ref_norm = 0; + /* Check result against reference */ + error_norm = 0; + ref_norm = 0; - for (i = 0; i < n2; ++i) { - diff = h_C_ref[i] - h_C[i]; - error_norm += diff * diff; - ref_norm += h_C_ref[i] * h_C_ref[i]; - } + for (i = 0; i < n2; ++i) { + diff = h_C_ref[i] - h_C[i]; + error_norm += diff * diff; + ref_norm += h_C_ref[i] * h_C_ref[i]; + } - error_norm = (float)sqrt((double)error_norm); - ref_norm = (float)sqrt((double)ref_norm); + error_norm = (float)sqrt((double)error_norm); + ref_norm = (float)sqrt((double)ref_norm); - if (fabs(ref_norm) < 1e-7) { - fprintf(stderr, "!!!! reference norm is 0\n"); - return EXIT_FAILURE; - } + if (fabs(ref_norm) < 1e-7) { + fprintf(stderr, "!!!! reference norm is 0\n"); + return EXIT_FAILURE; + } - /* Memory clean up */ - free(h_A); - free(h_B); - free(h_C); - free(h_C_ref); + /* Memory clean up */ + free(h_A); + free(h_B); + free(h_C); + free(h_C_ref); - if (cudaFree(d_A) != cudaSuccess) { - fprintf(stderr, "!!!! memory free error (A)\n"); - return EXIT_FAILURE; - } + if (cudaFree(d_A) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (A)\n"); + return EXIT_FAILURE; + } - if (cudaFree(d_B) != cudaSuccess) { - fprintf(stderr, "!!!! memory free error (B)\n"); - return EXIT_FAILURE; - } + if (cudaFree(d_B) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (B)\n"); + return EXIT_FAILURE; + } - if (cudaFree(d_C) != cudaSuccess) { - fprintf(stderr, "!!!! memory free error (C)\n"); - return EXIT_FAILURE; - } + if (cudaFree(d_C) != cudaSuccess) { + fprintf(stderr, "!!!! memory free error (C)\n"); + return EXIT_FAILURE; + } - /* Shutdown */ - status = cublasXtDestroy(handle); + /* Shutdown */ + status = cublasXtDestroy(handle); - if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! shutdown error (A)\n"); - return EXIT_FAILURE; - } + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "!!!! 
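The pass/fail test that follows is a relative L2-norm check: the GEMM result is accepted when ||C_gpu - C_ref||_2 / ||C_ref||_2 < 1e-6, with the reference norm first checked against zero so the ratio is meaningful. Factored out, the criterion looks like this (a sketch, accumulating in double to reduce rounding noise; not part of the sample):

#include <math.h>

// Relative L2 error between a reference buffer and a computed result.
static float relativeL2Error(const float *ref, const float *res, int n)
{
    double err = 0.0, norm = 0.0;
    for (int i = 0; i < n; ++i) {
        double d = (double)ref[i] - (double)res[i];
        err += d * d;
        norm += (double)ref[i] * (double)ref[i];
    }
    return (float)(sqrt(err) / sqrt(norm)); // caller must reject norm == 0 first
}

With n = N * N this reproduces the error_norm / ref_norm ratio computed inline above.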
shutdown error (A)\n"); + return EXIT_FAILURE; + } - if (error_norm / ref_norm < 1e-6f) { - printf("simpleCUBLASXT test passed.\n"); - exit(EXIT_SUCCESS); - } else { - printf("simpleCUBLASXT test failed.\n"); - exit(EXIT_FAILURE); - } + if (error_norm / ref_norm < 1e-6f) { + printf("simpleCUBLASXT test passed.\n"); + exit(EXIT_SUCCESS); + } + else { + printf("simpleCUBLASXT test failed.\n"); + exit(EXIT_FAILURE); + } } diff --git a/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp b/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp index 6f9281c1..201301c5 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp +++ b/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/simpleCUBLAS_LU.cpp @@ -51,7 +51,7 @@ // configurable parameters // dimension of matrix -#define N 4 +#define N 4 #define BATCH_SIZE 10000 // use double precision data type @@ -70,348 +70,349 @@ // helper functions // wrapper around cublasgetrfBatched() -cublasStatus_t cublasXgetrfBatched(cublasHandle_t handle, int n, - DATA_TYPE* const A[], int lda, int* P, - int* info, int batchSize) { +cublasStatus_t +cublasXgetrfBatched(cublasHandle_t handle, int n, DATA_TYPE *const A[], int lda, int *P, int *info, int batchSize) +{ #ifdef DOUBLE_PRECISION - return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); + return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); #else - return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); + return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); #endif } // wrapper around malloc // clears the allocated memory to 0 // terminates the program if malloc fails -void* xmalloc(size_t size) { - void* ptr = malloc(size); - if (ptr == NULL) { - printf("> ERROR: malloc for size %zu failed..\n", size); - exit(EXIT_FAILURE); - } - memset(ptr, 0, size); - return ptr; +void *xmalloc(size_t size) +{ + void *ptr = malloc(size); + if (ptr == NULL) { + printf("> ERROR: malloc for size %zu failed..\n", size); + exit(EXIT_FAILURE); + } + memset(ptr, 0, size); + return ptr; } // initalize identity matrix -void initIdentityMatrix(DATA_TYPE* mat) { - // clear the matrix - memset(mat, 0, N * N * sizeof(DATA_TYPE)); +void initIdentityMatrix(DATA_TYPE *mat) +{ + // clear the matrix + memset(mat, 0, N * N * sizeof(DATA_TYPE)); - // set all diagonals to 1 - for (int i = 0; i < N; i++) { - mat[(i * N) + i] = 1.0; - } + // set all diagonals to 1 + for (int i = 0; i < N; i++) { + mat[(i * N) + i] = 1.0; + } } // initialize matrix with all elements as 0 -void initZeroMatrix(DATA_TYPE* mat) { - memset(mat, 0, N * N * sizeof(DATA_TYPE)); -} +void initZeroMatrix(DATA_TYPE *mat) { memset(mat, 0, N * N * sizeof(DATA_TYPE)); } // fill random value in column-major matrix -void initRandomMatrix(DATA_TYPE* mat) { - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - mat[(j * N) + i] = - (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX); +void initRandomMatrix(DATA_TYPE *mat) +{ + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + mat[(j * N) + i] = (DATA_TYPE)1.0 + ((DATA_TYPE)rand() / (DATA_TYPE)RAND_MAX); + } } - } - // diagonal dominant matrix to insure it is invertible matrix - for (int i = 0; i < N; i++) { - mat[(i * N) + i] += (DATA_TYPE)N; - } + // diagonal dominant matrix to insure it is invertible matrix + for (int i = 0; i < N; i++) { + mat[(i * N) + i] += (DATA_TYPE)N; + } } // print column-major matrix -void printMatrix(DATA_TYPE* mat) { - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - printf("%20.16f 
", mat[(j * N) + i]); +void printMatrix(DATA_TYPE *mat) +{ + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + printf("%20.16f ", mat[(j * N) + i]); + } + printf("\n"); } printf("\n"); - } - printf("\n"); } // matrix mulitplication -void matrixMultiply(DATA_TYPE* res, DATA_TYPE* mat1, DATA_TYPE* mat2) { - initZeroMatrix(res); +void matrixMultiply(DATA_TYPE *res, DATA_TYPE *mat1, DATA_TYPE *mat2) +{ + initZeroMatrix(res); - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { - res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k]; - } + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + res[(j * N) + i] += mat1[(k * N) + i] * mat2[(j * N) + k]; + } + } } - } } // check matrix equality -bool checkRelativeError(DATA_TYPE* mat1, DATA_TYPE* mat2, DATA_TYPE maxError) { - DATA_TYPE err = (DATA_TYPE)0.0; - DATA_TYPE refNorm = (DATA_TYPE)0.0; - DATA_TYPE relError = (DATA_TYPE)0.0; - DATA_TYPE relMaxError = (DATA_TYPE)0.0; +bool checkRelativeError(DATA_TYPE *mat1, DATA_TYPE *mat2, DATA_TYPE maxError) +{ + DATA_TYPE err = (DATA_TYPE)0.0; + DATA_TYPE refNorm = (DATA_TYPE)0.0; + DATA_TYPE relError = (DATA_TYPE)0.0; + DATA_TYPE relMaxError = (DATA_TYPE)0.0; - for (int i = 0; i < N * N; i++) { - refNorm = abs(mat1[i]); - err = abs(mat1[i] - mat2[i]); + for (int i = 0; i < N * N; i++) { + refNorm = abs(mat1[i]); + err = abs(mat1[i] - mat2[i]); - if (refNorm != 0.0 && err > 0.0) { - relError = err / refNorm; - relMaxError = MAX(relMaxError, relError); + if (refNorm != 0.0 && err > 0.0) { + relError = err / refNorm; + relMaxError = MAX(relMaxError, relError); + } + + if (relMaxError > maxError) + return false; } - - if (relMaxError > maxError) return false; - } - return true; + return true; } // decode lower and upper matrix from single matrix // returned by getrfBatched() -void getLUdecoded(DATA_TYPE* mat, DATA_TYPE* L, DATA_TYPE* U) { - // init L as identity matrix - initIdentityMatrix(L); +void getLUdecoded(DATA_TYPE *mat, DATA_TYPE *L, DATA_TYPE *U) +{ + // init L as identity matrix + initIdentityMatrix(L); - // copy lower triangular values from mat to L (skip diagonal) - for (int i = 0; i < N; i++) { - for (int j = 0; j < i; j++) { - L[(j * N) + i] = mat[(j * N) + i]; + // copy lower triangular values from mat to L (skip diagonal) + for (int i = 0; i < N; i++) { + for (int j = 0; j < i; j++) { + L[(j * N) + i] = mat[(j * N) + i]; + } } - } - // init U as all zero - initZeroMatrix(U); + // init U as all zero + initZeroMatrix(U); - // copy upper triangular values from mat to U - for (int i = 0; i < N; i++) { - for (int j = i; j < N; j++) { - U[(j * N) + i] = mat[(j * N) + i]; + // copy upper triangular values from mat to U + for (int i = 0; i < N; i++) { + for (int j = i; j < N; j++) { + U[(j * N) + i] = mat[(j * N) + i]; + } } - } } // generate permutation matrix from pivot vector -void getPmatFromPivot(DATA_TYPE* Pmat, int* P) { - int pivot[N]; +void getPmatFromPivot(DATA_TYPE *Pmat, int *P) +{ + int pivot[N]; - // pivot vector in base-1 - // convert it to base-0 - for (int i = 0; i < N; i++) { - P[i]--; - } + // pivot vector in base-1 + // convert it to base-0 + for (int i = 0; i < N; i++) { + P[i]--; + } - // generate permutation vector from pivot - // initialize pivot with identity sequence - for (int k = 0; k < N; k++) { - pivot[k] = k; - } + // generate permutation vector from pivot + // initialize pivot with identity sequence + for (int k = 0; k < N; k++) { + pivot[k] = k; + } - // swap the 
-int main(int argc, char** argv) {
-  // cuBLAS variables
-  cublasStatus_t status;
-  cublasHandle_t handle;
+int main(int argc, char **argv)
+{
+    // cuBLAS variables
+    cublasStatus_t status;
+    cublasHandle_t handle;

-  // host variables
-  size_t matSize = N * N * sizeof(DATA_TYPE);
+    // host variables
+    size_t matSize = N * N * sizeof(DATA_TYPE);

-  DATA_TYPE* h_AarrayInput;
-  DATA_TYPE* h_AarrayOutput;
-  DATA_TYPE* h_ptr_array[BATCH_SIZE];
+    DATA_TYPE *h_AarrayInput;
+    DATA_TYPE *h_AarrayOutput;
+    DATA_TYPE *h_ptr_array[BATCH_SIZE];

-  int* h_pivotArray;
-  int* h_infoArray;
+    int *h_pivotArray;
+    int *h_infoArray;

-  // device variables
-  DATA_TYPE* d_Aarray;
-  DATA_TYPE** d_ptr_array;
+    // device variables
+    DATA_TYPE *d_Aarray;
+    DATA_TYPE **d_ptr_array;

-  int* d_pivotArray;
-  int* d_infoArray;
+    int *d_pivotArray;
+    int *d_infoArray;

-  int err_count = 0;
+    int err_count = 0;

-  // seed rand() with a fixed value so runs are reproducible
-  srand(12345);
+    // seed rand() with a fixed value so runs are reproducible
+    srand(12345);

-  // find CUDA device
-  printf("> initializing..\n");
-  int dev = findCudaDevice(argc, (const char**)argv);
-  if (dev == -1) {
-    return (EXIT_FAILURE);
-  }
+    // find CUDA device
+    printf("> initializing..\n");
+    int dev = findCudaDevice(argc, (const char **)argv);
+    if (dev == -1) {
+        return (EXIT_FAILURE);
+    }

-  // initialize cuBLAS
-  status = cublasCreate(&handle);
-  if (status != CUBLAS_STATUS_SUCCESS) {
-    printf("> ERROR: cuBLAS initialization failed..\n");
-    return (EXIT_FAILURE);
-  }
+    // initialize cuBLAS
+    status = cublasCreate(&handle);
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        printf("> ERROR: cuBLAS initialization failed..\n");
+        return (EXIT_FAILURE);
+    }

#ifdef DOUBLE_PRECISION
-  printf("> using DOUBLE precision..\n");
+    printf("> using DOUBLE precision..\n");
#else
-  printf("> using SINGLE precision..\n");
+    printf("> using SINGLE precision..\n");
#endif

#ifdef PIVOT
-  printf("> pivot ENABLED..\n");
+    printf("> pivot ENABLED..\n");
#else
-  printf("> pivot DISABLED..\n");
+    printf("> pivot DISABLED..\n");
#endif

-  // allocate memory for host variables
-  h_AarrayInput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize);
-  h_AarrayOutput = (DATA_TYPE*)xmalloc(BATCH_SIZE * matSize);
+    // allocate memory for host variables
+    h_AarrayInput  = (DATA_TYPE *)xmalloc(BATCH_SIZE * matSize);
+    h_AarrayOutput = (DATA_TYPE *)xmalloc(BATCH_SIZE * matSize);

-  h_pivotArray = (int*)xmalloc(N * BATCH_SIZE * sizeof(int));
-  h_infoArray = (int*)xmalloc(BATCH_SIZE * sizeof(int));
+    h_pivotArray = (int *)xmalloc(N * BATCH_SIZE * sizeof(int));
+    h_infoArray  = (int *)xmalloc(BATCH_SIZE * sizeof(int));

-  // allocate memory for device variables
-  checkCudaErrors(cudaMalloc((void**)&d_Aarray, BATCH_SIZE * matSize));
-  checkCudaErrors(
-      cudaMalloc((void**)&d_pivotArray, N * BATCH_SIZE * sizeof(int)));
- 
checkCudaErrors(cudaMalloc((void**)&d_infoArray, BATCH_SIZE * sizeof(int))); - checkCudaErrors( - cudaMalloc((void**)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE*))); + // allocate memory for device variables + checkCudaErrors(cudaMalloc((void **)&d_Aarray, BATCH_SIZE * matSize)); + checkCudaErrors(cudaMalloc((void **)&d_pivotArray, N * BATCH_SIZE * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_infoArray, BATCH_SIZE * sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&d_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE *))); - // fill matrix with random data - printf("> generating random matrices..\n"); - for (int i = 0; i < BATCH_SIZE; i++) { - initRandomMatrix(h_AarrayInput + (i * N * N)); - } - - // copy data to device from host - printf("> copying data from host memory to GPU memory..\n"); - checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize, - cudaMemcpyHostToDevice)); - - // create pointer array for matrices - for (int i = 0; i < BATCH_SIZE; i++) h_ptr_array[i] = d_Aarray + (i * N * N); - - // copy pointer array to device memory - checkCudaErrors(cudaMemcpy(d_ptr_array, h_ptr_array, - BATCH_SIZE * sizeof(DATA_TYPE*), - cudaMemcpyHostToDevice)); - - // perform LU decomposition - printf("> performing LU decomposition..\n"); -#ifdef PIVOT - status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, - d_infoArray, BATCH_SIZE); -#else - status = cublasXgetrfBatched(handle, N, d_ptr_array, N, NULL, d_infoArray, - BATCH_SIZE); -#endif /* PIVOT */ - if (status != CUBLAS_STATUS_SUCCESS) { - printf("> ERROR: cublasDgetrfBatched() failed with error %s..\n", - _cudaGetErrorEnum(status)); - return (EXIT_FAILURE); - } - - // copy data to host from device - printf("> copying data from GPU memory to host memory..\n"); - checkCudaErrors(cudaMemcpy(h_AarrayOutput, d_Aarray, BATCH_SIZE * matSize, - cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_infoArray, d_infoArray, BATCH_SIZE * sizeof(int), - cudaMemcpyDeviceToHost)); -#ifdef PIVOT - checkCudaErrors(cudaMemcpy(h_pivotArray, d_pivotArray, - N * BATCH_SIZE * sizeof(int), - cudaMemcpyDeviceToHost)); -#endif /* PIVOT */ - - // verify the result - printf("> verifying the result..\n"); - for (int i = 0; i < BATCH_SIZE; i++) { - if (h_infoArray[i] == 0) { - DATA_TYPE* A = h_AarrayInput + (i * N * N); - DATA_TYPE* LU = h_AarrayOutput + (i * N * N); - DATA_TYPE L[N * N]; - DATA_TYPE U[N * N]; - getLUdecoded(LU, L, U); - - // test P * A = L * U - int* P = h_pivotArray + (i * N); - DATA_TYPE Pmat[N * N]; -#ifdef PIVOT - getPmatFromPivot(Pmat, P); -#else - initIdentityMatrix(Pmat); -#endif /* PIVOT */ - - // perform matrix multiplication - DATA_TYPE PxA[N * N]; - DATA_TYPE LxU[N * N]; - matrixMultiply(PxA, Pmat, A); - matrixMultiply(LxU, L, U); - - // check for equality of matrices - if (!checkRelativeError(PxA, LxU, (DATA_TYPE)MAX_ERROR)) { - printf("> ERROR: accuracy check failed for matrix number %05d..\n", - i + 1); - err_count++; - } - - } else if (h_infoArray[i] > 0) { - printf( - "> execution for matrix %05d is successful, but U is singular and " - "U(%d,%d) = 0..\n", - i + 1, h_infoArray[i] - 1, h_infoArray[i] - 1); - } else // (h_infoArray[i] < 0) - { - printf("> ERROR: matrix %05d have an illegal value at index %d = %lf..\n", - i + 1, -h_infoArray[i], - *(h_AarrayInput + (i * N * N) + (-h_infoArray[i]))); + // fill matrix with random data + printf("> generating random matrices..\n"); + for (int i = 0; i < BATCH_SIZE; i++) { + initRandomMatrix(h_AarrayInput + (i * N * N)); } - } - // free device variables - 
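The block above is the standard batched-cuBLAS calling convention: all matrices live in one flat device allocation, while the API receives a device-resident array holding one pointer per matrix, staged through a host array first. Condensed to its essentials (a sketch with hypothetical names; DATA_TYPE, N and BATCH_SIZE as defined in this sample):

// Sketch: batched-API pattern - one flat allocation, one device array of pointers.
DATA_TYPE  *d_mats;             // BATCH_SIZE matrices, back to back
DATA_TYPE **d_ptrs;             // device-side array of BATCH_SIZE pointers
DATA_TYPE  *h_ptrs[BATCH_SIZE]; // staged on the host first
checkCudaErrors(cudaMalloc((void **)&d_mats, BATCH_SIZE * N * N * sizeof(DATA_TYPE)));
checkCudaErrors(cudaMalloc((void **)&d_ptrs, BATCH_SIZE * sizeof(DATA_TYPE *)));
for (int i = 0; i < BATCH_SIZE; i++)
    h_ptrs[i] = d_mats + (size_t)i * N * N; // host code may do pointer arithmetic on device pointers
checkCudaErrors(cudaMemcpy(d_ptrs, h_ptrs, sizeof(h_ptrs), cudaMemcpyHostToDevice));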
checkCudaErrors(cudaFree(d_ptr_array));
-  checkCudaErrors(cudaFree(d_infoArray));
-  checkCudaErrors(cudaFree(d_pivotArray));
-  checkCudaErrors(cudaFree(d_Aarray));
+    // copy data to device from host
+    printf("> copying data from host memory to GPU memory..\n");
+    checkCudaErrors(cudaMemcpy(d_Aarray, h_AarrayInput, BATCH_SIZE * matSize, cudaMemcpyHostToDevice));

-  // free host variables
-  if (h_infoArray) free(h_infoArray);
-  if (h_pivotArray) free(h_pivotArray);
-  if (h_AarrayOutput) free(h_AarrayOutput);
-  if (h_AarrayInput) free(h_AarrayInput);
+    // create pointer array for matrices
+    for (int i = 0; i < BATCH_SIZE; i++)
+        h_ptr_array[i] = d_Aarray + (i * N * N);

-  // destroy cuBLAS handle
-  status = cublasDestroy(handle);
-  if (status != CUBLAS_STATUS_SUCCESS) {
-    printf("> ERROR: cuBLAS uninitialization failed..\n");
-    return (EXIT_FAILURE);
-  }
+    // copy pointer array to device memory
+    checkCudaErrors(cudaMemcpy(d_ptr_array, h_ptr_array, BATCH_SIZE * sizeof(DATA_TYPE *), cudaMemcpyHostToDevice));

-  if (err_count > 0) {
-    printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count,
-           MAX_ERROR);
-    return (EXIT_FAILURE);
-  }
+    // perform LU decomposition
+    printf("> performing LU decomposition..\n");
+#ifdef PIVOT
+    status = cublasXgetrfBatched(handle, N, d_ptr_array, N, d_pivotArray, d_infoArray, BATCH_SIZE);
+#else
+    status = cublasXgetrfBatched(handle, N, d_ptr_array, N, NULL, d_infoArray, BATCH_SIZE);
+#endif /* PIVOT */
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        printf("> ERROR: cublasDgetrfBatched() failed with error %s..\n", _cudaGetErrorEnum(status));
+        return (EXIT_FAILURE);
+    }

-  printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR);
-  return (EXIT_SUCCESS);
+    // copy data to host from device
+    printf("> copying data from GPU memory to host memory..\n");
+    checkCudaErrors(cudaMemcpy(h_AarrayOutput, d_Aarray, BATCH_SIZE * matSize, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_infoArray, d_infoArray, BATCH_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
+#ifdef PIVOT
+    checkCudaErrors(cudaMemcpy(h_pivotArray, d_pivotArray, N * BATCH_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
+#endif /* PIVOT */
+
+    // verify the result
+    printf("> verifying the result..\n");
+    for (int i = 0; i < BATCH_SIZE; i++) {
+        if (h_infoArray[i] == 0) {
+            DATA_TYPE *A  = h_AarrayInput + (i * N * N);
+            DATA_TYPE *LU = h_AarrayOutput + (i * N * N);
+            DATA_TYPE  L[N * N];
+            DATA_TYPE  U[N * N];
+            getLUdecoded(LU, L, U);
+
+            // test P * A = L * U
+            int *P = h_pivotArray + (i * N);
+            DATA_TYPE Pmat[N * N];
+#ifdef PIVOT
+            getPmatFromPivot(Pmat, P);
+#else
+            initIdentityMatrix(Pmat);
+#endif /* PIVOT */
+
+            // perform matrix multiplication
+            DATA_TYPE PxA[N * N];
+            DATA_TYPE LxU[N * N];
+            matrixMultiply(PxA, Pmat, A);
+            matrixMultiply(LxU, L, U);
+
+            // check for equality of matrices
+            if (!checkRelativeError(PxA, LxU, (DATA_TYPE)MAX_ERROR)) {
+                printf("> ERROR: accuracy check failed for matrix number %05d..\n", i + 1);
+                err_count++;
+            }
+        }
+        else if (h_infoArray[i] > 0) {
+            printf("> execution for matrix %05d is successful, but U is singular and "
+                   "U(%d,%d) = 0..\n",
+                   i + 1,
+                   h_infoArray[i] - 1,
+                   h_infoArray[i] - 1);
+        }
+        else // (h_infoArray[i] < 0)
+        {
+            printf("> ERROR: matrix %05d has an illegal value at index %d = %lf..\n",
+                   i + 1,
+                   -h_infoArray[i],
+                   *(h_AarrayInput + (i * N * N) + (-h_infoArray[i])));
+        }
+    }
+
+    // free device variables
+    checkCudaErrors(cudaFree(d_ptr_array));
+    checkCudaErrors(cudaFree(d_infoArray));
+    checkCudaErrors(cudaFree(d_pivotArray));
+
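The verification loop above branches on getrfBatched()'s per-matrix info codes, read the way the sample reads them: 0 means the factorization succeeded, a positive value i means U(i-1, i-1) is exactly zero (so U is singular and a subsequent solve would divide by zero), and a negative value flags an illegal input located by -info. A compact summary, mirroring only the sample's interpretation (a sketch, not library code):

// info codes as the verification loop above interprets them:
//   info == 0 : success, LU and pivots valid
//   info >  0 : U(info-1, info-1) == 0, so U is singular
//   info <  0 : an illegal value was encountered; -info locates it
static const char *getrfInfoToString(int info)
{
    return (info == 0) ? "success" : (info > 0) ? "singular U" : "illegal value";
}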
checkCudaErrors(cudaFree(d_Aarray)); + + // free host variables + if (h_infoArray) + free(h_infoArray); + if (h_pivotArray) + free(h_pivotArray); + if (h_AarrayOutput) + free(h_AarrayOutput); + if (h_AarrayInput) + free(h_AarrayInput); + + // destroy cuBLAS handle + status = cublasDestroy(handle); + if (status != CUBLAS_STATUS_SUCCESS) { + printf("> ERROR: cuBLAS uninitialization failed..\n"); + return (EXIT_FAILURE); + } + + if (err_count > 0) { + printf("> TEST FAILED for %d matrices, with precision: %g\n", err_count, MAX_ERROR); + return (EXIT_FAILURE); + } + + printf("> TEST SUCCESSFUL, with precision: %g\n", MAX_ERROR); + return (EXIT_SUCCESS); } diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT/simpleCUFFT.cu b/Samples/4_CUDA_Libraries/simpleCUFFT/simpleCUFFT.cu index 29184f82..6180b990 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT/simpleCUFFT.cu +++ b/Samples/4_CUDA_Libraries/simpleCUFFT/simpleCUFFT.cu @@ -41,12 +41,11 @@ #include // Complex data type -typedef float2 Complex; +typedef float2 Complex; static __device__ __host__ inline Complex ComplexAdd(Complex, Complex); static __device__ __host__ inline Complex ComplexScale(Complex, float); static __device__ __host__ inline Complex ComplexMul(Complex, Complex); -static __global__ void ComplexPointwiseMulAndScale(Complex *, const Complex *, - int, float); +static __global__ void ComplexPointwiseMulAndScale(Complex *, const Complex *, int, float); // Filtering functions void Convolve(const Complex *, int, const Complex *, int, Complex *); @@ -59,7 +58,7 @@ int PadData(const Complex *, Complex **, int, const Complex *, Complex **, int); void runTest(int argc, char **argv); // The filter size is assumed to be a number smaller than the signal size -#define SIGNAL_SIZE 50 +#define SIGNAL_SIZE 50 #define FILTER_KERNEL_SIZE 11 //////////////////////////////////////////////////////////////////////////////// @@ -70,152 +69,143 @@ int main(int argc, char **argv) { runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - printf("[simpleCUFFT] is starting...\n"); +void runTest(int argc, char **argv) +{ + printf("[simpleCUFFT] is starting...\n"); - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - // Allocate host memory for the signal - Complex *h_signal = - reinterpret_cast(malloc(sizeof(Complex) * SIGNAL_SIZE)); + // Allocate host memory for the signal + Complex *h_signal = reinterpret_cast(malloc(sizeof(Complex) * SIGNAL_SIZE)); - // Initialize the memory for the signal - for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) { - h_signal[i].x = rand() / static_cast(RAND_MAX); - h_signal[i].y = 0; - } + // Initialize the memory for the signal + for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) { + h_signal[i].x = rand() / static_cast(RAND_MAX); + h_signal[i].y = 0; + } - // Allocate host memory for the filter - Complex *h_filter_kernel = - reinterpret_cast(malloc(sizeof(Complex) * FILTER_KERNEL_SIZE)); + // Allocate host memory for the filter + Complex *h_filter_kernel = reinterpret_cast(malloc(sizeof(Complex) * FILTER_KERNEL_SIZE)); - // Initialize the memory for the filter - for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i) { - h_filter_kernel[i].x = rand() / static_cast(RAND_MAX); - h_filter_kernel[i].y = 0; - } + // Initialize the memory for the filter + for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i) { + h_filter_kernel[i].x = rand() / static_cast(RAND_MAX); + h_filter_kernel[i].y = 0; + } - // Pad signal and filter kernel - Complex *h_padded_signal; - Complex *h_padded_filter_kernel; - int new_size = - PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel, - &h_padded_filter_kernel, FILTER_KERNEL_SIZE); - int mem_size = sizeof(Complex) * new_size; + // Pad signal and filter kernel + Complex *h_padded_signal; + Complex *h_padded_filter_kernel; + int new_size = + PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel, &h_padded_filter_kernel, FILTER_KERNEL_SIZE); + int mem_size = sizeof(Complex) * new_size; - // Allocate device memory for signal - Complex *d_signal; - checkCudaErrors(cudaMalloc(reinterpret_cast(&d_signal), mem_size)); - // Copy host memory to device - checkCudaErrors( - cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice)); + // Allocate device memory for signal + Complex *d_signal; + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_signal), mem_size)); + // Copy host memory to device + checkCudaErrors(cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice)); - // Allocate device memory for filter kernel - Complex *d_filter_kernel; - checkCudaErrors( - cudaMalloc(reinterpret_cast(&d_filter_kernel), mem_size)); + // Allocate device memory for filter kernel + Complex *d_filter_kernel; + checkCudaErrors(cudaMalloc(reinterpret_cast(&d_filter_kernel), mem_size)); - // Copy host memory to device - checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size, - cudaMemcpyHostToDevice)); + // Copy host memory to device + checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size, cudaMemcpyHostToDevice)); - // CUFFT plan simple API - cufftHandle plan; - checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1)); + // CUFFT plan simple API + cufftHandle plan; + checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1)); - // CUFFT plan advanced API - cufftHandle plan_adv; - size_t workSize; - long long int new_size_long 
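One detail worth noting in the block that follows: cuFFT transforms are unnormalized, so a forward transform followed by an inverse multiplies the data by the transform length. Rather than scaling in a separate pass, the sample folds the required 1/new_size into ComplexPointwiseMulAndScale, using the identity conv = IFFT(FFT(a) .* FFT(b)) / n for length-n circular convolution. A host-side sketch of the same pointwise step, reusing the ComplexMul/ComplexScale helpers defined at the bottom of this file (they are __device__ __host__, so host-callable):

// Host reference for the device kernel: pointwise complex product with 1/n folded in.
static void pointwiseMulScaleHost(Complex *a, const Complex *b, int n)
{
    for (int i = 0; i < n; ++i)
        a[i] = ComplexScale(ComplexMul(a[i], b[i]), 1.0f / n);
}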
= new_size;

-  checkCudaErrors(cufftCreate(&plan_adv));
-  checkCudaErrors(cufftXtMakePlanMany(plan_adv, 1, &new_size_long, NULL, 1, 1,
-                                      CUDA_C_32F, NULL, 1, 1, CUDA_C_32F, 1,
-                                      &workSize, CUDA_C_32F));
-  printf("Temporary buffer size %li bytes\n", workSize);
+    checkCudaErrors(cufftCreate(&plan_adv));
+    checkCudaErrors(cufftXtMakePlanMany(
+        plan_adv, 1, &new_size_long, NULL, 1, 1, CUDA_C_32F, NULL, 1, 1, CUDA_C_32F, 1, &workSize, CUDA_C_32F));
+    printf("Temporary buffer size %li bytes\n", workSize);

-  // Transform signal and kernel
-  printf("Transforming signal cufftExecC2C\n");
-  checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
-                               reinterpret_cast<cufftComplex *>(d_signal),
-                               CUFFT_FORWARD));
-  checkCudaErrors(cufftExecC2C(
-      plan_adv, reinterpret_cast<cufftComplex *>(d_filter_kernel),
-      reinterpret_cast<cufftComplex *>(d_filter_kernel), CUFFT_FORWARD));
+    // Transform signal and kernel
+    printf("Transforming signal cufftExecC2C\n");
+    checkCudaErrors(cufftExecC2C(
+        plan, reinterpret_cast<cufftComplex *>(d_signal), reinterpret_cast<cufftComplex *>(d_signal), CUFFT_FORWARD));
+    checkCudaErrors(cufftExecC2C(plan_adv,
+                                 reinterpret_cast<cufftComplex *>(d_filter_kernel),
+                                 reinterpret_cast<cufftComplex *>(d_filter_kernel),
+                                 CUFFT_FORWARD));

-  // Multiply the coefficients together and normalize the result
-  printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
-  ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size,
-                                           1.0f / new_size);
+    // Multiply the coefficients together and normalize the result
+    printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
+    ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size, 1.0f / new_size);

-  // Check if kernel execution generated an error
-  getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
+    // Check if kernel execution generated an error
+    getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");

-  // Transform signal back
-  printf("Transforming signal back cufftExecC2C\n");
-  checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
-                               reinterpret_cast<cufftComplex *>(d_signal),
-                               CUFFT_INVERSE));
+    // Transform signal back
+    printf("Transforming signal back cufftExecC2C\n");
+    checkCudaErrors(cufftExecC2C(
+        plan, reinterpret_cast<cufftComplex *>(d_signal), reinterpret_cast<cufftComplex *>(d_signal), CUFFT_INVERSE));

-  // Copy device memory to host
-  Complex *h_convolved_signal = h_padded_signal;
-  checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size,
-                             cudaMemcpyDeviceToHost));
+    // Copy device memory to host
+    Complex *h_convolved_signal = h_padded_signal;
+    checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost));

-  // Allocate host memory for the convolution result
-  Complex *h_convolved_signal_ref =
-      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
+    // Allocate host memory for the convolution result
+    Complex *h_convolved_signal_ref = reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));

-  // Convolve on the host
-  Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE,
-           h_convolved_signal_ref);
+    // Convolve on the host
+    Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref);

-  // check result
-  bool bTestResult = sdkCompareL2fe(
-      reinterpret_cast<float *>(h_convolved_signal_ref),
-      reinterpret_cast<float *>(h_convolved_signal), 2 * SIGNAL_SIZE, 1e-5f);
+    // check result
+    bool bTestResult = sdkCompareL2fe(reinterpret_cast<float *>(h_convolved_signal_ref),
+
reinterpret_cast(h_convolved_signal), + 2 * SIGNAL_SIZE, + 1e-5f); - // Destroy CUFFT context - checkCudaErrors(cufftDestroy(plan)); - checkCudaErrors(cufftDestroy(plan_adv)); + // Destroy CUFFT context + checkCudaErrors(cufftDestroy(plan)); + checkCudaErrors(cufftDestroy(plan_adv)); - // cleanup memory - free(h_signal); - free(h_filter_kernel); - free(h_padded_signal); - free(h_padded_filter_kernel); - free(h_convolved_signal_ref); - checkCudaErrors(cudaFree(d_signal)); - checkCudaErrors(cudaFree(d_filter_kernel)); + // cleanup memory + free(h_signal); + free(h_filter_kernel); + free(h_padded_signal); + free(h_padded_filter_kernel); + free(h_convolved_signal_ref); + checkCudaErrors(cudaFree(d_signal)); + checkCudaErrors(cudaFree(d_filter_kernel)); - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } // Pad data -int PadData(const Complex *signal, Complex **padded_signal, int signal_size, - const Complex *filter_kernel, Complex **padded_filter_kernel, - int filter_kernel_size) { - int minRadius = filter_kernel_size / 2; - int maxRadius = filter_kernel_size - minRadius; - int new_size = signal_size + maxRadius; +int PadData(const Complex *signal, + Complex **padded_signal, + int signal_size, + const Complex *filter_kernel, + Complex **padded_filter_kernel, + int filter_kernel_size) +{ + int minRadius = filter_kernel_size / 2; + int maxRadius = filter_kernel_size - minRadius; + int new_size = signal_size + maxRadius; - // Pad signal - Complex *new_data = - reinterpret_cast(malloc(sizeof(Complex) * new_size)); - memcpy(new_data + 0, signal, signal_size * sizeof(Complex)); - memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex)); - *padded_signal = new_data; + // Pad signal + Complex *new_data = reinterpret_cast(malloc(sizeof(Complex) * new_size)); + memcpy(new_data + 0, signal, signal_size * sizeof(Complex)); + memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex)); + *padded_signal = new_data; - // Pad filter - new_data = reinterpret_cast(malloc(sizeof(Complex) * new_size)); - memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex)); - memset(new_data + maxRadius, 0, - (new_size - filter_kernel_size) * sizeof(Complex)); - memcpy(new_data + new_size - minRadius, filter_kernel, - minRadius * sizeof(Complex)); - *padded_filter_kernel = new_data; + // Pad filter + new_data = reinterpret_cast(malloc(sizeof(Complex) * new_size)); + memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex)); + memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex)); + memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex)); + *padded_filter_kernel = new_data; - return new_size; + return new_size; } //////////////////////////////////////////////////////////////////////////////// @@ -223,27 +213,29 @@ int PadData(const Complex *signal, Complex **padded_signal, int signal_size, //////////////////////////////////////////////////////////////////////////////// // Computes convolution on the host -void Convolve(const Complex *signal, int signal_size, - const Complex *filter_kernel, int filter_kernel_size, - Complex *filtered_signal) { - int minRadius = filter_kernel_size / 2; - int maxRadius = filter_kernel_size - minRadius; +void Convolve(const Complex *signal, + int signal_size, + const Complex *filter_kernel, + int filter_kernel_size, + Complex *filtered_signal) +{ + int minRadius = filter_kernel_size / 2; + int maxRadius = 
filter_kernel_size - minRadius; - // Loop over output element indices - for (int i = 0; i < signal_size; ++i) { - filtered_signal[i].x = filtered_signal[i].y = 0; + // Loop over output element indices + for (int i = 0; i < signal_size; ++i) { + filtered_signal[i].x = filtered_signal[i].y = 0; - // Loop over convolution indices - for (int j = -maxRadius + 1; j <= minRadius; ++j) { - int k = i + j; + // Loop over convolution indices + for (int j = -maxRadius + 1; j <= minRadius; ++j) { + int k = i + j; - if (k >= 0 && k < signal_size) { - filtered_signal[i] = - ComplexAdd(filtered_signal[i], - ComplexMul(signal[k], filter_kernel[minRadius - j])); - } + if (k >= 0 && k < signal_size) { + filtered_signal[i] = + ComplexAdd(filtered_signal[i], ComplexMul(signal[k], filter_kernel[minRadius - j])); + } + } } - } } //////////////////////////////////////////////////////////////////////////////// @@ -251,36 +243,39 @@ void Convolve(const Complex *signal, int signal_size, //////////////////////////////////////////////////////////////////////////////// // Complex addition -static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) { - Complex c; - c.x = a.x + b.x; - c.y = a.y + b.y; - return c; +static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) +{ + Complex c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; } // Complex scale -static __device__ __host__ inline Complex ComplexScale(Complex a, float s) { - Complex c; - c.x = s * a.x; - c.y = s * a.y; - return c; +static __device__ __host__ inline Complex ComplexScale(Complex a, float s) +{ + Complex c; + c.x = s * a.x; + c.y = s * a.y; + return c; } // Complex multiplication -static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) { - Complex c; - c.x = a.x * b.x - a.y * b.y; - c.y = a.x * b.y + a.y * b.x; - return c; +static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) +{ + Complex c; + c.x = a.x * b.x - a.y * b.y; + c.y = a.x * b.y + a.y * b.x; + return c; } // Complex pointwise multiplication -static __global__ void ComplexPointwiseMulAndScale(Complex *a, const Complex *b, - int size, float scale) { - const int numThreads = blockDim.x * gridDim.x; - const int threadID = blockIdx.x * blockDim.x + threadIdx.x; +static __global__ void ComplexPointwiseMulAndScale(Complex *a, const Complex *b, int size, float scale) +{ + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = threadID; i < size; i += numThreads) { - a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale); - } + for (int i = threadID; i < size; i += numThreads) { + a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale); + } } diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/simpleCUFFT_2d_MGPU.cu b/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/simpleCUFFT_2d_MGPU.cu index 78202052..11fe5fdd 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/simpleCUFFT_2d_MGPU.cu +++ b/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/simpleCUFFT_2d_MGPU.cu @@ -38,343 +38,338 @@ // System includes -#include -#include - -#include #include +#include +#include +#include // CUDA runtime #include -//CUFFT Header file +// CUFFT Header file #include // helper functions and utilities to work with CUDA -#include #include +#include // Complex data type typedef float2 Complex; // Data configuration const int GPU_COUNT = 2; -const int BSZ_Y = 4; -const int BSZ_X = 4; +const int BSZ_Y = 4; +const int BSZ_X = 4; // Forward Declaration void 
solvePoissonEquation(cudaLibXtDesc *, cudaLibXtDesc *, float **, int, int); -__global__ void solvePoisson(cufftComplex *, cufftComplex *, float *, int, int, - int n_gpu); +__global__ void solvePoisson(cufftComplex *, cufftComplex *, float *, int, int, int n_gpu); /////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf( - "\nPoisson equation using CUFFT library on Multiple GPUs is " - "starting...\n\n"); +int main(int argc, char **argv) +{ + printf("\nPoisson equation using CUFFT library on Multiple GPUs is " + "starting...\n\n"); - int GPU_N; - checkCudaErrors(cudaGetDeviceCount(&GPU_N)); + int GPU_N; + checkCudaErrors(cudaGetDeviceCount(&GPU_N)); - if (GPU_N < GPU_COUNT) { - printf("No. of GPU on node %d\n", GPU_N); - printf("Two GPUs are required to run simpleCUFFT_2d_MGPU sample code\n"); - exit(EXIT_WAIVED); - } - - int *major_minor = (int *)malloc(sizeof(int) * GPU_N * 2); - int found2IdenticalGPUs = 0; - int nGPUs = 2; - int *whichGPUs; - whichGPUs = (int *)malloc(sizeof(int) * nGPUs); - - for (int i = 0; i < GPU_N; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); - major_minor[i * 2] = deviceProp.major; - major_minor[i * 2 + 1] = deviceProp.minor; - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, - deviceProp.name, deviceProp.major, deviceProp.minor); - } - - for (int i = 0; i < GPU_N; i++) { - for (int j = i + 1; j < GPU_N; j++) { - if ((major_minor[i * 2] == major_minor[j * 2]) && - (major_minor[i * 2 + 1] == major_minor[j * 2 + 1])) { - whichGPUs[0] = i; - whichGPUs[1] = j; - found2IdenticalGPUs = 1; - break; - } - } - if (found2IdenticalGPUs) { - break; - } - } - - free(major_minor); - if (!found2IdenticalGPUs) { - printf( - "No Two GPUs with same architecture found\nWaiving simpleCUFFT_2d_MGPU " - "sample\n"); - exit(EXIT_WAIVED); - } - - int N = 64; - float xMAX = 1.0f, xMIN = 0.0f, yMIN = 0.0f, h = (xMAX - xMIN) / ((float)N), - s = 0.1f, s2 = s * s; - float *x, *y, *f, *u_a, r2; - - x = (float *)malloc(sizeof(float) * N * N); - y = (float *)malloc(sizeof(float) * N * N); - f = (float *)malloc(sizeof(float) * N * N); - u_a = (float *)malloc(sizeof(float) * N * N); - - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) { - x[N * j + i] = xMIN + i * h; - y[N * j + i] = yMIN + j * h; - r2 = (x[N * j + i] - 0.5f) * (x[N * j + i] - 0.5f) + - (y[N * j + i] - 0.5f) * (y[N * j + i] - 0.5f); - f[N * j + i] = (r2 - 2 * s2) / (s2 * s2) * exp(-r2 / (2 * s2)); - u_a[N * j + i] = exp(-r2 / (2 * s2)); // analytical solution + if (GPU_N < GPU_COUNT) { + printf("No. 
of GPU on node %d\n", GPU_N); + printf("Two GPUs are required to run simpleCUFFT_2d_MGPU sample code\n"); + exit(EXIT_WAIVED); } - float *k, *d_k[GPU_COUNT]; - k = (float *)malloc(sizeof(float) * N); - for (int i = 0; i <= N / 2; i++) { - k[i] = i * 2 * (float)M_PI; - } - for (int i = N / 2 + 1; i < N; i++) { - k[i] = (i - N) * 2 * (float)M_PI; - } + int *major_minor = (int *)malloc(sizeof(int) * GPU_N * 2); + int found2IdenticalGPUs = 0; + int nGPUs = 2; + int *whichGPUs; + whichGPUs = (int *)malloc(sizeof(int) * nGPUs); - // Create a complex variable on host - Complex *h_f = (Complex *)malloc(sizeof(Complex) * N * N); + for (int i = 0; i < GPU_N; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); + major_minor[i * 2] = deviceProp.major; + major_minor[i * 2 + 1] = deviceProp.minor; + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", + i, + deviceProp.name, + deviceProp.major, + deviceProp.minor); + } - // Initialize the memory for the signal - for (int i = 0; i < (N * N); i++) { - h_f[i].x = f[i]; - h_f[i].y = 0.0f; - } + for (int i = 0; i < GPU_N; i++) { + for (int j = i + 1; j < GPU_N; j++) { + if ((major_minor[i * 2] == major_minor[j * 2]) && (major_minor[i * 2 + 1] == major_minor[j * 2 + 1])) { + whichGPUs[0] = i; + whichGPUs[1] = j; + found2IdenticalGPUs = 1; + break; + } + } + if (found2IdenticalGPUs) { + break; + } + } - // cufftCreate() - Create an empty plan - cufftResult result; - cufftHandle planComplex; - result = cufftCreate(&planComplex); - if (result != CUFFT_SUCCESS) { - printf("cufftCreate failed\n"); - exit(EXIT_FAILURE); - } + free(major_minor); + if (!found2IdenticalGPUs) { + printf("No Two GPUs with same architecture found\nWaiving simpleCUFFT_2d_MGPU " + "sample\n"); + exit(EXIT_WAIVED); + } - // cufftXtSetGPUs() - Define which GPUs to use - result = cufftXtSetGPUs(planComplex, nGPUs, whichGPUs); + int N = 64; + float xMAX = 1.0f, xMIN = 0.0f, yMIN = 0.0f, h = (xMAX - xMIN) / ((float)N), s = 0.1f, s2 = s * s; + float *x, *y, *f, *u_a, r2; - if (result == CUFFT_INVALID_DEVICE) { - printf("This sample requires two GPUs on the same board.\n"); - printf("No such board was found. 
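Both multi-GPU CUFFT samples open with the same scan: record each device's compute capability, then search for the first pair with identical (major, minor) and waive the sample if none exists, since cufftXtSetGPUs() needs matching devices. The pairwise loop above is O(n^2); grouping by capability does the same job in one pass. A minimal sketch, assuming helper_cuda.h for checkCudaErrors() (not part of the patch):

#include <map>
#include <utility>
#include <vector>

// Sketch: group device ids by compute capability and take the first pair.
static bool findIdenticalPair(int gpu_n, int out[2])
{
    std::map<std::pair<int, int>, std::vector<int>> byCC; // (major, minor) -> device ids
    for (int d = 0; d < gpu_n; ++d) {
        cudaDeviceProp p;
        checkCudaErrors(cudaGetDeviceProperties(&p, d));
        byCC[{p.major, p.minor}].push_back(d);
    }
    for (auto &kv : byCC) {
        if (kv.second.size() >= 2) {
            out[0] = kv.second[0]; // two devices with matching compute capability
            out[1] = kv.second[1];
            return true;
        }
    }
    return false; // caller should waive the sample, as the code above does
}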
Waiving sample.\n"); - exit(EXIT_WAIVED); - } else if (result != CUFFT_SUCCESS) { - printf("cufftXtSetGPUs failed\n"); - exit(EXIT_FAILURE); - } + x = (float *)malloc(sizeof(float) * N * N); + y = (float *)malloc(sizeof(float) * N * N); + f = (float *)malloc(sizeof(float) * N * N); + u_a = (float *)malloc(sizeof(float) * N * N); - // Print the device information to run the code - printf("\nRunning on GPUs\n"); - for (int i = 0; i < 2; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, whichGPUs[i])); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", - whichGPUs[i], deviceProp.name, deviceProp.major, deviceProp.minor); - } + for (int j = 0; j < N; j++) + for (int i = 0; i < N; i++) { + x[N * j + i] = xMIN + i * h; + y[N * j + i] = yMIN + j * h; + r2 = (x[N * j + i] - 0.5f) * (x[N * j + i] - 0.5f) + (y[N * j + i] - 0.5f) * (y[N * j + i] - 0.5f); + f[N * j + i] = (r2 - 2 * s2) / (s2 * s2) * exp(-r2 / (2 * s2)); + u_a[N * j + i] = exp(-r2 / (2 * s2)); // analytical solution + } - size_t *worksize; - worksize = (size_t *)malloc(sizeof(size_t) * nGPUs); + float *k, *d_k[GPU_COUNT]; + k = (float *)malloc(sizeof(float) * N); + for (int i = 0; i <= N / 2; i++) { + k[i] = i * 2 * (float)M_PI; + } + for (int i = N / 2 + 1; i < N; i++) { + k[i] = (i - N) * 2 * (float)M_PI; + } - // cufftMakePlan2d() - Create the plan - result = cufftMakePlan2d(planComplex, N, N, CUFFT_C2C, worksize); - if (result != CUFFT_SUCCESS) { - printf("*MakePlan* failed\n"); - exit(EXIT_FAILURE); - } + // Create a complex variable on host + Complex *h_f = (Complex *)malloc(sizeof(Complex) * N * N); - for (int i = 0; i < nGPUs; i++) { - cudaSetDevice(whichGPUs[i]); - cudaMalloc((void **)&d_k[i], sizeof(float) * N); - cudaMemcpy(d_k[i], k, sizeof(float) * N, cudaMemcpyHostToDevice); - } + // Initialize the memory for the signal + for (int i = 0; i < (N * N); i++) { + h_f[i].x = f[i]; + h_f[i].y = 0.0f; + } - // Create a variable on device - // d_f - variable on device to store the input data - // d_d_f - variable that store the natural order of d_f data - // d_out - device output - cudaLibXtDesc *d_f, *d_d_f, *d_out; + // cufftCreate() - Create an empty plan + cufftResult result; + cufftHandle planComplex; + result = cufftCreate(&planComplex); + if (result != CUFFT_SUCCESS) { + printf("cufftCreate failed\n"); + exit(EXIT_FAILURE); + } - // cufftXtMalloc() - Malloc data on multiple GPUs + // cufftXtSetGPUs() - Define which GPUs to use + result = cufftXtSetGPUs(planComplex, nGPUs, whichGPUs); - result = cufftXtMalloc(planComplex, (cudaLibXtDesc **)&d_f, - CUFFT_XT_FORMAT_INPLACE); - if (result != CUFFT_SUCCESS) { - printf("*XtMalloc failed\n"); - exit(EXIT_FAILURE); - } + if (result == CUFFT_INVALID_DEVICE) { + printf("This sample requires two GPUs on the same board.\n"); + printf("No such board was found. 
Waiving sample.\n"); + exit(EXIT_WAIVED); + } + else if (result != CUFFT_SUCCESS) { + printf("cufftXtSetGPUs failed\n"); + exit(EXIT_FAILURE); + } - result = cufftXtMalloc(planComplex, (cudaLibXtDesc **)&d_d_f, - CUFFT_XT_FORMAT_INPLACE); - if (result != CUFFT_SUCCESS) { - printf("*XtMalloc failed\n"); - exit(EXIT_FAILURE); - } + // Print the device information to run the code + printf("\nRunning on GPUs\n"); + for (int i = 0; i < 2; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, whichGPUs[i])); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", + whichGPUs[i], + deviceProp.name, + deviceProp.major, + deviceProp.minor); + } - result = cufftXtMalloc(planComplex, (cudaLibXtDesc **)&d_out, - CUFFT_XT_FORMAT_INPLACE); - if (result != CUFFT_SUCCESS) { - printf("*XtMalloc failed\n"); - exit(EXIT_FAILURE); - } + size_t *worksize; + worksize = (size_t *)malloc(sizeof(size_t) * nGPUs); - // cufftXtMemcpy() - Copy the data from host to device - result = cufftXtMemcpy(planComplex, d_f, h_f, CUFFT_COPY_HOST_TO_DEVICE); - if (result != CUFFT_SUCCESS) { - printf("*XtMemcpy failed\n"); - exit(EXIT_FAILURE); - } + // cufftMakePlan2d() - Create the plan + result = cufftMakePlan2d(planComplex, N, N, CUFFT_C2C, worksize); + if (result != CUFFT_SUCCESS) { + printf("*MakePlan* failed\n"); + exit(EXIT_FAILURE); + } - // cufftXtExecDescriptorC2C() - Execute FFT on data on multiple GPUs - printf("Forward 2d FFT on multiple GPUs\n"); - result = cufftXtExecDescriptorC2C(planComplex, d_f, d_f, CUFFT_FORWARD); - if (result != CUFFT_SUCCESS) { - printf("*XtExecC2C failed\n"); - exit(EXIT_FAILURE); - } + for (int i = 0; i < nGPUs; i++) { + cudaSetDevice(whichGPUs[i]); + cudaMalloc((void **)&d_k[i], sizeof(float) * N); + cudaMemcpy(d_k[i], k, sizeof(float) * N, cudaMemcpyHostToDevice); + } - // cufftXtMemcpy() - Copy the data to natural order on GPUs - result = cufftXtMemcpy(planComplex, d_d_f, d_f, CUFFT_COPY_DEVICE_TO_DEVICE); - if (result != CUFFT_SUCCESS) { - printf("*XtMemcpy failed\n"); - exit(EXIT_FAILURE); - } + // Create a variable on device + // d_f - variable on device to store the input data + // d_d_f - variable that store the natural order of d_f data + // d_out - device output + cudaLibXtDesc *d_f, *d_d_f, *d_out; - printf("Solve Poisson Equation\n"); - solvePoissonEquation(d_d_f, d_out, d_k, N, nGPUs); + // cufftXtMalloc() - Malloc data on multiple GPUs - printf("Inverse 2d FFT on multiple GPUs\n"); - // cufftXtExecDescriptorC2C() - Execute inverse FFT on data on multiple GPUs - result = cufftXtExecDescriptorC2C(planComplex, d_out, d_out, CUFFT_INVERSE); - if (result != CUFFT_SUCCESS) { - printf("*XtExecC2C failed\n"); - exit(EXIT_FAILURE); - } + result = cufftXtMalloc(planComplex, (cudaLibXtDesc **)&d_f, CUFFT_XT_FORMAT_INPLACE); + if (result != CUFFT_SUCCESS) { + printf("*XtMalloc failed\n"); + exit(EXIT_FAILURE); + } - // Create a variable on host to copy the data from device - // h_d_out - variable store the output of device - Complex *h_d_out = (Complex *)malloc(sizeof(Complex) * N * N); + result = cufftXtMalloc(planComplex, (cudaLibXtDesc **)&d_d_f, CUFFT_XT_FORMAT_INPLACE); + if (result != CUFFT_SUCCESS) { + printf("*XtMalloc failed\n"); + exit(EXIT_FAILURE); + } - // cufftXtMemcpy() - Copy data from multiple GPUs to host - result = - cufftXtMemcpy(planComplex, h_d_out, d_out, CUFFT_COPY_DEVICE_TO_HOST); - if (result != CUFFT_SUCCESS) { - printf("*XtMemcpy failed\n"); - exit(EXIT_FAILURE); - } + result = cufftXtMalloc(planComplex, 
(cudaLibXtDesc **)&d_out, CUFFT_XT_FORMAT_INPLACE); + if (result != CUFFT_SUCCESS) { + printf("*XtMalloc failed\n"); + exit(EXIT_FAILURE); + } - float *out = (float *)malloc(sizeof(float) * N * N); - float constant = h_d_out[0].x / N * N; - for (int i = 0; i < N * N; i++) { - // subtract u[0] to force the arbitrary constant to be 0 - out[i] = (h_d_out[i].x / (N * N)) - constant; - } + // cufftXtMemcpy() - Copy the data from host to device + result = cufftXtMemcpy(planComplex, d_f, h_f, CUFFT_COPY_HOST_TO_DEVICE); + if (result != CUFFT_SUCCESS) { + printf("*XtMemcpy failed\n"); + exit(EXIT_FAILURE); + } - // cleanup memory + // cufftXtExecDescriptorC2C() - Execute FFT on data on multiple GPUs + printf("Forward 2d FFT on multiple GPUs\n"); + result = cufftXtExecDescriptorC2C(planComplex, d_f, d_f, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) { + printf("*XtExecC2C failed\n"); + exit(EXIT_FAILURE); + } - free(h_f); - free(k); - free(out); - free(h_d_out); - free(x); - free(whichGPUs); - free(y); - free(f); - free(u_a); - free(worksize); + // cufftXtMemcpy() - Copy the data to natural order on GPUs + result = cufftXtMemcpy(planComplex, d_d_f, d_f, CUFFT_COPY_DEVICE_TO_DEVICE); + if (result != CUFFT_SUCCESS) { + printf("*XtMemcpy failed\n"); + exit(EXIT_FAILURE); + } - // cudaXtFree() - Free GPU memory - for (int i = 0; i < GPU_COUNT; i++) { - cudaFree(d_k[i]); - } - result = cufftXtFree(d_out); - if (result != CUFFT_SUCCESS) { - printf("*XtFree failed\n"); - exit(EXIT_FAILURE); - } - result = cufftXtFree(d_f); - if (result != CUFFT_SUCCESS) { - printf("*XtFree failed\n"); - exit(EXIT_FAILURE); - } - result = cufftXtFree(d_d_f); - if (result != CUFFT_SUCCESS) { - printf("*XtFree failed\n"); - exit(EXIT_FAILURE); - } + printf("Solve Poisson Equation\n"); + solvePoissonEquation(d_d_f, d_out, d_k, N, nGPUs); - // cufftDestroy() - Destroy FFT plan - result = cufftDestroy(planComplex); - if (result != CUFFT_SUCCESS) { - printf("cufftDestroy failed: code %d\n", (int)result); - exit(EXIT_FAILURE); - } + printf("Inverse 2d FFT on multiple GPUs\n"); + // cufftXtExecDescriptorC2C() - Execute inverse FFT on data on multiple GPUs + result = cufftXtExecDescriptorC2C(planComplex, d_out, d_out, CUFFT_INVERSE); + if (result != CUFFT_SUCCESS) { + printf("*XtExecC2C failed\n"); + exit(EXIT_FAILURE); + } - exit(EXIT_SUCCESS); + // Create a variable on host to copy the data from device + // h_d_out - variable store the output of device + Complex *h_d_out = (Complex *)malloc(sizeof(Complex) * N * N); + + // cufftXtMemcpy() - Copy data from multiple GPUs to host + result = cufftXtMemcpy(planComplex, h_d_out, d_out, CUFFT_COPY_DEVICE_TO_HOST); + if (result != CUFFT_SUCCESS) { + printf("*XtMemcpy failed\n"); + exit(EXIT_FAILURE); + } + + float *out = (float *)malloc(sizeof(float) * N * N); + float constant = h_d_out[0].x / N * N; + for (int i = 0; i < N * N; i++) { + // subtract u[0] to force the arbitrary constant to be 0 + out[i] = (h_d_out[i].x / (N * N)) - constant; + } + + // cleanup memory + + free(h_f); + free(k); + free(out); + free(h_d_out); + free(x); + free(whichGPUs); + free(y); + free(f); + free(u_a); + free(worksize); + + // cudaXtFree() - Free GPU memory + for (int i = 0; i < GPU_COUNT; i++) { + cudaFree(d_k[i]); + } + result = cufftXtFree(d_out); + if (result != CUFFT_SUCCESS) { + printf("*XtFree failed\n"); + exit(EXIT_FAILURE); + } + result = cufftXtFree(d_f); + if (result != CUFFT_SUCCESS) { + printf("*XtFree failed\n"); + exit(EXIT_FAILURE); + } + result = cufftXtFree(d_d_f); + if (result != 
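The solvePoisson kernel that follows implements the spectral solve used by this sample: Fourier-transforming the Poisson equation laplacian(u) = f gives -(kx^2 + ky^2) * u_hat = f_hat, so each mode is recovered as u_hat = -f_hat / k^2. The k^2 = 0 mode is pinned to 1 to avoid a division by zero, and the resulting arbitrary constant is removed after the inverse transform by subtracting the first element, as main() does above. A single-GPU host sketch of the same update (u_hat and f_hat are hypothetical cufftComplex arrays; k is the wavenumber table built in main()):

// u_hat = -f_hat / (k[i]^2 + k[j]^2), with the zero mode pinned.
for (int j = 0; j < N; j++) {
    for (int i = 0; i < N; i++) {
        float k2 = k[i] * k[i] + k[j] * k[j];
        if (i == 0 && j == 0)
            k2 = 1.0f; // pinned; the constant offset is subtracted after the inverse FFT
        u_hat[j * N + i].x = -f_hat[j * N + i].x / k2;
        u_hat[j * N + i].y = -f_hat[j * N + i].y / k2;
    }
}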
CUFFT_SUCCESS) {
+        printf("*XtFree failed\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // cufftDestroy() - Destroy FFT plan
+    result = cufftDestroy(planComplex);
+    if (result != CUFFT_SUCCESS) {
+        printf("cufftDestroy failed: code %d\n", (int)result);
+        exit(EXIT_FAILURE);
+    }
+
+    exit(EXIT_SUCCESS);
}

////////////////////////////////////////////////////////////////////////////////////
// Launch kernel on multiple GPUs
///////////////////////////////////////////////////////////////////////////////////
-void solvePoissonEquation(cudaLibXtDesc *d_ft, cudaLibXtDesc *d_ft_k, float **k,
-                          int N, int nGPUs) {
-  int device;
-  dim3 dimGrid(int(N / BSZ_X), int((N / 2) / BSZ_Y));
-  dim3 dimBlock(BSZ_X, BSZ_Y);
+void solvePoissonEquation(cudaLibXtDesc *d_ft, cudaLibXtDesc *d_ft_k, float **k, int N, int nGPUs)
+{
+    int device;
+    dim3 dimGrid(int(N / BSZ_X), int((N / 2) / BSZ_Y));
+    dim3 dimBlock(BSZ_X, BSZ_Y);

-  for (int i = 0; i < nGPUs; i++) {
-    device = d_ft_k->descriptor->GPUs[i];
-    cudaSetDevice(device);
-    solvePoisson<<<dimGrid, dimBlock>>>(
-        (cufftComplex *)d_ft->descriptor->data[i],
-        (cufftComplex *)d_ft_k->descriptor->data[i], k[i], N, i, nGPUs);
-  }
+    for (int i = 0; i < nGPUs; i++) {
+        device = d_ft_k->descriptor->GPUs[i];
+        cudaSetDevice(device);
+        solvePoisson<<<dimGrid, dimBlock>>>(
+            (cufftComplex *)d_ft->descriptor->data[i], (cufftComplex *)d_ft_k->descriptor->data[i], k[i], N, i, nGPUs);
+    }

-  // Wait for each device to finish all operations
-  for (int i = 0; i < nGPUs; i++) {
-    device = d_ft_k->descriptor->GPUs[i];
-    cudaSetDevice(device);
-    cudaDeviceSynchronize();
+    // Wait for each device to finish all operations
+    for (int i = 0; i < nGPUs; i++) {
+        device = d_ft_k->descriptor->GPUs[i];
+        cudaSetDevice(device);
+        cudaDeviceSynchronize();

-    // Check if kernel execution generated an error
-    getLastCudaError("Kernel execution failed [ solvePoisson ]");
-  }
+        // Check if kernel execution generated an error
+        getLastCudaError("Kernel execution failed [ solvePoisson ]");
+    }
}

////////////////////////////////////////////////////////////////////////////////
// Kernel for Solving Poisson equation on GPU
////////////////////////////////////////////////////////////////////////////////
-__global__ void solvePoisson(cufftComplex *ft, cufftComplex *ft_k, float *k,
-                             int N, int gpu_id, int n_gpu) {
-  int i = threadIdx.x + blockIdx.x * blockDim.x;
-  int j = threadIdx.y + blockIdx.y * blockDim.y;
-  int index = j * N + i;
-  if (i < N && j < N / n_gpu) {
-    float k2 =
-        k[i] * k[i] + k[j + gpu_id * N / n_gpu] * k[j + gpu_id * N / n_gpu];
-    if (i == 0 && j == 0 && gpu_id == 0) {
-      k2 = 1.0f;
-    }
+__global__ void solvePoisson(cufftComplex *ft, cufftComplex *ft_k, float *k, int N, int gpu_id, int n_gpu)
+{
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    int j = threadIdx.y + blockIdx.y * blockDim.y;
+    int index = j * N + i;
+    if (i < N && j < N / n_gpu) {
+        float k2 = k[i] * k[i] + k[j + gpu_id * N / n_gpu] * k[j + gpu_id * N / n_gpu];
+        if (i == 0 && j == 0 && gpu_id == 0) {
+            k2 = 1.0f;
+        }

-    ft_k[index].x = -ft[index].x * 1 / k2;
-    ft_k[index].y = -ft[index].y * 1 / k2;
-  }
+        ft_k[index].x = -ft[index].x * 1 / k2;
+        ft_k[index].y = -ft[index].y * 1 / k2;
+    }
}

diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/simpleCUFFT_MGPU.cu b/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/simpleCUFFT_MGPU.cu
index 9c6eaf81..ede799b0 100644
--- a/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/simpleCUFFT_MGPU.cu
+++ b/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/simpleCUFFT_MGPU.cu
@@ -28,21 +28,20 @@
 /* Example showing the use of CUFFT for fast 1D-convolution using FFT. 
// System includes -#include <stdlib.h> -#include <stdio.h> - -#include <string.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // CUDA runtime #include <cuda_runtime.h> -//CUFFT Header file +// CUFFT Header file #include <cufftXt.h> // helper functions and utilities to work with CUDA -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> // Complex data type typedef float2 Complex; @@ -50,8 +49,7 @@ typedef float2 Complex; static __device__ __host__ inline Complex ComplexAdd(Complex, Complex); static __device__ __host__ inline Complex ComplexScale(Complex, float); static __device__ __host__ inline Complex ComplexMul(Complex, Complex); -static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, - cufftComplex *, int, float); +static __global__ void ComplexPointwiseMulAndScale(cufftComplex *, cufftComplex *, int, float); // Kernel for GPU void multiplyCoefficient(cudaLibXtDesc *, cudaLibXtDesc *, int, float, int); @@ -66,299 +64,286 @@ int PadData(const Complex *, Complex **, int, const Complex *, Complex **, int); // Data configuration // The filter size is assumed to be a number smaller than the signal size /////////////////////////////////////////////////////////////////////////////// -const int SIGNAL_SIZE = 1018; +const int SIGNAL_SIZE = 1018; const int FILTER_KERNEL_SIZE = 11; -const int GPU_COUNT = 2; +const int GPU_COUNT = 2; //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("\n[simpleCUFFT_MGPU] is starting...\n\n"); +int main(int argc, char **argv) +{ + printf("\n[simpleCUFFT_MGPU] is starting...\n\n"); - int GPU_N; - checkCudaErrors(cudaGetDeviceCount(&GPU_N)); + int GPU_N; + checkCudaErrors(cudaGetDeviceCount(&GPU_N)); - if (GPU_N < GPU_COUNT) { - printf("No. of GPU on node %d\n", GPU_N); - printf("Two GPUs are required to run simpleCUFFT_MGPU sample code\n"); - exit(EXIT_WAIVED); - } - - int *major_minor = (int *)malloc(sizeof(int) * GPU_N * 2); - int found2IdenticalGPUs = 0; - int nGPUs = 2; - int *whichGPUs; - whichGPUs = (int *)malloc(sizeof(int) * nGPUs); - - for (int i = 0; i < GPU_N; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); - major_minor[i * 2] = deviceProp.major; - major_minor[i * 2 + 1] = deviceProp.minor; - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, - deviceProp.name, deviceProp.major, deviceProp.minor); - } - - for (int i = 0; i < GPU_N; i++) { - for (int j = i + 1; j < GPU_N; j++) { - if ((major_minor[i * 2] == major_minor[j * 2]) && - (major_minor[i * 2 + 1] == major_minor[j * 2 + 1])) { - whichGPUs[0] = i; - whichGPUs[1] = j; - found2IdenticalGPUs = 1; - break; - } + if (GPU_N < GPU_COUNT) { + printf("No.
of GPU on node %d\n", GPU_N); + printf("Two GPUs are required to run simpleCUFFT_MGPU sample code\n"); + exit(EXIT_WAIVED); } - if (found2IdenticalGPUs) { - break; + + int *major_minor = (int *)malloc(sizeof(int) * GPU_N * 2); + int found2IdenticalGPUs = 0; + int nGPUs = 2; + int *whichGPUs; + whichGPUs = (int *)malloc(sizeof(int) * nGPUs); + + for (int i = 0; i < GPU_N; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); + major_minor[i * 2] = deviceProp.major; + major_minor[i * 2 + 1] = deviceProp.minor; + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", + i, + deviceProp.name, + deviceProp.major, + deviceProp.minor); } - } - free(major_minor); - if (!found2IdenticalGPUs) { - printf( - "No Two GPUs with same architecture found\nWaiving simpleCUFFT_2d_MGPU " - "sample\n"); - exit(EXIT_WAIVED); - } + for (int i = 0; i < GPU_N; i++) { + for (int j = i + 1; j < GPU_N; j++) { + if ((major_minor[i * 2] == major_minor[j * 2]) && (major_minor[i * 2 + 1] == major_minor[j * 2 + 1])) { + whichGPUs[0] = i; + whichGPUs[1] = j; + found2IdenticalGPUs = 1; + break; + } + } + if (found2IdenticalGPUs) { + break; + } + } - // Allocate host memory for the signal - Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); + free(major_minor); + if (!found2IdenticalGPUs) { + printf("No Two GPUs with same architecture found\nWaiving simpleCUFFT_2d_MGPU " + "sample\n"); + exit(EXIT_WAIVED); + } - // Initialize the memory for the signal - for (int i = 0; i < SIGNAL_SIZE; ++i) { - h_signal[i].x = rand() / (float)RAND_MAX; - h_signal[i].y = 0; - } + // Allocate host memory for the signal + Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); - // Allocate host memory for the filter - Complex *h_filter_kernel = - (Complex *)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); + // Initialize the memory for the signal + for (int i = 0; i < SIGNAL_SIZE; ++i) { + h_signal[i].x = rand() / (float)RAND_MAX; + h_signal[i].y = 0; + } - // Initialize the memory for the filter - for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { - h_filter_kernel[i].x = rand() / (float)RAND_MAX; - h_filter_kernel[i].y = 0; - } + // Allocate host memory for the filter + Complex *h_filter_kernel = (Complex *)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); - // Pad signal and filter kernel - Complex *h_padded_signal; - Complex *h_padded_filter_kernel; - int new_size = - PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel, - &h_padded_filter_kernel, FILTER_KERNEL_SIZE); + // Initialize the memory for the filter + for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { + h_filter_kernel[i].x = rand() / (float)RAND_MAX; + h_filter_kernel[i].y = 0; + } - // cufftCreate() - Create an empty plan - cufftResult result; - cufftHandle plan_input; - checkCudaErrors(cufftCreate(&plan_input)); + // Pad signal and filter kernel + Complex *h_padded_signal; + Complex *h_padded_filter_kernel; + int new_size = + PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel, &h_padded_filter_kernel, FILTER_KERNEL_SIZE); - // cufftXtSetGPUs() - Define which GPUs to use - result = cufftXtSetGPUs(plan_input, nGPUs, whichGPUs); + // cufftCreate() - Create an empty plan + cufftResult result; + cufftHandle plan_input; + checkCudaErrors(cufftCreate(&plan_input)); - if (result == CUFFT_INVALID_DEVICE) { - printf("This sample requires two GPUs on the same board.\n"); - printf("No such board was found. 
Waiving sample.\n"); - exit(EXIT_WAIVED); - } else if (result != CUFFT_SUCCESS) { - printf("cufftXtSetGPUs failed\n"); - exit(EXIT_FAILURE); - } + // cufftXtSetGPUs() - Define which GPUs to use + result = cufftXtSetGPUs(plan_input, nGPUs, whichGPUs); - // Print the device information to run the code - printf("\nRunning on GPUs\n"); - for (int i = 0; i < nGPUs; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, whichGPUs[i])); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", - whichGPUs[i], deviceProp.name, deviceProp.major, deviceProp.minor); - } + if (result == CUFFT_INVALID_DEVICE) { + printf("This sample requires two GPUs on the same board.\n"); + printf("No such board was found. Waiving sample.\n"); + exit(EXIT_WAIVED); + } + else if (result != CUFFT_SUCCESS) { + printf("cufftXtSetGPUs failed\n"); + exit(EXIT_FAILURE); + } - size_t *worksize; - worksize = (size_t *)malloc(sizeof(size_t) * nGPUs); + // Print the device information to run the code + printf("\nRunning on GPUs\n"); + for (int i = 0; i < nGPUs; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, whichGPUs[i])); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", + whichGPUs[i], + deviceProp.name, + deviceProp.major, + deviceProp.minor); + } - // cufftMakePlan1d() - Create the plan - checkCudaErrors( - cufftMakePlan1d(plan_input, new_size, CUFFT_C2C, 1, worksize)); + size_t *worksize; + worksize = (size_t *)malloc(sizeof(size_t) * nGPUs); - // cufftXtMalloc() - Malloc data on multiple GPUs - cudaLibXtDesc *d_signal; - checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_signal, - CUFFT_XT_FORMAT_INPLACE)); - cudaLibXtDesc *d_out_signal; - checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_out_signal, - CUFFT_XT_FORMAT_INPLACE)); - cudaLibXtDesc *d_filter_kernel; - checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_filter_kernel, - CUFFT_XT_FORMAT_INPLACE)); - cudaLibXtDesc *d_out_filter_kernel; - checkCudaErrors(cufftXtMalloc(plan_input, - (cudaLibXtDesc **)&d_out_filter_kernel, - CUFFT_XT_FORMAT_INPLACE)); + // cufftMakePlan1d() - Create the plan + checkCudaErrors(cufftMakePlan1d(plan_input, new_size, CUFFT_C2C, 1, worksize)); - // cufftXtMemcpy() - Copy data from host to multiple GPUs - checkCudaErrors(cufftXtMemcpy(plan_input, d_signal, h_padded_signal, - CUFFT_COPY_HOST_TO_DEVICE)); - checkCudaErrors(cufftXtMemcpy(plan_input, d_filter_kernel, - h_padded_filter_kernel, - CUFFT_COPY_HOST_TO_DEVICE)); + // cufftXtMalloc() - Malloc data on multiple GPUs + cudaLibXtDesc *d_signal; + checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_signal, CUFFT_XT_FORMAT_INPLACE)); + cudaLibXtDesc *d_out_signal; + checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_out_signal, CUFFT_XT_FORMAT_INPLACE)); + cudaLibXtDesc *d_filter_kernel; + checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_filter_kernel, CUFFT_XT_FORMAT_INPLACE)); + cudaLibXtDesc *d_out_filter_kernel; + checkCudaErrors(cufftXtMalloc(plan_input, (cudaLibXtDesc **)&d_out_filter_kernel, CUFFT_XT_FORMAT_INPLACE)); - // cufftXtExecDescriptorC2C() - Execute FFT on data on multiple GPUs - checkCudaErrors( - cufftXtExecDescriptorC2C(plan_input, d_signal, d_signal, CUFFT_FORWARD)); - checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_filter_kernel, - d_filter_kernel, CUFFT_FORWARD)); + // cufftXtMemcpy() - Copy data from host to multiple GPUs + checkCudaErrors(cufftXtMemcpy(plan_input, d_signal, 
h_padded_signal, CUFFT_COPY_HOST_TO_DEVICE)); + checkCudaErrors(cufftXtMemcpy(plan_input, d_filter_kernel, h_padded_filter_kernel, CUFFT_COPY_HOST_TO_DEVICE)); - // cufftXtMemcpy() - Copy the data to natural order on GPUs - checkCudaErrors(cufftXtMemcpy(plan_input, d_out_signal, d_signal, - CUFFT_COPY_DEVICE_TO_DEVICE)); - checkCudaErrors(cufftXtMemcpy(plan_input, d_out_filter_kernel, - d_filter_kernel, CUFFT_COPY_DEVICE_TO_DEVICE)); + // cufftXtExecDescriptorC2C() - Execute FFT on data on multiple GPUs + checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_signal, d_signal, CUFFT_FORWARD)); + checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_filter_kernel, d_filter_kernel, CUFFT_FORWARD)); - printf("\n\nValue of Library Descriptor\n"); - printf("Number of GPUs %d\n", d_out_signal->descriptor->nGPUs); - printf("Device id %d %d\n", d_out_signal->descriptor->GPUs[0], - d_out_signal->descriptor->GPUs[1]); - printf("Data size on GPU %ld %ld\n", - (long)(d_out_signal->descriptor->size[0] / sizeof(cufftComplex)), - (long)(d_out_signal->descriptor->size[1] / sizeof(cufftComplex))); + // cufftXtMemcpy() - Copy the data to natural order on GPUs + checkCudaErrors(cufftXtMemcpy(plan_input, d_out_signal, d_signal, CUFFT_COPY_DEVICE_TO_DEVICE)); + checkCudaErrors(cufftXtMemcpy(plan_input, d_out_filter_kernel, d_filter_kernel, CUFFT_COPY_DEVICE_TO_DEVICE)); - // Multiply the coefficients together and normalize the result - printf("Launching ComplexPointwiseMulAndScale<<< >>>\n"); - multiplyCoefficient(d_out_signal, d_out_filter_kernel, new_size, - 1.0f / new_size, nGPUs); + printf("\n\nValue of Library Descriptor\n"); + printf("Number of GPUs %d\n", d_out_signal->descriptor->nGPUs); + printf("Device id %d %d\n", d_out_signal->descriptor->GPUs[0], d_out_signal->descriptor->GPUs[1]); + printf("Data size on GPU %ld %ld\n", + (long)(d_out_signal->descriptor->size[0] / sizeof(cufftComplex)), + (long)(d_out_signal->descriptor->size[1] / sizeof(cufftComplex))); - // cufftXtExecDescriptorC2C() - Execute inverse FFT on data on multiple GPUs - printf("Transforming signal back cufftExecC2C\n"); - checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_out_signal, - d_out_signal, CUFFT_INVERSE)); + // Multiply the coefficients together and normalize the result + printf("Launching ComplexPointwiseMulAndScale<<< >>>\n"); + multiplyCoefficient(d_out_signal, d_out_filter_kernel, new_size, 1.0f / new_size, nGPUs); - // Create host pointer pointing to padded signal - Complex *h_convolved_signal = h_padded_signal; + // cufftXtExecDescriptorC2C() - Execute inverse FFT on data on multiple GPUs + printf("Transforming signal back cufftExecC2C\n"); + checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_out_signal, d_out_signal, CUFFT_INVERSE)); - // Allocate host memory for the convolution result - Complex *h_convolved_signal_ref = - (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); + // Create host pointer pointing to padded signal + Complex *h_convolved_signal = h_padded_signal; - // cufftXtMemcpy() - Copy data from multiple GPUs to host - checkCudaErrors(cufftXtMemcpy(plan_input, h_convolved_signal, d_out_signal, - CUFFT_COPY_DEVICE_TO_HOST)); + // Allocate host memory for the convolution result + Complex *h_convolved_signal_ref = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); - // Convolve on the host - Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, - h_convolved_signal_ref); + // cufftXtMemcpy() - Copy data from multiple GPUs to host + checkCudaErrors(cufftXtMemcpy(plan_input, 
h_convolved_signal, d_out_signal, CUFFT_COPY_DEVICE_TO_HOST)); - // Compare CPU and GPU result - bool bTestResult = - sdkCompareL2fe((float *)h_convolved_signal_ref, - (float *)h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f); - printf("\nvalue of TestResult %d\n", bTestResult); + // Convolve on the host + Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); - // Cleanup memory - free(whichGPUs); - free(worksize); - free(h_signal); - free(h_filter_kernel); - free(h_padded_signal); - free(h_padded_filter_kernel); - free(h_convolved_signal_ref); + // Compare CPU and GPU result + bool bTestResult = + sdkCompareL2fe((float *)h_convolved_signal_ref, (float *)h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f); + printf("\nvalue of TestResult %d\n", bTestResult); - // cudaXtFree() - Free GPU memory - checkCudaErrors(cufftXtFree(d_signal)); - checkCudaErrors(cufftXtFree(d_filter_kernel)); - checkCudaErrors(cufftXtFree(d_out_signal)); - checkCudaErrors(cufftXtFree(d_out_filter_kernel)); + // Cleanup memory + free(whichGPUs); + free(worksize); + free(h_signal); + free(h_filter_kernel); + free(h_padded_signal); + free(h_padded_filter_kernel); + free(h_convolved_signal_ref); - // cufftDestroy() - Destroy FFT plan - checkCudaErrors(cufftDestroy(plan_input)); + // cudaXtFree() - Free GPU memory + checkCudaErrors(cufftXtFree(d_signal)); + checkCudaErrors(cufftXtFree(d_filter_kernel)); + checkCudaErrors(cufftXtFree(d_out_signal)); + checkCudaErrors(cufftXtFree(d_out_filter_kernel)); - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + // cufftDestroy() - Destroy FFT plan + checkCudaErrors(cufftDestroy(plan_input)); + + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } /////////////////////////////////////////////////////////////////////////////////// // Function for padding original data ////////////////////////////////////////////////////////////////////////////////// -int PadData(const Complex *signal, Complex **padded_signal, int signal_size, - const Complex *filter_kernel, Complex **padded_filter_kernel, - int filter_kernel_size) { - int minRadius = filter_kernel_size / 2; - int maxRadius = filter_kernel_size - minRadius; - int new_size = signal_size + maxRadius; +int PadData(const Complex *signal, + Complex **padded_signal, + int signal_size, + const Complex *filter_kernel, + Complex **padded_filter_kernel, + int filter_kernel_size) +{ + int minRadius = filter_kernel_size / 2; + int maxRadius = filter_kernel_size - minRadius; + int new_size = signal_size + maxRadius; - // Pad signal - Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size); - memcpy(new_data + 0, signal, signal_size * sizeof(Complex)); - memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex)); - *padded_signal = new_data; + // Pad signal + Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size); + memcpy(new_data + 0, signal, signal_size * sizeof(Complex)); + memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex)); + *padded_signal = new_data; - // Pad filter - new_data = (Complex *)malloc(sizeof(Complex) * new_size); - memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex)); - memset(new_data + maxRadius, 0, - (new_size - filter_kernel_size) * sizeof(Complex)); - memcpy(new_data + new_size - minRadius, filter_kernel, - minRadius * sizeof(Complex)); - *padded_filter_kernel = new_data; + // Pad filter + new_data = (Complex *)malloc(sizeof(Complex) * new_size); + memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * 
sizeof(Complex)); + memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex)); + memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex)); + *padded_filter_kernel = new_data; - return new_size; + return new_size; } //////////////////////////////////////////////////////////////////////////////// // Filtering operations - Computing Convolution on the host //////////////////////////////////////////////////////////////////////////////// -void Convolve(const Complex *signal, int signal_size, - const Complex *filter_kernel, int filter_kernel_size, - Complex *filtered_signal) { - int minRadius = filter_kernel_size / 2; - int maxRadius = filter_kernel_size - minRadius; +void Convolve(const Complex *signal, + int signal_size, + const Complex *filter_kernel, + int filter_kernel_size, + Complex *filtered_signal) +{ + int minRadius = filter_kernel_size / 2; + int maxRadius = filter_kernel_size - minRadius; - // Loop over output element indices - for (int i = 0; i < signal_size; ++i) { - filtered_signal[i].x = filtered_signal[i].y = 0; + // Loop over output element indices + for (int i = 0; i < signal_size; ++i) { + filtered_signal[i].x = filtered_signal[i].y = 0; - // Loop over convolution indices - for (int j = -maxRadius + 1; j <= minRadius; ++j) { - int k = i + j; + // Loop over convolution indices + for (int j = -maxRadius + 1; j <= minRadius; ++j) { + int k = i + j; - if (k >= 0 && k < signal_size) { - filtered_signal[i] = - ComplexAdd(filtered_signal[i], - ComplexMul(signal[k], filter_kernel[minRadius - j])); - } + if (k >= 0 && k < signal_size) { + filtered_signal[i] = + ComplexAdd(filtered_signal[i], ComplexMul(signal[k], filter_kernel[minRadius - j])); + } + } } - } } //////////////////////////////////////////////////////////////////////////////// // Launch Kernel on multiple GPU //////////////////////////////////////////////////////////////////////////////// -void multiplyCoefficient(cudaLibXtDesc *d_signal, - cudaLibXtDesc *d_filter_kernel, int new_size, - float val, int nGPUs) { - int device; - // Launch the ComplexPointwiseMulAndScale<<< >>> kernel on multiple GPU - for (int i = 0; i < nGPUs; i++) { - device = d_signal->descriptor->GPUs[i]; +void multiplyCoefficient(cudaLibXtDesc *d_signal, cudaLibXtDesc *d_filter_kernel, int new_size, float val, int nGPUs) +{ + int device; + // Launch the ComplexPointwiseMulAndScale<<< >>> kernel on multiple GPU + for (int i = 0; i < nGPUs; i++) { + device = d_signal->descriptor->GPUs[i]; - // Set device - checkCudaErrors(cudaSetDevice(device)); + // Set device + checkCudaErrors(cudaSetDevice(device)); - // Perform GPU computations - ComplexPointwiseMulAndScale<<<32, 256>>>( - (cufftComplex *)d_signal->descriptor->data[i], - (cufftComplex *)d_filter_kernel->descriptor->data[i], - int(d_signal->descriptor->size[i] / sizeof(cufftComplex)), val); - } + // Perform GPU computations + ComplexPointwiseMulAndScale<<<32, 256>>>((cufftComplex *)d_signal->descriptor->data[i], + (cufftComplex *)d_filter_kernel->descriptor->data[i], + int(d_signal->descriptor->size[i] / sizeof(cufftComplex)), + val); + } - // Wait for device to finish all operation - for (int i = 0; i < nGPUs; i++) { - device = d_signal->descriptor->GPUs[i]; - checkCudaErrors(cudaSetDevice(device)); - cudaDeviceSynchronize(); - // Check if kernel execution generated and error - getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]"); - } + // Wait for device to finish all operation + for (int i = 0; i < nGPUs; i++) { + device = 
d_signal->descriptor->GPUs[i]; + checkCudaErrors(cudaSetDevice(device)); + cudaDeviceSynchronize(); + // Check if kernel execution generated and error + getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]"); + } } //////////////////////////////////////////////////////////////////////////////// @@ -366,35 +351,37 @@ void multiplyCoefficient(cudaLibXtDesc *d_signal, //////////////////////////////////////////////////////////////////////////////// // Complex addition -static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) { - Complex c; - c.x = a.x + b.x; - c.y = a.y + b.y; - return c; +static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) +{ + Complex c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; } // Complex scale -static __device__ __host__ inline Complex ComplexScale(Complex a, float s) { - Complex c; - c.x = s * a.x; - c.y = s * a.y; - return c; +static __device__ __host__ inline Complex ComplexScale(Complex a, float s) +{ + Complex c; + c.x = s * a.x; + c.y = s * a.y; + return c; } // Complex multiplication -static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) { - Complex c; - c.x = a.x * b.x - a.y * b.y; - c.y = a.x * b.y + a.y * b.x; - return c; +static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) +{ + Complex c; + c.x = a.x * b.x - a.y * b.y; + c.y = a.x * b.y + a.y * b.x; + return c; } // Complex pointwise multiplication -static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, - cufftComplex *b, int size, - float scale) { - const int numThreads = blockDim.x * gridDim.x; - const int threadID = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = threadID; i < size; i += numThreads) { - a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale); - } +static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size, float scale) +{ + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = threadID; i < size; i += numThreads) { + a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale); + } } diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT_callback/simpleCUFFT_callback.cu b/Samples/4_CUDA_Libraries/simpleCUFFT_callback/simpleCUFFT_callback.cu index 0d8e0588..64017353 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT_callback/simpleCUFFT_callback.cu +++ b/Samples/4_CUDA_Libraries/simpleCUFFT_callback/simpleCUFFT_callback.cu @@ -26,54 +26,48 @@ */ -/* - * Example showing the use of CUFFT for fast 1D-convolution using FFT. +/* + * Example showing the use of CUFFT for fast 1D-convolution using FFT. * This sample is the same as simpleCUFFT, except that it uses a callback * function to perform the pointwise multiply and scale, on input to the * inverse transform. 
- * -*/ + * + */ // includes, system -#include <stdlib.h> -#include <stdio.h> -#include <string.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // includes, project #include <cuda_runtime.h> #include <cufft.h> #include <cufftXt.h> -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> // Complex data type -typedef float2 Complex; +typedef float2 Complex; static __device__ __host__ inline Complex ComplexAdd(Complex, Complex); static __device__ __host__ inline Complex ComplexScale(Complex, float); static __device__ __host__ inline Complex ComplexMul(Complex, Complex); // This is the callback routine prototype -static __device__ cufftComplex ComplexPointwiseMulAndScale(void *a, - size_t index, - void *cb_info, - void *sharedmem); +static __device__ cufftComplex ComplexPointwiseMulAndScale(void *a, size_t index, void *cb_info, void *sharedmem); -typedef struct _cb_params { - Complex *filter; - float scale; +typedef struct _cb_params +{ + Complex *filter; + float scale; } cb_params; // This is the callback routine. It does complex pointwise multiplication with // scaling. -static __device__ cufftComplex ComplexPointwiseMulAndScale(void *a, - size_t index, - void *cb_info, - void *sharedmem) { - cb_params *my_params = (cb_params *)cb_info; - return (cufftComplex)ComplexScale( - ComplexMul(((Complex *)a)[index], (my_params->filter)[index]), - my_params->scale); +static __device__ cufftComplex ComplexPointwiseMulAndScale(void *a, size_t index, void *cb_info, void *sharedmem) +{ + cb_params *my_params = (cb_params *)cb_info; + return (cufftComplex)ComplexScale(ComplexMul(((Complex *)a)[index], (my_params->filter)[index]), my_params->scale); } // Define the device pointer to the callback routine. The host code will fetch @@ -90,193 +84,183 @@ int PadData(const Complex *, Complex **, int, const Complex *, Complex **, int); int runTest(int argc, char **argv); // The filter size is assumed to be a number smaller than the signal size -#define SIGNAL_SIZE 50 +#define SIGNAL_SIZE 50 #define FILTER_KERNEL_SIZE 11 //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - struct cudaDeviceProp properties; - int device; - checkCudaErrors(cudaGetDevice(&device)); - checkCudaErrors(cudaGetDeviceProperties(&properties, device)); - if (!(properties.major >= 2)) { - printf("simpleCUFFT_callback requires CUDA architecture SM2.0 or higher\n"); - return EXIT_WAIVED; - } +int main(int argc, char **argv) +{ + struct cudaDeviceProp properties; + int device; + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors(cudaGetDeviceProperties(&properties, device)); + if (!(properties.major >= 2)) { + printf("simpleCUFFT_callback requires CUDA architecture SM2.0 or higher\n"); + return EXIT_WAIVED; + } - return runTest(argc, argv); + return runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //!
Run a simple test for CUFFT callbacks //////////////////////////////////////////////////////////////////////////////// -int runTest(int argc, char **argv) { - printf("[simpleCUFFT_callback] is starting...\n"); +int runTest(int argc, char **argv) +{ + printf("[simpleCUFFT_callback] is starting...\n"); - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - // Allocate host memory for the signal - Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); + // Allocate host memory for the signal + Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); - // Initialize the memory for the signal - for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) { - h_signal[i].x = rand() / (float)RAND_MAX; - h_signal[i].y = 0; - } + // Initialize the memory for the signal + for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) { + h_signal[i].x = rand() / (float)RAND_MAX; + h_signal[i].y = 0; + } - // Allocate host memory for the filter - Complex *h_filter_kernel = - (Complex *)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); + // Allocate host memory for the filter + Complex *h_filter_kernel = (Complex *)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE); - // Initialize the memory for the filter - for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i) { - h_filter_kernel[i].x = rand() / (float)RAND_MAX; - h_filter_kernel[i].y = 0; - } + // Initialize the memory for the filter + for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i) { + h_filter_kernel[i].x = rand() / (float)RAND_MAX; + h_filter_kernel[i].y = 0; + } - // Pad signal and filter kernel - Complex *h_padded_signal; - Complex *h_padded_filter_kernel; - int new_size = - PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel, - &h_padded_filter_kernel, FILTER_KERNEL_SIZE); - int mem_size = sizeof(Complex) * new_size; + // Pad signal and filter kernel + Complex *h_padded_signal; + Complex *h_padded_filter_kernel; + int new_size = + PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel, &h_padded_filter_kernel, FILTER_KERNEL_SIZE); + int mem_size = sizeof(Complex) * new_size; - // Allocate device memory for signal - Complex *d_signal; - checkCudaErrors(cudaMalloc((void **)&d_signal, mem_size)); - // Copy host memory to device - checkCudaErrors( - cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice)); + // Allocate device memory for signal + Complex *d_signal; + checkCudaErrors(cudaMalloc((void **)&d_signal, mem_size)); + // Copy host memory to device + checkCudaErrors(cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice)); - // Allocate device memory for filter kernel - Complex *d_filter_kernel; - checkCudaErrors(cudaMalloc((void **)&d_filter_kernel, mem_size)); + // Allocate device memory for filter kernel + Complex *d_filter_kernel; + checkCudaErrors(cudaMalloc((void **)&d_filter_kernel, mem_size)); - // Copy host memory to device - checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size, - cudaMemcpyHostToDevice)); + // Copy host memory to device + checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size, cudaMemcpyHostToDevice)); - // Create one CUFFT plan for the forward transforms, and one for the reverse - // transform with load callback. - cufftHandle plan, cb_plan; - size_t work_size; + // Create one CUFFT plan for the forward transforms, and one for the reverse + // transform with load callback. 
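Before moving on to the plan creation below, the buffer sizes above are worth making concrete. With SIGNAL_SIZE = 50 and FILTER_KERNEL_SIZE = 11, the PadData call earlier in runTest works out as follows (a reading aid, not part of the sample):

    // Worked example of PadData's arithmetic and layout:
    //   minRadius = 11 / 2 = 5
    //   maxRadius = 11 - 5 = 6
    //   new_size  = 50 + 6 = 56   -> mem_size = 56 * sizeof(Complex)
    // The padded filter stores its taps in wrap-around order, as circular
    // convolution via the FFT expects:
    //   [ tap5 .. tap10 | 45 zeros | tap0 .. tap4 ]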
+ cufftHandle plan, cb_plan; + size_t work_size; - checkCudaErrors(cufftCreate(&plan)); - checkCudaErrors(cufftCreate(&cb_plan)); + checkCudaErrors(cufftCreate(&plan)); + checkCudaErrors(cufftCreate(&cb_plan)); - checkCudaErrors(cufftMakePlan1d(plan, new_size, CUFFT_C2C, 1, &work_size)); - checkCudaErrors(cufftMakePlan1d(cb_plan, new_size, CUFFT_C2C, 1, &work_size)); + checkCudaErrors(cufftMakePlan1d(plan, new_size, CUFFT_C2C, 1, &work_size)); + checkCudaErrors(cufftMakePlan1d(cb_plan, new_size, CUFFT_C2C, 1, &work_size)); - // Define a structure used to pass in the device address of the filter kernel, - // and the scale factor - cb_params h_params; + // Define a structure used to pass in the device address of the filter kernel, + // and the scale factor + cb_params h_params; - h_params.filter = d_filter_kernel; - h_params.scale = 1.0f / new_size; + h_params.filter = d_filter_kernel; + h_params.scale = 1.0f / new_size; - // Allocate device memory for parameters - cb_params *d_params; - checkCudaErrors(cudaMalloc((void **)&d_params, sizeof(cb_params))); + // Allocate device memory for parameters + cb_params *d_params; + checkCudaErrors(cudaMalloc((void **)&d_params, sizeof(cb_params))); - // Copy host memory to device - checkCudaErrors(cudaMemcpy(d_params, &h_params, sizeof(cb_params), - cudaMemcpyHostToDevice)); + // Copy host memory to device + checkCudaErrors(cudaMemcpy(d_params, &h_params, sizeof(cb_params), cudaMemcpyHostToDevice)); - // The host needs to get a copy of the device pointer to the callback - cufftCallbackLoadC hostCopyOfCallbackPtr; + // The host needs to get a copy of the device pointer to the callback + cufftCallbackLoadC hostCopyOfCallbackPtr; - checkCudaErrors(cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, - sizeof(hostCopyOfCallbackPtr))); + checkCudaErrors(cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, sizeof(hostCopyOfCallbackPtr))); - // Now associate the load callback with the plan. - cufftResult status = - cufftXtSetCallback(cb_plan, (void **)&hostCopyOfCallbackPtr, - CUFFT_CB_LD_COMPLEX, (void **)&d_params); - if (status == CUFFT_LICENSE_ERROR) { - printf("This sample requires a valid license file.\n"); - printf( - "The file was either not found, out of date, or otherwise invalid.\n"); - return EXIT_WAIVED; - } + // Now associate the load callback with the plan. 
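A note on the cudaMemcpyFromSymbol step above: host code cannot take the address of a __device__ function directly, so the sample publishes the callback's address in a __device__ symbol (myOwnCallbackPtr) and copies it back before registering it. Sketched in isolation, with the device-side definition written in the conventional form (the sample's actual definition sits outside the hunks shown here):

    // Device side: publish the callback's device address in a __device__ symbol.
    __device__ cufftCallbackLoadC myOwnCallbackPtr = ComplexPointwiseMulAndScale;

    // Host side: fetch the device-function pointer through the symbol, then
    // hand it to cufftXtSetCallback as done just below.
    cufftCallbackLoadC hostCopyOfCallbackPtr;
    checkCudaErrors(cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, sizeof(hostCopyOfCallbackPtr)));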
+ cufftResult status = + cufftXtSetCallback(cb_plan, (void **)&hostCopyOfCallbackPtr, CUFFT_CB_LD_COMPLEX, (void **)&d_params); + if (status == CUFFT_LICENSE_ERROR) { + printf("This sample requires a valid license file.\n"); + printf("The file was either not found, out of date, or otherwise invalid.\n"); + return EXIT_WAIVED; + } - checkCudaErrors(cufftXtSetCallback(cb_plan, (void **)&hostCopyOfCallbackPtr, - CUFFT_CB_LD_COMPLEX, (void **)&d_params)); + checkCudaErrors( + cufftXtSetCallback(cb_plan, (void **)&hostCopyOfCallbackPtr, CUFFT_CB_LD_COMPLEX, (void **)&d_params)); - // Transform signal and kernel - printf("Transforming signal cufftExecC2C\n"); - checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, - (cufftComplex *)d_signal, CUFFT_FORWARD)); - checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_filter_kernel, - (cufftComplex *)d_filter_kernel, CUFFT_FORWARD)); + // Transform signal and kernel + printf("Transforming signal cufftExecC2C\n"); + checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD)); + checkCudaErrors( + cufftExecC2C(plan, (cufftComplex *)d_filter_kernel, (cufftComplex *)d_filter_kernel, CUFFT_FORWARD)); - // Transform signal back, using the callback to do the pointwise multiply on - // the way in. - printf("Transforming signal back cufftExecC2C\n"); - checkCudaErrors(cufftExecC2C(cb_plan, (cufftComplex *)d_signal, - (cufftComplex *)d_signal, CUFFT_INVERSE)); + // Transform signal back, using the callback to do the pointwise multiply on + // the way in. + printf("Transforming signal back cufftExecC2C\n"); + checkCudaErrors(cufftExecC2C(cb_plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE)); - // Copy device memory to host - Complex *h_convolved_signal = h_padded_signal; - checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size, - cudaMemcpyDeviceToHost)); + // Copy device memory to host + Complex *h_convolved_signal = h_padded_signal; + checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost)); - // Allocate host memory for the convolution result - Complex *h_convolved_signal_ref = - (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); + // Allocate host memory for the convolution result + Complex *h_convolved_signal_ref = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE); - // Convolve on the host - Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, - h_convolved_signal_ref); + // Convolve on the host + Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); - // check result - bool bTestResult = - sdkCompareL2fe((float *)h_convolved_signal_ref, - (float *)h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f); + // check result + bool bTestResult = + sdkCompareL2fe((float *)h_convolved_signal_ref, (float *)h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f); - // Destroy CUFFT context - checkCudaErrors(cufftDestroy(plan)); - checkCudaErrors(cufftDestroy(cb_plan)); + // Destroy CUFFT context + checkCudaErrors(cufftDestroy(plan)); + checkCudaErrors(cufftDestroy(cb_plan)); - // cleanup memory - free(h_signal); - free(h_filter_kernel); - free(h_padded_signal); - free(h_padded_filter_kernel); - free(h_convolved_signal_ref); - checkCudaErrors(cudaFree(d_signal)); - checkCudaErrors(cudaFree(d_filter_kernel)); - checkCudaErrors(cudaFree(d_params)); + // cleanup memory + free(h_signal); + free(h_filter_kernel); + free(h_padded_signal); + free(h_padded_filter_kernel); + free(h_convolved_signal_ref); + 
checkCudaErrors(cudaFree(d_signal)); + checkCudaErrors(cudaFree(d_filter_kernel)); + checkCudaErrors(cudaFree(d_params)); - return bTestResult ? EXIT_SUCCESS : EXIT_FAILURE; + return bTestResult ? EXIT_SUCCESS : EXIT_FAILURE; } // Pad data -int PadData(const Complex *signal, Complex **padded_signal, int signal_size, - const Complex *filter_kernel, Complex **padded_filter_kernel, - int filter_kernel_size) { - int minRadius = filter_kernel_size / 2; - int maxRadius = filter_kernel_size - minRadius; - int new_size = signal_size + maxRadius; +int PadData(const Complex *signal, + Complex **padded_signal, + int signal_size, + const Complex *filter_kernel, + Complex **padded_filter_kernel, + int filter_kernel_size) +{ + int minRadius = filter_kernel_size / 2; + int maxRadius = filter_kernel_size - minRadius; + int new_size = signal_size + maxRadius; - // Pad signal - Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size); - memcpy(new_data + 0, signal, signal_size * sizeof(Complex)); - memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex)); - *padded_signal = new_data; + // Pad signal + Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size); + memcpy(new_data + 0, signal, signal_size * sizeof(Complex)); + memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex)); + *padded_signal = new_data; - // Pad filter - new_data = (Complex *)malloc(sizeof(Complex) * new_size); - memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex)); - memset(new_data + maxRadius, 0, - (new_size - filter_kernel_size) * sizeof(Complex)); - memcpy(new_data + new_size - minRadius, filter_kernel, - minRadius * sizeof(Complex)); - *padded_filter_kernel = new_data; + // Pad filter + new_data = (Complex *)malloc(sizeof(Complex) * new_size); + memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex)); + memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex)); + memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex)); + *padded_filter_kernel = new_data; - return new_size; + return new_size; } //////////////////////////////////////////////////////////////////////////////// @@ -284,27 +268,29 @@ int PadData(const Complex *signal, Complex **padded_signal, int signal_size, //////////////////////////////////////////////////////////////////////////////// // Computes convolution on the host -void Convolve(const Complex *signal, int signal_size, - const Complex *filter_kernel, int filter_kernel_size, - Complex *filtered_signal) { - int minRadius = filter_kernel_size / 2; - int maxRadius = filter_kernel_size - minRadius; +void Convolve(const Complex *signal, + int signal_size, + const Complex *filter_kernel, + int filter_kernel_size, + Complex *filtered_signal) +{ + int minRadius = filter_kernel_size / 2; + int maxRadius = filter_kernel_size - minRadius; - // Loop over output element indices - for (int i = 0; i < signal_size; ++i) { - filtered_signal[i].x = filtered_signal[i].y = 0; + // Loop over output element indices + for (int i = 0; i < signal_size; ++i) { + filtered_signal[i].x = filtered_signal[i].y = 0; - // Loop over convolution indices - for (int j = -maxRadius + 1; j <= minRadius; ++j) { - int k = i + j; + // Loop over convolution indices + for (int j = -maxRadius + 1; j <= minRadius; ++j) { + int k = i + j; - if (k >= 0 && k < signal_size) { - filtered_signal[i] = - ComplexAdd(filtered_signal[i], - ComplexMul(signal[k], filter_kernel[minRadius - j])); - } + if (k >= 0 && 
k < signal_size) { + filtered_signal[i] = + ComplexAdd(filtered_signal[i], ComplexMul(signal[k], filter_kernel[minRadius - j])); + } + } } - } } //////////////////////////////////////////////////////////////////////////////// @@ -312,25 +298,28 @@ void Convolve(const Complex *signal, int signal_size, //////////////////////////////////////////////////////////////////////////////// // Complex addition -static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) { - Complex c; - c.x = a.x + b.x; - c.y = a.y + b.y; - return c; +static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) +{ + Complex c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; } // Complex scale -static __device__ __host__ inline Complex ComplexScale(Complex a, float s) { - Complex c; - c.x = s * a.x; - c.y = s * a.y; - return c; +static __device__ __host__ inline Complex ComplexScale(Complex a, float s) +{ + Complex c; + c.x = s * a.x; + c.y = s * a.y; + return c; } // Complex multiplication -static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) { - Complex c; - c.x = a.x * b.x - a.y * b.y; - c.y = a.x * b.y + a.y * b.x; - return c; +static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) +{ + Complex c; + c.x = a.x * b.x - a.y * b.y; + c.y = a.x * b.y + a.y * b.x; + return c; } diff --git a/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp b/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp index 825cea6f..bcd8c572 100644 --- a/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp +++ b/Samples/4_CUDA_Libraries/watershedSegmentationNPP/watershedSegmentationNPP.cpp @@ -27,42 +27,42 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -# define WINDOWS_LEAN_AND_MEAN -# define NOMINMAX -# include <windows.h> -# pragma warning(disable:4819) +#define WINDOWS_LEAN_AND_MEAN +#define NOMINMAX +#include <windows.h> +#pragma warning(disable : 4819) #endif +#include +#include +#include #include #include -#include -#include -#include - -// Note: If you want to view these images we HIGHLY recommend using imagej which is free on the internet and works on most platforms -// because it is one of the few image viewing apps that can display 32 bit integer image data. While it normalizes the data -// to floating point values for viewing it still provides a good representation of the relative brightness of each label value. +// Note: If you want to view these images we HIGHLY recommend using imagej which is free on the internet and works on +// most platforms +// because it is one of the few image viewing apps that can display 32 bit integer image data. While it +// normalizes the data to floating point values for viewing it still provides a good representation of the +// relative brightness of each label value. // -// The files read and written by this sample app use RAW image format, that is, only the image data itself exists in the files -// with no image format information. When viewing RAW files with imagej just enter the image size and bit depth values that -// are part of the file name when requested by imagej. +// The files read and written by this sample app use RAW image format, that is, only the image data itself exists +// in the files with no image format information. When viewing RAW files with imagej just enter the image size +// and bit depth values that are part of the file name when requested by imagej.
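Since the note above is the only description of the RAW format used here, a minimal sketch of reading one frame outside the sample may help (a hypothetical helper; the sample's own loader, loadRaw8BitImage, appears further down):

    // Read a 512x512 8-bit grayscale RAW frame: raw pixel bytes only, no header.
    unsigned char *readRaw512(const char *path)
    {
        unsigned char *pPixels = (unsigned char *)malloc(512 * 512);
        FILE          *pFile   = fopen(path, "rb");
        if (pFile == NULL || fread(pPixels, 1, 512 * 512, pFile) < 512 * 512) {
            if (pFile != NULL)
                fclose(pFile);
            free(pPixels);
            return NULL; // treat short reads as failures, as the sample's loader does
        }
        fclose(pFile);
        return pPixels;
    }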
// #define NUMBER_OF_IMAGES 3 - Npp8u * pInputImageDev[NUMBER_OF_IMAGES]; - Npp8u * pInputImageHost[NUMBER_OF_IMAGES]; - Npp8u * pSegmentationScratchBufferDev[NUMBER_OF_IMAGES]; - Npp8u * pSegmentsDev[NUMBER_OF_IMAGES]; - Npp8u * pSegmentsHost[NUMBER_OF_IMAGES]; - Npp32u * pSegmentLabelsOutputBufferDev[NUMBER_OF_IMAGES]; - Npp32u * pSegmentLabelsOutputBufferHost[NUMBER_OF_IMAGES]; +Npp8u *pInputImageDev[NUMBER_OF_IMAGES]; +Npp8u *pInputImageHost[NUMBER_OF_IMAGES]; +Npp8u *pSegmentationScratchBufferDev[NUMBER_OF_IMAGES]; +Npp8u *pSegmentsDev[NUMBER_OF_IMAGES]; +Npp8u *pSegmentsHost[NUMBER_OF_IMAGES]; +Npp32u *pSegmentLabelsOutputBufferDev[NUMBER_OF_IMAGES]; +Npp32u *pSegmentLabelsOutputBufferHost[NUMBER_OF_IMAGES]; void tearDown() // Clean up and tear down { - for (int j = 0; j < NUMBER_OF_IMAGES; j++) - { + for (int j = 0; j < NUMBER_OF_IMAGES; j++) { if (pSegmentLabelsOutputBufferDev[j] != 0) cudaFree(pSegmentLabelsOutputBufferDev[j]); if (pSegmentationScratchBufferDev[j] != 0) @@ -80,129 +80,120 @@ void tearDown() // Clean up and tear down } } -const std::string& SegmentsOutputFile0 = "teapot_Segments_8Way_512x512_8u.raw"; -const std::string& SegmentsOutputFile1 = "CT_skull_Segments_8Way_512x512_8u.raw"; -const std::string& SegmentsOutputFile2 = "Rocks_Segments_8Way_512x512_8u.raw"; +const std::string &SegmentsOutputFile0 = "teapot_Segments_8Way_512x512_8u.raw"; +const std::string &SegmentsOutputFile1 = "CT_skull_Segments_8Way_512x512_8u.raw"; +const std::string &SegmentsOutputFile2 = "Rocks_Segments_8Way_512x512_8u.raw"; -const std::string& SegmentBoundariesOutputFile0 = "teapot_SegmentBoundaries_8Way_512x512_8u.raw"; -const std::string& SegmentBoundariesOutputFile1 = "CT_skull_SegmentBoundaries_8Way_512x512_8u.raw"; -const std::string& SegmentBoundariesOutputFile2 = "Rocks_SegmentBoundaries_8Way_512x512_8u.raw"; +const std::string &SegmentBoundariesOutputFile0 = "teapot_SegmentBoundaries_8Way_512x512_8u.raw"; +const std::string &SegmentBoundariesOutputFile1 = "CT_skull_SegmentBoundaries_8Way_512x512_8u.raw"; +const std::string &SegmentBoundariesOutputFile2 = "Rocks_SegmentBoundaries_8Way_512x512_8u.raw"; -const std::string& SegmentsWithContrastingBoundariesOutputFile0 = "teapot_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw"; -const std::string& SegmentsWithContrastingBoundariesOutputFile1 = "CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw"; -const std::string& SegmentsWithContrastingBoundariesOutputFile2 = "Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw"; +const std::string &SegmentsWithContrastingBoundariesOutputFile0 = + "teapot_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw"; +const std::string &SegmentsWithContrastingBoundariesOutputFile1 = + "CT_skull_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw"; +const std::string &SegmentsWithContrastingBoundariesOutputFile2 = + "Rocks_SegmentsWithContrastingBoundaries_8Way_512x512_8u.raw"; -const std::string& CompressedSegmentLabelsOutputFile0 = "teapot_CompressedSegmentLabels_8Way_512x512_32u.raw"; -const std::string& CompressedSegmentLabelsOutputFile1 = "CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw"; -const std::string& CompressedSegmentLabelsOutputFile2 = "Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw"; +const std::string &CompressedSegmentLabelsOutputFile0 = "teapot_CompressedSegmentLabels_8Way_512x512_32u.raw"; +const std::string &CompressedSegmentLabelsOutputFile1 = "CT_skull_CompressedSegmentLabels_8Way_512x512_32u.raw"; +const std::string 
&CompressedSegmentLabelsOutputFile2 = "Rocks_CompressedSegmentLabels_8Way_512x512_32u.raw"; -int -loadRaw8BitImage(Npp8u * pImage, int nWidth, int nHeight, int nImage) +int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) { - FILE * bmpFile; + FILE *bmpFile; size_t nSize; - if (nImage == 0) - { - if (nWidth != 512 || nHeight != 512) + if (nImage == 0) { + if (nWidth != 512 || nHeight != 512) return -1; - const char* fileName = "teapot_512x512_8u_Gray.raw"; - const char* InputFile = sdkFindFilePath(fileName, "."); - if (InputFile == NULL) - { - printf("%s file not found.. exiting\n", fileName); - exit(EXIT_WAIVED); + const char *fileName = "teapot_512x512_8u_Gray.raw"; + const char *InputFile = sdkFindFilePath(fileName, "."); + if (InputFile == NULL) { + printf("%s file not found.. exiting\n", fileName); + exit(EXIT_WAIVED); } bmpFile = fopen(InputFile, "rb"); } - else if (nImage == 1) - { - if (nWidth != 512 || nHeight != 512) + else if (nImage == 1) { + if (nWidth != 512 || nHeight != 512) return -1; - const char* fileName = "CT_skull_512x512_8u_Gray.raw"; - const char* InputFile = sdkFindFilePath(fileName, "."); - if (InputFile == NULL) - { - printf("%s file not found.. exiting\n", fileName); - exit(EXIT_WAIVED); + const char *fileName = "CT_skull_512x512_8u_Gray.raw"; + const char *InputFile = sdkFindFilePath(fileName, "."); + if (InputFile == NULL) { + printf("%s file not found.. exiting\n", fileName); + exit(EXIT_WAIVED); } bmpFile = fopen(InputFile, "rb"); } - else if (nImage == 2) - { - if (nWidth != 512 || nHeight != 512) + else if (nImage == 2) { + if (nWidth != 512 || nHeight != 512) return -1; - const char* fileName = "Rocks_512x512_8u_Gray.raw"; - const char* InputFile = sdkFindFilePath(fileName, "."); - if (InputFile == NULL) - { - printf("%s file not found.. exiting\n", fileName); - exit(EXIT_WAIVED); + const char *fileName = "Rocks_512x512_8u_Gray.raw"; + const char *InputFile = sdkFindFilePath(fileName, "."); + if (InputFile == NULL) { + printf("%s file not found.. 
exiting\n", fileName); + exit(EXIT_WAIVED); } bmpFile = fopen(InputFile, "rb"); } - else - { - printf ("Input file load failed.\n"); + else { + printf("Input file load failed.\n"); return -1; } - if (bmpFile == NULL) - { - printf ("Input file load failed.\n"); + if (bmpFile == NULL) { + printf("Input file load failed.\n"); return -1; } nSize = fread(pImage, 1, nWidth * nHeight, bmpFile); - if (nSize < nWidth * nHeight) - { - printf ("Input file load failed.\n"); - fclose(bmpFile); + if (nSize < nWidth * nHeight) { + printf("Input file load failed.\n"); + fclose(bmpFile); return -1; } fclose(bmpFile); - printf ("Input file load succeeded.\n"); + printf("Input file load succeeded.\n"); return 0; } -int -main( int argc, char** argv ) +int main(int argc, char **argv) { - size_t aSegmentationScratchBufferSize[NUMBER_OF_IMAGES]; - int aSegmentLabelsOutputBufferSize[NUMBER_OF_IMAGES]; + size_t aSegmentationScratchBufferSize[NUMBER_OF_IMAGES]; + int aSegmentLabelsOutputBufferSize[NUMBER_OF_IMAGES]; - cudaError_t cudaError; - NppStatus nppStatus; + cudaError_t cudaError; + NppStatus nppStatus; NppStreamContext nppStreamCtx; - FILE * bmpFile; - NppiNorm eNorm = nppiNormInf; // default to 8 way neighbor search + FILE *bmpFile; + NppiNorm eNorm = nppiNormInf; // default to 8 way neighbor search - for (int j = 0; j < NUMBER_OF_IMAGES; j++) - { - pInputImageDev[j] = 0; - pInputImageHost[j] = 0; - pSegmentationScratchBufferDev[j] = 0; - pSegmentLabelsOutputBufferDev[j] = 0; + for (int j = 0; j < NUMBER_OF_IMAGES; j++) { + pInputImageDev[j] = 0; + pInputImageHost[j] = 0; + pSegmentationScratchBufferDev[j] = 0; + pSegmentLabelsOutputBufferDev[j] = 0; pSegmentLabelsOutputBufferHost[j] = 0; - pSegmentsDev[j] = 0; - pSegmentsHost[j] = 0; + pSegmentsDev[j] = 0; + pSegmentsHost[j] = 0; } - nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. + nppStreamCtx.hStream = + 0; // The NULL stream by default, set this to whatever your stream ID is if not the NULL stream. 
cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId); - if (cudaError != cudaSuccess) - { + if (cudaError != cudaSuccess) { printf("CUDA error: no devices supporting CUDA.\n"); return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; } - const NppLibraryVersion *libVer = nppGetLibVersion(); + const NppLibraryVersion *libVer = nppGetLibVersion(); printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build); @@ -210,17 +201,17 @@ main( int argc, char** argv ) cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); - printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); - printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion/1000, (runtimeVersion%100)/10); + printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10); + printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, - cudaDevAttrComputeCapabilityMajor, + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor, + cudaDevAttrComputeCapabilityMajor, nppStreamCtx.nCudaDeviceId); if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; - cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, - cudaDevAttrComputeCapabilityMinor, + cudaError = cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor, + cudaDevAttrComputeCapabilityMinor, nppStreamCtx.nCudaDeviceId); if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY; @@ -231,79 +222,98 @@ main( int argc, char** argv ) cudaError = cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId); - nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; + nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount; nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor; - nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; - nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; + nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock; + nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock; NppiSize oSizeROI[NUMBER_OF_IMAGES]; - for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) - { - if (nImage == 0) - { - oSizeROI[nImage].width = 512; + for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { + if (nImage == 0) { + oSizeROI[nImage].width = 512; oSizeROI[nImage].height = 512; } - else if (nImage == 1) - { - oSizeROI[nImage].width = 512; + else if (nImage == 1) { + oSizeROI[nImage].width = 512; oSizeROI[nImage].height = 512; } - else if (nImage == 2) - { - oSizeROI[nImage].width = 512; + else if (nImage == 2) { + oSizeROI[nImage].width = 512; oSizeROI[nImage].height = 512; } // cudaMallocPitch OR cudaMalloc can be used here, in this sample case width == pitch. 
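To illustrate the comment above: the pitched alternative would look like the sketch below, after which nPitch (rather than width * sizeof(Npp8u)) must be passed wherever a step argument is expected. The sample keeps plain cudaMalloc because a 512-byte row needs no padding.

    // Hypothetical pitched allocation for one of the input images.
    Npp8u *pImageDev;
    size_t nPitch;
    cudaMallocPitch((void **)&pImageDev, &nPitch, oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height);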
- cudaError = cudaMalloc ((void**)&pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height); + cudaError = cudaMalloc((void **)&pInputImageDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - cudaError = cudaMalloc ((void**)&pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height); + cudaError = cudaMalloc((void **)&pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - pInputImageHost[nImage] = reinterpret_cast<Npp8u *>(malloc(oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height)); - pSegmentsHost[nImage] = reinterpret_cast<Npp8u *>(malloc(oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); + pInputImageHost[nImage] = + reinterpret_cast<Npp8u *>(malloc(oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height)); + pSegmentsHost[nImage] = + reinterpret_cast<Npp8u *>(malloc(oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); nppStatus = nppiSegmentWatershedGetBufferSize_8u_C1R(oSizeROI[nImage], &aSegmentationScratchBufferSize[nImage]); - cudaError = cudaMalloc ((void **)&pSegmentationScratchBufferDev[nImage], aSegmentationScratchBufferSize[nImage]); + cudaError = cudaMalloc((void **)&pSegmentationScratchBufferDev[nImage], aSegmentationScratchBufferSize[nImage]); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - // Output label marker buffers are only needed if you want to same the generated segmentation labels, they ARE compatible with NPP UF generated labels. - // Requesting segmentation output may slightly decrease segmentation function performance. Regardless of the pitch of the segmentation image - // the segment labels output buffer will have a pitch of oSizeROI[nImage].width * sizeof(Npp32u). + // Output label marker buffers are only needed if you want to save the generated segmentation labels, they ARE + // compatible with NPP UF generated labels. Requesting segmentation output may slightly decrease segmentation + // function performance. Regardless of the pitch of the segmentation image the segment labels output buffer + // will have a pitch of oSizeROI[nImage].width * sizeof(Npp32u).
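The note above also means the label output is optional. When only the segmented (or boundaries-only) image is wanted, the label arguments can simply be zero, exactly as the boundaries-only pass later in this file does:

    // Sketch: in-place watershed with no label output (label pointer and pitch zero).
    nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage],
                                                 oSizeROI[nImage].width * sizeof(Npp8u),
                                                 0, // no segment label output buffer
                                                 0, // so no label output pitch either
                                                 eNorm,
                                                 NPP_WATERSHED_SEGMENT_BOUNDARIES_ONLY,
                                                 oSizeROI[nImage],
                                                 pSegmentationScratchBufferDev[nImage],
                                                 nppStreamCtx);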
aSegmentLabelsOutputBufferSize[nImage] = oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height; - - cudaError = cudaMalloc ((void **)&pSegmentLabelsOutputBufferDev[nImage], aSegmentLabelsOutputBufferSize[nImage]); + + cudaError = cudaMalloc((void **)&pSegmentLabelsOutputBufferDev[nImage], aSegmentLabelsOutputBufferSize[nImage]); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; - pSegmentLabelsOutputBufferHost[nImage] = reinterpret_cast<Npp32u *>(malloc(oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); + pSegmentLabelsOutputBufferHost[nImage] = + reinterpret_cast<Npp32u *>(malloc(oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); - if (loadRaw8BitImage(pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, nImage) == 0) - { - cudaError = cudaMemcpy2DAsync(pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), pInputImageHost[nImage], - oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyHostToDevice, nppStreamCtx.hStream); + if (loadRaw8BitImage( + pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, nImage) + == 0) { + cudaError = cudaMemcpy2DAsync(pInputImageDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pInputImageHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyHostToDevice, + nppStreamCtx.hStream); - // Make a second copy of the unaltered input image since this function works in place and we want to reuse the input image multiple times. - cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), pInputImageHost[nImage], - oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyHostToDevice, nppStreamCtx.hStream); + // Make a second copy of the unaltered input image since this function works in place and we want to reuse + // the input image multiple times. + cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pInputImageHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyHostToDevice, + nppStreamCtx.hStream); - nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), - pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), eNorm, - NPP_WATERSHED_SEGMENT_BOUNDARIES_NONE, oSizeROI[nImage], pSegmentationScratchBufferDev[nImage], nppStreamCtx); + nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pSegmentLabelsOutputBufferDev[nImage], + oSizeROI[nImage].width * sizeof(Npp32u), + eNorm, + NPP_WATERSHED_SEGMENT_BOUNDARIES_NONE, + oSizeROI[nImage], + pSegmentationScratchBufferDev[nImage], + nppStreamCtx); - if (nppStatus != NPP_SUCCESS) - { + if (nppStatus != NPP_SUCCESS) { if (nImage == 0) printf("Lena segments 8Way 512x512 8u failed.\n"); else if (nImage == 1) @@ -315,26 +325,29 @@ main( int argc, char** argv ) } // Now compress the label markers output to make them easier to view.
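Label compression, coming up next, remaps the sparse 32-bit labels onto the dense range 1..nCompressedLabelCount so the values stay small enough to view as an image. The three-step recipe used below, condensed (pScratchDev is a hypothetical name; error checks elided):

    // 1. Query the scratch size for width * height pixels.
    int nScratchSize, nLabels = 0;
    Npp8u *pScratchDev;
    nppiCompressMarkerLabelsGetBufferSize_32u_C1R(oSizeROI[nImage].width * oSizeROI[nImage].height, &nScratchSize);
    // 2. Allocate the scratch buffer on the device.
    cudaMalloc((void **)&pScratchDev, nScratchSize);
    // 3. Compress in place, receiving the final label count.
    nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(pSegmentLabelsOutputBufferDev[nImage],
                                            oSizeROI[nImage].width * sizeof(Npp32u),
                                            oSizeROI[nImage],
                                            oSizeROI[nImage].width * oSizeROI[nImage].height,
                                            &nLabels,
                                            pScratchDev,
                                            nppStreamCtx);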
- int nCompressedLabelsScratchBufferSize; - Npp8u * pCompressedLabelsScratchBufferDev; + int nCompressedLabelsScratchBufferSize; + Npp8u *pCompressedLabelsScratchBufferDev; - nppStatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R(oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelsScratchBufferSize); + nppStatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R(oSizeROI[nImage].width * oSizeROI[nImage].height, + &nCompressedLabelsScratchBufferSize); if (nppStatus != NPP_NO_ERROR) return nppStatus; - cudaError = cudaMalloc ((void **)&pCompressedLabelsScratchBufferDev, nCompressedLabelsScratchBufferSize); + cudaError = cudaMalloc((void **)&pCompressedLabelsScratchBufferDev, nCompressedLabelsScratchBufferSize); if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; int nCompressedLabelCount = 0; - nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage], - oSizeROI[nImage].width * oSizeROI[nImage].height, &nCompressedLabelCount, + nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(pSegmentLabelsOutputBufferDev[nImage], + oSizeROI[nImage].width * sizeof(Npp32u), + oSizeROI[nImage], + oSizeROI[nImage].width * oSizeROI[nImage].height, + &nCompressedLabelCount, pCompressedLabelsScratchBufferDev, nppStreamCtx); - if (nppStatus != NPP_SUCCESS) - { + if (nppStatus != NPP_SUCCESS) { if (nImage == 0) printf("teapot_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n"); else if (nImage == 1) @@ -346,19 +359,28 @@ main( int argc, char** argv ) } // Copy segmented image to host - cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), - pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyDeviceToHost, nppStreamCtx.hStream); + cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyDeviceToHost, + nppStreamCtx.hStream); // Copy segment labels image to host - cudaError = cudaMemcpy2DAsync(pSegmentLabelsOutputBufferHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u), - pSegmentLabelsOutputBufferDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height, - cudaMemcpyDeviceToHost, nppStreamCtx.hStream); + cudaError = cudaMemcpy2DAsync(pSegmentLabelsOutputBufferHost[nImage], + oSizeROI[nImage].width * sizeof(Npp32u), + pSegmentLabelsOutputBufferDev[nImage], + oSizeROI[nImage].width * sizeof(Npp32u), + oSizeROI[nImage].width * sizeof(Npp32u), + oSizeROI[nImage].height, + cudaMemcpyDeviceToHost, + nppStreamCtx.hStream); // Wait host image read backs to complete, not necessary if no need to synchronize - if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) - { - printf ("Post segmentation cudaStreamSynchronize failed\n"); + if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) { + printf("Post segmentation cudaStreamSynchronize failed\n"); tearDown(); return -1; } @@ -374,12 +396,12 @@ main( int argc, char** argv ) else if (nImage == 2) bmpFile = fopen(SegmentsOutputFile2.c_str(), "wb"); - if (bmpFile == NULL) + if (bmpFile == NULL) return -1; size_t nSize = 0; - for (int j = 0; j < oSizeROI[nImage].height; j++) - { - nSize += fwrite(&pSegmentsHost[nImage][j * 
oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile); + for (int j = 0; j < oSizeROI[nImage].height; j++) { + nSize += fwrite( + &pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); @@ -398,12 +420,14 @@ main( int argc, char** argv ) else if (nImage == 2) bmpFile = fopen(CompressedSegmentLabelsOutputFile2.c_str(), "wb"); - if (bmpFile == NULL) + if (bmpFile == NULL) return -1; nSize = 0; - for (int j = 0; j < oSizeROI[nImage].height; j++) - { - nSize += fwrite(&pSegmentLabelsOutputBufferHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp32u), oSizeROI[nImage].width, bmpFile); + for (int j = 0; j < oSizeROI[nImage].height; j++) { + nSize += fwrite(&pSegmentLabelsOutputBufferHost[nImage][j * oSizeROI[nImage].width], + sizeof(Npp32u), + oSizeROI[nImage].width, + bmpFile); } fclose(bmpFile); @@ -416,18 +440,29 @@ main( int argc, char** argv ) // Now generate a segment boundaries only output image - // Make a second copy of the unaltered input image since this function works in place and we want to reuse the input image multiple times. - cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), pInputImageHost[nImage], - oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyHostToDevice, nppStreamCtx.hStream); + // Make a second copy of the unaltered input image since this function works in place and we want to reuse + // the input image multiple times. + cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pInputImageHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyHostToDevice, + nppStreamCtx.hStream); // We already generated segment labels images to skip that this time - nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), - 0, 0, eNorm, - NPP_WATERSHED_SEGMENT_BOUNDARIES_ONLY, oSizeROI[nImage], pSegmentationScratchBufferDev[nImage], nppStreamCtx); + nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + 0, + 0, + eNorm, + NPP_WATERSHED_SEGMENT_BOUNDARIES_ONLY, + oSizeROI[nImage], + pSegmentationScratchBufferDev[nImage], + nppStreamCtx); - if (nppStatus != NPP_SUCCESS) - { + if (nppStatus != NPP_SUCCESS) { if (nImage == 0) printf("Lena segment boundaries 8Way 512x512 8u failed.\n"); else if (nImage == 1) @@ -439,14 +474,18 @@ main( int argc, char** argv ) } // Copy segment boundaries image to host - cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), - pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyDeviceToHost, nppStreamCtx.hStream); + cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyDeviceToHost, + nppStreamCtx.hStream); // Wait host image read backs to complete, not necessary if no need to synchronize - if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) - { - printf ("Post segmentation cudaStreamSynchronize failed\n"); + if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) { + printf("Post 
segmentation cudaStreamSynchronize failed\n"); tearDown(); return -1; } @@ -458,12 +497,12 @@ main( int argc, char** argv ) else if (nImage == 2) bmpFile = fopen(SegmentBoundariesOutputFile2.c_str(), "wb"); - if (bmpFile == NULL) + if (bmpFile == NULL) return -1; nSize = 0; - for (int j = 0; j < oSizeROI[nImage].height; j++) - { - nSize += fwrite(&pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile); + for (int j = 0; j < oSizeROI[nImage].height; j++) { + nSize += fwrite( + &pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); @@ -476,18 +515,29 @@ main( int argc, char** argv ) // Now generate a segmented with contrasting boundaries output image - // Make a second copy of the unaltered input image since this function works in place and we want to reuse the input image multiple times. - cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), pInputImageHost[nImage], - oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyHostToDevice, nppStreamCtx.hStream); + // Make a second copy of the unaltered input image since this function works in place and we want to reuse + // the input image multiple times. + cudaError = cudaMemcpy2DAsync(pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pInputImageHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyHostToDevice, + nppStreamCtx.hStream); // We already generated segment labels images to skip that this time - nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), - 0, 0, eNorm, - NPP_WATERSHED_SEGMENT_BOUNDARIES_CONTRAST, oSizeROI[nImage], pSegmentationScratchBufferDev[nImage], nppStreamCtx); + nppStatus = nppiSegmentWatershed_8u_C1IR_Ctx(pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + 0, + 0, + eNorm, + NPP_WATERSHED_SEGMENT_BOUNDARIES_CONTRAST, + oSizeROI[nImage], + pSegmentationScratchBufferDev[nImage], + nppStreamCtx); - if (nppStatus != NPP_SUCCESS) - { + if (nppStatus != NPP_SUCCESS) { if (nImage == 0) printf("Lena segments with contrasting boundaries 8Way 512x512 8u failed.\n"); else if (nImage == 1) @@ -499,14 +549,18 @@ main( int argc, char** argv ) } // Copy segment boundaries image to host - cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u), - pSegmentsDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height, - cudaMemcpyDeviceToHost, nppStreamCtx.hStream); + cudaError = cudaMemcpy2DAsync(pSegmentsHost[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + pSegmentsDev[nImage], + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].width * sizeof(Npp8u), + oSizeROI[nImage].height, + cudaMemcpyDeviceToHost, + nppStreamCtx.hStream); // Wait host image read backs to complete, not necessary if no need to synchronize - if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) - { - printf ("Post segmentation cudaStreamSynchronize failed\n"); + if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) != cudaSuccess) { + printf("Post segmentation cudaStreamSynchronize failed\n"); tearDown(); return -1; } @@ -518,12 +572,12 @@ main( int argc, char** argv ) else if (nImage == 2) bmpFile = 
fopen(SegmentsWithContrastingBoundariesOutputFile2.c_str(), "wb"); - if (bmpFile == NULL) + if (bmpFile == NULL) return -1; nSize = 0; - for (int j = 0; j < oSizeROI[nImage].height; j++) - { - nSize += fwrite(&pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile); + for (int j = 0; j < oSizeROI[nImage].height; j++) { + nSize += fwrite( + &pSegmentsHost[nImage][j * oSizeROI[nImage].width], sizeof(Npp8u), oSizeROI[nImage].width, bmpFile); } fclose(bmpFile); diff --git a/Samples/5_Domain_Specific/BlackScholes/BlackScholes.cu b/Samples/5_Domain_Specific/BlackScholes/BlackScholes.cu index 0f4ecd44..e07cd618 100644 --- a/Samples/5_Domain_Specific/BlackScholes/BlackScholes.cu +++ b/Samples/5_Domain_Specific/BlackScholes/BlackScholes.cu @@ -31,16 +31,20 @@ * See supplied whitepaper for more explanations. */ -#include <helper_functions.h> // helper functions for string parsing -#include <helper_cuda.h> // helper functions CUDA error checking and initialization +#include <helper_cuda.h> // helper functions CUDA error checking and initialization +#include <helper_functions.h> // helper functions for string parsing //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on CPU //////////////////////////////////////////////////////////////////////////////// -extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, - float *h_StockPrice, float *h_OptionStrike, - float *h_OptionYears, float Riskfree, - float Volatility, int optN); +extern "C" void BlackScholesCPU(float *h_CallResult, + float *h_PutResult, + float *h_StockPrice, + float *h_OptionStrike, + float *h_OptionYears, + float Riskfree, + float Volatility, + int optN); //////////////////////////////////////////////////////////////////////////////// // Process an array of OptN options on GPU @@ -51,193 +55,192 @@ extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, // Helper function, returning uniformly distributed // random float in [low, high] range //////////////////////////////////////////////////////////////////////////////// -float RandFloat(float low, float high) { - float t = (float)rand() / (float)RAND_MAX; - return (1.0f - t) * low + t * high; +float RandFloat(float low, float high) +{ + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; } //////////////////////////////////////////////////////////////////////////////// // Data configuration //////////////////////////////////////////////////////////////////////////////// -const int OPT_N = 4000000; +const int OPT_N = 4000000; const int NUM_ITERATIONS = 512; -const int OPT_SZ = OPT_N * sizeof(float); -const float RISKFREE = 0.02f; +const int OPT_SZ = OPT_N * sizeof(float); +const float RISKFREE = 0.02f; const float VOLATILITY = 0.30f; -#define DIV_UP(a, b) (((a) + (b)-1) / (b)) +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // Start logs - printf("[%s] - Starting...\n", argv[0]); +int main(int argc, char **argv) +{ + // Start logs + printf("[%s] - Starting...\n", argv[0]); - //'h_' prefix - CPU (host) 
memory space + float + // Results calculated by CPU for reference + *h_CallResultCPU, + *h_PutResultCPU, + // CPU copy of GPU results + *h_CallResultGPU, *h_PutResultGPU, + // CPU instance of input data + *h_StockPrice, *h_OptionStrike, *h_OptionYears; - //'d_' prefix - GPU (device) memory space - float - // Results calculated by GPU - *d_CallResult, - *d_PutResult, - // GPU instance of input data - *d_StockPrice, *d_OptionStrike, *d_OptionYears; + //'d_' prefix - GPU (device) memory space + float + // Results calculated by GPU + *d_CallResult, + *d_PutResult, + // GPU instance of input data + *d_StockPrice, *d_OptionStrike, *d_OptionYears; - double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; + double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; - StopWatchInterface *hTimer = NULL; - int i; + StopWatchInterface *hTimer = NULL; + int i; - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Initializing data...\n"); - printf("...allocating CPU memory for options.\n"); - h_CallResultCPU = (float *)malloc(OPT_SZ); - h_PutResultCPU = (float *)malloc(OPT_SZ); - h_CallResultGPU = (float *)malloc(OPT_SZ); - h_PutResultGPU = (float *)malloc(OPT_SZ); - h_StockPrice = (float *)malloc(OPT_SZ); - h_OptionStrike = (float *)malloc(OPT_SZ); - h_OptionYears = (float *)malloc(OPT_SZ); + printf("Initializing data...\n"); + printf("...allocating CPU memory for options.\n"); + h_CallResultCPU = (float *)malloc(OPT_SZ); + h_PutResultCPU = (float *)malloc(OPT_SZ); + h_CallResultGPU = (float *)malloc(OPT_SZ); + h_PutResultGPU = (float *)malloc(OPT_SZ); + h_StockPrice = (float *)malloc(OPT_SZ); + h_OptionStrike = (float *)malloc(OPT_SZ); + h_OptionYears = (float *)malloc(OPT_SZ); - printf("...allocating GPU memory for options.\n"); - checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ)); + printf("...allocating GPU memory for options.\n"); + checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ)); - printf("...generating input data in CPU mem.\n"); - srand(5347); + printf("...generating input data in CPU mem.\n"); + srand(5347); - // Generate options set - for (i = 0; i < OPT_N; i++) { - h_CallResultCPU[i] = 0.0f; - h_PutResultCPU[i] = -1.0f; - h_StockPrice[i] = RandFloat(5.0f, 30.0f); - h_OptionStrike[i] = RandFloat(1.0f, 100.0f); - h_OptionYears[i] = RandFloat(0.25f, 10.0f); - } - - printf("...copying input data to GPU mem.\n"); - // Copy options data to GPU memory for further processing - checkCudaErrors( - cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, - cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice)); - printf("Data init done.\n\n"); - - printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", - NUM_ITERATIONS); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - 
sdkStartTimer(&hTimer); - - for (i = 0; i < NUM_ITERATIONS; i++) { - BlackScholesGPU<<<DIV_UP(OPT_N / 2, 128), 128>>>( - (float2 *)d_CallResult, (float2 *)d_PutResult, (float2 *)d_StockPrice, - (float2 *)d_OptionStrike, (float2 *)d_OptionYears, RISKFREE, VOLATILITY, - OPT_N); - getLastCudaError("BlackScholesGPU() execution failed\n"); - } - - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - - // Both call and put is calculated - printf("Options count : %i \n", 2 * OPT_N); - printf("BlackScholesGPU() time : %f msec\n", gpuTime); - printf("Effective memory bandwidth: %f GB/s\n", - ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); - printf("Gigaoptions per second : %f \n\n", - ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); - - printf( - "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " - "options, NumDevsUsed = %u, Workgroup = %u\n", - (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3, - (2 * OPT_N), 1, 128); - - printf("\nReading back GPU results...\n"); - // Read back GPU results to compare them to CPU results - checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, - cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost)); - - printf("Checking the results...\n"); - printf("...running CPU calculations.\n\n"); - // Calculate options values on CPU - BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, - h_OptionYears, RISKFREE, VOLATILITY, OPT_N); - - printf("Comparing the results...\n"); - // Calculate max absolute difference and L1 distance - // between CPU and GPU results - sum_delta = 0; - sum_ref = 0; - max_delta = 0; - - for (i = 0; i < OPT_N; i++) { - ref = h_CallResultCPU[i]; - delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); - - if (delta > max_delta) { - max_delta = delta; + // Generate options set + for (i = 0; i < OPT_N; i++) { + h_CallResultCPU[i] = 0.0f; + h_PutResultCPU[i] = -1.0f; + h_StockPrice[i] = RandFloat(5.0f, 30.0f); + h_OptionStrike[i] = RandFloat(1.0f, 100.0f); + h_OptionYears[i] = RandFloat(0.25f, 10.0f); } - sum_delta += delta; - sum_ref += fabs(ref); - } + printf("...copying input data to GPU mem.\n"); + // Copy options data to GPU memory for further processing + checkCudaErrors(cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice)); + printf("Data init done.\n\n"); - L1norm = sum_delta / sum_ref; - printf("L1 norm: %E\n", L1norm); - printf("Max absolute error: %E\n\n", max_delta); + printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - printf("Shutting down...\n"); - printf("...releasing GPU memory.\n"); - checkCudaErrors(cudaFree(d_OptionYears)); - checkCudaErrors(cudaFree(d_OptionStrike)); - checkCudaErrors(cudaFree(d_StockPrice)); - checkCudaErrors(cudaFree(d_PutResult)); - checkCudaErrors(cudaFree(d_CallResult)); + for (i = 0; i < NUM_ITERATIONS; i++) { + BlackScholesGPU<<<DIV_UP(OPT_N / 2, 128), 128>>>((float2 *)d_CallResult, + (float2 *)d_PutResult, + (float2 *)d_StockPrice, + (float2 *)d_OptionStrike, + (float2 *)d_OptionYears, + RISKFREE, + VOLATILITY, + OPT_N); + getLastCudaError("BlackScholesGPU() execution failed\n"); + } - 
printf("...releasing CPU memory.\n"); - free(h_OptionYears); - free(h_OptionStrike); - free(h_StockPrice); - free(h_PutResultGPU); - free(h_CallResultGPU); - free(h_PutResultCPU); - free(h_CallResultCPU); - sdkDeleteTimer(&hTimer); - printf("Shutdown done.\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - printf("\n[BlackScholes] - Test Summary\n"); + // Both call and put is calculated + printf("Options count : %i \n", 2 * OPT_N); + printf("BlackScholesGPU() time : %f msec\n", gpuTime); + printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); + printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); - if (L1norm > 1e-6) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " + "options, NumDevsUsed = %u, Workgroup = %u\n", + (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), + gpuTime * 1e-3, + (2 * OPT_N), + 1, + 128); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("\nReading back GPU results...\n"); + // Read back GPU results to compare them to CPU results + checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost)); + + printf("Checking the results...\n"); + printf("...running CPU calculations.\n\n"); + // Calculate options values on CPU + BlackScholesCPU( + h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, h_OptionYears, RISKFREE, VOLATILITY, OPT_N); + + printf("Comparing the results...\n"); + // Calculate max absolute difference and L1 distance + // between CPU and GPU results + sum_delta = 0; + sum_ref = 0; + max_delta = 0; + + for (i = 0; i < OPT_N; i++) { + ref = h_CallResultCPU[i]; + delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); + + if (delta > max_delta) { + max_delta = delta; + } + + sum_delta += delta; + sum_ref += fabs(ref); + } + + L1norm = sum_delta / sum_ref; + printf("L1 norm: %E\n", L1norm); + printf("Max absolute error: %E\n\n", max_delta); + + printf("Shutting down...\n"); + printf("...releasing GPU memory.\n"); + checkCudaErrors(cudaFree(d_OptionYears)); + checkCudaErrors(cudaFree(d_OptionStrike)); + checkCudaErrors(cudaFree(d_StockPrice)); + checkCudaErrors(cudaFree(d_PutResult)); + checkCudaErrors(cudaFree(d_CallResult)); + + printf("...releasing CPU memory.\n"); + free(h_OptionYears); + free(h_OptionStrike); + free(h_StockPrice); + free(h_PutResultGPU); + free(h_CallResultGPU); + free(h_PutResultCPU); + free(h_CallResultCPU); + sdkDeleteTimer(&hTimer); + printf("Shutdown done.\n"); + + printf("\n[BlackScholes] - Test Summary\n"); + + if (L1norm > 1e-6) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/BlackScholes/BlackScholes_gold.cpp b/Samples/5_Domain_Specific/BlackScholes/BlackScholes_gold.cpp index 957331ee..41356b30 100644 --- a/Samples/5_Domain_Specific/BlackScholes/BlackScholes_gold.cpp +++ b/Samples/5_Domain_Specific/BlackScholes/BlackScholes_gold.cpp @@ -30,57 +30,69 @@ //////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function //////////////////////////////////////////////////////////////////////////////// -static double CND(double d) { - const double A1 = 0.31938153; - const double A2 = -0.356563782; - const double A3 = 1.781477937; - const double A4 = -1.821255978; - const double A5 = 1.330274429; - const double RSQRT2PI = 0.39894228040143267793994605993438; +static double CND(double d) +{ + const double A1 = 0.31938153; + const double A2 = -0.356563782; + const double A3 = 1.781477937; + const double A4 = -1.821255978; + const double A5 = 1.330274429; + const double RSQRT2PI = 0.39894228040143267793994605993438; - double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); + double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); - double cnd = RSQRT2PI * exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + double cnd = RSQRT2PI * exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0 - cnd; + if (d > 0) + cnd = 1.0 - cnd; - return cnd; + return cnd; } //////////////////////////////////////////////////////////////////////////////// // Black-Scholes formula for both call and put //////////////////////////////////////////////////////////////////////////////// -static void BlackScholesBodyCPU(float &callResult, float &putResult, - float Sf, // Stock price - float Xf, // Option strike - float Tf, // Option years - float Rf, // Riskless rate - float Vf // Volatility rate - ) { - double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; +static void BlackScholesBodyCPU(float &callResult, + float &putResult, + float Sf, // Stock price + float Xf, // Option strike + float Tf, // Option years + float Rf, // Riskless rate + float Vf // Volatility rate +) +{ + double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; - double sqrtT = sqrt(T); - double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); - double d2 = d1 - V * sqrtT; - double CNDD1 = CND(d1); - double CNDD2 = CND(d2); + double sqrtT = sqrt(T); + double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; + double CNDD1 = CND(d1); + double CNDD2 = CND(d2); - // Calculate Call and Put simultaneously - double expRT = exp(-R * T); - callResult = (float)(S * CNDD1 - X * expRT * CNDD2); - putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); + // Calculate Call and Put simultaneously + double expRT = exp(-R * T); + callResult = (float)(S * CNDD1 - X * expRT * CNDD2); + putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); } //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options //////////////////////////////////////////////////////////////////////////////// -extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, - float *h_StockPrice, float *h_OptionStrike, - float *h_OptionYears, float Riskfree, - float Volatility, int optN) { - for (int opt = 0; opt < optN; opt++) - BlackScholesBodyCPU(h_CallResult[opt], h_PutResult[opt], 
h_StockPrice[opt], - h_OptionStrike[opt], h_OptionYears[opt], Riskfree, - Volatility); +extern "C" void BlackScholesCPU(float *h_CallResult, + float *h_PutResult, + float *h_StockPrice, + float *h_OptionStrike, + float *h_OptionYears, + float Riskfree, + float Volatility, + int optN) +{ + for (int opt = 0; opt < optN; opt++) + BlackScholesBodyCPU(h_CallResult[opt], + h_PutResult[opt], + h_StockPrice[opt], + h_OptionStrike[opt], + h_OptionYears[opt], + Riskfree, + Volatility); } diff --git a/Samples/5_Domain_Specific/BlackScholes/BlackScholes_kernel.cuh b/Samples/5_Domain_Specific/BlackScholes/BlackScholes_kernel.cuh index 9c49d49e..fa627f91 100644 --- a/Samples/5_Domain_Specific/BlackScholes/BlackScholes_kernel.cuh +++ b/Samples/5_Domain_Specific/BlackScholes/BlackScholes_kernel.cuh @@ -28,79 +28,92 @@ //////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function //////////////////////////////////////////////////////////////////////////////// -__device__ inline float cndGPU(float d) { - const float A1 = 0.31938153f; - const float A2 = -0.356563782f; - const float A3 = 1.781477937f; - const float A4 = -1.821255978f; - const float A5 = 1.330274429f; - const float RSQRT2PI = 0.39894228040143267793994605993438f; +__device__ inline float cndGPU(float d) +{ + const float A1 = 0.31938153f; + const float A2 = -0.356563782f; + const float A3 = 1.781477937f; + const float A4 = -1.821255978f; + const float A5 = 1.330274429f; + const float RSQRT2PI = 0.39894228040143267793994605993438f; - float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); - float cnd = RSQRT2PI * __expf(-0.5f * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + float cnd = RSQRT2PI * __expf(-0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0f - cnd; + if (d > 0) + cnd = 1.0f - cnd; - return cnd; + return cnd; } //////////////////////////////////////////////////////////////////////////////// // Black-Scholes formula for both call and put //////////////////////////////////////////////////////////////////////////////// -__device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult, - float S, // Stock price - float X, // Option strike - float T, // Option years - float R, // Riskless rate - float V // Volatility rate - ) { - float sqrtT, expRT; - float d1, d2, CNDD1, CNDD2; +__device__ inline void BlackScholesBodyGPU(float &CallResult, + float &PutResult, + float S, // Stock price + float X, // Option strike + float T, // Option years + float R, // Riskless rate + float V // Volatility rate +) +{ + float sqrtT, expRT; + float d1, d2, CNDD1, CNDD2; - sqrtT = __fdividef(1.0F, rsqrtf(T)); - d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); - d2 = d1 - V * sqrtT; + sqrtT = __fdividef(1.0F, rsqrtf(T)); + d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + d2 = d1 - V * sqrtT; - CNDD1 = cndGPU(d1); - CNDD2 = cndGPU(d2); + CNDD1 = cndGPU(d1); + CNDD2 = cndGPU(d2); - // Calculate Call and Put simultaneously - expRT = __expf(-R * T); - CallResult = S * CNDD1 - X * expRT * CNDD2; - PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); + // Calculate Call and Put simultaneously + expRT = __expf(-R * T); + CallResult = S * CNDD1 - X * expRT * CNDD2; + PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); } 
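[Editor's note] For readers checking the reflowed arithmetic rather than the formatting: BlackScholesBodyGPU above (and its CPU twin in BlackScholes_gold.cpp) is the textbook Black-Scholes closed form. Writing S for the stock price, X for the strike, T for maturity in years, r for the riskless rate, sigma for volatility, and N(.) for the standard normal CDF:

    d_1 = \frac{\ln(S/X) + (r + \sigma^2/2)\,T}{\sigma\sqrt{T}}, \qquad
    d_2 = d_1 - \sigma\sqrt{T}, \qquad
    \text{Call} = S\,N(d_1) - X e^{-rT} N(d_2), \qquad
    \text{Put} = X e^{-rT}\bigl(1 - N(d_2)\bigr) - S\bigl(1 - N(d_1)\bigr).

The device code computes sqrt(T) as __fdividef(1.0F, rsqrtf(T)), approximates N(.) with the five-term polynomial in cndGPU, and uses the identity N(-d) = 1 - N(d) for the put, exactly as the double-precision CPU reference does.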
//////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on GPU //////////////////////////////////////////////////////////////////////////////// -__launch_bounds__(128) __global__ - void BlackScholesGPU(float2 *__restrict d_CallResult, - float2 *__restrict d_PutResult, - float2 *__restrict d_StockPrice, - float2 *__restrict d_OptionStrike, - float2 *__restrict d_OptionYears, float Riskfree, - float Volatility, int optN) { - ////Thread index - // const int tid = blockDim.x * blockIdx.x + threadIdx.x; - ////Total number of threads in execution grid - // const int THREAD_N = blockDim.x * gridDim.x; +__launch_bounds__(128) __global__ void BlackScholesGPU(float2 *__restrict d_CallResult, + float2 *__restrict d_PutResult, + float2 *__restrict d_StockPrice, + float2 *__restrict d_OptionStrike, + float2 *__restrict d_OptionYears, + float Riskfree, + float Volatility, + int optN) +{ + ////Thread index + // const int tid = blockDim.x * blockIdx.x + threadIdx.x; + ////Total number of threads in execution grid + // const int THREAD_N = blockDim.x * gridDim.x; - const int opt = blockDim.x * blockIdx.x + threadIdx.x; + const int opt = blockDim.x * blockIdx.x + threadIdx.x; - // Calculating 2 options per thread to increase ILP (instruction level - // parallelism) - if (opt < (optN / 2)) { - float callResult1, callResult2; - float putResult1, putResult2; - BlackScholesBodyGPU(callResult1, putResult1, d_StockPrice[opt].x, - d_OptionStrike[opt].x, d_OptionYears[opt].x, Riskfree, - Volatility); - BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y, - d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree, - Volatility); - d_CallResult[opt] = make_float2(callResult1, callResult2); - d_PutResult[opt] = make_float2(putResult1, putResult2); - } + // Calculating 2 options per thread to increase ILP (instruction level + // parallelism) + if (opt < (optN / 2)) { + float callResult1, callResult2; + float putResult1, putResult2; + BlackScholesBodyGPU(callResult1, + putResult1, + d_StockPrice[opt].x, + d_OptionStrike[opt].x, + d_OptionYears[opt].x, + Riskfree, + Volatility); + BlackScholesBodyGPU(callResult2, + putResult2, + d_StockPrice[opt].y, + d_OptionStrike[opt].y, + d_OptionYears[opt].y, + Riskfree, + Volatility); + d_CallResult[opt] = make_float2(callResult1, callResult2); + d_PutResult[opt] = make_float2(putResult1, putResult2); + } } diff --git a/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes.cpp b/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes.cpp index 3f62adcd..bd538c87 100644 --- a/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes.cpp +++ b/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes.cpp @@ -32,18 +32,21 @@ */ #include +#include <helper_functions.h> // helper functions for string parsing #include -#include <helper_functions.h> // helper functions for string parsing //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on CPU //////////////////////////////////////////////////////////////////////////////// -extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, - float *h_StockPrice, float *h_OptionStrike, - float *h_OptionYears, float Riskfree, - float Volatility, int optN); +extern "C" void BlackScholesCPU(float *h_CallResult, + float *h_PutResult, + float *h_StockPrice, + float *h_OptionStrike, + float *h_OptionYears, + float Riskfree, + float Volatility, + int optN); //////////////////////////////////////////////////////////////////////////////// // 
Process an array of OptN options on GPU @@ -54,216 +57,225 @@ extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, // random float in [low, high] range //////////////////////////////////////////////////////////////////////////////// -float RandFloat(float low, float high) { - float t = (float)rand() / (float)RAND_MAX; - return (1.0f - t) * low + t * high; +float RandFloat(float low, float high) +{ + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; } //////////////////////////////////////////////////////////////////////////////// // Data configuration //////////////////////////////////////////////////////////////////////////////// -const int OPT_N = 4000000; -const int NUM_ITERATIONS = 512; -const int OPT_SZ = OPT_N * sizeof(float); -const float RISKFREE = 0.02f; -const float VOLATILITY = 0.30f; +const int OPT_N = 4000000; +const int NUM_ITERATIONS = 512; +const int OPT_SZ = OPT_N * sizeof(float); +const float RISKFREE = 0.02f; +const float VOLATILITY = 0.30f; -#define DIV_UP(a, b) (((a) + (b)-1) / (b)) +#define DIV_UP(a, b) (((a) + (b) - 1) / (b)) //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // Start logs - printf("[%s] - Starting...\n", argv[0]); +int main(int argc, char **argv) +{ + // Start logs + printf("[%s] - Starting...\n", argv[0]); - //'h_' prefix - CPU (host) memory space - float - // Results calculated by CPU for reference - *h_CallResultCPU, - *h_PutResultCPU, - // CPU copy of GPU results - *h_CallResultGPU, *h_PutResultGPU, - // CPU instance of input data - *h_StockPrice, *h_OptionStrike, *h_OptionYears; + //'h_' prefix - CPU (host) memory space + float + // Results calculated by CPU for reference + *h_CallResultCPU, + *h_PutResultCPU, + // CPU copy of GPU results + *h_CallResultGPU, *h_PutResultGPU, + // CPU instance of input data + *h_StockPrice, *h_OptionStrike, *h_OptionYears; - //'d_' prefix - GPU (device) memory space - CUdeviceptr - // Results calculated by GPU - d_CallResult, - d_PutResult, + //'d_' prefix - GPU (device) memory space + CUdeviceptr + // Results calculated by GPU + d_CallResult, + d_PutResult, - // GPU instance of input data - d_StockPrice, d_OptionStrike, d_OptionYears; + // GPU instance of input data + d_StockPrice, d_OptionStrike, d_OptionYears; - double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; + double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; - StopWatchInterface *hTimer = NULL; - int i; + StopWatchInterface *hTimer = NULL; + int i; - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Initializing data...\n"); - printf("...allocating CPU memory for options.\n"); + printf("Initializing data...\n"); + printf("...allocating CPU memory for options.\n"); - h_CallResultCPU = (float *)malloc(OPT_SZ); - h_PutResultCPU = (float *)malloc(OPT_SZ); - h_CallResultGPU = (float *)malloc(OPT_SZ); - h_PutResultGPU = (float *)malloc(OPT_SZ); - h_StockPrice = (float *)malloc(OPT_SZ); - h_OptionStrike = (float *)malloc(OPT_SZ); - h_OptionYears = (float *)malloc(OPT_SZ); + h_CallResultCPU = (float *)malloc(OPT_SZ); + h_PutResultCPU = (float *)malloc(OPT_SZ); + h_CallResultGPU = (float *)malloc(OPT_SZ); + h_PutResultGPU = (float *)malloc(OPT_SZ); + h_StockPrice = (float *)malloc(OPT_SZ); + h_OptionStrike = (float *)malloc(OPT_SZ); + h_OptionYears = (float *)malloc(OPT_SZ); - char *cubin, *kernel_file; - 
size_t cubinSize; - kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]); + char *cubin, *kernel_file; + size_t cubinSize; + kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]); - // Compile the kernel BlackScholes_kernel. - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - CUmodule module = loadCUBIN(cubin, argc, argv); + // Compile the kernel BlackScholes_kernel. + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + CUmodule module = loadCUBIN(cubin, argc, argv); - CUfunction kernel_addr; - checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU")); + CUfunction kernel_addr; + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU")); - printf("...allocating GPU memory for options.\n"); - checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ)); - checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ)); - checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ)); - checkCudaErrors(cuMemAlloc(&d_OptionStrike, OPT_SZ)); - checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ)); + printf("...allocating GPU memory for options.\n"); + checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ)); + checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ)); + checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ)); + checkCudaErrors(cuMemAlloc(&d_OptionStrike, OPT_SZ)); + checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ)); - printf("...generating input data in CPU mem.\n"); - srand(5347); + printf("...generating input data in CPU mem.\n"); + srand(5347); - // Generate options set - for (i = 0; i < OPT_N; i++) { - h_CallResultCPU[i] = 0.0f; - h_PutResultCPU[i] = -1.0f; - h_StockPrice[i] = RandFloat(5.0f, 30.0f); - h_OptionStrike[i] = RandFloat(1.0f, 100.0f); - h_OptionYears[i] = RandFloat(0.25f, 10.0f); - } - - printf("...copying input data to GPU mem.\n"); - // Copy options data to GPU memory for further processing - checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ)); - checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ)); - checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ)); - - printf("Data init done.\n\n"); - printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", - NUM_ITERATIONS); - - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - - dim3 cudaBlockSize(128, 1, 1); - dim3 cudaGridSize(DIV_UP(OPT_N / 2, 128), 1, 1); - - float risk = RISKFREE; - float volatility = VOLATILITY; - int optval = OPT_N; - - void *arr[] = {(void *)&d_CallResult, (void *)&d_PutResult, - (void *)&d_StockPrice, (void *)&d_OptionStrike, - (void *)&d_OptionYears, (void *)&risk, - (void *)&volatility, (void *)&optval}; - - for (i = 0; i < NUM_ITERATIONS; i++) { - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - cudaBlockSize.x, cudaBlockSize.y, - cudaBlockSize.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &arr[0], /* arguments */ - 0)); - } - - checkCudaErrors(cuCtxSynchronize()); - - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - - // Both call and put is calculated - printf("Options count : %i \n", 2 * OPT_N); - printf("BlackScholesGPU() time : %f msec\n", gpuTime); - printf("Effective memory bandwidth: %f GB/s\n", - ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); - printf("Gigaoptions per second : %f \n\n", - ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); - printf( - "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " - "options, NumDevsUsed = %u, Workgroup 
= %u\n", - (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3, - (2 * OPT_N), 1, 128); - - printf("\nReading back GPU results...\n"); - - // Read back GPU results to compare them to CPU results - checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ)); - checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ)); - - printf("Checking the results...\n"); - printf("...running CPU calculations.\n\n"); - - // Calculate options values on CPU - BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, - h_OptionYears, RISKFREE, VOLATILITY, OPT_N); - - printf("Comparing the results...\n"); - // Calculate max absolute difference and L1 distance - // between CPU and GPU results - sum_delta = 0; - sum_ref = 0; - max_delta = 0; - - for (i = 0; i < OPT_N; i++) { - ref = h_CallResultCPU[i]; - delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); - - if (delta > max_delta) { - max_delta = delta; + // Generate options set + for (i = 0; i < OPT_N; i++) { + h_CallResultCPU[i] = 0.0f; + h_PutResultCPU[i] = -1.0f; + h_StockPrice[i] = RandFloat(5.0f, 30.0f); + h_OptionStrike[i] = RandFloat(1.0f, 100.0f); + h_OptionYears[i] = RandFloat(0.25f, 10.0f); } - sum_delta += delta; - sum_ref += fabs(ref); - } + printf("...copying input data to GPU mem.\n"); + // Copy options data to GPU memory for further processing + checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ)); + checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ)); + checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ)); - L1norm = sum_delta / sum_ref; - printf("L1 norm: %E\n", L1norm); - printf("Max absolute error: %E\n\n", max_delta); + printf("Data init done.\n\n"); + printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); - printf("Shutting down...\n"); - printf("...releasing GPU memory.\n"); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - checkCudaErrors(cuMemFree(d_OptionYears)); - checkCudaErrors(cuMemFree(d_OptionStrike)); - checkCudaErrors(cuMemFree(d_StockPrice)); - checkCudaErrors(cuMemFree(d_PutResult)); - checkCudaErrors(cuMemFree(d_CallResult)); + dim3 cudaBlockSize(128, 1, 1); + dim3 cudaGridSize(DIV_UP(OPT_N / 2, 128), 1, 1); - printf("...releasing CPU memory.\n"); + float risk = RISKFREE; + float volatility = VOLATILITY; + int optval = OPT_N; - free(h_OptionYears); - free(h_OptionStrike); - free(h_StockPrice); - free(h_PutResultGPU); - free(h_CallResultGPU); - free(h_PutResultCPU); - free(h_CallResultCPU); + void *arr[] = {(void *)&d_CallResult, + (void *)&d_PutResult, + (void *)&d_StockPrice, + (void *)&d_OptionStrike, + (void *)&d_OptionYears, + (void *)&risk, + (void *)&volatility, + (void *)&optval}; - sdkDeleteTimer(&hTimer); - printf("Shutdown done.\n"); + for (i = 0; i < NUM_ITERATIONS; i++) { + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, + cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &arr[0], /* arguments */ + 0)); + } - printf("\n[%s] - Test Summary\n", argv[0]); + checkCudaErrors(cuCtxSynchronize()); - if (L1norm > 1e-6) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - printf("Test passed\n"); - exit(EXIT_SUCCESS); + // Both call and put is calculated + printf("Options count : %i \n", 2 * OPT_N); + printf("BlackScholesGPU() time : %f msec\n", gpuTime); + printf("Effective 
memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); + printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); + printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u " + "options, NumDevsUsed = %u, Workgroup = %u\n", + (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), + gpuTime * 1e-3, + (2 * OPT_N), + 1, + 128); + + printf("\nReading back GPU results...\n"); + + // Read back GPU results to compare them to CPU results + checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ)); + checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ)); + + printf("Checking the results...\n"); + printf("...running CPU calculations.\n\n"); + + // Calculate options values on CPU + BlackScholesCPU( + h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, h_OptionYears, RISKFREE, VOLATILITY, OPT_N); + + printf("Comparing the results...\n"); + // Calculate max absolute difference and L1 distance + // between CPU and GPU results + sum_delta = 0; + sum_ref = 0; + max_delta = 0; + + for (i = 0; i < OPT_N; i++) { + ref = h_CallResultCPU[i]; + delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); + + if (delta > max_delta) { + max_delta = delta; + } + + sum_delta += delta; + sum_ref += fabs(ref); + } + + L1norm = sum_delta / sum_ref; + printf("L1 norm: %E\n", L1norm); + printf("Max absolute error: %E\n\n", max_delta); + + printf("Shutting down...\n"); + printf("...releasing GPU memory.\n"); + + checkCudaErrors(cuMemFree(d_OptionYears)); + checkCudaErrors(cuMemFree(d_OptionStrike)); + checkCudaErrors(cuMemFree(d_StockPrice)); + checkCudaErrors(cuMemFree(d_PutResult)); + checkCudaErrors(cuMemFree(d_CallResult)); + + printf("...releasing CPU memory.\n"); + + free(h_OptionYears); + free(h_OptionStrike); + free(h_StockPrice); + free(h_PutResultGPU); + free(h_CallResultGPU); + free(h_PutResultCPU); + free(h_CallResultCPU); + + sdkDeleteTimer(&hTimer); + printf("Shutdown done.\n"); + + printf("\n[%s] - Test Summary\n", argv[0]); + + if (L1norm > 1e-6) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_gold.cpp b/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_gold.cpp index 63a26a63..ea76cabe 100644 --- a/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_gold.cpp +++ b/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_gold.cpp @@ -31,58 +31,70 @@ // Polynomial approximation of cumulative normal distribution function /////////////////////////////////////////////////////////////////////////////// -static double CND(double d) { - const double A1 = 0.31938153; - const double A2 = -0.356563782; - const double A3 = 1.781477937; - const double A4 = -1.821255978; - const double A5 = 1.330274429; - const double RSQRT2PI = 0.39894228040143267793994605993438; +static double CND(double d) +{ + const double A1 = 0.31938153; + const double A2 = -0.356563782; + const double A3 = 1.781477937; + const double A4 = -1.821255978; + const double A5 = 1.330274429; + const double RSQRT2PI = 0.39894228040143267793994605993438; - double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); + double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); - double cnd = RSQRT2PI * exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + double cnd = RSQRT2PI * exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0 - cnd; + if (d 
> 0) + cnd = 1.0 - cnd; - return cnd; + return cnd; } /////////////////////////////////////////////////////////////////////////////// // Black-Scholes formula for both call and put /////////////////////////////////////////////////////////////////////////////// -static void BlackScholesBodyCPU(float &callResult, float &putResult, - float Sf, // Stock price - float Xf, // Option strike - float Tf, // Option years - float Rf, // Riskless rate - float Vf // Volatility rate - ) { - double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; - double sqrtT = sqrt(T); - double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); - double d2 = d1 - V * sqrtT; - double CNDD1 = CND(d1); - double CNDD2 = CND(d2); +static void BlackScholesBodyCPU(float &callResult, + float &putResult, + float Sf, // Stock price + float Xf, // Option strike + float Tf, // Option years + float Rf, // Riskless rate + float Vf // Volatility rate +) +{ + double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; + double sqrtT = sqrt(T); + double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; + double CNDD1 = CND(d1); + double CNDD2 = CND(d2); - // Calculate Call and Put simultaneously - double expRT = exp(-R * T); + // Calculate Call and Put simultaneously + double expRT = exp(-R * T); - callResult = (float)(S * CNDD1 - X * expRT * CNDD2); - putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); + callResult = (float)(S * CNDD1 - X * expRT * CNDD2); + putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); } //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options //////////////////////////////////////////////////////////////////////////////// -extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, - float *h_StockPrice, float *h_OptionStrike, - float *h_OptionYears, float Riskfree, - float Volatility, int optN) { - for (int opt = 0; opt < optN; opt++) - BlackScholesBodyCPU(h_CallResult[opt], h_PutResult[opt], h_StockPrice[opt], - h_OptionStrike[opt], h_OptionYears[opt], Riskfree, - Volatility); +extern "C" void BlackScholesCPU(float *h_CallResult, + float *h_PutResult, + float *h_StockPrice, + float *h_OptionStrike, + float *h_OptionYears, + float Riskfree, + float Volatility, + int optN) +{ + for (int opt = 0; opt < optN; opt++) + BlackScholesBodyCPU(h_CallResult[opt], + h_PutResult[opt], + h_StockPrice[opt], + h_OptionStrike[opt], + h_OptionYears[opt], + Riskfree, + Volatility); } diff --git a/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_kernel.cuh b/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_kernel.cuh index 6da11761..8d67b7d2 100644 --- a/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_kernel.cuh +++ b/Samples/5_Domain_Specific/BlackScholes_nvrtc/BlackScholes_kernel.cuh @@ -29,75 +29,88 @@ // Polynomial approximation of cumulative normal distribution function /////////////////////////////////////////////////////////////////////////////// -__device__ inline float cndGPU(float d) { - const float A1 = 0.31938153f; - const float A2 = -0.356563782f; - const float A3 = 1.781477937f; - const float A4 = -1.821255978f; - const float A5 = 1.330274429f; - const float RSQRT2PI = 0.39894228040143267793994605993438f; +__device__ inline float cndGPU(float d) +{ + const float A1 = 0.31938153f; + const float A2 = -0.356563782f; + const float A3 = 1.781477937f; + const float A4 = -1.821255978f; + const float A5 = 1.330274429f; + const float RSQRT2PI = 
0.39894228040143267793994605993438f; - float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); - float cnd = RSQRT2PI * __expf(-0.5f * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + float cnd = RSQRT2PI * __expf(-0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0f - cnd; + if (d > 0) + cnd = 1.0f - cnd; - return cnd; + return cnd; } /////////////////////////////////////////////////////////////////////////////// // Black-Scholes formula for both call and put /////////////////////////////////////////////////////////////////////////////// -__device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult, - float S, // Stock price - float X, // Option strike - float T, // Option years - float R, // Riskless rate - float V // Volatility rate - ) { - float sqrtT, expRT; - float d1, d2, CNDD1, CNDD2; +__device__ inline void BlackScholesBodyGPU(float &CallResult, + float &PutResult, + float S, // Stock price + float X, // Option strike + float T, // Option years + float R, // Riskless rate + float V // Volatility rate +) +{ + float sqrtT, expRT; + float d1, d2, CNDD1, CNDD2; - sqrtT = __fdividef(1.0F, rsqrtf(T)); - d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); - d2 = d1 - V * sqrtT; + sqrtT = __fdividef(1.0F, rsqrtf(T)); + d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + d2 = d1 - V * sqrtT; - CNDD1 = cndGPU(d1); - CNDD2 = cndGPU(d2); + CNDD1 = cndGPU(d1); + CNDD2 = cndGPU(d2); - // Calculate Call and Put simultaneously - expRT = __expf(-R * T); - CallResult = S * CNDD1 - X * expRT * CNDD2; - PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); + // Calculate Call and Put simultaneously + expRT = __expf(-R * T); + CallResult = S * CNDD1 - X * expRT * CNDD2; + PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); } //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on GPU //////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(128) __global__ - void BlackScholesGPU(float2 *__restrict d_CallResult, - float2 *__restrict d_PutResult, - float2 *__restrict d_StockPrice, - float2 *__restrict d_OptionStrike, - float2 *__restrict d_OptionYears, float Riskfree, - float Volatility, int optN) { - ////Thread index - const int opt = blockDim.x * blockIdx.x + threadIdx.x; +extern "C" __launch_bounds__(128) __global__ void BlackScholesGPU(float2 *__restrict d_CallResult, + float2 *__restrict d_PutResult, + float2 *__restrict d_StockPrice, + float2 *__restrict d_OptionStrike, + float2 *__restrict d_OptionYears, + float Riskfree, + float Volatility, + int optN) +{ + ////Thread index + const int opt = blockDim.x * blockIdx.x + threadIdx.x; - // Calculating 2 options per thread to increase ILP (instruction level - // parallelism) - if (opt < (optN / 2)) { - float callResult1, callResult2; - float putResult1, putResult2; - BlackScholesBodyGPU(callResult1, putResult1, d_StockPrice[opt].x, - d_OptionStrike[opt].x, d_OptionYears[opt].x, Riskfree, - Volatility); - BlackScholesBodyGPU(callResult2, putResult2, d_StockPrice[opt].y, - d_OptionStrike[opt].y, d_OptionYears[opt].y, Riskfree, - Volatility); - d_CallResult[opt] = make_float2(callResult1, callResult2); - d_PutResult[opt] = make_float2(putResult1, putResult2); - } + // Calculating 2 options per thread to increase ILP (instruction level + // 
parallelism) + if (opt < (optN / 2)) { + float callResult1, callResult2; + float putResult1, putResult2; + BlackScholesBodyGPU(callResult1, + putResult1, + d_StockPrice[opt].x, + d_OptionStrike[opt].x, + d_OptionYears[opt].x, + Riskfree, + Volatility); + BlackScholesBodyGPU(callResult2, + putResult2, + d_StockPrice[opt].y, + d_OptionStrike[opt].y, + d_OptionYears[opt].y, + Riskfree, + Volatility); + d_CallResult[opt] = make_float2(callResult1, callResult2); + d_PutResult[opt] = make_float2(putResult1, putResult2); + } } diff --git a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3d.h b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3d.h index b288c877..8057be06 100644 --- a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3d.h +++ b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3d.h @@ -34,21 +34,21 @@ // primary display in this case. #define k_dim_min 96 #define k_dim_max 376 -#define k_dim_qa 248 +#define k_dim_qa 248 // Note that the radius is defined here as exactly 4 since the // kernel code uses a constant. If you want a different radius // you must change the kernel accordingly. -#define k_radius_min 4 -#define k_radius_max 4 +#define k_radius_min 4 +#define k_radius_max 4 #define k_radius_default 4 // The values are set to give reasonable runtimes, they can // be changed but note that running a very large number of // timesteps can take a very long time and you should avoid // running on your primary display in this case. -#define k_timesteps_min 1 -#define k_timesteps_max 10 +#define k_timesteps_min 1 +#define k_timesteps_max 10 #define k_timesteps_default 5 #endif diff --git a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPU.h b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPU.h index 257b80d8..463f7e1d 100644 --- a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPU.h +++ b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPU.h @@ -29,23 +29,28 @@ #define _FDTD3DGPU_H_ #include -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || \ - defined(_WIN64) && defined(_MSC_VER) +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) && defined(_MSC_VER) typedef unsigned __int64 memsize_t; #else #include <stdint.h> typedef uint64_t memsize_t; #endif -#define k_blockDimX 32 +#define k_blockDimX 32 #define k_blockDimMaxY 16 #define k_blockSizeMin 128 #define k_blockSizeMax (k_blockDimX * k_blockDimMaxY) -bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, - const char **argv); -bool fdtdGPU(float *output, const float *input, const float *coeff, - const int dimx, const int dimy, const int dimz, const int radius, - const int timesteps, const int argc, const char **argv); +bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv); +bool fdtdGPU(float *output, + const float *input, + const float *coeff, + const int dimx, + const int dimy, + const int dimz, + const int radius, + const int timesteps, + const int argc, + const char **argv); #endif diff --git a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPUKernel.cuh b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPUKernel.cuh index 0f5c6d45..c6a47355 100644 --- a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPUKernel.cuh +++ b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dGPUKernel.cuh @@ -25,9 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "FDTD3dGPU.h" #include +#include "FDTD3dGPU.h" + namespace cg = cooperative_groups; // Note: If you change the RADIUS, you should also change the unrolling below @@ -35,116 +36,125 @@ namespace cg = cooperative_groups; __constant__ float stencil[RADIUS + 1]; -__global__ void FiniteDifferencesKernel(float *output, const float *input, - const int dimx, const int dimy, - const int dimz) { - bool validr = true; - bool validw = true; - const int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - const int gtidy = blockIdx.y * blockDim.y + threadIdx.y; - const int ltidx = threadIdx.x; - const int ltidy = threadIdx.y; - const int workx = blockDim.x; - const int worky = blockDim.y; - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float tile[k_blockDimMaxY + 2 * RADIUS][k_blockDimX + 2 * RADIUS]; +__global__ void +FiniteDifferencesKernel(float *output, const float *input, const int dimx, const int dimy, const int dimz) +{ + bool validr = true; + bool validw = true; + const int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + const int gtidy = blockIdx.y * blockDim.y + threadIdx.y; + const int ltidx = threadIdx.x; + const int ltidy = threadIdx.y; + const int workx = blockDim.x; + const int worky = blockDim.y; + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float tile[k_blockDimMaxY + 2 * RADIUS][k_blockDimX + 2 * RADIUS]; - const int stride_y = dimx + 2 * RADIUS; - const int stride_z = stride_y * (dimy + 2 * RADIUS); + const int stride_y = dimx + 2 * RADIUS; + const int stride_z = stride_y * (dimy + 2 * RADIUS); - int inputIndex = 0; - int outputIndex = 0; + int inputIndex = 0; + int outputIndex = 0; - // Advance inputIndex to start of inner volume - inputIndex += RADIUS * stride_y + RADIUS; + // Advance inputIndex to start of inner volume + inputIndex += RADIUS * stride_y + RADIUS; - // Advance inputIndex to target element - inputIndex += gtidy * stride_y + gtidx; + // Advance inputIndex to target element + inputIndex += gtidy * stride_y + gtidx; - float infront[RADIUS]; - float behind[RADIUS]; - float current; + float infront[RADIUS]; + float behind[RADIUS]; + float current; - const int tx = ltidx + RADIUS; - const int ty = ltidy + RADIUS; + const int tx = ltidx + RADIUS; + const int ty = ltidy + RADIUS; - // Check in bounds - if ((gtidx >= dimx + RADIUS) || (gtidy >= dimy + RADIUS)) validr = false; + // Check in bounds + if ((gtidx >= dimx + RADIUS) || (gtidy >= dimy + RADIUS)) + validr = false; - if ((gtidx >= dimx) || (gtidy >= dimy)) validw = false; + if ((gtidx >= dimx) || (gtidy >= dimy)) + validw = false; - // Preload the "infront" and "behind" data - for (int i = RADIUS - 2; i >= 0; i--) { - if (validr) behind[i] = input[inputIndex]; + // Preload the "infront" and "behind" data + for (int i = RADIUS - 2; i >= 0; i--) { + if (validr) + behind[i] = input[inputIndex]; + inputIndex += stride_z; + } + + if (validr) + current = input[inputIndex]; + + outputIndex = inputIndex; inputIndex += stride_z; - } - if (validr) current = input[inputIndex]; + for (int i = 0; i < RADIUS; i++) { + if (validr) + infront[i] = input[inputIndex]; - outputIndex = inputIndex; - inputIndex += stride_z; - - for (int i = 0; i < RADIUS; i++) { - if (validr) infront[i] = input[inputIndex]; - - inputIndex += stride_z; - } + inputIndex += stride_z; + } // Step through the xy-planes #pragma unroll 9 - for (int iz = 0; iz < dimz; iz++) { - // Advance the slice (move the thread-front) - for (int i = RADIUS - 1; i > 0; 
i--) behind[i] = behind[i - 1]; + for (int iz = 0; iz < dimz; iz++) { + // Advance the slice (move the thread-front) + for (int i = RADIUS - 1; i > 0; i--) + behind[i] = behind[i - 1]; - behind[0] = current; - current = infront[0]; + behind[0] = current; + current = infront[0]; #pragma unroll 4 - for (int i = 0; i < RADIUS - 1; i++) infront[i] = infront[i + 1]; + for (int i = 0; i < RADIUS - 1; i++) + infront[i] = infront[i + 1]; - if (validr) infront[RADIUS - 1] = input[inputIndex]; + if (validr) + infront[RADIUS - 1] = input[inputIndex]; - inputIndex += stride_z; - outputIndex += stride_z; - cg::sync(cta); + inputIndex += stride_z; + outputIndex += stride_z; + cg::sync(cta); - // Note that for the work items on the boundary of the problem, the - // supplied index when reading the halo (below) may wrap to the - // previous/next row or even the previous/next xy-plane. This is - // acceptable since a) we disable the output write for these work - // items and b) there is at least one xy-plane before/after the - // current plane, so the access will be within bounds. + // Note that for the work items on the boundary of the problem, the + // supplied index when reading the halo (below) may wrap to the + // previous/next row or even the previous/next xy-plane. This is + // acceptable since a) we disable the output write for these work + // items and b) there is at least one xy-plane before/after the + // current plane, so the access will be within bounds. - // Update the data slice in the local tile - // Halo above & below - if (ltidy < RADIUS) { - tile[ltidy][tx] = input[outputIndex - RADIUS * stride_y]; - tile[ltidy + worky + RADIUS][tx] = input[outputIndex + worky * stride_y]; - } + // Update the data slice in the local tile + // Halo above & below + if (ltidy < RADIUS) { + tile[ltidy][tx] = input[outputIndex - RADIUS * stride_y]; + tile[ltidy + worky + RADIUS][tx] = input[outputIndex + worky * stride_y]; + } - // Halo left & right - if (ltidx < RADIUS) { - tile[ty][ltidx] = input[outputIndex - RADIUS]; - tile[ty][ltidx + workx + RADIUS] = input[outputIndex + workx]; - } + // Halo left & right + if (ltidx < RADIUS) { + tile[ty][ltidx] = input[outputIndex - RADIUS]; + tile[ty][ltidx + workx + RADIUS] = input[outputIndex + workx]; + } - tile[ty][tx] = current; - cg::sync(cta); + tile[ty][tx] = current; + cg::sync(cta); - // Compute the output value - float value = stencil[0] * current; + // Compute the output value + float value = stencil[0] * current; #pragma unroll 4 - for (int i = 1; i <= RADIUS; i++) { - value += - stencil[i] * (infront[i - 1] + behind[i - 1] + tile[ty - i][tx] + - tile[ty + i][tx] + tile[ty][tx - i] + tile[ty][tx + i]); - } + for (int i = 1; i <= RADIUS; i++) { + value += stencil[i] + * (infront[i - 1] + behind[i - 1] + tile[ty - i][tx] + tile[ty + i][tx] + tile[ty][tx - i] + + tile[ty][tx + i]); + } - // Store the output value - if (validw) output[outputIndex] = value; - } + // Store the output value + if (validw) + output[outputIndex] = value; + } } diff --git a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dReference.h b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dReference.h index 9b6a9303..f39ad62b 100644 --- a/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dReference.h +++ b/Samples/5_Domain_Specific/FDTD3d/inc/FDTD3dReference.h @@ -28,17 +28,32 @@ #ifndef _FDTD3DREFERENCE_H_ #define _FDTD3DREFERENCE_H_ -void generateRandomData(float *data, const int dimx, const int dimy, - const int dimz, const float lowerBound, +void generateRandomData(float *data, + const int dimx, + const 
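// [Illustrative aside, not part of the patch] FiniteDifferencesKernel above
// keeps the z-column of the stencil in registers: behind[RADIUS], current and
// infront[RADIUS] form a sliding window advanced by one xy-plane per loop
// iteration, so each thread reads every input element from global memory only
// once while shared memory covers the in-plane halo. A 1-D host analogue of
// the same register-queue technique (hypothetical name, radius fixed at 4):
void slidingStencil1D(const float *in, float *out, int n, const float *coeff)
{
    const int R = 4; // mirrors RADIUS; requires n > 2 * R
    float behind[4], infront[4], current;
    int   idx = 0;

    // Preload the window around position R, as the kernel preloads its column.
    for (int i = R - 1; i >= 0; i--)
        behind[i] = in[idx++];
    current = in[idx++];
    for (int i = 0; i < R; i++)
        infront[i] = in[idx++];

    for (int i = R; i < n - R; i++) {
        float value = coeff[0] * current;
        for (int r = 1; r <= R; r++)
            value += coeff[r] * (behind[r - 1] + infront[r - 1]);
        out[i] = value;

        // Advance the window: shift "behind" back, pull the next "infront" in.
        for (int r = R - 1; r > 0; r--)
            behind[r] = behind[r - 1];
        behind[0] = current;
        current   = infront[0];
        for (int r = 0; r < R - 1; r++)
            infront[r] = infront[r + 1];
        if (idx < n)
            infront[R - 1] = in[idx++];
    }
}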
int dimy, + const int dimz, + const float lowerBound, const float upperBound); -void generatePatternData(float *data, const int dimx, const int dimy, - const int dimz, const float lowerBound, +void generatePatternData(float *data, + const int dimx, + const int dimy, + const int dimz, + const float lowerBound, const float upperBound); -bool fdtdReference(float *output, const float *input, const float *coeff, - const int dimx, const int dimy, const int dimz, - const int radius, const int timesteps); -bool compareData(const float *output, const float *reference, const int dimx, - const int dimy, const int dimz, const int radius, - const float tolerance = 0.0001f); +bool fdtdReference(float *output, + const float *input, + const float *coeff, + const int dimx, + const int dimy, + const int dimz, + const int radius, + const int timesteps); +bool compareData(const float *output, + const float *reference, + const int dimx, + const int dimy, + const int dimz, + const int radius, + const float tolerance = 0.0001f); #endif diff --git a/Samples/5_Domain_Specific/FDTD3d/src/FDTD3d.cpp b/Samples/5_Domain_Specific/FDTD3d/src/FDTD3d.cpp index 99d4e11d..9ecd3df0 100644 --- a/Samples/5_Domain_Specific/FDTD3d/src/FDTD3d.cpp +++ b/Samples/5_Domain_Specific/FDTD3d/src/FDTD3d.cpp @@ -27,16 +27,14 @@ #include "FDTD3d.h" -#include -#include - -#include "FDTD3dReference.h" -#include "FDTD3dGPU.h" - -#include - -#include #include +#include +#include +#include +#include + +#include "FDTD3dGPU.h" +#include "FDTD3dReference.h" #ifndef CLAMP #define CLAMP(a, min, max) (MIN(max, MAX(a, min))) @@ -49,187 +47,186 @@ bool runTest(int argc, const char **argv); void showHelp(const int argc, const char **argv); -int main(int argc, char **argv) { - bool bTestResult = false; - // Start the log - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + bool bTestResult = false; + // Start the log + printf("%s Starting...\n\n", argv[0]); - // Check help flag - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("Displaying help on console\n"); - showHelp(argc, (const char **)argv); - bTestResult = true; - } else { - // Execute - bTestResult = runTest(argc, (const char **)argv); - } + // Check help flag + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("Displaying help on console\n"); + showHelp(argc, (const char **)argv); + bTestResult = true; + } + else { + // Execute + bTestResult = runTest(argc, (const char **)argv); + } - // Finish - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + // Finish + exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } -void showHelp(const int argc, const char **argv) { - if (argc > 0) std::cout << std::endl << argv[0] << std::endl; +void showHelp(const int argc, const char **argv) +{ + if (argc > 0) + std::cout << std::endl << argv[0] << std::endl; - std::cout << std::endl << "Syntax:" << std::endl; - std::cout << std::left; - std::cout << " " << std::setw(20) << "--device=" - << "Specify device to use for execution" << std::endl; - std::cout << " " << std::setw(20) << "--dimx=" - << "Specify number of elements in x direction (excluding halo)" - << std::endl; - std::cout << " " << std::setw(20) << "--dimy=" - << "Specify number of elements in y direction (excluding halo)" - << std::endl; - std::cout << " " << std::setw(20) << "--dimz=" - << "Specify number of elements in z direction (excluding halo)" - << std::endl; - std::cout << " " << std::setw(20) << "--radius=" - << "Specify radius of stencil" << std::endl; - std::cout << " " << std::setw(20) << "--timesteps=" - << "Specify number of timesteps" << std::endl; - std::cout << " " << std::setw(20) << "--block-size=" - << "Specify number of threads per block" << std::endl; - std::cout << std::endl; - std::cout << " " << std::setw(20) << "--noprompt" - << "Skip prompt before exit" << std::endl; - std::cout << std::endl; + std::cout << std::endl << "Syntax:" << std::endl; + std::cout << std::left; + std::cout << " " << std::setw(20) << "--device=" + << "Specify device to use for execution" << std::endl; + std::cout << " " << std::setw(20) << "--dimx=" + << "Specify number of elements in x direction (excluding halo)" << std::endl; + std::cout << " " << std::setw(20) << "--dimy=" + << "Specify number of elements in y direction (excluding halo)" << std::endl; + std::cout << " " << std::setw(20) << "--dimz=" + << "Specify number of elements in z direction (excluding halo)" << std::endl; + std::cout << " " << std::setw(20) << "--radius=" + << "Specify radius of stencil" << std::endl; + std::cout << " " << std::setw(20) << "--timesteps=" + << "Specify number of timesteps" << std::endl; + std::cout << " " << std::setw(20) << "--block-size=" + << "Specify number of threads per block" << std::endl; + std::cout << std::endl; + std::cout << " " << std::setw(20) << "--noprompt" + << "Skip prompt before exit" << std::endl; + std::cout << std::endl; } -bool runTest(int argc, const char **argv) { - float *host_output; - float *device_output; - float *input; - float *coeff; +bool runTest(int argc, const char **argv) +{ + float *host_output; + float *device_output; + float *input; + float *coeff; - int defaultDim; - int dimx; - int dimy; - int dimz; - int outerDimx; - int outerDimy; - int outerDimz; - int radius; - int timesteps; - size_t volumeSize; - memsize_t memsize; + int defaultDim; + int dimx; + int dimy; + int dimz; + int outerDimx; + int outerDimy; + int outerDimz; + int radius; + int timesteps; + size_t volumeSize; + memsize_t memsize; - const float lowerBound = 0.0f; - const float upperBound = 1.0f; + const float lowerBound = 0.0f; + const float upperBound = 1.0f; - // Determine default dimensions - printf("Set-up, based upon target device GMEM size...\n"); - // Get the memory size of the target device - printf(" getTargetDeviceGlobalMemSize\n"); - getTargetDeviceGlobalMemSize(&memsize, argc, argv); + // Determine default dimensions + printf("Set-up, based upon target device GMEM size...\n"); + // Get the memory size of the target device + printf(" getTargetDeviceGlobalMemSize\n"); + getTargetDeviceGlobalMemSize(&memsize, argc, 
argv); - // We can never use all the memory so to keep things simple we aim to - // use around half the total memory - memsize /= 2; + // We can never use all the memory so to keep things simple we aim to + // use around half the total memory + memsize /= 2; - // Most of our memory use is taken up by the input and output buffers - - // two buffers of equal size - and for simplicity the volume is a cube: - // dim = floor( (N/2)^(1/3) ) - defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0 / 3.0)); + // Most of our memory use is taken up by the input and output buffers - + // two buffers of equal size - and for simplicity the volume is a cube: + // dim = floor( (N/2)^(1/3) ) + defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0 / 3.0)); - // By default, make the volume edge size an integer multiple of 128B to - // improve performance by coalescing memory accesses, in a real - // application it would make sense to pad the lines accordingly - int roundTarget = 128 / sizeof(float); - defaultDim = defaultDim / roundTarget * roundTarget; - defaultDim -= k_radius_default * 2; + // By default, make the volume edge size an integer multiple of 128B to + // improve performance by coalescing memory accesses, in a real + // application it would make sense to pad the lines accordingly + int roundTarget = 128 / sizeof(float); + defaultDim = defaultDim / roundTarget * roundTarget; + defaultDim -= k_radius_default * 2; - // Check dimension is valid - if (defaultDim < k_dim_min) { - printf( - "insufficient device memory (maximum volume on device is %d, must be " - "between %d and %d).\n", - defaultDim, k_dim_min, k_dim_max); - exit(EXIT_FAILURE); - } else if (defaultDim > k_dim_max) { - defaultDim = k_dim_max; - } + // Check dimension is valid + if (defaultDim < k_dim_min) { + printf("insufficient device memory (maximum volume on device is %d, must be " + "between %d and %d).\n", + defaultDim, + k_dim_min, + k_dim_max); + exit(EXIT_FAILURE); + } + else if (defaultDim > k_dim_max) { + defaultDim = k_dim_max; + } - // For QA testing, override default volume size - if (checkCmdLineFlag(argc, argv, "qatest")) { - defaultDim = MIN(defaultDim, k_dim_qa); - } + // For QA testing, override default volume size + if (checkCmdLineFlag(argc, argv, "qatest")) { + defaultDim = MIN(defaultDim, k_dim_qa); + } - // set default dim - dimx = defaultDim; - dimy = defaultDim; - dimz = defaultDim; - radius = k_radius_default; - timesteps = k_timesteps_default; + // set default dim + dimx = defaultDim; + dimy = defaultDim; + dimz = defaultDim; + radius = k_radius_default; + timesteps = k_timesteps_default; - // Parse command line arguments - if (checkCmdLineFlag(argc, argv, "dimx")) { - dimx = - CLAMP(getCmdLineArgumentInt(argc, argv, "dimx"), k_dim_min, k_dim_max); - } + // Parse command line arguments + if (checkCmdLineFlag(argc, argv, "dimx")) { + dimx = CLAMP(getCmdLineArgumentInt(argc, argv, "dimx"), k_dim_min, k_dim_max); + } - if (checkCmdLineFlag(argc, argv, "dimy")) { - dimy = - CLAMP(getCmdLineArgumentInt(argc, argv, "dimy"), k_dim_min, k_dim_max); - } + if (checkCmdLineFlag(argc, argv, "dimy")) { + dimy = CLAMP(getCmdLineArgumentInt(argc, argv, "dimy"), k_dim_min, k_dim_max); + } - if (checkCmdLineFlag(argc, argv, "dimz")) { - dimz = - CLAMP(getCmdLineArgumentInt(argc, argv, "dimz"), k_dim_min, k_dim_max); - } + if (checkCmdLineFlag(argc, argv, "dimz")) { + dimz = CLAMP(getCmdLineArgumentInt(argc, argv, "dimz"), k_dim_min, k_dim_max); + } - if (checkCmdLineFlag(argc, argv, "radius")) { - 
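// [Illustrative worked example, not part of the patch] For a hypothetical
// device reporting 8 GiB, the sizing logic above proceeds as:
//   memsize /= 2                                    -> 4 GiB usable
//   defaultDim = floor((4 GiB / (2 * 4 B))^(1/3))
//              = floor(536870912^(1/3))             = 812
//   round down to a multiple of 128 B / 4 B = 32    -> 800
//   subtract the halo, 2 * k_radius_default = 8     -> 792
//   cap at k_dim_max                                -> 376
// The k_dim_min = 96 check can only fail on devices reporting roughly
// 32 MiB or less, since a 96-element volume already rounds up through a
// 128-element edge, i.e. about 2 * 4 B * 128^3 of the halved budget.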
radius = CLAMP(getCmdLineArgumentInt(argc, argv, "radius"), k_radius_min, - k_radius_max); - } + if (checkCmdLineFlag(argc, argv, "radius")) { + radius = CLAMP(getCmdLineArgumentInt(argc, argv, "radius"), k_radius_min, k_radius_max); + } - if (checkCmdLineFlag(argc, argv, "timesteps")) { - timesteps = CLAMP(getCmdLineArgumentInt(argc, argv, "timesteps"), - k_timesteps_min, k_timesteps_max); - } + if (checkCmdLineFlag(argc, argv, "timesteps")) { + timesteps = CLAMP(getCmdLineArgumentInt(argc, argv, "timesteps"), k_timesteps_min, k_timesteps_max); + } - // Determine volume size - outerDimx = dimx + 2 * radius; - outerDimy = dimy + 2 * radius; - outerDimz = dimz + 2 * radius; - volumeSize = outerDimx * outerDimy * outerDimz; + // Determine volume size + outerDimx = dimx + 2 * radius; + outerDimy = dimy + 2 * radius; + outerDimz = dimz + 2 * radius; + volumeSize = outerDimx * outerDimy * outerDimz; - // Allocate memory - host_output = (float *)calloc(volumeSize, sizeof(float)); - input = (float *)malloc(volumeSize * sizeof(float)); - coeff = (float *)malloc((radius + 1) * sizeof(float)); + // Allocate memory + host_output = (float *)calloc(volumeSize, sizeof(float)); + input = (float *)malloc(volumeSize * sizeof(float)); + coeff = (float *)malloc((radius + 1) * sizeof(float)); - // Create coefficients - for (int i = 0; i <= radius; i++) { - coeff[i] = 0.1f; - } + // Create coefficients + for (int i = 0; i <= radius; i++) { + coeff[i] = 0.1f; + } - // Generate data - printf(" generateRandomData\n\n"); - generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, - upperBound); - printf( - "FDTD on %d x %d x %d volume with symmetric filter radius %d for %d " - "timesteps...\n\n", - dimx, dimy, dimz, radius, timesteps); + // Generate data + printf(" generateRandomData\n\n"); + generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, upperBound); + printf("FDTD on %d x %d x %d volume with symmetric filter radius %d for %d " + "timesteps...\n\n", + dimx, + dimy, + dimz, + radius, + timesteps); - // Execute on the host - printf("fdtdReference...\n"); - fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps); - printf("fdtdReference complete\n"); + // Execute on the host + printf("fdtdReference...\n"); + fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps); + printf("fdtdReference complete\n"); - // Allocate memory - device_output = (float *)calloc(volumeSize, sizeof(float)); + // Allocate memory + device_output = (float *)calloc(volumeSize, sizeof(float)); - // Execute on the device - printf("fdtdGPU...\n"); - fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, - argc, argv); - printf("fdtdGPU complete\n"); + // Execute on the device + printf("fdtdGPU...\n"); + fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, argc, argv); + printf("fdtdGPU complete\n"); - // Compare the results - float tolerance = 0.0001f; - printf("\nCompareData (tolerance %f)...\n", tolerance); - return compareData(device_output, host_output, dimx, dimy, dimz, radius, - tolerance); + // Compare the results + float tolerance = 0.0001f; + printf("\nCompareData (tolerance %f)...\n", tolerance); + return compareData(device_output, host_output, dimx, dimy, dimz, radius, tolerance); } diff --git a/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dGPU.cu b/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dGPU.cu index 1c4a37a4..c223514a 100644 --- a/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dGPU.cu +++ 
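// [Illustrative usage, not part of the patch] A typical invocation of the
// sample, exercising the command-line flags parsed above (values are
// arbitrary examples inside the clamped ranges):
//
//   ./FDTD3d --device=0 --dimx=128 --dimy=128 --dimz=128 --timesteps=5 --block-size=256
//
// dimx/dimy/dimz are clamped to [96, 376], radius is pinned to 4, and
// timesteps is clamped to [1, 10].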
b/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dGPU.cu @@ -25,239 +25,236 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "FDTD3dGPU.h" - -#include #include -#include #include +#include +#include +#include "FDTD3dGPU.h" #include "FDTD3dGPUKernel.cuh" -bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, - const char **argv) { - int deviceCount = 0; - int targetDevice = 0; - size_t memsize = 0; +bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv) +{ + int deviceCount = 0; + int targetDevice = 0; + size_t memsize = 0; - // Get the number of CUDA enabled GPU devices - printf(" cudaGetDeviceCount\n"); - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + // Get the number of CUDA enabled GPU devices + printf(" cudaGetDeviceCount\n"); + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - // Select target device (device 0 by default) - targetDevice = findCudaDevice(argc, (const char **)argv); + // Select target device (device 0 by default) + targetDevice = findCudaDevice(argc, (const char **)argv); - // Query target device for maximum memory allocation - printf(" cudaGetDeviceProperties\n"); - struct cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, targetDevice)); + // Query target device for maximum memory allocation + printf(" cudaGetDeviceProperties\n"); + struct cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, targetDevice)); - memsize = deviceProp.totalGlobalMem; + memsize = deviceProp.totalGlobalMem; - // Save the result - *result = (memsize_t)memsize; - return true; + // Save the result + *result = (memsize_t)memsize; + return true; } -bool fdtdGPU(float *output, const float *input, const float *coeff, - const int dimx, const int dimy, const int dimz, const int radius, - const int timesteps, const int argc, const char **argv) { - const int outerDimx = dimx + 2 * radius; - const int outerDimy = dimy + 2 * radius; - const int outerDimz = dimz + 2 * radius; - const size_t volumeSize = outerDimx * outerDimy * outerDimz; - int deviceCount = 0; - int targetDevice = 0; - float *bufferOut = 0; - float *bufferIn = 0; - dim3 dimBlock; - dim3 dimGrid; +bool fdtdGPU(float *output, + const float *input, + const float *coeff, + const int dimx, + const int dimy, + const int dimz, + const int radius, + const int timesteps, + const int argc, + const char **argv) +{ + const int outerDimx = dimx + 2 * radius; + const int outerDimy = dimy + 2 * radius; + const int outerDimz = dimz + 2 * radius; + const size_t volumeSize = outerDimx * outerDimy * outerDimz; + int deviceCount = 0; + int targetDevice = 0; + float *bufferOut = 0; + float *bufferIn = 0; + dim3 dimBlock; + dim3 dimGrid; - // Ensure that the inner data starts on a 128B boundary - const int padding = (128 / sizeof(float)) - radius; - const size_t paddedVolumeSize = volumeSize + padding; + // Ensure that the inner data starts on a 128B boundary + const int padding = (128 / sizeof(float)) - radius; + const size_t paddedVolumeSize = volumeSize + padding; #ifdef GPU_PROFILING - cudaEvent_t profileStart = 0; - cudaEvent_t profileEnd = 0; - const int profileTimesteps = timesteps - 1; + cudaEvent_t profileStart = 0; + cudaEvent_t profileEnd = 0; + const int profileTimesteps = timesteps - 1; - if (profileTimesteps < 1) { - printf( - " cannot profile with fewer than two timesteps (timesteps=%d), " - "profiling is disabled.\n", - timesteps); - } + if (profileTimesteps < 1) { + printf(" cannot profile with 
fewer than two timesteps (timesteps=%d), " + "profiling is disabled.\n", + timesteps); + } #endif - // Check the radius is valid - if (radius != RADIUS) { - printf("radius is invalid, must be %d - see kernel for details.\n", RADIUS); - exit(EXIT_FAILURE); - } + // Check the radius is valid + if (radius != RADIUS) { + printf("radius is invalid, must be %d - see kernel for details.\n", RADIUS); + exit(EXIT_FAILURE); + } - // Get the number of CUDA enabled GPU devices - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + // Get the number of CUDA enabled GPU devices + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - // Select target device (device 0 by default) - targetDevice = findCudaDevice(argc, (const char **)argv); + // Select target device (device 0 by default) + targetDevice = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaSetDevice(targetDevice)); + checkCudaErrors(cudaSetDevice(targetDevice)); - // Allocate memory buffers - checkCudaErrors( - cudaMalloc((void **)&bufferOut, paddedVolumeSize * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&bufferIn, paddedVolumeSize * sizeof(float))); + // Allocate memory buffers + checkCudaErrors(cudaMalloc((void **)&bufferOut, paddedVolumeSize * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&bufferIn, paddedVolumeSize * sizeof(float))); - // Check for a command-line specified block size - int userBlockSize; + // Check for a command-line specified block size + int userBlockSize; - if (checkCmdLineFlag(argc, (const char **)argv, "block-size")) { - userBlockSize = getCmdLineArgumentInt(argc, argv, "block-size"); - // Constrain to a multiple of k_blockDimX - userBlockSize = (userBlockSize / k_blockDimX * k_blockDimX); + if (checkCmdLineFlag(argc, (const char **)argv, "block-size")) { + userBlockSize = getCmdLineArgumentInt(argc, argv, "block-size"); + // Constrain to a multiple of k_blockDimX + userBlockSize = (userBlockSize / k_blockDimX * k_blockDimX); - // Constrain within allowed bounds - userBlockSize = MIN(MAX(userBlockSize, k_blockSizeMin), k_blockSizeMax); - } else { - userBlockSize = k_blockSizeMax; - } + // Constrain within allowed bounds + userBlockSize = MIN(MAX(userBlockSize, k_blockSizeMin), k_blockSizeMax); + } + else { + userBlockSize = k_blockSizeMax; + } - // Check the device limit on the number of threads - struct cudaFuncAttributes funcAttrib; - checkCudaErrors(cudaFuncGetAttributes(&funcAttrib, FiniteDifferencesKernel)); + // Check the device limit on the number of threads + struct cudaFuncAttributes funcAttrib; + checkCudaErrors(cudaFuncGetAttributes(&funcAttrib, FiniteDifferencesKernel)); - userBlockSize = MIN(userBlockSize, funcAttrib.maxThreadsPerBlock); + userBlockSize = MIN(userBlockSize, funcAttrib.maxThreadsPerBlock); - // Set the block size - dimBlock.x = k_blockDimX; - // Visual Studio 2005 does not like std::min - // dimBlock.y = std::min(userBlockSize / k_blockDimX, - // (size_t)k_blockDimMaxY); - dimBlock.y = ((userBlockSize / k_blockDimX) < (size_t)k_blockDimMaxY) - ? 
(userBlockSize / k_blockDimX)
-                   : (size_t)k_blockDimMaxY;
-  dimGrid.x = (unsigned int)ceil((float)dimx / dimBlock.x);
-  dimGrid.y = (unsigned int)ceil((float)dimy / dimBlock.y);
-  printf(" set block size to %dx%d\n", dimBlock.x, dimBlock.y);
-  printf(" set grid size to %dx%d\n", dimGrid.x, dimGrid.y);
+    // Set the block size
+    dimBlock.x = k_blockDimX;
+    // Visual Studio 2005 does not like std::min
+    // dimBlock.y = std::min(userBlockSize / k_blockDimX,
+    // (size_t)k_blockDimMaxY);
+    dimBlock.y = ((userBlockSize / k_blockDimX) < (size_t)k_blockDimMaxY) ? (userBlockSize / k_blockDimX)
+                                                                          : (size_t)k_blockDimMaxY;
+    dimGrid.x  = (unsigned int)ceil((float)dimx / dimBlock.x);
+    dimGrid.y  = (unsigned int)ceil((float)dimy / dimBlock.y);
+    printf(" set block size to %dx%d\n", dimBlock.x, dimBlock.y);
+    printf(" set grid size to %dx%d\n", dimGrid.x, dimGrid.y);

-  // Check the block size is valid
-  if (dimBlock.x < RADIUS || dimBlock.y < RADIUS) {
-    printf("invalid block size, x (%d) and y (%d) must be >= radius (%d).\n",
-           dimBlock.x, dimBlock.y, RADIUS);
-    exit(EXIT_FAILURE);
-  }
+    // Check the block size is valid
+    if (dimBlock.x < RADIUS || dimBlock.y < RADIUS) {
+        printf("invalid block size, x (%d) and y (%d) must be >= radius (%d).\n", dimBlock.x, dimBlock.y, RADIUS);
+        exit(EXIT_FAILURE);
+    }

-  // Copy the input to the device input buffer
-  checkCudaErrors(cudaMemcpy(bufferIn + padding, input,
-                             volumeSize * sizeof(float),
-                             cudaMemcpyHostToDevice));
+    // Copy the input to the device input buffer
+    checkCudaErrors(cudaMemcpy(bufferIn + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));

-  // Copy the input to the device output buffer (actually only need the halo)
-  checkCudaErrors(cudaMemcpy(bufferOut + padding, input,
-                             volumeSize * sizeof(float),
-                             cudaMemcpyHostToDevice));
+    // Copy the input to the device output buffer (actually only need the halo)
+    checkCudaErrors(cudaMemcpy(bufferOut + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));

-  // Copy the coefficients to the device coefficient buffer
-  checkCudaErrors(
-      cudaMemcpyToSymbol(stencil, (void *)coeff, (radius + 1) * sizeof(float)));
+    // Copy the coefficients to the device coefficient buffer
+    checkCudaErrors(cudaMemcpyToSymbol(stencil, (void *)coeff, (radius + 1) * sizeof(float)));

 #ifdef GPU_PROFILING
-  // Create the events
-  checkCudaErrors(cudaEventCreate(&profileStart));
-  checkCudaErrors(cudaEventCreate(&profileEnd));
+    // Create the events
+    checkCudaErrors(cudaEventCreate(&profileStart));
+    checkCudaErrors(cudaEventCreate(&profileEnd));
 #endif

-  // Execute the FDTD
-  float *bufferSrc = bufferIn + padding;
-  float *bufferDst = bufferOut + padding;
-  printf(" GPU FDTD loop\n");
+    // Execute the FDTD
+    float *bufferSrc = bufferIn + padding;
+    float *bufferDst = bufferOut + padding;
+    printf(" GPU FDTD loop\n");

 #ifdef GPU_PROFILING
-  // Enqueue start event
-  checkCudaErrors(cudaEventRecord(profileStart, 0));
+    // Enqueue start event
+    checkCudaErrors(cudaEventRecord(profileStart, 0));
 #endif

-  for (int it = 0; it < timesteps; it++) {
-    printf("\tt = %d ", it);
+    for (int it = 0; it < timesteps; it++) {
+        printf("\tt = %d ", it);

-    // Launch the kernel
-    printf("launch kernel\n");
-    FiniteDifferencesKernel<<<dimGrid, dimBlock>>>(bufferDst, bufferSrc, dimx,
-                                                   dimy, dimz);
+        // Launch the kernel
+        printf("launch kernel\n");
+        FiniteDifferencesKernel<<<dimGrid, dimBlock>>>(bufferDst, bufferSrc, dimx, dimy, dimz);

-    // Toggle the buffers
-    // Visual Studio 2005 does not like std::swap
-    // std::swap(bufferSrc, bufferDst);
-    float
*tmp = bufferDst; - bufferDst = bufferSrc; - bufferSrc = tmp; - } + // Toggle the buffers + // Visual Studio 2005 does not like std::swap + // std::swap(bufferSrc, bufferDst); + float *tmp = bufferDst; + bufferDst = bufferSrc; + bufferSrc = tmp; + } - printf("\n"); + printf("\n"); #ifdef GPU_PROFILING - // Enqueue end event - checkCudaErrors(cudaEventRecord(profileEnd, 0)); + // Enqueue end event + checkCudaErrors(cudaEventRecord(profileEnd, 0)); #endif - // Wait for the kernel to complete - checkCudaErrors(cudaDeviceSynchronize()); + // Wait for the kernel to complete + checkCudaErrors(cudaDeviceSynchronize()); - // Read the result back, result is in bufferSrc (after final toggle) - checkCudaErrors(cudaMemcpy(output, bufferSrc, volumeSize * sizeof(float), - cudaMemcpyDeviceToHost)); + // Read the result back, result is in bufferSrc (after final toggle) + checkCudaErrors(cudaMemcpy(output, bufferSrc, volumeSize * sizeof(float), cudaMemcpyDeviceToHost)); // Report time #ifdef GPU_PROFILING - float elapsedTimeMS = 0; + float elapsedTimeMS = 0; - if (profileTimesteps > 0) { - checkCudaErrors( - cudaEventElapsedTime(&elapsedTimeMS, profileStart, profileEnd)); - } + if (profileTimesteps > 0) { + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeMS, profileStart, profileEnd)); + } - if (profileTimesteps > 0) { - // Convert milliseconds to seconds - double elapsedTime = elapsedTimeMS * 1.0e-3; - double avgElapsedTime = elapsedTime / (double)profileTimesteps; - // Determine number of computations per timestep - size_t pointsComputed = dimx * dimy * dimz; - // Determine throughput - double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime; - printf( - "FDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, " - "NumDevsUsed = %u, Blocksize = %u\n", - throughputM, avgElapsedTime, pointsComputed, 1, - dimBlock.x * dimBlock.y); - } + if (profileTimesteps > 0) { + // Convert milliseconds to seconds + double elapsedTime = elapsedTimeMS * 1.0e-3; + double avgElapsedTime = elapsedTime / (double)profileTimesteps; + // Determine number of computations per timestep + size_t pointsComputed = dimx * dimy * dimz; + // Determine throughput + double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime; + printf("FDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, " + "NumDevsUsed = %u, Blocksize = %u\n", + throughputM, + avgElapsedTime, + pointsComputed, + 1, + dimBlock.x * dimBlock.y); + } #endif - // Cleanup - if (bufferIn) { - checkCudaErrors(cudaFree(bufferIn)); - } + // Cleanup + if (bufferIn) { + checkCudaErrors(cudaFree(bufferIn)); + } - if (bufferOut) { - checkCudaErrors(cudaFree(bufferOut)); - } + if (bufferOut) { + checkCudaErrors(cudaFree(bufferOut)); + } #ifdef GPU_PROFILING - if (profileStart) { - checkCudaErrors(cudaEventDestroy(profileStart)); - } + if (profileStart) { + checkCudaErrors(cudaEventDestroy(profileStart)); + } - if (profileEnd) { - checkCudaErrors(cudaEventDestroy(profileEnd)); - } + if (profileEnd) { + checkCudaErrors(cudaEventDestroy(profileEnd)); + } #endif - return true; + return true; } diff --git a/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dReference.cpp b/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dReference.cpp index 9a34f4f1..6ece3469 100644 --- a/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dReference.cpp +++ b/Samples/5_Domain_Specific/FDTD3d/src/FDTD3dReference.cpp @@ -27,152 +27,165 @@ #include "FDTD3dReference.h" -#include #include -#include +#include #include +#include #include -void generateRandomData(float *data, const 
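// [Illustrative aside, not part of the patch] Because the loop above swaps
// bufferSrc and bufferDst after every kernel launch, the final result always
// ends up in bufferSrc, which is why the device-to-host copy reads from it.
// The three-line manual toggle predates C++11; on a current toolchain the
// same ping-pong is simply (sketch, hypothetical helper name):
#include <utility>

static inline void togglePingPong(float *&bufferSrc, float *&bufferDst)
{
    std::swap(bufferSrc, bufferDst); // exchange the pointers, not the data
}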
int dimx, const int dimy, - const int dimz, const float lowerBound, - const float upperBound) { - srand(0); +void generateRandomData(float *data, + const int dimx, + const int dimy, + const int dimz, + const float lowerBound, + const float upperBound) +{ + srand(0); - for (int iz = 0; iz < dimz; iz++) { - for (int iy = 0; iy < dimy; iy++) { - for (int ix = 0; ix < dimx; ix++) { - *data = (float)(lowerBound + - ((float)rand() / (float)RAND_MAX) * - (upperBound - lowerBound)); - ++data; - } - } - } -} - -void generatePatternData(float *data, const int dimx, const int dimy, - const int dimz, const float lowerBound, - const float upperBound) { - for (int iz = 0; iz < dimz; iz++) { - for (int iy = 0; iy < dimy; iy++) { - for (int ix = 0; ix < dimx; ix++) { - *data = (float)(lowerBound + - ((float)iz / (float)dimz) * (upperBound - lowerBound)); - ++data; - } - } - } -} - -bool fdtdReference(float *output, const float *input, const float *coeff, - const int dimx, const int dimy, const int dimz, - const int radius, const int timesteps) { - const int outerDimx = dimx + 2 * radius; - const int outerDimy = dimy + 2 * radius; - const int outerDimz = dimz + 2 * radius; - const size_t volumeSize = outerDimx * outerDimy * outerDimz; - const int stride_y = outerDimx; - const int stride_z = stride_y * outerDimy; - float *intermediate = 0; - const float *bufsrc = 0; - float *bufdst = 0; - float *bufdstnext = 0; - - // Allocate temporary buffer - printf(" calloc intermediate\n"); - intermediate = (float *)calloc(volumeSize, sizeof(float)); - - // Decide which buffer to use first (result should end up in output) - if ((timesteps % 2) == 0) { - bufsrc = input; - bufdst = intermediate; - bufdstnext = output; - } else { - bufsrc = input; - bufdst = output; - bufdstnext = intermediate; - } - - // Run the FDTD (naive method) - printf(" Host FDTD loop\n"); - - for (int it = 0; it < timesteps; it++) { - printf("\tt = %d\n", it); - const float *src = bufsrc; - float *dst = bufdst; - - for (int iz = -radius; iz < dimz + radius; iz++) { - for (int iy = -radius; iy < dimy + radius; iy++) { - for (int ix = -radius; ix < dimx + radius; ix++) { - if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && - iz < dimz) { - float value = (*src) * coeff[0]; - - for (int ir = 1; ir <= radius; ir++) { - value += coeff[ir] * (*(src + ir) + *(src - ir)); // horizontal - value += coeff[ir] * (*(src + ir * stride_y) + - *(src - ir * stride_y)); // vertical - value += - coeff[ir] * (*(src + ir * stride_z) + - *(src - ir * stride_z)); // in front & behind + for (int iz = 0; iz < dimz; iz++) { + for (int iy = 0; iy < dimy; iy++) { + for (int ix = 0; ix < dimx; ix++) { + *data = (float)(lowerBound + ((float)rand() / (float)RAND_MAX) * (upperBound - lowerBound)); + ++data; } - - *dst = value; - } else { - *dst = *src; - } - - ++dst; - ++src; } - } } - - // Rotate buffers - float *tmp = bufdst; - bufdst = bufdstnext; - bufdstnext = tmp; - bufsrc = (const float *)tmp; - } - - printf("\n"); - - if (intermediate) free(intermediate); - - return true; } -bool compareData(const float *output, const float *reference, const int dimx, - const int dimy, const int dimz, const int radius, - const float tolerance) { - for (int iz = -radius; iz < dimz + radius; iz++) { - for (int iy = -radius; iy < dimy + radius; iy++) { - for (int ix = -radius; ix < dimx + radius; ix++) { - if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && - iz < dimz) { - // Determine the absolute difference - float difference = fabs(*reference - *output); - float 
error; +void generatePatternData(float *data, + const int dimx, + const int dimy, + const int dimz, + const float lowerBound, + const float upperBound) +{ + for (int iz = 0; iz < dimz; iz++) { + for (int iy = 0; iy < dimy; iy++) { + for (int ix = 0; ix < dimx; ix++) { + *data = (float)(lowerBound + ((float)iz / (float)dimz) * (upperBound - lowerBound)); + ++data; + } + } + } +} - // Determine the relative error - if (*reference != 0) - error = difference / *reference; - else - error = difference; +bool fdtdReference(float *output, + const float *input, + const float *coeff, + const int dimx, + const int dimy, + const int dimz, + const int radius, + const int timesteps) +{ + const int outerDimx = dimx + 2 * radius; + const int outerDimy = dimy + 2 * radius; + const int outerDimz = dimz + 2 * radius; + const size_t volumeSize = outerDimx * outerDimy * outerDimz; + const int stride_y = outerDimx; + const int stride_z = stride_y * outerDimy; + float *intermediate = 0; + const float *bufsrc = 0; + float *bufdst = 0; + float *bufdstnext = 0; - // Check the error is within the tolerance - if (error > tolerance) { - printf("Data error at point (%d,%d,%d)\t%f instead of %f\n", ix, iy, - iz, *output, *reference); - return false; - } + // Allocate temporary buffer + printf(" calloc intermediate\n"); + intermediate = (float *)calloc(volumeSize, sizeof(float)); + + // Decide which buffer to use first (result should end up in output) + if ((timesteps % 2) == 0) { + bufsrc = input; + bufdst = intermediate; + bufdstnext = output; + } + else { + bufsrc = input; + bufdst = output; + bufdstnext = intermediate; + } + + // Run the FDTD (naive method) + printf(" Host FDTD loop\n"); + + for (int it = 0; it < timesteps; it++) { + printf("\tt = %d\n", it); + const float *src = bufsrc; + float *dst = bufdst; + + for (int iz = -radius; iz < dimz + radius; iz++) { + for (int iy = -radius; iy < dimy + radius; iy++) { + for (int ix = -radius; ix < dimx + radius; ix++) { + if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && iz < dimz) { + float value = (*src) * coeff[0]; + + for (int ir = 1; ir <= radius; ir++) { + value += coeff[ir] * (*(src + ir) + *(src - ir)); // horizontal + value += coeff[ir] * (*(src + ir * stride_y) + *(src - ir * stride_y)); // vertical + value += coeff[ir] * (*(src + ir * stride_z) + *(src - ir * stride_z)); // in front & behind + } + + *dst = value; + } + else { + *dst = *src; + } + + ++dst; + ++src; + } + } } - ++output; - ++reference; - } + // Rotate buffers + float *tmp = bufdst; + bufdst = bufdstnext; + bufdstnext = tmp; + bufsrc = (const float *)tmp; } - } - return true; + printf("\n"); + + if (intermediate) + free(intermediate); + + return true; +} + +bool compareData(const float *output, + const float *reference, + const int dimx, + const int dimy, + const int dimz, + const int radius, + const float tolerance) +{ + for (int iz = -radius; iz < dimz + radius; iz++) { + for (int iy = -radius; iy < dimy + radius; iy++) { + for (int ix = -radius; ix < dimx + radius; ix++) { + if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && iz < dimz) { + // Determine the absolute difference + float difference = fabs(*reference - *output); + float error; + + // Determine the relative error + if (*reference != 0) + error = difference / *reference; + else + error = difference; + + // Check the error is within the tolerance + if (error > tolerance) { + printf("Data error at point (%d,%d,%d)\t%f instead of %f\n", ix, iy, iz, *output, *reference); + return false; + } + } + + ++output; + 
++reference;
+            }
+        }
+    }
+
+    return true;
 }
diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/addKernel.cuh b/Samples/5_Domain_Specific/HSOpticalFlow/addKernel.cuh
index 965e3ae6..2052c142 100644
--- a/Samples/5_Domain_Specific/HSOpticalFlow/addKernel.cuh
+++ b/Samples/5_Domain_Specific/HSOpticalFlow/addKernel.cuh
@@ -36,13 +36,14 @@
 /// \param[in] count vector size
 /// \param[out] sum result
 ///////////////////////////////////////////////////////////////////////////////
-__global__ void AddKernel(const float *op1, const float *op2, int count,
-                          float *sum) {
-  const int pos = threadIdx.x + blockIdx.x * blockDim.x;
+__global__ void AddKernel(const float *op1, const float *op2, int count, float *sum)
+{
+    const int pos = threadIdx.x + blockIdx.x * blockDim.x;

-  if (pos >= count) return;
+    if (pos >= count)
+        return;

-  sum[pos] = op1[pos] + op2[pos];
+    sum[pos] = op1[pos] + op2[pos];
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -52,9 +53,10 @@ __global__ void AddKernel(const float *op1, const float *op2, int count,
 /// \param[in] count vector size
 /// \param[out] sum result
 ///////////////////////////////////////////////////////////////////////////////
-static void Add(const float *op1, const float *op2, int count, float *sum) {
-  dim3 threads(256);
-  dim3 blocks(iDivUp(count, threads.x));
+static void Add(const float *op1, const float *op2, int count, float *sum)
+{
+    dim3 threads(256);
+    dim3 blocks(iDivUp(count, threads.x));

-  AddKernel<<<blocks, threads>>>(op1, op2, count, sum);
+    AddKernel<<<blocks, threads>>>(op1, op2, count, sum);
 }
diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/common.h b/Samples/5_Domain_Specific/HSOpticalFlow/common.h
index 2389cee4..4519307b 100644
--- a/Samples/5_Domain_Specific/HSOpticalFlow/common.h
+++ b/Samples/5_Domain_Specific/HSOpticalFlow/common.h
@@ -36,13 +36,12 @@
 // Common includes
 ///////////////////////////////////////////////////////////////////////////////

-#include
-#include
-#include
-#include
-#include
-
 #include
+#include
+#include
+#include
+#include
+#include

 ///////////////////////////////////////////////////////////////////////////////
 // Common constants
@@ -54,23 +53,24 @@ const int StrideAlignment = 32;
 ///////////////////////////////////////////////////////////////////////////////
 // Align up n to the nearest multiple of m
-inline int iAlignUp(int n, int m = StrideAlignment) {
-  int mod = n % m;
+inline int iAlignUp(int n, int m = StrideAlignment)
+{
+    int mod = n % m;

-  if (mod)
-    return n + m - mod;
-  else
-    return n;
+    if (mod)
+        return n + m - mod;
+    else
+        return n;
 }

 // round up n/m
 inline int iDivUp(int n, int m) { return (n + m - 1) / m; }

 // swap two values
-template <typename T>
-inline void Swap(T &a, T &b) {
-  T t = a;
-  a = b;
-  b = t;
+template <typename T> inline void Swap(T &a, T &b)
+{
+    T t = a;
+    a   = b;
+    b   = t;
 }

 #endif
diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/derivativesKernel.cuh b/Samples/5_Domain_Specific/HSOpticalFlow/derivativesKernel.cuh
index ce426288..ffa90a84 100644
--- a/Samples/5_Domain_Specific/HSOpticalFlow/derivativesKernel.cuh
+++ b/Samples/5_Domain_Specific/HSOpticalFlow/derivativesKernel.cuh
@@ -38,56 +38,62 @@
 /// \param[out] Iy y derivative
 /// \param[out] Iz temporal derivative
 ///////////////////////////////////////////////////////////////////////////////
-__global__ void ComputeDerivativesKernel(int width, int height, int stride,
-                                         float *Ix, float *Iy, float *Iz,
+__global__ void ComputeDerivativesKernel(int width,
+                                         int height,
+                                         int stride,
+                                         float *Ix,
+                                         float *Iy,
+                                         float *Iz,
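// [Illustrative worked example, not part of the patch] iAlignUp() and
// iDivUp() above are the usual grid-sizing helpers:
//   iDivUp(1000, 256) == 4     // (1000 + 255) / 256: 4 blocks cover 1000 elements
//   iAlignUp(1000)    == 1024  // next multiple of StrideAlignment (32)
//   iAlignUp(1024)    == 1024  // already aligned, returned unchanged
// so a 1-D launch over count elements uses dim3 blocks(iDivUp(count, threads.x)),
// exactly as in the Add() wrapper above.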
cudaTextureObject_t texSource, - cudaTextureObject_t texTarget) { - const int ix = threadIdx.x + blockIdx.x * blockDim.x; - const int iy = threadIdx.y + blockIdx.y * blockDim.y; + cudaTextureObject_t texTarget) +{ + const int ix = threadIdx.x + blockIdx.x * blockDim.x; + const int iy = threadIdx.y + blockIdx.y * blockDim.y; - const int pos = ix + iy * stride; + const int pos = ix + iy * stride; - if (ix >= width || iy >= height) return; + if (ix >= width || iy >= height) + return; - float dx = 1.0f / (float)width; - float dy = 1.0f / (float)height; + float dx = 1.0f / (float)width; + float dy = 1.0f / (float)height; - float x = ((float)ix + 0.5f) * dx; - float y = ((float)iy + 0.5f) * dy; + float x = ((float)ix + 0.5f) * dx; + float y = ((float)iy + 0.5f) * dy; - float t0, t1; - // x derivative - t0 = tex2D(texSource, x - 2.0f * dx, y); - t0 -= tex2D(texSource, x - 1.0f * dx, y) * 8.0f; - t0 += tex2D(texSource, x + 1.0f * dx, y) * 8.0f; - t0 -= tex2D(texSource, x + 2.0f * dx, y); - t0 /= 12.0f; + float t0, t1; + // x derivative + t0 = tex2D(texSource, x - 2.0f * dx, y); + t0 -= tex2D(texSource, x - 1.0f * dx, y) * 8.0f; + t0 += tex2D(texSource, x + 1.0f * dx, y) * 8.0f; + t0 -= tex2D(texSource, x + 2.0f * dx, y); + t0 /= 12.0f; - t1 = tex2D(texTarget, x - 2.0f * dx, y); - t1 -= tex2D(texTarget, x - 1.0f * dx, y) * 8.0f; - t1 += tex2D(texTarget, x + 1.0f * dx, y) * 8.0f; - t1 -= tex2D(texTarget, x + 2.0f * dx, y); - t1 /= 12.0f; + t1 = tex2D(texTarget, x - 2.0f * dx, y); + t1 -= tex2D(texTarget, x - 1.0f * dx, y) * 8.0f; + t1 += tex2D(texTarget, x + 1.0f * dx, y) * 8.0f; + t1 -= tex2D(texTarget, x + 2.0f * dx, y); + t1 /= 12.0f; - Ix[pos] = (t0 + t1) * 0.5f; + Ix[pos] = (t0 + t1) * 0.5f; - // t derivative - Iz[pos] = tex2D(texTarget, x, y) - tex2D(texSource, x, y); + // t derivative + Iz[pos] = tex2D(texTarget, x, y) - tex2D(texSource, x, y); - // y derivative - t0 = tex2D(texSource, x, y - 2.0f * dy); - t0 -= tex2D(texSource, x, y - 1.0f * dy) * 8.0f; - t0 += tex2D(texSource, x, y + 1.0f * dy) * 8.0f; - t0 -= tex2D(texSource, x, y + 2.0f * dy); - t0 /= 12.0f; + // y derivative + t0 = tex2D(texSource, x, y - 2.0f * dy); + t0 -= tex2D(texSource, x, y - 1.0f * dy) * 8.0f; + t0 += tex2D(texSource, x, y + 1.0f * dy) * 8.0f; + t0 -= tex2D(texSource, x, y + 2.0f * dy); + t0 /= 12.0f; - t1 = tex2D(texTarget, x, y - 2.0f * dy); - t1 -= tex2D(texTarget, x, y - 1.0f * dy) * 8.0f; - t1 += tex2D(texTarget, x, y + 1.0f * dy) * 8.0f; - t1 -= tex2D(texTarget, x, y + 2.0f * dy); - t1 /= 12.0f; + t1 = tex2D(texTarget, x, y - 2.0f * dy); + t1 -= tex2D(texTarget, x, y - 1.0f * dy) * 8.0f; + t1 += tex2D(texTarget, x, y + 1.0f * dy) * 8.0f; + t1 -= tex2D(texTarget, x, y + 2.0f * dy); + t1 /= 12.0f; - Iy[pos] = (t0 + t1) * 0.5f; + Iy[pos] = (t0 + t1) * 0.5f; } /////////////////////////////////////////////////////////////////////////////// @@ -102,43 +108,40 @@ __global__ void ComputeDerivativesKernel(int width, int height, int stride, /// \param[out] Iy y derivative /// \param[out] Iz temporal derivative /////////////////////////////////////////////////////////////////////////////// -static void ComputeDerivatives(const float *I0, const float *I1, int w, int h, - int s, float *Ix, float *Iy, float *Iz) { - dim3 threads(32, 6); - dim3 blocks(iDivUp(w, threads.x), iDivUp(h, threads.y)); +static void ComputeDerivatives(const float *I0, const float *I1, int w, int h, int s, float *Ix, float *Iy, float *Iz) +{ + dim3 threads(32, 6); + dim3 blocks(iDivUp(w, threads.x), iDivUp(h, threads.y)); - cudaTextureObject_t 
texSource, texTarget;
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    cudaTextureObject_t texSource, texTarget;
+    cudaResourceDesc    texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));

-  texRes.resType = cudaResourceTypePitch2D;
-  texRes.res.pitch2D.devPtr = (void *)I0;
-  texRes.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-  texRes.res.pitch2D.width = w;
-  texRes.res.pitch2D.height = h;
-  texRes.res.pitch2D.pitchInBytes = s * sizeof(float);
+    texRes.resType                  = cudaResourceTypePitch2D;
+    texRes.res.pitch2D.devPtr       = (void *)I0;
+    texRes.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
+    texRes.res.pitch2D.width        = w;
+    texRes.res.pitch2D.height       = h;
+    texRes.res.pitch2D.pitchInBytes = s * sizeof(float);

-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));

-  texDescr.normalizedCoords = true;
-  texDescr.filterMode = cudaFilterModeLinear;
-  texDescr.addressMode[0] = cudaAddressModeMirror;
-  texDescr.addressMode[1] = cudaAddressModeMirror;
-  texDescr.readMode = cudaReadModeElementType;
+    texDescr.normalizedCoords = true;
+    texDescr.filterMode       = cudaFilterModeLinear;
+    texDescr.addressMode[0]   = cudaAddressModeMirror;
+    texDescr.addressMode[1]   = cudaAddressModeMirror;
+    texDescr.readMode         = cudaReadModeElementType;

-  checkCudaErrors(
-      cudaCreateTextureObject(&texSource, &texRes, &texDescr, NULL));
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
-  texRes.resType = cudaResourceTypePitch2D;
-  texRes.res.pitch2D.devPtr = (void *)I1;
-  texRes.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-  texRes.res.pitch2D.width = w;
-  texRes.res.pitch2D.height = h;
-  texRes.res.pitch2D.pitchInBytes = s * sizeof(float);
-  checkCudaErrors(
-      cudaCreateTextureObject(&texTarget, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&texSource, &texRes, &texDescr, NULL));
+    memset(&texRes, 0, sizeof(cudaResourceDesc));
+    texRes.resType                  = cudaResourceTypePitch2D;
+    texRes.res.pitch2D.devPtr       = (void *)I1;
+    texRes.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
+    texRes.res.pitch2D.width        = w;
+    texRes.res.pitch2D.height       = h;
+    texRes.res.pitch2D.pitchInBytes = s * sizeof(float);
+    checkCudaErrors(cudaCreateTextureObject(&texTarget, &texRes, &texDescr, NULL));

-  ComputeDerivativesKernel<<<blocks, threads>>>(w, h, s, Ix, Iy, Iz, texSource,
-                                                texTarget);
+    ComputeDerivativesKernel<<<blocks, threads>>>(w, h, s, Ix, Iy, Iz, texSource, texTarget);
 }
diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/downscaleKernel.cuh b/Samples/5_Domain_Specific/HSOpticalFlow/downscaleKernel.cuh
index 3e15c6b7..6546c906 100644
--- a/Samples/5_Domain_Specific/HSOpticalFlow/downscaleKernel.cuh
+++ b/Samples/5_Domain_Specific/HSOpticalFlow/downscaleKernel.cuh
@@ -36,25 +36,24 @@
 /// \param[in] stride image stride
 /// \param[out] out result
 ///////////////////////////////////////////////////////////////////////////////
-__global__ void DownscaleKernel(int width, int height, int stride, float *out,
-                                cudaTextureObject_t texFine) {
-  const int ix = threadIdx.x + blockIdx.x * blockDim.x;
-  const int iy = threadIdx.y + blockIdx.y * blockDim.y;
+__global__ void DownscaleKernel(int width, int height, int stride, float *out, cudaTextureObject_t texFine)
+{
+    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
+    const int iy = threadIdx.y + blockIdx.y * blockDim.y;

-  if (ix >= width || iy >= height) {
-    return;
-  }
+    if (ix >= width || iy >= height) {
+        return;
+    }

-  float dx = 1.0f / (float)width;
-  float dy = 1.0f / (float)height;
+    float dx = 1.0f /
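// [Illustrative aside, not part of the patch] ComputeDerivatives() above uses
// the standard pattern for sampling a pitch-linear float image through a
// texture object: describe the memory (cudaResourceTypePitch2D), describe the
// sampling (normalized coordinates, bilinear filtering, mirror addressing),
// then create the object. Condensed into a reusable helper with a
// hypothetical name:
#include <cstring>
#include <cuda_runtime.h>

static cudaTextureObject_t makePitch2DFloatTexture(const float *img, int w, int h, int strideElems)
{
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType                  = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr       = (void *)img;
    resDesc.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
    resDesc.res.pitch2D.width        = w;
    resDesc.res.pitch2D.height       = h;
    resDesc.res.pitch2D.pitchInBytes = strideElems * sizeof(float);

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.normalizedCoords = true;                 // sample with x, y in [0, 1)
    texDesc.filterMode       = cudaFilterModeLinear; // bilinear interpolation
    texDesc.addressMode[0]   = cudaAddressModeMirror;
    texDesc.addressMode[1]   = cudaAddressModeMirror;
    texDesc.readMode         = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex;
}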
(float)width; + float dy = 1.0f / (float)height; - float x = ((float)ix + 0.5f) * dx; - float y = ((float)iy + 0.5f) * dy; + float x = ((float)ix + 0.5f) * dx; + float y = ((float)iy + 0.5f) * dy; - out[ix + iy * stride] = 0.25f * (tex2D(texFine, x - dx * 0.25f, y) + - tex2D(texFine, x + dx * 0.25f, y) + - tex2D(texFine, x, y - dy * 0.25f) + - tex2D(texFine, x, y + dy * 0.25f)); + out[ix + iy * stride] = 0.25f + * (tex2D(texFine, x - dx * 0.25f, y) + tex2D(texFine, x + dx * 0.25f, y) + + tex2D(texFine, x, y - dy * 0.25f) + tex2D(texFine, x, y + dy * 0.25f)); } /////////////////////////////////////////////////////////////////////////////// @@ -66,33 +65,33 @@ __global__ void DownscaleKernel(int width, int height, int stride, float *out, /// \param[in] stride image stride /// \param[out] out result /////////////////////////////////////////////////////////////////////////////// -static void Downscale(const float *src, int width, int height, int stride, - int newWidth, int newHeight, int newStride, float *out) { - dim3 threads(32, 8); - dim3 blocks(iDivUp(newWidth, threads.x), iDivUp(newHeight, threads.y)); +static void +Downscale(const float *src, int width, int height, int stride, int newWidth, int newHeight, int newStride, float *out) +{ + dim3 threads(32, 8); + dim3 blocks(iDivUp(newWidth, threads.x), iDivUp(newHeight, threads.y)); - cudaTextureObject_t texFine; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texFine; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypePitch2D; - texRes.res.pitch2D.devPtr = (void *)src; - texRes.res.pitch2D.desc = cudaCreateChannelDesc(); - texRes.res.pitch2D.width = width; - texRes.res.pitch2D.height = height; - texRes.res.pitch2D.pitchInBytes = stride * sizeof(float); + texRes.resType = cudaResourceTypePitch2D; + texRes.res.pitch2D.devPtr = (void *)src; + texRes.res.pitch2D.desc = cudaCreateChannelDesc(); + texRes.res.pitch2D.width = width; + texRes.res.pitch2D.height = height; + texRes.res.pitch2D.pitchInBytes = stride * sizeof(float); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeMirror; - texDescr.addressMode[1] = cudaAddressModeMirror; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeMirror; + texDescr.addressMode[1] = cudaAddressModeMirror; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&texFine, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texFine, &texRes, &texDescr, NULL)); - DownscaleKernel<<>>(newWidth, newHeight, newStride, out, - texFine); + DownscaleKernel<<>>(newWidth, newHeight, newStride, out, texFine); } diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.cu b/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.cu index bdaa0f8e..ce06f612 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.cu +++ b/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.cu @@ -28,12 +28,12 @@ #include "common.h" // include kernels +#include "addKernel.cuh" +#include "derivativesKernel.cuh" #include "downscaleKernel.cuh" +#include "solverKernel.cuh" #include "upscaleKernel.cuh" #include "warpingKernel.cuh" -#include 
"derivativesKernel.cuh" -#include "solverKernel.cuh" -#include "addKernel.cuh" /////////////////////////////////////////////////////////////////////////////// /// \brief method logic @@ -51,166 +51,203 @@ /// \param[out] u horizontal displacement /// \param[out] v vertical displacement /////////////////////////////////////////////////////////////////////////////// -void ComputeFlowCUDA(const float *I0, const float *I1, int width, int height, - int stride, float alpha, int nLevels, int nWarpIters, - int nSolverIters, float *u, float *v) { - printf("Computing optical flow on GPU...\n"); +void ComputeFlowCUDA(const float *I0, + const float *I1, + int width, + int height, + int stride, + float alpha, + int nLevels, + int nWarpIters, + int nSolverIters, + float *u, + float *v) +{ + printf("Computing optical flow on GPU...\n"); - // pI0 and pI1 will hold device pointers - const float **pI0 = new const float *[nLevels]; - const float **pI1 = new const float *[nLevels]; + // pI0 and pI1 will hold device pointers + const float **pI0 = new const float *[nLevels]; + const float **pI1 = new const float *[nLevels]; - int *pW = new int[nLevels]; - int *pH = new int[nLevels]; - int *pS = new int[nLevels]; + int *pW = new int[nLevels]; + int *pH = new int[nLevels]; + int *pS = new int[nLevels]; - // device memory pointers - float *d_tmp; - float *d_du0; - float *d_dv0; - float *d_du1; - float *d_dv1; + // device memory pointers + float *d_tmp; + float *d_du0; + float *d_dv0; + float *d_du1; + float *d_dv1; - float *d_Ix; - float *d_Iy; - float *d_Iz; + float *d_Ix; + float *d_Iy; + float *d_Iz; - float *d_u; - float *d_v; - float *d_nu; - float *d_nv; + float *d_u; + float *d_v; + float *d_nu; + float *d_nv; - const int dataSize = stride * height * sizeof(float); + const int dataSize = stride * height * sizeof(float); - checkCudaErrors(cudaMalloc(&d_tmp, dataSize)); - checkCudaErrors(cudaMalloc(&d_du0, dataSize)); - checkCudaErrors(cudaMalloc(&d_dv0, dataSize)); - checkCudaErrors(cudaMalloc(&d_du1, dataSize)); - checkCudaErrors(cudaMalloc(&d_dv1, dataSize)); + checkCudaErrors(cudaMalloc(&d_tmp, dataSize)); + checkCudaErrors(cudaMalloc(&d_du0, dataSize)); + checkCudaErrors(cudaMalloc(&d_dv0, dataSize)); + checkCudaErrors(cudaMalloc(&d_du1, dataSize)); + checkCudaErrors(cudaMalloc(&d_dv1, dataSize)); - checkCudaErrors(cudaMalloc(&d_Ix, dataSize)); - checkCudaErrors(cudaMalloc(&d_Iy, dataSize)); - checkCudaErrors(cudaMalloc(&d_Iz, dataSize)); + checkCudaErrors(cudaMalloc(&d_Ix, dataSize)); + checkCudaErrors(cudaMalloc(&d_Iy, dataSize)); + checkCudaErrors(cudaMalloc(&d_Iz, dataSize)); - checkCudaErrors(cudaMalloc(&d_u, dataSize)); - checkCudaErrors(cudaMalloc(&d_v, dataSize)); - checkCudaErrors(cudaMalloc(&d_nu, dataSize)); - checkCudaErrors(cudaMalloc(&d_nv, dataSize)); + checkCudaErrors(cudaMalloc(&d_u, dataSize)); + checkCudaErrors(cudaMalloc(&d_v, dataSize)); + checkCudaErrors(cudaMalloc(&d_nu, dataSize)); + checkCudaErrors(cudaMalloc(&d_nv, dataSize)); - // prepare pyramid + // prepare pyramid - int currentLevel = nLevels - 1; - // allocate GPU memory for input images - checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize)); - checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize)); + int currentLevel = nLevels - 1; + // allocate GPU memory for input images + checkCudaErrors(cudaMalloc(pI0 + currentLevel, dataSize)); + checkCudaErrors(cudaMalloc(pI1 + currentLevel, dataSize)); - checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize, - cudaMemcpyHostToDevice)); - 
checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize, - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy((void *)pI0[currentLevel], I0, dataSize, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy((void *)pI1[currentLevel], I1, dataSize, cudaMemcpyHostToDevice)); - pW[currentLevel] = width; - pH[currentLevel] = height; - pS[currentLevel] = stride; + pW[currentLevel] = width; + pH[currentLevel] = height; + pS[currentLevel] = stride; - for (; currentLevel > 0; --currentLevel) { - int nw = pW[currentLevel] / 2; - int nh = pH[currentLevel] / 2; - int ns = iAlignUp(nw); + for (; currentLevel > 0; --currentLevel) { + int nw = pW[currentLevel] / 2; + int nh = pH[currentLevel] / 2; + int ns = iAlignUp(nw); - checkCudaErrors( - cudaMalloc(pI0 + currentLevel - 1, ns * nh * sizeof(float))); - checkCudaErrors( - cudaMalloc(pI1 + currentLevel - 1, ns * nh * sizeof(float))); + checkCudaErrors(cudaMalloc(pI0 + currentLevel - 1, ns * nh * sizeof(float))); + checkCudaErrors(cudaMalloc(pI1 + currentLevel - 1, ns * nh * sizeof(float))); - Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel], - pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]); + Downscale(pI0[currentLevel], + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + nw, + nh, + ns, + (float *)pI0[currentLevel - 1]); - Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel], - pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]); + Downscale(pI1[currentLevel], + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + nw, + nh, + ns, + (float *)pI1[currentLevel - 1]); - pW[currentLevel - 1] = nw; - pH[currentLevel - 1] = nh; - pS[currentLevel - 1] = ns; - } - - checkCudaErrors(cudaMemset(d_u, 0, stride * height * sizeof(float))); - checkCudaErrors(cudaMemset(d_v, 0, stride * height * sizeof(float))); - - // compute flow - for (; currentLevel < nLevels; ++currentLevel) { - for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) { - checkCudaErrors(cudaMemset(d_du0, 0, dataSize)); - checkCudaErrors(cudaMemset(d_dv0, 0, dataSize)); - - checkCudaErrors(cudaMemset(d_du1, 0, dataSize)); - checkCudaErrors(cudaMemset(d_dv1, 0, dataSize)); - - // on current level we compute optical flow - // between frame 0 and warped frame 1 - WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel], - pS[currentLevel], d_u, d_v, d_tmp); - - ComputeDerivatives(pI0[currentLevel], d_tmp, pW[currentLevel], - pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz); - - for (int iter = 0; iter < nSolverIters; ++iter) { - SolveForUpdate(d_du0, d_dv0, d_Ix, d_Iy, d_Iz, pW[currentLevel], - pH[currentLevel], pS[currentLevel], alpha, d_du1, d_dv1); - - Swap(d_du0, d_du1); - Swap(d_dv0, d_dv1); - } - - // update u, v - Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u); - Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v); + pW[currentLevel - 1] = nw; + pH[currentLevel - 1] = nh; + pS[currentLevel - 1] = ns; } - if (currentLevel != nLevels - 1) { - // prolongate solution - float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel]; + checkCudaErrors(cudaMemset(d_u, 0, stride * height * sizeof(float))); + checkCudaErrors(cudaMemset(d_v, 0, stride * height * sizeof(float))); - Upscale(d_u, pW[currentLevel], pH[currentLevel], pS[currentLevel], - pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1], - scaleX, d_nu); + // compute flow + for (; currentLevel < nLevels; ++currentLevel) { + for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) { + 
checkCudaErrors(cudaMemset(d_du0, 0, dataSize)); + checkCudaErrors(cudaMemset(d_dv0, 0, dataSize)); - float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel]; + checkCudaErrors(cudaMemset(d_du1, 0, dataSize)); + checkCudaErrors(cudaMemset(d_dv1, 0, dataSize)); - Upscale(d_v, pW[currentLevel], pH[currentLevel], pS[currentLevel], - pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1], - scaleY, d_nv); + // on current level we compute optical flow + // between frame 0 and warped frame 1 + WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel], pS[currentLevel], d_u, d_v, d_tmp); - Swap(d_u, d_nu); - Swap(d_v, d_nv); + ComputeDerivatives( + pI0[currentLevel], d_tmp, pW[currentLevel], pH[currentLevel], pS[currentLevel], d_Ix, d_Iy, d_Iz); + + for (int iter = 0; iter < nSolverIters; ++iter) { + SolveForUpdate(d_du0, + d_dv0, + d_Ix, + d_Iy, + d_Iz, + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + alpha, + d_du1, + d_dv1); + + Swap(d_du0, d_du1); + Swap(d_dv0, d_dv1); + } + + // update u, v + Add(d_u, d_du0, pH[currentLevel] * pS[currentLevel], d_u); + Add(d_v, d_dv0, pH[currentLevel] * pS[currentLevel], d_v); + } + + if (currentLevel != nLevels - 1) { + // prolongate solution + float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel]; + + Upscale(d_u, + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + pW[currentLevel + 1], + pH[currentLevel + 1], + pS[currentLevel + 1], + scaleX, + d_nu); + + float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel]; + + Upscale(d_v, + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + pW[currentLevel + 1], + pH[currentLevel + 1], + pS[currentLevel + 1], + scaleY, + d_nv); + + Swap(d_u, d_nu); + Swap(d_v, d_nv); + } } - } - checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(u, d_u, dataSize, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(v, d_v, dataSize, cudaMemcpyDeviceToHost)); - // cleanup - for (int i = 0; i < nLevels; ++i) { - checkCudaErrors(cudaFree((void *)pI0[i])); - checkCudaErrors(cudaFree((void *)pI1[i])); - } + // cleanup + for (int i = 0; i < nLevels; ++i) { + checkCudaErrors(cudaFree((void *)pI0[i])); + checkCudaErrors(cudaFree((void *)pI1[i])); + } - delete[] pI0; - delete[] pI1; - delete[] pW; - delete[] pH; - delete[] pS; + delete[] pI0; + delete[] pI1; + delete[] pW; + delete[] pH; + delete[] pS; - checkCudaErrors(cudaFree(d_tmp)); - checkCudaErrors(cudaFree(d_du0)); - checkCudaErrors(cudaFree(d_dv0)); - checkCudaErrors(cudaFree(d_du1)); - checkCudaErrors(cudaFree(d_dv1)); - checkCudaErrors(cudaFree(d_Ix)); - checkCudaErrors(cudaFree(d_Iy)); - checkCudaErrors(cudaFree(d_Iz)); - checkCudaErrors(cudaFree(d_nu)); - checkCudaErrors(cudaFree(d_nv)); - checkCudaErrors(cudaFree(d_u)); - checkCudaErrors(cudaFree(d_v)); + checkCudaErrors(cudaFree(d_tmp)); + checkCudaErrors(cudaFree(d_du0)); + checkCudaErrors(cudaFree(d_dv0)); + checkCudaErrors(cudaFree(d_du1)); + checkCudaErrors(cudaFree(d_dv1)); + checkCudaErrors(cudaFree(d_Ix)); + checkCudaErrors(cudaFree(d_Iy)); + checkCudaErrors(cudaFree(d_Iz)); + checkCudaErrors(cudaFree(d_nu)); + checkCudaErrors(cudaFree(d_nv)); + checkCudaErrors(cudaFree(d_u)); + checkCudaErrors(cudaFree(d_v)); } diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.h b/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.h index 8bb85600..a2d707f5 100644 --- 
a/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.h +++ b/Samples/5_Domain_Specific/HSOpticalFlow/flowCUDA.h @@ -28,16 +28,15 @@ #ifndef FLOW_CUDA_H #define FLOW_CUDA_H -void ComputeFlowCUDA( - const float *I0, // source frame - const float *I1, // tracked frame - int width, // frame width - int height, // frame height - int stride, // row access stride - float alpha, // smoothness coefficient - int nLevels, // number of levels in pyramid - int nWarpIters, // number of warping iterations per pyramid level - int nSolverIters, // number of solver iterations (for linear system) - float *u, // output horizontal flow - float *v); // output vertical flow +void ComputeFlowCUDA(const float *I0, // source frame + const float *I1, // tracked frame + int width, // frame width + int height, // frame height + int stride, // row access stride + float alpha, // smoothness coefficient + int nLevels, // number of levels in pyramid + int nWarpIters, // number of warping iterations per pyramid level + int nSolverIters, // number of solver iterations (for linear system) + float *u, // output horizontal flow + float *v); // output vertical flow #endif diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.cpp b/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.cpp index e422dbeb..96ae3b6e 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.cpp +++ b/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.cpp @@ -25,9 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "common.h" #include "flowGold.h" +#include "common.h" + /////////////////////////////////////////////////////////////////////////////// /// \brief host texture fetch /// @@ -41,42 +42,49 @@ /// \param[in] y y coord of the point to fetch value at /// \return fetched value /////////////////////////////////////////////////////////////////////////////// -inline float Tex2D(const float *t, int w, int h, int s, float x, float y) { - // integer parts in floating point format - float intPartX, intPartY; +inline float Tex2D(const float *t, int w, int h, int s, float x, float y) +{ + // integer parts in floating point format + float intPartX, intPartY; - // get fractional parts of coordinates - float dx = fabsf(modff(x, &intPartX)); - float dy = fabsf(modff(y, &intPartY)); + // get fractional parts of coordinates + float dx = fabsf(modff(x, &intPartX)); + float dy = fabsf(modff(y, &intPartY)); - // assume pixels are squares - // one of the corners - int ix0 = (int)intPartX; - int iy0 = (int)intPartY; + // assume pixels are squares + // one of the corners + int ix0 = (int)intPartX; + int iy0 = (int)intPartY; - // mirror out-of-range position - if (ix0 < 0) ix0 = abs(ix0 + 1); + // mirror out-of-range position + if (ix0 < 0) + ix0 = abs(ix0 + 1); - if (iy0 < 0) iy0 = abs(iy0 + 1); + if (iy0 < 0) + iy0 = abs(iy0 + 1); - if (ix0 >= w) ix0 = w * 2 - ix0 - 1; + if (ix0 >= w) + ix0 = w * 2 - ix0 - 1; - if (iy0 >= h) iy0 = h * 2 - iy0 - 1; + if (iy0 >= h) + iy0 = h * 2 - iy0 - 1; - // corner which is opposite to (ix0, iy0) - int ix1 = ix0 + 1; - int iy1 = iy0 + 1; + // corner which is opposite to (ix0, iy0) + int ix1 = ix0 + 1; + int iy1 = iy0 + 1; - if (ix1 >= w) ix1 = w * 2 - ix1 - 1; + if (ix1 >= w) + ix1 = w * 2 - ix1 - 1; - if (iy1 >= h) iy1 = h * 2 - iy1 - 1; + if (iy1 >= h) + iy1 = h * 2 - iy1 - 1; - float res = t[ix0 + iy0 * s] * (1.0f - dx) * (1.0f - dy); - res += t[ix1 + iy0 * s] * dx * (1.0f - dy); - res += t[ix0 + iy1 * s] * (1.0f - dx) * dy; - res += t[ix1 + iy1 * s] * dx * dy; + float res = t[ix0 + 
iy0 * s] * (1.0f - dx) * (1.0f - dy); + res += t[ix1 + iy0 * s] * dx * (1.0f - dy); + res += t[ix0 + iy1 * s] * (1.0f - dx) * dy; + res += t[ix1 + iy1 * s] * dx * dy; - return res; + return res; } /////////////////////////////////////////////////////////////////////////////// @@ -92,16 +100,21 @@ inline float Tex2D(const float *t, int w, int h, int s, float x, float y) { /// \param[in] y y coord of the point to fetch value at /// \return fetched value /////////////////////////////////////////////////////////////////////////////// -inline float Tex2Di(const float *src, int w, int h, int s, int x, int y) { - if (x < 0) x = abs(x + 1); +inline float Tex2Di(const float *src, int w, int h, int s, int x, int y) +{ + if (x < 0) + x = abs(x + 1); - if (y < 0) y = abs(y + 1); + if (y < 0) + y = abs(y + 1); - if (x >= w) x = w * 2 - x - 1; + if (x >= w) + x = w * 2 - x - 1; - if (y >= h) y = h * 2 - y - 1; + if (y >= h) + y = h * 2 - y - 1; - return src[x + y * s]; + return src[x + y * s]; } /////////////////////////////////////////////////////////////////////////////// @@ -115,23 +128,24 @@ inline float Tex2Di(const float *src, int w, int h, int s, int x, int y) { /// \param[in] newStride image new stride /// \param[out] out downscaled image data /////////////////////////////////////////////////////////////////////////////// -static void Downscale(const float *src, int width, int height, int stride, - int newWidth, int newHeight, int newStride, float *out) { - for (int i = 0; i < newHeight; ++i) { - for (int j = 0; j < newWidth; ++j) { - const int srcX = j * 2; - const int srcY = i * 2; - // average 4 neighbouring pixels - float sum; - sum = Tex2Di(src, width, height, stride, srcX + 0, srcY + 0); - sum += Tex2Di(src, width, height, stride, srcX + 0, srcY + 1); - sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 0); - sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 1); - // normalize - sum *= 0.25f; - out[j + i * newStride] = sum; +static void +Downscale(const float *src, int width, int height, int stride, int newWidth, int newHeight, int newStride, float *out) +{ + for (int i = 0; i < newHeight; ++i) { + for (int j = 0; j < newWidth; ++j) { + const int srcX = j * 2; + const int srcY = i * 2; + // average 4 neighbouring pixels + float sum; + sum = Tex2Di(src, width, height, stride, srcX + 0, srcY + 0); + sum += Tex2Di(src, width, height, stride, srcX + 0, srcY + 1); + sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 0); + sum += Tex2Di(src, width, height, stride, srcX + 1, srcY + 1); + // normalize + sum *= 0.25f; + out[j + i * newStride] = sum; + } } - } } /////////////////////////////////////////////////////////////////////////////// @@ -146,18 +160,25 @@ static void Downscale(const float *src, int width, int height, int stride, /// \param[in] scale value scale factor (multiplier) /// \param[out] out upscaled field component /////////////////////////////////////////////////////////////////////////////// -static void Upscale(const float *src, int width, int height, int stride, - int newWidth, int newHeight, int newStride, float scale, - float *out) { - for (int i = 0; i < newHeight; ++i) { - for (int j = 0; j < newWidth; ++j) { - // position within smaller image - float x = ((float)j - 0.5f) * 0.5f; - float y = ((float)i - 0.5f) * 0.5f; +static void Upscale(const float *src, + int width, + int height, + int stride, + int newWidth, + int newHeight, + int newStride, + float scale, + float *out) +{ + for (int i = 0; i < newHeight; ++i) { + for (int j = 0; j < newWidth; 
++j) { + // position within smaller image + float x = ((float)j - 0.5f) * 0.5f; + float y = ((float)i - 0.5f) * 0.5f; - out[j + i * newStride] = Tex2D(src, width, height, stride, x, y) * scale; + out[j + i * newStride] = Tex2D(src, width, height, stride, x, y) * scale; + } } - } } /////////////////////////////////////////////////////////////////////////////// @@ -176,18 +197,18 @@ static void Upscale(const float *src, int width, int height, int stride, /// \param[in] v vertical displacement /// \param[out] out warped image /////////////////////////////////////////////////////////////////////////////// -static void WarpImage(const float *src, int w, int h, int s, const float *u, - const float *v, float *out) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - const int pos = j + i * s; - // warped coords - float x = (float)j + u[pos]; - float y = (float)i + v[pos]; +static void WarpImage(const float *src, int w, int h, int s, const float *u, const float *v, float *out) +{ + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const int pos = j + i * s; + // warped coords + float x = (float)j + u[pos]; + float y = (float)i + v[pos]; - out[pos] = Tex2D(src, w, h, s, x, y); + out[pos] = Tex2D(src, w, h, s, x, y); + } } - } } /////////////////////////////////////////////////////////////////////////////// @@ -201,48 +222,48 @@ static void WarpImage(const float *src, int w, int h, int s, const float *u, /// \param[out] Iy y derivative /// \param[out] Iz temporal derivative /////////////////////////////////////////////////////////////////////////////// -static void ComputeDerivatives(const float *I0, const float *I1, int w, int h, - int s, float *Ix, float *Iy, float *Iz) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - const int pos = j + i * s; - float t0, t1; - // derivative filter is (1, -8, 0, 8, -1)/12 - // x derivative - t0 = Tex2Di(I0, w, h, s, j - 2, i); - t0 -= Tex2Di(I0, w, h, s, j - 1, i) * 8.0f; - t0 += Tex2Di(I0, w, h, s, j + 1, i) * 8.0f; - t0 -= Tex2Di(I0, w, h, s, j + 2, i); - t0 /= 12.0f; +static void ComputeDerivatives(const float *I0, const float *I1, int w, int h, int s, float *Ix, float *Iy, float *Iz) +{ + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const int pos = j + i * s; + float t0, t1; + // derivative filter is (1, -8, 0, 8, -1)/12 + // x derivative + t0 = Tex2Di(I0, w, h, s, j - 2, i); + t0 -= Tex2Di(I0, w, h, s, j - 1, i) * 8.0f; + t0 += Tex2Di(I0, w, h, s, j + 1, i) * 8.0f; + t0 -= Tex2Di(I0, w, h, s, j + 2, i); + t0 /= 12.0f; - t1 = Tex2Di(I1, w, h, s, j - 2, i); - t1 -= Tex2Di(I1, w, h, s, j - 1, i) * 8.0f; - t1 += Tex2Di(I1, w, h, s, j + 1, i) * 8.0f; - t1 -= Tex2Di(I1, w, h, s, j + 2, i); - t1 /= 12.0f; + t1 = Tex2Di(I1, w, h, s, j - 2, i); + t1 -= Tex2Di(I1, w, h, s, j - 1, i) * 8.0f; + t1 += Tex2Di(I1, w, h, s, j + 1, i) * 8.0f; + t1 -= Tex2Di(I1, w, h, s, j + 2, i); + t1 /= 12.0f; - // spatial derivatives are averaged - Ix[pos] = (t0 + t1) * 0.5f; + // spatial derivatives are averaged + Ix[pos] = (t0 + t1) * 0.5f; - // t derivative - Iz[pos] = I1[pos] - I0[pos]; + // t derivative + Iz[pos] = I1[pos] - I0[pos]; - // y derivative - t0 = Tex2Di(I0, w, h, s, j, i - 2); - t0 -= Tex2Di(I0, w, h, s, j, i - 1) * 8.0f; - t0 += Tex2Di(I0, w, h, s, j, i + 1) * 8.0f; - t0 -= Tex2Di(I0, w, h, s, j, i + 2); - t0 /= 12.0f; + // y derivative + t0 = Tex2Di(I0, w, h, s, j, i - 2); + t0 -= Tex2Di(I0, w, h, s, j, i - 1) * 8.0f; + t0 += Tex2Di(I0, w, h, s, j, i + 1) * 8.0f; + t0 -= Tex2Di(I0, w, h, s, j, i + 2); + 
t0 /= 12.0f; - t1 = Tex2Di(I1, w, h, s, j, i - 2); - t1 -= Tex2Di(I1, w, h, s, j, i - 1) * 8.0f; - t1 += Tex2Di(I1, w, h, s, j, i + 1) * 8.0f; - t1 -= Tex2Di(I1, w, h, s, j, i + 2); - t1 /= 12.0f; + t1 = Tex2Di(I1, w, h, s, j, i - 2); + t1 -= Tex2Di(I1, w, h, s, j, i - 1) * 8.0f; + t1 += Tex2Di(I1, w, h, s, j, i + 1) * 8.0f; + t1 -= Tex2Di(I1, w, h, s, j, i + 2); + t1 /= 12.0f; - Iy[pos] = (t0 + t1) * 0.5f; + Iy[pos] = (t0 + t1) * 0.5f; + } } - } } /////////////////////////////////////////////////////////////////////////////// @@ -261,45 +282,53 @@ static void ComputeDerivatives(const float *I0, const float *I1, int w, int h, /// \param[out] du1 new horizontal displacement approximation /// \param[out] dv1 new vertical displacement approximation /////////////////////////////////////////////////////////////////////////////// -static void SolveForUpdate(const float *du0, const float *dv0, const float *Ix, - const float *Iy, const float *Iz, int w, int h, - int s, float alpha, float *du1, float *dv1) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - const int pos = j + i * s; - int left, right, up, down; +static void SolveForUpdate(const float *du0, + const float *dv0, + const float *Ix, + const float *Iy, + const float *Iz, + int w, + int h, + int s, + float alpha, + float *du1, + float *dv1) +{ + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const int pos = j + i * s; + int left, right, up, down; - // handle borders - if (j != 0) - left = pos - 1; - else - left = pos; + // handle borders + if (j != 0) + left = pos - 1; + else + left = pos; - if (j != w - 1) - right = pos + 1; - else - right = pos; + if (j != w - 1) + right = pos + 1; + else + right = pos; - if (i != 0) - down = pos - s; - else - down = pos; + if (i != 0) + down = pos - s; + else + down = pos; - if (i != h - 1) - up = pos + s; - else - up = pos; + if (i != h - 1) + up = pos + s; + else + up = pos; - float sumU = (du0[left] + du0[right] + du0[up] + du0[down]) * 0.25f; - float sumV = (dv0[left] + dv0[right] + dv0[up] + dv0[down]) * 0.25f; + float sumU = (du0[left] + du0[right] + du0[up] + du0[down]) * 0.25f; + float sumV = (dv0[left] + dv0[right] + dv0[up] + dv0[down]) * 0.25f; - float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) / - (Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha); + float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) / (Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha); - du1[pos] = sumU - Ix[pos] * frac; - dv1[pos] = sumV - Iy[pos] * frac; + du1[pos] = sumU - Ix[pos] * frac; + dv1[pos] = sumV - Iy[pos] * frac; + } } - } } /////////////////////////////////////////////////////////////////////////////// @@ -318,145 +347,177 @@ static void SolveForUpdate(const float *du0, const float *dv0, const float *Ix, /// \param[out] u horizontal displacement /// \param[out] v vertical displacement /////////////////////////////////////////////////////////////////////////////// -void ComputeFlowGold(const float *I0, const float *I1, int width, int height, - int stride, float alpha, int nLevels, int nWarpIters, - int nSolverIters, float *u, float *v) { - printf("Computing optical flow on CPU...\n"); +void ComputeFlowGold(const float *I0, + const float *I1, + int width, + int height, + int stride, + float alpha, + int nLevels, + int nWarpIters, + int nSolverIters, + float *u, + float *v) +{ + printf("Computing optical flow on CPU...\n"); - float *u0 = u; - float *v0 = v; + float *u0 = u; + float *v0 = v; - const float **pI0 = new const float *[nLevels]; - const float **pI1 = new 
const float *[nLevels]; + const float **pI0 = new const float *[nLevels]; + const float **pI1 = new const float *[nLevels]; - int *pW = new int[nLevels]; - int *pH = new int[nLevels]; - int *pS = new int[nLevels]; + int *pW = new int[nLevels]; + int *pH = new int[nLevels]; + int *pS = new int[nLevels]; - const int pixelCountAligned = height * stride; + const int pixelCountAligned = height * stride; - float *tmp = new float[pixelCountAligned]; - float *du0 = new float[pixelCountAligned]; - float *dv0 = new float[pixelCountAligned]; - float *du1 = new float[pixelCountAligned]; - float *dv1 = new float[pixelCountAligned]; - float *Ix = new float[pixelCountAligned]; - float *Iy = new float[pixelCountAligned]; - float *Iz = new float[pixelCountAligned]; - float *nu = new float[pixelCountAligned]; - float *nv = new float[pixelCountAligned]; + float *tmp = new float[pixelCountAligned]; + float *du0 = new float[pixelCountAligned]; + float *dv0 = new float[pixelCountAligned]; + float *du1 = new float[pixelCountAligned]; + float *dv1 = new float[pixelCountAligned]; + float *Ix = new float[pixelCountAligned]; + float *Iy = new float[pixelCountAligned]; + float *Iz = new float[pixelCountAligned]; + float *nu = new float[pixelCountAligned]; + float *nv = new float[pixelCountAligned]; - // prepare pyramid - int currentLevel = nLevels - 1; - pI0[currentLevel] = I0; - pI1[currentLevel] = I1; + // prepare pyramid + int currentLevel = nLevels - 1; + pI0[currentLevel] = I0; + pI1[currentLevel] = I1; - pW[currentLevel] = width; - pH[currentLevel] = height; - pS[currentLevel] = stride; + pW[currentLevel] = width; + pH[currentLevel] = height; + pS[currentLevel] = stride; - for (; currentLevel > 0; --currentLevel) { - int nw = pW[currentLevel] / 2; - int nh = pH[currentLevel] / 2; - int ns = iAlignUp(nw); - pI0[currentLevel - 1] = new float[ns * nh]; - pI1[currentLevel - 1] = new float[ns * nh]; + for (; currentLevel > 0; --currentLevel) { + int nw = pW[currentLevel] / 2; + int nh = pH[currentLevel] / 2; + int ns = iAlignUp(nw); + pI0[currentLevel - 1] = new float[ns * nh]; + pI1[currentLevel - 1] = new float[ns * nh]; - Downscale(pI0[currentLevel], pW[currentLevel], pH[currentLevel], - pS[currentLevel], nw, nh, ns, (float *)pI0[currentLevel - 1]); + Downscale(pI0[currentLevel], + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + nw, + nh, + ns, + (float *)pI0[currentLevel - 1]); - Downscale(pI1[currentLevel], pW[currentLevel], pH[currentLevel], - pS[currentLevel], nw, nh, ns, (float *)pI1[currentLevel - 1]); + Downscale(pI1[currentLevel], + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + nw, + nh, + ns, + (float *)pI1[currentLevel - 1]); - pW[currentLevel - 1] = nw; - pH[currentLevel - 1] = nh; - pS[currentLevel - 1] = ns; - } - - // initial approximation - memset(u, 0, stride * height * sizeof(float)); - memset(v, 0, stride * height * sizeof(float)); - - // compute flow - for (; currentLevel < nLevels; ++currentLevel) { - for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) { - memset(du0, 0, pixelCountAligned * sizeof(float)); - memset(dv0, 0, pixelCountAligned * sizeof(float)); - - memset(du1, 0, pixelCountAligned * sizeof(float)); - memset(dv1, 0, pixelCountAligned * sizeof(float)); - - WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel], - pS[currentLevel], u, v, tmp); - - // on current level we compute optical flow - // between frame 0 and warped frame 1 - ComputeDerivatives(pI0[currentLevel], tmp, pW[currentLevel], - pH[currentLevel], pS[currentLevel], Ix, Iy, Iz); 
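// A worked restatement (sketch) of what one solver sweep in this loop computes per pixel;
// duBar/dvBar stand for the four-neighbour averages formed inside SolveForUpdate:
//
//   duBar = (du0[left] + du0[right] + du0[up] + du0[down]) * 0.25f;
//   dvBar = (dv0[left] + dv0[right] + dv0[up] + dv0[down]) * 0.25f;
//   frac  = (Ix * duBar + Iy * dvBar + Iz) / (Ix * Ix + Iy * Iy + alpha);
//   du1   = duBar - Ix * frac;
//   dv1   = dvBar - Iy * frac;
//
// i.e. a Jacobi step of the linearized Horn-Schunck system: the data term (Ix, Iy, Iz)
// pulls the averaged increment toward brightness constancy, while the smoothness
// weight alpha keeps the division well conditioned.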
- - for (int iter = 0; iter < nSolverIters; ++iter) { - SolveForUpdate(du0, dv0, Ix, Iy, Iz, pW[currentLevel], pH[currentLevel], - pS[currentLevel], alpha, du1, dv1); - Swap(du0, du1); - Swap(dv0, dv1); - } - - // update u, v - for (int i = 0; i < pH[currentLevel] * pS[currentLevel]; ++i) { - u[i] += du0[i]; - v[i] += dv0[i]; - } - } // end for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) - - if (currentLevel != nLevels - 1) { - // prolongate solution - float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel]; - - Upscale(u, pW[currentLevel], pH[currentLevel], pS[currentLevel], - pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1], - scaleX, nu); - - float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel]; - - Upscale(v, pW[currentLevel], pH[currentLevel], pS[currentLevel], - pW[currentLevel + 1], pH[currentLevel + 1], pS[currentLevel + 1], - scaleY, nv); - - Swap(u, nu); - Swap(v, nv); + pW[currentLevel - 1] = nw; + pH[currentLevel - 1] = nh; + pS[currentLevel - 1] = ns; } - } // end for (; currentLevel < nLevels; ++currentLevel) - if (u != u0) { - // solution is not in the specified array - // copy - memcpy(u0, u, pixelCountAligned * sizeof(float)); - memcpy(v0, v, pixelCountAligned * sizeof(float)); - Swap(u, nu); - Swap(v, nv); - } + // initial approximation + memset(u, 0, stride * height * sizeof(float)); + memset(v, 0, stride * height * sizeof(float)); - // cleanup - // last level is not being freed here - // because it refers to input images - for (int i = 0; i < nLevels - 1; ++i) { - delete[] pI0[i]; - delete[] pI1[i]; - } + // compute flow + for (; currentLevel < nLevels; ++currentLevel) { + for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) { + memset(du0, 0, pixelCountAligned * sizeof(float)); + memset(dv0, 0, pixelCountAligned * sizeof(float)); - delete[] pI0; - delete[] pI1; - delete[] pW; - delete[] pH; - delete[] pS; - delete[] tmp; - delete[] du0; - delete[] dv0; - delete[] du1; - delete[] dv1; - delete[] Ix; - delete[] Iy; - delete[] Iz; - delete[] nu; - delete[] nv; + memset(du1, 0, pixelCountAligned * sizeof(float)); + memset(dv1, 0, pixelCountAligned * sizeof(float)); + + WarpImage(pI1[currentLevel], pW[currentLevel], pH[currentLevel], pS[currentLevel], u, v, tmp); + + // on current level we compute optical flow + // between frame 0 and warped frame 1 + ComputeDerivatives( + pI0[currentLevel], tmp, pW[currentLevel], pH[currentLevel], pS[currentLevel], Ix, Iy, Iz); + + for (int iter = 0; iter < nSolverIters; ++iter) { + SolveForUpdate( + du0, dv0, Ix, Iy, Iz, pW[currentLevel], pH[currentLevel], pS[currentLevel], alpha, du1, dv1); + Swap(du0, du1); + Swap(dv0, dv1); + } + + // update u, v + for (int i = 0; i < pH[currentLevel] * pS[currentLevel]; ++i) { + u[i] += du0[i]; + v[i] += dv0[i]; + } + } // end for (int warpIter = 0; warpIter < nWarpIters; ++warpIter) + + if (currentLevel != nLevels - 1) { + // prolongate solution + float scaleX = (float)pW[currentLevel + 1] / (float)pW[currentLevel]; + + Upscale(u, + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + pW[currentLevel + 1], + pH[currentLevel + 1], + pS[currentLevel + 1], + scaleX, + nu); + + float scaleY = (float)pH[currentLevel + 1] / (float)pH[currentLevel]; + + Upscale(v, + pW[currentLevel], + pH[currentLevel], + pS[currentLevel], + pW[currentLevel + 1], + pH[currentLevel + 1], + pS[currentLevel + 1], + scaleY, + nv); + + Swap(u, nu); + Swap(v, nv); + } + } // end for (; currentLevel < nLevels; ++currentLevel) + + if (u != u0) { + // solution is not 
in the specified array + // copy + memcpy(u0, u, pixelCountAligned * sizeof(float)); + memcpy(v0, v, pixelCountAligned * sizeof(float)); + Swap(u, nu); + Swap(v, nv); + } + + // cleanup + // last level is not being freed here + // because it refers to input images + for (int i = 0; i < nLevels - 1; ++i) { + delete[] pI0[i]; + delete[] pI1[i]; + } + + delete[] pI0; + delete[] pI1; + delete[] pW; + delete[] pH; + delete[] pS; + delete[] tmp; + delete[] du0; + delete[] dv0; + delete[] du1; + delete[] dv1; + delete[] Ix; + delete[] Iy; + delete[] Iz; + delete[] nu; + delete[] nv; } diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.h b/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.h index 0dc02970..c20d5f5c 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.h +++ b/Samples/5_Domain_Specific/HSOpticalFlow/flowGold.h @@ -28,17 +28,16 @@ #ifndef FLOW_GOLD_H #define FLOW_GOLD_H -void ComputeFlowGold( - const float *I0, // source frame - const float *I1, // tracked frame - int width, // frame width - int height, // frame height - int stride, // row access stride - float alpha, // smoothness coefficient - int nLevels, // number of levels in pyramid - int nWarpIters, // number of warping iterations per pyramid level - int nIters, // number of solver iterations (for linear system) - float *u, // output horizontal flow - float *v); // output vertical flow +void ComputeFlowGold(const float *I0, // source frame + const float *I1, // tracked frame + int width, // frame width + int height, // frame height + int stride, // row access stride + float alpha, // smoothness coefficient + int nLevels, // number of levels in pyramid + int nWarpIters, // number of warping iterations per pyramid level + int nIters, // number of solver iterations (for linear system) + float *u, // output horizontal flow + float *v); // output vertical flow #endif diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/main.cpp b/Samples/5_Domain_Specific/HSOpticalFlow/main.cpp index 77788877..28c697c4 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/main.cpp +++ b/Samples/5_Domain_Specific/HSOpticalFlow/main.cpp @@ -31,12 +31,11 @@ const static char *const sSDKsample = "HSOpticalFlow"; const float THRESHOLD = 0.05f; #include +#include #include "common.h" -#include "flowGold.h" #include "flowCUDA.h" - -#include +#include "flowGold.h" /////////////////////////////////////////////////////////////////////////////// /// \brief save optical flow in format described on vision.middlebury.edu/flow @@ -47,30 +46,30 @@ const float THRESHOLD = 0.05f; /// \param[in] u horizontal displacement /// \param[in] v vertical displacement /////////////////////////////////////////////////////////////////////////////// -void WriteFloFile(const char *name, int w, int h, int s, const float *u, - const float *v) { - FILE *stream; - stream = fopen(name, "wb"); +void WriteFloFile(const char *name, int w, int h, int s, const float *u, const float *v) +{ + FILE *stream; + stream = fopen(name, "wb"); - if (stream == 0) { - printf("Could not save flow to \"%s\"\n", name); - return; - } - - float data = 202021.25f; - fwrite(&data, sizeof(float), 1, stream); - fwrite(&w, sizeof(w), 1, stream); - fwrite(&h, sizeof(h), 1, stream); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - const int pos = j + i * s; - fwrite(u + pos, sizeof(float), 1, stream); - fwrite(v + pos, sizeof(float), 1, stream); + if (stream == 0) { + printf("Could not save flow to \"%s\"\n", name); + return; } - } - fclose(stream); + float data = 202021.25f; + 
fwrite(&data, sizeof(float), 1, stream); + fwrite(&w, sizeof(w), 1, stream); + fwrite(&h, sizeof(h), 1, stream); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const int pos = j + i * s; + fwrite(u + pos, sizeof(float), 1, stream); + fwrite(v + pos, sizeof(float), 1, stream); + } + } + + fclose(stream); } /////////////////////////////////////////////////////////////////////////////// @@ -85,41 +84,41 @@ void WriteFloFile(const char *name, int w, int h, int s, const float *u, /// \param[in] exePath executable file path /// \return true if image is successfully loaded or false otherwise /////////////////////////////////////////////////////////////////////////////// -bool LoadImageAsFP32(float *&img_data, int &img_w, int &img_h, int &img_s, - const char *name, const char *exePath) { - printf("Loading \"%s\" ...\n", name); - char *name_ = sdkFindFilePath(name, exePath); +bool LoadImageAsFP32(float *&img_data, int &img_w, int &img_h, int &img_s, const char *name, const char *exePath) +{ + printf("Loading \"%s\" ...\n", name); + char *name_ = sdkFindFilePath(name, exePath); - if (!name_) { - printf("File not found\n"); - return false; - } - - unsigned char *data = 0; - unsigned int w = 0, h = 0; - bool result = sdkLoadPPM4ub(name_, &data, &w, &h); - - if (result == false) { - printf("Invalid file format\n"); - return false; - } - - img_w = w; - img_h = h; - img_s = iAlignUp(img_w); - - img_data = new float[img_s * h]; - - // source is 4 channel image - const int widthStep = 4 * img_w; - - for (int i = 0; i < img_h; ++i) { - for (int j = 0; j < img_w; ++j) { - img_data[j + i * img_s] = ((float)data[j * 4 + i * widthStep]) / 255.0f; + if (!name_) { + printf("File not found\n"); + return false; } - } - return true; + unsigned char *data = 0; + unsigned int w = 0, h = 0; + bool result = sdkLoadPPM4ub(name_, &data, &w, &h); + + if (result == false) { + printf("Invalid file format\n"); + return false; + } + + img_w = w; + img_h = h; + img_s = iAlignUp(img_w); + + img_data = new float[img_s * h]; + + // source is 4 channel image + const int widthStep = 4 * img_w; + + for (int i = 0; i < img_h; ++i) { + for (int j = 0; j < img_w; ++j) { + img_data[j + i * img_s] = ((float)data[j * 4 + i * widthStep]) / 255.0f; + } + } + + return true; } /////////////////////////////////////////////////////////////////////////////// @@ -133,105 +132,108 @@ bool LoadImageAsFP32(float *&img_data, int &img_w, int &img_h, int &img_s, /// \param[in] h_v vertical displacement /// \return true if discrepancy is lower than a given threshold /////////////////////////////////////////////////////////////////////////////// -bool CompareWithGold(int width, int height, int stride, const float *h_uGold, - const float *h_vGold, const float *h_u, const float *h_v) { - float error = 0.0f; +bool CompareWithGold(int width, + int height, + int stride, + const float *h_uGold, + const float *h_vGold, + const float *h_u, + const float *h_v) +{ + float error = 0.0f; - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - const int pos = j + i * stride; - error += fabsf(h_u[pos] - h_uGold[pos]) + fabsf(h_v[pos] - h_vGold[pos]); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int pos = j + i * stride; + error += fabsf(h_u[pos] - h_uGold[pos]) + fabsf(h_v[pos] - h_vGold[pos]); + } } - } - error /= (float)(width * height); + error /= (float)(width * height); - printf("L1 error : %.6f\n", error); + printf("L1 error : %.6f\n", error); - return (error < THRESHOLD); + return (error 
< THRESHOLD); } /////////////////////////////////////////////////////////////////////////////// /// application entry point /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // welcome message - printf("%s Starting...\n\n", sSDKsample); +int main(int argc, char **argv) +{ + // welcome message + printf("%s Starting...\n\n", sSDKsample); - // pick GPU - findCudaDevice(argc, (const char **)argv); + // pick GPU + findCudaDevice(argc, (const char **)argv); - // find images - const char *const sourceFrameName = "frame10.ppm"; - const char *const targetFrameName = "frame11.ppm"; + // find images + const char *const sourceFrameName = "frame10.ppm"; + const char *const targetFrameName = "frame11.ppm"; - // image dimensions - int width; - int height; - // row access stride - int stride; + // image dimensions + int width; + int height; + // row access stride + int stride; - // flow is computed from source image to target image - float *h_source; // source image, host memory - float *h_target; // target image, host memory + // flow is computed from source image to target image + float *h_source; // source image, host memory + float *h_target; // target image, host memory - // load image from file - if (!LoadImageAsFP32(h_source, width, height, stride, sourceFrameName, - argv[0])) { - exit(EXIT_FAILURE); - } + // load image from file + if (!LoadImageAsFP32(h_source, width, height, stride, sourceFrameName, argv[0])) { + exit(EXIT_FAILURE); + } - if (!LoadImageAsFP32(h_target, width, height, stride, targetFrameName, - argv[0])) { - exit(EXIT_FAILURE); - } + if (!LoadImageAsFP32(h_target, width, height, stride, targetFrameName, argv[0])) { + exit(EXIT_FAILURE); + } - // allocate host memory for CPU results - float *h_uGold = new float[stride * height]; - float *h_vGold = new float[stride * height]; + // allocate host memory for CPU results + float *h_uGold = new float[stride * height]; + float *h_vGold = new float[stride * height]; - // allocate host memory for GPU results - float *h_u = new float[stride * height]; - float *h_v = new float[stride * height]; + // allocate host memory for GPU results + float *h_u = new float[stride * height]; + float *h_v = new float[stride * height]; - // smoothness - // if image brightness is not within [0,1] - // this paramter should be scaled appropriately - const float alpha = 0.2f; + // smoothness + // if image brightness is not within [0,1] + // this parameter should be scaled appropriately + const float alpha = 0.2f; - // number of pyramid levels - const int nLevels = 5; + // number of pyramid levels + const int nLevels = 5; - // number of solver iterations on each level - const int nSolverIters = 500; + // number of solver iterations on each level + const int nSolverIters = 500; - // number of warping iterations - const int nWarpIters = 3; + // number of warping iterations + const int nWarpIters = 3; - ComputeFlowGold(h_source, h_target, width, height, stride, alpha, nLevels, - nWarpIters, nSolverIters, h_uGold, h_vGold); + ComputeFlowGold( + h_source, h_target, width, height, stride, alpha, nLevels, nWarpIters, nSolverIters, h_uGold, h_vGold); - ComputeFlowCUDA(h_source, h_target, width, height, stride, alpha, nLevels, - nWarpIters, nSolverIters, h_u, h_v); + ComputeFlowCUDA(h_source, h_target, width, height, stride, alpha, nLevels, nWarpIters, nSolverIters, h_u, h_v); - // compare
results (L1 norm) + bool status = CompareWithGold(width, height, stride, h_uGold, h_vGold, h_u, h_v); - WriteFloFile("FlowGPU.flo", width, height, stride, h_u, h_v); + WriteFloFile("FlowGPU.flo", width, height, stride, h_u, h_v); - WriteFloFile("FlowCPU.flo", width, height, stride, h_uGold, h_vGold); + WriteFloFile("FlowCPU.flo", width, height, stride, h_uGold, h_vGold); - // free resources - delete[] h_uGold; - delete[] h_vGold; + // free resources + delete[] h_uGold; + delete[] h_vGold; - delete[] h_u; - delete[] h_v; + delete[] h_u; + delete[] h_v; - delete[] h_source; - delete[] h_target; + delete[] h_source; + delete[] h_target; - // report self-test status - exit(status ? EXIT_SUCCESS : EXIT_FAILURE); + // report self-test status + exit(status ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/solverKernel.cuh b/Samples/5_Domain_Specific/HSOpticalFlow/solverKernel.cuh index 2493ee8a..1b2faf91 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/solverKernel.cuh +++ b/Samples/5_Domain_Specific/HSOpticalFlow/solverKernel.cuh @@ -25,9 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "common.h" #include <cooperative_groups.h> +#include "common.h" + namespace cg = cooperative_groups; /////////////////////////////////////////////////////////////////////////////// @@ -48,108 +49,117 @@ namespace cg = cooperative_groups; /// \param[out] dv1 new vertical displacement approximation /////////////////////////////////////////////////////////////////////////////// template <int bx, int by> -__global__ void JacobiIteration(const float *du0, const float *dv0, - const float *Ix, const float *Iy, - const float *Iz, int w, int h, int s, - float alpha, float *du1, float *dv1) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void JacobiIteration(const float *du0, + const float *dv0, + const float *Ix, + const float *Iy, + const float *Iz, + int w, + int h, + int s, + float alpha, + float *du1, + float *dv1) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - volatile __shared__ float du[(bx + 2) * (by + 2)]; - volatile __shared__ float dv[(bx + 2) * (by + 2)]; + volatile __shared__ float du[(bx + 2) * (by + 2)]; + volatile __shared__ float dv[(bx + 2) * (by + 2)]; - const int ix = threadIdx.x + blockIdx.x * blockDim.x; - const int iy = threadIdx.y + blockIdx.y * blockDim.y; + const int ix = threadIdx.x + blockIdx.x * blockDim.x; + const int iy = threadIdx.y + blockIdx.y * blockDim.y; - // position within global memory array - const int pos = min(ix, w - 1) + min(iy, h - 1) * s; + // position within global memory array + const int pos = min(ix, w - 1) + min(iy, h - 1) * s; - // position within shared memory array - const int shMemPos = threadIdx.x + 1 + (threadIdx.y + 1) * (bx + 2); + // position within shared memory array + const int shMemPos = threadIdx.x + 1 + (threadIdx.y + 1) * (bx + 2); - // Load data to shared memory. - // load tile being processed - du[shMemPos] = du0[pos]; - dv[shMemPos] = dv0[pos]; + // Load data to shared memory. + // load tile being processed + du[shMemPos] = du0[pos]; + dv[shMemPos] = dv0[pos]; - // load necessary neighbouring elements - // We clamp out-of-range coordinates. - // It is equivalent to mirroring - // because we access data only one step away from borders.
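// Shared-memory layout sketch for the halo loads that follow (restating the index
// arithmetic above, for a bx x by thread block): the tile is (bx + 2) x (by + 2)
// floats with a one-element border on every side.
//
//   interior element (tx, ty)   -> du[(tx + 1) + (ty + 1) * (bx + 2)]
//   clamped row below the tile  -> du[tx + 1]                          (tile row 0)
//   clamped row above the tile  -> du[(tx + 1) + (by + 1) * (bx + 2)]  (tile row by + 1)
//   left / right halo columns   -> du[(r + 1) * (bx + 2)] and du[(bx + 1) + (r + 1) * (bx + 2)]
//
// Threads with threadIdx.y == 0 fetch the two halo rows, threads with
// threadIdx.y == 1 fetch the two halo columns (only while threadIdx.x < by), and a
// single cg::sync(cta) then makes the whole tile visible before the stencil is applied.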
- if (threadIdx.y == 0) { - // beginning of the tile - const int bsx = blockIdx.x * blockDim.x; - const int bsy = blockIdx.y * blockDim.y; - // element position within matrix - int x, y; - // element position within linear array - // gm - global memory - // sm - shared memory - int gmPos, smPos; + // load necessary neighbouring elements + // We clamp out-of-range coordinates. + // It is equivalent to mirroring + // because we access data only one step away from borders. + if (threadIdx.y == 0) { + // beginning of the tile + const int bsx = blockIdx.x * blockDim.x; + const int bsy = blockIdx.y * blockDim.y; + // element position within matrix + int x, y; + // element position within linear array + // gm - global memory + // sm - shared memory + int gmPos, smPos; - x = min(bsx + threadIdx.x, w - 1); - // row just below the tile - y = max(bsy - 1, 0); - gmPos = y * s + x; - smPos = threadIdx.x + 1; - du[smPos] = du0[gmPos]; - dv[smPos] = dv0[gmPos]; + x = min(bsx + threadIdx.x, w - 1); + // row just below the tile + y = max(bsy - 1, 0); + gmPos = y * s + x; + smPos = threadIdx.x + 1; + du[smPos] = du0[gmPos]; + dv[smPos] = dv0[gmPos]; - // row above the tile - y = min(bsy + by, h - 1); - smPos += (by + 1) * (bx + 2); - gmPos = y * s + x; - du[smPos] = du0[gmPos]; - dv[smPos] = dv0[gmPos]; - } else if (threadIdx.y == 1) { - // beginning of the tile - const int bsx = blockIdx.x * blockDim.x; - const int bsy = blockIdx.y * blockDim.y; - // element position within matrix - int x, y; - // element position within linear array - // gm - global memory - // sm - shared memory - int gmPos, smPos; - - y = min(bsy + threadIdx.x, h - 1); - // column to the left - x = max(bsx - 1, 0); - smPos = bx + 2 + threadIdx.x * (bx + 2); - gmPos = x + y * s; - - // check if we are within tile - if (threadIdx.x < by) { - du[smPos] = du0[gmPos]; - dv[smPos] = dv0[gmPos]; - // column to the right - x = min(bsx + bx, w - 1); - gmPos = y * s + x; - smPos += bx + 1; - du[smPos] = du0[gmPos]; - dv[smPos] = dv0[gmPos]; + // row above the tile + y = min(bsy + by, h - 1); + smPos += (by + 1) * (bx + 2); + gmPos = y * s + x; + du[smPos] = du0[gmPos]; + dv[smPos] = dv0[gmPos]; } - } + else if (threadIdx.y == 1) { + // beginning of the tile + const int bsx = blockIdx.x * blockDim.x; + const int bsy = blockIdx.y * blockDim.y; + // element position within matrix + int x, y; + // element position within linear array + // gm - global memory + // sm - shared memory + int gmPos, smPos; - cg::sync(cta); + y = min(bsy + threadIdx.x, h - 1); + // column to the left + x = max(bsx - 1, 0); + smPos = bx + 2 + threadIdx.x * (bx + 2); + gmPos = x + y * s; - if (ix >= w || iy >= h) return; + // check if we are within tile + if (threadIdx.x < by) { + du[smPos] = du0[gmPos]; + dv[smPos] = dv0[gmPos]; + // column to the right + x = min(bsx + bx, w - 1); + gmPos = y * s + x; + smPos += bx + 1; + du[smPos] = du0[gmPos]; + dv[smPos] = dv0[gmPos]; + } + } - // now all necessary data are loaded to shared memory - int left, right, up, down; - left = shMemPos - 1; - right = shMemPos + 1; - up = shMemPos + bx + 2; - down = shMemPos - bx - 2; + cg::sync(cta); - float sumU = (du[left] + du[right] + du[up] + du[down]) * 0.25f; - float sumV = (dv[left] + dv[right] + dv[up] + dv[down]) * 0.25f; + if (ix >= w || iy >= h) + return; - float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) / - (Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha); + // now all necessary data are loaded to shared memory + int left, right, up, down; + left = shMemPos - 1; + right = 
shMemPos + 1; + up = shMemPos + bx + 2; + down = shMemPos - bx - 2; - float sumU = (du[left] + du[right] + du[up] + du[down]) * 0.25f; - float sumV = (dv[left] + dv[right] + dv[up] + dv[down]) * 0.25f; + float sumU = (du[left] + du[right] + du[up] + du[down]) * 0.25f; + float sumV = (dv[left] + dv[right] + dv[up] + dv[down]) * 0.25f; - float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) / - (Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha); + float frac = (Ix[pos] * sumU + Iy[pos] * sumV + Iz[pos]) / (Ix[pos] * Ix[pos] + Iy[pos] * Iy[pos] + alpha); - du1[pos] = sumU - Ix[pos] * frac; - dv1[pos] = sumV - Iy[pos] * frac; + du1[pos] = sumU - Ix[pos] * frac; + dv1[pos] = sumV - Iy[pos] * frac; } /////////////////////////////////////////////////////////////////////////////// @@ -168,14 +178,22 @@ __global__ void JacobiIteration(const float *du0, const float *dv0, /// \param[out] du1 new horizontal displacement approximation /// \param[out] dv1 new vertical displacement approximation /////////////////////////////////////////////////////////////////////////////// -static void SolveForUpdate(const float *du0, const float *dv0, const float *Ix, - const float *Iy, const float *Iz, int w, int h, - int s, float alpha, float *du1, float *dv1) { - // CTA size - dim3 threads(32, 6); - // grid size - dim3 blocks(iDivUp(w, threads.x), iDivUp(h, threads.y)); +static void SolveForUpdate(const float *du0, + const float *dv0, + const float *Ix, + const float *Iy, + const float *Iz, + int w, + int h, + int s, + float alpha, + float *du1, + float *dv1) +{ + // CTA size + dim3 threads(32, 6); + // grid size + dim3 blocks(iDivUp(w, threads.x), iDivUp(h, threads.y)); - JacobiIteration<32, 6><<<blocks, threads>>>(du0, dv0, Ix, Iy, Iz, w, h, s, - alpha, du1, dv1); + JacobiIteration<32, 6><<<blocks, threads>>>(du0, dv0, Ix, Iy, Iz, w, h, s, alpha, du1, dv1); } diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/upscaleKernel.cuh b/Samples/5_Domain_Specific/HSOpticalFlow/upscaleKernel.cuh index fea42a6d..6e9f870a 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/upscaleKernel.cuh +++ b/Samples/5_Domain_Specific/HSOpticalFlow/upscaleKernel.cuh @@ -35,19 +35,20 @@ /// \param[in] scale scale factor (multiplier) /// \param[out] out result /////////////////////////////////////////////////////////////////////////////// -__global__ void UpscaleKernel(int width, int height, int stride, float scale, - float *out, cudaTextureObject_t texCoarse) { - const int ix = threadIdx.x + blockIdx.x * blockDim.x; - const int iy = threadIdx.y + blockIdx.y * blockDim.y; +__global__ void UpscaleKernel(int width, int height, int stride, float scale, float *out, cudaTextureObject_t texCoarse) +{ + const int ix = threadIdx.x + blockIdx.x * blockDim.x; + const int iy = threadIdx.y + blockIdx.y * blockDim.y; - if (ix >= width || iy >= height) return; + if (ix >= width || iy >= height) + return; - float x = ((float)ix + 0.5f) / (float)width; - float y = ((float)iy + 0.5f) / (float)height; + float x = ((float)ix + 0.5f) / (float)width; + float y = ((float)iy + 0.5f) / (float)height; - // exploit hardware interpolation - // and scale interpolated vector to match next pyramid level resolution - out[ix + iy * stride] = tex2D<float>(texCoarse, x, y) * scale; + // exploit hardware interpolation + // and scale interpolated vector to match next pyramid level resolution + out[ix + iy * stride] = tex2D<float>(texCoarse, x, y) * scale; } /////////////////////////////////////////////////////////////////////////////// @@ -62,35 +63,40 @@ __global__ void UpscaleKernel(int width, int height, int stride, float scale, /// \param[in] scale value scale factor (multiplier) /// \param[out] out upscaled field component /////////////////////////////////////////////////////////////////////////////// -static void Upscale(const float *src, int width,
int height, int stride, - int newWidth, int newHeight, int newStride, float scale, - float *out) { - dim3 threads(32, 8); - dim3 blocks(iDivUp(newWidth, threads.x), iDivUp(newHeight, threads.y)); +static void Upscale(const float *src, + int width, + int height, + int stride, + int newWidth, + int newHeight, + int newStride, + float scale, + float *out) +{ + dim3 threads(32, 8); + dim3 blocks(iDivUp(newWidth, threads.x), iDivUp(newHeight, threads.y)); - cudaTextureObject_t texCoarse; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texCoarse; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypePitch2D; - texRes.res.pitch2D.devPtr = (void *)src; - texRes.res.pitch2D.desc = cudaCreateChannelDesc<float>(); - texRes.res.pitch2D.width = width; - texRes.res.pitch2D.height = height; - texRes.res.pitch2D.pitchInBytes = stride * sizeof(float); + texRes.resType = cudaResourceTypePitch2D; + texRes.res.pitch2D.devPtr = (void *)src; + texRes.res.pitch2D.desc = cudaCreateChannelDesc<float>(); + texRes.res.pitch2D.width = width; + texRes.res.pitch2D.height = height; + texRes.res.pitch2D.pitchInBytes = stride * sizeof(float); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeMirror; - texDescr.addressMode[1] = cudaAddressModeMirror; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeMirror; + texDescr.addressMode[1] = cudaAddressModeMirror; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texCoarse, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texCoarse, &texRes, &texDescr, NULL)); - UpscaleKernel<<<blocks, threads>>>(newWidth, newHeight, newStride, scale, out, - texCoarse); + UpscaleKernel<<<blocks, threads>>>(newWidth, newHeight, newStride, scale, out, texCoarse); } diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/warpingKernel.cuh b/Samples/5_Domain_Specific/HSOpticalFlow/warpingKernel.cuh index 7e19a622..e1e3280d 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/warpingKernel.cuh +++ b/Samples/5_Domain_Specific/HSOpticalFlow/warpingKernel.cuh @@ -36,20 +36,26 @@ /// \param[in] v vertical displacement /// \param[out] out result /////////////////////////////////////////////////////////////////////////////// -__global__ void WarpingKernel(int width, int height, int stride, const float *u, - const float *v, float *out, - cudaTextureObject_t texToWarp) { - const int ix = threadIdx.x + blockIdx.x * blockDim.x; - const int iy = threadIdx.y + blockIdx.y * blockDim.y; +__global__ void WarpingKernel(int width, + int height, + int stride, + const float *u, + const float *v, + float *out, + cudaTextureObject_t texToWarp) +{ + const int ix = threadIdx.x + blockIdx.x * blockDim.x; + const int iy = threadIdx.y + blockIdx.y * blockDim.y; - const int pos = ix + iy * stride; + const int pos = ix + iy * stride; - if (ix >= width || iy >= height) return; + if (ix >= width || iy >= height) + return; - float x = ((float)ix + u[pos] + 0.5f) / (float)width; - float y = ((float)iy + v[pos] + 0.5f) / (float)height; + float x = ((float)ix + u[pos] + 0.5f) / (float)width; + float y = ((float)iy + v[pos] + 0.5f) / (float)height; - out[pos] =
tex2D<float>(texToWarp, x, y); + out[pos] = tex2D<float>(texToWarp, x, y); } /////////////////////////////////////////////////////////////////////////////// @@ -68,33 +74,32 @@ __global__ void WarpingKernel(int width, int height, int stride, const float *u, /// \param[in] v vertical displacement /// \param[out] out warped image /////////////////////////////////////////////////////////////////////////////// -static void WarpImage(const float *src, int w, int h, int s, const float *u, - const float *v, float *out) { - dim3 threads(32, 6); - dim3 blocks(iDivUp(w, threads.x), iDivUp(h, threads.y)); +static void WarpImage(const float *src, int w, int h, int s, const float *u, const float *v, float *out) +{ + dim3 threads(32, 6); + dim3 blocks(iDivUp(w, threads.x), iDivUp(h, threads.y)); - cudaTextureObject_t texToWarp; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texToWarp; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypePitch2D; - texRes.res.pitch2D.devPtr = (void *)src; - texRes.res.pitch2D.desc = cudaCreateChannelDesc<float>(); - texRes.res.pitch2D.width = w; - texRes.res.pitch2D.height = h; - texRes.res.pitch2D.pitchInBytes = s * sizeof(float); + texRes.resType = cudaResourceTypePitch2D; + texRes.res.pitch2D.devPtr = (void *)src; + texRes.res.pitch2D.desc = cudaCreateChannelDesc<float>(); + texRes.res.pitch2D.width = w; + texRes.res.pitch2D.height = h; + texRes.res.pitch2D.pitchInBytes = s * sizeof(float); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeMirror; - texDescr.addressMode[1] = cudaAddressModeMirror; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeMirror; + texDescr.addressMode[1] = cudaAddressModeMirror; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texToWarp, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texToWarp, &texRes, &texDescr, NULL)); - WarpingKernel<<<blocks, threads>>>(w, h, s, u, v, out, texToWarp); + WarpingKernel<<<blocks, threads>>>(w, h, s, u, v, out, texToWarp); } diff --git a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot.cpp b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot.cpp index c9e17c9f..e51642c7 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot.cpp +++ b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot.cpp @@ -53,20 +53,19 @@ // CUDA runtime // CUDA utilities and system includes -#include #include - -#include #include +#include #include // Includes +#include #include #include #include -#include "Mandelbrot_kernel.h" #include "Mandelbrot_gold.h" +#include "Mandelbrot_kernel.h" #define MAX_EPSILON_ERROR 5.0f @@ -75,15 +74,14 @@ const char *sOriginal[] = {"mandelbrot.ppm", "julia.ppm", NULL}; const char *sReference[] = {"Mandelbrot_fp32.ppm", "Mandelbrot_fp64.ppm", NULL}; -const char *sReferenceJulia[] = {"referenceJulia_fp32.ppm", - "referenceJulia_fp64.ppm", NULL}; +const char *sReferenceJulia[] = {"referenceJulia_fp32.ppm", "referenceJulia_fp64.ppm", NULL}; bool g_isJuliaSet = false; -bool g_isMoving = true; -bool g_runCPU = false; +bool g_isMoving = true; +bool g_runCPU = false; FILE *stream; -char g_ExecPath[300]; +char g_ExecPath[300]; // Set to 1 to run on
the CPU instead of the GPU for timing comparison. #define RUN_CPU 0 @@ -92,12 +90,12 @@ char g_ExecPath[300]; #define RUN_TIMING 0 // Random number macros -#define RANDOMSEED(seed) ((seed) = ((seed)*1103515245 + 12345)) +#define RANDOMSEED(seed) ((seed) = ((seed) * 1103515245 + 12345)) #define RANDOMBITS(seed, bits) ((unsigned int)RANDOMSEED(seed) >> (32 - (bits))) // OpenGL PBO and texture "names" -GLuint gl_PBO, gl_Tex, gl_Shader; -struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange +GLuint gl_PBO, gl_Tex, gl_Shader; +struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange // Source image on the host side uchar4 *h_Src = 0; @@ -112,13 +110,13 @@ int imageW = 800, imageH = 600; int crunch = 512; // Starting position and scale -double xOff = -0.5; -double yOff = 0.0; +double xOff = -0.5; +double yOff = 0.0; double scale = 3.2; // Starting stationary position and scale motion -double xdOff = 0.0; -double ydOff = 0.0; +double xdOff = 0.0; +double ydOff = 0.0; double dscale = 1.0; // Julia parameter @@ -131,41 +129,41 @@ int precisionMode = 0; // Starting animation frame and anti-aliasing pass int animationFrame = 0; -int animationStep = 0; -int pass = 0; +int animationStep = 0; +int pass = 0; // Starting color multipliers and random seed -int colorSeed = 0; +int colorSeed = 0; uchar4 colors; // Timer ID StopWatchInterface *hTimer = NULL; // User interface variables -int lastx = 0; -int lasty = 0; -bool leftClicked = false; +int lastx = 0; +int lasty = 0; +bool leftClicked = false; bool middleClicked = false; -bool rightClicked = false; +bool rightClicked = false; bool haveDoubles = true; -int numSMs = 0; // number of multiprocessors -int version = 1; // Compute Capability +int numSMs = 0; // number of multiprocessors +int version = 1; // Compute Capability // Auto-Verification Code -const int frameCheckNumber = 60; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 15; // FPS limit for sampling -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; +const int frameCheckNumber = 60; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 15; // FPS limit for sampling +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; const char *sSDKsample = "CUDA Mandelbrot/Julia Set"; -#define MAX_EPSILON 50 -#define REFRESH_DELAY 10 // ms +#define MAX_EPSILON 50 +#define REFRESH_DELAY 10 // ms #ifndef MAX #define MAX(a, b) ((a > b) ? a : b) @@ -176,1097 +174,1180 @@ const char *sSDKsample = "CUDA Mandelbrot/Julia Set"; // This is specifically to enable the application to enable/disable vsync typedef BOOL(WINAPI *PFNWGLSWAPINTERVALFARPROC)(int); -void setVSync(int interval) { - if (WGL_EXT_swap_control) { - wglSwapIntervalEXT = - (PFNWGLSWAPINTERVALFARPROC)wglGetProcAddress("wglSwapIntervalEXT"); - wglSwapIntervalEXT(interval); - } +void setVSync(int interval) +{ + if (WGL_EXT_swap_control) { + wglSwapIntervalEXT = (PFNWGLSWAPINTERVALFARPROC)wglGetProcAddress("wglSwapIntervalEXT"); + wglSwapIntervalEXT(interval); + } } #endif -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&hTimer) / 1000.f); - sprintf(fps, "<CUDA %s Set> %3.1f fps", - g_isJuliaSet ?
"Julia" : "Mandelbrot", ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&hTimer) / 1000.f); + sprintf(fps, " %3.1f fps", g_isJuliaSet ? "Julia" : "Mandelbrot", ifps); + glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = (int)MAX(1.f, (float)ifps); - sdkResetTimer(&hTimer); - } + fpsLimit = (int)MAX(1.f, (float)ifps); + sdkResetTimer(&hTimer); + } } -void startJulia(const char *path) { - g_isJuliaSet = true; - g_isMoving = false; +void startJulia(const char *path) +{ + g_isJuliaSet = true; + g_isMoving = false; - if ((path == NULL) || (stream = fopen(path, "r")) == NULL) { - printf( - "JuliaSet: params.txt could not be opened. Using default " - "parameters\n"); - xOff = -0.085760; - yOff = 0.007040; - scale = 3.200000; - xJParam = -0.172400; - yJParam = -0.652693; - } else { - fseek(stream, 0L, SEEK_SET); - fscanf(stream, "%lf %lf %lf %lf %lf", &xOff, &yOff, &scale, &xJParam, - &yJParam); - fclose(stream); - } + if ((path == NULL) || (stream = fopen(path, "r")) == NULL) { + printf("JuliaSet: params.txt could not be opened. Using default " + "parameters\n"); + xOff = -0.085760; + yOff = 0.007040; + scale = 3.200000; + xJParam = -0.172400; + yJParam = -0.652693; + } + else { + fseek(stream, 0L, SEEK_SET); + fscanf(stream, "%lf %lf %lf %lf %lf", &xOff, &yOff, &scale, &xJParam, &yJParam); + fclose(stream); + } - xdOff = 0.0; - ydOff = 0.0; - dscale = 1.0; - pass = 0; + xdOff = 0.0; + ydOff = 0.0; + dscale = 1.0; + pass = 0; } // Get a sub-pixel sample location -void GetSample(int sampleIndex, float &x, float &y) { - static const unsigned char pairData[128][2] = { - {64, 64}, {0, 0}, {1, 63}, {63, 1}, {96, 32}, {97, 95}, - {36, 96}, {30, 31}, {95, 127}, {4, 97}, {33, 62}, {62, 33}, - {31, 126}, {67, 99}, {99, 65}, {2, 34}, {81, 49}, {19, 80}, - {113, 17}, {112, 112}, {80, 16}, {115, 81}, {46, 15}, {82, 79}, - {48, 78}, {16, 14}, {49, 113}, {114, 48}, {45, 45}, {18, 47}, - {20, 109}, {79, 115}, {65, 82}, {52, 94}, {15, 124}, {94, 111}, - {61, 18}, {47, 30}, {83, 100}, {98, 50}, {110, 2}, {117, 98}, - {50, 59}, {77, 35}, {3, 114}, {5, 77}, {17, 66}, {32, 13}, - {127, 20}, {34, 76}, {35, 110}, {100, 12}, {116, 67}, {66, 46}, - {14, 28}, {23, 93}, {102, 83}, {86, 61}, {44, 125}, {76, 3}, - {109, 36}, {6, 51}, {75, 89}, {91, 21}, {60, 117}, {29, 43}, - {119, 29}, {74, 70}, {126, 87}, {93, 75}, {71, 24}, {106, 102}, - {108, 58}, {89, 9}, {103, 23}, {72, 56}, {120, 8}, {88, 40}, - {11, 88}, {104, 120}, {57, 105}, {118, 122}, {53, 6}, {125, 44}, - {43, 68}, {58, 73}, {24, 22}, {22, 5}, {40, 86}, {122, 108}, - {87, 90}, {56, 42}, {70, 121}, {8, 7}, {37, 52}, {25, 55}, - {69, 11}, {10, 106}, {12, 38}, {26, 69}, {27, 116}, {38, 25}, - {59, 54}, {107, 72}, {121, 57}, {39, 37}, {73, 107}, {85, 123}, - {28, 103}, {123, 74}, {55, 85}, {101, 41}, {42, 104}, {84, 27}, - {111, 91}, {9, 19}, {21, 39}, {90, 53}, {41, 60}, {54, 26}, - {92, 119}, {51, 71}, {124, 101}, {68, 92}, {78, 10}, {13, 118}, - {7, 84}, {105, 4}}; +void GetSample(int sampleIndex, float &x, float &y) +{ + static const unsigned char pairData[128][2] = { + {64, 64}, {0, 0}, {1, 63}, {63, 1}, {96, 32}, {97, 95}, {36, 96}, {30, 31}, {95, 127}, {4, 97}, + {33, 62}, {62, 33}, {31, 126}, {67, 99}, {99, 65}, {2, 34}, {81, 49}, {19, 80}, {113, 17}, {112, 112}, + {80, 16}, {115, 81}, {46, 15}, {82, 79}, {48, 78}, {16, 14}, {49, 113}, {114, 48}, {45, 45}, {18, 47}, + {20, 109}, {79, 115}, {65, 82}, {52, 94}, {15, 124}, {94, 111}, {61, 18}, {47, 30}, {83, 100}, 
{98, 50}, + {110, 2}, {117, 98}, {50, 59}, {77, 35}, {3, 114}, {5, 77}, {17, 66}, {32, 13}, {127, 20}, {34, 76}, + {35, 110}, {100, 12}, {116, 67}, {66, 46}, {14, 28}, {23, 93}, {102, 83}, {86, 61}, {44, 125}, {76, 3}, + {109, 36}, {6, 51}, {75, 89}, {91, 21}, {60, 117}, {29, 43}, {119, 29}, {74, 70}, {126, 87}, {93, 75}, + {71, 24}, {106, 102}, {108, 58}, {89, 9}, {103, 23}, {72, 56}, {120, 8}, {88, 40}, {11, 88}, {104, 120}, + {57, 105}, {118, 122}, {53, 6}, {125, 44}, {43, 68}, {58, 73}, {24, 22}, {22, 5}, {40, 86}, {122, 108}, + {87, 90}, {56, 42}, {70, 121}, {8, 7}, {37, 52}, {25, 55}, {69, 11}, {10, 106}, {12, 38}, {26, 69}, + {27, 116}, {38, 25}, {59, 54}, {107, 72}, {121, 57}, {39, 37}, {73, 107}, {85, 123}, {28, 103}, {123, 74}, + {55, 85}, {101, 41}, {42, 104}, {84, 27}, {111, 91}, {9, 19}, {21, 39}, {90, 53}, {41, 60}, {54, 26}, + {92, 119}, {51, 71}, {124, 101}, {68, 92}, {78, 10}, {13, 118}, {7, 84}, {105, 4}}; - x = (1.0f / 128.0f) * (0.5f + (float)pairData[sampleIndex][0]); - y = (1.0f / 128.0f) * (0.5f + (float)pairData[sampleIndex][1]); -} // GetSample + x = (1.0f / 128.0f) * (0.5f + (float)pairData[sampleIndex][0]); + y = (1.0f / 128.0f) * (0.5f + (float)pairData[sampleIndex][1]); +} // GetSample // render Mandelbrot image using CUDA or CPU -void renderImage(bool bUseOpenGL, bool fp64, int mode) { +void renderImage(bool bUseOpenGL, bool fp64, int mode) +{ #if RUN_TIMING - pass = 0; + pass = 0; #endif - if (pass < 128) { - if (g_runCPU) { - int startPass = pass; - float xs, ys; - sdkResetTimer(&hTimer); + if (pass < 128) { + if (g_runCPU) { + int startPass = pass; + float xs, ys; + sdkResetTimer(&hTimer); - if (bUseOpenGL) { - // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, - // gl_PBO)); - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_dst, &num_bytes, cuda_pbo_resource)); - } + if (bUseOpenGL) { + // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, + // gl_PBO)); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); + } - // Get the anti-alias sub-pixel sample location - GetSample(pass & 127, xs, ys); + // Get the anti-alias sub-pixel sample location + GetSample(pass & 127, xs, ys); - // Get the pixel scale and offset - double s = scale / (double)imageW; - double x = (xs - (double)imageW * 0.5f) * s + xOff; - double y = (ys - (double)imageH * 0.5f) * s + yOff; + // Get the pixel scale and offset + double s = scale / (double)imageW; + double x = (xs - (double)imageW * 0.5f) * s + xOff; + double y = (ys - (double)imageH * 0.5f) * s + yOff; - // Run the mandelbrot generator - // Use the adaptive sampling version when animating. 
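The per-pass sampling math above is easy to miss in the diff noise: GetSample() returns a sub-pixel jitter in units of 1/128th of a pixel, the host folds that jitter into the plane-space origin it hands the generator, and the per-pixel step is added afterwards. A minimal sketch of the combined mapping, under illustrative names (pixelSamplePoint is not part of the sample):

    // Sketch: plane-space point evaluated for pixel (ix, iy) on a pass whose
    // sub-pixel jitter is (xs, ys) in [0, 1). Mirrors the host-side
    //   s = scale / imageW;  x = (xs - imageW * 0.5) * s + xOff;
    // followed by the kernel-side  xPos = (T)ix * scale + xOff.
    static void pixelSamplePoint(int ix, int iy, float xs, float ys,
                                 int imageW, int imageH,
                                 double scale, double xOff, double yOff,
                                 double &xPos, double &yPos)
    {
        double s  = scale / (double)imageW;                  // plane units per pixel
        double x0 = (xs - (double)imageW * 0.5) * s + xOff;  // jittered origin for this pass
        double y0 = (ys - (double)imageH * 0.5) * s + yOff;
        xPos = (double)ix * s + x0;
        yPos = (double)iy * s + y0;
    }

Because each of the up-to-128 passes uses a different jitter pair, averaging the passes converges toward a box-filtered, anti-aliased image without ever storing more than one accumulation buffer.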
- if (pass && !startPass) { - if (precisionMode) - RunMandelbrotDSGold1(h_Src, imageW, imageH, crunch, x, y, xJParam, - yJParam, s, colors, pass++, animationFrame, - g_isJuliaSet); - else - RunMandelbrotGold1(h_Src, imageW, imageH, crunch, (float)x, (float)y, - (float)xJParam, (float)yJParam, (float)s, colors, - pass++, animationFrame, g_isJuliaSet); - } else { - if (precisionMode) - RunMandelbrotDSGold0(h_Src, imageW, imageH, crunch, x, y, xJParam, - yJParam, s, colors, pass++, animationFrame, - g_isJuliaSet); - else - RunMandelbrotGold0(h_Src, imageW, imageH, crunch, (float)x, (float)y, - (float)xJParam, (float)yJParam, (float)s, colors, - pass++, animationFrame, g_isJuliaSet); - } + // Run the mandelbrot generator + // Use the adaptive sampling version when animating. + if (pass && !startPass) { + if (precisionMode) + RunMandelbrotDSGold1(h_Src, + imageW, + imageH, + crunch, + x, + y, + xJParam, + yJParam, + s, + colors, + pass++, + animationFrame, + g_isJuliaSet); + else + RunMandelbrotGold1(h_Src, + imageW, + imageH, + crunch, + (float)x, + (float)y, + (float)xJParam, + (float)yJParam, + (float)s, + colors, + pass++, + animationFrame, + g_isJuliaSet); + } + else { + if (precisionMode) + RunMandelbrotDSGold0(h_Src, + imageW, + imageH, + crunch, + x, + y, + xJParam, + yJParam, + s, + colors, + pass++, + animationFrame, + g_isJuliaSet); + else + RunMandelbrotGold0(h_Src, + imageW, + imageH, + crunch, + (float)x, + (float)y, + (float)xJParam, + (float)yJParam, + (float)s, + colors, + pass++, + animationFrame, + g_isJuliaSet); + } - checkCudaErrors(cudaMemcpy(d_dst, h_Src, imageW * imageH * sizeof(uchar4), - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_dst, h_Src, imageW * imageH * sizeof(uchar4), cudaMemcpyHostToDevice)); - if (bUseOpenGL) { - // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - } + if (bUseOpenGL) { + // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + } #if RUN_TIMING - printf("CPU = %5.8f\n", 0.001f * sdkGetTimerValue(&hTimer)); + printf("CPU = %5.8f\n", 0.001f * sdkGetTimerValue(&hTimer)); #endif - } else { // this is the GPU Path - float timeEstimate; - int startPass = pass; - sdkResetTimer(&hTimer); + } + else { // this is the GPU Path + float timeEstimate; + int startPass = pass; + sdkResetTimer(&hTimer); - if (bUseOpenGL) { - // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, - // gl_PBO)); - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_dst, &num_bytes, cuda_pbo_resource)); - } + if (bUseOpenGL) { + // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, + // gl_PBO)); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); + } - // Render anti-aliasing passes until we run out time (60fps approximately) - do { - float xs, ys; + // Render anti-aliasing passes until we run out time (60fps approximately) + do { + float xs, ys; - // Get the anti-alias sub-pixel sample location - GetSample(pass & 127, xs, ys); + // Get the anti-alias sub-pixel sample location + GetSample(pass & 127, xs, ys); - // Get the pixel scale and offset - double s = scale / (float)imageW; - double x = (xs - (double)imageW * 0.5f) * 
s + xOff; - double y = (ys - (double)imageH * 0.5f) * s + yOff; + // Get the pixel scale and offset + double s = scale / (float)imageW; + double x = (xs - (double)imageW * 0.5f) * s + xOff; + double y = (ys - (double)imageH * 0.5f) * s + yOff; - // Run the mandelbrot generator - // Use the adaptive sampling version when animating. - if (pass && !startPass) - RunMandelbrot1(d_dst, imageW, imageH, crunch, x, y, xJParam, yJParam, - s, colors, pass++, animationFrame, precisionMode, - numSMs, g_isJuliaSet, version); - else - RunMandelbrot0(d_dst, imageW, imageH, crunch, x, y, xJParam, yJParam, - s, colors, pass++, animationFrame, precisionMode, - numSMs, g_isJuliaSet, version); + // Run the mandelbrot generator + // Use the adaptive sampling version when animating. + if (pass && !startPass) + RunMandelbrot1(d_dst, + imageW, + imageH, + crunch, + x, + y, + xJParam, + yJParam, + s, + colors, + pass++, + animationFrame, + precisionMode, + numSMs, + g_isJuliaSet, + version); + else + RunMandelbrot0(d_dst, + imageW, + imageH, + crunch, + x, + y, + xJParam, + yJParam, + s, + colors, + pass++, + animationFrame, + precisionMode, + numSMs, + g_isJuliaSet, + version); - // Estimate the total time of the frame if one more pass is rendered - timeEstimate = - 0.1f * sdkGetTimerValue(&hTimer) * - ((float)(pass + 1 - startPass) / (float)(pass - startPass)); - } while ((pass < 128) && (timeEstimate < 1.0f / 60.0f) && !RUN_TIMING); + // Estimate the total time of the frame if one more pass is rendered + timeEstimate = + 0.1f * sdkGetTimerValue(&hTimer) * ((float)(pass + 1 - startPass) / (float)(pass - startPass)); + } while ((pass < 128) && (timeEstimate < 1.0f / 60.0f) && !RUN_TIMING); - if (bUseOpenGL) { - // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - } + if (bUseOpenGL) { + // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + } #if RUN_TIMING printf("GPU = %5.8f\n", 0.001f * sdkGetTimerValue(&hTimer); #endif + } } - } } // OpenGL display function -void displayFunc(void) { - sdkStartTimer(&hTimer); +void displayFunc(void) +{ + sdkStartTimer(&hTimer); - if ((xdOff != 0.0) || (ydOff != 0.0)) { - if (g_isMoving || !g_isJuliaSet) { - xOff += xdOff; - yOff += ydOff; - } else { - xJParam += xdOff; - yJParam += ydOff; + if ((xdOff != 0.0) || (ydOff != 0.0)) { + if (g_isMoving || !g_isJuliaSet) { + xOff += xdOff; + yOff += ydOff; + } + else { + xJParam += xdOff; + yJParam += ydOff; + } + + pass = 0; } - pass = 0; - } + if (dscale != 1.0) { + scale *= dscale; + pass = 0; + } - if (dscale != 1.0) { - scale *= dscale; - pass = 0; - } + if (animationStep) { + animationFrame -= animationStep; + pass = 0; + } - if (animationStep) { - animationFrame -= animationStep; - pass = 0; - } + // render the Mandelbrot image + renderImage(true, g_isJuliaSet, precisionMode); - // render the Mandelbrot image - renderImage(true, g_isJuliaSet, precisionMode); + // load texture from PBO + // glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, gl_PBO); + glBindTexture(GL_TEXTURE_2D, gl_Tex); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0)); + // glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // load texture from PBO - // glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, gl_PBO); - glBindTexture(GL_TEXTURE_2D, gl_Tex); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, - GL_UNSIGNED_BYTE, BUFFER_DATA(0)); - // 
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // fragment program is required to display floating point texture + glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, gl_Shader); + glEnable(GL_FRAGMENT_PROGRAM_ARB); + glDisable(GL_DEPTH_TEST); - // fragment program is required to display floating point texture - glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, gl_Shader); - glEnable(GL_FRAGMENT_PROGRAM_ARB); - glDisable(GL_DEPTH_TEST); + glBegin(GL_QUADS); + glTexCoord2f(0.0f, 0.0f); + glVertex2f(0.0f, 0.0f); + glTexCoord2f(1.0f, 0.0f); + glVertex2f(1.0f, 0.0f); + glTexCoord2f(1.0f, 1.0f); + glVertex2f(1.0f, 1.0f); + glTexCoord2f(0.0f, 1.0f); + glVertex2f(0.0f, 1.0f); + glEnd(); - glBegin(GL_QUADS); - glTexCoord2f(0.0f, 0.0f); - glVertex2f(0.0f, 0.0f); - glTexCoord2f(1.0f, 0.0f); - glVertex2f(1.0f, 0.0f); - glTexCoord2f(1.0f, 1.0f); - glVertex2f(1.0f, 1.0f); - glTexCoord2f(0.0f, 1.0f); - glVertex2f(0.0f, 1.0f); - glEnd(); + glBindTexture(GL_TEXTURE_2D, 0); + glDisable(GL_FRAGMENT_PROGRAM_ARB); - glBindTexture(GL_TEXTURE_2D, 0); - glDisable(GL_FRAGMENT_PROGRAM_ARB); + sdkStopTimer(&hTimer); + glutSwapBuffers(); - sdkStopTimer(&hTimer); - glutSwapBuffers(); + computeFPS(); +} // displayFunc - computeFPS(); -} // displayFunc +void cleanup() +{ + if (h_Src) { + free(h_Src); + h_Src = 0; + } -void cleanup() { - if (h_Src) { - free(h_Src); - h_Src = 0; - } + sdkStopTimer(&hTimer); + sdkDeleteTimer(&hTimer); - sdkStopTimer(&hTimer); - sdkDeleteTimer(&hTimer); + // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(gl_PBO)); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(gl_PBO)); - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - - glDeleteBuffers(1, &gl_PBO); - glDeleteTextures(1, &gl_Tex); - glDeleteProgramsARB(1, &gl_Shader); + glDeleteBuffers(1, &gl_PBO); + glDeleteTextures(1, &gl_Tex); + glDeleteProgramsARB(1, &gl_Shader); } void initMenus(); // OpenGL keyboard function -void keyboardFunc(unsigned char k, int, int) { - int seed; +void keyboardFunc(unsigned char k, int, int) +{ + int seed; - switch (k) { + switch (k) { case '\033': case 'q': case 'Q': - printf("Shutting down...\n"); + printf("Shutting down...\n"); #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case '?': - printf("xOff = %5.8f\n", xOff); - printf("yOff = %5.8f\n", yOff); - printf("scale = %e\n", scale); - printf("detail = %d\n", crunch); - printf("color = %d\n", colorSeed); - printf("xJParam = %5.8f\n", xJParam); - printf("yJParam = %5.8f\n", yJParam); - printf("\n"); - break; + printf("xOff = %5.8f\n", xOff); + printf("yOff = %5.8f\n", yOff); + printf("scale = %e\n", scale); + printf("detail = %d\n", crunch); + printf("color = %d\n", colorSeed); + printf("xJParam = %5.8f\n", xJParam); + printf("yJParam = %5.8f\n", yJParam); + printf("\n"); + break; case 'e': case 'E': - // Reset all values to their defaults - g_isJuliaSet = false; - g_isMoving = true; - g_runCPU = false; - printf( - "All parameters are reset to defaults. 
GPU implementation is " - "used.\n"); - xOff = -0.5; - yOff = 0.0; - scale = 3.2; - xdOff = 0.0; - ydOff = 0.0; - dscale = 1.0; - colorSeed = 0; - colors.x = 3; - colors.y = 5; - colors.z = 7; - crunch = 512; - animationFrame = 0; - animationStep = 0; - xJParam = 0.0; - yJParam = 0.0; - pass = 0; - break; + // Reset all values to their defaults + g_isJuliaSet = false; + g_isMoving = true; + g_runCPU = false; + printf("All parameters are reset to defaults. GPU implementation is " + "used.\n"); + xOff = -0.5; + yOff = 0.0; + scale = 3.2; + xdOff = 0.0; + ydOff = 0.0; + dscale = 1.0; + colorSeed = 0; + colors.x = 3; + colors.y = 5; + colors.z = 7; + crunch = 512; + animationFrame = 0; + animationStep = 0; + xJParam = 0.0; + yJParam = 0.0; + pass = 0; + break; case 'c': - seed = ++colorSeed; + seed = ++colorSeed; - if (seed) { - colors.x = RANDOMBITS(seed, 4); - colors.y = RANDOMBITS(seed, 4); - colors.z = RANDOMBITS(seed, 4); - } else { - colors.x = 3; - colors.y = 5; - colors.z = 7; - } + if (seed) { + colors.x = RANDOMBITS(seed, 4); + colors.y = RANDOMBITS(seed, 4); + colors.z = RANDOMBITS(seed, 4); + } + else { + colors.x = 3; + colors.y = 5; + colors.z = 7; + } - pass = 0; - break; + pass = 0; + break; case 'C': - seed = --colorSeed; + seed = --colorSeed; - if (seed) { - colors.x = RANDOMBITS(seed, 4); - colors.y = RANDOMBITS(seed, 4); - colors.z = RANDOMBITS(seed, 4); - } else { - colors.x = 3; - colors.y = 5; - colors.z = 7; - } + if (seed) { + colors.x = RANDOMBITS(seed, 4); + colors.y = RANDOMBITS(seed, 4); + colors.z = RANDOMBITS(seed, 4); + } + else { + colors.x = 3; + colors.y = 5; + colors.z = 7; + } - pass = 0; - break; + pass = 0; + break; case 'a': - if (animationStep < 0) { - animationStep = 0; - } else { - animationStep++; - - if (animationStep > 8) { - animationStep = 8; + if (animationStep < 0) { + animationStep = 0; } - } + else { + animationStep++; - break; + if (animationStep > 8) { + animationStep = 8; + } + } + + break; case 'A': - if (animationStep > 0) { - animationStep = 0; - } else { - animationStep--; - - if (animationStep < -8) { - animationStep = -8; + if (animationStep > 0) { + animationStep = 0; } - } + else { + animationStep--; - break; + if (animationStep < -8) { + animationStep = -8; + } + } + + break; case 'd': - if (2 * crunch <= MIN(numSMs * (version < 20 ? 512 : 2048), 0x4000)) { - crunch *= 2; - pass = 0; - } + if (2 * crunch <= MIN(numSMs * (version < 20 ? 
512 : 2048), 0x4000)) { + crunch *= 2; + pass = 0; + } - printf("detail = %d\n", crunch); - break; + printf("detail = %d\n", crunch); + break; case 'D': - if (crunch > 2) { - crunch /= 2; - pass = 0; - } + if (crunch > 2) { + crunch /= 2; + pass = 0; + } - printf("detail = %d\n", crunch); - break; + printf("detail = %d\n", crunch); + break; case 'r': - colors.x -= 1; - pass = 0; - break; + colors.x -= 1; + pass = 0; + break; case 'R': - colors.x += 1; - pass = 0; - break; + colors.x += 1; + pass = 0; + break; case 'g': - colors.y -= 1; - pass = 0; - break; + colors.y -= 1; + pass = 0; + break; case 'G': - colors.y += 1; - pass = 0; - break; + colors.y += 1; + pass = 0; + break; case 'b': - colors.z -= 1; - pass = 0; - break; + colors.z -= 1; + pass = 0; + break; case 'B': - colors.z += 1; - pass = 0; - break; + colors.z += 1; + pass = 0; + break; case 's': case 'S': - if (g_runCPU) { - g_runCPU = false; - printf("GPU implementation\n"); - } else { - g_runCPU = true; - printf("CPU implementation\n"); - } + if (g_runCPU) { + g_runCPU = false; + printf("GPU implementation\n"); + } + else { + g_runCPU = true; + printf("CPU implementation\n"); + } - pass = 0; - glutDestroyMenu(glutGetMenu()); - initMenus(); - break; + pass = 0; + glutDestroyMenu(glutGetMenu()); + initMenus(); + break; case 'j': case 'J': - // toggle between Mandelbrot and Julia sets and reset all parameters - if (!g_isJuliaSet) { // settings for Julia - g_isJuliaSet = true; - startJulia("params.txt"); - } else { // settings for Mandelbrot - g_isJuliaSet = false; - g_isMoving = true; - xOff = -0.5; - yOff = 0.0; - scale = 3.2; - xdOff = 0.0; - ydOff = 0.0; - dscale = 1.0; - colorSeed = 0; - colors.x = 3; - colors.y = 5; - colors.z = 7; - crunch = 512; - animationFrame = 0; - animationStep = 0; - pass = 0; - } + // toggle between Mandelbrot and Julia sets and reset all parameters + if (!g_isJuliaSet) { // settings for Julia + g_isJuliaSet = true; + startJulia("params.txt"); + } + else { // settings for Mandelbrot + g_isJuliaSet = false; + g_isMoving = true; + xOff = -0.5; + yOff = 0.0; + scale = 3.2; + xdOff = 0.0; + ydOff = 0.0; + dscale = 1.0; + colorSeed = 0; + colors.x = 3; + colors.y = 5; + colors.z = 7; + crunch = 512; + animationFrame = 0; + animationStep = 0; + pass = 0; + } - char fps[30]; - sprintf(fps, "<CUDA %s Set>", g_isJuliaSet ? "Julia" : "Mandelbrot"); - glutSetWindowTitle(fps); + char fps[30]; + sprintf(fps, "<CUDA %s Set>", g_isJuliaSet ? 
"Julia" : "Mandelbrot"); + glutSetWindowTitle(fps); - break; + break; case 'm': case 'M': - if (g_isJuliaSet) { - g_isMoving = !g_isMoving; - pass = 0; - } + if (g_isJuliaSet) { + g_isMoving = !g_isMoving; + pass = 0; + } - break; + break; case 'p': case 'P': #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - if (fopen_s(&stream, "params.txt", "w") != 0) + if (fopen_s(&stream, "params.txt", "w") != 0) #else - if ((stream = fopen("params.txt", "w")) == NULL) + if ((stream = fopen("params.txt", "w")) == NULL) #endif - { - printf("The file params.txt was not opened\n"); - break; - } + { + printf("The file params.txt was not opened\n"); + break; + } - fprintf(stream, "%f %f %f %f %f\n", xOff, yOff, scale, xJParam, yJParam); - fclose(stream); - break; + fprintf(stream, "%f %f %f %f %f\n", xOff, yOff, scale, xJParam, yJParam); + fclose(stream); + break; case 'o': case 'O': #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - if (fopen_s(&stream, "params.txt", "r") != 0) + if (fopen_s(&stream, "params.txt", "r") != 0) #else - if ((stream = fopen("params.txt", "r")) == NULL) + if ((stream = fopen("params.txt", "r")) == NULL) #endif - { - printf("The file params.txt was not opened\n"); + { + printf("The file params.txt was not opened\n"); + break; + } + + fseek(stream, 0L, SEEK_SET); + fscanf(stream, "%lf %lf %lf %lf %lf", &xOff, &yOff, &scale, &xJParam, &yJParam); + xdOff = 0.0; + ydOff = 0.0; + dscale = 1.0; + fclose(stream); + pass = 0; break; - } - fseek(stream, 0L, SEEK_SET); - fscanf(stream, "%lf %lf %lf %lf %lf", &xOff, &yOff, &scale, &xJParam, - &yJParam); - xdOff = 0.0; - ydOff = 0.0; - dscale = 1.0; - fclose(stream); - pass = 0; - break; + case '4': // Left arrow key + xOff -= 0.05f * scale; + pass = 0; + break; - case '4': // Left arrow key - xOff -= 0.05f * scale; - pass = 0; - break; + case '8': // Up arrow key + yOff += 0.05f * scale; + pass = 0; + break; - case '8': // Up arrow key - yOff += 0.05f * scale; - pass = 0; - break; + case '6': // Right arrow key + xOff += 0.05f * scale; + pass = 0; + break; - case '6': // Right arrow key - xOff += 0.05f * scale; - pass = 0; - break; - - case '2': // Down arrow key - yOff -= 0.05f * scale; - pass = 0; - break; + case '2': // Down arrow key + yOff -= 0.05f * scale; + pass = 0; + break; case '+': - scale /= 1.1f; - pass = 0; - break; + scale /= 1.1f; + pass = 0; + break; case '-': - scale *= 1.1f; - pass = 0; - break; + scale *= 1.1f; + pass = 0; + break; default: - break; - } + break; + } -} // keyboardFunc +} // keyboardFunc // OpenGL mouse click function -void clickFunc(int button, int state, int x, int y) { - if (button == 0) { - leftClicked = !leftClicked; - } +void clickFunc(int button, int state, int x, int y) +{ + if (button == 0) { + leftClicked = !leftClicked; + } - if (button == 1) { - middleClicked = !middleClicked; - } + if (button == 1) { + middleClicked = !middleClicked; + } - if (button == 2) { - rightClicked = !rightClicked; - } + if (button == 2) { + rightClicked = !rightClicked; + } - int modifiers = glutGetModifiers(); + int modifiers = glutGetModifiers(); - if (leftClicked && (modifiers & GLUT_ACTIVE_SHIFT)) { - leftClicked = 0; - middleClicked = 1; - } + if (leftClicked && (modifiers & GLUT_ACTIVE_SHIFT)) { + leftClicked = 0; + middleClicked = 1; + } - if (state == GLUT_UP) { - leftClicked = 0; - middleClicked = 0; - } + if (state == GLUT_UP) { + leftClicked = 0; + middleClicked = 0; + } - lastx = x; - lasty = y; - xdOff = 0.0; - ydOff = 0.0; - dscale = 1.0; -} // 
clickFunc + lastx = x; + lasty = y; + xdOff = 0.0; + ydOff = 0.0; + dscale = 1.0; +} // clickFunc // OpenGL mouse motion function -void motionFunc(int x, int y) { - double fx = (double)(x - lastx) / 50.0 / (double)(imageW); - double fy = (double)(lasty - y) / 50.0 / (double)(imageH); +void motionFunc(int x, int y) +{ + double fx = (double)(x - lastx) / 50.0 / (double)(imageW); + double fy = (double)(lasty - y) / 50.0 / (double)(imageH); - if (leftClicked) { - xdOff = fx * scale; - ydOff = fy * scale; - } else { - xdOff = 0.0f; - ydOff = 0.0f; - } - - if (middleClicked) - if (fy > 0.0f) { - dscale = 1.0 - fy; - dscale = dscale < 1.05 ? dscale : 1.05; - } else { - dscale = 1.0 / (1.0 + fy); - dscale = dscale > (1.0 / 1.05) ? dscale : (1.0 / 1.05); + if (leftClicked) { + xdOff = fx * scale; + ydOff = fy * scale; + } + else { + xdOff = 0.0f; + ydOff = 0.0f; } - else { - dscale = 1.0; - } -} // motionFunc -void timerEvent(int value) { - if (glutGetWindow()) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } + if (middleClicked) + if (fy > 0.0f) { + dscale = 1.0 - fy; + dscale = dscale < 1.05 ? dscale : 1.05; + } + else { + dscale = 1.0 / (1.0 + fy); + dscale = dscale > (1.0 / 1.05) ? dscale : (1.0 / 1.05); + } + else { + dscale = 1.0; + } +} // motionFunc + +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } } -void mainMenu(int i) { - precisionMode = i; - pass = 0; +void mainMenu(int i) +{ + precisionMode = i; + pass = 0; } -void initMenus() { - glutCreateMenu(mainMenu); +void initMenus() +{ + glutCreateMenu(mainMenu); - if (!g_runCPU) { - glutAddMenuEntry("Hardware single precision", 0); + if (!g_runCPU) { + glutAddMenuEntry("Hardware single precision", 0); - if (numSMs > 2) { - glutAddMenuEntry("Emulated double-single precision", 1); + if (numSMs > 2) { + glutAddMenuEntry("Emulated double-single precision", 1); + } + + if (haveDoubles) { + glutAddMenuEntry("Hardware double precision", 2); + } + } + else { + glutAddMenuEntry("Software single precision", 0); + glutAddMenuEntry("Software double precision", 1); } - if (haveDoubles) { - glutAddMenuEntry("Hardware double precision", 2); - } - } else { - glutAddMenuEntry("Software single precision", 0); - glutAddMenuEntry("Software double precision", 1); - } - - glutAttachMenu(GLUT_RIGHT_BUTTON); + glutAttachMenu(GLUT_RIGHT_BUTTON); } // gl_Shader for displaying floating-point texture -static const char *shader_code = - "!!ARBfp1.0\n" - "TEX result.color, fragment.texcoord, texture[0], 2D; \n" - "END"; +static const char *shader_code = "!!ARBfp1.0\n" + "TEX result.color, fragment.texcoord, texture[0], 2D; \n" + "END"; -GLuint compileASMShader(GLenum program_type, const char *code) { - GLuint program_id; - glGenProgramsARB(1, &program_id); - glBindProgramARB(program_type, program_id); - glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, - (GLsizei)strlen(code), (GLubyte *)code); +GLuint compileASMShader(GLenum program_type, const char *code) +{ + GLuint program_id; + glGenProgramsARB(1, &program_id); + glBindProgramARB(program_type, program_id); + glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); - GLint error_pos; - glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); + GLint error_pos; + glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); - if (error_pos != -1) { - const GLubyte *error_string; - error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); - fprintf(stderr, 
"Program error at position: %d\n%s\n", (int)error_pos, - error_string); - return 0; - } + if (error_pos != -1) { + const GLubyte *error_string; + error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); + fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string); + return 0; + } - return program_id; + return program_id; } -void initOpenGLBuffers(int w, int h) { - // delete old buffers - if (h_Src) { - free(h_Src); - h_Src = 0; - } +void initOpenGLBuffers(int w, int h) +{ + // delete old buffers + if (h_Src) { + free(h_Src); + h_Src = 0; + } - if (gl_Tex) { - glDeleteTextures(1, &gl_Tex); - gl_Tex = 0; - } + if (gl_Tex) { + glDeleteTextures(1, &gl_Tex); + gl_Tex = 0; + } - if (gl_PBO) { - // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(gl_PBO)); - cudaGraphicsUnregisterResource(cuda_pbo_resource); - glDeleteBuffers(1, &gl_PBO); - gl_PBO = 0; - } + if (gl_PBO) { + // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(gl_PBO)); + cudaGraphicsUnregisterResource(cuda_pbo_resource); + glDeleteBuffers(1, &gl_PBO); + gl_PBO = 0; + } - // allocate new buffers - h_Src = (uchar4 *)malloc(w * h * 4); + // allocate new buffers + h_Src = (uchar4 *)malloc(w * h * 4); - printf("Creating GL texture...\n"); - glEnable(GL_TEXTURE_2D); - glGenTextures(1, &gl_Tex); - glBindTexture(GL_TEXTURE_2D, gl_Tex); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, - h_Src); - printf("Texture created.\n"); + printf("Creating GL texture...\n"); + glEnable(GL_TEXTURE_2D); + glGenTextures(1, &gl_Tex); + glBindTexture(GL_TEXTURE_2D, gl_Tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, h_Src); + printf("Texture created.\n"); - printf("Creating PBO...\n"); - glGenBuffers(1, &gl_PBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, gl_PBO); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, w * h * 4, h_Src, GL_STREAM_COPY); - // While a PBO is registered to CUDA, it can't be used - // as the destination for OpenGL drawing calls. - // But in our particular case OpenGL is only used - // to display the content of the PBO, specified by CUDA kernels, - // so we need to register/unregister it only once. + printf("Creating PBO...\n"); + glGenBuffers(1, &gl_PBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, gl_PBO); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, w * h * 4, h_Src, GL_STREAM_COPY); + // While a PBO is registered to CUDA, it can't be used + // as the destination for OpenGL drawing calls. + // But in our particular case OpenGL is only used + // to display the content of the PBO, specified by CUDA kernels, + // so we need to register/unregister it only once. 
- // DEPRECATED: checkCudaErrors( cudaGLRegisterBufferObject(gl_PBO) ); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, gl_PBO, cudaGraphicsMapFlagsWriteDiscard)); - printf("PBO created.\n"); + // DEPRECATED: checkCudaErrors( cudaGLRegisterBufferObject(gl_PBO) ); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, gl_PBO, cudaGraphicsMapFlagsWriteDiscard)); + printf("PBO created.\n"); - // load shader program - gl_Shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); + // load shader program + gl_Shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); } -void reshapeFunc(int w, int h) { - glViewport(0, 0, w, h); +void reshapeFunc(int w, int h) +{ + glViewport(0, 0, w, h); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); - if (w != 0 && h != 0) // Do not call when window is minimized that is when - // width && height == 0 - initOpenGLBuffers(w, h); + if (w != 0 && h != 0) // Do not call when window is minimized that is when + // width && height == 0 + initOpenGLBuffers(w, h); - imageW = w; - imageH = h; - pass = 0; + imageW = w; + imageH = h; + pass = 0; - glutPostRedisplay(); + glutPostRedisplay(); } -void initGL(int *argc, char **argv) { - printf("Initializing GLUT...\n"); - glutInit(argc, argv); +void initGL(int *argc, char **argv) +{ + printf("Initializing GLUT...\n"); + glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(imageW, imageH); - glutInitWindowPosition(0, 0); - glutCreateWindow(argv[0]); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(imageW, imageH); + glutInitWindowPosition(0, 0); + glutCreateWindow(argv[0]); - glutDisplayFunc(displayFunc); - glutKeyboardFunc(keyboardFunc); - glutMouseFunc(clickFunc); - glutMotionFunc(motionFunc); - glutReshapeFunc(reshapeFunc); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - initMenus(); + glutDisplayFunc(displayFunc); + glutKeyboardFunc(keyboardFunc); + glutMouseFunc(clickFunc); + glutMotionFunc(motionFunc); + glutReshapeFunc(reshapeFunc); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + initMenus(); - if (!isGLVersionSupported(1, 5) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); - fprintf(stderr, "This sample requires:\n"); - fprintf(stderr, " OpenGL version 1.5\n"); - fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); - fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); - exit(EXIT_SUCCESS); - } + if (!isGLVersionSupported(1, 5) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); + fprintf(stderr, "This sample requires:\n"); + fprintf(stderr, " OpenGL version 1.5\n"); + fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); + fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); + exit(EXIT_SUCCESS); + } - printf("OpenGL window created.\n"); + printf("OpenGL window created.\n"); } -void initData(int argc, char **argv) { - // check for hardware double precision support - int dev = 0; - dev = findCudaDevice(argc, (const char **)argv); +void initData(int argc, char **argv) +{ + // check for hardware double precision support + int dev = 0; + dev = findCudaDevice(argc, 
(const char **)argv); - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - version = deviceProp.major * 10 + deviceProp.minor; + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + version = deviceProp.major * 10 + deviceProp.minor; - numSMs = deviceProp.multiProcessorCount; + numSMs = deviceProp.multiProcessorCount; - // initialize some of the arguments - if (checkCmdLineFlag(argc, (const char **)argv, "xOff")) { - xOff = getCmdLineArgumentFloat(argc, (const char **)argv, "xOff"); - } + // initialize some of the arguments + if (checkCmdLineFlag(argc, (const char **)argv, "xOff")) { + xOff = getCmdLineArgumentFloat(argc, (const char **)argv, "xOff"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "yOff")) { - yOff = getCmdLineArgumentFloat(argc, (const char **)argv, "yOff"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "yOff")) { + yOff = getCmdLineArgumentFloat(argc, (const char **)argv, "yOff"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "scale")) { - scale = getCmdLineArgumentFloat(argc, (const char **)argv, "xOff"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "scale")) { + scale = getCmdLineArgumentFloat(argc, (const char **)argv, "xOff"); + } - colors.w = 0; - colors.x = 3; - colors.y = 5; - colors.z = 7; - printf("Data initialization done.\n"); + colors.w = 0; + colors.x = 3; + colors.y = 5; + colors.z = 7; + printf("Data initialization done.\n"); } //////////////////////////////////////////////////////////////////////////////// // runAutoTest validates the Mandelbrot and Julia sets without using OpenGL //////////////////////////////////////////////////////////////////////////////// -int runSingleTest(int argc, char **argv) { - char dump_file[256], *ref_file = NULL; - bool haveDouble = false; +int runSingleTest(int argc, char **argv) +{ + char dump_file[256], *ref_file = NULL; + bool haveDouble = false; - printf("* Running Automatic Test: <%s>\n", sSDKsample); + printf("* Running Automatic Test: <%s>\n", sSDKsample); - strcpy(dump_file, (const char *)"rendered_image.ppm"); - // We've already determined that file has been passed in as input, we can grab - // the file here - getCmdLineArgumentString(argc, (const char **)argv, "file", - (char **)&ref_file); + strcpy(dump_file, (const char *)"rendered_image.ppm"); + // We've already determined that file has been passed in as input, we can grab + // the file here + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); - if (checkCmdLineFlag(argc, (const char **)argv, "fp64")) { - haveDouble = true; - } - - // initialize Data for CUDA - initData(argc, argv); - - // Allocate memory for renderImage (to be able to render into a CUDA memory - // buffer) - checkCudaErrors( - cudaMalloc((void **)&d_dst, (imageW * imageH * sizeof(uchar4)))); - - // Allocate memory for cpu buffer - unsigned char *h_dst = - (unsigned char *)malloc(sizeof(uchar4) * imageH * imageW); - - if (g_isJuliaSet) { - char *ref_path = sdkFindFilePath("params.txt", argv[0]); - startJulia(ref_path); - - for (int i = 0; i < 50; i++) { - renderImage(false, haveDouble, 0); + if (checkCmdLineFlag(argc, (const char **)argv, "fp64")) { + haveDouble = true; } - checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW * imageH * sizeof(uchar4), - cudaMemcpyDeviceToHost)); - sdkSavePPM4ub(dump_file, h_dst, imageW, imageH); - } else { - // Mandelbrot Set - for (int i = 0; i < 50; i++) { - renderImage(false, haveDouble, 0); + // initialize Data for CUDA + 
initData(argc, argv); + + // Allocate memory for renderImage (to be able to render into a CUDA memory + // buffer) + checkCudaErrors(cudaMalloc((void **)&d_dst, (imageW * imageH * sizeof(uchar4)))); + + // Allocate memory for cpu buffer + unsigned char *h_dst = (unsigned char *)malloc(sizeof(uchar4) * imageH * imageW); + + if (g_isJuliaSet) { + char *ref_path = sdkFindFilePath("params.txt", argv[0]); + startJulia(ref_path); + + for (int i = 0; i < 50; i++) { + renderImage(false, haveDouble, 0); + } + + checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW * imageH * sizeof(uchar4), cudaMemcpyDeviceToHost)); + sdkSavePPM4ub(dump_file, h_dst, imageW, imageH); + } + else { + // Mandelbrot Set + for (int i = 0; i < 50; i++) { + renderImage(false, haveDouble, 0); + } + + checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW * imageH * sizeof(uchar4), cudaMemcpyDeviceToHost)); + sdkSavePPM4ub(dump_file, h_dst, imageW, imageH); } - checkCudaErrors(cudaMemcpy(h_dst, d_dst, imageW * imageH * sizeof(uchar4), - cudaMemcpyDeviceToHost)); - sdkSavePPM4ub(dump_file, h_dst, imageW, imageH); - } + printf("\n[%s], %s Set, %s -> Saved File\n", + dump_file, + (g_isJuliaSet ? "Julia" : "Mandelbrot"), + (haveDouble ? "(fp64 double precision)" : "(fp32 single precision)")); - printf("\n[%s], %s Set, %s -> Saved File\n", dump_file, - (g_isJuliaSet ? "Julia" : "Mandelbrot"), - (haveDouble ? "(fp64 double precision)" : "(fp32 single precision)")); + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { + printf("Images \"%s\", \"%s\" are different\n", ref_file, dump_file); + g_TotalErrors++; + } + else { + printf("Images \"%s\", \"%s\" are matching\n", ref_file, dump_file); + } - if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, argv[0]), - MAX_EPSILON_ERROR, 0.15f, false)) { - printf("Images \"%s\", \"%s\" are different\n", ref_file, dump_file); - g_TotalErrors++; - } else { - printf("Images \"%s\", \"%s\" are matching\n", ref_file, dump_file); - } + checkCudaErrors(cudaFree(d_dst)); + free(h_dst); - checkCudaErrors(cudaFree(d_dst)); - free(h_dst); - - return true; + return true; } // Performance Test -void runBenchmark(int argc, char **argv) { - int N = 1000; - // initialize Data for CUDA - initData(argc, argv); +void runBenchmark(int argc, char **argv) +{ + int N = 1000; + // initialize Data for CUDA + initData(argc, argv); - printf("\n* Run Performance Test\n"); - printf("Image Size %d x %d\n", imageW, imageH); - printf("Double Precision\n"); - printf("%d Iterations\n", N); + printf("\n* Run Performance Test\n"); + printf("Image Size %d x %d\n", imageW, imageH); + printf("Double Precision\n"); + printf("%d Iterations\n", N); - // Allocate memory for renderImage (to be able to render into a CUDA memory - // buffer) - checkCudaErrors( - cudaMalloc((void **)&d_dst, (imageW * imageH * sizeof(uchar4)))); + // Allocate memory for renderImage (to be able to render into a CUDA memory + // buffer) + checkCudaErrors(cudaMalloc((void **)&d_dst, (imageW * imageH * sizeof(uchar4)))); - float xs, ys; + float xs, ys; - // Get the anti-alias sub-pixel sample location - GetSample(0, xs, ys); + // Get the anti-alias sub-pixel sample location + GetSample(0, xs, ys); - double s = scale / (float)imageW; - double x = (xs - (double)imageW * 0.5f) * s + xOff; - double y = (ys - (double)imageH * 0.5f) * s + yOff; + double s = scale / (float)imageW; + double x = (xs - (double)imageW * 0.5f) * s + xOff; + double y = (ys - (double)imageH * 0.5f) * s + yOff; - // Create Timers - StopWatchInterface 
*kernel_timer = NULL; - sdkCreateTimer(&kernel_timer); - sdkStartTimer(&kernel_timer); + // Create Timers + StopWatchInterface *kernel_timer = NULL; + sdkCreateTimer(&kernel_timer); + sdkStartTimer(&kernel_timer); - // render Mandelbrot set and verify - for (int i = 0; i < N; i++) { - RunMandelbrot0(d_dst, imageW, imageH, crunch, x, y, xJParam, yJParam, s, - colors, pass++, animationFrame, 2, numSMs, g_isJuliaSet, - version); - cudaDeviceSynchronize(); - } + // render Mandelbrot set and verify + for (int i = 0; i < N; i++) { + RunMandelbrot0(d_dst, + imageW, + imageH, + crunch, + x, + y, + xJParam, + yJParam, + s, + colors, + pass++, + animationFrame, + 2, + numSMs, + g_isJuliaSet, + version); + cudaDeviceSynchronize(); + } - sdkStopTimer(&hTimer); - float ExecutionTime = sdkGetTimerValue(&kernel_timer); + sdkStopTimer(&hTimer); + float ExecutionTime = sdkGetTimerValue(&kernel_timer); - float PixelsPerSecond = - (float)imageW * (float)imageH * N / (ExecutionTime / 1000.0f); + float PixelsPerSecond = (float)imageW * (float)imageH * N / (ExecutionTime / 1000.0f); - printf("\nMegaPixels Per Second %.4f\n", PixelsPerSecond / 1e6); + printf("\nMegaPixels Per Second %.4f\n", PixelsPerSecond / 1e6); - checkCudaErrors(cudaFree(d_dst)); - sdkDeleteTimer(&kernel_timer); + checkCudaErrors(cudaFree(d_dst)); + sdkDeleteTimer(&kernel_timer); } -void printHelp() { - printf("[Mandelbrot]\n"); - printf("\tUsage Parameters\n"); - printf("\t-device=n (requires to be in non-graphics mode)\n"); - printf("\t-file=output.ppm (output file for image testing)\n"); - printf("\t-mode=0,1 (0=Mandelbrot Set, 1=Julia Set)\n"); - printf("\t-fp64 (run in double precision mode)\n"); +void printHelp() +{ + printf("[Mandelbrot]\n"); + printf("\tUsage Parameters\n"); + printf("\t-device=n (requires to be in non-graphics mode)\n"); + printf("\t-file=output.ppm (output file for image testing)\n"); + printf("\t-mode=0,1 (0=Mandelbrot Set, 1=Julia Set)\n"); + printf("\t-fp64 (run in double precision mode)\n"); } //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("[%s] - Starting...\n", sSDKsample); + printf("[%s] - Starting...\n", sSDKsample); - // parse command line arguments - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printHelp(); - exit(EXIT_SUCCESS); - } + // parse command line arguments + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(); + exit(EXIT_SUCCESS); + } - int mode = 0; + int mode = 0; - if (checkCmdLineFlag(argc, (const char **)argv, "mode")) { - mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); - g_isJuliaSet = mode; + if (checkCmdLineFlag(argc, (const char **)argv, "mode")) { + mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); + g_isJuliaSet = mode; + } + else { + g_isJuliaSet = 0; + } - } else { - g_isJuliaSet = 0; - } + // Set the initial parameters for either Mandelbrot and Julia sets and reset + // all parameters + if (g_isJuliaSet) // settings for Julia + { + char *ref_path = sdkFindFilePath("params.txt", argv[0]); + startJulia(ref_path); + } + else // settings for Mandelbrot + { + g_isMoving = true; + xOff = -0.5; + yOff = 0.0; + scale = 3.2; + xdOff = 0.0; + ydOff = 0.0; + 
dscale = 1.0; + colorSeed = 0; + colors.x = 3; + colors.y = 5; + colors.z = 7; + crunch = 512; + animationFrame = 0; + animationStep = 0; + pass = 0; + } - // Set the initial parameters for either Mandelbrot and Julia sets and reset - // all parameters - if (g_isJuliaSet) // settings for Julia - { - char *ref_path = sdkFindFilePath("params.txt", argv[0]); - startJulia(ref_path); - } else // settings for Mandelbrot - { - g_isMoving = true; - xOff = -0.5; - yOff = 0.0; - scale = 3.2; - xdOff = 0.0; - ydOff = 0.0; - dscale = 1.0; - colorSeed = 0; - colors.x = 3; - colors.y = 5; - colors.z = 7; - crunch = 512; - animationFrame = 0; - animationStep = 0; - pass = 0; - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + fpsLimit = frameCheckNumber; - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - fpsLimit = frameCheckNumber; + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); // no OpenGL usage + // We run the Automated Testing code path + runSingleTest(argc, argv); + + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + } + else if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { + // run benchmark + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); + + // We run the Automated Performance Test + runBenchmark(argc, argv); + + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + } // use command-line specified CUDA device, otherwise use device with highest // Gflops/s - findCudaDevice(argc, (const char **)argv); // no OpenGL usage + else if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n in OpenGL mode\n"); + printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); + printf(" > %s -device=n -file=.ppm\n", argv[0]); + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } - // We run the Automated Testing code path - runSingleTest(argc, argv); + // Otherwise it succeeds, we will continue to run this sample + initData(argc, argv); - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - } else if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { - // run benchmark - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // Initialize OpenGL context first before the CUDA context is created. This + // is needed + // to achieve optimal performance with OpenGL/CUDA interop. 
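The ordering note above is the one real constraint on the interactive path: the GL context should exist before CUDA first touches the device so that the interop registration can take the fast path. A minimal sketch of the startup order the comment calls for, condensed from the calls that follow (this is the intent of the comment, not new behavior):

    // GL/CUDA interop startup order (sketch):
    initGL(&argc, argv);                // 1. create the GLUT window / GL context
    initOpenGLBuffers(imageW, imageH);  // 2. allocate texture + PBO, register the PBO
                                        //    once via cudaGraphicsGLRegisterBuffer
    glutMainLoop();                     // 3. displayFunc() then maps/unmaps per frame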
+ initGL(&argc, argv); + initOpenGLBuffers(imageW, imageH); - // We run the Automated Performance Test - runBenchmark(argc, argv); + printf("Starting GLUT main loop...\n"); + printf("\n"); + printf("Press [s] to toggle between GPU and CPU implementations\n"); + printf("Press [j] to toggle between Julia and Mandelbrot sets\n"); + printf("Press [r] or [R] to decrease or increase red color channel\n"); + printf("Press [g] or [G] to decrease or increase green color channel\n"); + printf("Press [b] or [B] to decrease or increase blue color channel\n"); + printf("Press [e] to reset\n"); + printf("Press [a] or [A] to animate colors\n"); + printf("Press [c] or [C] to change colors\n"); + printf("Press [d] or [D] to increase or decrease the detail\n"); + printf("Press [p] to record main parameters to file params.txt\n"); + printf("Press [o] to read main parameters from file params.txt\n"); + printf("Left mouse button + drag = move (Mandelbrot or Julia) or animate " + "(Julia)\n"); + printf("Press [m] to toggle between move and animate (Julia) for left mouse " + "button\n"); + printf("Middle mouse button + drag = Zoom\n"); + printf("Right mouse button = Menu\n"); + printf("Press [?] to print location and scale\n"); + printf("Press [q] to exit\n"); + printf("\n"); - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - } - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - else if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n in OpenGL mode\n"); - printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); - printf(" > %s -device=n -file=.ppm\n", argv[0]); - printf("exiting...\n"); - exit(EXIT_SUCCESS); - } - - // Otherwise it succeeds, we will continue to run this sample - initData(argc, argv); - - // Initialize OpenGL context first before the CUDA context is created. This - // is needed - // to achieve optimal performance with OpenGL/CUDA interop. - initGL(&argc, argv); - initOpenGLBuffers(imageW, imageH); - - printf("Starting GLUT main loop...\n"); - printf("\n"); - printf("Press [s] to toggle between GPU and CPU implementations\n"); - printf("Press [j] to toggle between Julia and Mandelbrot sets\n"); - printf("Press [r] or [R] to decrease or increase red color channel\n"); - printf("Press [g] or [G] to decrease or increase green color channel\n"); - printf("Press [b] or [B] to decrease or increase blue color channel\n"); - printf("Press [e] to reset\n"); - printf("Press [a] or [A] to animate colors\n"); - printf("Press [c] or [C] to change colors\n"); - printf("Press [d] or [D] to increase or decrease the detail\n"); - printf("Press [p] to record main parameters to file params.txt\n"); - printf("Press [o] to read main parameters from file params.txt\n"); - printf( - "Left mouse button + drag = move (Mandelbrot or Julia) or animate " - "(Julia)\n"); - printf( - "Press [m] to toggle between move and animate (Julia) for left mouse " - "button\n"); - printf("Middle mouse button + drag = Zoom\n"); - printf("Right mouse button = Menu\n"); - printf("Press [?] 
to print location and scale\n"); - printf("Press [q] to exit\n"); - printf("\n"); - - sdkCreateTimer(&hTimer); - sdkStartTimer(&hTimer); + sdkCreateTimer(&hTimer); + sdkStartTimer(&hTimer); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - setVSync(0); + setVSync(0); #endif - glutMainLoop(); -} // main + glutMainLoop(); +} // main diff --git a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_cuda.cu b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_cuda.cu index 4cb21478..0fe5e5c5 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_cuda.cu +++ b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_cuda.cu @@ -26,369 +26,504 @@ */ #include -#include "helper_cuda.h" -#include "Mandelbrot_kernel.h" + #include "Mandelbrot_kernel.cuh" +#include "Mandelbrot_kernel.h" +#include "helper_cuda.h" // The Mandelbrot CUDA GPU thread function template <class T> -__global__ void Mandelbrot0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const T xOff, const T yOff, - const T xJP, const T yJP, const T scale, - const uchar4 colors, const int frame, - const int animationFrame, const int gridWidth, - const int numBlocks, const bool isJ) { - // loop until all blocks completed - for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; - blockIndex += gridDim.x) { - unsigned int blockX = blockIndex % gridWidth; - unsigned int blockY = blockIndex / gridWidth; +__global__ void Mandelbrot0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const T xOff, + const T yOff, + const T xJP, + const T yJP, + const T scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int gridWidth, + const int numBlocks, + const bool isJ) +{ + // loop until all blocks completed + for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; blockIndex += gridDim.x) { + unsigned int blockX = blockIndex % gridWidth; + unsigned int blockY = blockIndex / gridWidth; - // process this block - const int ix = blockDim.x * blockX + threadIdx.x; - const int iy = blockDim.y * blockY + threadIdx.y; + // process this block + const int ix = blockDim.x * blockX + threadIdx.x; + const int iy = blockDim.y * blockY + threadIdx.y; - if ((ix < imageW) && (iy < imageH)) { - // Calculate the location - const T xPos = (T)ix * scale + xOff; - const T yPos = (T)iy * scale + yOff; + if ((ix < imageW) && (iy < imageH)) { + // Calculate the location + const T xPos = (T)ix * scale + xOff; + const T yPos = (T)iy * scale + yOff; - // Calculate the Mandelbrot index for the current location - int m = CalcMandelbrot(xPos, yPos, xJP, yJP, crunch, isJ); - // int m = blockIdx.x; // uncomment to see scheduling - // order - m = m > 0 ? 
crunch - m : 0; - // Convert the Mandelbrot index into a color - uchar4 color; + // Convert the Mandelbrot index into a color + uchar4 color; - if (m) { - m += animationFrame; - color.x = m * colors.x; - color.y = m * colors.y; - color.z = m * colors.z; - } else { - color.x = 0; - color.y = 0; - color.z = 0; - } + if (m) { + m += animationFrame; + color.x = m * colors.x; + color.y = m * colors.y; + color.z = m * colors.z; + } + else { + color.x = 0; + color.y = 0; + color.z = 0; + } - // Output the pixel - int pixel = imageW * iy + ix; + // Output the pixel + int pixel = imageW * iy + ix; - if (frame == 0) { - color.w = 0; - dst[pixel] = color; - } else { - int frame1 = frame + 1; - int frame2 = frame1 / 2; - dst[pixel].x = (dst[pixel].x * frame + color.x + frame2) / frame1; - dst[pixel].y = (dst[pixel].y * frame + color.y + frame2) / frame1; - dst[pixel].z = (dst[pixel].z * frame + color.z + frame2) / frame1; - } + if (frame == 0) { + color.w = 0; + dst[pixel] = color; + } + else { + int frame1 = frame + 1; + int frame2 = frame1 / 2; + dst[pixel].x = (dst[pixel].x * frame + color.x + frame2) / frame1; + dst[pixel].y = (dst[pixel].y * frame + color.y + frame2) / frame1; + dst[pixel].z = (dst[pixel].z * frame + color.z + frame2) / frame1; + } + } } - } -} // Mandelbrot0 +} // Mandelbrot0 // The Mandelbrot CUDA GPU thread function (double single version) -__global__ void MandelbrotDS0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const float xOff0, - const float xOff1, const float yOff0, - const float yOff1, const float xJP, - const float yJP, const float scale, - const uchar4 colors, const int frame, - const int animationFrame, const int gridWidth, - const int numBlocks, const bool isJ) { - // loop until all blocks completed - for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; - blockIndex += gridDim.x) { - unsigned int blockX = blockIndex % gridWidth; - unsigned int blockY = blockIndex / gridWidth; +__global__ void MandelbrotDS0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const float xOff0, + const float xOff1, + const float yOff0, + const float yOff1, + const float xJP, + const float yJP, + const float scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int gridWidth, + const int numBlocks, + const bool isJ) +{ + // loop until all blocks completed + for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; blockIndex += gridDim.x) { + unsigned int blockX = blockIndex % gridWidth; + unsigned int blockY = blockIndex / gridWidth; - // process this block - const int ix = blockDim.x * blockX + threadIdx.x; - const int iy = blockDim.y * blockY + threadIdx.y; + // process this block + const int ix = blockDim.x * blockX + threadIdx.x; + const int iy = blockDim.y * blockY + threadIdx.y; - if ((ix < imageW) && (iy < imageH)) { - // Calculate the location - float xPos0 = (float)ix * scale; - float xPos1 = 0.0f; - float yPos0 = (float)iy * scale; - float yPos1 = 0.0f; - dsadd(xPos0, xPos1, xPos0, xPos1, xOff0, xOff1); - dsadd(yPos0, yPos1, yPos0, yPos1, yOff0, yOff1); + if ((ix < imageW) && (iy < imageH)) { + // Calculate the location + float xPos0 = (float)ix * scale; + float xPos1 = 0.0f; + float yPos0 = (float)iy * scale; + float yPos1 = 0.0f; + dsadd(xPos0, xPos1, xPos0, xPos1, xOff0, xOff1); + dsadd(yPos0, yPos1, yPos0, yPos1, yOff0, yOff1); - // Calculate the Mandelbrot index for the current location - int m = - CalcMandelbrotDS(xPos0, xPos1, yPos0, yPos1, xJP, yJP, crunch, isJ); - m 
= m > 0 ? crunch - m : 0; + // Calculate the Mandelbrot index for the current location + int m = CalcMandelbrotDS(xPos0, xPos1, yPos0, yPos1, xJP, yJP, crunch, isJ); + m = m > 0 ? crunch - m : 0; - // Convert the Mandelbrot index into a color - uchar4 color; + // Convert the Mandelbrot index into a color + uchar4 color; - if (m) { - m += animationFrame; - color.x = m * colors.x; - color.y = m * colors.y; - color.z = m * colors.z; - } else { - color.x = 0; - color.y = 0; - color.z = 0; - } + if (m) { + m += animationFrame; + color.x = m * colors.x; + color.y = m * colors.y; + color.z = m * colors.z; + } + else { + color.x = 0; + color.y = 0; + color.z = 0; + } - // Output the pixel - int pixel = imageW * iy + ix; + // Output the pixel + int pixel = imageW * iy + ix; - if (frame == 0) { - color.w = 0; - dst[pixel] = color; - } else { - int frame1 = frame + 1; - int frame2 = frame1 / 2; - dst[pixel].x = (dst[pixel].x * frame + color.x + frame2) / frame1; - dst[pixel].y = (dst[pixel].y * frame + color.y + frame2) / frame1; - dst[pixel].z = (dst[pixel].z * frame + color.z + frame2) / frame1; - } + if (frame == 0) { + color.w = 0; + dst[pixel] = color; + } + else { + int frame1 = frame + 1; + int frame2 = frame1 / 2; + dst[pixel].x = (dst[pixel].x * frame + color.x + frame2) / frame1; + dst[pixel].y = (dst[pixel].y * frame + color.y + frame2) / frame1; + dst[pixel].z = (dst[pixel].z * frame + color.z + frame2) / frame1; + } + } } - } -} // MandelbrotDS0 +} // MandelbrotDS0 // The Mandelbrot secondary AA pass CUDA GPU thread function template -__global__ void Mandelbrot1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const T xOff, const T yOff, - const T xJP, const T yJP, const T scale, - const uchar4 colors, const int frame, - const int animationFrame, const int gridWidth, - const int numBlocks, const bool isJ) { - // loop until all blocks completed - for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; - blockIndex += gridDim.x) { - unsigned int blockX = blockIndex % gridWidth; - unsigned int blockY = blockIndex / gridWidth; +__global__ void Mandelbrot1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const T xOff, + const T yOff, + const T xJP, + const T yJP, + const T scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int gridWidth, + const int numBlocks, + const bool isJ) +{ + // loop until all blocks completed + for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; blockIndex += gridDim.x) { + unsigned int blockX = blockIndex % gridWidth; + unsigned int blockY = blockIndex / gridWidth; - // process this block - const int ix = blockDim.x * blockX + threadIdx.x; - const int iy = blockDim.y * blockY + threadIdx.y; + // process this block + const int ix = blockDim.x * blockX + threadIdx.x; + const int iy = blockDim.y * blockY + threadIdx.y; - if ((ix < imageW) && (iy < imageH)) { - // Get the current pixel color - int pixel = imageW * iy + ix; - uchar4 pixelColor = dst[pixel]; - int count = 0; + if ((ix < imageW) && (iy < imageH)) { + // Get the current pixel color + int pixel = imageW * iy + ix; + uchar4 pixelColor = dst[pixel]; + int count = 0; - // Search for pixels out of tolerance surrounding the current pixel - if (ix > 0) { - count += CheckColors(pixelColor, dst[pixel - 1]); - } + // Search for pixels out of tolerance surrounding the current pixel + if (ix > 0) { + count += CheckColors(pixelColor, dst[pixel - 1]); + } - if (ix + 1 < imageW) { - count += CheckColors(pixelColor, 
dst[pixel + 1]); - } + if (ix + 1 < imageW) { + count += CheckColors(pixelColor, dst[pixel + 1]); + } - if (iy > 0) { - count += CheckColors(pixelColor, dst[pixel - imageW]); - } + if (iy > 0) { + count += CheckColors(pixelColor, dst[pixel - imageW]); + } - if (iy + 1 < imageH) { - count += CheckColors(pixelColor, dst[pixel + imageW]); - } + if (iy + 1 < imageH) { + count += CheckColors(pixelColor, dst[pixel + imageW]); + } - if (count) { - // Calculate the location - const T xPos = (T)ix * scale + xOff; - const T yPos = (T)iy * scale + yOff; + if (count) { + // Calculate the location + const T xPos = (T)ix * scale + xOff; + const T yPos = (T)iy * scale + yOff; - // Calculate the Mandelbrot index for the current location - int m = CalcMandelbrot(xPos, yPos, xJP, yJP, crunch, isJ); - m = m > 0 ? crunch - m : 0; + // Calculate the Mandelbrot index for the current location + int m = CalcMandelbrot(xPos, yPos, xJP, yJP, crunch, isJ); + m = m > 0 ? crunch - m : 0; - // Convert the Mandelbrot index into a color - uchar4 color; + // Convert the Mandelbrot index into a color + uchar4 color; - if (m) { - m += animationFrame; - color.x = m * colors.x; - color.y = m * colors.y; - color.z = m * colors.z; - } else { - color.x = 0; - color.y = 0; - color.z = 0; + if (m) { + m += animationFrame; + color.x = m * colors.x; + color.y = m * colors.y; + color.z = m * colors.z; + } + else { + color.x = 0; + color.y = 0; + color.z = 0; + } + + // Output the pixel + int frame1 = frame + 1; + int frame2 = frame1 / 2; + dst[pixel].x = (pixelColor.x * frame + color.x + frame2) / frame1; + dst[pixel].y = (pixelColor.y * frame + color.y + frame2) / frame1; + dst[pixel].z = (pixelColor.z * frame + color.z + frame2) / frame1; + } } - - // Output the pixel - int frame1 = frame + 1; - int frame2 = frame1 / 2; - dst[pixel].x = (pixelColor.x * frame + color.x + frame2) / frame1; - dst[pixel].y = (pixelColor.y * frame + color.y + frame2) / frame1; - dst[pixel].z = (pixelColor.z * frame + color.z + frame2) / frame1; - } } - } -} // Mandelbrot1 +} // Mandelbrot1 // The Mandelbrot secondary AA pass CUDA GPU thread function (double single // version) -__global__ void MandelbrotDS1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const float xOff0, - const float xOff1, const float yOff0, - const float yOff1, const float xJP, - const float yJP, const float scale, - const uchar4 colors, const int frame, - const int animationFrame, const int gridWidth, - const int numBlocks, const bool isJ) { - // loop until all blocks completed - for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; - blockIndex += gridDim.x) { - unsigned int blockX = blockIndex % gridWidth; - unsigned int blockY = blockIndex / gridWidth; +__global__ void MandelbrotDS1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const float xOff0, + const float xOff1, + const float yOff0, + const float yOff1, + const float xJP, + const float yJP, + const float scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int gridWidth, + const int numBlocks, + const bool isJ) +{ + // loop until all blocks completed + for (unsigned int blockIndex = blockIdx.x; blockIndex < numBlocks; blockIndex += gridDim.x) { + unsigned int blockX = blockIndex % gridWidth; + unsigned int blockY = blockIndex / gridWidth; - // process this block - const int ix = blockDim.x * blockX + threadIdx.x; - const int iy = blockDim.y * blockY + threadIdx.y; + // process this block + const int ix = blockDim.x * blockX + 
threadIdx.x; + const int iy = blockDim.y * blockY + threadIdx.y; - if ((ix < imageW) && (iy < imageH)) { - // Get the current pixel color - int pixel = imageW * iy + ix; - uchar4 pixelColor = dst[pixel]; - int count = 0; + if ((ix < imageW) && (iy < imageH)) { + // Get the current pixel color + int pixel = imageW * iy + ix; + uchar4 pixelColor = dst[pixel]; + int count = 0; - // Search for pixels out of tolerance surrounding the current pixel - if (ix > 0) { - count += CheckColors(pixelColor, dst[pixel - 1]); - } + // Search for pixels out of tolerance surrounding the current pixel + if (ix > 0) { + count += CheckColors(pixelColor, dst[pixel - 1]); + } - if (ix + 1 < imageW) { - count += CheckColors(pixelColor, dst[pixel + 1]); - } + if (ix + 1 < imageW) { + count += CheckColors(pixelColor, dst[pixel + 1]); + } - if (iy > 0) { - count += CheckColors(pixelColor, dst[pixel - imageW]); - } + if (iy > 0) { + count += CheckColors(pixelColor, dst[pixel - imageW]); + } - if (iy + 1 < imageH) { - count += CheckColors(pixelColor, dst[pixel + imageW]); - } + if (iy + 1 < imageH) { + count += CheckColors(pixelColor, dst[pixel + imageW]); + } - if (count) { - // Calculate the location - float xPos0 = (float)ix * scale; - float xPos1 = 0.0f; - float yPos0 = (float)iy * scale; - float yPos1 = 0.0f; - dsadd(xPos0, xPos1, xPos0, xPos1, xOff0, xOff1); - dsadd(yPos0, yPos1, yPos0, yPos1, yOff0, yOff1); + if (count) { + // Calculate the location + float xPos0 = (float)ix * scale; + float xPos1 = 0.0f; + float yPos0 = (float)iy * scale; + float yPos1 = 0.0f; + dsadd(xPos0, xPos1, xPos0, xPos1, xOff0, xOff1); + dsadd(yPos0, yPos1, yPos0, yPos1, yOff0, yOff1); - // Calculate the Mandelbrot index for the current location - int m = - CalcMandelbrotDS(xPos0, xPos1, yPos0, yPos1, xJP, yJP, crunch, isJ); - m = m > 0 ? crunch - m : 0; + // Calculate the Mandelbrot index for the current location + int m = CalcMandelbrotDS(xPos0, xPos1, yPos0, yPos1, xJP, yJP, crunch, isJ); + m = m > 0 ? 
crunch - m : 0; - // Convert the Mandelbrot index into a color - uchar4 color; + // Convert the Mandelbrot index into a color + uchar4 color; - if (m) { - m += animationFrame; - color.x = m * colors.x; - color.y = m * colors.y; - color.z = m * colors.z; - } else { - color.x = 0; - color.y = 0; - color.z = 0; + if (m) { + m += animationFrame; + color.x = m * colors.x; + color.y = m * colors.y; + color.z = m * colors.z; + } + else { + color.x = 0; + color.y = 0; + color.z = 0; + } + + // Output the pixel + int frame1 = frame + 1; + int frame2 = frame1 / 2; + dst[pixel].x = (pixelColor.x * frame + color.x + frame2) / frame1; + dst[pixel].y = (pixelColor.y * frame + color.y + frame2) / frame1; + dst[pixel].z = (pixelColor.z * frame + color.z + frame2) / frame1; + } } - - // Output the pixel - int frame1 = frame + 1; - int frame2 = frame1 / 2; - dst[pixel].x = (pixelColor.x * frame + color.x + frame2) / frame1; - dst[pixel].y = (pixelColor.y * frame + color.y + frame2) / frame1; - dst[pixel].z = (pixelColor.z * frame + color.z + frame2) / frame1; - } } - } -} // MandelbrotDS1 +} // MandelbrotDS1 // The host CPU Mandelbrot thread spawner -void RunMandelbrot0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const double xOff, const double yOff, - const double xjp, const double yjp, const double scale, - const uchar4 colors, const int frame, - const int animationFrame, const int mode, const int numSMs, - const bool isJ, int version) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +void RunMandelbrot0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xjp, + const double yjp, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int mode, + const int numSMs, + const bool isJ, + int version) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - int numWorkerBlocks = numSMs; + int numWorkerBlocks = numSMs; - switch (mode) { + switch (mode) { default: case 0: - Mandelbrot0<<>>( - dst, imageW, imageH, crunch, (float)xOff, (float)yOff, (float)xjp, - (float)yjp, (float)scale, colors, frame, animationFrame, grid.x, - grid.x * grid.y, isJ); - break; + Mandelbrot0<<>>(dst, + imageW, + imageH, + crunch, + (float)xOff, + (float)yOff, + (float)xjp, + (float)yjp, + (float)scale, + colors, + frame, + animationFrame, + grid.x, + grid.x * grid.y, + isJ); + break; case 1: - float x0, x1, y0, y1; - dsdeq(x0, x1, xOff); - dsdeq(y0, y1, yOff); - MandelbrotDS0<<>>( - dst, imageW, imageH, crunch, x0, x1, y0, y1, (float)xjp, (float)yjp, - (float)scale, colors, frame, animationFrame, grid.x, grid.x * grid.y, - isJ); - break; + float x0, x1, y0, y1; + dsdeq(x0, x1, xOff); + dsdeq(y0, y1, yOff); + MandelbrotDS0<<>>(dst, + imageW, + imageH, + crunch, + x0, + x1, + y0, + y1, + (float)xjp, + (float)yjp, + (float)scale, + colors, + frame, + animationFrame, + grid.x, + grid.x * grid.y, + isJ); + break; case 2: - Mandelbrot0<<>>( - dst, imageW, imageH, crunch, xOff, yOff, xjp, yjp, scale, colors, - frame, animationFrame, grid.x, grid.x * grid.y, isJ); - break; - } + Mandelbrot0<<>>(dst, + imageW, + imageH, + crunch, + xOff, + yOff, + xjp, + yjp, + scale, + colors, + frame, + animationFrame, + grid.x, + grid.x * grid.y, + isJ); + break; + } - getLastCudaError("Mandelbrot0 kernel execution failed.\n"); -} // RunMandelbrot0 + getLastCudaError("Mandelbrot0 kernel 
execution failed.\n"); +} // RunMandelbrot0 // The host CPU Mandelbrot thread spawner -void RunMandelbrot1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const double xOff, const double yOff, - const double xjp, const double yjp, const double scale, - const uchar4 colors, const int frame, - const int animationFrame, const int mode, const int numSMs, - const bool isJ, int version) { - dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); - dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); +void RunMandelbrot1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xjp, + const double yjp, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int mode, + const int numSMs, + const bool isJ, + int version) +{ + dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); + dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); - int numWorkerBlocks = numSMs; + int numWorkerBlocks = numSMs; - switch (mode) { + switch (mode) { default: case 0: - Mandelbrot1<<>>( - dst, imageW, imageH, crunch, (float)xOff, (float)yOff, (float)xjp, - (float)yjp, (float)scale, colors, frame, animationFrame, grid.x, - grid.x * grid.y, isJ); - break; + Mandelbrot1<<>>(dst, + imageW, + imageH, + crunch, + (float)xOff, + (float)yOff, + (float)xjp, + (float)yjp, + (float)scale, + colors, + frame, + animationFrame, + grid.x, + grid.x * grid.y, + isJ); + break; case 1: - float x0, x1, y0, y1; - dsdeq(x0, x1, xOff); - dsdeq(y0, y1, yOff); - MandelbrotDS1<<>>( - dst, imageW, imageH, crunch, x0, x1, y0, y1, (float)xjp, (float)yjp, - (float)scale, colors, frame, animationFrame, grid.x, grid.x * grid.y, - isJ); - break; + float x0, x1, y0, y1; + dsdeq(x0, x1, xOff); + dsdeq(y0, y1, yOff); + MandelbrotDS1<<>>(dst, + imageW, + imageH, + crunch, + x0, + x1, + y0, + y1, + (float)xjp, + (float)yjp, + (float)scale, + colors, + frame, + animationFrame, + grid.x, + grid.x * grid.y, + isJ); + break; case 2: - Mandelbrot1<<>>( - dst, imageW, imageH, crunch, xOff, yOff, xjp, yjp, scale, colors, - frame, animationFrame, grid.x, grid.x * grid.y, isJ); - break; - } + Mandelbrot1<<>>(dst, + imageW, + imageH, + crunch, + xOff, + yOff, + xjp, + yjp, + scale, + colors, + frame, + animationFrame, + grid.x, + grid.x * grid.y, + isJ); + break; + } - getLastCudaError("Mandelbrot1 kernel execution failed.\n"); -} // RunMandelbrot1 + getLastCudaError("Mandelbrot1 kernel execution failed.\n"); +} // RunMandelbrot1 diff --git a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.cpp b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.cpp index 3d78713b..36a51766 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.cpp +++ b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.cpp @@ -30,253 +30,313 @@ #define ABS(n) ((n) < 0 ? 
-(n) : (n)) /* dfloat class declaration */ -class dfloat { - private: - float val[2]; +class dfloat +{ +private: + float val[2]; - public: - dfloat() { val[0] = val[1] = 0; } - dfloat(float a, float b) { - val[0] = a; - val[1] = b; - } - dfloat(double b); - inline float operator[](unsigned idx) const { return val[idx]; } +public: + dfloat() { val[0] = val[1] = 0; } + dfloat(float a, float b) + { + val[0] = a; + val[1] = b; + } + dfloat(double b); + inline float operator[](unsigned idx) const { return val[idx]; } }; inline dfloat operator+(const dfloat &dsa, const dfloat &dsb); inline dfloat operator-(const dfloat &dsa, const dfloat &dsb); inline dfloat operator*(const dfloat &dsa, const dfloat &dsb); -inline int operator<(const dfloat &a, float b) { return a[0] < b; } +inline int operator<(const dfloat &a, float b) { return a[0] < b; } // The core Mandelbrot calculation function template template -inline int CalcMandelbrot(const T xPos, const T yPos, const T xJParam, - const T yJParam, const int crunch, - const bool isJulia) { - T x, y, xx, yy, xC, yC; - int i = crunch; +inline int +CalcMandelbrot(const T xPos, const T yPos, const T xJParam, const T yJParam, const int crunch, const bool isJulia) +{ + T x, y, xx, yy, xC, yC; + int i = crunch; - if (isJulia) { - xC = xJParam; - yC = yJParam; - y = yPos; - x = xPos; - yy = y * y; - xx = x * x; - } else { - xC = xPos; - yC = yPos; - x = y = 0; - xx = yy = 0; - } + if (isJulia) { + xC = xJParam; + yC = yJParam; + y = yPos; + x = xPos; + yy = y * y; + xx = x * x; + } + else { + xC = xPos; + yC = yPos; + x = y = 0; + xx = yy = 0; + } - while (--i && (xx + yy < 4.0f)) { - y = x * y + x * y + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; - } + while (--i && (xx + yy < 4.0f)) { + y = x * y + x * y + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; + } - return i; -} // CalcMandelbrot + return i; +} // CalcMandelbrot -inline void updatePixel(uchar4 &dst, const uchar4 &color, int frame) { - int frame1 = frame + 1; - int frame2 = frame1 / 2; - dst.x = (dst.x * frame + color.x + frame2) / frame1; - dst.y = (dst.y * frame + color.y + frame2) / frame1; - dst.z = (dst.z * frame + color.z + frame2) / frame1; +inline void updatePixel(uchar4 &dst, const uchar4 &color, int frame) +{ + int frame1 = frame + 1; + int frame2 = frame1 / 2; + dst.x = (dst.x * frame + color.x + frame2) / frame1; + dst.y = (dst.y * frame + color.y + frame2) / frame1; + dst.z = (dst.z * frame + color.z + frame2) / frame1; } -inline void setColor(uchar4 &dst, const uchar4 &colors, int &m, - const int animationFrame) { - if (m == 0) { - dst.x = 0; - dst.y = 0; - dst.z = 0; - return; - } +inline void setColor(uchar4 &dst, const uchar4 &colors, int &m, const int animationFrame) +{ + if (m == 0) { + dst.x = 0; + dst.y = 0; + dst.z = 0; + return; + } - m += animationFrame; - dst.x = m * colors.x; - dst.y = m * colors.y; - dst.z = m * colors.z; + m += animationFrame; + dst.x = m * colors.x; + dst.y = m * colors.y; + dst.z = m * colors.z; } template -void runMandelbrotGold0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const T xOff, const T yOff, - const T xJParam, const T yJParam, const T scale, - const uchar4 colors, const int frame, - const int animationFrame, const bool isJulia) { - for (int iy = 0; iy < imageH; iy++) - for (int ix = 0; ix < imageW; ix++) { - // Calculate the location - const T_ xPos = (T)ix * scale + xOff; - const T_ yPos = (T)iy * scale + yOff; +void runMandelbrotGold0(uchar4 *dst, + const int imageW, + const int imageH, + const int 
crunch, + const T xOff, + const T yOff, + const T xJParam, + const T yJParam, + const T scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia) +{ + for (int iy = 0; iy < imageH; iy++) + for (int ix = 0; ix < imageW; ix++) { + // Calculate the location + const T_ xPos = (T)ix * scale + xOff; + const T_ yPos = (T)iy * scale + yOff; - // Calculate the Mandelbrot index for the current location - int m = CalcMandelbrot(xPos, yPos, xJParam, yJParam, crunch, isJulia); - m = m > 0 ? crunch - m : 0; + // Calculate the Mandelbrot index for the current location + int m = CalcMandelbrot(xPos, yPos, xJParam, yJParam, crunch, isJulia); + m = m > 0 ? crunch - m : 0; - // Convert the Mandelbrot index into a color - uchar4 color; + // Convert the Mandelbrot index into a color + uchar4 color; - setColor(color, colors, m, animationFrame); + setColor(color, colors, m, animationFrame); - // Output the pixel - int pixel = imageW * iy + ix; + // Output the pixel + int pixel = imageW * iy + ix; - if (frame == 0) { - color.w = 0; - dst[pixel] = color; - } else - updatePixel(dst[pixel], color, frame); - } + if (frame == 0) { + color.w = 0; + dst[pixel] = color; + } + else + updatePixel(dst[pixel], color, frame); + } -} // runMandelbrotGold0_ +} // runMandelbrotGold0_ // Determine if two pixel colors are within tolerance -inline int CheckColors(const uchar4 &color0, const uchar4 &color1) { - int x = color1.x - color0.x; - int y = color1.y - color0.y; - int z = color1.z - color0.z; - return (ABS(x) > 10) || (ABS(y) > 10) || (ABS(z) > 10); -} // CheckColors +inline int CheckColors(const uchar4 &color0, const uchar4 &color1) +{ + int x = color1.x - color0.x; + int y = color1.y - color0.y; + int z = color1.z - color0.z; + return (ABS(x) > 10) || (ABS(y) > 10) || (ABS(z) > 10); +} // CheckColors template -void runMandelbrotGold1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const T xOff, const T yOff, - const T xJParam, const T yJParam, const T scale, - const uchar4 colors, const int frame, - const int animationFrame, const bool isJulia) { - for (int iy = 0; iy < imageH; iy++) - for (int ix = 0; ix < imageW; ix++) { - // Get the current pixel color - int pixel = imageW * iy + ix; - uchar4 pixelColor = dst[pixel]; - int count = 0; +void runMandelbrotGold1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const T xOff, + const T yOff, + const T xJParam, + const T yJParam, + const T scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia) +{ + for (int iy = 0; iy < imageH; iy++) + for (int ix = 0; ix < imageW; ix++) { + // Get the current pixel color + int pixel = imageW * iy + ix; + uchar4 pixelColor = dst[pixel]; + int count = 0; - // Search for pixels out of tolerance surrounding the current pixel - if (ix > 0) count += CheckColors(pixelColor, dst[pixel - 1]); + // Search for pixels out of tolerance surrounding the current pixel + if (ix > 0) + count += CheckColors(pixelColor, dst[pixel - 1]); - if (ix + 1 < imageW) count += CheckColors(pixelColor, dst[pixel + 1]); + if (ix + 1 < imageW) + count += CheckColors(pixelColor, dst[pixel + 1]); - if (iy > 0) count += CheckColors(pixelColor, dst[pixel - imageW]); + if (iy > 0) + count += CheckColors(pixelColor, dst[pixel - imageW]); - if (iy + 1 < imageH) - count += CheckColors(pixelColor, dst[pixel + imageW]); + if (iy + 1 < imageH) + count += CheckColors(pixelColor, dst[pixel + imageW]); - if (count) { - // Calculate the location - const T_ 
xPos = (T)ix * scale + xOff; - const T_ yPos = (T)iy * scale + yOff; + if (count) { + // Calculate the location + const T_ xPos = (T)ix * scale + xOff; + const T_ yPos = (T)iy * scale + yOff; - // Calculate the Mandelbrot index for the current location - int m = - CalcMandelbrot(xPos, yPos, xJParam, yJParam, crunch, isJulia); - m = m > 0 ? crunch - m : 0; + // Calculate the Mandelbrot index for the current location + int m = CalcMandelbrot(xPos, yPos, xJParam, yJParam, crunch, isJulia); + m = m > 0 ? crunch - m : 0; - // Convert the Mandelbrot index into a color - uchar4 color; + // Convert the Mandelbrot index into a color + uchar4 color; - setColor(color, colors, m, animationFrame); + setColor(color, colors, m, animationFrame); - // Output the pixel - updatePixel(dst[pixel], color, frame); - } - } -} // RunMandelbrotGold1_ + // Output the pixel + updatePixel(dst[pixel], color, frame); + } + } +} // RunMandelbrotGold1_ /* Implementation of exported functions */ -void RunMandelbrotGold1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const float xOff, const float yOff, - const float xJParam, const float yJParam, - const float scale, const uchar4 colors, const int frame, - const int animationFrame, const bool isJulia) { - runMandelbrotGold1(dst, imageW, imageH, crunch, xOff, yOff, - xJParam, yJParam, scale, colors, frame, - animationFrame, isJulia); +void RunMandelbrotGold1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const float xOff, + const float yOff, + const float xJParam, + const float yJParam, + const float scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia) +{ + runMandelbrotGold1( + dst, imageW, imageH, crunch, xOff, yOff, xJParam, yJParam, scale, colors, frame, animationFrame, isJulia); -} // RunMandelbrotGold1 +} // RunMandelbrotGold1 -void RunMandelbrotDSGold1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const double xOff, - const double yOff, const double xJParam, - const double yJParam, const double scale, - const uchar4 colors, const int frame, - const int animationFrame, const bool isJulia) { - runMandelbrotGold1(dst, imageW, imageH, crunch, xOff, yOff, - xJParam, yJParam, scale, colors, frame, - animationFrame, isJulia); +void RunMandelbrotDSGold1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xJParam, + const double yJParam, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia) +{ + runMandelbrotGold1( + dst, imageW, imageH, crunch, xOff, yOff, xJParam, yJParam, scale, colors, frame, animationFrame, isJulia); -} // RunMandelbrotDSGold1 +} // RunMandelbrotDSGold1 -void RunMandelbrotGold0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const float xOff, const float yOff, - const float xJParam, const float yJParam, - const float scale, const uchar4 colors, const int frame, - const int animationFrame, const bool isJulia) { - runMandelbrotGold0(dst, imageW, imageH, crunch, xOff, yOff, - xJParam, yJParam, scale, colors, frame, - animationFrame, isJulia); -} // RunMandelbrotGold0 +void RunMandelbrotGold0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const float xOff, + const float yOff, + const float xJParam, + const float yJParam, + const float scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia) +{ + runMandelbrotGold0( + 
dst, imageW, imageH, crunch, xOff, yOff, xJParam, yJParam, scale, colors, frame, animationFrame, isJulia); +} // RunMandelbrotGold0 -void RunMandelbrotDSGold0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const double xOff, - const double yOff, const double xJParam, - const double yJParam, const double scale, - const uchar4 colors, const int frame, - const int animationFrame, const bool isJulia) { - runMandelbrotGold0(dst, imageW, imageH, crunch, xOff, yOff, - xJParam, yJParam, scale, colors, frame, - animationFrame, isJulia); -} // RunMandelbrotDSGold0 +void RunMandelbrotDSGold0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xJParam, + const double yJParam, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia) +{ + runMandelbrotGold0( + dst, imageW, imageH, crunch, xOff, yOff, xJParam, yJParam, scale, colors, frame, animationFrame, isJulia); +} // RunMandelbrotDSGold0 /*dfloat operations implementation */ /* Construct a DS number equal to a double precision floating point number b*/ -dfloat::dfloat(double b) { - val[0] = (float)b; - val[1] = (float)(b - val[0]); +dfloat::dfloat(double b) +{ + val[0] = (float)b; + val[1] = (float)(b - val[0]); } -inline dfloat operator+(const dfloat &dsa, const dfloat &dsb) { - // Compute dsa + dsb using Knuth's trick. - float t1 = dsa[0] + dsb[0]; - float e = t1 - dsa[0]; - float t2 = ((dsb[0] - e) + (dsa[0] - (t1 - e))) + dsa[1] + dsb[1]; +inline dfloat operator+(const dfloat &dsa, const dfloat &dsb) +{ + // Compute dsa + dsb using Knuth's trick. + float t1 = dsa[0] + dsb[0]; + float e = t1 - dsa[0]; + float t2 = ((dsb[0] - e) + (dsa[0] - (t1 - e))) + dsa[1] + dsb[1]; - // The result is t1 + t2, after normalization. - e = t1 + t2; - return dfloat(e, t2 - (e - t1)); + // The result is t1 + t2, after normalization. + e = t1 + t2; + return dfloat(e, t2 - (e - t1)); } -inline dfloat operator-(const dfloat &dsa, const dfloat &dsb) { - // Compute dsa - dsb using Knuth's trick. - float t1 = dsa[0] - dsb[0]; - float e = t1 - dsa[0]; - float t2 = ((-dsb[0] - e) + (dsa[0] - (t1 - e))) + dsa[1] - dsb[1]; +inline dfloat operator-(const dfloat &dsa, const dfloat &dsb) +{ + // Compute dsa - dsb using Knuth's trick. + float t1 = dsa[0] - dsb[0]; + float e = t1 - dsa[0]; + float t2 = ((-dsb[0] - e) + (dsa[0] - (t1 - e))) + dsa[1] - dsb[1]; - // The result is t1 + t2, after normalization. - e = t1 + t2; - return dfloat(e, t2 - (e - t1)); + // The result is t1 + t2, after normalization. + e = t1 + t2; + return dfloat(e, t2 - (e - t1)); } -inline dfloat operator*(const dfloat &dsa, const dfloat &dsb) { - // This splits dsa(1) and dsb(1) into high-order and low-order words. - float c11 = dsa[0] * dsb[0]; - float c21 = dsa[0] * dsb[0] - c11; +inline dfloat operator*(const dfloat &dsa, const dfloat &dsb) +{ + // This splits dsa(1) and dsb(1) into high-order and low-order words. + float c11 = dsa[0] * dsb[0]; + float c21 = dsa[0] * dsb[0] - c11; - // Compute dsa[0] * dsb[1] + dsa[1] * dsb[0] (only high-order word is needed). - float c2 = dsa[0] * dsb[1] + dsa[1] * dsb[0]; + // Compute dsa[0] * dsb[1] + dsa[1] * dsb[0] (only high-order word is needed). + float c2 = dsa[0] * dsb[1] + dsa[1] * dsb[0]; - // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product. 
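// Aside (not part of the patch): the t1/e/t2 sequence in this hunk is Knuth's
// two-sum. For IEEE floats (no fast-math, no FMA contraction), the correction
// t2 recovers exactly what rounding discarded from t1 = a + b, which is what
// lets the dfloat (hi, lo) pair carry near-double precision. A minimal
// host-side check:
#include <cstdio>

int main()
{
    float a  = 1.0e8f;                    // exactly representable in float
    float b  = 1.5f;                      // almost entirely lost in fl(a + b)
    float t1 = a + b;                     // rounded sum: 1.0e8f
    float e  = t1 - a;
    float t2 = (b - e) + (a - (t1 - e));  // exact rounding error: 1.5f
    printf("rounded sum    : %.17g\n", (double)t1);
    printf("t1 + t2 (exact): %.17g\n", (double)t1 + (double)t2);
    return 0;
}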
- float t1 = c11 + c2; - float e = t1 - c11; - float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + dsa[1] * dsb[1]; + // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product. + float t1 = c11 + c2; + float e = t1 - c11; + float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + dsa[1] * dsb[1]; - // The result is t1 + t2, after normalization. - e = t1 + t2; - return dfloat(e, t2 - (e - t1)); + // The result is t1 + t2, after normalization. + e = t1 + t2; + return dfloat(e, t2 - (e - t1)); } diff --git a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.h b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.h index 8b84b3e9..fe164355 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.h +++ b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_gold.h @@ -30,33 +30,57 @@ #include -extern "C" void RunMandelbrotGold0(uchar4 *dst, const int imageW, - const int imageH, const int crunch, - const float xOff, const float yOff, - const float xJParam, const float yJParam, - const float scale, const uchar4 colors, - const int frame, const int animationFrame, - const bool isJulia); -extern "C" void RunMandelbrotDSGold0(uchar4 *dst, const int imageW, - const int imageH, const int crunch, - const double xOff, const double yOff, - const double xJParam, const double yJParam, - const double scale, const uchar4 colors, - const int frame, const int animationFrame, - const bool isJulia); -extern "C" void RunMandelbrotGold1(uchar4 *dst, const int imageW, - const int imageH, const int crunch, - const float xOff, const float yOff, - const float xJParam, const float yJParam, - const float scale, const uchar4 colors, - const int frame, const int animationFrame, - const bool isJulia); -extern "C" void RunMandelbrotDSGold1(uchar4 *dst, const int imageW, - const int imageH, const int crunch, - const double xOff, const double yOff, - const double xJParam, const double yJParam, - const double scale, const uchar4 colors, - const int frame, const int animationFrame, - const bool isJulia); +extern "C" void RunMandelbrotGold0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const float xOff, + const float yOff, + const float xJParam, + const float yJParam, + const float scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia); +extern "C" void RunMandelbrotDSGold0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xJParam, + const double yJParam, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia); +extern "C" void RunMandelbrotGold1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const float xOff, + const float yOff, + const float xJParam, + const float yJParam, + const float scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia); +extern "C" void RunMandelbrotDSGold1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xJParam, + const double yJParam, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const bool isJulia); #endif diff --git a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.cuh b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.cuh index a60849c3..1be0f32d 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.cuh +++ 
b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.cuh
@@ -26,8 +26,9 @@
  */

 #include
-#include "helper_cuda.h"
+
 #include "Mandelbrot_kernel.h"
+#include "helper_cuda.h"

 // The dimensions of the thread block
 #define BLOCKDIM_X 16
@@ -40,73 +41,75 @@
 // This function sets the DS number A equal to the double precision floating
 // point number B.
-inline void dsdeq(float &a0, float &a1, double b) {
-  a0 = (float)b;
-  a1 = (float)(b - a0);
-} // dsdcp
+inline void dsdeq(float &a0, float &a1, double b)
+{
+    a0 = (float)b;
+    a1 = (float)(b - a0);
+} // dsdeq

 // This function sets the DS number A equal to the single precision floating
 // point number B.
-__device__ inline void dsfeq(float &a0, float &a1, float b) {
-  a0 = b;
-  a1 = 0.0f;
-} // dsfeq
+__device__ inline void dsfeq(float &a0, float &a1, float b)
+{
+    a0 = b;
+    a1 = 0.0f;
+} // dsfeq

 // This function computes c = a + b.
-__device__ inline void dsadd(float &c0, float &c1, const float a0,
-                             const float a1, const float b0, const float b1) {
-  // Compute dsa + dsb using Knuth's trick.
-  float t1 = a0 + b0;
-  float e = t1 - a0;
-  float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;
+__device__ inline void dsadd(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
+{
+    // Compute dsa + dsb using Knuth's trick.
+    float t1 = a0 + b0;
+    float e = t1 - a0;
+    float t2 = ((b0 - e) + (a0 - (t1 - e))) + a1 + b1;

-  // The result is t1 + t2, after normalization.
-  c0 = e = t1 + t2;
-  c1 = t2 - (e - t1);
-} // dsadd
+    // The result is t1 + t2, after normalization.
+    c0 = e = t1 + t2;
+    c1 = t2 - (e - t1);
+} // dsadd

 // This function computes c = a - b.
-__device__ inline void dssub(float &c0, float &c1, const float a0,
-                             const float a1, const float b0, const float b1) {
-  // Compute dsa - dsb using Knuth's trick.
-  float t1 = a0 - b0;
-  float e = t1 - a0;
-  float t2 = ((-b0 - e) + (a0 - (t1 - e))) + a1 - b1;
+__device__ inline void dssub(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
+{
+    // Compute dsa - dsb using Knuth's trick.
+    float t1 = a0 - b0;
+    float e = t1 - a0;
+    float t2 = ((-b0 - e) + (a0 - (t1 - e))) + a1 - b1;

-  // The result is t1 + t2, after normalization.
-  c0 = e = t1 + t2;
-  c1 = t2 - (e - t1);
-} // dssub
+    // The result is t1 + t2, after normalization.
+    c0 = e = t1 + t2;
+    c1 = t2 - (e - t1);
+} // dssub

 #if 1

 // This function multiplies DS numbers A and B to yield the DS product C.
-__device__ inline void dsmul(float &c0, float &c1, const float a0,
-                             const float a1, const float b0, const float b1) {
-  // This splits dsa(1) and dsb(1) into high-order and low-order words.
-  float cona = a0 * 8193.0f;
-  float conb = b0 * 8193.0f;
-  float sa1 = cona - (cona - a0);
-  float sb1 = conb - (conb - b0);
-  float sa2 = a0 - sa1;
-  float sb2 = b0 - sb1;
+__device__ inline void dsmul(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
+{
+    // This splits dsa(1) and dsb(1) into high-order and low-order words.
+    float cona = a0 * 8193.0f;
+    float conb = b0 * 8193.0f;
+    float sa1 = cona - (cona - a0);
+    float sb1 = conb - (conb - b0);
+    float sa2 = a0 - sa1;
+    float sb2 = b0 - sb1;

-  // Multilply a0 * b0 using Dekker's method.
-  float c11 = a0 * b0;
-  float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;
+    // Multiply a0 * b0 using Dekker's method.
+    float c11 = a0 * b0;
+    float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;

-  // Compute a0 * b1 + a1 * b0 (only high-order word is needed).
-  float c2 = a0 * b1 + a1 * b0;
+    // Compute a0 * b1 + a1 * b0 (only high-order word is needed).
+    float c2 = a0 * b1 + a1 * b0;

-  // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product.
-  float t1 = c11 + c2;
-  float e = t1 - c11;
-  float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + a1 * b1;
+    // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product.
+    float t1 = c11 + c2;
+    float e = t1 - c11;
+    float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + a1 * b1;

-  // The result is t1 + t2, after normalization.
-  c0 = e = t1 + t2;
-  c1 = t2 - (e - t1);
-} // dsmul
+    // The result is t1 + t2, after normalization.
+    c0 = e = t1 + t2;
+    c1 = t2 - (e - t1);
+} // dsmul

 #else

@@ -118,32 +121,32 @@ __device__ inline void dsmul(float &c0, float &c1, const float a0,
  */

 // This function multiplies DS numbers A and B to yield the DS product C.
-__device__ inline void dsmul(float &c0, float &c1, const float a0,
-                             const float a1, const float b0, const float b1) {
-  // This splits dsa(1) and dsb(1) into high-order and low-order words.
-  float cona = a0 * 8193.0f;
-  float conb = b0 * 8193.0f;
-  float sa1 = cona - (cona - a0);
-  float sb1 = conb - (conb - b0);
-  float sa2 = a0 - sa1;
-  float sb2 = b0 - sb1;
+__device__ inline void dsmul(float &c0, float &c1, const float a0, const float a1, const float b0, const float b1)
+{
+    // This splits dsa(1) and dsb(1) into high-order and low-order words.
+    float cona = a0 * 8193.0f;
+    float conb = b0 * 8193.0f;
+    float sa1 = cona - (cona - a0);
+    float sb1 = conb - (conb - b0);
+    float sa2 = a0 - sa1;
+    float sb2 = b0 - sb1;

-  // Multilply a0 * b0 using Dekker's method.
-  float c11 = __fmul_rn(a0, b0);
-  float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;
+    // Multiply a0 * b0 using Dekker's method.
+    float c11 = __fmul_rn(a0, b0);
+    float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;

-  // Compute a0 * b1 + a1 * b0 (only high-order word is needed).
-  float c2 = __fmul_rn(a0, b1) + __fmul_rn(a1, b0);
+    // Compute a0 * b1 + a1 * b0 (only high-order word is needed).
+    float c2 = __fmul_rn(a0, b1) + __fmul_rn(a1, b0);

-  // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product.
-  float t1 = c11 + c2;
-  float e = t1 - c11;
-  float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + __fmul_rn(a1, b1);
+    // Compute (c11, c21) + c2 using Knuth's trick, also adding low-order product.
+    float t1 = c11 + c2;
+    float e = t1 - c11;
+    float t2 = ((c2 - e) + (c11 - (t1 - e))) + c21 + __fmul_rn(a1, b1);

-  // The result is t1 + t2, after normalization.
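// Aside (not part of the patch): 8193.0f = 2^13 + 1 is Dekker's split constant
// for a 24-bit float significand. The cona/sa1/sa2 lines above cut each factor
// into a high and a low part whose pairwise products are exact in float, so
// c11 + c21 reproduces a * b with no rounding error. A host-side check
// (assumes strict IEEE arithmetic: no fast-math, no FMA contraction):
#include <cstdio>

int main()
{
    float a = 1.2345678f, b = 9.8765432f;
    float cona = a * 8193.0f, conb = b * 8193.0f;
    float sa1 = cona - (cona - a), sa2 = a - sa1; // a == sa1 + sa2 exactly
    float sb1 = conb - (conb - b), sb2 = b - sb1; // b == sb1 + sb2 exactly
    float c11 = a * b;                            // rounded product
    float c21 = (((sa1 * sb1 - c11) + sa1 * sb2) + sa2 * sb1) + sa2 * sb2;
    // (double)a * (double)b is exact here: a 48-bit product fits in a double.
    printf("rounded product: %.17g\n", (double)c11);
    printf("c11 + c21      : %.17g\n", (double)c11 + (double)c21);
    printf("reference      : %.17g\n", (double)a * (double)b);
    return 0;
}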
+ c0 = e = t1 + t2; + c1 = t2 - (e - t1); +} // dsmul #endif @@ -151,306 +154,331 @@ __device__ inline void dsmul(float &c0, float &c1, const float a0, #if 1 // Unrolled version template -__device__ inline int CalcMandelbrot(const T xPos, const T yPos, - const T xJParam, const T yJParam, - const int crunch, const bool isJulia) { - T x, y, xx, yy; - int i = crunch; +__device__ inline int +CalcMandelbrot(const T xPos, const T yPos, const T xJParam, const T yJParam, const int crunch, const bool isJulia) +{ + T x, y, xx, yy; + int i = crunch; - T xC, yC; + T xC, yC; - if (isJulia) { - xC = xJParam; - yC = yJParam; - y = yPos; - x = xPos; - yy = y * y; - xx = x * x; + if (isJulia) { + xC = xJParam; + yC = yJParam; + y = yPos; + x = xPos; + yy = y * y; + xx = x * x; + } + else { + xC = xPos; + yC = yPos; + y = 0; + x = 0; + yy = 0; + xx = 0; + } - } else { - xC = xPos; - yC = yPos; - y = 0; - x = 0; - yy = 0; - xx = 0; - } + do { + // Iteration 1 + if (xx + yy > T(4.0)) + return i - 1; - do { - // Iteration 1 - if (xx + yy > T(4.0)) return i - 1; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 2 + if (xx + yy > T(4.0)) + return i - 2; - // Iteration 2 - if (xx + yy > T(4.0)) return i - 2; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 3 + if (xx + yy > T(4.0)) + return i - 3; - // Iteration 3 - if (xx + yy > T(4.0)) return i - 3; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 4 + if (xx + yy > T(4.0)) + return i - 4; - // Iteration 4 - if (xx + yy > T(4.0)) return i - 4; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 5 + if (xx + yy > T(4.0)) + return i - 5; - // Iteration 5 - if (xx + yy > T(4.0)) return i - 5; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 6 + if (xx + yy > T(4.0)) + return i - 6; - // Iteration 6 - if (xx + yy > T(4.0)) return i - 6; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 7 + if (xx + yy > T(4.0)) + return i - 7; - // Iteration 7 - if (xx + yy > T(4.0)) return i - 7; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 8 + if (xx + yy > T(4.0)) + return i - 8; - // Iteration 8 - if (xx + yy > T(4.0)) return i - 8; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 9 + if (xx + yy > T(4.0)) + return i - 9; - // Iteration 9 - if (xx + yy > T(4.0)) return i - 9; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 10 + if (xx + yy > T(4.0)) + return i - 10; - // Iteration 10 - if (xx + yy > T(4.0)) return i - 10; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; 
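// Aside (not part of the patch): the 20-way unrolled kernel in this hunk is
// the classic escape-time recurrence z <- z^2 + c, kept in real form. Tracking
// the running squares xx and yy makes the |z|^2 > 4 bailout and the next real
// part nearly free; the unrolling only amortizes loop overhead, and each
// `return i - N` yields the same remaining-iteration count the rolled loop
// produces. A compact equivalent under a hypothetical name (start z and c are
// parameters, which covers both the Mandelbrot and Julia initializations):
template <class T>
__device__ inline int calcMandelbrotRolled(T x, T y, T cx, T cy, int crunch)
{
    T   xx = x * x;
    T   yy = y * y;
    int i  = crunch;
    while (--i && (xx + yy < T(4.0))) {
        y  = x * y * T(2.0) + cy; // imaginary part: 2*x*y + cy
        x  = xx - yy + cx;        // real part: x^2 - y^2 + cx
        yy = y * y;
        xx = x * x;
    }
    return i; // caller maps this to an iteration count via crunch - i
}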
- xx = x * x; + // Iteration 11 + if (xx + yy > T(4.0)) + return i - 11; - // Iteration 11 - if (xx + yy > T(4.0)) return i - 11; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 12 + if (xx + yy > T(4.0)) + return i - 12; - // Iteration 12 - if (xx + yy > T(4.0)) return i - 12; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 13 + if (xx + yy > T(4.0)) + return i - 13; - // Iteration 13 - if (xx + yy > T(4.0)) return i - 13; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 14 + if (xx + yy > T(4.0)) + return i - 14; - // Iteration 14 - if (xx + yy > T(4.0)) return i - 14; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 15 + if (xx + yy > T(4.0)) + return i - 15; - // Iteration 15 - if (xx + yy > T(4.0)) return i - 15; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 16 + if (xx + yy > T(4.0)) + return i - 16; - // Iteration 16 - if (xx + yy > T(4.0)) return i - 16; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 17 + if (xx + yy > T(4.0)) + return i - 17; - // Iteration 17 - if (xx + yy > T(4.0)) return i - 17; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 18 + if (xx + yy > T(4.0)) + return i - 18; - // Iteration 18 - if (xx + yy > T(4.0)) return i - 18; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 19 + if (xx + yy > T(4.0)) + return i - 19; - // Iteration 19 - if (xx + yy > T(4.0)) return i - 19; + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; + // Iteration 20 + i -= 20; - // Iteration 20 - i -= 20; + if ((i <= 0) || (xx + yy > T(4.0))) + return i; - if ((i <= 0) || (xx + yy > T(4.0))) return i; - - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; - } while (1); -} // CalcMandelbrot + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; + } while (1); +} // CalcMandelbrot #else template -__device__ inline int CalcMandelbrot(const T xPos, const T yPos, - const T xJParam, const T yJParam, - const int crunch, const isJulia) { - T x, y, xx, yy, xC, yC; +__device__ inline int +CalcMandelbrot(const T xPos, const T yPos, const T xJParam, const T yJParam, const int crunch, const isJulia) +{ + T x, y, xx, yy, xC, yC; - if (isJulia) { - xC = xJParam; - yC = yJParam; - y = yPos; - x = xPos; - yy = y * y; - xx = x * x; + if (isJulia) { + xC = xJParam; + yC = yJParam; + y = yPos; + x = xPos; + yy = y * y; + xx = x * x; + } + else { + xC = xPos; + yC = yPos; + y = 0; + x = 0; + yy = 0; + xx = 0; + } - } else { - xC = xPos; - yC = yPos; - y = 0; - x = 0; - yy = 0; - xx = 0; - } + int i = crunch; - int i = crunch; + while (--i && (xx + yy 
< T(4.0))) { + y = x * y * T(2.0) + yC; + x = xx - yy + xC; + yy = y * y; + xx = x * x; + } - while (--i && (xx + yy < T(4.0))) { - y = x * y * T(2.0) + yC; - x = xx - yy + xC; - yy = y * y; - xx = x * x; - } - - return i; // i > 0 ? crunch - i : 0; -} // CalcMandelbrot + return i; // i > 0 ? crunch - i : 0; +} // CalcMandelbrot #endif // The core Mandelbrot calculation function in double-single precision -__device__ inline int CalcMandelbrotDS(const float xPos0, const float xPos1, - const float yPos0, const float yPos1, - const float xJParam, const float yJParam, - const int crunch, const bool isJulia) { - float xx0, xx1; - float yy0, yy1; - float sum0, sum1; - int i = crunch; +__device__ inline int CalcMandelbrotDS(const float xPos0, + const float xPos1, + const float yPos0, + const float yPos1, + const float xJParam, + const float yJParam, + const int crunch, + const bool isJulia) +{ + float xx0, xx1; + float yy0, yy1; + float sum0, sum1; + int i = crunch; - float x0, x1, y0, y1; - float xC0, xC1, yC0, yC1; + float x0, x1, y0, y1; + float xC0, xC1, yC0, yC1; - if (isJulia) { - xC0 = xJParam; - xC1 = 0; - yC0 = yJParam; - yC1 = 0; - y0 = yPos0; // y = yPos; - y1 = yPos1; - x0 = xPos0; // x = xPos; - x1 = xPos1; - dsmul(yy0, yy1, y0, y1, y0, y1); // yy = y * y; - dsmul(xx0, xx1, x0, x1, x0, x1); // xx = x * x; - } else { - xC0 = xPos0; - xC1 = xPos1; - yC0 = yPos0; - yC1 = yPos1; - y0 = 0; // y = 0 ; - y1 = 0; - x0 = 0; // x = 0 ; - x1 = 0; - yy0 = 0; // yy = 0 ; - yy1 = 0; - xx0 = 0; // xx = 0 ; - xx1 = 0; - } + if (isJulia) { + xC0 = xJParam; + xC1 = 0; + yC0 = yJParam; + yC1 = 0; + y0 = yPos0; // y = yPos; + y1 = yPos1; + x0 = xPos0; // x = xPos; + x1 = xPos1; + dsmul(yy0, yy1, y0, y1, y0, y1); // yy = y * y; + dsmul(xx0, xx1, x0, x1, x0, x1); // xx = x * x; + } + else { + xC0 = xPos0; + xC1 = xPos1; + yC0 = yPos0; + yC1 = yPos1; + y0 = 0; // y = 0 ; + y1 = 0; + x0 = 0; // x = 0 ; + x1 = 0; + yy0 = 0; // yy = 0 ; + yy1 = 0; + xx0 = 0; // xx = 0 ; + xx1 = 0; + } - dsadd(sum0, sum1, xx0, xx1, yy0, yy1); // sum = xx + yy; + dsadd(sum0, sum1, xx0, xx1, yy0, yy1); // sum = xx + yy; - while (--i && (sum0 + sum1 < 4.0f)) { - dsmul(y0, y1, x0, x1, y0, y1); // y = x * y * 2.0f + yC; // yC is yPos for - // Mandelbrot and it is yJParam for Julia - dsadd(y0, y1, y0, y1, y0, y1); - dsadd(y0, y1, y0, y1, yC0, yC1); + while (--i && (sum0 + sum1 < 4.0f)) { + dsmul(y0, y1, x0, x1, y0, y1); // y = x * y * 2.0f + yC; // yC is yPos for + // Mandelbrot and it is yJParam for Julia + dsadd(y0, y1, y0, y1, y0, y1); + dsadd(y0, y1, y0, y1, yC0, yC1); - dssub(x0, x1, xx0, xx1, yy0, yy1); // x = xx - yy + xC; // xC is xPos for - // Mandelbrot and it is xJParam for - // Julia - dsadd(x0, x1, x0, x1, xC0, xC1); + dssub(x0, x1, xx0, xx1, yy0, yy1); // x = xx - yy + xC; // xC is xPos for + // Mandelbrot and it is xJParam for + // Julia + dsadd(x0, x1, x0, x1, xC0, xC1); - dsmul(yy0, yy1, y0, y1, y0, y1); // yy = y * y; - dsmul(xx0, xx1, x0, x1, x0, x1); // xx = x * x; - dsadd(sum0, sum1, xx0, xx1, yy0, yy1); // sum = xx + yy; - } + dsmul(yy0, yy1, y0, y1, y0, y1); // yy = y * y; + dsmul(xx0, xx1, x0, x1, x0, x1); // xx = x * x; + dsadd(sum0, sum1, xx0, xx1, yy0, yy1); // sum = xx + yy; + } - return i; -} // CalcMandelbrotDS + return i; +} // CalcMandelbrotDS // Determine if two pixel colors are within tolerance -__device__ inline int CheckColors(const uchar4 &color0, const uchar4 &color1) { - int x = color1.x - color0.x; - int y = color1.y - color0.y; - int z = color1.z - color0.z; - return (ABS(x) > 10) || (ABS(y) 
> 10) || (ABS(z) > 10); -} // CheckColors +__device__ inline int CheckColors(const uchar4 &color0, const uchar4 &color1) +{ + int x = color1.x - color0.x; + int y = color1.y - color0.y; + int z = color1.z - color0.z; + return (ABS(x) > 10) || (ABS(y) > 10) || (ABS(z) > 10); +} // CheckColors // Increase the grid size by 1 if the image width or height does not divide // evenly // by the thread block dimensions -inline int iDivUp(int a, int b) { - return ((a % b) != 0) ? (a / b + 1) : (a / b); -} // iDivUp +inline int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); } // iDivUp diff --git a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.h b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.h index af3cd776..230ad530 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.h +++ b/Samples/5_Domain_Specific/Mandelbrot/Mandelbrot_kernel.h @@ -30,21 +30,37 @@ #include -extern "C" void RunMandelbrot0(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const double xOff, - const double yOff, const double xjp, - const double yjp, const double scale, - const uchar4 colors, const int frame, - const int animationFrame, const int mode, - const int numSMs, const bool isJ, - int version = 13); -extern "C" void RunMandelbrot1(uchar4 *dst, const int imageW, const int imageH, - const int crunch, const double xOff, - const double yOff, const double xjp, - const double yjp, const double scale, - const uchar4 colors, const int frame, - const int animationFrame, const int mode, - const int numSMs, const bool isJ, - int version = 13); +extern "C" void RunMandelbrot0(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xjp, + const double yjp, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int mode, + const int numSMs, + const bool isJ, + int version = 13); +extern "C" void RunMandelbrot1(uchar4 *dst, + const int imageW, + const int imageH, + const int crunch, + const double xOff, + const double yOff, + const double xjp, + const double yjp, + const double scale, + const uchar4 colors, + const int frame, + const int animationFrame, + const int mode, + const int numSMs, + const bool isJ, + int version = 13); #endif diff --git a/Samples/5_Domain_Specific/Mandelbrot/README.md b/Samples/5_Domain_Specific/Mandelbrot/README.md index cf973bae..db51c422 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/README.md +++ b/Samples/5_Domain_Specific/Mandelbrot/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarloMultiGPU.cpp b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarloMultiGPU.cpp index 71b3f8a1..989c210d 100644 --- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarloMultiGPU.cpp +++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarloMultiGPU.cpp @@ -31,20 +31,20 @@ * See supplied whitepaper for more explanations. 
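// Aside (not part of the patch): a minimal host-only illustration of the
// Monte Carlo estimate the MonteCarloMultiGPU sample below computes on the
// GPU (the sample also exposes BlackScholesCall as a closed-form reference).
// Under Black-Scholes assumptions, S_T = S0 * exp((r - v^2/2)T + v*sqrt(T)*Z)
// with Z ~ N(0,1), and the call price is the discounted mean payoff. All
// names and parameter values here are illustrative, not the sample's API.
#include <cmath>
#include <cstdio>
#include <random>

int main()
{
    const double S0 = 100.0, K = 100.0, r = 0.05, v = 0.2, T = 1.0;
    const int    pathN = 1 << 20;
    std::mt19937 rng(42);
    std::normal_distribution<double> gauss(0.0, 1.0);

    double sum = 0.0;
    for (int i = 0; i < pathN; i++) {
        double ST = S0 * std::exp((r - 0.5 * v * v) * T + v * std::sqrt(T) * gauss(rng));
        sum += std::fmax(ST - K, 0.0); // European call payoff
    }
    double price = std::exp(-r * T) * (sum / pathN);
    printf("MC call price ~= %.4f (Black-Scholes reference ~ 10.45)\n", price);
    return 0;
}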
*/ -#include -#include -#include -#include #include +#include +#include +#include +#include // includes, project -#include // Helper functions (utilities, parsing, timing) -#include // helper functions (cuda error checking and initialization) +#include // helper functions (cuda error checking and initialization) +#include // Helper functions (utilities, parsing, timing) #include #include "MonteCarlo_common.h" -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #ifdef WIN32 @@ -54,42 +54,43 @@ char **pArgv = NULL; //////////////////////////////////////////////////////////////////////////////// // Common functions //////////////////////////////////////////////////////////////////////////////// -float randFloat(float low, float high) { - float t = (float)rand() / (float)RAND_MAX; - return (1.0f - t) * low + t * high; +float randFloat(float low, float high) +{ + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; } /// Utility function to tweak problem size for small GPUs -int adjustProblemSize(int GPU_N, int default_nOptions) { - int nOptions = default_nOptions; +int adjustProblemSize(int GPU_N, int default_nOptions) +{ + int nOptions = default_nOptions; - // select problem size - for (int i = 0; i < GPU_N; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); - int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - deviceProp.multiProcessorCount; + // select problem size + for (int i = 0; i < GPU_N; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i)); + int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; - if (cudaCores <= 32) { - nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2); + if (cudaCores <= 32) { + nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2); + } } - } - return nOptions; + return nOptions; } -int adjustGridSize(int GPUIndex, int defaultGridSize) { - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex)); - int maxGridSize = deviceProp.multiProcessorCount * 40; - return ((defaultGridSize > maxGridSize) ? maxGridSize : defaultGridSize); +int adjustGridSize(int GPUIndex, int defaultGridSize) +{ + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex)); + int maxGridSize = deviceProp.multiProcessorCount * 40; + return ((defaultGridSize > maxGridSize) ? 
maxGridSize : defaultGridSize); } /////////////////////////////////////////////////////////////////////////////// // CPU reference functions /////////////////////////////////////////////////////////////////////////////// -extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData, - float *h_Random, int pathN); +extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData, float *h_Random, int pathN); // Black-Scholes formula for call options extern "C" void BlackScholesCall(float &CallResult, TOptionData optionData); @@ -100,98 +101,99 @@ extern "C" void BlackScholesCall(float &CallResult, TOptionData optionData); // Timer StopWatchInterface **hTimer = NULL; -static CUT_THREADPROC solverThread(TOptionPlan *plan) { - // Init GPU - checkCudaErrors(cudaSetDevice(plan->device)); - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device)); - - // Start the timer - sdkStartTimer(&hTimer[plan->device]); - - // Allocate intermediate memory for MC integrator and initialize - // RNG states - initMonteCarloGPU(plan); - - // Main computation - MonteCarloGPU(plan); - - checkCudaErrors(cudaDeviceSynchronize()); - - // Stop the timer - sdkStopTimer(&hTimer[plan->device]); - - // Shut down this GPU - closeMonteCarloGPU(plan); - - cudaStreamSynchronize(0); - - printf("solverThread() finished - GPU Device %d: %s\n", plan->device, - deviceProp.name); - - CUT_THREADEND; -} - -static void multiSolver(TOptionPlan *plan, int nPlans) { - // allocate and initialize an array of stream handles - cudaStream_t *streams = (cudaStream_t *)malloc(nPlans * sizeof(cudaStream_t)); - cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t)); - - for (int i = 0; i < nPlans; i++) { - checkCudaErrors(cudaSetDevice(plan[i].device)); - checkCudaErrors(cudaStreamCreate(&(streams[i]))); - checkCudaErrors(cudaEventCreate(&(events[i]))); - } - - // Init Each GPU - // In CUDA 4.0 we can call cudaSetDevice multiple times to target each device - // Set the device desired, then perform initializations on that device - - for (int i = 0; i < nPlans; i++) { - // set the target device to perform initialization on - checkCudaErrors(cudaSetDevice(plan[i].device)); +static CUT_THREADPROC solverThread(TOptionPlan *plan) +{ + // Init GPU + checkCudaErrors(cudaSetDevice(plan->device)); cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device)); - // Allocate intermediate memory for MC integrator - // and initialize RNG state - initMonteCarloGPU(&plan[i]); - } + // Start the timer + sdkStartTimer(&hTimer[plan->device]); + + // Allocate intermediate memory for MC integrator and initialize + // RNG states + initMonteCarloGPU(plan); + + // Main computation + MonteCarloGPU(plan); - for (int i = 0; i < nPlans; i++) { - checkCudaErrors(cudaSetDevice(plan[i].device)); checkCudaErrors(cudaDeviceSynchronize()); - } - // Start the timer - sdkResetTimer(&hTimer[0]); - sdkStartTimer(&hTimer[0]); + // Stop the timer + sdkStopTimer(&hTimer[plan->device]); - for (int i = 0; i < nPlans; i++) { - checkCudaErrors(cudaSetDevice(plan[i].device)); + // Shut down this GPU + closeMonteCarloGPU(plan); - // Main computations - MonteCarloGPU(&plan[i], streams[i]); + cudaStreamSynchronize(0); - checkCudaErrors(cudaEventRecord(events[i], streams[i])); - } + printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name); - for (int i = 0; i < nPlans; 
i++) { - checkCudaErrors(cudaSetDevice(plan[i].device)); - cudaEventSynchronize(events[i]); - } + CUT_THREADEND; +} - // Stop the timer - sdkStopTimer(&hTimer[0]); +static void multiSolver(TOptionPlan *plan, int nPlans) +{ + // allocate and initialize an array of stream handles + cudaStream_t *streams = (cudaStream_t *)malloc(nPlans * sizeof(cudaStream_t)); + cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t)); - for (int i = 0; i < nPlans; i++) { - checkCudaErrors(cudaSetDevice(plan[i].device)); - closeMonteCarloGPU(&plan[i]); - checkCudaErrors(cudaStreamDestroy(streams[i])); - checkCudaErrors(cudaEventDestroy(events[i])); - } + for (int i = 0; i < nPlans; i++) { + checkCudaErrors(cudaSetDevice(plan[i].device)); + checkCudaErrors(cudaStreamCreate(&(streams[i]))); + checkCudaErrors(cudaEventCreate(&(events[i]))); + } + + // Init Each GPU + // In CUDA 4.0 we can call cudaSetDevice multiple times to target each device + // Set the device desired, then perform initializations on that device + + for (int i = 0; i < nPlans; i++) { + // set the target device to perform initialization on + checkCudaErrors(cudaSetDevice(plan[i].device)); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan[i].device)); + + // Allocate intermediate memory for MC integrator + // and initialize RNG state + initMonteCarloGPU(&plan[i]); + } + + for (int i = 0; i < nPlans; i++) { + checkCudaErrors(cudaSetDevice(plan[i].device)); + checkCudaErrors(cudaDeviceSynchronize()); + } + + // Start the timer + sdkResetTimer(&hTimer[0]); + sdkStartTimer(&hTimer[0]); + + for (int i = 0; i < nPlans; i++) { + checkCudaErrors(cudaSetDevice(plan[i].device)); + + // Main computations + MonteCarloGPU(&plan[i], streams[i]); + + checkCudaErrors(cudaEventRecord(events[i], streams[i])); + } + + for (int i = 0; i < nPlans; i++) { + checkCudaErrors(cudaSetDevice(plan[i].device)); + cudaEventSynchronize(events[i]); + } + + // Stop the timer + sdkStopTimer(&hTimer[0]); + + for (int i = 0; i < nPlans; i++) { + checkCudaErrors(cudaSetDevice(plan[i].device)); + closeMonteCarloGPU(&plan[i]); + checkCudaErrors(cudaStreamDestroy(streams[i])); + checkCudaErrors(cudaEventDestroy(events[i])); + } } /////////////////////////////////////////////////////////////////////////////// @@ -203,285 +205,279 @@ static void multiSolver(TOptionPlan *plan, int nPlans) { #define PRINT_RESULTS #undef PRINT_RESULTS -void usage() { - printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n"); - printf("Method=threaded: 1 CPU thread for each GPU [default]\n"); - printf( - " streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or " - "newer)\n"); - printf("Scaling=strong : constant problem size\n"); - printf( - " weak : problem size scales with number of available GPUs " - "[default]\n"); +void usage() +{ + printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n"); + printf("Method=threaded: 1 CPU thread for each GPU [default]\n"); + printf(" streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or " + "newer)\n"); + printf("Scaling=strong : constant problem size\n"); + printf(" weak : problem size scales with number of available GPUs " + "[default]\n"); } -int main(int argc, char **argv) { - char *multiMethodChoice = NULL; - char *scalingChoice = NULL; - bool use_threads = true; - bool bqatest = false; - bool strongScaling = false; +int main(int argc, char **argv) +{ + char *multiMethodChoice = NULL; + char *scalingChoice = NULL; + bool use_threads = true; + bool bqatest = 
false; + bool strongScaling = false; - pArgc = &argc; - pArgv = argv; + pArgc = &argc; + pArgv = argv; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { - bqatest = true; - } - - getCmdLineArgumentString(argc, (const char **)argv, "method", - &multiMethodChoice); - getCmdLineArgumentString(argc, (const char **)argv, "scaling", - &scalingChoice); - - if (checkCmdLineFlag(argc, (const char **)argv, "h") || - checkCmdLineFlag(argc, (const char **)argv, "help")) { - usage(); - exit(EXIT_SUCCESS); - } - - if (multiMethodChoice == NULL) { - use_threads = false; - } else { - if (!strcasecmp(multiMethodChoice, "threaded")) { - use_threads = true; - } else { - use_threads = false; - } - } - - if (use_threads == false) { - printf("Using single CPU thread for multiple GPUs\n"); - } - - if (scalingChoice == NULL) { - strongScaling = false; - } else { - if (!strcasecmp(scalingChoice, "strong")) { - strongScaling = true; - } else { - strongScaling = false; - } - } - - // GPU number present in the system - int GPU_N; - checkCudaErrors(cudaGetDeviceCount(&GPU_N)); - int nOptions = 8 * 1024; - - nOptions = adjustProblemSize(GPU_N, nOptions); - - // select problem size - int scale = (strongScaling) ? 1 : GPU_N; - int OPT_N = nOptions * scale; - int PATH_N = 262144; - - // initialize the timers - hTimer = new StopWatchInterface *[GPU_N]; - - for (int i = 0; i < GPU_N; i++) { - sdkCreateTimer(&hTimer[i]); - sdkResetTimer(&hTimer[i]); - } - - // Input data array - TOptionData *optionData = new TOptionData[OPT_N]; - // Final GPU MC results - TOptionValue *callValueGPU = new TOptionValue[OPT_N]; - //"Theoretical" call values by Black-Scholes formula - float *callValueBS = new float[OPT_N]; - // Solver config - TOptionPlan *optionSolver = new TOptionPlan[GPU_N]; - // OS thread ID - CUTThread *threadID = new CUTThread[GPU_N]; - - int gpuBase, gpuIndex; - int i; - - float time; - - double delta, ref, sumDelta, sumRef, sumReserve; - - printf("MonteCarloMultiGPU\n"); - printf("==================\n"); - printf("Parallelization method = %s\n", - use_threads ? "threaded" : "streamed"); - printf("Problem scaling = %s\n", strongScaling ? 
"strong" : "weak"); - printf("Number of GPUs = %d\n", GPU_N); - printf("Total number of options = %d\n", OPT_N); - printf("Number of paths = %d\n", PATH_N); - - printf("main(): generating input data...\n"); - srand(123); - - for (i = 0; i < OPT_N; i++) { - optionData[i].S = randFloat(5.0f, 50.0f); - optionData[i].X = randFloat(10.0f, 25.0f); - optionData[i].T = randFloat(1.0f, 5.0f); - optionData[i].R = 0.06f; - optionData[i].V = 0.10f; - callValueGPU[i].Expected = -1.0f; - callValueGPU[i].Confidence = -1.0f; - } - - printf("main(): starting %i host threads...\n", GPU_N); - - // Get option count for each GPU - for (i = 0; i < GPU_N; i++) { - optionSolver[i].optionCount = OPT_N / GPU_N; - } - - // Take into account cases with "odd" option counts - for (i = 0; i < (OPT_N % GPU_N); i++) { - optionSolver[i].optionCount++; - } - - // Assign GPU option ranges - gpuBase = 0; - - for (i = 0; i < GPU_N; i++) { - optionSolver[i].device = i; - optionSolver[i].optionData = optionData + gpuBase; - optionSolver[i].callValue = callValueGPU + gpuBase; - optionSolver[i].pathN = PATH_N; - optionSolver[i].gridSize = - adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount); - gpuBase += optionSolver[i].optionCount; - } - - if (use_threads || bqatest) { - // Start CPU thread for each GPU - for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) { - threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, - &optionSolver[gpuIndex]); + if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + bqatest = true; } - printf("main(): waiting for GPU results...\n"); - cutWaitForThreads(threadID, GPU_N); + getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice); + getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice); - printf("main(): GPU statistics, threaded\n"); - - for (i = 0; i < GPU_N; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors( - cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); - printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); - printf("Options : %i\n", optionSolver[i].optionCount); - printf("Simulation paths: %i\n", optionSolver[i].pathN); - time = sdkGetTimerValue(&hTimer[i]); - printf("Total time (ms.): %f\n", time); - printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); + if (checkCmdLineFlag(argc, (const char **)argv, "h") || checkCmdLineFlag(argc, (const char **)argv, "help")) { + usage(); + exit(EXIT_SUCCESS); } - printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); - sumDelta = 0; - sumRef = 0; - sumReserve = 0; + if (multiMethodChoice == NULL) { + use_threads = false; + } + else { + if (!strcasecmp(multiMethodChoice, "threaded")) { + use_threads = true; + } + else { + use_threads = false; + } + } + + if (use_threads == false) { + printf("Using single CPU thread for multiple GPUs\n"); + } + + if (scalingChoice == NULL) { + strongScaling = false; + } + else { + if (!strcasecmp(scalingChoice, "strong")) { + strongScaling = true; + } + else { + strongScaling = false; + } + } + + // GPU number present in the system + int GPU_N; + checkCudaErrors(cudaGetDeviceCount(&GPU_N)); + int nOptions = 8 * 1024; + + nOptions = adjustProblemSize(GPU_N, nOptions); + + // select problem size + int scale = (strongScaling) ? 
1 : GPU_N; + int OPT_N = nOptions * scale; + int PATH_N = 262144; + + // initialize the timers + hTimer = new StopWatchInterface *[GPU_N]; + + for (int i = 0; i < GPU_N; i++) { + sdkCreateTimer(&hTimer[i]); + sdkResetTimer(&hTimer[i]); + } + + // Input data array + TOptionData *optionData = new TOptionData[OPT_N]; + // Final GPU MC results + TOptionValue *callValueGPU = new TOptionValue[OPT_N]; + //"Theoretical" call values by Black-Scholes formula + float *callValueBS = new float[OPT_N]; + // Solver config + TOptionPlan *optionSolver = new TOptionPlan[GPU_N]; + // OS thread ID + CUTThread *threadID = new CUTThread[GPU_N]; + + int gpuBase, gpuIndex; + int i; + + float time; + + double delta, ref, sumDelta, sumRef, sumReserve; + + printf("MonteCarloMultiGPU\n"); + printf("==================\n"); + printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed"); + printf("Problem scaling = %s\n", strongScaling ? "strong" : "weak"); + printf("Number of GPUs = %d\n", GPU_N); + printf("Total number of options = %d\n", OPT_N); + printf("Number of paths = %d\n", PATH_N); + + printf("main(): generating input data...\n"); + srand(123); for (i = 0; i < OPT_N; i++) { - BlackScholesCall(callValueBS[i], optionData[i]); - delta = fabs(callValueBS[i] - callValueGPU[i].Expected); - ref = callValueBS[i]; - sumDelta += delta; - sumRef += fabs(ref); - - if (delta > 1e-6) { - sumReserve += callValueGPU[i].Confidence / delta; - } - -#ifdef PRINT_RESULTS - printf("BS: %f; delta: %E\n", callValueBS[i], delta); -#endif + optionData[i].S = randFloat(5.0f, 50.0f); + optionData[i].X = randFloat(10.0f, 25.0f); + optionData[i].T = randFloat(1.0f, 5.0f); + optionData[i].R = 0.06f; + optionData[i].V = 0.10f; + callValueGPU[i].Expected = -1.0f; + callValueGPU[i].Confidence = -1.0f; } - sumReserve /= OPT_N; - } + printf("main(): starting %i host threads...\n", GPU_N); - if (!use_threads || bqatest) { - multiSolver(optionSolver, GPU_N); + // Get option count for each GPU + for (i = 0; i < GPU_N; i++) { + optionSolver[i].optionCount = OPT_N / GPU_N; + } - printf("main(): GPU statistics, streamed\n"); + // Take into account cases with "odd" option counts + for (i = 0; i < (OPT_N % GPU_N); i++) { + optionSolver[i].optionCount++; + } + + // Assign GPU option ranges + gpuBase = 0; for (i = 0; i < GPU_N; i++) { - cudaDeviceProp deviceProp; - checkCudaErrors( - cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); - printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); - printf("Options : %i\n", optionSolver[i].optionCount); - printf("Simulation paths: %i\n", optionSolver[i].pathN); + optionSolver[i].device = i; + optionSolver[i].optionData = optionData + gpuBase; + optionSolver[i].callValue = callValueGPU + gpuBase; + optionSolver[i].pathN = PATH_N; + optionSolver[i].gridSize = adjustGridSize(optionSolver[i].device, optionSolver[i].optionCount); + gpuBase += optionSolver[i].optionCount; } - time = sdkGetTimerValue(&hTimer[0]); - printf("\nTotal time (ms.): %f\n", time); - printf("\tNote: This is elapsed time for all to compute.\n"); - printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); + if (use_threads || bqatest) { + // Start CPU thread for each GPU + for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) { + threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); + } - printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); - sumDelta = 0; - sumRef = 0; - sumReserve = 0; + printf("main(): waiting for GPU results...\n"); + 
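// [Editorial sketch, not part of the patch] The solverThread/cutStartThread pattern
// used above, reduced to its essentials with std::thread instead of the sample's
// CUTThread wrappers. The empty kernel stands in for the per-device Monte Carlo
// work, and error checking is omitted for brevity.

#include <cuda_runtime.h>
#include <thread>
#include <vector>

__global__ void deviceWork() {}

static void perDeviceWorker(int device)
{
    cudaSetDevice(device);   // bind this host thread to one GPU
    deviceWork<<<1, 1>>>();  // launch that GPU's share of the work
    cudaDeviceSynchronize(); // each thread waits only on its own device
}

int main()
{
    int gpuN = 0;
    cudaGetDeviceCount(&gpuN);
    std::vector<std::thread> workers;
    for (int i = 0; i < gpuN; i++)
        workers.emplace_back(perDeviceWorker, i); // one host thread per GPU
    for (auto &w : workers)
        w.join(); // the equivalent of cutWaitForThreads()
    return 0;
}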
cutWaitForThreads(threadID, GPU_N); - for (i = 0; i < OPT_N; i++) { - BlackScholesCall(callValueBS[i], optionData[i]); - delta = fabs(callValueBS[i] - callValueGPU[i].Expected); - ref = callValueBS[i]; - sumDelta += delta; - sumRef += fabs(ref); + printf("main(): GPU statistics, threaded\n"); - if (delta > 1e-6) { - sumReserve += callValueGPU[i].Confidence / delta; - } + for (i = 0; i < GPU_N; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); + printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); + printf("Options : %i\n", optionSolver[i].optionCount); + printf("Simulation paths: %i\n", optionSolver[i].pathN); + time = sdkGetTimerValue(&hTimer[i]); + printf("Total time (ms.): %f\n", time); + printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); + } + + printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); + sumDelta = 0; + sumRef = 0; + sumReserve = 0; + + for (i = 0; i < OPT_N; i++) { + BlackScholesCall(callValueBS[i], optionData[i]); + delta = fabs(callValueBS[i] - callValueGPU[i].Expected); + ref = callValueBS[i]; + sumDelta += delta; + sumRef += fabs(ref); + + if (delta > 1e-6) { + sumReserve += callValueGPU[i].Confidence / delta; + } #ifdef PRINT_RESULTS - printf("BS: %f; delta: %E\n", callValueBS[i], delta); + printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif + } + + sumReserve /= OPT_N; } - sumReserve /= OPT_N; - } + if (!use_threads || bqatest) { + multiSolver(optionSolver, GPU_N); + + printf("main(): GPU statistics, streamed\n"); + + for (i = 0; i < GPU_N; i++) { + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); + printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); + printf("Options : %i\n", optionSolver[i].optionCount); + printf("Simulation paths: %i\n", optionSolver[i].pathN); + } + + time = sdkGetTimerValue(&hTimer[0]); + printf("\nTotal time (ms.): %f\n", time); + printf("\tNote: This is elapsed time for all to compute.\n"); + printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); + + printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); + sumDelta = 0; + sumRef = 0; + sumReserve = 0; + + for (i = 0; i < OPT_N; i++) { + BlackScholesCall(callValueBS[i], optionData[i]); + delta = fabs(callValueBS[i] - callValueGPU[i].Expected); + ref = callValueBS[i]; + sumDelta += delta; + sumRef += fabs(ref); + + if (delta > 1e-6) { + sumReserve += callValueGPU[i].Confidence / delta; + } + +#ifdef PRINT_RESULTS + printf("BS: %f; delta: %E\n", callValueBS[i], delta); +#endif + } + + sumReserve /= OPT_N; + } #ifdef DO_CPU - printf("main(): running CPU MonteCarlo...\n"); - TOptionValue callValueCPU; - sumDelta = 0; - sumRef = 0; + printf("main(): running CPU MonteCarlo...\n"); + TOptionValue callValueCPU; + sumDelta = 0; + sumRef = 0; - for (i = 0; i < OPT_N; i++) { - MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N); - delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); - ref = callValueCPU.Expected; - sumDelta += delta; - sumRef += fabs(ref); - printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); - printf("Conf: %f | %f\n", callValueCPU.Confidence, - callValueGPU[i].Confidence); - } + for (i = 0; i < OPT_N; i++) { + MonteCarloCPU(callValueCPU, optionData[i], NULL, PATH_N); + delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); + ref = callValueCPU.Expected; + sumDelta += delta; + sumRef += fabs(ref); + printf("Exp : %f | 
%f\t", callValueCPU.Expected, callValueGPU[i].Expected); + printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); + } - printf("L1 norm: %E\n", sumDelta / sumRef); + printf("L1 norm: %E\n", sumDelta / sumRef); #endif - printf("Shutting down...\n"); + printf("Shutting down...\n"); - for (int i = 0; i < GPU_N; i++) { - sdkStartTimer(&hTimer[i]); - checkCudaErrors(cudaSetDevice(i)); - } + for (int i = 0; i < GPU_N; i++) { + sdkStartTimer(&hTimer[i]); + checkCudaErrors(cudaSetDevice(i)); + } - delete[] optionSolver; - delete[] callValueBS; - delete[] callValueGPU; - delete[] optionData; - delete[] threadID; - delete[] hTimer; + delete[] optionSolver; + delete[] callValueBS; + delete[] callValueGPU; + delete[] optionData; + delete[] threadID; + delete[] hTimer; - printf("Test Summary...\n"); - printf("L1 norm : %E\n", sumDelta / sumRef); - printf("Average reserve: %f\n", sumReserve); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); - printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n"); - exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE); + printf("Test Summary...\n"); + printf("L1 norm : %E\n", sumDelta / sumRef); + printf("Average reserve: %f\n", sumReserve); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n\n"); + printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n"); + exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_common.h b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_common.h index 562407a8..da929829 100644 --- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_common.h +++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_common.h @@ -27,68 +27,71 @@ #ifndef MONTECARLO_COMMON_H #define MONTECARLO_COMMON_H -#include "realtype.h" #include "curand_kernel.h" +#include "realtype.h" //////////////////////////////////////////////////////////////////////////////// // Global types //////////////////////////////////////////////////////////////////////////////// -typedef struct { - float S; - float X; - float T; - float R; - float V; +typedef struct +{ + float S; + float X; + float T; + float R; + float V; } TOptionData; typedef struct - //#ifdef __CUDACC__ - //__align__(8) - //#endif - { - float Expected; - float Confidence; +// #ifdef __CUDACC__ +//__align__(8) +// #endif +{ + float Expected; + float Confidence; } TOptionValue; // GPU outputs before CPU postprocessing -typedef struct { - real Expected; - real Confidence; +typedef struct +{ + real Expected; + real Confidence; } __TOptionValue; -typedef struct { - // Device ID for multi-GPU version - int device; - // Option count for this plan - int optionCount; +typedef struct +{ + // Device ID for multi-GPU version + int device; + // Option count for this plan + int optionCount; - // Host-side data source and result destination - TOptionData *optionData; - TOptionValue *callValue; + // Host-side data source and result destination + TOptionData *optionData; + TOptionValue *callValue; - // Temporary Host-side pinned memory for async + faster data transfers - __TOptionValue *h_CallValue; + // Temporary Host-side pinned memory for async + faster data transfers + __TOptionValue *h_CallValue; - // Device- and host-side option data - void *d_OptionData; - void *h_OptionData; + // Device- and host-side option data + void *d_OptionData; + 
void *h_OptionData;

-  // Device-side option values
-  void *d_CallValue;
+    // Device-side option values
+    void *d_CallValue;

-  // Intermediate device-side buffers
-  void *d_Buffer;
+    // Intermediate device-side buffers
+    void *d_Buffer;

-  // random number generator states
-  curandState *rngStates;
+    // random number generator states
+    curandState *rngStates;

-  // Pseudorandom samples count
-  int pathN;
+    // Pseudorandom samples count
+    int pathN;

-  // Time stamp
-  float time;
+    // Time stamp
+    float time;

-  int gridSize;
+    int gridSize;
 } TOptionPlan;

 extern "C" void initMonteCarloGPU(TOptionPlan *plan);
diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_gold.cpp b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_gold.cpp
index 369bb9f8..6dfe83d6 100644
--- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_gold.cpp
+++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_gold.cpp
@@ -25,13 +25,12 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+#include <curand.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
-#include <curand.h>
-
-//#include "curand_kernel.h"
+// #include "curand_kernel.h"
 #include "helper_cuda.h"

 ////////////////////////////////////////////////////////////////////////////////
@@ -42,99 +41,101 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Black-Scholes formula for Monte Carlo results validation
 ////////////////////////////////////////////////////////////////////////////////
-#define A1 0.31938153
-#define A2 -0.356563782
-#define A3 1.781477937
-#define A4 -1.821255978
-#define A5 1.330274429
+#define A1 0.31938153
+#define A2 -0.356563782
+#define A3 1.781477937
+#define A4 -1.821255978
+#define A5 1.330274429
 #define RSQRT2PI 0.39894228040143267793994605993438

 // Polynomial approximation of
 // cumulative normal distribution function
-double CND(double d) {
-  double K = 1.0 / (1.0 + 0.2316419 * fabs(d));
+double CND(double d)
+{
+    double K = 1.0 / (1.0 + 0.2316419 * fabs(d));

-  double cnd = RSQRT2PI * exp(-0.5 * d * d) *
-               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+    double cnd = RSQRT2PI * exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));

-  if (d > 0) cnd = 1.0 - cnd;
+    if (d > 0)
+        cnd = 1.0 - cnd;

-  return cnd;
+    return cnd;
 }

 // Black-Scholes formula for call value
-extern "C" void BlackScholesCall(float &callValue, TOptionData optionData) {
-  double S = optionData.S;
-  double X = optionData.X;
-  double T = optionData.T;
-  double R = optionData.R;
-  double V = optionData.V;
+extern "C" void BlackScholesCall(float &callValue, TOptionData optionData)
+{
+    double S = optionData.S;
+    double X = optionData.X;
+    double T = optionData.T;
+    double R = optionData.R;
+    double V = optionData.V;

-  double sqrtT = sqrt(T);
-  double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
-  double d2 = d1 - V * sqrtT;
-  double CNDD1 = CND(d1);
-  double CNDD2 = CND(d2);
-  double expRT = exp(-R * T);
+    double sqrtT = sqrt(T);
+    double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
+    double d2 = d1 - V * sqrtT;
+    double CNDD1 = CND(d1);
+    double CNDD2 = CND(d2);
+    double expRT = exp(-R * T);

-  callValue = (float)(S * CNDD1 - X * expRT * CNDD2);
+    callValue = (float)(S * CNDD1 - X * expRT * CNDD2);
 }

 ////////////////////////////////////////////////////////////////////////////////
 // CPU Monte Carlo
 ////////////////////////////////////////////////////////////////////////////////
-static double endCallValue(double S, double X, double r, double MuByT,
-                           double
VBySqrtT) { - double callValue = S * exp(MuByT + VBySqrtT * r) - X; - return (callValue > 0) ? callValue : 0; +static double endCallValue(double S, double X, double r, double MuByT, double VBySqrtT) +{ + double callValue = S * exp(MuByT + VBySqrtT * r) - X; + return (callValue > 0) ? callValue : 0; } -extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData, - float *h_Samples, int pathN) { - const double S = optionData.S; - const double X = optionData.X; - const double T = optionData.T; - const double R = optionData.R; - const double V = optionData.V; - const double MuByT = (R - 0.5 * V * V) * T; - const double VBySqrtT = V * sqrt(T); +extern "C" void MonteCarloCPU(TOptionValue &callValue, TOptionData optionData, float *h_Samples, int pathN) +{ + const double S = optionData.S; + const double X = optionData.X; + const double T = optionData.T; + const double R = optionData.R; + const double V = optionData.V; + const double MuByT = (R - 0.5 * V * V) * T; + const double VBySqrtT = V * sqrt(T); - float *samples; - curandGenerator_t gen; + float *samples; + curandGenerator_t gen; - checkCudaErrors(curandCreateGeneratorHost(&gen, CURAND_RNG_PSEUDO_DEFAULT)); - unsigned long long seed = 1234ULL; - checkCudaErrors(curandSetPseudoRandomGeneratorSeed(gen, seed)); + checkCudaErrors(curandCreateGeneratorHost(&gen, CURAND_RNG_PSEUDO_DEFAULT)); + unsigned long long seed = 1234ULL; + checkCudaErrors(curandSetPseudoRandomGeneratorSeed(gen, seed)); - if (h_Samples != NULL) { - samples = h_Samples; - } else { - samples = (float *)malloc(pathN * sizeof(float)); - checkCudaErrors(curandGenerateNormal(gen, samples, pathN, 0.0, 1.0)); - } + if (h_Samples != NULL) { + samples = h_Samples; + } + else { + samples = (float *)malloc(pathN * sizeof(float)); + checkCudaErrors(curandGenerateNormal(gen, samples, pathN, 0.0, 1.0)); + } - // for(int i=0; i<10; i++) printf("CPU sample = %f\n", samples[i]); + // for(int i=0; i<10; i++) printf("CPU sample = %f\n", samples[i]); - double sum = 0, sum2 = 0; + double sum = 0, sum2 = 0; - for (int pos = 0; pos < pathN; pos++) { - double sample = samples[pos]; - double callValue = endCallValue(S, X, sample, MuByT, VBySqrtT); - sum += callValue; - sum2 += callValue * callValue; - } + for (int pos = 0; pos < pathN; pos++) { + double sample = samples[pos]; + double callValue = endCallValue(S, X, sample, MuByT, VBySqrtT); + sum += callValue; + sum2 += callValue * callValue; + } - if (h_Samples == NULL) free(samples); + if (h_Samples == NULL) + free(samples); - checkCudaErrors(curandDestroyGenerator(gen)); + checkCudaErrors(curandDestroyGenerator(gen)); - // Derive average from the total sum and discount by riskfree rate - callValue.Expected = (float)(exp(-R * T) * sum / (double)pathN); - // Standard deviation - double stdDev = sqrt(((double)pathN * sum2 - sum * sum) / - ((double)pathN * (double)(pathN - 1))); - // Confidence width; in 95% of all cases theoretical value lies within these - // borders - callValue.Confidence = - (float)(exp(-R * T) * 1.96 * stdDev / sqrt((double)pathN)); + // Derive average from the total sum and discount by riskfree rate + callValue.Expected = (float)(exp(-R * T) * sum / (double)pathN); + // Standard deviation + double stdDev = sqrt(((double)pathN * sum2 - sum * sum) / ((double)pathN * (double)(pathN - 1))); + // Confidence width; in 95% of all cases theoretical value lies within these + // borders + callValue.Confidence = (float)(exp(-R * T) * 1.96 * stdDev / sqrt((double)pathN)); } diff --git 
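// [Editorial note, not part of the patch] What MonteCarloCPU above computes, written
// out: with per-path payoffs c_i and N = pathN,
//     Expected   = exp(-R*T) * (1/N) * sum(c_i)
//     stdDev     = sqrt((N * sum(c_i^2) - (sum(c_i))^2) / (N * (N - 1)))
//     Confidence = exp(-R*T) * 1.96 * stdDev / sqrt(N)
// i.e. Confidence is the half-width of a 95% confidence interval (1.96 is the
// two-sided 95% z-score). The "reserve" metric accumulated in main() is
// Confidence / |MC - BlackScholes| per option; an average above 1.0 means the
// analytic reference lies inside the Monte Carlo confidence interval on average,
// which is exactly the pass/fail criterion at the end of main().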
a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_kernel.cu b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_kernel.cu
index 5d33b8d5..dd69ccd8 100644
--- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_kernel.cu
+++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_kernel.cu
@@ -28,13 +28,14 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Global types
 ////////////////////////////////////////////////////////////////////////////////
-#include <stdio.h>
-#include <stdlib.h>
 #include <cooperative_groups.h>
+#include <stdio.h>
+#include <stdlib.h>

 namespace cg = cooperative_groups;

-#include <helper_cuda.h>
 #include <curand_kernel.h>
+#include <helper_cuda.h>
+
 #include "MonteCarlo_common.h"

 ////////////////////////////////////////////////////////////////////////////////
@@ -49,26 +50,27 @@ namespace cg = cooperative_groups;
 #define MAX_OPTIONS (1024 * 1024)

 // Preprocessed input option data
-typedef struct {
-  real S;
-  real X;
-  real MuByT;
-  real VBySqrtT;
+typedef struct
+{
+    real S;
+    real X;
+    real MuByT;
+    real VBySqrtT;
 } __TOptionData;

 ////////////////////////////////////////////////////////////////////////////////
 // Overloaded shortcut payoff functions for different precision modes
 ////////////////////////////////////////////////////////////////////////////////
-__device__ inline float endCallValue(float S, float X, float r, float MuByT,
-                                     float VBySqrtT) {
-  float callValue = S * __expf(MuByT + VBySqrtT * r) - X;
-  return (callValue > 0.0F) ? callValue : 0.0F;
+__device__ inline float endCallValue(float S, float X, float r, float MuByT, float VBySqrtT)
+{
+    float callValue = S * __expf(MuByT + VBySqrtT * r) - X;
+    return (callValue > 0.0F) ? callValue : 0.0F;
 }

-__device__ inline double endCallValue(double S, double X, double r,
-                                      double MuByT, double VBySqrtT) {
-  double callValue = S * exp(MuByT + VBySqrtT * r) - X;
-  return (callValue > 0.0) ? callValue : 0.0;
+__device__ inline double endCallValue(double S, double X, double r, double MuByT, double VBySqrtT)
+{
+    double callValue = S * exp(MuByT + VBySqrtT * r) - X;
+    return (callValue > 0.0) ? callValue : 0.0;
 }

 #define THREAD_N 256

@@ -78,149 +80,145 @@ __device__ inline double endCallValue(double S, double X, double r,
 ////////////////////////////////////////////////////////////////////////////////
 // This kernel computes the integral over all paths using a single thread block
 // per option. It is fastest when the number of thread blocks times the work per
 // block is high enough to keep the GPU busy.
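// [Editorial sketch, not part of the patch] The launch pattern described in the
// comment above, in miniature: one block per task, a grid-stride loop over tasks so
// any grid size covers them all, and a shared-memory tree reduction per task. Names
// and sizes are illustrative only; assumes blockDim.x == 256 (a power of two),
// matching THREAD_N.

__global__ void blockPerTaskSum(const float *in, float *out, int taskN, int sampleN)
{
    __shared__ float partial[256];
    for (int task = blockIdx.x; task < taskN; task += gridDim.x) {
        // Each thread accumulates a strided slice of this task's samples...
        float acc = 0.0f;
        for (int i = threadIdx.x; i < sampleN; i += blockDim.x)
            acc += in[task * sampleN + i];
        partial[threadIdx.x] = acc;
        __syncthreads();
        // ...then the block tree-reduces the partial sums.
        for (int s = blockDim.x / 2; s > 0; s >>= 1) {
            if (threadIdx.x < s)
                partial[threadIdx.x] += partial[threadIdx.x + s];
            __syncthreads();
        }
        if (threadIdx.x == 0)
            out[task] = partial[0];
        __syncthreads(); // keep the block together before reusing shared memory
    }
}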
//////////////////////////////////////////////////////////////////////////////// -static __global__ void MonteCarloOneBlockPerOption( - curandState *__restrict rngStates, - const __TOptionData *__restrict d_OptionData, - __TOptionValue *__restrict d_CallValue, int pathN, int optionN) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); +static __global__ void MonteCarloOneBlockPerOption(curandState *__restrict rngStates, + const __TOptionData *__restrict d_OptionData, + __TOptionValue *__restrict d_CallValue, + int pathN, + int optionN) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); - const int SUM_N = THREAD_N; - __shared__ real s_SumCall[SUM_N]; - __shared__ real s_Sum2Call[SUM_N]; + const int SUM_N = THREAD_N; + __shared__ real s_SumCall[SUM_N]; + __shared__ real s_Sum2Call[SUM_N]; - // determine global thread id - int tid = threadIdx.x + blockIdx.x * blockDim.x; + // determine global thread id + int tid = threadIdx.x + blockIdx.x * blockDim.x; - // Copy random number state to local memory for efficiency - curandState localState = rngStates[tid]; - for (int optionIndex = blockIdx.x; optionIndex < optionN; - optionIndex += gridDim.x) { - const real S = d_OptionData[optionIndex].S; - const real X = d_OptionData[optionIndex].X; - const real MuByT = d_OptionData[optionIndex].MuByT; - const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT; + // Copy random number state to local memory for efficiency + curandState localState = rngStates[tid]; + for (int optionIndex = blockIdx.x; optionIndex < optionN; optionIndex += gridDim.x) { + const real S = d_OptionData[optionIndex].S; + const real X = d_OptionData[optionIndex].X; + const real MuByT = d_OptionData[optionIndex].MuByT; + const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT; - // Cycle through the entire samples array: - // derive end stock price for each path - // accumulate partial integrals into intermediate shared memory buffer - for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x) { - __TOptionValue sumCall = {0, 0}; + // Cycle through the entire samples array: + // derive end stock price for each path + // accumulate partial integrals into intermediate shared memory buffer + for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x) { + __TOptionValue sumCall = {0, 0}; #pragma unroll 8 - for (int i = iSum; i < pathN; i += SUM_N) { - real r = curand_normal(&localState); - real callValue = endCallValue(S, X, r, MuByT, VBySqrtT); - sumCall.Expected += callValue; - sumCall.Confidence += callValue * callValue; - } + for (int i = iSum; i < pathN; i += SUM_N) { + real r = curand_normal(&localState); + real callValue = endCallValue(S, X, r, MuByT, VBySqrtT); + sumCall.Expected += callValue; + sumCall.Confidence += callValue * callValue; + } - s_SumCall[iSum] = sumCall.Expected; - s_Sum2Call[iSum] = sumCall.Confidence; + s_SumCall[iSum] = sumCall.Expected; + s_Sum2Call[iSum] = sumCall.Confidence; + } + + // Reduce shared memory accumulators + // and write final result to global memory + cg::sync(cta); + sumReduce(s_SumCall, s_Sum2Call, cta, tile32, &d_CallValue[optionIndex]); } - - // Reduce shared memory accumulators - // and write final result to global memory - cg::sync(cta); - sumReduce(s_SumCall, s_Sum2Call, cta, tile32, - &d_CallValue[optionIndex]); - } } -static __global__ void rngSetupStates(curandState 
*rngState, int device_id) {
-  // determine global thread id
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  // Each threadblock gets different seed,
-  // Threads within a threadblock get different sequence numbers
-  curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0,
-              &rngState[tid]);
+static __global__ void rngSetupStates(curandState *rngState, int device_id)
+{
+    // determine global thread id
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    // Each threadblock gets different seed,
+    // Threads within a threadblock get different sequence numbers
+    curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0, &rngState[tid]);
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Host-side interface to GPU Monte Carlo
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void initMonteCarloGPU(TOptionPlan *plan) {
-  checkCudaErrors(cudaMalloc(&plan->d_OptionData,
-                             sizeof(__TOptionData) * (plan->optionCount)));
-  checkCudaErrors(cudaMalloc(&plan->d_CallValue,
-                             sizeof(__TOptionValue) * (plan->optionCount)));
-  checkCudaErrors(cudaMallocHost(&plan->h_OptionData,
-                                 sizeof(__TOptionData) * (plan->optionCount)));
-  // Allocate internal device memory
-  checkCudaErrors(cudaMallocHost(&plan->h_CallValue,
-                                 sizeof(__TOptionValue) * (plan->optionCount)));
-  // Allocate states for pseudo random number generators
-  checkCudaErrors(cudaMalloc((void **)&plan->rngStates,
-                             plan->gridSize * THREAD_N * sizeof(curandState)));
-  checkCudaErrors(cudaMemset(plan->rngStates, 0,
-                             plan->gridSize * THREAD_N * sizeof(curandState)));
+extern "C" void initMonteCarloGPU(TOptionPlan *plan)
+{
+    checkCudaErrors(cudaMalloc(&plan->d_OptionData, sizeof(__TOptionData) * (plan->optionCount)));
+    checkCudaErrors(cudaMalloc(&plan->d_CallValue, sizeof(__TOptionValue) * (plan->optionCount)));
+    checkCudaErrors(cudaMallocHost(&plan->h_OptionData, sizeof(__TOptionData) * (plan->optionCount)));
+    // Allocate internal device memory
+    checkCudaErrors(cudaMallocHost(&plan->h_CallValue, sizeof(__TOptionValue) * (plan->optionCount)));
+    // Allocate states for pseudo random number generators
+    checkCudaErrors(cudaMalloc((void **)&plan->rngStates, plan->gridSize * THREAD_N * sizeof(curandState)));
+    checkCudaErrors(cudaMemset(plan->rngStates, 0, plan->gridSize * THREAD_N * sizeof(curandState)));

-  // place each device pathN random numbers apart on the random number sequence
-  rngSetupStates<<<plan->gridSize, THREAD_N>>>(plan->rngStates, plan->device);
-  getLastCudaError("rngSetupStates kernel failed.\n");
+    // place each device pathN random numbers apart on the random number sequence
+    rngSetupStates<<<plan->gridSize, THREAD_N>>>(plan->rngStates, plan->device);
+    getLastCudaError("rngSetupStates kernel failed.\n");
 }

 // Compute statistics and deallocate internal device memory
-extern "C" void closeMonteCarloGPU(TOptionPlan *plan) {
-  for (int i = 0; i < plan->optionCount; i++) {
-    const double RT = plan->optionData[i].R * plan->optionData[i].T;
-    const double sum = plan->h_CallValue[i].Expected;
-    const double sum2 = plan->h_CallValue[i].Confidence;
-    const double pathN = plan->pathN;
-    // Derive average from the total sum and discount by riskfree rate
-    plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);
-    // Standard deviation
-    double stdDev = sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1)));
-    // Confidence width; in 95% of all cases theoretical value lies within these
-    // borders
-    plan->callValue[i].Confidence =
-        (float)(exp(-RT) *
1.96 * stdDev / sqrt(pathN));
-  }
+extern "C" void closeMonteCarloGPU(TOptionPlan *plan)
+{
+    for (int i = 0; i < plan->optionCount; i++) {
+        const double RT = plan->optionData[i].R * plan->optionData[i].T;
+        const double sum = plan->h_CallValue[i].Expected;
+        const double sum2 = plan->h_CallValue[i].Confidence;
+        const double pathN = plan->pathN;
+        // Derive average from the total sum and discount by riskfree rate
+        plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);
+        // Standard deviation
+        double stdDev = sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1)));
+        // Confidence width; in 95% of all cases theoretical value lies within these
+        // borders
+        plan->callValue[i].Confidence = (float)(exp(-RT) * 1.96 * stdDev / sqrt(pathN));
+    }

-  checkCudaErrors(cudaFree(plan->rngStates));
-  checkCudaErrors(cudaFreeHost(plan->h_CallValue));
-  checkCudaErrors(cudaFreeHost(plan->h_OptionData));
-  checkCudaErrors(cudaFree(plan->d_CallValue));
-  checkCudaErrors(cudaFree(plan->d_OptionData));
+    checkCudaErrors(cudaFree(plan->rngStates));
+    checkCudaErrors(cudaFreeHost(plan->h_CallValue));
+    checkCudaErrors(cudaFreeHost(plan->h_OptionData));
+    checkCudaErrors(cudaFree(plan->d_CallValue));
+    checkCudaErrors(cudaFree(plan->d_OptionData));
 }

 // Main computations
-extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream) {
-  __TOptionValue *h_CallValue = plan->h_CallValue;
+extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream)
+{
+    __TOptionValue *h_CallValue = plan->h_CallValue;

-  if (plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS) {
-    printf("MonteCarloGPU(): bad option count.\n");
-    return;
-  }
+    if (plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS) {
+        printf("MonteCarloGPU(): bad option count.\n");
+        return;
+    }

-  __TOptionData *h_OptionData = (__TOptionData *)plan->h_OptionData;
+    __TOptionData *h_OptionData = (__TOptionData *)plan->h_OptionData;

-  for (int i = 0; i < plan->optionCount; i++) {
-    const double T = plan->optionData[i].T;
-    const double R = plan->optionData[i].R;
-    const double V = plan->optionData[i].V;
-    const double MuByT = (R - 0.5 * V * V) * T;
-    const double VBySqrtT = V * sqrt(T);
-    h_OptionData[i].S = (real)plan->optionData[i].S;
-    h_OptionData[i].X = (real)plan->optionData[i].X;
-    h_OptionData[i].MuByT = (real)MuByT;
-    h_OptionData[i].VBySqrtT = (real)VBySqrtT;
-  }
+    for (int i = 0; i < plan->optionCount; i++) {
+        const double T = plan->optionData[i].T;
+        const double R = plan->optionData[i].R;
+        const double V = plan->optionData[i].V;
+        const double MuByT = (R - 0.5 * V * V) * T;
+        const double VBySqrtT = V * sqrt(T);
+        h_OptionData[i].S = (real)plan->optionData[i].S;
+        h_OptionData[i].X = (real)plan->optionData[i].X;
+        h_OptionData[i].MuByT = (real)MuByT;
+        h_OptionData[i].VBySqrtT = (real)VBySqrtT;
+    }

-  checkCudaErrors(cudaMemcpyAsync(plan->d_OptionData, h_OptionData,
-                                  plan->optionCount * sizeof(__TOptionData),
-                                  cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(
+        plan->d_OptionData, h_OptionData, plan->optionCount * sizeof(__TOptionData), cudaMemcpyHostToDevice, stream));

-  MonteCarloOneBlockPerOption<<<plan->gridSize, THREAD_N, 0, stream>>>(
-      plan->rngStates, (__TOptionData *)(plan->d_OptionData),
-      (__TOptionValue *)(plan->d_CallValue), plan->pathN, plan->optionCount);
-  getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n");
+    MonteCarloOneBlockPerOption<<<plan->gridSize, THREAD_N, 0, stream>>>(plan->rngStates,
+                                                                         (__TOptionData *)(plan->d_OptionData),
+                                                                         (__TOptionValue
*)(plan->d_CallValue),
+                                                                         plan->pathN,
+                                                                         plan->optionCount);
+    getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n");

-  checkCudaErrors(cudaMemcpyAsync(h_CallValue, plan->d_CallValue,
-                                  plan->optionCount * sizeof(__TOptionValue),
-                                  cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(
+        h_CallValue, plan->d_CallValue, plan->optionCount * sizeof(__TOptionValue), cudaMemcpyDeviceToHost, stream));

-  // cudaDeviceSynchronize();
+    // cudaDeviceSynchronize();
 }
diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_reduction.cuh b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_reduction.cuh
index 92fc0508..b453171a 100644
--- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_reduction.cuh
+++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/MonteCarlo_reduction.cuh
@@ -40,40 +40,40 @@ namespace cg = cooperative_groups;
 ////////////////////////////////////////////////////////////////////////////////
 template <class T>
-__device__ void sumReduce(T *sum, T *sum2, cg::thread_block &cta,
-                          cg::thread_block_tile<32> &tile32,
-                          __TOptionValue *d_CallValue) {
-  const int VEC = 32;
-  const int tid = cta.thread_rank();
+__device__ void
+sumReduce(T *sum, T *sum2, cg::thread_block &cta, cg::thread_block_tile<32> &tile32, __TOptionValue *d_CallValue)
+{
+    const int VEC = 32;
+    const int tid = cta.thread_rank();

-  T beta = sum[tid];
-  T beta2 = sum2[tid];
-  T temp, temp2;
+    T beta = sum[tid];
+    T beta2 = sum2[tid];
+    T temp, temp2;

-  for (int i = VEC / 2; i > 0; i >>= 1) {
-    if (tile32.thread_rank() < i) {
-      temp = sum[tid + i];
-      temp2 = sum2[tid + i];
-      beta += temp;
-      beta2 += temp2;
-      sum[tid] = beta;
-      sum2[tid] = beta2;
+    for (int i = VEC / 2; i > 0; i >>= 1) {
+        if (tile32.thread_rank() < i) {
+            temp = sum[tid + i];
+            temp2 = sum2[tid + i];
+            beta += temp;
+            beta2 += temp2;
+            sum[tid] = beta;
+            sum2[tid] = beta2;
+        }
+        cg::sync(tile32);
     }
-    cg::sync(tile32);
-  }
-  cg::sync(cta);
+    cg::sync(cta);

-  if (tid == 0) {
-    beta = 0;
-    beta2 = 0;
-    for (int i = 0; i < blockDim.x; i += VEC) {
-      beta += sum[i];
-      beta2 += sum2[i];
+    if (tid == 0) {
+        beta = 0;
+        beta2 = 0;
+        for (int i = 0; i < blockDim.x; i += VEC) {
+            beta += sum[i];
+            beta2 += sum2[i];
+        }
+        __TOptionValue t = {beta, beta2};
+        *d_CallValue = t;
     }
-    __TOptionValue t = {beta, beta2};
-    *d_CallValue = t;
-  }
-  cg::sync(cta);
+    cg::sync(cta);
 }

 #endif
diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.cpp b/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.cpp
index 91a33d05..4078c29b 100644
--- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.cpp
+++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.cpp
@@ -29,41 +29,46 @@
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
-  return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
+{
+    return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
 }

 // Wait for thread to finish
-void cutEndThread(CUTThread thread) {
-  WaitForSingleObject(thread, INFINITE);
-  CloseHandle(thread);
+void cutEndThread(CUTThread thread)
+{
+    WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
 }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num) {
-  WaitForMultipleObjects(num, threads, true, INFINITE);
+void cutWaitForThreads(const CUTThread *threads, int num)
+{
+
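// [Editorial sketch, not part of the patch] The sumReduce helper in
// MonteCarlo_reduction.cuh above does a manual pairwise reduction within each
// 32-thread tile. On CUDA 11+ the same per-tile sum can be written with cg::reduce
// from <cooperative_groups/reduce.h>; shown here only as a point of comparison.

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__device__ float tileSum(float v, const cg::thread_block_tile<32> &tile32)
{
    // Sums v across the 32 lanes of the tile; every lane receives the total.
    return cg::reduce(tile32, v, cg::plus<float>());
}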
WaitForMultipleObjects(num, threads, true, INFINITE); - for (int i = 0; i < num; i++) { - CloseHandle(threads[i]); - } + for (int i = 0; i < num; i++) { + CloseHandle(threads[i]); + } } #else // Create thread -CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) { - pthread_t thread; - pthread_create(&thread, NULL, func, data); - return thread; +CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) +{ + pthread_t thread; + pthread_create(&thread, NULL, func, data); + return thread; } // Wait for thread to finish void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); } // Wait for multiple threads -void cutWaitForThreads(const CUTThread *threads, int num) { - for (int i = 0; i < num; i++) { - cutEndThread(threads[i]); - } +void cutWaitForThreads(const CUTThread *threads, int num) +{ + for (int i = 0; i < num; i++) { + cutEndThread(threads[i]); + } } #endif diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.h b/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.h index 6046d3ba..c93334fa 100644 --- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.h +++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/multithreading.h @@ -38,7 +38,7 @@ typedef HANDLE CUTThread; typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *); #define CUT_THREADPROC unsigned WINAPI -#define CUT_THREADEND return 0 +#define CUT_THREADEND return 0 #else // POSIX threads. @@ -52,20 +52,21 @@ typedef void *(*CUT_THREADROUTINE)(void *); #endif #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -// Create thread. -CUTThread cutStartThread(CUT_THREADROUTINE, void *data); + // Create thread. + CUTThread cutStartThread(CUT_THREADROUTINE, void *data); -// Wait for thread to finish. -void cutEndThread(CUTThread thread); + // Wait for thread to finish. + void cutEndThread(CUTThread thread); -// Wait for multiple threads. -void cutWaitForThreads(const CUTThread *threads, int num); + // Wait for multiple threads. 
+ void cutWaitForThreads(const CUTThread *threads, int num);

 #ifdef __cplusplus
-}  // extern "C"
+} // extern "C"
 #endif

-#endif  // MULTITHREADING_H
+#endif // MULTITHREADING_H
diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/realtype.h b/Samples/5_Domain_Specific/MonteCarloMultiGPU/realtype.h
index 64d6f799..11ca533d 100644
--- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/realtype.h
+++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/realtype.h
@@ -28,7 +28,7 @@
 #ifndef REALTYPE_H
 #define REALTYPE_H

-//#define DOUBLE_PRECISION
+// #define DOUBLE_PRECISION

 #ifndef DOUBLE_PRECISION
 typedef float real;
diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/bgr_resize.cu b/Samples/5_Domain_Specific/NV12toBGRandResize/bgr_resize.cu
index 5079db3a..7432b436 100644
--- a/Samples/5_Domain_Specific/NV12toBGRandResize/bgr_resize.cu
+++ b/Samples/5_Domain_Specific/NV12toBGRandResize/bgr_resize.cu
@@ -30,38 +30,58 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include "resize_convert.h"

 __global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,
-    float *pDst, int nDstPitch, int nDstHeight, int nSrcHeight,
-    int batch, float scaleX, float scaleY,
-    int cropX, int cropY, int cropW, int cropH) {
+                                           float *pDst,
+                                           int nDstPitch,
+                                           int nDstHeight,
+                                           int nSrcHeight,
+                                           int batch,
+                                           float scaleX,
+                                           float scaleY,
+                                           int cropX,
+                                           int cropY,
+                                           int cropW,
+                                           int cropH)
+{
   int x = threadIdx.x + blockIdx.x * blockDim.x;
   int y = threadIdx.y + blockIdx.y * blockDim.y;

-  if (x >= (int)(cropW/scaleX) || y >= (int)(cropH/scaleY)) return;
+  if (x >= (int)(cropW / scaleX) || y >= (int)(cropH / scaleY)) return;

-  int frameSize = nDstPitch*nDstHeight;
-  float *p = NULL;
+  int frameSize = nDstPitch * nDstHeight;
+  float *p = NULL;

   for (int i = blockIdx.z; i < batch; i += gridDim.z) {
-    #pragma unroll
-    for (int channel=0; channel < 3; channel++){
-      p = pDst + i * 3 * frameSize + y * nDstPitch + x + channel * frameSize;
-      *p = tex2D<float>(texSrc, x * scaleX + cropX,
-          ((3 * i + channel) * nSrcHeight + y * scaleY + cropY));
+#pragma unroll
+    for (int channel = 0; channel < 3; channel++) {
+      p = pDst + i * 3 * frameSize + y * nDstPitch + x + channel * frameSize;
+      *p = tex2D<float>(texSrc, x * scaleX + cropX, ((3 * i + channel) * nSrcHeight + y * scaleY + cropY));
     }
   }
 }

-static void resizeBGRplanarBatchCore(
-    float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
-    float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
-    int nBatchSize, cudaStream_t stream, bool whSameResizeRatio,
-    int cropX, int cropY, int cropW, int cropH) {
+static void resizeBGRplanarBatchCore(float *dpSrc,
+                                     int nSrcPitch,
+                                     int nSrcWidth,
+                                     int nSrcHeight,
+                                     float *dpDst,
+                                     int nDstPitch,
+                                     int nDstWidth,
+                                     int nDstHeight,
+                                     int nBatchSize,
+                                     cudaStream_t stream,
+                                     bool whSameResizeRatio,
+                                     int cropX,
+                                     int cropY,
+                                     int cropW,
+                                     int cropH)
+{
   cudaTextureObject_t texSrc[2];
-  int nTiles = 1, h, iTile;
+  int nTiles = 1, h, iTile;

   h = nSrcHeight * 3 * nBatchSize;

   while ((h + nTiles - 1) / nTiles > 65536)
@@ -70,65 +90,85 @@ static void resizeBGRplanarBatchCore(
   if (nTiles > 2)
     return;

-  int batchTile = nBatchSize / nTiles;
-  int batchTileLast = nBatchSize - batchTile * (nTiles-1);
+  int batchTile = nBatchSize / nTiles;
+  int batchTileLast = nBatchSize - batchTile * (nTiles - 1);

   for (iTile = 0; iTile < nTiles; ++iTile) {
-    int bs = (iTile == nTiles - 1) ?
batchTileLast : batchTile;
-    float *dpSrcNew = dpSrc +
-        iTile * (batchTile * 3 * nSrcHeight * nSrcPitch);
+    int bs = (iTile == nTiles - 1) ? batchTileLast : batchTile;
+    float *dpSrcNew = dpSrc + iTile * (batchTile * 3 * nSrcHeight * nSrcPitch);

-    cudaResourceDesc resDesc = {};
-    resDesc.resType = cudaResourceTypePitch2D;
-    resDesc.res.pitch2D.devPtr = dpSrcNew;
-    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-    resDesc.res.pitch2D.width = nSrcWidth;
-    resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;
+    cudaResourceDesc resDesc = {};
+    resDesc.resType = cudaResourceTypePitch2D;
+    resDesc.res.pitch2D.devPtr = dpSrcNew;
+    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
+    resDesc.res.pitch2D.width = nSrcWidth;
+    resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;
     resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);

-    cudaTextureDesc texDesc = {};
-    texDesc.filterMode = cudaFilterModeLinear;
-    texDesc.readMode = cudaReadModeElementType;
+    cudaTextureDesc texDesc = {};
+    texDesc.filterMode = cudaFilterModeLinear;
+    texDesc.readMode = cudaReadModeElementType;

     checkCudaErrors(cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));

-    float *dpDstNew = dpDst +
-        iTile * (batchTile * 3 * nDstHeight * nDstPitch);
+    float *dpDstNew = dpDst + iTile * (batchTile * 3 * nDstHeight * nDstPitch);

-    if(cropW == 0 || cropH == 0) {
+    if (cropW == 0 || cropH == 0) {
       cropX = 0;
       cropY = 0;
       cropW = nSrcWidth;
       cropH = nSrcHeight;
     }

-    float scaleX = (cropW*1.0f / nDstWidth);
-    float scaleY = (cropH*1.0f / nDstHeight);
+    float scaleX = (cropW * 1.0f / nDstWidth);
+    float scaleY = (cropH * 1.0f / nDstHeight);

-    if(whSameResizeRatio == true)
+    if (whSameResizeRatio == true)
      scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;

    dim3 block(32, 32, 1);

    size_t blockDimZ = bs;

    // Restricting blocks in Z-dim till 32 to not launch too many blocks
    blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;

-    dim3 grid((cropW*1.0f/scaleX + block.x - 1) / block.x,
-        (cropH*1.0f/scaleY + block.y - 1) / block.y, blockDimZ);
-
-    resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>
-        (texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight,
-         bs, scaleX, scaleY, cropX, cropY, cropW, cropH);
+    dim3 grid((cropW * 1.0f / scaleX + block.x - 1) / block.x,
+              (cropH * 1.0f / scaleY + block.y - 1) / block.y,
+              blockDimZ);
+    resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>(
+        texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight, bs, scaleX, scaleY, cropX, cropY, cropW, cropH);
   }

   for (iTile = 0; iTile < nTiles; ++iTile)
     checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));
 }

-void resizeBGRplanarBatch(
-    float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
-    float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
-    int nBatchSize, cudaStream_t stream,
-    int cropX, int cropY, int cropW, int cropH, bool whSameResizeRatio) {
-  resizeBGRplanarBatchCore(dpSrc, nSrcPitch, nSrcWidth, nSrcHeight,
-      dpDst, nDstPitch, nDstWidth, nDstHeight, nBatchSize, stream,
-      whSameResizeRatio, cropX, cropY, cropW, cropH);
+void resizeBGRplanarBatch(float *dpSrc,
+                          int nSrcPitch,
+                          int nSrcWidth,
+                          int nSrcHeight,
+                          float *dpDst,
+                          int nDstPitch,
+                          int nDstWidth,
+                          int nDstHeight,
+                          int nBatchSize,
+                          cudaStream_t stream,
+                          int cropX,
+                          int cropY,
+                          int cropW,
+                          int cropH,
+                          bool whSameResizeRatio)
+{
+    resizeBGRplanarBatchCore(dpSrc,
+                             nSrcPitch,
+                             nSrcWidth,
+                             nSrcHeight,
+                             dpDst,
+                             nDstPitch,
+                             nDstWidth,
+                             nDstHeight,
+                             nBatchSize,
+                             stream,
+                             whSameResizeRatio,
+                             cropX,
+                             cropY,
+                             cropW,
+                             cropH);
 }
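// [Editorial sketch, not part of the patch] The texture-object setup that
// resizeBGRplanarBatchCore above repeats, in compact form: describe a pitched 2D
// float array with cudaResourceTypePitch2D and sample it through a
// cudaTextureObject_t. The helper name and parameters are illustrative; error
// checking is omitted.

#include <cuda_runtime.h>

static cudaTextureObject_t makePitch2DTexture(float *devPtr, size_t pitchInBytes, int width, int height)
{
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = devPtr;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    resDesc.res.pitch2D.width = width;
    resDesc.res.pitch2D.height = height;       // capped at 65536 rows: hence the nTiles loop above
    resDesc.res.pitch2D.pitchInBytes = pitchInBytes; // row stride, e.g. from cudaMallocPitch

    cudaTextureDesc texDesc = {};
    texDesc.filterMode = cudaFilterModeLinear; // bilinear reads, as the BGR path uses
    texDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex; // sample in a kernel with tex2D<float>(tex, x, y)
}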
diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_resize.cu b/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_resize.cu
index ad1fd696..c1a129fa 100644
--- a/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_resize.cu
+++ b/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_resize.cu
@@ -29,84 +29,94 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include "resize_convert.h"

 __global__ static void resizeNV12BatchKernel(cudaTextureObject_t texSrcLuma,
                                              cudaTextureObject_t texSrcChroma,
-                                             uint8_t *pDstNv12, int nSrcWidth,
-                                             int nSrcHeight, int nDstPitch,
-                                             int nDstWidth, int nDstHeight,
-                                             int nBatchSize) {
-  int x = threadIdx.x + blockIdx.x * blockDim.x;
-  int y = threadIdx.y + blockIdx.y * blockDim.y;
+                                             uint8_t *pDstNv12,
+                                             int nSrcWidth,
+                                             int nSrcHeight,
+                                             int nDstPitch,
+                                             int nDstWidth,
+                                             int nDstHeight,
+                                             int nBatchSize)
+{
+    int x = threadIdx.x + blockIdx.x * blockDim.x;
+    int y = threadIdx.y + blockIdx.y * blockDim.y;

-  int px = x * 2, py = y * 2;
+    int px = x * 2, py = y * 2;

-  if ((px + 1) >= nDstWidth || (py + 1) >= nDstHeight) return;
+    if ((px + 1) >= nDstWidth || (py + 1) >= nDstHeight)
+        return;

-  float fxScale = 1.0f * nSrcWidth / nDstWidth;
-  float fyScale = 1.0f * nSrcHeight / nDstHeight;
+    float fxScale = 1.0f * nSrcWidth / nDstWidth;
+    float fyScale = 1.0f * nSrcHeight / nDstHeight;

-  uint8_t *p = pDstNv12 + px + py * nDstPitch;
-  int hh = nDstHeight * 3 / 2;
-  int nByte = nDstPitch * hh;
-  int px_fxScale = px * fxScale;
-  int px_fxScale_1 = (px + 1) * fxScale;
-  int py_fyScale = py * fyScale;
-  int py_fyScale_1 = (py + 1) * fyScale;
+    uint8_t *p = pDstNv12 + px + py * nDstPitch;
+    int hh = nDstHeight * 3 / 2;
+    int nByte = nDstPitch * hh;
+    int px_fxScale = px * fxScale;
+    int px_fxScale_1 = (px + 1) * fxScale;
+    int py_fyScale = py * fyScale;
+    int py_fyScale_1 = (py + 1) * fyScale;

-  for (int i = blockIdx.z; i < nBatchSize; i+=gridDim.z) {
-    *(uchar2 *)p = make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale),
-                               tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale));
-    *(uchar2 *)(p + nDstPitch) =
-        make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale_1),
-                    tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale_1));
-    *(uchar2 *)(p + (nDstHeight - y) * nDstPitch) = tex2D<uchar2>(
-        texSrcChroma, x * fxScale, (hh * i + nDstHeight + y) * fyScale);
-    p += nByte;
-    py += hh;
-  }
+    for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) {
+        *(uchar2 *)p = make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale),
+                                   tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale));
+        *(uchar2 *)(p + nDstPitch) = make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale_1),
+                                                 tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale_1));
+        *(uchar2 *)(p + (nDstHeight - y) * nDstPitch) =
+            tex2D<uchar2>(texSrcChroma, x * fxScale, (hh * i + nDstHeight + y) * fyScale);
+        p += nByte;
+        py += hh;
+    }
 }

-void resizeNV12Batch(uint8_t *dpSrc, int nSrcPitch, int nSrcWidth,
-                     int nSrcHeight, uint8_t *dpDst, int nDstPitch,
-                     int nDstWidth, int nDstHeight, int nBatchSize,
-                     cudaStream_t stream) {
-  int hhSrc = ceilf(nSrcHeight * 3.0f / 2.0f);
-  cudaResourceDesc resDesc = {};
-  resDesc.resType = cudaResourceTypePitch2D;
-  resDesc.res.pitch2D.devPtr = dpSrc;
-  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uint8_t>();
-  resDesc.res.pitch2D.width = nSrcWidth;
-  resDesc.res.pitch2D.height = hhSrc * nBatchSize;
-  resDesc.res.pitch2D.pitchInBytes = nSrcPitch;
+void resizeNV12Batch(uint8_t *dpSrc,
+                     int nSrcPitch,
+                     int nSrcWidth,
+                     int nSrcHeight,
+                     uint8_t *dpDst,
+                     int nDstPitch,
+                     int nDstWidth,
+                     int nDstHeight,
+                     int nBatchSize,
+                     cudaStream_t stream)
+{
+    int hhSrc = ceilf(nSrcHeight * 3.0f / 2.0f);
+    cudaResourceDesc resDesc = {};
+    resDesc.resType = cudaResourceTypePitch2D;
+    resDesc.res.pitch2D.devPtr = dpSrc;
+    resDesc.res.pitch2D.desc =
cudaCreateChannelDesc<uint8_t>();
+    resDesc.res.pitch2D.width = nSrcWidth;
+    resDesc.res.pitch2D.height = hhSrc * nBatchSize;
+    resDesc.res.pitch2D.pitchInBytes = nSrcPitch;

-  cudaTextureDesc texDesc = {};
-  texDesc.filterMode = cudaFilterModePoint;
-  texDesc.readMode = cudaReadModeElementType;
+    cudaTextureDesc texDesc = {};
+    texDesc.filterMode = cudaFilterModePoint;
+    texDesc.readMode = cudaReadModeElementType;

-  cudaTextureObject_t texLuma = 0;
-  checkCudaErrors(cudaCreateTextureObject(&texLuma, &resDesc, &texDesc, NULL));
+    cudaTextureObject_t texLuma = 0;
+    checkCudaErrors(cudaCreateTextureObject(&texLuma, &resDesc, &texDesc, NULL));

-  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar2>();
-  resDesc.res.pitch2D.width /= 2;
+    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar2>();
+    resDesc.res.pitch2D.width /= 2;

-  cudaTextureObject_t texChroma = 0;
-  checkCudaErrors(cudaCreateTextureObject(&texChroma, &resDesc, &texDesc, NULL));
+    cudaTextureObject_t texChroma = 0;
+    checkCudaErrors(cudaCreateTextureObject(&texChroma, &resDesc, &texDesc, NULL));

-  dim3 block(32, 32, 1);
+    dim3 block(32, 32, 1);

-  size_t blockDimZ = nBatchSize;
+    size_t blockDimZ = nBatchSize;

-  // Restricting blocks in Z-dim till 32 to not launch too many blocks
-  blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;
+    // Restricting blocks in Z-dim till 32 to not launch too many blocks
+    blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;

-  dim3 grid((nDstWidth / 2 + block.x) / block.x,
-            (nDstHeight / 2 + block.y) / block.y, blockDimZ);
-  resizeNV12BatchKernel<<<grid, block, 0, stream>>>(
-      texLuma, texChroma, dpDst, nSrcWidth, nSrcHeight, nDstPitch, nDstWidth,
-      nDstHeight, nBatchSize);
+    dim3 grid((nDstWidth / 2 + block.x) / block.x, (nDstHeight / 2 + block.y) / block.y, blockDimZ);
+    resizeNV12BatchKernel<<<grid, block, 0, stream>>>(
+        texLuma, texChroma, dpDst, nSrcWidth, nSrcHeight, nDstPitch, nDstWidth, nDstHeight, nBatchSize);

-  checkCudaErrors(cudaDestroyTextureObject(texLuma));
-  checkCudaErrors(cudaDestroyTextureObject(texChroma));
+    checkCudaErrors(cudaDestroyTextureObject(texLuma));
+    checkCudaErrors(cudaDestroyTextureObject(texChroma));
 }
diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_to_bgr_planar.cu b/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_to_bgr_planar.cu
index 7f8336c6..05431c07 100644
--- a/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_to_bgr_planar.cu
+++ b/Samples/5_Domain_Specific/NV12toBGRandResize/nv12_to_bgr_planar.cu
@@ -36,119 +36,124 @@
 #define CONV_THREADS_X 64
 #define CONV_THREADS_Y 10

-__forceinline__ __device__ static float clampF(float x, float lower,
-                                               float upper) {
-  return x < lower ? lower : (x > upper ? upper : x);
+__forceinline__ __device__ static float clampF(float x, float lower, float upper)
+{
+    return x < lower ? lower : (x > upper ?
upper : x); } __global__ static void nv12ToBGRplanarBatchKernel(const uint8_t *pNv12, - int nNv12Pitch, float *pBgr, - int nRgbPitch, int nWidth, - int nHeight, int nBatchSize) { - int x = threadIdx.x + blockIdx.x * blockDim.x; - int y = threadIdx.y + blockIdx.y * blockDim.y; + int nNv12Pitch, + float *pBgr, + int nRgbPitch, + int nWidth, + int nHeight, + int nBatchSize) +{ + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; - if ((x << 2) + 1 > nWidth || (y << 1) + 1 > nHeight) return; + if ((x << 2) + 1 > nWidth || (y << 1) + 1 > nHeight) + return; - const uint8_t *__restrict__ pSrc = pNv12; + const uint8_t *__restrict__ pSrc = pNv12; - for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) { - pSrc = pNv12 + i * ((nHeight * nNv12Pitch * 3) >> 1) + (x << 2) + - (y << 1) * nNv12Pitch; - uchar4 luma2x01, luma2x23, uv2; - *(uint32_t *)&luma2x01 = *(uint32_t *)pSrc; - *(uint32_t *)&luma2x23 = *(uint32_t *)(pSrc + nNv12Pitch); - *(uint32_t *)&uv2 = *(uint32_t *)(pSrc + (nHeight - y) * nNv12Pitch); + for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) { + pSrc = pNv12 + i * ((nHeight * nNv12Pitch * 3) >> 1) + (x << 2) + (y << 1) * nNv12Pitch; + uchar4 luma2x01, luma2x23, uv2; + *(uint32_t *)&luma2x01 = *(uint32_t *)pSrc; + *(uint32_t *)&luma2x23 = *(uint32_t *)(pSrc + nNv12Pitch); + *(uint32_t *)&uv2 = *(uint32_t *)(pSrc + (nHeight - y) * nNv12Pitch); - float *pDstBlock = (pBgr + i * ((nHeight * nRgbPitch * 3) >> 2) + - ((blockIdx.x * blockDim.x) << 2) + - ((blockIdx.y * blockDim.y) << 1) * (nRgbPitch >> 2)); + float *pDstBlock = (pBgr + i * ((nHeight * nRgbPitch * 3) >> 2) + ((blockIdx.x * blockDim.x) << 2) + + ((blockIdx.y * blockDim.y) << 1) * (nRgbPitch >> 2)); - float2 add1; - float2 add2; - float2 add3; - float2 add00, add01, add02, add03; - float2 d, e; + float2 add1; + float2 add2; + float2 add3; + float2 add00, add01, add02, add03; + float2 d, e; - add00.x = 1.1644f * luma2x01.x; - add01.x = 1.1644f * luma2x01.y; - add00.y = 1.1644f * luma2x01.z; - add01.y = 1.1644f * luma2x01.w; + add00.x = 1.1644f * luma2x01.x; + add01.x = 1.1644f * luma2x01.y; + add00.y = 1.1644f * luma2x01.z; + add01.y = 1.1644f * luma2x01.w; - add02.x = 1.1644f * luma2x23.x; - add03.x = 1.1644f * luma2x23.y; - add02.y = 1.1644f * luma2x23.z; - add03.y = 1.1644f * luma2x23.w; + add02.x = 1.1644f * luma2x23.x; + add03.x = 1.1644f * luma2x23.y; + add02.y = 1.1644f * luma2x23.z; + add03.y = 1.1644f * luma2x23.w; - d.x = uv2.x - 128.0f; - e.x = uv2.y - 128.0f; - d.y = uv2.z - 128.0f; - e.y = uv2.w - 128.0f; + d.x = uv2.x - 128.0f; + e.x = uv2.y - 128.0f; + d.y = uv2.z - 128.0f; + e.y = uv2.w - 128.0f; - add1.x = 2.0172f * d.x; - add1.y = 2.0172f * d.y; + add1.x = 2.0172f * d.x; + add1.y = 2.0172f * d.y; - add2.x = (-0.3918f) * d.x + (-0.8130f) * e.x; - add2.y = (-0.3918f) * d.y + (-0.8130f) * e.y; + add2.x = (-0.3918f) * d.x + (-0.8130f) * e.x; + add2.y = (-0.3918f) * d.y + (-0.8130f) * e.y; - add3.x = 1.5960f * e.x; - add3.y = 1.5960f * e.y; + add3.x = 1.5960f * e.x; + add3.y = 1.5960f * e.y; - int rowStride = (threadIdx.y << 1) * (nRgbPitch >> 2); - int nextRowStride = ((threadIdx.y << 1) + 1) * (nRgbPitch >> 2); - // B - *((float4 *)&pDstBlock[rowStride + (threadIdx.x << 2)]) = - make_float4(clampF(add00.x + add1.x, 0.0f, 255.0f), - clampF(add01.x + add1.x, 0.0f, 255.0f), - clampF(add00.y + add1.y, 0.0f, 255.0f), - clampF(add01.y + add1.y, 0.0f, 255.0f)); - *((float4 *)&pDstBlock[nextRowStride + (threadIdx.x << 2)]) = - make_float4(clampF(add02.x + add1.x, 0.0f, 
255.0f), - clampF(add03.x + add1.x, 0.0f, 255.0f), - clampF(add02.y + add1.y, 0.0f, 255.0f), - clampF(add03.y + add1.y, 0.0f, 255.0f)); + int rowStride = (threadIdx.y << 1) * (nRgbPitch >> 2); + int nextRowStride = ((threadIdx.y << 1) + 1) * (nRgbPitch >> 2); + // B + *((float4 *)&pDstBlock[rowStride + (threadIdx.x << 2)]) = make_float4(clampF(add00.x + add1.x, 0.0f, 255.0f), + clampF(add01.x + add1.x, 0.0f, 255.0f), + clampF(add00.y + add1.y, 0.0f, 255.0f), + clampF(add01.y + add1.y, 0.0f, 255.0f)); + *((float4 *)&pDstBlock[nextRowStride + (threadIdx.x << 2)]) = + make_float4(clampF(add02.x + add1.x, 0.0f, 255.0f), + clampF(add03.x + add1.x, 0.0f, 255.0f), + clampF(add02.y + add1.y, 0.0f, 255.0f), + clampF(add03.y + add1.y, 0.0f, 255.0f)); - int planeStride = nHeight * nRgbPitch >> 2; - // G - *((float4 *)&pDstBlock[planeStride + rowStride + (threadIdx.x << 2)]) = - make_float4(clampF(add00.x + add2.x, 0.0f, 255.0f), - clampF(add01.x + add2.x, 0.0f, 255.0f), - clampF(add00.y + add2.y, 0.0f, 255.0f), - clampF(add01.y + add2.y, 0.0f, 255.0f)); - *((float4 *)&pDstBlock[planeStride + nextRowStride + (threadIdx.x << 2)]) = - make_float4(clampF(add02.x + add2.x, 0.0f, 255.0f), - clampF(add03.x + add2.x, 0.0f, 255.0f), - clampF(add02.y + add2.y, 0.0f, 255.0f), - clampF(add03.y + add2.y, 0.0f, 255.0f)); + int planeStride = nHeight * nRgbPitch >> 2; + // G + *((float4 *)&pDstBlock[planeStride + rowStride + (threadIdx.x << 2)]) = + make_float4(clampF(add00.x + add2.x, 0.0f, 255.0f), + clampF(add01.x + add2.x, 0.0f, 255.0f), + clampF(add00.y + add2.y, 0.0f, 255.0f), + clampF(add01.y + add2.y, 0.0f, 255.0f)); + *((float4 *)&pDstBlock[planeStride + nextRowStride + (threadIdx.x << 2)]) = + make_float4(clampF(add02.x + add2.x, 0.0f, 255.0f), + clampF(add03.x + add2.x, 0.0f, 255.0f), + clampF(add02.y + add2.y, 0.0f, 255.0f), + clampF(add03.y + add2.y, 0.0f, 255.0f)); - // R - *((float4 - *)&pDstBlock[(planeStride << 1) + rowStride + (threadIdx.x << 2)]) = - make_float4(clampF(add00.x + add3.x, 0.0f, 255.0f), - clampF(add01.x + add3.x, 0.0f, 255.0f), - clampF(add00.y + add3.y, 0.0f, 255.0f), - clampF(add01.y + add3.y, 0.0f, 255.0f)); - *((float4 *)&pDstBlock[(planeStride << 1) + nextRowStride + - (threadIdx.x << 2)]) = - make_float4(clampF(add02.x + add3.x, 0.0f, 255.0f), - clampF(add03.x + add3.x, 0.0f, 255.0f), - clampF(add02.y + add3.y, 0.0f, 255.0f), - clampF(add03.y + add3.y, 0.0f, 255.0f)); - } + // R + *((float4 *)&pDstBlock[(planeStride << 1) + rowStride + (threadIdx.x << 2)]) = + make_float4(clampF(add00.x + add3.x, 0.0f, 255.0f), + clampF(add01.x + add3.x, 0.0f, 255.0f), + clampF(add00.y + add3.y, 0.0f, 255.0f), + clampF(add01.y + add3.y, 0.0f, 255.0f)); + *((float4 *)&pDstBlock[(planeStride << 1) + nextRowStride + (threadIdx.x << 2)]) = + make_float4(clampF(add02.x + add3.x, 0.0f, 255.0f), + clampF(add03.x + add3.x, 0.0f, 255.0f), + clampF(add02.y + add3.y, 0.0f, 255.0f), + clampF(add03.y + add3.y, 0.0f, 255.0f)); + } } -void nv12ToBGRplanarBatch(uint8_t *pNv12, int nNv12Pitch, float *pBgr, - int nRgbPitch, int nWidth, int nHeight, - int nBatchSize, cudaStream_t stream) { - dim3 threads(CONV_THREADS_X, CONV_THREADS_Y); +void nv12ToBGRplanarBatch(uint8_t *pNv12, + int nNv12Pitch, + float *pBgr, + int nRgbPitch, + int nWidth, + int nHeight, + int nBatchSize, + cudaStream_t stream) +{ + dim3 threads(CONV_THREADS_X, CONV_THREADS_Y); - size_t blockDimZ = nBatchSize; + size_t blockDimZ = nBatchSize; - // Restricting blocks in Z-dim till 32 to not launch too many blocks - blockDimZ = (blockDimZ > 32) 
? 32 : blockDimZ; + // Restrict blocks in Z-dim to 32 to avoid launching too many blocks + blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ; - dim3 blocks((nWidth / 4 - 1) / threads.x + 1, - (nHeight / 2 - 1) / threads.y + 1, blockDimZ); - nv12ToBGRplanarBatchKernel<<<blocks, threads, 0, stream>>>( - pNv12, nNv12Pitch, pBgr, nRgbPitch, nWidth, nHeight, nBatchSize); + dim3 blocks((nWidth / 4 - 1) / threads.x + 1, (nHeight / 2 - 1) / threads.y + 1, blockDimZ); + nv12ToBGRplanarBatchKernel<<<blocks, threads, 0, stream>>>( + pNv12, nNv12Pitch, pBgr, nRgbPitch, nWidth, nHeight, nBatchSize); } diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert.h b/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert.h index 769a693f..fc36688d 100644 --- a/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert.h +++ b/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert.h @@ -29,28 +29,45 @@ #ifndef __H_RESIZE_CONVERT__ #define __H_RESIZE_CONVERT__ -#include #include +#include // nv12 resize -extern "C" -void resizeNV12Batch( - uint8_t *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight, - uint8_t *dpDst, int nDstPitch, int nDstWidth, int nDstHeight, - int nBatchSize, cudaStream_t stream = 0); +extern "C" void resizeNV12Batch(uint8_t *dpSrc, + int nSrcPitch, + int nSrcWidth, + int nSrcHeight, + uint8_t *dpDst, + int nDstPitch, + int nDstWidth, + int nDstHeight, + int nBatchSize, + cudaStream_t stream = 0); // bgr resize -extern "C" -void resizeBGRplanarBatch( - float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight, - float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight, - int nBatchSize, cudaStream_t stream = 0, - int cropX = 0, int cropY = 0, int cropW = 0, int cropH = 0, - bool whSameResizeRatio = false); +extern "C" void resizeBGRplanarBatch(float *dpSrc, + int nSrcPitch, + int nSrcWidth, + int nSrcHeight, + float *dpDst, + int nDstPitch, + int nDstWidth, + int nDstHeight, + int nBatchSize, + cudaStream_t stream = 0, + int cropX = 0, + int cropY = 0, + int cropW = 0, + int cropH = 0, + bool whSameResizeRatio = false); -//NV12 to bgr planar -extern "C" -void nv12ToBGRplanarBatch(uint8_t *pNv12, int nNv12Pitch, - float *pRgb, int nRgbPitch, int nWidth, int nHeight, - int nBatchSize, cudaStream_t stream=0); +// NV12 to bgr planar +extern "C" void nv12ToBGRplanarBatch(uint8_t *pNv12, + int nNv12Pitch, + float *pRgb, + int nRgbPitch, + int nWidth, + int nHeight, + int nBatchSize, + cudaStream_t stream = 0); #endif
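The constants hard-coded in nv12ToBGRplanarBatchKernel above are the usual BT.601-style YUV-to-RGB coefficients. As a scalar reference of the per-pixel math the vectorized kernel performs (a sketch for clarity, not part of the sample; clamp255 and yuvToBgr are hypothetical names):

static inline float clamp255(float x) { return x < 0.0f ? 0.0f : (x > 255.0f ? 255.0f : x); }

static void yuvToBgr(unsigned char y, unsigned char u, unsigned char v, float *b, float *g, float *r)
{
    float luma = 1.1644f * y; // scaled luma; note the kernel applies no -16 offset to Y
    float d    = u - 128.0f;  // centered U (Cb)
    float e    = v - 128.0f;  // centered V (Cr)
    *b = clamp255(luma + 2.0172f * d);
    *g = clamp255(luma - 0.3918f * d - 0.8130f * e);
    *r = clamp255(luma + 1.5960f * e);
}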
diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert_main.cpp b/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert_main.cpp index 71368f2f..3352bd23 100644 --- a/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert_main.cpp +++ b/Samples/5_Domain_Specific/NV12toBGRandResize/resize_convert_main.cpp @@ -51,398 +51,424 @@ Run */ +#include #include #include - +#include +#include #include +#include #include #include #include -#include -#include -#include -#include #include "resize_convert.h" #include "utils.h" #define TEST_LOOP 20 -typedef struct _nv12_to_bgr24_context_t { - int width; - int height; - int pitch; +typedef struct _nv12_to_bgr24_context_t +{ + int width; + int height; + int pitch; - int dst_width; - int dst_height; - int dst_pitch; + int dst_width; + int dst_height; + int dst_pitch; - int batch; - int device; // cuda device ID + int batch; + int device; // cuda device ID - char *input_nv12_file; + char *input_nv12_file; - int ctx_pitch; // the value will be suitable for Texture memroy. - int ctx_heights; // the value will be even. + int ctx_pitch; // the value will be suitable for Texture memory. + int ctx_heights; // the value will be even. } nv12_to_bgr24_context; nv12_to_bgr24_context g_ctx; -static void printHelp(const char *app_name) { - std::cout << "Usage:" << app_name << " [options]\n\n"; - std::cout << "OPTIONS:\n"; - std::cout << "\t-h,--help\n\n"; - std::cout << "\t-input=nv12file nv12 input file\n"; - std::cout - << "\t-width=width input nv12 image width, <1 -- 4096>\n"; - std::cout - << "\t-height=height input nv12 image height, <1 -- 4096>\n"; - std::cout - << "\t-pitch=pitch(optional) input nv12 image pitch, <0 -- 4096>\n"; - std::cout - << "\t-dst_width=width output BGR image width, <1 -- 4096>\n"; - std::cout - << "\t-dst_height=height output BGR image height, <1 -- 4096>\n"; - std::cout - << "\t-dst_pitch=pitch(optional) output BGR image pitch, <0 -- 4096>\n"; - std::cout - << "\t-batch=batch process frames count, <1 -- 4096>\n\n"; - std::cout - << "\t-device=device_num(optional) cuda device number, <0 -- 4096>\n\n"; +static void printHelp(const char *app_name) +{ + std::cout << "Usage:" << app_name << " [options]\n\n"; + std::cout << "OPTIONS:\n"; + std::cout << "\t-h,--help\n\n"; + std::cout << "\t-input=nv12file nv12 input file\n"; + std::cout << "\t-width=width input nv12 image width, <1 -- 4096>\n"; + std::cout << "\t-height=height input nv12 image height, <1 -- 4096>\n"; + std::cout << "\t-pitch=pitch(optional) input nv12 image pitch, <0 -- 4096>\n"; + std::cout << "\t-dst_width=width output BGR image width, <1 -- 4096>\n"; + std::cout << "\t-dst_height=height output BGR image height, <1 -- 4096>\n"; + std::cout << "\t-dst_pitch=pitch(optional) output BGR image pitch, <0 -- 4096>\n"; + std::cout << "\t-batch=batch process frames count, <1 -- 4096>\n\n"; + std::cout << "\t-device=device_num(optional) cuda device number, <0 -- 4096>\n\n"; - return; + return; } -int parseCmdLine(int argc, char *argv[]) { - char **argp = (char **)argv; - char *arg = (char *)argv[0]; +int parseCmdLine(int argc, char *argv[]) +{ + char **argp = (char **)argv; + char *arg = (char *)argv[0]; - memset(&g_ctx, 0, sizeof(g_ctx)); + memset(&g_ctx, 0, sizeof(g_ctx)); - if ((arg && (!strcmp(arg, "-h") || !strcmp(arg, "--help")))) { - printHelp(argv[0]); - return -1; - } - - if (argc == 1) { - // Run using default arguments - - g_ctx.input_nv12_file = sdkFindFilePath("test1920x1080.nv12", argv[0]); - if (g_ctx.input_nv12_file == NULL) { - printf("Cannot find input file test1920x1080.nv12\n Exiting\n"); - return EXIT_FAILURE; - } - g_ctx.width = 1920; - g_ctx.height = 1080; - g_ctx.dst_width = 640; - g_ctx.dst_height = 480; - g_ctx.batch = 24; - } else if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "width")) { - g_ctx.width = getCmdLineArgumentInt(argc, (const char **)argv, "width"); + if ((arg && (!strcmp(arg, "-h") || !strcmp(arg, "--help")))) { + printHelp(argv[0]); + return -1; } - if (checkCmdLineFlag(argc, (const char **)argv, "height")) { - g_ctx.height = getCmdLineArgumentInt(argc, (const char **)argv, "height"); + if (argc == 1) { + // Run using default arguments + + g_ctx.input_nv12_file = sdkFindFilePath("test1920x1080.nv12", argv[0]); + if (g_ctx.input_nv12_file == NULL) { + printf("Cannot find input file test1920x1080.nv12\n Exiting\n"); + return EXIT_FAILURE; + } + g_ctx.width = 1920; + g_ctx.height = 1080; + g_ctx.dst_width = 640; + g_ctx.dst_height = 480; + g_ctx.batch = 24; + } + else if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "width")) { + g_ctx.width = 
getCmdLineArgumentInt(argc, (const char **)argv, "width"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "height")) { + g_ctx.height = getCmdLineArgumentInt(argc, (const char **)argv, "height"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "pitch")) { + g_ctx.pitch = getCmdLineArgumentInt(argc, (const char **)argv, "pitch"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "input")) { + getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&g_ctx.input_nv12_file); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "dst_width")) { + g_ctx.dst_width = getCmdLineArgumentInt(argc, (const char **)argv, "dst_width"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "dst_height")) { + g_ctx.dst_height = getCmdLineArgumentInt(argc, (const char **)argv, "dst_height"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "dst_pitch")) { + g_ctx.dst_pitch = getCmdLineArgumentInt(argc, (const char **)argv, "dst_pitch"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "batch")) { + g_ctx.batch = getCmdLineArgumentInt(argc, (const char **)argv, "batch"); + } } - if (checkCmdLineFlag(argc, (const char **)argv, "pitch")) { - g_ctx.pitch = getCmdLineArgumentInt(argc, (const char **)argv, "pitch"); + g_ctx.device = findCudaDevice(argc, (const char **)argv); + + if ((g_ctx.width == 0) || (g_ctx.height == 0) || (g_ctx.dst_width == 0) || (g_ctx.dst_height == 0) + || !g_ctx.input_nv12_file) { + printHelp(argv[0]); + return -1; } - if (checkCmdLineFlag(argc, (const char **)argv, "input")) { - getCmdLineArgumentString(argc, (const char **)argv, "input", - (char **)&g_ctx.input_nv12_file); - } + if (g_ctx.pitch == 0) + g_ctx.pitch = g_ctx.width; + if (g_ctx.dst_pitch == 0) + g_ctx.dst_pitch = g_ctx.dst_width; - if (checkCmdLineFlag(argc, (const char **)argv, "dst_width")) { - g_ctx.dst_width = - getCmdLineArgumentInt(argc, (const char **)argv, "dst_width"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "dst_height")) { - g_ctx.dst_height = - getCmdLineArgumentInt(argc, (const char **)argv, "dst_height"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "dst_pitch")) { - g_ctx.dst_pitch = - getCmdLineArgumentInt(argc, (const char **)argv, "dst_pitch"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "batch")) { - g_ctx.batch = getCmdLineArgumentInt(argc, (const char **)argv, "batch"); - } - } - - g_ctx.device = findCudaDevice(argc, (const char **)argv); - - if ((g_ctx.width == 0) || (g_ctx.height == 0) || (g_ctx.dst_width == 0) || - (g_ctx.dst_height == 0) || !g_ctx.input_nv12_file) { - printHelp(argv[0]); - return -1; - } - - if (g_ctx.pitch == 0) g_ctx.pitch = g_ctx.width; - if (g_ctx.dst_pitch == 0) g_ctx.dst_pitch = g_ctx.dst_width; - - return 0; + return 0; } /* load nv12 yuvfile data into GPU device memory with batch of copy */ -static int loadNV12Frame(unsigned char *d_inputNV12) { - unsigned char *pNV12FrameData; - unsigned char *d_nv12; - int frameSize; - std::ifstream nv12File(g_ctx.input_nv12_file, std::ifstream::in | std::ios::binary); +static int loadNV12Frame(unsigned char *d_inputNV12) +{ + unsigned char *pNV12FrameData; + unsigned char *d_nv12; + int frameSize; + std::ifstream nv12File(g_ctx.input_nv12_file, std::ifstream::in | std::ios::binary); - if (!nv12File.is_open()) { - std::cerr << "Can't open files\n"; - return -1; - } + if (!nv12File.is_open()) { + std::cerr << "Can't open files\n"; + return -1; + } - frameSize = g_ctx.pitch * g_ctx.ctx_heights; + frameSize = g_ctx.pitch * g_ctx.ctx_heights; #if 
USE_UVM_MEM - pNV12FrameData = d_inputNV12; + pNV12FrameData = d_inputNV12; #else - pNV12FrameData = (unsigned char *)malloc(frameSize); - if (pNV12FrameData == NULL) { - std::cerr << "Failed to malloc pNV12FrameData\n"; - return -1; - } + pNV12FrameData = (unsigned char *)malloc(frameSize); + if (pNV12FrameData == NULL) { + std::cerr << "Failed to malloc pNV12FrameData\n"; + return -1; + } #endif - nv12File.read((char *)pNV12FrameData, frameSize); + nv12File.read((char *)pNV12FrameData, frameSize); - if (nv12File.gcount() < frameSize) { - std::cerr << "can't get one frame!\n"; - return -1; - } + if (nv12File.gcount() < frameSize) { + std::cerr << "can't get one frame!\n"; + return -1; + } #if USE_UVM_MEM - // Prefetch to GPU for following GPU operation - cudaStreamAttachMemAsync(NULL, pNV12FrameData, 0, cudaMemAttachGlobal); + // Prefetch to GPU for following GPU operation + cudaStreamAttachMemAsync(NULL, pNV12FrameData, 0, cudaMemAttachGlobal); #endif - // expand one frame to multi frames for batch processing - d_nv12 = d_inputNV12; - for (int i = 0; i < g_ctx.batch; i++) { - checkCudaErrors(cudaMemcpy2D((void *)d_nv12, g_ctx.ctx_pitch, - pNV12FrameData, g_ctx.width, g_ctx.width, - g_ctx.ctx_heights, cudaMemcpyHostToDevice)); + // expand one frame to multi frames for batch processing + d_nv12 = d_inputNV12; + for (int i = 0; i < g_ctx.batch; i++) { + checkCudaErrors(cudaMemcpy2D((void *)d_nv12, + g_ctx.ctx_pitch, + pNV12FrameData, + g_ctx.width, + g_ctx.width, + g_ctx.ctx_heights, + cudaMemcpyHostToDevice)); - d_nv12 += g_ctx.ctx_pitch * g_ctx.ctx_heights; - } + d_nv12 += g_ctx.ctx_pitch * g_ctx.ctx_heights; + } #if (USE_UVM_MEM == 0) - free(pNV12FrameData); + free(pNV12FrameData); #endif - nv12File.close(); + nv12File.close(); - return 0; + return 0; } /* 1. resize interlace nv12 to target size 2. 
convert nv12 to bgr 3 progressive planars */ -void nv12ResizeAndNV12ToBGR(unsigned char *d_inputNV12) { - unsigned char *d_resizedNV12; - float *d_outputBGR; - int size; - char filename[40]; +void nv12ResizeAndNV12ToBGR(unsigned char *d_inputNV12) +{ + unsigned char *d_resizedNV12; + float *d_outputBGR; + int size; + char filename[40]; - /* allocate device memory for resized nv12 output */ - size = g_ctx.dst_width * ceil(g_ctx.dst_height * 3.0f / 2.0f) * g_ctx.batch * - sizeof(unsigned char); - checkCudaErrors(cudaMalloc((void **)&d_resizedNV12, size)); + /* allocate device memory for resized nv12 output */ + size = g_ctx.dst_width * ceil(g_ctx.dst_height * 3.0f / 2.0f) * g_ctx.batch * sizeof(unsigned char); + checkCudaErrors(cudaMalloc((void **)&d_resizedNV12, size)); - /* allocate device memory for bgr output */ - size = g_ctx.dst_pitch * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float); - checkCudaErrors(cudaMalloc((void **)&d_outputBGR, size)); + /* allocate device memory for bgr output */ + size = g_ctx.dst_pitch * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float); + checkCudaErrors(cudaMalloc((void **)&d_outputBGR, size)); - cudaStream_t stream; - checkCudaErrors(cudaStreamCreate(&stream)); - /* create cuda event handles */ - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - float elapsedTime = 0.0f; + cudaStream_t stream; + checkCudaErrors(cudaStreamCreate(&stream)); + /* create cuda event handles */ + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + float elapsedTime = 0.0f; - /* resize interlace nv12 */ + /* resize interlace nv12 */ - cudaEventRecord(start, 0); - for (int i = 0; i < TEST_LOOP; i++) { - resizeNV12Batch(d_inputNV12, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height, - d_resizedNV12, g_ctx.dst_width, g_ctx.dst_width, - g_ctx.dst_height, g_ctx.batch); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + cudaEventRecord(start, 0); + for (int i = 0; i < TEST_LOOP; i++) { + resizeNV12Batch(d_inputNV12, + g_ctx.ctx_pitch, + g_ctx.width, + g_ctx.height, + d_resizedNV12, + g_ctx.dst_width, + g_ctx.dst_width, + g_ctx.dst_height, + g_ctx.batch); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf( - " CUDA resize nv12(%dx%d --> %dx%d), batch: %d," - " average time: %.3f ms ==> %.3f ms/frame\n", - g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, - (elapsedTime / (TEST_LOOP * 1.0f)), - (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf(" CUDA resize nv12(%dx%d --> %dx%d), batch: %d," + " average time: %.3f ms ==> %.3f ms/frame\n", + g_ctx.width, + g_ctx.height, + g_ctx.dst_width, + g_ctx.dst_height, + g_ctx.batch, + (elapsedTime / (TEST_LOOP * 1.0f)), + (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); - sprintf(filename, "resized_nv12_%dx%d", g_ctx.dst_width, g_ctx.dst_height); + sprintf(filename, "resized_nv12_%dx%d", g_ctx.dst_width, g_ctx.dst_height); - /* convert nv12 to bgr 3 progressive planars */ - cudaEventRecord(start, 0); - for (int i = 0; i < TEST_LOOP; i++) { - nv12ToBGRplanarBatch(d_resizedNV12, g_ctx.dst_pitch, // intput - d_outputBGR, - g_ctx.dst_pitch * sizeof(float), // output - g_ctx.dst_width, g_ctx.dst_height, // output - g_ctx.batch, 0); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + /* convert nv12 to bgr 3 progressive planars */ + 
cudaEventRecord(start, 0); + for (int i = 0; i < TEST_LOOP; i++) { + nv12ToBGRplanarBatch(d_resizedNV12, + g_ctx.dst_pitch, // input + d_outputBGR, + g_ctx.dst_pitch * sizeof(float), // output + g_ctx.dst_width, + g_ctx.dst_height, // output + g_ctx.batch, + 0); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsedTime, start, stop); + cudaEventElapsedTime(&elapsedTime, start, stop); - printf( - " CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d," - " average time: %.3f ms ==> %.3f ms/frame\n", - g_ctx.dst_width, g_ctx.dst_height, g_ctx.dst_width, g_ctx.dst_height, - g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)), - (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); + printf(" CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d," + " average time: %.3f ms ==> %.3f ms/frame\n", + g_ctx.dst_width, + g_ctx.dst_height, + g_ctx.dst_width, + g_ctx.dst_height, + g_ctx.batch, + (elapsedTime / (TEST_LOOP * 1.0f)), + (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); - sprintf(filename, "converted_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height); - dumpBGR(d_outputBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height, - g_ctx.batch, (char *)"t1", filename); + sprintf(filename, "converted_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height); + dumpBGR(d_outputBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (char *)"t1", filename); - /* release resources */ - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaFree(d_resizedNV12)); - checkCudaErrors(cudaFree(d_outputBGR)); + /* release resources */ + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaFree(d_resizedNV12)); + checkCudaErrors(cudaFree(d_outputBGR)); }
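Both test paths in this file time their kernels the same way. Reduced to its essentials, the pattern looks like the sketch below, where timedCall is a hypothetical stand-in for a wrapper around resizeNV12Batch or nv12ToBGRplanarBatch:

static void timeBatchedCall(void (*timedCall)(void), int loops, int batch)
{
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaEventRecord(start, 0));
    for (int i = 0; i < loops; i++)
        timedCall();
    checkCudaErrors(cudaEventRecord(stop, 0));
    // events are recorded asynchronously; wait on the stop event before reading the timer
    checkCudaErrors(cudaEventSynchronize(stop));
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));
    printf("average time: %.3f ms ==> %.3f ms/frame\n", ms / loops, ms / loops / batch);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
}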
/* 1. convert nv12 to bgr 3 progressive planars 2. resize bgr 3 planars to target size */ -void nv12ToBGRandBGRresize(unsigned char *d_inputNV12) { - float *d_bgr; - float *d_resizedBGR; - int size; - char filename[40]; +void nv12ToBGRandBGRresize(unsigned char *d_inputNV12) +{ + float *d_bgr; + float *d_resizedBGR; + int size; + char filename[40]; - /* allocate device memory for bgr output */ - size = g_ctx.ctx_pitch * g_ctx.height * 3 * g_ctx.batch * sizeof(float); - checkCudaErrors(cudaMalloc((void **)&d_bgr, size)); + /* allocate device memory for bgr output */ + size = g_ctx.ctx_pitch * g_ctx.height * 3 * g_ctx.batch * sizeof(float); + checkCudaErrors(cudaMalloc((void **)&d_bgr, size)); - /* allocate device memory for resized bgr output */ - size = g_ctx.dst_width * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float); - checkCudaErrors(cudaMalloc((void **)&d_resizedBGR, size)); + /* allocate device memory for resized bgr output */ + size = g_ctx.dst_width * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float); + checkCudaErrors(cudaMalloc((void **)&d_resizedBGR, size)); - cudaStream_t stream; - checkCudaErrors(cudaStreamCreate(&stream)); - /* create cuda event handles */ - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - float elapsedTime = 0.0f; + cudaStream_t stream; + checkCudaErrors(cudaStreamCreate(&stream)); + /* create cuda event handles */ + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + float elapsedTime = 0.0f; - /* convert interlace nv12 to bgr 3 progressive planars */ - cudaEventRecord(start, 0); - cudaDeviceSynchronize(); - for (int i = 0; i < TEST_LOOP; i++) { - nv12ToBGRplanarBatch(d_inputNV12, g_ctx.ctx_pitch, d_bgr, - g_ctx.ctx_pitch * sizeof(float), g_ctx.width, - g_ctx.height, g_ctx.batch, 0); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + /* convert interlace nv12 to bgr 3 progressive planars */ + cudaEventRecord(start, 0); + cudaDeviceSynchronize(); + for (int i = 0; i < TEST_LOOP; i++) { + nv12ToBGRplanarBatch(d_inputNV12, + g_ctx.ctx_pitch, + d_bgr, + g_ctx.ctx_pitch * sizeof(float), + g_ctx.width, + g_ctx.height, + g_ctx.batch, + 0); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf( - " CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d," - " average time: %.3f ms ==> %.3f ms/frame\n", - g_ctx.width, g_ctx.height, g_ctx.width, g_ctx.height, g_ctx.batch, - (elapsedTime / (TEST_LOOP * 1.0f)), - (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf(" CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d," + " average time: %.3f ms ==> %.3f ms/frame\n", + g_ctx.width, + g_ctx.height, + g_ctx.width, + g_ctx.height, + g_ctx.batch, + (elapsedTime / (TEST_LOOP * 1.0f)), + (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); - sprintf(filename, "converted_bgr_%dx%d", g_ctx.width, g_ctx.height); + sprintf(filename, "converted_bgr_%dx%d", g_ctx.width, g_ctx.height); - /* resize bgr 3 progressive planars */ - cudaEventRecord(start, 0); - for (int i = 0; i < TEST_LOOP; i++) { - resizeBGRplanarBatch(d_bgr, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height, - d_resizedBGR, g_ctx.dst_width, g_ctx.dst_width, - g_ctx.dst_height, g_ctx.batch); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); + /* resize bgr 3 progressive planars */ + cudaEventRecord(start, 0); + for (int i = 0; i < TEST_LOOP; i++) { + resizeBGRplanarBatch(d_bgr, + g_ctx.ctx_pitch, + 
g_ctx.width, + g_ctx.height, + d_resizedBGR, + g_ctx.dst_width, + g_ctx.dst_width, + g_ctx.dst_height, + g_ctx.batch); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsedTime, start, stop); - printf( - " CUDA resize bgr(%dx%d --> %dx%d), batch: %d," - " average time: %.3f ms ==> %.3f ms/frame\n", - g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, - (elapsedTime / (TEST_LOOP * 1.0f)), - (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); + cudaEventElapsedTime(&elapsedTime, start, stop); + printf(" CUDA resize bgr(%dx%d --> %dx%d), batch: %d," + " average time: %.3f ms ==> %.3f ms/frame\n", + g_ctx.width, + g_ctx.height, + g_ctx.dst_width, + g_ctx.dst_height, + g_ctx.batch, + (elapsedTime / (TEST_LOOP * 1.0f)), + (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch); - memset(filename, 0, sizeof(filename)); - sprintf(filename, "resized_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height); - dumpBGR(d_resizedBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height, - g_ctx.batch, (char *)"t2", filename); + memset(filename, 0, sizeof(filename)); + sprintf(filename, "resized_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height); + dumpBGR(d_resizedBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch, (char *)"t2", filename); - /* release resources */ - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); - checkCudaErrors(cudaStreamDestroy(stream)); - checkCudaErrors(cudaFree(d_bgr)); - checkCudaErrors(cudaFree(d_resizedBGR)); + /* release resources */ + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaFree(d_bgr)); + checkCudaErrors(cudaFree(d_resizedBGR)); } -int main(int argc, char *argv[]) { - unsigned char *d_inputNV12; +int main(int argc, char *argv[]) +{ + unsigned char *d_inputNV12; - if (parseCmdLine(argc, argv) < 0) return EXIT_FAILURE; + if (parseCmdLine(argc, argv) < 0) + return EXIT_FAILURE; - g_ctx.ctx_pitch = g_ctx.width; - int ctx_alignment = 32; - g_ctx.ctx_pitch += (g_ctx.ctx_pitch % ctx_alignment != 0) - ? (ctx_alignment - g_ctx.ctx_pitch % ctx_alignment) - : 0; + g_ctx.ctx_pitch = g_ctx.width; + int ctx_alignment = 32; + g_ctx.ctx_pitch += (g_ctx.ctx_pitch % ctx_alignment != 0) ? 
(ctx_alignment - g_ctx.ctx_pitch % ctx_alignment) : 0; - g_ctx.ctx_heights = ceil(g_ctx.height * 3.0f / 2.0f); + g_ctx.ctx_heights = ceil(g_ctx.height * 3.0f / 2.0f); - /* load nv12 yuv data into d_inputNV12 with batch of copies */ + /* load nv12 yuv data into d_inputNV12 with batch of copies */ #if USE_UVM_MEM - checkCudaErrors(cudaMallocManaged( - (void **)&d_inputNV12, - (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch), cudaMemAttachHost)); - printf("\nUSE_UVM_MEM\n"); + checkCudaErrors(cudaMallocManaged( + (void **)&d_inputNV12, (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch), cudaMemAttachHost)); + printf("\nUSE_UVM_MEM\n"); #else - checkCudaErrors( - cudaMalloc((void **)&d_inputNV12, - (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch))); + checkCudaErrors(cudaMalloc((void **)&d_inputNV12, (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch))); #endif - if (loadNV12Frame(d_inputNV12)) { - std::cerr << "failed to load batch data!\n"; - return EXIT_FAILURE; - } + if (loadNV12Frame(d_inputNV12)) { + std::cerr << "failed to load batch data!\n"; + return EXIT_FAILURE; + } - /* firstly resize nv12, then convert nv12 to bgr */ - printf("\nTEST#1:\n"); - nv12ResizeAndNV12ToBGR(d_inputNV12); + /* firstly resize nv12, then convert nv12 to bgr */ + printf("\nTEST#1:\n"); + nv12ResizeAndNV12ToBGR(d_inputNV12); - /* first convert nv12 to bgr, then resize bgr */ - printf("\nTEST#2:\n"); - nv12ToBGRandBGRresize(d_inputNV12); + /* first convert nv12 to bgr, then resize bgr */ + printf("\nTEST#2:\n"); + nv12ToBGRandBGRresize(d_inputNV12); - checkCudaErrors(cudaFree(d_inputNV12)); + checkCudaErrors(cudaFree(d_inputNV12)); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/utils.cu b/Samples/5_Domain_Specific/NV12toBGRandResize/utils.cu index 5c305bab..00b72db0 100644 --- a/Samples/5_Domain_Specific/NV12toBGRandResize/utils.cu +++ b/Samples/5_Domain_Specific/NV12toBGRandResize/utils.cu @@ -25,140 +25,124 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include +#include +#include +#include #include #include #include -#include -#include - -#include -#include #include "resize_convert.h" #include "utils.h" -__global__ void floatToChar(float *src, unsigned char *dst, int height, - int width, int batchSize) +__global__ void floatToChar(float *src, unsigned char *dst, int height, int width, int batchSize) { - int x = threadIdx.x + blockIdx.x * blockDim.x; + int x = threadIdx.x + blockIdx.x * blockDim.x; - if (x >= height * width) - return; + if (x >= height * width) + return; - int offset = height * width * 3; + int offset = height * width * 3; - for (int j = 0; j < batchSize; j++) - { - // b - *(dst + j * offset + x * 3 + 0) = - (unsigned char)*(src + j * offset + height * width * 0 + x); - // g - *(dst + j * offset + x * 3 + 1) = - (unsigned char)*(src + j * offset + height * width * 1 + x); - // r - *(dst + j * offset + x * 3 + 2) = - (unsigned char)*(src + j * offset + height * width * 2 + x); - } + for (int j = 0; j < batchSize; j++) { + // b + *(dst + j * offset + x * 3 + 0) = (unsigned char)*(src + j * offset + height * width * 0 + x); + // g + *(dst + j * offset + x * 3 + 1) = (unsigned char)*(src + j * offset + height * width * 1 + x); + // r + *(dst + j * offset + x * 3 + 2) = (unsigned char)*(src + j * offset + height * width * 2 + x); + } } -void floatPlanarToChar(float *src, unsigned char *dst, int height, int width, - int batchSize) +void floatPlanarToChar(float *src, unsigned char *dst, int height, int width, int batchSize) { - floatToChar<<<(height * width - 1) / 1024 + 1, 1024, 0, NULL>>>( - src, dst, height, width, batchSize); + floatToChar<<<(height * width - 1) / 1024 + 1, 1024, 0, NULL>>>(src, dst, height, width, batchSize); } -void dumpRawBGR(float *d_srcBGR, int pitch, int width, int height, - int batchSize, char *folder, char *tag) +void dumpRawBGR(float *d_srcBGR, int pitch, int width, int height, int batchSize, char *folder, char *tag) { - float *bgr, *d_bgr; - int frameSize; - char directory[120]; - char mkdir_cmd[256]; + float *bgr, *d_bgr; + int frameSize; + char directory[120]; + char mkdir_cmd[256]; #if !defined(_WIN32) - sprintf(directory, "output/%s", folder); - sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory); + sprintf(directory, "output/%s", folder); + sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory); #else - sprintf(directory, "output\\%s", folder); - sprintf(mkdir_cmd, "mkdir %s 2> nul", directory); + sprintf(directory, "output\\%s", folder); + sprintf(mkdir_cmd, "mkdir %s 2> nul", directory); #endif - int ret = system(mkdir_cmd); + int ret = system(mkdir_cmd); - frameSize = width * height * 3 * sizeof(float); - bgr = (float *)malloc(frameSize); - if (bgr == NULL) - { - std::cerr << "Failed malloc for bgr\n"; - return; - } - - d_bgr = d_srcBGR; - for (int i = 0; i < batchSize; i++) - { - char filename[256]; - std::ofstream *outputFile; - - checkCudaErrors(cudaMemcpy((void *)bgr, (void *)d_bgr, frameSize, - cudaMemcpyDeviceToHost)); - snprintf(filename, sizeof(filename), "%s/%s_%d.raw", directory, tag, (i + 1)); - - outputFile = new std::ofstream(filename); - if (outputFile) - { - outputFile->write((char *)bgr, frameSize); - delete outputFile; + frameSize = width * height * 3 * sizeof(float); + bgr = (float *)malloc(frameSize); + if (bgr == NULL) { + std::cerr << "Failed malloc for bgr\n"; + return; } - d_bgr += pitch * height * 3; - } + d_bgr = d_srcBGR; + for (int i = 0; i < batchSize; i++) { + char filename[256]; + std::ofstream *outputFile; - free(bgr); + 
checkCudaErrors(cudaMemcpy((void *)bgr, (void *)d_bgr, frameSize, cudaMemcpyDeviceToHost)); + snprintf(filename, sizeof(filename), "%s/%s_%d.raw", directory, tag, (i + 1)); + + outputFile = new std::ofstream(filename); + if (outputFile) { + outputFile->write((char *)bgr, frameSize); + delete outputFile; + } + + d_bgr += pitch * height * 3; + } + + free(bgr); } -void dumpBGR(float *d_srcBGR, int pitch, int width, int height, int batchSize, - char *folder, char *tag) +void dumpBGR(float *d_srcBGR, int pitch, int width, int height, int batchSize, char *folder, char *tag) { - dumpRawBGR(d_srcBGR, pitch, width, height, batchSize, folder, tag); + dumpRawBGR(d_srcBGR, pitch, width, height, batchSize, folder, tag); } void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag) { - unsigned char *nv12Data; - std::ofstream *nv12File; - char filename[256]; - char directory[60]; - char mkdir_cmd[256]; + unsigned char *nv12Data; + std::ofstream *nv12File; + char filename[256]; + char directory[60]; + char mkdir_cmd[256]; #if !defined(_WIN32) - sprintf(directory, "output/%s", folder); - sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory); + sprintf(directory, "output/%s", folder); + sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory); #else - sprintf(directory, "output\\%s", folder); - sprintf(mkdir_cmd, "mkdir %s 2> nul", directory); + sprintf(directory, "output\\%s", folder); + sprintf(mkdir_cmd, "mkdir %s 2> nul", directory); #endif - int ret = system(mkdir_cmd); + int ret = system(mkdir_cmd); - snprintf(filename, sizeof(filename), "%s/%s.nv12", directory, tag); + snprintf(filename, sizeof(filename), "%s/%s.nv12", directory, tag); - nv12File = new std::ofstream(filename); - if (nv12File == NULL) - { - std::cerr << "Failed to new " << filename; - return; - } + nv12File = new std::ofstream(filename); + if (nv12File == NULL) { + std::cerr << "Failed to create " << filename; + return; + } - nv12Data = (unsigned char *)malloc(size * (sizeof(char))); - if (nv12Data == NULL) - { - std::cerr << "Failed to allcoate memory\n"; - return; - } + nv12Data = (unsigned char *)malloc(size * (sizeof(char))); + if (nv12Data == NULL) { + std::cerr << "Failed to allocate memory\n"; + return; + } - cudaMemcpy((void *)nv12Data, (void *)d_nv12, size, cudaMemcpyDeviceToHost); + cudaMemcpy((void *)nv12Data, (void *)d_nv12, size, cudaMemcpyDeviceToHost); - nv12File->write((const char *)nv12Data, size); + nv12File->write((const char *)nv12Data, size); - free(nv12Data); - delete nv12File; + free(nv12Data); + delete nv12File; } diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/utils.h b/Samples/5_Domain_Specific/NV12toBGRandResize/utils.h index a19d119b..3e36f212 100644 --- a/Samples/5_Domain_Specific/NV12toBGRandResize/utils.h +++ b/Samples/5_Domain_Specific/NV12toBGRandResize/utils.h @@ -29,9 +29,6 @@ #ifndef __H_UTIL_ #define __H_UTIL_ -extern "C" -void dumpBGR(float *d_srcBGR, int pitch, int width, int height, - int batchSize, char *folder, char *tag); -extern "C" -void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag); +extern "C" void dumpBGR(float *d_srcBGR, int pitch, int width, int height, int batchSize, char *folder, char *tag); +extern "C" void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag); #endif
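A quick size note for the dump helpers declared above: an NV12 frame of width x height with a given row pitch occupies pitch * ceil(height * 3 / 2) bytes, because the interleaved UV plane is half the height of the luma plane. For the sample's default 1920x1080 input that is 1920 * 1620 bytes per frame. A sketch with illustrative values only (assumes <math.h> for ceilf; d_nv12 is a device pointer; dumpFirstFrame is a hypothetical caller):

static void dumpFirstFrame(unsigned char *d_nv12)
{
    int width = 1920, height = 1080;
    int pitch = width; // unpadded here; the sample rounds its pitch up to a multiple of 32
    int nv12Bytes = pitch * (int)ceilf(height * 3.0f / 2.0f); // 1080 luma rows + 540 UV rows = 1620
    dumpYUV(d_nv12, nv12Bytes, (char *)"out", (char *)"frame0");
}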
diff --git a/Samples/5_Domain_Specific/README.md b/Samples/5_Domain_Specific/README.md index 8e9fc456..6ad1ea7d 100644 --- a/Samples/5_Domain_Specific/README.md +++ b/Samples/5_Domain_Specific/README.md @@ -18,7 +18,7 @@ This sample evaluates fair call and put prices for a given set of European optio ### [BlackScholes_nvrtc](./BlackScholes_nvrtc) This sample evaluates fair call and put prices for a given set of European options by Black-Scholes formula, compiling the CUDA kernels involved at runtime using NVRTC. - + ### [convolutionFFT2D](./convolutionFFT2D) This sample demonstrates how 2D convolutions with very large kernel sizes can be efficiently implemented using FFT transformations. @@ -152,4 +152,3 @@ This sample demonstrates basic volume rendering using 3D Textures. ### [vulkanImageCUDA](./vulkanImageCUDA) This sample demonstrates Vulkan Image - CUDA Interop. CUDA imports the Vulkan image buffer, performs box filtering over it, and synchronizes with Vulkan through vulkan semaphores imported by CUDA. This sample depends on Vulkan SDK, GLFW3 libraries, for building this sample please refer to "Build_instructions.txt" provided in this sample's directory - diff --git a/Samples/5_Domain_Specific/SobelFilter/README.md b/Samples/5_Domain_Specific/SobelFilter/README.md index 7d129afb..8f97c6f1 100644 --- a/Samples/5_Domain_Specific/SobelFilter/README.md +++ b/Samples/5_Domain_Specific/SobelFilter/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/5_Domain_Specific/SobelFilter/SobelFilter.cpp b/Samples/5_Domain_Specific/SobelFilter/SobelFilter.cpp index 96e8c1f2..c89bbf5d 100644 --- a/Samples/5_Domain_Specific/SobelFilter/SobelFilter.cpp +++ b/Samples/5_Domain_Specific/SobelFilter/SobelFilter.cpp @@ -38,22 +38,21 @@ #endif // CUDA utilities and system includes -#include #include +#include // Includes -#include #include +#include #include #include "SobelFilter_kernels.h" // includes, project -#include <helper_functions.h> // includes for SDK helper functions -#include <helper_cuda.h> // includes for cuda initialization and error checking +#include <helper_cuda.h> // includes for cuda initialization and error checking +#include <helper_functions.h> // includes for SDK helper functions -const char *filterMode[] = {"No Filtering", "Sobel Texture", - "Sobel SMEM+Texture", NULL}; +const char *filterMode[] = {"No Filtering", "Sobel Texture", "Sobel SMEM+Texture", NULL}; // // Cuda example code that implements the Sobel edge detection @@ -70,38 +69,37 @@ void cleanup(void); void initializeData(char *file); #define MAX_EPSILON_ERROR 5.0f -#define REFRESH_DELAY 10 // ms +#define REFRESH_DELAY 10 // ms const char *sSDKsample = "CUDA Sobel Edge-Detection"; -static int wWidth = 512; // Window width -static int wHeight = 512; // Window height -static int imWidth = 0; // Image width +static int wWidth = 512; // Window width +static int wHeight = 512; // Window height +static int imWidth = 0; // Image width static int imHeight = 0; // Image height // Code to handle Auto verification -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 8; // FPS limit for sampling -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; -StopWatchInterface *timer = NULL; -unsigned int g_Bpp; -unsigned int g_Index = 0; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 8; // FPS limit for sampling +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; +StopWatchInterface *timer = NULL; +unsigned int g_Bpp; +unsigned int g_Index = 0; bool g_bQAReadback = false; // Display Data -static GLuint pbo_buffer = 0; // Front and back CA buffers -struct 
cudaGraphicsResource - *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) +static GLuint pbo_buffer = 0; // Front and back CA buffers +struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) -static GLuint texid = 0; // Texture for display -unsigned char *pixels = NULL; // Image pixel data on the host -float imageScale = 1.f; // Image exposure +static GLuint texid = 0; // Texture for display +unsigned char *pixels = NULL; // Image pixel data on the host +float imageScale = 1.f; // Image exposure enum SobelDisplayMode g_SobelDisplayMode; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; extern "C" void runAutoTest(int argc, char **argv); @@ -109,376 +107,381 @@ extern "C" void runAutoTest(int argc, char **argv); #define OFFSET(i) ((char *)NULL + (i)) #define MAX(a, b) ((a > b) ? a : b) -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "CUDA Edge Detection (%s): %3.1f fps", - filterMode[g_SobelDisplayMode], ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "CUDA Edge Detection (%s): %3.1f fps", filterMode[g_SobelDisplayMode], ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - sdkResetTimer(&timer); - } + sdkResetTimer(&timer); + } } // This is the normal display path -void display(void) { - sdkStartTimer(&timer); +void display(void) +{ + sdkStartTimer(&timer); - // Sobel operation - Pixel *data = NULL; + // Sobel operation + Pixel *data = NULL; - // map PBO to get CUDA device pointer - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&data, &num_bytes, cuda_pbo_resource)); - // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); + // map PBO to get CUDA device pointer + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&data, &num_bytes, cuda_pbo_resource)); + // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); - sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + sobelFilter(data, imWidth, imHeight, g_SobelDisplayMode, imageScale); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - glClear(GL_COLOR_BUFFER_BIT); + glClear(GL_COLOR_BUFFER_BIT); - glBindTexture(GL_TEXTURE_2D, texid); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, - GL_UNSIGNED_BYTE, OFFSET(0)); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + glBindTexture(GL_TEXTURE_2D, texid); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, OFFSET(0)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - glDisable(GL_DEPTH_TEST); - glEnable(GL_TEXTURE_2D); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); + glDisable(GL_DEPTH_TEST); + glEnable(GL_TEXTURE_2D); + 
glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); + glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glBegin(GL_QUADS); - glVertex2f(0, 0); - glTexCoord2f(0, 0); - glVertex2f(0, 1); - glTexCoord2f(1, 0); - glVertex2f(1, 1); - glTexCoord2f(1, 1); - glVertex2f(1, 0); - glTexCoord2f(0, 1); - glEnd(); - glBindTexture(GL_TEXTURE_2D, 0); - glutSwapBuffers(); + glBegin(GL_QUADS); + glVertex2f(0, 0); + glTexCoord2f(0, 0); + glVertex2f(0, 1); + glTexCoord2f(1, 0); + glVertex2f(1, 1); + glTexCoord2f(1, 1); + glVertex2f(1, 0); + glTexCoord2f(0, 1); + glEnd(); + glBindTexture(GL_TEXTURE_2D, 0); + glutSwapBuffers(); - sdkStopTimer(&timer); + sdkStopTimer(&timer); - computeFPS(); + computeFPS(); } -void timerEvent(int value) { - if (glutGetWindow()) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } } -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - char temp[256]; +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + char temp[256]; - switch (key) { + switch (key) { case 27: case 'q': case 'Q': - printf("Shutting down...\n"); + printf("Shutting down...\n"); #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case '-': - imageScale -= 0.1f; - printf("brightness = %4.2f\n", imageScale); - break; + imageScale -= 0.1f; + printf("brightness = %4.2f\n", imageScale); + break; case '=': - imageScale += 0.1f; - printf("brightness = %4.2f\n", imageScale); - break; + imageScale += 0.1f; + printf("brightness = %4.2f\n", imageScale); + break; case 'i': case 'I': - g_SobelDisplayMode = SOBELDISPLAY_IMAGE; - sprintf(temp, "CUDA Edge Detection (%s)", filterMode[g_SobelDisplayMode]); - glutSetWindowTitle(temp); - break; + g_SobelDisplayMode = SOBELDISPLAY_IMAGE; + sprintf(temp, "CUDA Edge Detection (%s)", filterMode[g_SobelDisplayMode]); + glutSetWindowTitle(temp); + break; case 's': case 'S': - g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; - sprintf(temp, "CUDA Edge Detection (%s)", filterMode[g_SobelDisplayMode]); - glutSetWindowTitle(temp); - break; + g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; + sprintf(temp, "CUDA Edge Detection (%s)", filterMode[g_SobelDisplayMode]); + glutSetWindowTitle(temp); + break; case 't': case 'T': - g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; - sprintf(temp, "CUDA Edge Detection (%s)", filterMode[g_SobelDisplayMode]); - glutSetWindowTitle(temp); - break; + g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; + sprintf(temp, "CUDA Edge Detection (%s)", filterMode[g_SobelDisplayMode]); + glutSetWindowTitle(temp); + break; default: - break; - } + break; + } } -void reshape(int x, int y) { - glViewport(0, 0, x, y); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0, 1, 0, 1, 0, 1); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0, 1, 0, 1, 0, 1); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); } -void cleanup(void) { - cudaGraphicsUnregisterResource(cuda_pbo_resource); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - glDeleteBuffers(1, &pbo_buffer); - glDeleteTextures(1, 
&texid); - deleteTexture(); - - sdkDeleteTimer(&timer); -} - -void initializeData(char *file) { - GLint bsize; - unsigned int w, h; - size_t file_length = strlen(file); - - if (!strcmp(&file[file_length - 3], "pgm")) { - if (sdkLoadPGM(file, &pixels, &w, &h) != true) { - printf("Failed to load PGM image file: %s\n", file); - exit(EXIT_FAILURE); - } - - g_Bpp = 1; - } else if (!strcmp(&file[file_length - 3], "ppm")) { - if (sdkLoadPPM4(file, &pixels, &w, &h) != true) { - printf("Failed to load PPM image file: %s\n", file); - exit(EXIT_FAILURE); - } - - g_Bpp = 4; - } else { - exit(EXIT_FAILURE); - } - - imWidth = (int)w; - imHeight = (int)h; - setupTexture(imWidth, imHeight, pixels, g_Bpp); - - memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); - - if (!g_bQAReadback) { - // use OpenGL Path - glGenBuffers(1, &pbo_buffer); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); - glBufferData(GL_PIXEL_UNPACK_BUFFER, - g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, - GL_STREAM_DRAW); - - glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); - - if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { - printf("Buffer object (%d) has incorrect size (%d).\n", - (unsigned)pbo_buffer, (unsigned)bsize); - - exit(EXIT_FAILURE); - } +void cleanup(void) +{ + cudaGraphicsUnregisterResource(cuda_pbo_resource); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + glDeleteBuffers(1, &pbo_buffer); + glDeleteTextures(1, &texid); + deleteTexture(); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); - - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - glTexImage2D(GL_TEXTURE_2D, 0, ((g_Bpp == 1) ? GL_LUMINANCE : GL_BGRA), - imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL); - glBindTexture(GL_TEXTURE_2D, 0); - - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glPixelStorei(GL_PACK_ALIGNMENT, 1); - } + sdkDeleteTimer(&timer); } -void loadDefaultImage(char *loc_exec) { - printf("Reading image: teapot.pgm\n"); - const char *image_filename = "teapot.pgm"; - char *image_path = sdkFindFilePath(image_filename, loc_exec); +void initializeData(char *file) +{ + GLint bsize; + unsigned int w, h; + size_t file_length = strlen(file); - if (image_path == NULL) { - printf("Failed to read image file: <%s>\n", image_filename); - exit(EXIT_FAILURE); - } + if (!strcmp(&file[file_length - 3], "pgm")) { + if (sdkLoadPGM(file, &pixels, &w, &h) != true) { + printf("Failed to load PGM image file: %s\n", file); + exit(EXIT_FAILURE); + } - initializeData(image_path); - free(image_path); + g_Bpp = 1; + } + else if (!strcmp(&file[file_length - 3], "ppm")) { + if (sdkLoadPPM4(file, &pixels, &w, &h) != true) { + printf("Failed to load PPM image file: %s\n", file); + exit(EXIT_FAILURE); + } + + g_Bpp = 4; + } + else { + exit(EXIT_FAILURE); + } + + imWidth = (int)w; + imHeight = (int)h; + setupTexture(imWidth, imHeight, pixels, g_Bpp); + + memset(pixels, 0x0, g_Bpp * sizeof(Pixel) * imWidth * imHeight); + + if (!g_bQAReadback) { + // use OpenGL Path + glGenBuffers(1, &pbo_buffer); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer); + glBufferData(GL_PIXEL_UNPACK_BUFFER, g_Bpp * sizeof(Pixel) * imWidth * imHeight, pixels, GL_STREAM_DRAW); + + glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &bsize); + + if ((GLuint)bsize != (g_Bpp * sizeof(Pixel) * imWidth * imHeight)) { + printf("Buffer object (%d) has incorrect size (%d).\n", (unsigned)pbo_buffer, 
(unsigned)bsize); + + exit(EXIT_FAILURE); + } + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo_buffer, cudaGraphicsMapFlagsWriteDiscard)); + + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, + 0, + ((g_Bpp == 1) ? GL_LUMINANCE : GL_BGRA), + imWidth, + imHeight, + 0, + GL_LUMINANCE, + GL_UNSIGNED_BYTE, + NULL); + glBindTexture(GL_TEXTURE_2D, 0); + + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glPixelStorei(GL_PACK_ALIGNMENT, 1); + } } -void initGL(int *argc, char **argv) { - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(wWidth, wHeight); - glutCreateWindow("CUDA Edge Detection"); +void loadDefaultImage(char *loc_exec) +{ + printf("Reading image: teapot.pgm\n"); + const char *image_filename = "teapot.pgm"; + char *image_path = sdkFindFilePath(image_filename, loc_exec); - if (!isGLVersionSupported(1, 5) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); - fprintf(stderr, "This sample requires:\n"); - fprintf(stderr, " OpenGL version 1.5\n"); - fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); - fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); - exit(EXIT_FAILURE); - } + if (image_path == NULL) { + printf("Failed to read image file: <%s>\n", image_filename); + exit(EXIT_FAILURE); + } + + initializeData(image_path); + free(image_path); } -void runAutoTest(int argc, char *argv[]) { - printf("[%s] (automated testing w/ readback)\n", sSDKsample); - int devID = findCudaDevice(argc, (const char **)argv); +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(wWidth, wHeight); + glutCreateWindow("CUDA Edge Detection"); - loadDefaultImage(argv[0]); + if (!isGLVersionSupported(1, 5) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Error: failed to get minimal extensions for demo\n"); + fprintf(stderr, "This sample requires:\n"); + fprintf(stderr, " OpenGL version 1.5\n"); + fprintf(stderr, " GL_ARB_vertex_buffer_object\n"); + fprintf(stderr, " GL_ARB_pixel_buffer_object\n"); + exit(EXIT_FAILURE); + } +} - Pixel *d_result; - checkCudaErrors( - cudaMalloc((void **)&d_result, imWidth * imHeight * sizeof(Pixel))); +void runAutoTest(int argc, char *argv[]) +{ + printf("[%s] (automated testing w/ readback)\n", sSDKsample); + int devID = findCudaDevice(argc, (const char **)argv); - char *ref_file = NULL; - char dump_file[256]; + loadDefaultImage(argv[0]); - int mode = 0; - mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + Pixel *d_result; + checkCudaErrors(cudaMalloc((void **)&d_result, imWidth * imHeight * sizeof(Pixel))); - switch (mode) { + char *ref_file = NULL; + char dump_file[256]; + + int mode = 0; + mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + + switch (mode) { case 0: - g_SobelDisplayMode = SOBELDISPLAY_IMAGE; - sprintf(dump_file, "teapot_orig.pgm"); - break; + g_SobelDisplayMode = SOBELDISPLAY_IMAGE; + sprintf(dump_file, "teapot_orig.pgm"); + break; case 1: - g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; - sprintf(dump_file, "teapot_tex.pgm"); - break; + 
g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; + sprintf(dump_file, "teapot_tex.pgm"); + break; case 2: - g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; - sprintf(dump_file, "teapot_shared.pgm"); - break; + g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; + sprintf(dump_file, "teapot_shared.pgm"); + break; default: - printf("Invalid Filter Mode File\n"); - exit(EXIT_FAILURE); - break; - } + printf("Invalid Filter Mode File\n"); + exit(EXIT_FAILURE); + break; + } - printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); - sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale); - checkCudaErrors(cudaDeviceSynchronize()); + printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); + sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale); + checkCudaErrors(cudaDeviceSynchronize()); - unsigned char *h_result = - (unsigned char *)malloc(imWidth * imHeight * sizeof(Pixel)); - checkCudaErrors(cudaMemcpy(h_result, d_result, - imWidth * imHeight * sizeof(Pixel), - cudaMemcpyDeviceToHost)); - sdkSavePGM(dump_file, h_result, imWidth, imHeight); + unsigned char *h_result = (unsigned char *)malloc(imWidth * imHeight * sizeof(Pixel)); + checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth * imHeight * sizeof(Pixel), cudaMemcpyDeviceToHost)); + sdkSavePGM(dump_file, h_result, imWidth, imHeight); - if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), - MAX_EPSILON_ERROR, 0.15f, false)) { - g_TotalErrors++; - } + if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { + g_TotalErrors++; + } - checkCudaErrors(cudaFree(d_result)); - free(h_result); + checkCudaErrors(cudaFree(d_result)); + free(h_result); - if (g_TotalErrors != 0) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (g_TotalErrors != 0) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed!\n"); - exit(EXIT_SUCCESS); + printf("Test passed!\n"); + exit(EXIT_SUCCESS); } -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("\nUsage: SobelFilter \n"); - printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); - printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); - exit(EXIT_SUCCESS); - } + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("\nUsage: SobelFilter \n"); + printf("\t\t-mode=n (0=original, 1=texture, 2=smem + texture)\n"); + printf("\t\t-file=ref_orig.pgm (ref_tex.pgm, ref_shared.pgm)\n\n"); + exit(EXIT_SUCCESS); + } - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - g_bQAReadback = true; - runAutoTest(argc, argv); - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + g_bQAReadback = true; + runAutoTest(argc, argv); + } - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf( - " This SDK does not explicitly support -device=n when running with " - "OpenGL.\n"); - printf( - " When specifying -device=n (n=0,1,2,....) 
the sample must not use " - "OpenGL.\n"); - printf(" See details below to run without OpenGL:\n\n"); - printf(" > %s -device=n\n\n", argv[0]); - printf("exiting...\n"); - exit(EXIT_SUCCESS); - } + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf(" This SDK does not explicitly support -device=n when running with " + "OpenGL.\n"); + printf(" When specifying -device=n (n=0,1,2,....) the sample must not use " + "OpenGL.\n"); + printf(" See details below to run without OpenGL:\n\n"); + printf(" > %s -device=n\n\n", argv[0]); + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } - initGL(&argc, argv); - findCudaDevice(argc, (const char **)argv); + initGL(&argc, argv); + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&timer); - sdkResetTimer(&timer); + sdkCreateTimer(&timer); + sdkResetTimer(&timer); - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); - loadDefaultImage(argv[0]); + loadDefaultImage(argv[0]); - // If code is not printing the usage, then we execute this path. - printf("I: display Image (no filtering)\n"); - printf("T: display Sobel Edge Detection (Using Texture)\n"); - printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); - printf("Use the '-' and '=' keys to change the brightness.\n"); - fflush(stdout); + // If code is not printing the usage, then we execute this path. + printf("I: display Image (no filtering)\n"); + printf("T: display Sobel Edge Detection (Using Texture)\n"); + printf("S: display Sobel Edge Detection (Using SMEM+Texture)\n"); + printf("Use the '-' and '=' keys to change the brightness.\n"); + fflush(stdout); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - glutMainLoop(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + glutMainLoop(); } diff --git a/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.cu b/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.cu index 2ae7d03d..150766a9 100644 --- a/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.cu +++ b/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.cu @@ -25,10 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include <cooperative_groups.h> +#include <helper_string.h> #include <stdio.h> #include <stdlib.h> -#include <cooperative_groups.h> -#include <helper_string.h> namespace cg = cooperative_groups; @@ -37,14 +37,14 @@ namespace cg = cooperative_groups; #include "SobelFilter_kernels.h" // Texture object for reading image -cudaTextureObject_t texObject; +cudaTextureObject_t texObject; extern __shared__ unsigned char LocalBlock[]; -static cudaArray *array = NULL; +static cudaArray *array = NULL; #define RADIUS 1 #ifdef FIXED_BLOCKWIDTH -#define BlockWidth 80 +#define BlockWidth 80 #define SharedPitch 384 #endif @@ -52,241 +52,232 @@ static cudaArray *array = NULL; // call returns an error #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) -inline void __checkCudaErrors(cudaError err, const char *file, const int line) { - if (cudaSuccess != err) { - fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, - (int)err, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } +inline void __checkCudaErrors(cudaError err, const char *file, const int line) +{ + if (cudaSuccess != err) { + fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } } -__device__ unsigned char ComputeSobel(unsigned char ul, // upper left - unsigned char um, // upper middle - unsigned char ur, // upper right - unsigned char ml, // middle left - unsigned char mm, // middle (unused) - unsigned char mr, // middle right - unsigned char ll, // lower left - unsigned char lm, // lower middle - unsigned char lr, // lower right - float fScale) { - short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll; - short Vert = ul + 2 * um + ur - ll - 2 * lm - lr; - short Sum = (short)(fScale * (abs((int)Horz) + abs((int)Vert))); +__device__ unsigned char ComputeSobel(unsigned char ul, // upper left + unsigned char um, // upper middle + unsigned char ur, // upper right + unsigned char ml, // middle left + unsigned char mm, // middle (unused) + unsigned char mr, // middle right + unsigned char ll, // lower left + unsigned char lm, // lower middle + unsigned char lr, // lower right + float fScale) +{ + short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll; + short Vert = ul + 2 * um + ur - ll - 2 * lm - lr; + short Sum = (short)(fScale * (abs((int)Horz) + abs((int)Vert))); - if (Sum < 0) { - return 0; - } else if (Sum > 0xff) { - return 0xff; - } + if (Sum < 0) { + return 0; + } + else if (Sum > 0xff) { + return 0xff; + } - return (unsigned char)Sum; + return (unsigned char)Sum; } -__global__ void SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch, +__global__ void SobelShared(uchar4 *pSobelOriginal, + unsigned short SobelPitch, #ifndef FIXED_BLOCKWIDTH - short BlockWidth, short SharedPitch, + short BlockWidth, + short SharedPitch, #endif - short w, short h, float fScale, - cudaTextureObject_t tex) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - short u = 4 * blockIdx.x * BlockWidth; - short v = blockIdx.y * blockDim.y + threadIdx.y; - short ib; + short w, + short h, + float fScale, + cudaTextureObject_t tex) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + short u = 4 * blockIdx.x * BlockWidth; + short v = blockIdx.y * blockDim.y + threadIdx.y; + short ib; - int SharedIdx = threadIdx.y * SharedPitch; - - for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) { - LocalBlock[SharedIdx + 4 * ib + 0] = tex2D<unsigned char>( - tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 1] = tex2D<unsigned char>( - tex, (float)(u + 4 * ib - 
RADIUS + 1), (float)(v - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 2] = tex2D<unsigned char>( - tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 3] = tex2D<unsigned char>( - tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v - RADIUS)); - } - - if (threadIdx.y < RADIUS * 2) { - // - // copy trailing RADIUS*2 rows of pixels into shared - // - SharedIdx = (blockDim.y + threadIdx.y) * SharedPitch; for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) { - LocalBlock[SharedIdx + 4 * ib + 0] = - tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 0), - (float)(v + blockDim.y - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 1] = - tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 1), - (float)(v + blockDim.y - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 2] = - tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 2), - (float)(v + blockDim.y - RADIUS)); - LocalBlock[SharedIdx + 4 * ib + 3] = - tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 3), - (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 0] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 1] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 2] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 3] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v - RADIUS)); } - } - cg::sync(cta); + if (threadIdx.y < RADIUS * 2) { + // + // copy trailing RADIUS*2 rows of pixels into shared + // + SharedIdx = (blockDim.y + threadIdx.y) * SharedPitch; - u >>= 2; // index as uchar4 from here - uchar4 *pSobel = (uchar4 *)(((char *)pSobelOriginal) + v * SobelPitch); - SharedIdx = threadIdx.y * SharedPitch; - - for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) { - unsigned char pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 0]; - unsigned char pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 1]; - unsigned char pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 2]; - unsigned char pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 0]; - unsigned char pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 1]; - unsigned char pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 2]; - unsigned char pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 0]; - unsigned char pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 1]; - unsigned char pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 2]; - - uchar4 out; - - out.x = ComputeSobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, - pix22, fScale); - - pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 3]; - pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 3]; - pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 3]; - out.y = ComputeSobel(pix01, pix02, pix00, pix11, pix12, pix10, pix21, pix22, - pix20, fScale); - - pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 4]; - pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 4]; - pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 4]; - out.z = ComputeSobel(pix02, pix00, pix01, pix12, pix10, pix11, pix22, pix20, - pix21, fScale); - - pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 5]; - pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 5]; - pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 5]; - out.w = ComputeSobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, - pix22, fScale); - - if (u + 
ib < w / 4 && v < h) { - pSobel[u + ib] = out; + for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) { + LocalBlock[SharedIdx + 4 * ib + 0] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 1] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 2] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v + blockDim.y - RADIUS)); + LocalBlock[SharedIdx + 4 * ib + 3] = + tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v + blockDim.y - RADIUS)); + } } - cg::sync(cta); + cg::sync(cta); + + u >>= 2; // index as uchar4 from here + uchar4 *pSobel = (uchar4 *)(((char *)pSobelOriginal) + v * SobelPitch); + SharedIdx = threadIdx.y * SharedPitch; + + for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) { + unsigned char pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 0]; + unsigned char pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 1]; + unsigned char pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 2]; + unsigned char pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 0]; + unsigned char pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 1]; + unsigned char pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 2]; + unsigned char pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 0]; + unsigned char pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 1]; + unsigned char pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 2]; + + uchar4 out; + + out.x = ComputeSobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); + + pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 3]; + pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 3]; + pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 3]; + out.y = ComputeSobel(pix01, pix02, pix00, pix11, pix12, pix10, pix21, pix22, pix20, fScale); + + pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 4]; + pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 4]; + pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 4]; + out.z = ComputeSobel(pix02, pix00, pix01, pix12, pix10, pix11, pix22, pix20, pix21, fScale); + + pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 5]; + pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 5]; + pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 5]; + out.w = ComputeSobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); + + if (u + ib < w / 4 && v < h) { + pSobel[u + ib] = out; + } + } + + cg::sync(cta); } -__global__ void SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w, - int h, float fscale, cudaTextureObject_t tex) { - unsigned char *pSobel = - (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); +__global__ void +SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w, int h, float fscale, cudaTextureObject_t tex) +{ + unsigned char *pSobel = (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); - for (int i = threadIdx.x; i < w; i += blockDim.x) { - pSobel[i] = min( - max((tex2D<unsigned char>(tex, (float)i, (float)blockIdx.x) * fscale), - 0.f), - 255.f); - } + for (int i = threadIdx.x; i < w; i += blockDim.x) { + pSobel[i] = min(max((tex2D<unsigned char>(tex, (float)i, (float)blockIdx.x) * fscale), 0.f), 255.f); + } } -__global__ void SobelTex(Pixel *pSobelOriginal, unsigned int Pitch, int w, - int h, float fScale, cudaTextureObject_t tex) { - unsigned char *pSobel = - 
(unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); +__global__ void SobelTex(Pixel *pSobelOriginal, unsigned int Pitch, int w, int h, float fScale, cudaTextureObject_t tex) +{ + unsigned char *pSobel = (unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch); - for (int i = threadIdx.x; i < w; i += blockDim.x) { - unsigned char pix00 = - tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x - 1); - unsigned char pix01 = - tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x - 1); - unsigned char pix02 = - tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x - 1); - unsigned char pix10 = - tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 0); - unsigned char pix11 = - tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 0); - unsigned char pix12 = - tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 0); - unsigned char pix20 = - tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 1); - unsigned char pix21 = - tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 1); - unsigned char pix22 = - tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 1); - pSobel[i] = ComputeSobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, - pix21, pix22, fScale); - } + for (int i = threadIdx.x; i < w; i += blockDim.x) { + unsigned char pix00 = tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x - 1); + unsigned char pix01 = tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x - 1); + unsigned char pix02 = tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x - 1); + unsigned char pix10 = tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 0); + unsigned char pix11 = tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 0); + unsigned char pix12 = tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 0); + unsigned char pix20 = tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 1); + unsigned char pix21 = tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 1); + unsigned char pix22 = tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 1); + pSobel[i] = ComputeSobel(pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale); + } } -extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) { - cudaChannelFormatDesc desc; +extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) +{ + cudaChannelFormatDesc desc; - if (Bpp == 1) { - desc = cudaCreateChannelDesc<unsigned char>(); - } else { - desc = cudaCreateChannelDesc<uchar4>(); - } + if (Bpp == 1) { + desc = cudaCreateChannelDesc<unsigned char>(); + } + else { + desc = cudaCreateChannelDesc<uchar4>(); + } - checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih)); - checkCudaErrors(cudaMemcpy2DToArray( - array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih, - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih)); + checkCudaErrors(cudaMemcpy2DToArray( + array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih, cudaMemcpyHostToDevice)); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = array; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = array; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; 
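Note on the kernels above: ComputeSobel is a plain 3x3 Sobel convolution, so its arithmetic can be sanity-checked on the host in isolation. The following is an illustrative sketch only (the neighborhood values are invented for the example, not taken from the sample):

#include <cstdio>
#include <cstdlib>

// Same arithmetic as the device ComputeSobel above: Horz (Gx) weights the
// right column +1/+2/+1 and the left column -1/-2/-1; Vert (Gy) does the
// same for the top and bottom rows; the scaled |Gx| + |Gy| is clamped to
// the 8-bit range [0, 255]. The mm parameter is unused, as in the kernel.
static unsigned char sobelRef(unsigned char ul, unsigned char um, unsigned char ur,
                              unsigned char ml, unsigned char mm, unsigned char mr,
                              unsigned char ll, unsigned char lm, unsigned char lr,
                              float fScale)
{
    short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll;
    short Vert = ul + 2 * um + ur - ll - 2 * lm - lr;
    short Sum  = (short)(fScale * (abs((int)Horz) + abs((int)Vert)));
    return (unsigned char)(Sum < 0 ? 0 : (Sum > 0xff ? 0xff : Sum));
}

int main()
{
    // A hard vertical edge (dark left column, bright right column):
    // |Gx| = 4 * 255 = 1020 and |Gy| = 0, so with fScale = 1 the
    // result saturates to 255.
    printf("%u\n", sobelRef(0, 128, 255, 0, 128, 255, 0, 128, 255, 1.0f));
    return 0;
}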
- checkCudaErrors( - cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL)); } -extern "C" void deleteTexture(void) { - checkCudaErrors(cudaFreeArray(array)); - checkCudaErrors(cudaDestroyTextureObject(texObject)); +extern "C" void deleteTexture(void) +{ + checkCudaErrors(cudaFreeArray(array)); + checkCudaErrors(cudaDestroyTextureObject(texObject)); } // Wrapper for the __global__ call that sets up the texture and threads -extern "C" void sobelFilter(Pixel *odata, int iw, int ih, - enum SobelDisplayMode mode, float fScale) { - switch (mode) { +extern "C" void sobelFilter(Pixel *odata, int iw, int ih, enum SobelDisplayMode mode, float fScale) +{ + switch (mode) { case SOBELDISPLAY_IMAGE: - SobelCopyImage<<<ih, 384>>>(odata, iw, iw, ih, fScale, texObject); - break; + SobelCopyImage<<<ih, 384>>>(odata, iw, iw, ih, fScale, texObject); + break; case SOBELDISPLAY_SOBELTEX: - SobelTex<<<ih, 384>>>(odata, iw, iw, ih, fScale, texObject); - break; + SobelTex<<<ih, 384>>>(odata, iw, iw, ih, fScale, texObject); + break; case SOBELDISPLAY_SOBELSHARED: { - dim3 threads(16, 4); + dim3 threads(16, 4); #ifndef FIXED_BLOCKWIDTH - int BlockWidth = 80; // must be divisible by 16 for coalescing + int BlockWidth = 80; // must be divisible by 16 for coalescing #endif - dim3 blocks = dim3(iw / (4 * BlockWidth) + (0 != iw % (4 * BlockWidth)), - ih / threads.y + (0 != ih % threads.y)); - int SharedPitch = ~0x3f & (4 * (BlockWidth + 2 * RADIUS) + 0x3f); - int sharedMem = SharedPitch * (threads.y + 2 * RADIUS); + dim3 blocks = + dim3(iw / (4 * BlockWidth) + (0 != iw % (4 * BlockWidth)), ih / threads.y + (0 != ih % threads.y)); + int SharedPitch = ~0x3f & (4 * (BlockWidth + 2 * RADIUS) + 0x3f); + int sharedMem = SharedPitch * (threads.y + 2 * RADIUS); - // for the shared kernel, width must be divisible by 4 - iw &= ~3; + // for the shared kernel, width must be divisible by 4 + iw &= ~3; - SobelShared<<<blocks, threads, sharedMem>>>((uchar4 *)odata, iw, + SobelShared<<<blocks, threads, sharedMem>>>((uchar4 *)odata, + iw, #ifndef FIXED_BLOCKWIDTH - BlockWidth, SharedPitch, + BlockWidth, + SharedPitch, #endif - iw, ih, fScale, texObject); + iw, + ih, + fScale, + texObject); } break; - } + } } diff --git a/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.h b/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.h index 4093e285..d4199946 100644 --- a/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.h +++ b/Samples/5_Domain_Specific/SobelFilter/SobelFilter_kernels.h @@ -31,16 +31,11 @@ typedef unsigned char Pixel; // global determines which filter to invoke -enum SobelDisplayMode { - SOBELDISPLAY_IMAGE = 0, - SOBELDISPLAY_SOBELTEX, - SOBELDISPLAY_SOBELSHARED -}; +enum SobelDisplayMode { SOBELDISPLAY_IMAGE = 0, SOBELDISPLAY_SOBELTEX, SOBELDISPLAY_SOBELSHARED }; extern enum SobelDisplayMode g_SobelDisplayMode; -extern "C" void sobelFilter(Pixel *odata, int iw, int ih, - enum SobelDisplayMode mode, float fScale); +extern "C" void sobelFilter(Pixel *odata, int iw, int ih, enum SobelDisplayMode mode, float fScale); extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp); extern "C" void deleteTexture(void); extern "C" void initFilter(void); diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol.cpp b/Samples/5_Domain_Specific/SobolQRNG/sobol.cpp index 32f08695..0523a190 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol.cpp +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol.cpp @@ -53,16 +53,15 @@ * ACM Trans. on Math. 
Software, 29(1):49-57, 2003 */ -#include <iostream> - -#include <cuda_runtime.h> // CUDA Runtime Functions -#include <helper_cuda.h> // helper functions for CUDA error checking and initialization -#include <helper_functions.h> // helper functions - -#include <stdexcept> -#include <math.h> - #include "sobol.h" + +#include <cuda_runtime.h> // CUDA Runtime Functions +#include <helper_cuda.h> // helper functions for CUDA error checking and initialization +#include <helper_functions.h> // helper functions +#include <iostream> +#include <math.h> +#include <stdexcept> + #include "sobol_gold.h" #include "sobol_gpu.h" @@ -70,236 +69,238 @@ const char *sSDKsample = "Sobol Quasi-Random Number Generator"; -void printHelp(int argc, char *argv[]) { - if (argc > 0) { - std::cout << "\nUsage: " << argv[0] << " <options>\n\n"; - } else { - std::cout << "\nUsage: <program name> <options>\n\n"; - } +void printHelp(int argc, char *argv[]) +{ + if (argc > 0) { + std::cout << "\nUsage: " << argv[0] << " <options>\n\n"; + } + else { + std::cout << "\nUsage: <program name> <options>\n\n"; + } - std::cout << "\t--vectors=M specify number of vectors (required)\n"; - std::cout << "\t The generator will output M vectors\n\n"; - std::cout << "\t--dimensions=N specify number of dimensions (required)\n"; - std::cout << "\t Each vector will consist of N components\n\n"; - std::cout << std::endl; + std::cout << "\t--vectors=M specify number of vectors (required)\n"; + std::cout << "\t The generator will output M vectors\n\n"; + std::cout << "\t--dimensions=N specify number of dimensions (required)\n"; + std::cout << "\t Each vector will consist of N components\n\n"; + std::cout << std::endl; } -int main(int argc, char *argv[]) { - bool ok = true; +int main(int argc, char *argv[]) +{ + bool ok = true; - // We will generate n_vectors vectors of n_dimensions numbers - int n_vectors = 100000; - int n_dimensions = 100; + // We will generate n_vectors vectors of n_dimensions numbers + int n_vectors = 100000; + int n_dimensions = 100; - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - // Print help if requested - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printHelp(argc, argv); - return 0; - } + // Print help if requested + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(argc, argv); + return 0; + } - if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { - // For QA testing set a default number of vectors and dimensions - n_vectors = 100000; - n_dimensions = 100; - } else { - // Parse the command line to determine the required number of vectors - if (checkCmdLineFlag(argc, (const char **)argv, "vectors")) { - n_vectors = getCmdLineArgumentInt(argc, (const char **)argv, "vectors"); + if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + // For QA testing set a default number of vectors and dimensions + n_vectors = 100000; + n_dimensions = 100; + } + else { + // Parse the command line to determine the required number of vectors + if (checkCmdLineFlag(argc, (const char **)argv, "vectors")) { + n_vectors = getCmdLineArgumentInt(argc, (const char **)argv, "vectors"); - if (n_vectors < 1) { - std::cerr << "Illegal argument: number of vectors must be positive " - "(--vectors=N)" + if (n_vectors < 1) { + std::cerr << "Illegal argument: number of vectors must be positive " + "(--vectors=N)" + << std::endl; + ok = false; + } + } + + std::cout << "> number of vectors = " << n_vectors << std::endl; + + // Parse the command line to determine the number of dimensions in each + // vector + if (checkCmdLineFlag(argc, (const char **)argv, "dimensions")) { + n_dimensions = getCmdLineArgumentInt(argc, (const char **)argv, "dimensions"); + + if (n_dimensions < 1) { + std::cerr << 
"Illegal argument: number of dimensions must be positive " + "(--dimensions=N)" + << std::endl; + ok = false; + } + } + + std::cout << "> number of dimensions = " << n_dimensions << std::endl; + } + + // If any of the command line checks failed, exit + if (!ok) { + return -1; + } + + // Use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); + + // Create a timer to measure performance + StopWatchInterface *hTimer = NULL; + double time; + sdkCreateTimer(&hTimer); + + // Allocate memory for the arrays + std::cout << "Allocating CPU memory..." << std::endl; + unsigned int *h_directions = 0; + float *h_outputCPU = 0; + float *h_outputGPU = 0; + + try { + h_directions = new unsigned int[n_dimensions * n_directions]; + h_outputCPU = new float[n_vectors * n_dimensions]; + h_outputGPU = new float[n_vectors * n_dimensions]; + } + catch (std::exception e) { + std::cerr << "Caught exception: " << e.what() << std::endl; + std::cerr << "Unable to allocate CPU memory (try running with fewer " + "vectors/dimensions)" << std::endl; - ok = false; - } + exit(EXIT_FAILURE); } - std::cout << "> number of vectors = " << n_vectors << std::endl; + std::cout << "Allocating GPU memory..." << std::endl; + unsigned int *d_directions; + float *d_output; - // Parse the command line to determine the number of dimensions in each - // vector - if (checkCmdLineFlag(argc, (const char **)argv, "dimensions")) { - n_dimensions = - getCmdLineArgumentInt(argc, (const char **)argv, "dimensions"); + try { + cudaError_t cudaResult; + cudaResult = cudaMalloc((void **)&d_directions, n_dimensions * n_directions * sizeof(unsigned int)); - if (n_dimensions < 1) { - std::cerr << "Illegal argument: number of dimensions must be positive " - "(--dimensions=N)" + if (cudaResult != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(cudaResult)); + } + + cudaResult = cudaMalloc((void **)&d_output, n_vectors * n_dimensions * sizeof(float)); + + if (cudaResult != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(cudaResult)); + } + } + catch (std::runtime_error e) { + std::cerr << "Caught exception: " << e.what() << std::endl; + std::cerr << "Unable to allocate GPU memory (try running with fewer " + "vectors/dimensions)" << std::endl; - ok = false; - } + exit(EXIT_FAILURE); } - std::cout << "> number of dimensions = " << n_dimensions << std::endl; - } + // Initialize the direction numbers (done on the host) + std::cout << "Initializing direction numbers..." << std::endl; + initSobolDirectionVectors(n_dimensions, h_directions); - // If any of the command line checks failed, exit - if (!ok) { - return -1; - } + // Copy the direction numbers to the device + std::cout << "Copying direction numbers to device..." << std::endl; + checkCudaErrors(cudaMemcpy( + d_directions, h_directions, n_dimensions * n_directions * sizeof(unsigned int), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaDeviceSynchronize()); - // Use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // Execute the QRNG on the device + std::cout << "Executing QRNG on GPU..." 
<< std::endl; + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + sobolGPU(n_vectors, n_dimensions, d_directions, d_output); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + time = sdkGetTimerValue(&hTimer); - // Create a timer to measure performance - StopWatchInterface *hTimer = NULL; - double time; - sdkCreateTimer(&hTimer); - - // Allocate memory for the arrays - std::cout << "Allocating CPU memory..." << std::endl; - unsigned int *h_directions = 0; - float *h_outputCPU = 0; - float *h_outputGPU = 0; - - try { - h_directions = new unsigned int[n_dimensions * n_directions]; - h_outputCPU = new float[n_vectors * n_dimensions]; - h_outputGPU = new float[n_vectors * n_dimensions]; - } catch (std::exception e) { - std::cerr << "Caught exception: " << e.what() << std::endl; - std::cerr << "Unable to allocate CPU memory (try running with fewer " - "vectors/dimensions)" - << std::endl; - exit(EXIT_FAILURE); - } - - std::cout << "Allocating GPU memory..." << std::endl; - unsigned int *d_directions; - float *d_output; - - try { - cudaError_t cudaResult; - cudaResult = cudaMalloc((void **)&d_directions, - n_dimensions * n_directions * sizeof(unsigned int)); - - if (cudaResult != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(cudaResult)); + if (time < 1e-6) { + std::cout << "Gsamples/s: problem size too small to measure, try " + "increasing number of vectors or dimensions" + << std::endl; + } + else { + std::cout << "Gsamples/s: " << (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3) << std::endl; } - cudaResult = cudaMalloc((void **)&d_output, - n_vectors * n_dimensions * sizeof(float)); + std::cout << "Reading results from GPU..." << std::endl; + checkCudaErrors( + cudaMemcpy(h_outputGPU, d_output, n_vectors * n_dimensions * sizeof(float), cudaMemcpyDeviceToHost)); - if (cudaResult != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(cudaResult)); + std::cout << std::endl; + // Execute the QRNG on the host + std::cout << "Executing QRNG on CPU..." << std::endl; + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + sobolCPU(n_vectors, n_dimensions, h_directions, h_outputCPU); + sdkStopTimer(&hTimer); + time = sdkGetTimerValue(&hTimer); + + if (time < 1e-6) { + std::cout << "Gsamples/s: problem size too small to measure, try " + "increasing number of vectors or dimensions" + << std::endl; } - } catch (std::runtime_error e) { - std::cerr << "Caught exception: " << e.what() << std::endl; - std::cerr << "Unable to allocate GPU memory (try running with fewer " - "vectors/dimensions)" - << std::endl; - exit(EXIT_FAILURE); - } - - // Initialize the direction numbers (done on the host) - std::cout << "Initializing direction numbers..." << std::endl; - initSobolDirectionVectors(n_dimensions, h_directions); - - // Copy the direction numbers to the device - std::cout << "Copying direction numbers to device..." << std::endl; - checkCudaErrors(cudaMemcpy(d_directions, h_directions, - n_dimensions * n_directions * sizeof(unsigned int), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaDeviceSynchronize()); - - // Execute the QRNG on the device - std::cout << "Executing QRNG on GPU..." 
<< std::endl; - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - sobolGPU(n_vectors, n_dimensions, d_directions, d_output); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - time = sdkGetTimerValue(&hTimer); - - if (time < 1e-6) { - std::cout << "Gsamples/s: problem size too small to measure, try " - "increasing number of vectors or dimensions" - << std::endl; - } else { - std::cout << "Gsamples/s: " - << (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3) - << std::endl; - } - - std::cout << "Reading results from GPU..." << std::endl; - checkCudaErrors(cudaMemcpy(h_outputGPU, d_output, - n_vectors * n_dimensions * sizeof(float), - cudaMemcpyDeviceToHost)); - - std::cout << std::endl; - // Execute the QRNG on the host - std::cout << "Executing QRNG on CPU..." << std::endl; - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - sobolCPU(n_vectors, n_dimensions, h_directions, h_outputCPU); - sdkStopTimer(&hTimer); - time = sdkGetTimerValue(&hTimer); - - if (time < 1e-6) { - std::cout << "Gsamples/s: problem size too small to measure, try " - "increasing number of vectors or dimensions" - << std::endl; - } else { - std::cout << "Gsamples/s: " - << (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3) - << std::endl; - } - - // Check the results - std::cout << "Checking results..." << std::endl; - float l1norm_diff = 0.0F; - float l1norm_ref = 0.0F; - float l1error; - - // Special case if n_vectors is 1, when the vector should be exactly 0 - if (n_vectors == 1) { - for (int d = 0, v = 0; d < n_dimensions; d++) { - float ref = h_outputCPU[d * n_vectors + v]; - l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref); - l1norm_ref += fabs(ref); + else { + std::cout << "Gsamples/s: " << (double)n_vectors * (double)n_dimensions * 1E-9 / (time * 1E-3) << std::endl; } - // Output the L1-Error - l1error = l1norm_diff; + // Check the results + std::cout << "Checking results..." 
<< std::endl; + float l1norm_diff = 0.0F; + float l1norm_ref = 0.0F; + float l1error; - if (l1norm_ref != 0) { - std::cerr << "Error: L1-Norm of the reference is not zero (for single " - "vector), golden generator appears broken\n"; - } else { - std::cout << "L1-Error: " << l1error << std::endl; + // Special case if n_vectors is 1, when the vector should be exactly 0 + if (n_vectors == 1) { + for (int d = 0, v = 0; d < n_dimensions; d++) { + float ref = h_outputCPU[d * n_vectors + v]; + l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref); + l1norm_ref += fabs(ref); + } + + // Output the L1-Error + l1error = l1norm_diff; + + if (l1norm_ref != 0) { + std::cerr << "Error: L1-Norm of the reference is not zero (for single " + "vector), golden generator appears broken\n"; + } + else { + std::cout << "L1-Error: " << l1error << std::endl; + } } - } else { - for (int d = 0; d < n_dimensions; d++) { - for (int v = 0; v < n_vectors; v++) { - float ref = h_outputCPU[d * n_vectors + v]; - l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref); - l1norm_ref += fabs(ref); - } + else { + for (int d = 0; d < n_dimensions; d++) { + for (int v = 0; v < n_vectors; v++) { + float ref = h_outputCPU[d * n_vectors + v]; + l1norm_diff += fabs(h_outputGPU[d * n_vectors + v] - ref); + l1norm_ref += fabs(ref); + } + } + + // Output the L1-Error + l1error = l1norm_diff / l1norm_ref; + + if (l1norm_ref == 0) { + std::cerr << "Error: L1-Norm of the reference is zero, golden generator " + "appears broken\n"; + } + else { + std::cout << "L1-Error: " << l1error << std::endl; + } } - // Output the L1-Error - l1error = l1norm_diff / l1norm_ref; + // Cleanup and terminate + std::cout << "Shutting down..." << std::endl; + sdkDeleteTimer(&hTimer); + delete h_directions; + delete h_outputCPU; + delete h_outputGPU; + checkCudaErrors(cudaFree(d_directions)); + checkCudaErrors(cudaFree(d_output)); - if (l1norm_ref == 0) { - std::cerr << "Error: L1-Norm of the reference is zero, golden generator " - "appears broken\n"; - } else { - std::cout << "L1-Error: " << l1error << std::endl; - } - } - - // Cleanup and terminate - std::cout << "Shutting down..." << std::endl; - sdkDeleteTimer(&hTimer); - delete h_directions; - delete h_outputCPU; - delete h_outputGPU; - checkCudaErrors(cudaFree(d_directions)); - checkCudaErrors(cudaFree(d_output)); - - // Check pass/fail using L1 error - exit(l1error < L1ERROR_TOLERANCE ? EXIT_SUCCESS : EXIT_FAILURE); + // Check pass/fail using L1 error + exit(l1error < L1ERROR_TOLERANCE ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.cpp b/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.cpp index 662fc390..2737f7d6 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.cpp +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.cpp @@ -53,13 +53,14 @@ * ACM Trans. on Math. Software, 29(1):49-57, 2003 */ +#include "sobol_gold.h" + +#include #include #include -#include #include #include "sobol.h" -#include "sobol_gold.h" #include "sobol_primitives.h" #define k_2powneg32 2.3283064E-10F @@ -68,106 +69,110 @@ // fairly simple implementation. 
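For context on the fallback below: POSIX ffs(x) returns the 1-based index of the least significant set bit (0 when x == 0), and sobolCPU uses ffs(~(i - 1)) to locate the rightmost zero bit of i - 1. A naive reference loop, shown only to pin down the contract the branch-reduced Windows version must match (illustrative, not part of the patch):

#include <cstdio>

// Naive find-first-set: 1-based index of the lowest set bit, 0 if none.
static int ffsRef(unsigned int v)
{
    if (v == 0) {
        return 0;
    }
    int idx = 1;
    while ((v & 1u) == 0) { // shift right until the low bit is set
        v >>= 1;
        ++idx;
    }
    return idx;
}

int main()
{
    // Expected: ffs(1) == 1, ffs(8) == 4, ffs(12) == 3 (0b1100), ffs(0) == 0
    printf("%d %d %d %d\n", ffsRef(1), ffsRef(8), ffsRef(12), ffsRef(0));
    return 0;
}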
// WIN32 is defined on 32 and 64 bit Windows #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -int ffs(const unsigned int &i) { - unsigned int v = i; - unsigned int count; +int ffs(const unsigned int &i) +{ + unsigned int v = i; + unsigned int count; - if (!v) { - count = 0; - } else { - count = 2; + if (!v) { + count = 0; + } + else { + count = 2; - if ((v & 0xffff) == 0) { - v >>= 16; - count += 16; + if ((v & 0xffff) == 0) { + v >>= 16; + count += 16; + } + + if ((v & 0xff) == 0) { + v >>= 8; + count += 8; + } + + if ((v & 0xf) == 0) { + v >>= 4; + count += 4; + } + + if ((v & 0x3) == 0) { + v >>= 2; + count += 2; + } + + count -= v & 0x1; } - if ((v & 0xff) == 0) { - v >>= 8; - count += 8; - } - - if ((v & 0xf) == 0) { - v >>= 4; - count += 4; - } - - if ((v & 0x3) == 0) { - v >>= 2; - count += 2; - } - - count -= v & 0x1; - } - - return count; + return count; } #endif // Create the direction numbers, based on the primitive polynomials. -void initSobolDirectionVectors(int n_dimensions, unsigned int *directions) { - unsigned int *v = directions; +void initSobolDirectionVectors(int n_dimensions, unsigned int *directions) +{ + unsigned int *v = directions; - for (int dim = 0; dim < n_dimensions; dim++) { - // First dimension is a special case - if (dim == 0) { - for (int i = 0; i < n_directions; i++) { - // All m's are 1 - v[i] = 1 << (31 - i); - } - } else { - int d = sobol_primitives[dim].degree; - - // The first direction numbers (up to the degree of the polynomial) - // are simply v[i] = m[i] / 2^i (stored in Q0.32 format) - for (int i = 0; i < d; i++) { - v[i] = sobol_primitives[dim].m[i] << (31 - i); - } - - // The remaining direction numbers are computed as described in - // the Bratley and Fox paper. - // v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[v-1]v[i-d+1] ^ v[i-d] ^ - // v[i-d]/2^d - for (int i = d; i < n_directions; i++) { - // First do the v[i-d] ^ v[i-d]/2^d part - v[i] = v[i - d] ^ (v[i - d] >> d); - - // Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part - // Note that the coefficients a[] are zero or one and for compactness in - // the input tables they are stored as bits of a single integer. To - // extract the relevant bit we use right shift and mask with 1. - // For example, for a 10 degree polynomial there are ten useful bits in - // a, so to get a[2] we need to right shift 7 times (to get the 8th bit - // into the LSB) and then mask with 1. - for (int j = 1; j < d; j++) { - v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]); + for (int dim = 0; dim < n_dimensions; dim++) { + // First dimension is a special case + if (dim == 0) { + for (int i = 0; i < n_directions; i++) { + // All m's are 1 + v[i] = 1 << (31 - i); + } } - } - } + else { + int d = sobol_primitives[dim].degree; - v += n_directions; - } + // The first direction numbers (up to the degree of the polynomial) + // are simply v[i] = m[i] / 2^i (stored in Q0.32 format) + for (int i = 0; i < d; i++) { + v[i] = sobol_primitives[dim].m[i] << (31 - i); + } + + // The remaining direction numbers are computed as described in + // the Bratley and Fox paper. + // v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[v-1]v[i-d+1] ^ v[i-d] ^ + // v[i-d]/2^d + for (int i = d; i < n_directions; i++) { + // First do the v[i-d] ^ v[i-d]/2^d part + v[i] = v[i - d] ^ (v[i - d] >> d); + + // Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part + // Note that the coefficients a[] are zero or one and for compactness in + // the input tables they are stored as bits of a single integer. 
To + // extract the relevant bit we use right shift and mask with 1. + // For example, for a 10 degree polynomial there are ten useful bits in + // a, so to get a[2] we need to right shift 7 times (to get the 8th bit + // into the LSB) and then mask with 1. + for (int j = 1; j < d; j++) { + v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]); + } + } + } + + v += n_directions; + } } // Reference model for generating Sobol numbers on the host -void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, - float *output) { - unsigned int *v = directions; +void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output) +{ + unsigned int *v = directions; - for (int d = 0; d < n_dimensions; d++) { - unsigned int X = 0; - // x[0] is zero (in all dimensions) - output[n_vectors * d] = 0.0; + for (int d = 0; d < n_dimensions; d++) { + unsigned int X = 0; + // x[0] is zero (in all dimensions) + output[n_vectors * d] = 0.0; - for (int i = 1; i < n_vectors; i++) { - // x[i] = x[i-1] ^ v[c] - // where c is the index of the rightmost zero bit in i - // minus 1 (since C arrays count from zero) - // In the Bratley and Fox paper this is equation (**) - X ^= v[ffs(~(i - 1)) - 1]; - output[i + n_vectors * d] = (float)X * k_2powneg32; + for (int i = 1; i < n_vectors; i++) { + // x[i] = x[i-1] ^ v[c] + // where c is the index of the rightmost zero bit in i + // minus 1 (since C arrays count from zero) + // In the Bratley and Fox paper this is equation (**) + X ^= v[ffs(~(i - 1)) - 1]; + output[i + n_vectors * d] = (float)X * k_2powneg32; + } + + v += n_directions; } - - v += n_directions; - } } diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.h b/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.h index 70f51acc..9e915197 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.h +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol_gold.h @@ -58,7 +58,6 @@ #define SOBOL_GOLD_H void initSobolDirectionVectors(int n_dimensions, unsigned int *directions); -void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, - float *output); +void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output); #endif diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.cu b/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.cu index 7479b942..f96f1aff 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.cu +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.cu @@ -54,154 +54,155 @@ * */ +#include + #include "sobol.h" #include "sobol_gpu.h" -#include namespace cg = cooperative_groups; #include #define k_2powneg32 2.3283064E-10F -__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, - unsigned *d_directions, float *d_output) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ unsigned int v[n_directions]; +__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ unsigned int v[n_directions]; - // Offset into the correct dimension as specified by the - // block y coordinate - d_directions = d_directions + n_directions * blockIdx.y; - d_output = d_output + n_vectors * blockIdx.y; + // Offset into the correct dimension as specified by the + // block y coordinate + d_directions = d_directions + n_directions * blockIdx.y; + d_output = d_output + n_vectors * blockIdx.y; - // Copy the direction numbers for this 
dimension into shared - // memory - there are only 32 direction numbers so only the - // first 32 (n_directions) threads need participate. - if (threadIdx.x < n_directions) { - v[threadIdx.x] = d_directions[threadIdx.x]; - } + // Copy the direction numbers for this dimension into shared + // memory - there are only 32 direction numbers so only the + // first 32 (n_directions) threads need participate. + if (threadIdx.x < n_directions) { + v[threadIdx.x] = d_directions[threadIdx.x]; + } - cg::sync(cta); + cg::sync(cta); - // Set initial index (i.e. which vector this thread is - // computing first) and stride (i.e. step to the next vector - // for this thread) - int i0 = threadIdx.x + blockIdx.x * blockDim.x; - int stride = gridDim.x * blockDim.x; + // Set initial index (i.e. which vector this thread is + // computing first) and stride (i.e. step to the next vector + // for this thread) + int i0 = threadIdx.x + blockIdx.x * blockDim.x; + int stride = gridDim.x * blockDim.x; - // Get the gray code of the index - // c.f. Numerical Recipes in C, chapter 20 - // http://www.nrbook.com/a/bookcpdf/c20-2.pdf - unsigned int g = i0 ^ (i0 >> 1); + // Get the gray code of the index + // c.f. Numerical Recipes in C, chapter 20 + // http://www.nrbook.com/a/bookcpdf/c20-2.pdf + unsigned int g = i0 ^ (i0 >> 1); - // Initialisation for first point x[i0] - // In the Bratley and Fox paper this is equation (*), where - // we are computing the value for x[n] without knowing the - // value of x[n-1]. - unsigned int X = 0; - unsigned int mask; + // Initialisation for first point x[i0] + // In the Bratley and Fox paper this is equation (*), where + // we are computing the value for x[n] without knowing the + // value of x[n-1]. + unsigned int X = 0; + unsigned int mask; - for (unsigned int k = 0; k < __ffs(stride) - 1; k++) { - // We want X ^= g_k * v[k], where g_k is one or zero. - // We do this by setting a mask with all bits equal to - // g_k. In reality we keep shifting g so that g_k is the - // LSB of g. This way we avoid multiplication. - mask = -(g & 1); - X ^= mask & v[k]; - g = g >> 1; - } + for (unsigned int k = 0; k < __ffs(stride) - 1; k++) { + // We want X ^= g_k * v[k], where g_k is one or zero. + // We do this by setting a mask with all bits equal to + // g_k. In reality we keep shifting g so that g_k is the + // LSB of g. This way we avoid multiplication. + mask = -(g & 1); + X ^= mask & v[k]; + g = g >> 1; + } - if (i0 < n_vectors) { - d_output[i0] = (float)X * k_2powneg32; - } + if (i0 < n_vectors) { + d_output[i0] = (float)X * k_2powneg32; + } - // Now do rest of points, using the stride - // Here we want to generate x[i] from x[i-stride] where we - // don't have any of the x in between, therefore we have to - // revisit the equation (**), this is easiest with an example - // so assume stride is 16. - // From x[n] to x[n+16] there will be: - // 8 changes in the first bit - // 4 changes in the second bit - // 2 changes in the third bit - // 1 change in the fourth - // 1 change in one of the remaining bits - // - // What this means is that in the equation: - // x[n+1] = x[n] ^ v[p] - // x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q] - // ... - // We will apply xor with v[1] eight times, v[2] four times, - // v[3] twice, v[4] once and one other direction number once. - // Since two xors cancel out, we can skip even applications - // and just apply xor with v[4] (i.e. log2(16)) and with - // the current applicable direction number. 
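The comments in this kernel lean on two bit tricks: the Gray code g = i ^ (i >> 1), whose consecutive values differ in exactly one bit (which is what makes each Sobol draw one XOR away from its predecessor), and the all-ones/all-zeros mask -(g & 1) that applies a direction number without a branch or multiply. A small host-side illustration (sketch only, not part of the patch):

#include <cstdio>

int main()
{
    // The Gray codes of 0..3 are 0, 1, 3, 2: each differs from the previous
    // value in a single bit.
    for (unsigned int i = 0; i < 4; i++) {
        printf("gray(%u) = %u\n", i, i ^ (i >> 1));
    }

    // -(g & 1) is 0xFFFFFFFF when the low bit of g is set and 0 otherwise,
    // so "X ^= mask & v[k]" conditionally XORs in v[k] with no branch --
    // the same idiom as the kernel's initialisation loop.
    unsigned int g    = 5;           // odd, so the mask is all ones
    unsigned int v0   = 0x80000000u; // a direction number in Q0.32 format
    unsigned int mask = -(g & 1u);
    unsigned int X    = 0;
    X ^= mask & v0;
    printf("X = 0x%08X\n", X); // prints X = 0x80000000
    return 0;
}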
- // Note that all these indices count from 1, so we need to - // subtract 1 from them all to account for C arrays counting - // from zero. - unsigned int v_log2stridem1 = v[__ffs(stride) - 2]; - unsigned int v_stridemask = stride - 1; + // Now do rest of points, using the stride + // Here we want to generate x[i] from x[i-stride] where we + // don't have any of the x in between, therefore we have to + // revisit the equation (**), this is easiest with an example + // so assume stride is 16. + // From x[n] to x[n+16] there will be: + // 8 changes in the first bit + // 4 changes in the second bit + // 2 changes in the third bit + // 1 change in the fourth + // 1 change in one of the remaining bits + // + // What this means is that in the equation: + // x[n+1] = x[n] ^ v[p] + // x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q] + // ... + // We will apply xor with v[1] eight times, v[2] four times, + // v[3] twice, v[4] once and one other direction number once. + // Since two xors cancel out, we can skip even applications + // and just apply xor with v[4] (i.e. log2(16)) and with + // the current applicable direction number. + // Note that all these indices count from 1, so we need to + // subtract 1 from them all to account for C arrays counting + // from zero. + unsigned int v_log2stridem1 = v[__ffs(stride) - 2]; + unsigned int v_stridemask = stride - 1; - for (unsigned int i = i0 + stride; i < n_vectors; i += stride) { - // x[i] = x[i-stride] ^ v[b] ^ v[c] - // where b is log2(stride) minus 1 for C array indexing - // where c is the index of the rightmost zero bit in i, - // not including the bottom log2(stride) bits, minus 1 - // for C array indexing - // In the Bratley and Fox paper this is equation (**) - X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1]; - d_output[i] = (float)X * k_2powneg32; - } + for (unsigned int i = i0 + stride; i < n_vectors; i += stride) { + // x[i] = x[i-stride] ^ v[b] ^ v[c] + // where b is log2(stride) minus 1 for C array indexing + // where c is the index of the rightmost zero bit in i, + // not including the bottom log2(stride) bits, minus 1 + // for C array indexing + // In the Bratley and Fox paper this is equation (**) + X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1]; + d_output[i] = (float)X * k_2powneg32; + } } -extern "C" void sobolGPU(int n_vectors, int n_dimensions, - unsigned int *d_directions, float *d_output) { - const int threadsperblock = 64; +extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output) +{ + const int threadsperblock = 64; - // Set up the execution configuration - dim3 dimGrid; - dim3 dimBlock; + // Set up the execution configuration + dim3 dimGrid; + dim3 dimBlock; - int device; - cudaDeviceProp prop; - checkCudaErrors(cudaGetDevice(&device)); - checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + int device; + cudaDeviceProp prop; + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); - // This implementation of the generator outputs all the draws for - // one dimension in a contiguous region of memory, followed by the - // next dimension and so on. - // Therefore all threads within a block will be processing different - // vectors from the same dimension. As a result we want the total - // number of blocks to be a multiple of the number of dimensions. 
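To make the sizing policy described in these comments concrete: dimGrid.x is capped near ceil(n_vectors / threadsperblock) and then rounded up to a power of two, so that the per-thread stride gridDim.x * blockDim.x is itself a power of two (the kernel's Gray-code update depends on that). A standalone sketch of the arithmetic using the sample's defaults (illustrative only):

#include <cstdio>

// Round n up to the next power of two, mirroring the loop below:
// "for (dimGrid.x = 1; dimGrid.x < targetDimGridX; dimGrid.x *= 2);"
static unsigned int nextPow2(unsigned int n)
{
    unsigned int p = 1;
    while (p < n) {
        p *= 2;
    }
    return p;
}

int main()
{
    const unsigned int n_vectors = 100000, threadsperblock = 64;
    unsigned int capped  = (n_vectors + threadsperblock - 1) / threadsperblock; // 1563 blocks
    unsigned int rounded = nextPow2(capped);                                    // 2048 blocks
    printf("capped = %u, rounded = %u, stride = %u\n",
           capped, rounded, rounded * threadsperblock); // stride 131072 = 2^17
    return 0;
}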
- dimGrid.y = n_dimensions; + // This implementation of the generator outputs all the draws for + // one dimension in a contiguous region of memory, followed by the + // next dimension and so on. + // Therefore all threads within a block will be processing different + // vectors from the same dimension. As a result we want the total + // number of blocks to be a multiple of the number of dimensions. + dimGrid.y = n_dimensions; - // If the number of dimensions is large then we will set the number - // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1) - // but if the number of dimensions is small (e.g. less than four per - // multiprocessor) then we'll partition the vectors across blocks - // (as well as threads). - if (n_dimensions < (4 * prop.multiProcessorCount)) { - dimGrid.x = 4 * prop.multiProcessorCount; - } else { - dimGrid.x = 1; - } + // If the number of dimensions is large then we will set the number + // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1) + // but if the number of dimensions is small (e.g. less than four per + // multiprocessor) then we'll partition the vectors across blocks + // (as well as threads). + if (n_dimensions < (4 * prop.multiProcessorCount)) { + dimGrid.x = 4 * prop.multiProcessorCount; + } + else { + dimGrid.x = 1; + } - // Cap the dimGrid.x if the number of vectors is small - if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock)) { - dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock; - } + // Cap the dimGrid.x if the number of vectors is small + if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock)) { + dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock; + } - // Round up to a power of two, required for the algorithm so that - // stride is a power of two. - unsigned int targetDimGridX = dimGrid.x; + // Round up to a power of two, required for the algorithm so that + // stride is a power of two. + unsigned int targetDimGridX = dimGrid.x; - for (dimGrid.x = 1; dimGrid.x < targetDimGridX; dimGrid.x *= 2) - ; + for (dimGrid.x = 1; dimGrid.x < targetDimGridX; dimGrid.x *= 2) + ; - // Fix the number of threads - dimBlock.x = threadsperblock; + // Fix the number of threads + dimBlock.x = threadsperblock; - // Execute GPU kernel - sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, - d_output); + // Execute GPU kernel + sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output); } diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.h b/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.h index bec4e016..19d98f5b 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.h +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol_gpu.h @@ -57,7 +57,6 @@ #ifndef SOBOL_GPU_H #define SOBOL_GPU_H -extern "C" void sobolGPU(int n_vectors, int n_dimensions, - unsigned int *d_directions, float *d_output); +extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output); #endif diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.cpp b/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.cpp index 3cd526f9..7597a334 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.cpp +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.cpp @@ -65,8 +65,7 @@ // The primitives are based on those generated by Stephen Joe and // Frances Kuo in the joe-kuo-6.10200 set. // c.f. 
http://web.maths.unsw.edu.au/~fkuo/sobol/index.html -const struct primitive sobol_primitives[] = -{ +const struct primitive sobol_primitives[] = { // First dimension is a special case so this entry is actually ignored {1, 0, 0, {}}, {2, 1, 0, {1}}, diff --git a/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.h b/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.h index 2f8434b1..50b6b491 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.h +++ b/Samples/5_Domain_Specific/SobolQRNG/sobol_primitives.h @@ -64,11 +64,12 @@ // degree is the degree of the polynomial // a is a binary word representing the coefficients // m is the array of m values -struct primitive { - unsigned int dimension; - unsigned int degree; - unsigned int a; - unsigned int m[max_m]; +struct primitive +{ + unsigned int dimension; + unsigned int degree; + unsigned int a; + unsigned int m[max_m]; }; extern const struct primitive sobol_primitives[]; diff --git a/Samples/5_Domain_Specific/bicubicTexture/README.md b/Samples/5_Domain_Specific/bicubicTexture/README.md index b4824803..4298e2d0 100644 --- a/Samples/5_Domain_Specific/bicubicTexture/README.md +++ b/Samples/5_Domain_Specific/bicubicTexture/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture.cpp b/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture.cpp index 71d62817..e39a2a67 100644 --- a/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture.cpp +++ b/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture.cpp @@ -67,20 +67,20 @@ #endif // Includes -#include <stdio.h> -#include <stdlib.h> -#include <string.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // CUDA system and GL includes -#include <cuda_runtime.h> #include <cuda_gl_interop.h> +#include <cuda_runtime.h> // Helper functions -#include <helper_functions.h> // CUDA SDK Helper functions -#include <helper_cuda.h> // CUDA device initialization helper functions +#include <helper_cuda.h> // CUDA device initialization helper functions +#include <helper_functions.h> // CUDA SDK Helper functions -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; #define USE_BUFFER_TEX 0 @@ -89,65 +89,55 @@ typedef unsigned char uchar; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 4; // FPS limit for sampling -int g_Index = 0; -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; -StopWatchInterface *timer = 0; -bool g_Verify = false; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 4; // FPS limit for sampling +int g_Index = 0; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; +StopWatchInterface *timer = 0; +bool g_Verify = false; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #define MAX_EPSILON_ERROR 5.0f -#define REFRESH_DELAY 10 // ms +#define REFRESH_DELAY 10 // ms static const char *sSDKsample = "CUDA BicubicTexture"; // Define the files that are to be save and the reference images for validation -const char *sFilterMode[] = {"Nearest", "Bilinear", "Bicubic", - "Fast Bicubic", "Catmull-Rom", NULL}; +const char *sFilterMode[] = {"Nearest", "Bilinear", "Bicubic", "Fast Bicubic", "Catmull-Rom", NULL}; -const char *sOriginal[] = {"0_nearest.ppm", "1_bilinear.ppm", - "2_bicubic.ppm", "3_fastbicubic.ppm", - "4_catmull-rom.ppm", NULL}; +const char *sOriginal[] = + {"0_nearest.ppm", "1_bilinear.ppm", "2_bicubic.ppm", 
"3_fastbicubic.ppm", "4_catmull-rom.ppm", NULL}; -const char *sReference[] = {"0_nearest.ppm", "1_bilinear.ppm", - "2_bicubic.ppm", "3_fastbicubic.ppm", - "4_catmull-rom.ppm", NULL}; +const char *sReference[] = + {"0_nearest.ppm", "1_bilinear.ppm", "2_bicubic.ppm", "3_fastbicubic.ppm", "4_catmull-rom.ppm", NULL}; const char *srcImageFilename = "teapot512.pgm"; -char *dumpFilename = NULL; +char *dumpFilename = NULL; uint width = 512, height = 512; uint imageWidth, imageHeight; dim3 blockSize(16, 16); dim3 gridSize(width / blockSize.x, height / blockSize.y); -enum eFilterMode { - MODE_NEAREST, - MODE_BILINEAR, - MODE_BICUBIC, - MODE_FAST_BICUBIC, - MODE_CATMULL_ROM, - NUM_MODES -}; +enum eFilterMode { MODE_NEAREST, MODE_BILINEAR, MODE_BICUBIC, MODE_FAST_BICUBIC, MODE_CATMULL_ROM, NUM_MODES }; eFilterMode g_FilterMode = MODE_FAST_BICUBIC; bool drawCurves = false; -GLuint pbo = 0; // OpenGL pixel buffer object -struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange -GLuint displayTex = 0; -GLuint bufferTex = 0; -GLuint fprog; // fragment program (shader) +GLuint pbo = 0; // OpenGL pixel buffer object +struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange +GLuint displayTex = 0; +GLuint bufferTex = 0; +GLuint fprog; // fragment program (shader) -float tx = -27.75f, ty = -189.0f; // image translation -float scale = 0.125f; // image scale -float cx, cy; // image centre +float tx = -27.75f, ty = -189.0f; // image translation +float scale = 0.125f; // image scale +float cx, cy; // image centre void display(); void initGLBuffers(); @@ -155,627 +145,620 @@ void runBenchmark(int iterations); void cleanup(); #define GL_TEXTURE_TYPE GL_TEXTURE_RECTANGLE_ARB -//#define GL_TEXTURE_TYPE GL_TEXTURE_2D +// #define GL_TEXTURE_TYPE GL_TEXTURE_2D extern "C" void initGL(int *argc, char **argv); extern "C" void loadImageData(int argc, char **argv); extern "C" void initTexture(int imageWidth, int imageHeight, uchar *h_data); extern "C" void freeTexture(); -extern "C" void render(int width, int height, float tx, float ty, float scale, - float cx, float cy, dim3 blockSize, dim3 gridSize, - eFilterMode filter_mode, uchar4 *output); +extern "C" void render(int width, + int height, + float tx, + float ty, + float scale, + float cx, + float cy, + dim3 blockSize, + dim3 gridSize, + eFilterMode filter_mode, + uchar4 *output); // w0, w1, w2, and w3 are the four cubic B-spline basis functions -float bspline_w0(float a) { - return (1.0f / 6.0f) * (-a * a * a + 3.0f * a * a - 3.0f * a + 1.0f); +float bspline_w0(float a) { return (1.0f / 6.0f) * (-a * a * a + 3.0f * a * a - 3.0f * a + 1.0f); } + +float bspline_w1(float a) { return (1.0f / 6.0f) * (3.0f * a * a * a - 6.0f * a * a + 4.0f); } + +float bspline_w2(float a) { return (1.0f / 6.0f) * (-3.0f * a * a * a + 3.0f * a * a + 3.0f * a + 1.0f); } + +__host__ __device__ float bspline_w3(float a) { return (1.0f / 6.0f) * (a * a * a); } + +void computeFPS() +{ + frameCount++; + fpsCount++; + + if (fpsCount == fpsLimit - 1) { + g_Verify = true; + } + + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s %s <%s>: %3.1f fps", "", sSDKsample, sFilterMode[g_FilterMode], ifps); + + glutSetWindowTitle(fps); + fpsCount = 0; + + sdkResetTimer(&timer); + } } -float bspline_w1(float a) { - return (1.0f / 6.0f) * (3.0f * a * a * a - 6.0f * a * a + 4.0f); -} +void plotCurve(float (*func)(float)) +{ + const int steps = 100; + glBegin(GL_LINE_STRIP); -float 
bspline_w2(float a) { - return (1.0f / 6.0f) * (-3.0f * a * a * a + 3.0f * a * a + 3.0f * a + 1.0f); -} + for (int i = 0; i < steps; i++) { + float x = i / (float)(steps - 1); + glVertex2f(x, func(x)); + } -__host__ __device__ float bspline_w3(float a) { - return (1.0f / 6.0f) * (a * a * a); -} - -void computeFPS() { - frameCount++; - fpsCount++; - - if (fpsCount == fpsLimit - 1) { - g_Verify = true; - } - - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "%s %s <%s>: %3.1f fps", "", sSDKsample, - sFilterMode[g_FilterMode], ifps); - - glutSetWindowTitle(fps); - fpsCount = 0; - - sdkResetTimer(&timer); - } -} - -void plotCurve(float (*func)(float)) { - const int steps = 100; - glBegin(GL_LINE_STRIP); - - for (int i = 0; i < steps; i++) { - float x = i / (float)(steps - 1); - glVertex2f(x, func(x)); - } - - glEnd(); + glEnd(); } // display results using OpenGL (called by GLUT) -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // map PBO to get CUDA device pointer - uchar4 *d_output; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_output, &num_bytes, cuda_pbo_resource)); - render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, - g_FilterMode, d_output); + // map PBO to get CUDA device pointer + uchar4 *d_output; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); + render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - // Common display path - { - // display results - glClear(GL_COLOR_BUFFER_BIT); + // Common display path + { + // display results + glClear(GL_COLOR_BUFFER_BIT); #if USE_BUFFER_TEX - // display using buffer texture - glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); - glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fprog); - glEnable(GL_FRAGMENT_PROGRAM_ARB); - glProgramLocalParameterI4iNV(GL_FRAGMENT_PROGRAM_ARB, 0, width, 0, 0, 0); + // display using buffer texture + glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); + glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fprog); + glEnable(GL_FRAGMENT_PROGRAM_ARB); + glProgramLocalParameterI4iNV(GL_FRAGMENT_PROGRAM_ARB, 0, width, 0, 0, 0); #else - // download image from PBO to OpenGL texture - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBindTexture(GL_TEXTURE_TYPE, displayTex); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTexSubImage2D(GL_TEXTURE_TYPE, 0, 0, 0, width, height, GL_BGRA, - GL_UNSIGNED_BYTE, 0); - glEnable(GL_TEXTURE_TYPE); + // download image from PBO to OpenGL texture + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_TYPE, displayTex); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexSubImage2D(GL_TEXTURE_TYPE, 0, 0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); + glEnable(GL_TEXTURE_TYPE); #endif - // draw textured quad - glDisable(GL_DEPTH_TEST); - glBegin(GL_QUADS); - glTexCoord2f(0.0f, (GLfloat)height); - glVertex2f(0.0f, 0.0f); - glTexCoord2f((GLfloat)width, (GLfloat)height); - glVertex2f(1.0f, 0.0f); - glTexCoord2f((GLfloat)width, 0.0f); - glVertex2f(1.0f, 1.0f); - glTexCoord2f(0.0f, 0.0f); - glVertex2f(0.0f, 
1.0f); - glEnd(); - glDisable(GL_TEXTURE_TYPE); - glDisable(GL_FRAGMENT_PROGRAM_ARB); + // draw textured quad + glDisable(GL_DEPTH_TEST); + glBegin(GL_QUADS); + glTexCoord2f(0.0f, (GLfloat)height); + glVertex2f(0.0f, 0.0f); + glTexCoord2f((GLfloat)width, (GLfloat)height); + glVertex2f(1.0f, 0.0f); + glTexCoord2f((GLfloat)width, 0.0f); + glVertex2f(1.0f, 1.0f); + glTexCoord2f(0.0f, 0.0f); + glVertex2f(0.0f, 1.0f); + glEnd(); + glDisable(GL_TEXTURE_TYPE); + glDisable(GL_FRAGMENT_PROGRAM_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - if (drawCurves) { - // draw spline curves - glPushMatrix(); - glScalef(0.25, 0.25, 1.0); + if (drawCurves) { + // draw spline curves + glPushMatrix(); + glScalef(0.25, 0.25, 1.0); - glTranslatef(0.0, 2.0, 0.0); - glColor3f(1.0, 0.0, 0.0); - plotCurve(bspline_w3); + glTranslatef(0.0, 2.0, 0.0); + glColor3f(1.0, 0.0, 0.0); + plotCurve(bspline_w3); - glTranslatef(1.0, 0.0, 0.0); - glColor3f(0.0, 1.0, 0.0); - plotCurve(bspline_w2); + glTranslatef(1.0, 0.0, 0.0); + glColor3f(0.0, 1.0, 0.0); + plotCurve(bspline_w2); - glTranslatef(1.0, 0.0, 0.0); - glColor3f(0.0, 0.0, 1.0); - plotCurve(bspline_w1); + glTranslatef(1.0, 0.0, 0.0); + glColor3f(0.0, 0.0, 1.0); + plotCurve(bspline_w1); - glTranslatef(1.0, 0.0, 0.0); - glColor3f(1.0, 0.0, 1.0); - plotCurve(bspline_w0); + glTranslatef(1.0, 0.0, 0.0); + glColor3f(1.0, 0.0, 1.0); + plotCurve(bspline_w0); - glPopMatrix(); - glColor3f(1.0, 1.0, 1.0); + glPopMatrix(); + glColor3f(1.0, 1.0, 1.0); + } } - } - glutSwapBuffers(); - glutReportErrors(); + glutSwapBuffers(); + glutReportErrors(); - sdkStopTimer(&timer); + sdkStopTimer(&timer); - computeFPS(); + computeFPS(); } // GLUT callback functions -void timerEvent(int value) { - if (glutGetWindow()) { - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } } -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif case '1': - g_FilterMode = MODE_NEAREST; - break; + g_FilterMode = MODE_NEAREST; + break; case '2': - g_FilterMode = MODE_BILINEAR; - break; + g_FilterMode = MODE_BILINEAR; + break; case '3': - g_FilterMode = MODE_BICUBIC; - break; + g_FilterMode = MODE_BICUBIC; + break; case '4': - g_FilterMode = MODE_FAST_BICUBIC; - break; + g_FilterMode = MODE_FAST_BICUBIC; + break; case '5': - g_FilterMode = MODE_CATMULL_ROM; - break; + g_FilterMode = MODE_CATMULL_ROM; + break; case '=': case '+': - scale *= 0.5f; - break; + scale *= 0.5f; + break; case '-': - scale *= 2.0f; - break; + scale *= 2.0f; + break; case 'r': - scale = 1.0f; - tx = ty = 0.0f; - break; + scale = 1.0f; + tx = ty = 0.0f; + break; case 'd': - printf("%f, %f, %f\n", tx, ty, scale); - break; + printf("%f, %f, %f\n", tx, ty, scale); + break; case 'b': - runBenchmark(500); - break; + runBenchmark(500); + break; case 'c': - drawCurves ^= 1; - break; + drawCurves ^= 1; + break; default: - break; - } + break; + } - if (key >= '1' && key <= '5') { - printf("> FilterMode[%d] = %s\n", g_FilterMode + 1, - sFilterMode[g_FilterMode]); - } + if (key >= '1' && key <= '5') { + printf("> FilterMode[%d] = %s\n", g_FilterMode + 1, 
sFilterMode[g_FilterMode]); + } } int ox, oy; int buttonState = 0; -void mouse(int button, int state, int x, int y) { - if (state == GLUT_DOWN) { - buttonState |= 1 << button; - } else if (state == GLUT_UP) { - buttonState = 0; - } +void mouse(int button, int state, int x, int y) +{ + if (state == GLUT_DOWN) { + buttonState |= 1 << button; + } + else if (state == GLUT_UP) { + buttonState = 0; + } - ox = x; - oy = y; + ox = x; + oy = y; } -void motion(int x, int y) { - float dx, dy; - dx = (float)(x - ox); - dy = (float)(y - oy); +void motion(int x, int y) +{ + float dx, dy; + dx = (float)(x - ox); + dy = (float)(y - oy); - if (buttonState & 1) { - // left = translate - tx -= dx * scale; - ty -= dy * scale; - } else if (buttonState & 2) { - // middle = zoom - scale -= dy / 1000.0f; - } + if (buttonState & 1) { + // left = translate + tx -= dx * scale; + ty -= dy * scale; + } + else if (buttonState & 2) { + // middle = zoom + scale -= dy / 1000.0f; + } - ox = x; - oy = y; + ox = x; + oy = y; } -void reshape(int x, int y) { - width = x; - height = y; - imageWidth = width; - imageHeight = height; +void reshape(int x, int y) +{ + width = x; + height = y; + imageWidth = width; + imageHeight = height; - initGLBuffers(); + initGLBuffers(); - glViewport(0, 0, x, y); + glViewport(0, 0, x, y); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } -void cleanup() { - freeTexture(); - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); +void cleanup() +{ + freeTexture(); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); - glDeleteBuffers(1, &pbo); + glDeleteBuffers(1, &pbo); #if USE_BUFFER_TEX - glDeleteTextures(1, &bufferTex); - glDeleteProgramsARB(1, &fprog); + glDeleteTextures(1, &bufferTex); + glDeleteProgramsARB(1, &fprog); #else - glDeleteTextures(1, &displayTex); + glDeleteTextures(1, &displayTex); #endif - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); } int iDivUp(int a, int b) { return (a % b != 0) ? 
(a / b + 1) : (a / b); } -void initGLBuffers() { - if (pbo) { - // delete old buffer - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); - glDeleteBuffers(1, &pbo); - } +void initGLBuffers() +{ + if (pbo) { + // delete old buffer + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); + glDeleteBuffers(1, &pbo); + } - // create pixel buffer object for display - glGenBuffers(1, &pbo); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(uchar4), 0, - GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // create pixel buffer object for display + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(uchar4), 0, GL_STREAM_DRAW_ARB); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); #if USE_BUFFER_TEX - // create buffer texture, attach to pbo - if (bufferTex) { - glDeleteTextures(1, &bufferTex); - } + // create buffer texture, attach to pbo + if (bufferTex) { + glDeleteTextures(1, &bufferTex); + } - glGenTextures(1, &bufferTex); - glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); - glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo); - glBindTexture(GL_TEXTURE_BUFFER_EXT, 0); + glGenTextures(1, &bufferTex); + glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); + glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo); + glBindTexture(GL_TEXTURE_BUFFER_EXT, 0); #else - // create texture for display - if (displayTex) { - glDeleteTextures(1, &displayTex); - } + // create texture for display + if (displayTex) { + glDeleteTextures(1, &displayTex); + } - glGenTextures(1, &displayTex); - glBindTexture(GL_TEXTURE_TYPE, displayTex); - glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_TYPE, 0); + glGenTextures(1, &displayTex); + glBindTexture(GL_TEXTURE_TYPE, displayTex); + glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_TYPE, 0); #endif - // calculate new grid size - gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); + // calculate new grid size + gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); } void mainMenu(int i) { keyboard(i, 0, 0); } -void initMenus() { - glutCreateMenu(mainMenu); - glutAddMenuEntry("Nearest [1]", '1'); - glutAddMenuEntry("Bilinear [2]", '2'); - glutAddMenuEntry("Bicubic [3]", '3'); - glutAddMenuEntry("Fast Bicubic [4]", '4'); - glutAddMenuEntry("Catmull-Rom [5]", '5'); - glutAddMenuEntry("Zoom in [=]", '='); - glutAddMenuEntry("Zoom out [-]", '-'); - glutAddMenuEntry("Benchmark [b]", 'b'); - glutAddMenuEntry("DrawCurves [c]", 'c'); - glutAddMenuEntry("Quit [esc]", 27); - glutAttachMenu(GLUT_RIGHT_BUTTON); +void initMenus() +{ + glutCreateMenu(mainMenu); + glutAddMenuEntry("Nearest [1]", '1'); + glutAddMenuEntry("Bilinear [2]", '2'); + glutAddMenuEntry("Bicubic [3]", '3'); + glutAddMenuEntry("Fast Bicubic 
[4]", '4'); + glutAddMenuEntry("Catmull-Rom [5]", '5'); + glutAddMenuEntry("Zoom in [=]", '='); + glutAddMenuEntry("Zoom out [-]", '-'); + glutAddMenuEntry("Benchmark [b]", 'b'); + glutAddMenuEntry("DrawCurves [c]", 'c'); + glutAddMenuEntry("Quit [esc]", 27); + glutAttachMenu(GLUT_RIGHT_BUTTON); } -void runBenchmark(int iterations) { - printf("[%s] (Benchmark Mode)\n", sSDKsample); +void runBenchmark(int iterations) +{ + printf("[%s] (Benchmark Mode)\n", sSDKsample); - sdkCreateTimer(&timer); + sdkCreateTimer(&timer); - uchar4 *d_output; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_output, &num_bytes, cuda_pbo_resource)); + uchar4 *d_output; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); - sdkStartTimer(&timer); + sdkStartTimer(&timer); - for (int i = 0; i < iterations; ++i) { - render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, - g_FilterMode, d_output); - } + for (int i = 0; i < iterations; ++i) { + render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); + } - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - float time = sdkGetTimerValue(&timer) / (float)iterations; + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + float time = sdkGetTimerValue(&timer) / (float)iterations; - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - printf("time: %0.3f ms, %f Mpixels/sec\n", time, - (width * height / (time * 0.001f)) / 1e6); + printf("time: %0.3f ms, %f Mpixels/sec\n", time, (width * height / (time * 0.001f)) / 1e6); } -void runAutoTest(int argc, char **argv, const char *dump_filename, - eFilterMode filter_mode) { - cudaDeviceProp deviceProps; +void runAutoTest(int argc, char **argv, const char *dump_filename, eFilterMode filter_mode) +{ + cudaDeviceProp deviceProps; - int devID = findCudaDevice(argc, (const char **)argv); + int devID = findCudaDevice(argc, (const char **)argv); - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("[%s] (automated testing w/ readback)\n", sSDKsample); - printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, - deviceProps.multiProcessorCount); + printf("[%s] (automated testing w/ readback)\n", sSDKsample); + printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); - loadImageData(argc, argv); + loadImageData(argc, argv); - uchar4 *d_output; - checkCudaErrors(cudaMalloc((void **)&d_output, imageWidth * imageHeight * 4)); - unsigned int *h_result = - (unsigned int *)malloc(width * height * sizeof(unsigned int)); + uchar4 *d_output; + checkCudaErrors(cudaMalloc((void **)&d_output, imageWidth * imageHeight * 4)); + unsigned int *h_result = (unsigned int *)malloc(width * height * sizeof(unsigned int)); - printf("AutoTest: %s Filter Mode: <%s>\n", sSDKsample, - sFilterMode[g_FilterMode]); + printf("AutoTest: %s Filter Mode: <%s>\n", sSDKsample, sFilterMode[g_FilterMode]); - render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, - filter_mode, d_output); + render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, filter_mode, d_output); - // check if 
kernel execution generated an error - getLastCudaError("Error: render (bicubicTexture) Kernel execution FAILED"); - checkCudaErrors(cudaDeviceSynchronize()); + // check if kernel execution generated an error + getLastCudaError("Error: render (bicubicTexture) Kernel execution FAILED"); + checkCudaErrors(cudaDeviceSynchronize()); - cudaMemcpy(h_result, d_output, imageWidth * imageHeight * 4, - cudaMemcpyDeviceToHost); + cudaMemcpy(h_result, d_output, imageWidth * imageHeight * 4, cudaMemcpyDeviceToHost); - sdkSavePPM4ub(dump_filename, (unsigned char *)h_result, imageWidth, - imageHeight); + sdkSavePPM4ub(dump_filename, (unsigned char *)h_result, imageWidth, imageHeight); - checkCudaErrors(cudaFree(d_output)); - free(h_result); + checkCudaErrors(cudaFree(d_output)); + free(h_result); } #if USE_BUFFER_TEX // fragment program for reading from buffer texture -static const char *shaderCode = - "!!NVfp4.0\n" - "INT PARAM width = program.local[0];\n" - "INT TEMP index;\n" - "FLR.S index, fragment.texcoord;\n" - "MAD.S index.x, index.y, width, index.x;\n" // compute 1D index from 2D - // coords - "TXF result.color, index.x, texture[0], BUFFER;\n" - "END"; +static const char *shaderCode = "!!NVfp4.0\n" + "INT PARAM width = program.local[0];\n" + "INT TEMP index;\n" + "FLR.S index, fragment.texcoord;\n" + "MAD.S index.x, index.y, width, index.x;\n" // compute 1D index from 2D + // coords + "TXF result.color, index.x, texture[0], BUFFER;\n" + "END"; #endif -GLuint compileASMShader(GLenum program_type, const char *code) { - GLuint program_id; - glGenProgramsARB(1, &program_id); - glBindProgramARB(program_type, program_id); - glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, - (GLsizei)strlen(code), (GLubyte *)code); +GLuint compileASMShader(GLenum program_type, const char *code) +{ + GLuint program_id; + glGenProgramsARB(1, &program_id); + glBindProgramARB(program_type, program_id); + glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); - GLint error_pos; - glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); + GLint error_pos; + glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); - if (error_pos != -1) { - const GLubyte *error_string; - error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); - fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, - error_string); - return 0; - } + if (error_pos != -1) { + const GLubyte *error_string; + error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); + fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string); + return 0; + } - return program_id; + return program_id; } -void initialize(int argc, char **argv) { - printf("[%s] (OpenGL Mode)\n", sSDKsample); +void initialize(int argc, char **argv) +{ + printf("[%s] (OpenGL Mode)\n", sSDKsample); - initGL(&argc, argv); + initGL(&argc, argv); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - int devID = findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); - // get number of SMs on this GPU - cudaDeviceProp deviceProps; - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, - deviceProps.multiProcessorCount); + // get number of SMs on this GPU + cudaDeviceProp deviceProps; + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + 
printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); - // Create the timer (for fps measurement) - sdkCreateTimer(&timer); + // Create the timer (for fps measurement) + sdkCreateTimer(&timer); - // load image from disk - loadImageData(argc, argv); + // load image from disk + loadImageData(argc, argv); - printf( - "\n" - "\tControls\n" - "\t=/- : Zoom in/out\n" - "\tb : Run Benchmark g_FilterMode\n" - "\tc : Draw Bicubic Spline Curve\n" - "\t[esc] - Quit\n\n" + printf("\n" + "\tControls\n" + "\t=/- : Zoom in/out\n" + "\tb : Run Benchmark g_FilterMode\n" + "\tc : Draw Bicubic Spline Curve\n" + "\t[esc] - Quit\n\n" - "\tPress number keys to change filtering g_FilterMode:\n\n" - "\t1 : nearest filtering\n" - "\t2 : bilinear filtering\n" - "\t3 : bicubic filtering\n" - "\t4 : fast bicubic filtering\n" - "\t5 : Catmull-Rom filtering\n\n"); + "\tPress number keys to change filtering g_FilterMode:\n\n" + "\t1 : nearest filtering\n" + "\t2 : bilinear filtering\n" + "\t3 : bicubic filtering\n" + "\t4 : fast bicubic filtering\n" + "\t5 : Catmull-Rom filtering\n\n"); - initGLBuffers(); + initGLBuffers(); #if USE_BUFFER_TEX - fprog = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shaderCode); + fprog = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shaderCode); - if (!fprog) { - exit(EXIT_SUCCESS); - } + if (!fprog) { + exit(EXIT_SUCCESS); + } #endif } -void initGL(int *argc, char **argv) { - // initialize GLUT callback functions - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); - glutInitWindowSize(width, height); - glutCreateWindow("CUDA bicubic texture filtering"); - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutMouseFunc(mouse); - glutMotionFunc(motion); - glutReshapeFunc(reshape); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); +void initGL(int *argc, char **argv) +{ + // initialize GLUT callback functions + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); + glutInitWindowSize(width, height); + glutCreateWindow("CUDA bicubic texture filtering"); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutReshapeFunc(reshape); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - initMenus(); + initMenus(); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Required OpenGL extensions are missing."); - exit(EXIT_FAILURE); - } + if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions are missing."); + exit(EXIT_FAILURE); + } #if USE_BUFFER_TEX - if (!areGLExtensionsSupported("GL_EXT_texture_buffer_object")) { - fprintf(stderr, - "OpenGL extension: GL_EXT_texture_buffer_object missing.\n"); - exit(EXIT_FAILURE); - } + if (!areGLExtensionsSupported("GL_EXT_texture_buffer_object")) { + fprintf(stderr, "OpenGL extension: GL_EXT_texture_buffer_object missing.\n"); + exit(EXIT_FAILURE); + } - if (!areGLExtensionsSupported("GL_NV_gpu_program4")) { - fprintf(stderr, "OpenGL extension: GL_NV_gpu_program4 missing.\n"); - exit(EXIT_FAILURE); - } + if (!areGLExtensionsSupported("GL_NV_gpu_program4")) { + fprintf(stderr, "OpenGL extension: GL_NV_gpu_program4 missing.\n"); + exit(EXIT_FAILURE); + } #endif } -void 
loadImageData(int argc, char **argv) { - // load image from disk - uchar *h_data = NULL; - char *srcImagePath = NULL; +void loadImageData(int argc, char **argv) +{ + // load image from disk + uchar *h_data = NULL; + char *srcImagePath = NULL; - if ((srcImagePath = sdkFindFilePath(srcImageFilename, argv[0])) == NULL) { - printf("bicubicTexture loadImageData() could not find <%s>\nExiting...\n", - srcImageFilename); - exit(EXIT_FAILURE); - } + if ((srcImagePath = sdkFindFilePath(srcImageFilename, argv[0])) == NULL) { + printf("bicubicTexture loadImageData() could not find <%s>\nExiting...\n", srcImageFilename); + exit(EXIT_FAILURE); + } - sdkLoadPGM(srcImagePath, &h_data, &imageWidth, &imageHeight); + sdkLoadPGM(srcImagePath, &h_data, &imageWidth, &imageHeight); - printf("Loaded '%s', %d x %d pixels\n", srcImageFilename, imageWidth, - imageHeight); + printf("Loaded '%s', %d x %d pixels\n", srcImageFilename, imageWidth, imageHeight); - cx = imageWidth * 0.5f; - cy = imageHeight * 0.5f; + cx = imageWidth * 0.5f; + cy = imageHeight * 0.5f; - // initialize texture - initTexture(imageWidth, imageHeight, h_data); + // initialize texture + initTexture(imageWidth, imageHeight, h_data); } -void printHelp() { - printf("bicubicTexture Usage:\n"); - printf("\t-file=output.ppm (output file to save to disk)\n"); - printf( - "\t-mode=n (0=Nearest, 1=Bilinear, 2=Bicubic, 3=Fast-Bicubic, " - "4=Catmull-Rom\n"); +void printHelp() +{ + printf("bicubicTexture Usage:\n"); + printf("\t-file=output.ppm (output file to save to disk)\n"); + printf("\t-mode=n (0=Nearest, 1=Bilinear, 2=Bicubic, 3=Fast-Bicubic, " + "4=Catmull-Rom\n"); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; - // parse arguments - char *filename; + // parse arguments + char *filename; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("Starting bicubicTexture\n"); + printf("Starting bicubicTexture\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printHelp(); - exit(EXIT_SUCCESS); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "mode")) { - g_FilterMode = - (eFilterMode)getCmdLineArgumentInt(argc, (const char **)argv, "mode"); - - if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES) { - printf("Invalid Mode setting %d\n", g_FilterMode); - exit(EXIT_FAILURE); + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(); + exit(EXIT_SUCCESS); } - } - if (getCmdLineArgumentString(argc, (const char **)argv, "file", &filename)) { - dumpFilename = filename; - fpsLimit = frameCheckNumber; + if (checkCmdLineFlag(argc, (const char **)argv, "mode")) { + g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **)argv, "mode"); - // Running CUDA kernel (bicubicFiltering) without visualization (QA - // Testing/Verification) - runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); - } else { - // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization - initialize(argc, argv); - glutMainLoop(); - } + if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES) { + printf("Invalid Mode setting %d\n", g_FilterMode); + exit(EXIT_FAILURE); + } + } - exit(EXIT_SUCCESS); + if (getCmdLineArgumentString(argc, (const char **)argv, "file", &filename)) { + dumpFilename = filename; + fpsLimit = 
frameCheckNumber; + + // Running CUDA kernel (bicubicFiltering) without visualization (QA + // Testing/Verification) + runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); + } + else { + // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization + initialize(argc, argv); + glutMainLoop(); + } + + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_cuda.cu b/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_cuda.cu index ef375242..03c4062e 100644 --- a/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_cuda.cu +++ b/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_cuda.cu @@ -28,101 +28,107 @@ #ifndef _BICUBICTEXTURE_CU_ #define _BICUBICTEXTURE_CU_ -#include -#include -#include - #include +#include +#include +#include // includes, cuda #include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; #include "bicubicTexture_kernel.cuh" cudaArray *d_imageArray = 0; -extern "C" void initTexture(int imageWidth, int imageHeight, uchar *h_data) { - // allocate array and copy image data - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned); - checkCudaErrors( - cudaMallocArray(&d_imageArray, &channelDesc, imageWidth, imageHeight)); - checkCudaErrors(cudaMemcpy2DToArray( - d_imageArray, 0, 0, h_data, imageWidth * sizeof(uchar), - imageWidth * sizeof(uchar), imageHeight, cudaMemcpyHostToDevice)); - free(h_data); +extern "C" void initTexture(int imageWidth, int imageHeight, uchar *h_data) +{ + // allocate array and copy image data + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned); + checkCudaErrors(cudaMallocArray(&d_imageArray, &channelDesc, imageWidth, imageHeight)); + checkCudaErrors(cudaMemcpy2DToArray(d_imageArray, + 0, + 0, + h_data, + imageWidth * sizeof(uchar), + imageWidth * sizeof(uchar), + imageHeight, + cudaMemcpyHostToDevice)); + free(h_data); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_imageArray; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_imageArray; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeNormalizedFloat; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeNormalizedFloat; - checkCudaErrors( - cudaCreateTextureObject(&texObjLinear, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texObjLinear, &texRes, &texDescr, NULL)); - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeNormalizedFloat; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = 
cudaAddressModeClamp;
+    texDescr.addressMode[1] = cudaAddressModeClamp;
+    texDescr.readMode = cudaReadModeNormalizedFloat;

-  checkCudaErrors(
-      cudaCreateTextureObject(&texObjPoint, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&texObjPoint, &texRes, &texDescr, NULL));
 }

-extern "C" void freeTexture() {
-  checkCudaErrors(cudaDestroyTextureObject(texObjPoint));
-  checkCudaErrors(cudaDestroyTextureObject(texObjLinear));
-  checkCudaErrors(cudaFreeArray(d_imageArray));
+extern "C" void freeTexture()
+{
+    checkCudaErrors(cudaDestroyTextureObject(texObjPoint));
+    checkCudaErrors(cudaDestroyTextureObject(texObjLinear));
+    checkCudaErrors(cudaFreeArray(d_imageArray));
 }

 // render image using CUDA
-extern "C" void render(int width, int height, float tx, float ty, float scale,
-                       float cx, float cy, dim3 blockSize, dim3 gridSize,
-                       int filter_mode, uchar4 *output) {
-  // call CUDA kernel, writing results to PBO memory
-  switch (filter_mode) {
+extern "C" void render(int width,
+                       int height,
+                       float tx,
+                       float ty,
+                       float scale,
+                       float cx,
+                       float cy,
+                       dim3 blockSize,
+                       dim3 gridSize,
+                       int filter_mode,
+                       uchar4 *output)
+{
+    // call CUDA kernel, writing results to PBO memory
+    switch (filter_mode) {
     case MODE_NEAREST:
-      d_render<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale,
-                                        cx, cy, texObjPoint);
-      break;
+        d_render<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale, cx, cy, texObjPoint);
+        break;
    case MODE_BILINEAR:
-      d_render<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale,
-                                        cx, cy, texObjLinear);
-      break;
+        d_render<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale, cx, cy, texObjLinear);
+        break;
    case MODE_BICUBIC:
-      d_renderBicubic<<<gridSize, blockSize>>>(output, width, height, tx, ty,
-                                               scale, cx, cy, texObjPoint);
-      break;
+        d_renderBicubic<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale, cx, cy, texObjPoint);
+        break;
    case MODE_FAST_BICUBIC:
-      d_renderFastBicubic<<<gridSize, blockSize>>>(
-          output, width, height, tx, ty, scale, cx, cy, texObjLinear);
-      break;
+        d_renderFastBicubic<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale, cx, cy, texObjLinear);
+        break;
    case MODE_CATROM:
-      d_renderCatRom<<<gridSize, blockSize>>>(output, width, height, tx, ty,
-                                              scale, cx, cy, texObjPoint);
-      break;
-  }
+        d_renderCatRom<<<gridSize, blockSize>>>(output, width, height, tx, ty, scale, cx, cy, texObjPoint);
+        break;
+    }

-  getLastCudaError("kernel failed");
+    getLastCudaError("kernel failed");
 }

 #endif
diff --git a/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_kernel.cuh b/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_kernel.cuh
index ec9aad52..fd1f4c5a 100644
--- a/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_kernel.cuh
+++ b/Samples/5_Domain_Specific/bicubicTexture/bicubicTexture_kernel.cuh
@@ -36,30 +36,27 @@
 #ifndef _BICUBICTEXTURE_KERNEL_CUH_
 #define _BICUBICTEXTURE_KERNEL_CUH_

-enum Mode {
-  MODE_NEAREST,
-  MODE_BILINEAR,
-  MODE_BICUBIC,
-  MODE_FAST_BICUBIC,
-  MODE_CATROM
-};
+enum Mode { MODE_NEAREST, MODE_BILINEAR, MODE_BICUBIC, MODE_FAST_BICUBIC, MODE_CATROM };

 cudaTextureObject_t texObjPoint, texObjLinear;

 // w0, w1, w2, and w3 are the four cubic B-spline basis functions
-__host__ __device__ float w0(float a) {
-  // return (1.0f/6.0f)*(-a*a*a + 3.0f*a*a - 3.0f*a + 1.0f);
-  return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f);  // optimized
+__host__ __device__ float w0(float a)
+{
+    // return (1.0f/6.0f)*(-a*a*a + 3.0f*a*a - 3.0f*a + 1.0f);
+    return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); // optimized
 }
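// The four basis functions (w0 above, w1-w3 below) form a partition of
// unity: w0(a) + w1(a) + w2(a) + w3(a) == 1 for a in [0, 1]. That identity
// is what lets tex2DFastBicubic further down collapse the four weights into
// the two bilinear-fetch weights g0 = w0 + w1 and g1 = w2 + w3. A minimal
// host-side sanity check, illustrative only and not part of this patch
// (requires <cassert> and <cmath>):
//
//     for (float a = 0.0f; a <= 1.0f; a += 0.125f)
//         assert(fabsf(w0(a) + w1(a) + w2(a) + w3(a) - 1.0f) < 1e-6f);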
-__host__ __device__ float w1(float a) {
-  // return (1.0f/6.0f)*(3.0f*a*a*a - 6.0f*a*a + 4.0f);
-  return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);
+__host__ __device__ float w1(float a)
+{
+    // return (1.0f/6.0f)*(3.0f*a*a*a - 6.0f*a*a + 4.0f);
+    return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);
 }

-__host__ __device__ float w2(float a) {
-  // return (1.0f/6.0f)*(-3.0f*a*a*a + 3.0f*a*a + 3.0f*a + 1.0f);
-  return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);
+__host__ __device__ float w2(float a)
+{
+    // return (1.0f/6.0f)*(-3.0f*a*a*a + 3.0f*a*a + 3.0f*a + 1.0f);
+    return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);
 }

 __host__ __device__ float w3(float a) { return (1.0f / 6.0f) * (a * a * a); }
@@ -70,91 +67,99 @@ __device__ float g0(float a) { return w0(a) + w1(a); }
 __device__ float g1(float a) { return w2(a) + w3(a); }

 // h0 and h1 are the two offset functions
-__device__ float h0(float a) {
-  // note +0.5 offset to compensate for CUDA linear filtering convention
-  return -1.0f + w1(a) / (w0(a) + w1(a)) + 0.5f;
+__device__ float h0(float a)
+{
+    // note +0.5 offset to compensate for CUDA linear filtering convention
+    return -1.0f + w1(a) / (w0(a) + w1(a)) + 0.5f;
 }

 __device__ float h1(float a) { return 1.0f + w3(a) / (w2(a) + w3(a)) + 0.5f; }

 // filter 4 values using cubic splines
-template <class T>
-__device__ T cubicFilter(float x, T c0, T c1, T c2, T c3) {
-  T r;
-  r = c0 * w0(x);
-  r += c1 * w1(x);
-  r += c2 * w2(x);
-  r += c3 * w3(x);
-  return r;
+template <class T> __device__ T cubicFilter(float x, T c0, T c1, T c2, T c3)
+{
+    T r;
+    r = c0 * w0(x);
+    r += c1 * w1(x);
+    r += c2 * w2(x);
+    r += c3 * w3(x);
+    return r;
 }

 // slow but precise bicubic lookup using 16 texture lookups
-template <class T, class R> // texture data type, return type
-__device__ R tex2DBicubic(const cudaTextureObject_t tex, float x, float y) {
-  x -= 0.5f;
-  y -= 0.5f;
-  float px = floorf(x);
-  float py = floorf(y);
-  float fx = x - px;
-  float fy = y - py;
+template <class T, class R> // texture data type, return type
+__device__ R tex2DBicubic(const cudaTextureObject_t tex, float x, float y)
+{
+    x -= 0.5f;
+    y -= 0.5f;
+    float px = floorf(x);
+    float py = floorf(y);
+    float fx = x - px;
+    float fy = y - py;

-  return cubicFilter<R>(
-      fy, cubicFilter<R>(
-              fx, tex2D<T>(tex, px - 1, py - 1), tex2D<T>(tex, px, py - 1),
-              tex2D<T>(tex, px + 1, py - 1), tex2D<T>(tex, px + 2, py - 1)),
-      cubicFilter<R>(fx, tex2D<T>(tex, px - 1, py), tex2D<T>(tex, px, py),
-                     tex2D<T>(tex, px + 1, py), tex2D<T>(tex, px + 2, py)),
-      cubicFilter<R>(fx, tex2D<T>(tex, px - 1, py + 1),
-                     tex2D<T>(tex, px, py + 1), tex2D<T>(tex, px + 1, py + 1),
-                     tex2D<T>(tex, px + 2, py + 1)),
-      cubicFilter<R>(fx, tex2D<T>(tex, px - 1, py + 2),
-                     tex2D<T>(tex, px, py + 2), tex2D<T>(tex, px + 1, py + 2),
-                     tex2D<T>(tex, px + 2, py + 2)));
+    return cubicFilter<R>(
+        fy,
+        cubicFilter<R>(fx,
+                       tex2D<T>(tex, px - 1, py - 1),
+                       tex2D<T>(tex, px, py - 1),
+                       tex2D<T>(tex, px + 1, py - 1),
+                       tex2D<T>(tex, px + 2, py - 1)),
+        cubicFilter<R>(
+            fx, tex2D<T>(tex, px - 1, py), tex2D<T>(tex, px, py), tex2D<T>(tex, px + 1, py), tex2D<T>(tex, px + 2, py)),
+        cubicFilter<R>(fx,
+                       tex2D<T>(tex, px - 1, py + 1),
+                       tex2D<T>(tex, px, py + 1),
+                       tex2D<T>(tex, px + 1, py + 1),
+                       tex2D<T>(tex, px + 2, py + 1)),
+        cubicFilter<R>(fx,
+                       tex2D<T>(tex, px - 1, py + 2),
+                       tex2D<T>(tex, px, py + 2),
+                       tex2D<T>(tex, px + 1, py + 2),
+                       tex2D<T>(tex, px + 2, py + 2)));
 }

 // fast bicubic texture lookup using 4 bilinear lookups
 // assumes texture is set to non-normalized coordinates, point sampling
-template <class T, class R> // texture data type, return type
-__device__ R tex2DFastBicubic(const cudaTextureObject_t tex, float x, float y) {
-  x -= 0.5f;
-  y -= 0.5f;
-  float px = floorf(x);
-  float py = floorf(y);
-  float fx = x - px;
-  float fy = y - py;
+template <class T, class R> // 
texture data type, return type +__device__ R tex2DFastBicubic(const cudaTextureObject_t tex, float x, float y) +{ + x -= 0.5f; + y -= 0.5f; + float px = floorf(x); + float py = floorf(y); + float fx = x - px; + float fy = y - py; - // note: we could store these functions in a lookup table texture, but maths - // is cheap - float g0x = g0(fx); - float g1x = g1(fx); - float h0x = h0(fx); - float h1x = h1(fx); - float h0y = h0(fy); - float h1y = h1(fy); + // note: we could store these functions in a lookup table texture, but maths + // is cheap + float g0x = g0(fx); + float g1x = g1(fx); + float h0x = h0(fx); + float h1x = h1(fx); + float h0y = h0(fy); + float h1y = h1(fy); - R r = g0(fy) * (g0x * tex2D(tex, px + h0x, py + h0y) + - g1x * tex2D(tex, px + h1x, py + h0y)) + - g1(fy) * (g0x * tex2D(tex, px + h0x, py + h1y) + - g1x * tex2D(tex, px + h1x, py + h1y)); - return r; + R r = g0(fy) * (g0x * tex2D(tex, px + h0x, py + h0y) + g1x * tex2D(tex, px + h1x, py + h0y)) + + g1(fy) * (g0x * tex2D(tex, px + h0x, py + h1y) + g1x * tex2D(tex, px + h1x, py + h1y)); + return r; } // higher-precision 2D bilinear lookup -template // texture data type, return type -__device__ R tex2DBilinear(const cudaTextureObject_t tex, float x, float y) { - x -= 0.5f; - y -= 0.5f; - float px = floorf(x); // integer position - float py = floorf(y); - float fx = x - px; // fractional position - float fy = y - py; - px += 0.5f; - py += 0.5f; +template // texture data type, return type +__device__ R tex2DBilinear(const cudaTextureObject_t tex, float x, float y) +{ + x -= 0.5f; + y -= 0.5f; + float px = floorf(x); // integer position + float py = floorf(y); + float fx = x - px; // fractional position + float fy = y - py; + px += 0.5f; + py += 0.5f; - return lerp(lerp(tex2D(tex, px, py), tex2D(tex, px + 1.0f, py), fx), - lerp(tex2D(tex, px, py + 1.0f), - tex2D(tex, px + 1.0f, py + 1.0f), fx), - fy); + return lerp(lerp(tex2D(tex, px, py), tex2D(tex, px + 1.0f, py), fx), + lerp(tex2D(tex, px, py + 1.0f), tex2D(tex, px + 1.0f, py + 1.0f), fx), + fy); } #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 @@ -174,154 +179,193 @@ __device__ R tex2DBilinear(const cudaTextureObject_t tex, float x, float y) { w: (0, 0) */ -template // texture data type, return type -__device__ float tex2DBilinearGather(const cudaTextureObject_t tex, float x, - float y, int comp = 0) { - x -= 0.5f; - y -= 0.5f; - float px = floorf(x); // integer position - float py = floorf(y); - float fx = x - px; // fractional position - float fy = y - py; +template // texture data type, return type +__device__ float tex2DBilinearGather(const cudaTextureObject_t tex, float x, float y, int comp = 0) +{ + x -= 0.5f; + y -= 0.5f; + float px = floorf(x); // integer position + float py = floorf(y); + float fx = x - px; // fractional position + float fy = y - py; - R samples = tex2Dgather(tex, px + 0.5f, py + 0.5f, comp); + R samples = tex2Dgather(tex, px + 0.5f, py + 0.5f, comp); - return lerp(lerp((float)samples.w, (float)samples.z, fx), - lerp((float)samples.x, (float)samples.y, fx), fy); + return lerp(lerp((float)samples.w, (float)samples.z, fx), lerp((float)samples.x, (float)samples.y, fx), fy); } #endif // Catmull-Rom interpolation -__host__ __device__ float catrom_w0(float a) { - // return -0.5f*a + a*a - 0.5f*a*a*a; - return a * (-0.5f + a * (1.0f - 0.5f * a)); +__host__ __device__ float catrom_w0(float a) +{ + // return -0.5f*a + a*a - 0.5f*a*a*a; + return a * (-0.5f + a * (1.0f - 0.5f * a)); } -__host__ __device__ float catrom_w1(float a) { - // return 1.0f - 
2.5f*a*a + 1.5f*a*a*a; - return 1.0f + a * a * (-2.5f + 1.5f * a); +__host__ __device__ float catrom_w1(float a) +{ + // return 1.0f - 2.5f*a*a + 1.5f*a*a*a; + return 1.0f + a * a * (-2.5f + 1.5f * a); } -__host__ __device__ float catrom_w2(float a) { - // return 0.5f*a + 2.0f*a*a - 1.5f*a*a*a; - return a * (0.5f + a * (2.0f - 1.5f * a)); +__host__ __device__ float catrom_w2(float a) +{ + // return 0.5f*a + 2.0f*a*a - 1.5f*a*a*a; + return a * (0.5f + a * (2.0f - 1.5f * a)); } -__host__ __device__ float catrom_w3(float a) { - // return -0.5f*a*a + 0.5f*a*a*a; - return a * a * (-0.5f + 0.5f * a); +__host__ __device__ float catrom_w3(float a) +{ + // return -0.5f*a*a + 0.5f*a*a*a; + return a * a * (-0.5f + 0.5f * a); } -template -__device__ T catRomFilter(float x, T c0, T c1, T c2, T c3) { - T r; - r = c0 * catrom_w0(x); - r += c1 * catrom_w1(x); - r += c2 * catrom_w2(x); - r += c3 * catrom_w3(x); - return r; +template __device__ T catRomFilter(float x, T c0, T c1, T c2, T c3) +{ + T r; + r = c0 * catrom_w0(x); + r += c1 * catrom_w1(x); + r += c2 * catrom_w2(x); + r += c3 * catrom_w3(x); + return r; } // Note - can't use bilinear trick here because of negative lobes -template // texture data type, return type -__device__ R tex2DCatRom(const cudaTextureObject_t tex, float x, float y) { - x -= 0.5f; - y -= 0.5f; - float px = floorf(x); - float py = floorf(y); - float fx = x - px; - float fy = y - py; +template // texture data type, return type +__device__ R tex2DCatRom(const cudaTextureObject_t tex, float x, float y) +{ + x -= 0.5f; + y -= 0.5f; + float px = floorf(x); + float py = floorf(y); + float fx = x - px; + float fy = y - py; - return catRomFilter( - fy, catRomFilter( - fx, tex2D(tex, px - 1, py - 1), tex2D(tex, px, py - 1), - tex2D(tex, px + 1, py - 1), tex2D(tex, px + 2, py - 1)), - catRomFilter(fx, tex2D(tex, px - 1, py), tex2D(tex, px, py), - tex2D(tex, px + 1, py), tex2D(tex, px + 2, py)), - catRomFilter(fx, tex2D(tex, px - 1, py + 1), - tex2D(tex, px, py + 1), tex2D(tex, px + 1, py + 1), - tex2D(tex, px + 2, py + 1)), - catRomFilter(fx, tex2D(tex, px - 1, py + 2), - tex2D(tex, px, py + 2), tex2D(tex, px + 1, py + 2), - tex2D(tex, px + 2, py + 2))); + return catRomFilter( + fy, + catRomFilter(fx, + tex2D(tex, px - 1, py - 1), + tex2D(tex, px, py - 1), + tex2D(tex, px + 1, py - 1), + tex2D(tex, px + 2, py - 1)), + catRomFilter( + fx, tex2D(tex, px - 1, py), tex2D(tex, px, py), tex2D(tex, px + 1, py), tex2D(tex, px + 2, py)), + catRomFilter(fx, + tex2D(tex, px - 1, py + 1), + tex2D(tex, px, py + 1), + tex2D(tex, px + 1, py + 1), + tex2D(tex, px + 2, py + 1)), + catRomFilter(fx, + tex2D(tex, px - 1, py + 2), + tex2D(tex, px, py + 2), + tex2D(tex, px + 1, py + 2), + tex2D(tex, px + 2, py + 2))); } // test functions // render image using normal bilinear texture lookup -__global__ void d_render(uchar4 *d_output, uint width, uint height, float tx, - float ty, float scale, float cx, float cy, - cudaTextureObject_t texObj) { - uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; - uint i = __umul24(y, width) + x; +__global__ void d_render(uchar4 *d_output, + uint width, + uint height, + float tx, + float ty, + float scale, + float cx, + float cy, + cudaTextureObject_t texObj) +{ + uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; + uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; + uint i = __umul24(y, width) + x; - float u = (x - cx) * scale + cx + tx; - float v = (y - cy) * scale + cy + ty; + float u = (x - cx) 
* scale + cx + tx; + float v = (y - cy) * scale + cy + ty; - if ((x < width) && (y < height)) { - // write output color - float c = tex2D(texObj, u, v); - // float c = tex2DBilinear(tex, u, v); - // float c = tex2DBilinearGather(tex2, u, v, 0) / 255.0f; - d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); - } + if ((x < width) && (y < height)) { + // write output color + float c = tex2D(texObj, u, v); + // float c = tex2DBilinear(tex, u, v); + // float c = tex2DBilinearGather(tex2, u, v, 0) / 255.0f; + d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); + } } // render image using bicubic texture lookup -__global__ void d_renderBicubic(uchar4 *d_output, uint width, uint height, - float tx, float ty, float scale, float cx, - float cy, cudaTextureObject_t texObj) { - uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; - uint i = __umul24(y, width) + x; +__global__ void d_renderBicubic(uchar4 *d_output, + uint width, + uint height, + float tx, + float ty, + float scale, + float cx, + float cy, + cudaTextureObject_t texObj) +{ + uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; + uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; + uint i = __umul24(y, width) + x; - float u = (x - cx) * scale + cx + tx; - float v = (y - cy) * scale + cy + ty; + float u = (x - cx) * scale + cx + tx; + float v = (y - cy) * scale + cy + ty; - if ((x < width) && (y < height)) { - // write output color - float c = tex2DBicubic(texObj, u, v); - d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); - } + if ((x < width) && (y < height)) { + // write output color + float c = tex2DBicubic(texObj, u, v); + d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); + } } // render image using fast bicubic texture lookup -__global__ void d_renderFastBicubic(uchar4 *d_output, uint width, uint height, - float tx, float ty, float scale, float cx, - float cy, cudaTextureObject_t texObj) { - uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; - uint i = __umul24(y, width) + x; +__global__ void d_renderFastBicubic(uchar4 *d_output, + uint width, + uint height, + float tx, + float ty, + float scale, + float cx, + float cy, + cudaTextureObject_t texObj) +{ + uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; + uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; + uint i = __umul24(y, width) + x; - float u = (x - cx) * scale + cx + tx; - float v = (y - cy) * scale + cy + ty; + float u = (x - cx) * scale + cx + tx; + float v = (y - cy) * scale + cy + ty; - if ((x < width) && (y < height)) { - // write output color - float c = tex2DFastBicubic(texObj, u, v); - d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); - } + if ((x < width) && (y < height)) { + // write output color + float c = tex2DFastBicubic(texObj, u, v); + d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); + } } // render image using Catmull-Rom texture lookup -__global__ void d_renderCatRom(uchar4 *d_output, uint width, uint height, - float tx, float ty, float scale, float cx, - float cy, cudaTextureObject_t texObj) { - uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; - uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; - uint i = __umul24(y, width) + x; +__global__ void d_renderCatRom(uchar4 *d_output, + uint width, + uint height, + float tx, + float ty, + float scale, + float cx, + float cy, + cudaTextureObject_t texObj) +{ + uint x = __umul24(blockIdx.x, 
blockDim.x) + threadIdx.x; + uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; + uint i = __umul24(y, width) + x; - float u = (x - cx) * scale + cx + tx; - float v = (y - cy) * scale + cy + ty; + float u = (x - cx) * scale + cx + tx; + float v = (y - cy) * scale + cy + ty; - if ((x < width) && (y < height)) { - // write output color - float c = tex2DCatRom(texObj, u, v); - d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); - } + if ((x < width) && (y < height)) { + // write output color + float c = tex2DCatRom(texObj, u, v); + d_output[i] = make_uchar4(c * 0xff, c * 0xff, c * 0xff, 0); + } } -#endif // _BICUBICTEXTURE_KERNEL_CUH_ +#endif // _BICUBICTEXTURE_KERNEL_CUH_ diff --git a/Samples/5_Domain_Specific/bilateralFilter/README.md b/Samples/5_Domain_Specific/bilateralFilter/README.md index e0bffcad..a40f94b7 100644 --- a/Samples/5_Domain_Specific/bilateralFilter/README.md +++ b/Samples/5_Domain_Specific/bilateralFilter/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter.cpp b/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter.cpp index 33adbdb2..a16c9c01 100644 --- a/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter.cpp +++ b/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter.cpp @@ -63,50 +63,49 @@ #endif // CUDA utilities and system includes -#include #include - -#include // CUDA device initialization helper functions +#include +#include // CUDA device initialization helper functions // Shared Library Test Functions -#include // CUDA SDK Helper functions +#include // CUDA SDK Helper functions #define MAX_EPSILON_ERROR 5.0f -#define REFRESH_DELAY 10 // ms -#define MIN_EUCLIDEAN_D 0.01f -#define MAX_EUCLIDEAN_D 5.f +#define REFRESH_DELAY 10 // ms +#define MIN_EUCLIDEAN_D 0.01f +#define MAX_EUCLIDEAN_D 5.f #define MAX_FILTER_RADIUS 25 const static char *sSDKsample = "CUDA Bilateral Filter"; -const char *image_filename = "nature_monte.bmp"; -int iterations = 1; -float gaussian_delta = 4; -float euclidean_delta = 0.1f; -int filter_radius = 5; +const char *image_filename = "nature_monte.bmp"; +int iterations = 1; +float gaussian_delta = 4; +float euclidean_delta = 0.1f; +int filter_radius = 5; -unsigned int width, height; +unsigned int width, height; unsigned int *hImage = NULL; -GLuint pbo; // OpenGL pixel buffer object -struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange -GLuint texid; // texture -GLuint shader; +GLuint pbo; // OpenGL pixel buffer object +struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange +GLuint texid; // texture +GLuint shader; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; -StopWatchInterface *timer = NULL; +StopWatchInterface *timer = NULL; StopWatchInterface *kernel_timer = NULL; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -unsigned int g_TotalErrors = 0; -bool g_bInteractive = false; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +unsigned int g_TotalErrors = 0; +bool g_bInteractive = false; -//#define GL_TEXTURE_TYPE GL_TEXTURE_RECTANGLE_ARB +// #define GL_TEXTURE_TYPE GL_TEXTURE_RECTANGLE_ARB #define GL_TEXTURE_TYPE GL_TEXTURE_2D 
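// How the knobs above interact, as a minimal host-side sketch (illustrative
// only; the sample's GPU kernel lives elsewhere and may differ in details):
// gaussian_delta is the sigma of the spatial Gaussian table filled by
// updateGaussian() just below, euclidean_delta the sigma of the range term,
// and filter_radius bounds the neighbourhood. The helper name here is
// hypothetical, showing the standard bilateral weight for one neighbour:
//
//     static float bilateralWeight(float spatialDist2, float colorDist2)
//     {
//         float spatial = expf(-spatialDist2 / (2.0f * gaussian_delta * gaussian_delta));
//         float range   = expf(-colorDist2 / (2.0f * euclidean_delta * euclidean_delta));
//         return spatial * range; // a large euclidean_delta degenerates to a plain Gaussian blur
//     }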
extern "C" void loadImageData(int argc, char **argv); @@ -115,109 +114,113 @@ extern "C" void loadImageData(int argc, char **argv); extern "C" void initTexture(int width, int height, void *pImage); extern "C" void freeTextures(); -extern "C" double bilateralFilterRGBA(unsigned int *d_dest, int width, - int height, float e_d, int radius, - int iterations, +extern "C" double bilateralFilterRGBA(unsigned int *d_dest, + int width, + int height, + float e_d, + int radius, + int iterations, StopWatchInterface *timer); -extern "C" void updateGaussian(float delta, int radius); -extern "C" void updateGaussianGold(float delta, int radius); -extern "C" void bilateralFilterGold(unsigned int *pSrc, unsigned int *pDest, - float e_d, int w, int h, int r); -extern "C" void LoadBMPFile(uchar4 **dst, unsigned int *width, - unsigned int *height, const char *name); +extern "C" void updateGaussian(float delta, int radius); +extern "C" void updateGaussianGold(float delta, int radius); +extern "C" void bilateralFilterGold(unsigned int *pSrc, unsigned int *pDest, float e_d, int w, int h, int r); +extern "C" void LoadBMPFile(uchar4 **dst, unsigned int *width, unsigned int *height, const char *name); -void varyEuclidean() { - static float factor = 1.02f; +void varyEuclidean() +{ + static float factor = 1.02f; - if (euclidean_delta > MAX_EUCLIDEAN_D) { - factor = 1 / 1.02f; - } + if (euclidean_delta > MAX_EUCLIDEAN_D) { + factor = 1 / 1.02f; + } - if (euclidean_delta < MIN_EUCLIDEAN_D) { - factor = 1.02f; - } + if (euclidean_delta < MIN_EUCLIDEAN_D) { + factor = 1.02f; + } - euclidean_delta *= factor; + euclidean_delta *= factor; } -void computeFPS() { - fpsCount++; +void computeFPS() +{ + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f); - sprintf(fps, - "CUDA Bilateral Filter: %3.f fps (radius=%d, iter=%d, " - "euclidean=%.2f, gaussian=%.2f)", - ifps, filter_radius, iterations, (double)euclidean_delta, - (double)gaussian_delta); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.0f / (sdkGetAverageTimerValue(&timer) / 1000.0f); + sprintf(fps, + "CUDA Bilateral Filter: %3.f fps (radius=%d, iter=%d, " + "euclidean=%.2f, gaussian=%.2f)", + ifps, + filter_radius, + iterations, + (double)euclidean_delta, + (double)gaussian_delta); - glutSetWindowTitle(fps); - fpsCount = 0; - fpsLimit = (int)MAX(ifps, 1.0f); + glutSetWindowTitle(fps); + fpsCount = 0; + fpsLimit = (int)MAX(ifps, 1.0f); - sdkResetTimer(&timer); - } + sdkResetTimer(&timer); + } - if (!g_bInteractive) { - varyEuclidean(); - } + if (!g_bInteractive) { + varyEuclidean(); + } } // display results using OpenGL -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // execute filter, writing results to pbo - unsigned int *dResult; + // execute filter, writing results to pbo + unsigned int *dResult; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&dResult, &num_bytes, cuda_pbo_resource)); - bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, - iterations, kernel_timer); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dResult, &num_bytes, cuda_pbo_resource)); + bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); - 
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); - // Common display code path - { - glClear(GL_COLOR_BUFFER_BIT); - - // load texture from pbo - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBindTexture(GL_TEXTURE_2D, texid); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, - GL_UNSIGNED_BYTE, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - - // fragment program is required to display floating point texture - glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); - glEnable(GL_FRAGMENT_PROGRAM_ARB); - glDisable(GL_DEPTH_TEST); - - glBegin(GL_QUADS); + // Common display code path { - glTexCoord2f(0, 0); - glVertex2f(0, 0); - glTexCoord2f(1, 0); - glVertex2f(1, 0); - glTexCoord2f(1, 1); - glVertex2f(1, 1); - glTexCoord2f(0, 1); - glVertex2f(0, 1); + glClear(GL_COLOR_BUFFER_BIT); + + // load texture from pbo + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, texid); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + + // fragment program is required to display floating point texture + glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader); + glEnable(GL_FRAGMENT_PROGRAM_ARB); + glDisable(GL_DEPTH_TEST); + + glBegin(GL_QUADS); + { + glTexCoord2f(0, 0); + glVertex2f(0, 0); + glTexCoord2f(1, 0); + glVertex2f(1, 0); + glTexCoord2f(1, 1); + glVertex2f(1, 1); + glTexCoord2f(0, 1); + glVertex2f(0, 1); + } + glEnd(); + glBindTexture(GL_TEXTURE_TYPE, 0); + glDisable(GL_FRAGMENT_PROGRAM_ARB); } - glEnd(); - glBindTexture(GL_TEXTURE_TYPE, 0); - glDisable(GL_FRAGMENT_PROGRAM_ARB); - } - glutSwapBuffers(); - glutReportErrors(); + glutSwapBuffers(); + glutReportErrors(); - sdkStopTimer(&timer); + sdkStopTimer(&timer); - computeFPS(); + computeFPS(); } /* @@ -226,449 +229,454 @@ void display() { up arrow to increase the euclidean delta down arrow to decrease the euclidean delta */ -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case 'a': case 'A': - g_bInteractive = !g_bInteractive; - printf("> Animation is %s\n", !g_bInteractive ? "ON" : "OFF"); - break; + g_bInteractive = !g_bInteractive; + printf("> Animation is %s\n", !g_bInteractive ? 
"ON" : "OFF"); + break; case ']': - iterations++; - break; + iterations++; + break; case '[': - iterations--; + iterations--; - if (iterations < 1) { - iterations = 1; - } + if (iterations < 1) { + iterations = 1; + } - break; + break; case '=': case '+': - filter_radius++; + filter_radius++; - if (filter_radius > MAX_FILTER_RADIUS) { - filter_radius = MAX_FILTER_RADIUS; - } + if (filter_radius > MAX_FILTER_RADIUS) { + filter_radius = MAX_FILTER_RADIUS; + } - updateGaussian(gaussian_delta, filter_radius); - break; + updateGaussian(gaussian_delta, filter_radius); + break; case '-': - filter_radius--; + filter_radius--; - if (filter_radius < 1) { - filter_radius = 1; - } + if (filter_radius < 1) { + filter_radius = 1; + } - updateGaussian(gaussian_delta, filter_radius); - break; + updateGaussian(gaussian_delta, filter_radius); + break; case 'E': - euclidean_delta *= 1.5; - break; + euclidean_delta *= 1.5; + break; case 'e': - euclidean_delta /= 1.5; - break; + euclidean_delta /= 1.5; + break; case 'g': - if (gaussian_delta > 0.1) { - gaussian_delta /= 2; - } + if (gaussian_delta > 0.1) { + gaussian_delta /= 2; + } - // updateGaussianGold(gaussian_delta, filter_radius); - updateGaussian(gaussian_delta, filter_radius); - break; + // updateGaussianGold(gaussian_delta, filter_radius); + updateGaussian(gaussian_delta, filter_radius); + break; case 'G': - gaussian_delta *= 2; - // updateGaussianGold(gaussian_delta, filter_radius); - updateGaussian(gaussian_delta, filter_radius); - break; + gaussian_delta *= 2; + // updateGaussianGold(gaussian_delta, filter_radius); + updateGaussian(gaussian_delta, filter_radius); + break; default: - break; - } + break; + } - printf( - "filter radius = %d, iterations = %d, gaussian delta = %.2f, euclidean " - "delta = %.2f\n", - filter_radius, iterations, gaussian_delta, euclidean_delta); - glutPostRedisplay(); -} - -void timerEvent(int value) { - if (glutGetWindow()) { + printf("filter radius = %d, iterations = %d, gaussian delta = %.2f, euclidean " + "delta = %.2f\n", + filter_radius, + iterations, + gaussian_delta, + euclidean_delta); glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - } } -void reshape(int x, int y) { - glViewport(0, 0, x, y); - - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); +void timerEvent(int value) +{ + if (glutGetWindow()) { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + } } -void initCuda() { - // initialize gaussian mask - updateGaussian(gaussian_delta, filter_radius); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); - initTexture(width, height, hImage); - sdkCreateTimer(&timer); - sdkCreateTimer(&kernel_timer); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } -void cleanup() { - sdkDeleteTimer(&timer); - sdkDeleteTimer(&kernel_timer); +void initCuda() +{ + // initialize gaussian mask + updateGaussian(gaussian_delta, filter_radius); - if (hImage) { - free(hImage); - } + initTexture(width, height, hImage); + sdkCreateTimer(&timer); + sdkCreateTimer(&kernel_timer); +} - freeTextures(); +void cleanup() +{ + sdkDeleteTimer(&timer); + sdkDeleteTimer(&kernel_timer); - cudaGraphicsUnregisterResource(cuda_pbo_resource); + if (hImage) { + free(hImage); + } - glDeleteBuffers(1, &pbo); - glDeleteTextures(1, &texid); - glDeleteProgramsARB(1, &shader); + freeTextures(); + + 
cudaGraphicsUnregisterResource(cuda_pbo_resource); + + glDeleteBuffers(1, &pbo); + glDeleteTextures(1, &texid); + glDeleteProgramsARB(1, &shader); } // shader for displaying floating-point texture -static const char *shader_code = - "!!ARBfp1.0\n" - "TEX result.color, fragment.texcoord, texture[0], 2D; \n" - "END"; +static const char *shader_code = "!!ARBfp1.0\n" + "TEX result.color, fragment.texcoord, texture[0], 2D; \n" + "END"; -GLuint compileASMShader(GLenum program_type, const char *code) { - GLuint program_id; - glGenProgramsARB(1, &program_id); - glBindProgramARB(program_type, program_id); - glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, - (GLsizei)strlen(code), (GLubyte *)code); +GLuint compileASMShader(GLenum program_type, const char *code) +{ + GLuint program_id; + glGenProgramsARB(1, &program_id); + glBindProgramARB(program_type, program_id); + glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); - GLint error_pos; - glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); + GLint error_pos; + glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); - if (error_pos != -1) { - const GLubyte *error_string; - error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); - printf("Program error at position: %d\n%s\n", (int)error_pos, error_string); - return 0; - } + if (error_pos != -1) { + const GLubyte *error_string; + error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); + printf("Program error at position: %d\n%s\n", (int)error_pos, error_string); + return 0; + } - return program_id; + return program_id; } -void initGLResources() { - // create pixel buffer object - glGenBuffers(1, &pbo); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, - hImage, GL_STREAM_DRAW_ARB); +void initGLResources() +{ + // create pixel buffer object + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, hImage, GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); - // create texture for display - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); + // create texture for display + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); - // load shader program - shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); + // load shader program + shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// -int runBenchmark(int argc, char **argv) { - printf("[runBenchmark]: [%s]\n", sSDKsample); +int runBenchmark(int argc, char **argv) +{ + printf("[runBenchmark]: [%s]\n", sSDKsample); - loadImageData(argc, argv); - initCuda(); + loadImageData(argc, argv); + initCuda(); - unsigned int *dResult; - size_t pitch; - checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, - width * sizeof(unsigned int), height)); - sdkStartTimer(&kernel_timer); + unsigned int *dResult; + size_t pitch; + checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width * sizeof(unsigned int), height)); + sdkStartTimer(&kernel_timer); - // warm-up - bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, - iterations, kernel_timer); - checkCudaErrors(cudaDeviceSynchronize()); + // warm-up + bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); + checkCudaErrors(cudaDeviceSynchronize()); - // Start round-trip timer and process iCycles loops on the GPU - iterations = 1; // standard 1-pass filtering - const int iCycles = 150; - double dProcessingTime = 0.0; - printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles); + // Start round-trip timer and process iCycles loops on the GPU + iterations = 1; // standard 1-pass filtering + const int iCycles = 150; + double dProcessingTime = 0.0; + printf("\nRunning BilateralFilterGPU for %d cycles...\n\n", iCycles); - for (int i = 0; i < iCycles; i++) { - dProcessingTime += - bilateralFilterRGBA(dResult, width, height, euclidean_delta, - filter_radius, iterations, kernel_timer); - } + for (int i = 0; i < iCycles; i++) { + dProcessingTime += + bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); + } - // check if kernel execution generated an error and sync host - getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&kernel_timer); + // check if kernel execution generated an error and sync host + getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&kernel_timer); - // Get average computation time - dProcessingTime /= (double)iCycles; + // Get average computation time + dProcessingTime /= (double)iCycles; - // log testname, throughput, timing and config info to sample and master logs - printf( - "bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f " - "s, Size = %u RGBA Pixels, NumDevsUsed = %u\n", - (1.0e-6 * width * height) / dProcessingTime, dProcessingTime, - (width * height), 1); - printf("\n"); + // log testname, throughput, timing and config info to sample and master logs + printf("bilateralFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f " + "s, Size = %u RGBA Pixels, NumDevsUsed = %u\n", + (1.0e-6 * width * height) / dProcessingTime, + dProcessingTime, + (width * height), + 1); + printf("\n"); - return 0; + return 0; } -void initGL(int argc, char **argv) { - // initialize GLUT - glutInit(&argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(width, height); +void initGL(int argc, char **argv) +{ + // initialize GLUT + glutInit(&argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(width, height); - glutCreateWindow("CUDA Bilateral Filter"); - glutDisplayFunc(display); + 
glutCreateWindow("CUDA Bilateral Filter"); + glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - // glutIdleFunc(idle); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + // glutIdleFunc(idle); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - printf("Error: failed to get minimal extensions for demo\n"); - printf("This sample requires:\n"); - printf(" OpenGL version 2.0\n"); - printf(" GL_ARB_vertex_buffer_object\n"); - printf(" GL_ARB_pixel_buffer_object\n"); - exit(EXIT_FAILURE); - } + if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + printf("Error: failed to get minimal extensions for demo\n"); + printf("This sample requires:\n"); + printf(" OpenGL version 2.0\n"); + printf(" GL_ARB_vertex_buffer_object\n"); + printf(" GL_ARB_pixel_buffer_object\n"); + exit(EXIT_FAILURE); + } } // This test specifies a single test (where you specify radius and/or // iterations) -int runSingleTest(char *ref_file, char *exec_path) { - int nTotalErrors = 0; - char dump_file[256]; +int runSingleTest(char *ref_file, char *exec_path) +{ + int nTotalErrors = 0; + char dump_file[256]; - printf("[runSingleTest]: [%s]\n", sSDKsample); + printf("[runSingleTest]: [%s]\n", sSDKsample); - initCuda(); + initCuda(); - unsigned int *dResult; - unsigned int *hResult = - (unsigned int *)malloc(width * height * sizeof(unsigned int)); - size_t pitch; - checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, - width * sizeof(unsigned int), height)); + unsigned int *dResult; + unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int)); + size_t pitch; + checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width * sizeof(unsigned int), height)); - // run the sample radius - { - printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, - iterations); - bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, - iterations, kernel_timer); + // run the sample radius + { + printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations); + bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer); - // check if kernel execution generated an error - getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); - checkCudaErrors(cudaDeviceSynchronize()); + // check if kernel execution generated an error + getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED"); + checkCudaErrors(cudaDeviceSynchronize()); - // readback the results to system memory - cudaMemcpy2D(hResult, sizeof(unsigned int) * width, dResult, pitch, - sizeof(unsigned int) * width, height, cudaMemcpyDeviceToHost); + // readback the results to system memory + cudaMemcpy2D(hResult, + sizeof(unsigned int) * width, + dResult, + pitch, + sizeof(unsigned int) * width, + height, + cudaMemcpyDeviceToHost); - sprintf(dump_file, "nature_%02d.ppm", filter_radius); + sprintf(dump_file, "nature_%02d.ppm", filter_radius); - sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, - height); + sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height); - if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), - MAX_EPSILON_ERROR, 0.15f, false)) { - printf("Image is Different "); - nTotalErrors++; - } else { 
- printf("Image is Matching "); + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false)) { + printf("Image is Different "); + nTotalErrors++; + } + else { + printf("Image is Matching "); + } + + printf(" <%s>\n", ref_file); + } + printf("\n"); + + free(hResult); + checkCudaErrors(cudaFree(dResult)); + freeTextures(); + + return nTotalErrors; +} + +void loadImageData(int argc, char **argv) +{ + // load image (needed so we can get the width and height before we create the + // window + char *image_path = NULL; + + if (argc >= 1) { + image_path = sdkFindFilePath(image_filename, argv[0]); } - printf(" <%s>\n", ref_file); - } - printf("\n"); + if (image_path == NULL) { + fprintf(stderr, "Error finding image file '%s'\n", image_filename); + exit(EXIT_FAILURE); + } - free(hResult); - checkCudaErrors(cudaFree(dResult)); - freeTextures(); + LoadBMPFile((uchar4 **)&hImage, &width, &height, image_path); - return nTotalErrors; + if (!hImage) { + fprintf(stderr, "Error opening file '%s'\n", image_path); + exit(EXIT_FAILURE); + } + + printf("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); } -void loadImageData(int argc, char **argv) { - // load image (needed so we can get the width and height before we create the - // window - char *image_path = NULL; +bool checkCUDAProfile(int dev, int min_runtime, int min_compute) +{ + int runtimeVersion = 0; - if (argc >= 1) { - image_path = sdkFindFilePath(image_filename, argv[0]); - } + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); - if (image_path == NULL) { - fprintf(stderr, "Error finding image file '%s'\n", image_filename); - exit(EXIT_FAILURE); - } + fprintf(stderr, "\nDevice %d: \"%s\"\n", dev, deviceProp.name); + cudaRuntimeGetVersion(&runtimeVersion); + fprintf(stderr, " CUDA Runtime Version :\t%d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10); + fprintf(stderr, " CUDA Compute Capability :\t%d.%d\n", deviceProp.major, deviceProp.minor); - LoadBMPFile((uchar4 **)&hImage, &width, &height, image_path); - - if (!hImage) { - fprintf(stderr, "Error opening file '%s'\n", image_path); - exit(EXIT_FAILURE); - } - - printf("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); + if (runtimeVersion >= min_runtime && ((deviceProp.major << 4) + deviceProp.minor) >= min_compute) { + return true; + } + else { + return false; + } } -bool checkCUDAProfile(int dev, int min_runtime, int min_compute) { - int runtimeVersion = 0; - - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - - fprintf(stderr, "\nDevice %d: \"%s\"\n", dev, deviceProp.name); - cudaRuntimeGetVersion(&runtimeVersion); - fprintf(stderr, " CUDA Runtime Version :\t%d.%d\n", - runtimeVersion / 1000, (runtimeVersion % 100) / 10); - fprintf(stderr, " CUDA Compute Capability :\t%d.%d\n", deviceProp.major, - deviceProp.minor); - - if (runtimeVersion >= min_runtime && - ((deviceProp.major << 4) + deviceProp.minor) >= min_compute) { - return true; - } else { - return false; - } -} - -void printHelp() { - printf("bilateralFilter usage\n"); - printf(" -radius=n (specify the filter radius n to use)\n"); - printf(" -passes=n (specify the number of passes n to use)\n"); - printf(" -file=name (specify reference file for comparison)\n"); +void printHelp() +{ + printf("bilateralFilter usage\n"); + printf(" -radius=n (specify the filter radius n to use)\n"); + printf(" -passes=n (specify the number of passes n to use)\n"); + printf(" -file=name (specify reference file for comparison)\n"); } 
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // start logs - int devID; - char *ref_file = NULL; - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + // start logs + int devID; + char *ref_file = NULL; + printf("%s Starting...\n\n", argv[0]); #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { - filter_radius = - getCmdLineArgumentInt(argc, (const char **)argv, "radius"); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "radius")) { + filter_radius = getCmdLineArgumentInt(argc, (const char **)argv, "radius"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { + iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); + } } - if (checkCmdLineFlag(argc, (const char **)argv, "passes")) { - iterations = getCmdLineArgumentInt(argc, (const char **)argv, "passes"); + // load image to process + loadImageData(argc, argv); + devID = findCudaDevice(argc, (const char **)argv); + + if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { + // This is a separate mode of the sample, where we are benchmark the kernels + // for performance + // Running CUDA kernels (bilateralfilter) in Benchmarking mode + g_TotalErrors += runBenchmark(argc, argv); + + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } + else if (checkCmdLineFlag(argc, (const char **)argv, "radius") + || checkCmdLineFlag(argc, (const char **)argv, "passes")) { + // This overrides the default mode. Users can specify the radius used by + // the filter kernel + g_TotalErrors += runSingleTest(ref_file, argv[0]); - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", - (char **)&ref_file); + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } - } + else { + // Default mode running with OpenGL visualization and in automatic mode + // the output automatically changes animation + printf("\n"); - // load image to process - loadImageData(argc, argv); - devID = findCudaDevice(argc, (const char **)argv); + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with + // OpenGL/CUDA interop. + initGL(argc, (char **)argv); - if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) { - // This is a separate mode of the sample, where we are benchmark the kernels - // for performance - // Running CUDA kernels (bilateralfilter) in Benchmarking mode - g_TotalErrors += runBenchmark(argc, argv); - - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - } else if (checkCmdLineFlag(argc, (const char **)argv, "radius") || - checkCmdLineFlag(argc, (const char **)argv, "passes")) { - // This overrides the default mode. Users can specify the radius used by - // the filter kernel - g_TotalErrors += runSingleTest(ref_file, argv[0]); - - exit(g_TotalErrors == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); - } else { - // Default mode running with OpenGL visualization and in automatic mode - // the output automatically changes animation - printf("\n"); - - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with - // OpenGL/CUDA interop. - initGL(argc, (char **)argv); - - initCuda(); - initGLResources(); + initCuda(); + initGLResources(); // sets the callback function so it will call cleanup upon exit #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - printf("Running Standard Demonstration with GLUT loop...\n\n"); - printf( - "Press '+' and '-' to change filter width\n" - "Press ']' and '[' to change number of iterations\n" - "Press 'e' and 'E' to change Euclidean delta\n" - "Press 'g' and 'G' to change Gaussian delta\n" - "Press 'a' or 'A' to change Animation mode ON/OFF\n\n"); + printf("Running Standard Demonstration with GLUT loop...\n\n"); + printf("Press '+' and '-' to change filter width\n" + "Press ']' and '[' to change number of iterations\n" + "Press 'e' and 'E' to change Euclidean delta\n" + "Press 'g' and 'G' to change Gaussian delta\n" + "Press 'a' or 'A' to change Animation mode ON/OFF\n\n"); - // Main OpenGL loop that will run visualization for every vsync - glutMainLoop(); - } + // Main OpenGL loop that will run visualization for every vsync + glutMainLoop(); + } } diff --git a/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter_cpu.cpp b/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter_cpu.cpp index 22ec23b5..bc113c5b 100644 --- a/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter_cpu.cpp +++ b/Samples/5_Domain_Specific/bilateralFilter/bilateralFilter_cpu.cpp @@ -32,122 +32,129 @@ // export C interface #define EPSILON 1e-3 extern "C" void updateGaussianGold(float delta, int radius); -extern "C" void bilateralFilterGold(unsigned int *pSrc, unsigned int *pDest, - float e_d, int w, int h, int r); +extern "C" void bilateralFilterGold(unsigned int *pSrc, unsigned int *pDest, float e_d, int w, int h, int r); // variables float gaussian[50]; -struct float4 { - float x; - float y; - float z; - float w; +struct float4 +{ + float x; + float y; + float z; + float w; - float4(){}; - float4(float value) { x = y = z = w = value; } + float4() {}; + float4(float value) { x = y = z = w = value; } }; -void updateGaussianGold(float delta, int radius) { - for (int i = 0; i < 2 * radius + 1; i++) { - int x = i - radius; - gaussian[i] = expf(-(x * x) / (2 * delta * delta)); - } -} - -float heuclideanLen(float4 a, float4 b, float d) { - float mod = (b.x - a.x) * (b.x - a.x) + (b.y - a.y) * (b.y - a.y) + - (b.z - a.z) * (b.z - a.z) + (b.w - a.w) * (b.w - a.w); - - return expf(-mod / (2 * d * d)); -} - -unsigned int hrgbaFloatToInt(float4 rgba) { - unsigned int w = (((unsigned int)(fabs(rgba.w) * 255.0f)) & 0xff) << 24; - unsigned int z = (((unsigned int)(fabs(rgba.z) * 255.0f)) & 0xff) << 16; - unsigned int y = (((unsigned int)(fabs(rgba.y) * 255.0f)) & 0xff) << 8; - unsigned int x = ((unsigned int)(fabs(rgba.x) * 255.0f)) & 0xff; - - return (w | z | y | x); -} - -float4 hrgbaIntToFloat(unsigned int c) { - float4 rgba; - rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; - rgba.y = ((c >> 8) & 0xff) * 0.003921568627f; // /255.0f; - rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f; - rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f; - return rgba; -} - 
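// The gold-reference loop reformatted below computes, for each pixel p,
//     out(p) = sum_q w(p, q) * in(q) / sum_q w(p, q)
// where the weight w is a spatial (domain) Gaussian times a range Gaussian on
// color distance. A minimal sketch of one weight evaluation, assuming
// gaussian[] was filled by updateGaussianGold() for radius r and colorDist2 is
// the squared color distance that heuclideanLen() computes internally:

#include <cmath>

static float bilateralWeight(const float *gaussian, int r, int i, int j, float colorDist2, float e_d)
{
    float domain = gaussian[r + i] * gaussian[r + j];      // spatial term
    float range  = expf(-colorDist2 / (2.0f * e_d * e_d)); // range term
    return domain * range;                                 // combined weight
}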
-float4 mul(float a, float4 b) { - float4 ans; - ans.x = a * b.x; - ans.y = a * b.y; - ans.z = a * b.z; - ans.w = a * b.w; - - return ans; -} - -float4 add4(float4 a, float4 b) { - float4 ans; - ans.x = a.x + b.x; - ans.y = a.y + b.y; - ans.z = a.z + b.z; - ans.w = a.w + b.w; - - return ans; -} - -void bilateralFilterGold(unsigned int *pSrc, unsigned int *pDest, float e_d, - int w, int h, int r) { - float4 *hImage = new float4[w * h]; - float domainDist, colorDist, factor; - - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - hImage[y * w + x] = hrgbaIntToFloat(pSrc[y * w + x]); +void updateGaussianGold(float delta, int radius) +{ + for (int i = 0; i < 2 * radius + 1; i++) { + int x = i - radius; + gaussian[i] = expf(-(x * x) / (2 * delta * delta)); } - } - - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - float4 t(0.0f); - float sum = 0.0f; - - for (int i = -r; i <= r; i++) { - int neighborY = y + i; - - // clamp the neighbor pixel, prevent overflow - if (neighborY < 0) { - neighborY = 0; - } else if (neighborY >= h) { - neighborY = h - 1; - } - - for (int j = -r; j <= r; j++) { - domainDist = gaussian[r + i] * gaussian[r + j]; - - // clamp the neighbor pixel, prevent overflow - int neighborX = x + j; - - if (neighborX < 0) { - neighborX = 0; - } else if (neighborX >= w) { - neighborX = w - 1; - } - - colorDist = heuclideanLen(hImage[neighborY * w + neighborX], - hImage[y * w + x], e_d); - factor = domainDist * colorDist; - sum += factor; - t = add4(t, mul(factor, hImage[neighborY * w + neighborX])); - } - } - - pDest[y * w + x] = hrgbaFloatToInt(mul(1 / sum, t)); - } - } - - delete[] hImage; +} + +float heuclideanLen(float4 a, float4 b, float d) +{ + float mod = + (b.x - a.x) * (b.x - a.x) + (b.y - a.y) * (b.y - a.y) + (b.z - a.z) * (b.z - a.z) + (b.w - a.w) * (b.w - a.w); + + return expf(-mod / (2 * d * d)); +} + +unsigned int hrgbaFloatToInt(float4 rgba) +{ + unsigned int w = (((unsigned int)(fabs(rgba.w) * 255.0f)) & 0xff) << 24; + unsigned int z = (((unsigned int)(fabs(rgba.z) * 255.0f)) & 0xff) << 16; + unsigned int y = (((unsigned int)(fabs(rgba.y) * 255.0f)) & 0xff) << 8; + unsigned int x = ((unsigned int)(fabs(rgba.x) * 255.0f)) & 0xff; + + return (w | z | y | x); +} + +float4 hrgbaIntToFloat(unsigned int c) +{ + float4 rgba; + rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; + rgba.y = ((c >> 8) & 0xff) * 0.003921568627f; // /255.0f; + rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f; + rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f; + return rgba; +} + +float4 mul(float a, float4 b) +{ + float4 ans; + ans.x = a * b.x; + ans.y = a * b.y; + ans.z = a * b.z; + ans.w = a * b.w; + + return ans; +} + +float4 add4(float4 a, float4 b) +{ + float4 ans; + ans.x = a.x + b.x; + ans.y = a.y + b.y; + ans.z = a.z + b.z; + ans.w = a.w + b.w; + + return ans; +} + +void bilateralFilterGold(unsigned int *pSrc, unsigned int *pDest, float e_d, int w, int h, int r) +{ + float4 *hImage = new float4[w * h]; + float domainDist, colorDist, factor; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + hImage[y * w + x] = hrgbaIntToFloat(pSrc[y * w + x]); + } + } + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + float4 t(0.0f); + float sum = 0.0f; + + for (int i = -r; i <= r; i++) { + int neighborY = y + i; + + // clamp the neighbor pixel, prevent overflow + if (neighborY < 0) { + neighborY = 0; + } + else if (neighborY >= h) { + neighborY = h - 1; + } + + for (int j = -r; j <= r; j++) { + domainDist = gaussian[r + i] * 
gaussian[r + j]; + + // clamp the neighbor pixel, prevent overflow + int neighborX = x + j; + + if (neighborX < 0) { + neighborX = 0; + } + else if (neighborX >= w) { + neighborX = w - 1; + } + + colorDist = heuclideanLen(hImage[neighborY * w + neighborX], hImage[y * w + x], e_d); + factor = domainDist * colorDist; + sum += factor; + t = add4(t, mul(factor, hImage[neighborY * w + neighborX])); + } + } + + pDest[y * w + x] = hrgbaFloatToInt(mul(1 / sum, t)); + } + } + + delete[] hImage; } diff --git a/Samples/5_Domain_Specific/bilateralFilter/bilateral_kernel.cu b/Samples/5_Domain_Specific/bilateralFilter/bilateral_kernel.cu index 2df9958f..73597e5d 100644 --- a/Samples/5_Domain_Specific/bilateralFilter/bilateral_kernel.cu +++ b/Samples/5_Domain_Specific/bilateralFilter/bilateral_kernel.cu @@ -25,17 +25,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include +#include // CUDA device initialization helper functions #include -#include // CUDA device initialization helper functions +#include -__constant__ float cGaussian[64]; // gaussian array in device side +__constant__ float cGaussian[64]; // gaussian array in device side cudaTextureObject_t rgbaTexdImage; cudaTextureObject_t rgbaTexdTemp; -uint *dImage = NULL; // original image -uint *dTemp = NULL; // temp array for iterations +uint *dImage = NULL; // original image +uint *dTemp = NULL; // temp array for iterations size_t pitch; /* @@ -69,118 +69,117 @@ size_t pitch; */ // Euclidean Distance (x, y, d) = exp((|x - y| / d)^2 / 2) -__device__ float euclideanLen(float4 a, float4 b, float d) { - float mod = (b.x - a.x) * (b.x - a.x) + (b.y - a.y) * (b.y - a.y) + - (b.z - a.z) * (b.z - a.z); +__device__ float euclideanLen(float4 a, float4 b, float d) +{ + float mod = (b.x - a.x) * (b.x - a.x) + (b.y - a.y) * (b.y - a.y) + (b.z - a.z) * (b.z - a.z); - return __expf(-mod / (2.f * d * d)); + return __expf(-mod / (2.f * d * d)); } -__device__ uint rgbaFloatToInt(float4 rgba) { - rgba.x = __saturatef(fabs(rgba.x)); // clamp to [0.0, 1.0] - rgba.y = __saturatef(fabs(rgba.y)); - rgba.z = __saturatef(fabs(rgba.z)); - rgba.w = __saturatef(fabs(rgba.w)); - return (uint(rgba.w * 255.0f) << 24) | (uint(rgba.z * 255.0f) << 16) | - (uint(rgba.y * 255.0f) << 8) | uint(rgba.x * 255.0f); +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(fabs(rgba.x)); // clamp to [0.0, 1.0] + rgba.y = __saturatef(fabs(rgba.y)); + rgba.z = __saturatef(fabs(rgba.z)); + rgba.w = __saturatef(fabs(rgba.w)); + return (uint(rgba.w * 255.0f) << 24) | (uint(rgba.z * 255.0f) << 16) | (uint(rgba.y * 255.0f) << 8) + | uint(rgba.x * 255.0f); } -__device__ float4 rgbaIntToFloat(uint c) { - float4 rgba; - rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; - rgba.y = ((c >> 8) & 0xff) * 0.003921568627f; // /255.0f; - rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f; - rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f; - return rgba; +__device__ float4 rgbaIntToFloat(uint c) +{ + float4 rgba; + rgba.x = (c & 0xff) * 0.003921568627f; // /255.0f; + rgba.y = ((c >> 8) & 0xff) * 0.003921568627f; // /255.0f; + rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f; + rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f; + return rgba; } // column pass using coalesced global memory reads -__global__ void d_bilateral_filter(uint *od, int w, int h, float e_d, int r, - cudaTextureObject_t rgbaTex) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void 
d_bilateral_filter(uint *od, int w, int h, float e_d, int r, cudaTextureObject_t rgbaTex)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;

-  if (x >= w || y >= h) {
-    return;
-  }
-
-  float sum = 0.0f;
-  float factor;
-  float4 t = {0.f, 0.f, 0.f, 0.f};
-  float4 center = tex2D<float4>(rgbaTex, x, y);
-
-  for (int i = -r; i <= r; i++) {
-    for (int j = -r; j <= r; j++) {
-      float4 curPix = tex2D<float4>(rgbaTex, x + j, y + i);
-      factor = cGaussian[i + r] * cGaussian[j + r] *  // domain factor
-               euclideanLen(curPix, center, e_d);     // range factor
-
-      t += factor * curPix;
-      sum += factor;
+    if (x >= w || y >= h) {
+        return;
     }
-  }
-
-  od[y * w + x] = rgbaFloatToInt(t / sum);
+    float sum = 0.0f;
+    float factor;
+    float4 t = {0.f, 0.f, 0.f, 0.f};
+    float4 center = tex2D<float4>(rgbaTex, x, y);
+
+    for (int i = -r; i <= r; i++) {
+        for (int j = -r; j <= r; j++) {
+            float4 curPix = tex2D<float4>(rgbaTex, x + j, y + i);
+            factor = cGaussian[i + r] * cGaussian[j + r] * // domain factor
+                     euclideanLen(curPix, center, e_d);    // range factor
+
+            t += factor * curPix;
+            sum += factor;
+        }
+    }
+
+    od[y * w + x] = rgbaFloatToInt(t / sum);
 }

-extern "C" void initTexture(int width, int height, uint *hImage) {
-  // copy image data to array
-  checkCudaErrors(
-      cudaMallocPitch(&dImage, &pitch, sizeof(uint) * width, height));
-  checkCudaErrors(
-      cudaMallocPitch(&dTemp, &pitch, sizeof(uint) * width, height));
-  checkCudaErrors(cudaMemcpy2D(dImage, pitch, hImage, sizeof(uint) * width,
-                               sizeof(uint) * width, height,
-                               cudaMemcpyHostToDevice));
+extern "C" void initTexture(int width, int height, uint *hImage)
+{
+    // copy image data to array
+    checkCudaErrors(cudaMallocPitch(&dImage, &pitch, sizeof(uint) * width, height));
+    checkCudaErrors(cudaMallocPitch(&dTemp, &pitch, sizeof(uint) * width, height));
+    checkCudaErrors(cudaMemcpy2D(
+        dImage, pitch, hImage, sizeof(uint) * width, sizeof(uint) * width, height, cudaMemcpyHostToDevice));

-  // texture rgbaTex;
-  cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    // texture rgbaTex;
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));

-  texRes.resType = cudaResourceTypePitch2D;
-  texRes.res.pitch2D.devPtr = dImage;
-  texRes.res.pitch2D.desc = desc;
-  texRes.res.pitch2D.width = width;
-  texRes.res.pitch2D.height = height;
-  texRes.res.pitch2D.pitchInBytes = pitch;
-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    texRes.resType = cudaResourceTypePitch2D;
+    texRes.res.pitch2D.devPtr = dImage;
+    texRes.res.pitch2D.desc = desc;
+    texRes.res.pitch2D.width = width;
+    texRes.res.pitch2D.height = height;
+    texRes.res.pitch2D.pitchInBytes = pitch;
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));

-  texDescr.normalizedCoords = false;
-  texDescr.filterMode = cudaFilterModePoint;
-  texDescr.addressMode[0] = cudaAddressModeWrap;
-  texDescr.addressMode[1] = cudaAddressModeWrap;
-  texDescr.readMode = cudaReadModeNormalizedFloat;
+    texDescr.normalizedCoords = false;
+    texDescr.filterMode = cudaFilterModePoint;
+    texDescr.addressMode[0] = cudaAddressModeWrap;
+    texDescr.addressMode[1] = cudaAddressModeWrap;
+    texDescr.readMode = cudaReadModeNormalizedFloat;

-  checkCudaErrors(
-      cudaCreateTextureObject(&rgbaTexdImage, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&rgbaTexdImage, &texRes, &texDescr, NULL));

-  memset(&texRes, 0,
sizeof(cudaResourceDesc)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypePitch2D; - texRes.res.pitch2D.devPtr = dTemp; - texRes.res.pitch2D.desc = desc; - texRes.res.pitch2D.width = width; - texRes.res.pitch2D.height = height; - texRes.res.pitch2D.pitchInBytes = pitch; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texRes.resType = cudaResourceTypePitch2D; + texRes.res.pitch2D.devPtr = dTemp; + texRes.res.pitch2D.desc = desc; + texRes.res.pitch2D.width = width; + texRes.res.pitch2D.height = height; + texRes.res.pitch2D.pitchInBytes = pitch; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeNormalizedFloat; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeNormalizedFloat; - checkCudaErrors( - cudaCreateTextureObject(&rgbaTexdTemp, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&rgbaTexdTemp, &texRes, &texDescr, NULL)); } -extern "C" void freeTextures() { - checkCudaErrors(cudaDestroyTextureObject(rgbaTexdImage)); - checkCudaErrors(cudaDestroyTextureObject(rgbaTexdTemp)); - checkCudaErrors(cudaFree(dImage)); - checkCudaErrors(cudaFree(dTemp)); +extern "C" void freeTextures() +{ + checkCudaErrors(cudaDestroyTextureObject(rgbaTexdImage)); + checkCudaErrors(cudaDestroyTextureObject(rgbaTexdTemp)); + checkCudaErrors(cudaFree(dImage)); + checkCudaErrors(cudaFree(dTemp)); } /* @@ -197,16 +196,16 @@ extern "C" void freeTextures() { radius - half of the filter size (total filter size = 2 * radius + 1) */ -extern "C" void updateGaussian(float delta, int radius) { - float fGaussian[64]; +extern "C" void updateGaussian(float delta, int radius) +{ + float fGaussian[64]; - for (int i = 0; i < 2 * radius + 1; ++i) { - float x = (float)(i - radius); - fGaussian[i] = expf(-(x * x) / (2 * delta * delta)); - } + for (int i = 0; i < 2 * radius + 1; ++i) { + float x = (float)(i - radius); + fGaussian[i] = expf(-(x * x) / (2 * delta * delta)); + } - checkCudaErrors(cudaMemcpyToSymbol(cGaussian, fGaussian, - sizeof(float) * (2 * radius + 1))); + checkCudaErrors(cudaMemcpyToSymbol(cGaussian, fGaussian, sizeof(float) * (2 * radius + 1))); } /* @@ -222,40 +221,43 @@ extern "C" void updateGaussian(float delta, int radius) { */ // RGBA version -extern "C" double bilateralFilterRGBA(uint *dDest, int width, int height, - float e_d, int radius, int iterations, - StopWatchInterface *timer) { - // var for kernel computation timing - double dKernelTime; +extern "C" double bilateralFilterRGBA(uint *dDest, + int width, + int height, + float e_d, + int radius, + int iterations, + StopWatchInterface *timer) +{ + // var for kernel computation timing + double dKernelTime; - for (int i = 0; i < iterations; i++) { - // sync host and start kernel computation timer - dKernelTime = 0.0; - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&timer); + for (int i = 0; i < iterations; i++) { + // sync host and start kernel computation timer + dKernelTime = 0.0; + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&timer); - dim3 gridSize((width + 16 - 1) / 16, (height + 16 - 1) / 16); - dim3 blockSize(16, 16); + dim3 gridSize((width + 16 - 1) / 16, (height + 16 - 1) / 16); + dim3 
blockSize(16, 16);

-    if (iterations > 1) {
-      d_bilateral_filter<<<gridSize, blockSize>>>(dDest, width, height, e_d,
-                                                  radius, rgbaTexdTemp);
-    } else {
-      d_bilateral_filter<<<gridSize, blockSize>>>(dDest, width, height, e_d,
-                                                  radius, rgbaTexdImage);
+        if (iterations > 1) {
+            d_bilateral_filter<<<gridSize, blockSize>>>(dDest, width, height, e_d, radius, rgbaTexdTemp);
+        }
+        else {
+            d_bilateral_filter<<<gridSize, blockSize>>>(dDest, width, height, e_d, radius, rgbaTexdImage);
+        }
+
+        // sync host and stop computation timer
+        checkCudaErrors(cudaDeviceSynchronize());
+        dKernelTime += sdkGetTimerValue(&timer);
+
+        if (iterations > 1) {
+            // copy result back from global memory to array
+            checkCudaErrors(cudaMemcpy2D(
+                dTemp, pitch, dDest, sizeof(int) * width, sizeof(int) * width, height, cudaMemcpyDeviceToDevice));
+        }
     }

-    // sync host and stop computation timer
-    checkCudaErrors(cudaDeviceSynchronize());
-    dKernelTime += sdkGetTimerValue(&timer);
-
-    if (iterations > 1) {
-      // copy result back from global memory to array
-      checkCudaErrors(cudaMemcpy2D(dTemp, pitch, dDest, sizeof(int) * width,
-                                   sizeof(int) * width, height,
-                                   cudaMemcpyDeviceToDevice));
-    }
-  }
-
-  return ((dKernelTime / 1000.) / (double)iterations);
+    return ((dKernelTime / 1000.) / (double)iterations);
 }
diff --git a/Samples/5_Domain_Specific/bilateralFilter/bmploader.cpp b/Samples/5_Domain_Specific/bilateralFilter/bmploader.cpp
index dc5a3fef..45895139 100644
--- a/Samples/5_Domain_Specific/bilateralFilter/bmploader.cpp
+++ b/Samples/5_Domain_Specific/bilateralFilter/bmploader.cpp
@@ -29,104 +29,110 @@
 #include

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-#pragma warning(disable : 4996)  // disable deprecated warning
+#pragma warning(disable : 4996) // disable deprecated warning
 #endif

 #pragma pack(1)
-typedef struct {
-  short type;
-  int size;
-  short reserved1;
-  short reserved2;
-  int offset;
+typedef struct
+{
+    short type;
+    int   size;
+    short reserved1;
+    short reserved2;
+    int   offset;
 } BMPHeader;

-typedef struct {
-  int size;
-  int width;
-  int height;
-  short planes;
-  short bitsPerPixel;
-  unsigned compression;
-  unsigned imageSize;
-  int xPelsPerMeter;
-  int yPelsPerMeter;
-  int clrUsed;
-  int clrImportant;
+typedef struct
+{
+    int      size;
+    int      width;
+    int      height;
+    short    planes;
+    short    bitsPerPixel;
+    unsigned compression;
+    unsigned imageSize;
+    int      xPelsPerMeter;
+    int      yPelsPerMeter;
+    int      clrUsed;
+    int      clrImportant;
 } BMPInfoHeader;

 // Isolated definition
-typedef struct { unsigned char x, y, z, w; } uchar4;
+typedef struct
+{
+    unsigned char x, y, z, w;
+} uchar4;

-extern "C" void LoadBMPFile(uchar4 **dst, unsigned int *width,
-                            unsigned int *height, const char *name) {
-  BMPHeader hdr;
-  BMPInfoHeader infoHdr;
-  int x, y;
+extern "C" void LoadBMPFile(uchar4 **dst, unsigned int *width, unsigned int *height, const char *name)
+{
+    BMPHeader     hdr;
+    BMPInfoHeader infoHdr;
+    int           x, y;

-  FILE *fd;
+    FILE *fd;

-  printf("Loading %s...\n", name);
+    printf("Loading %s...\n", name);

-  if (sizeof(uchar4) != 4) {
-    printf("***Bad uchar4 size***\n");
-    exit(EXIT_SUCCESS);
-  }
-
-  if (!(fd = fopen(name, "rb"))) {
-    printf("***BMP load error: file access denied***\n");
-    exit(EXIT_SUCCESS);
-  }
-
-  fread(&hdr, sizeof(hdr), 1, fd);
-
-  if (hdr.type != 0x4D42) {
-    printf("***BMP load error: bad file format***\n");
-    exit(EXIT_SUCCESS);
-  }
-
-  fread(&infoHdr, sizeof(infoHdr), 1, fd);
-
-  if (infoHdr.bitsPerPixel != 24) {
-    printf("***BMP load error: invalid color depth***\n");
-    exit(EXIT_SUCCESS);
-  }
-
-  if (infoHdr.compression) {
-    printf("***BMP load error: compressed
image***\n"); - exit(EXIT_SUCCESS); - } - - *width = infoHdr.width; - *height = infoHdr.height; - *dst = (uchar4 *)malloc(*width * *height * 4); - - printf("BMP width: %u\n", infoHdr.width); - printf("BMP height: %u\n", infoHdr.height); - - fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR); - - for (y = 0; y < infoHdr.height; y++) { - for (x = 0; x < infoHdr.width; x++) { - (*dst)[(y * infoHdr.width + x)].w = 0; - (*dst)[(y * infoHdr.width + x)].z = fgetc(fd); - (*dst)[(y * infoHdr.width + x)].y = fgetc(fd); - (*dst)[(y * infoHdr.width + x)].x = fgetc(fd); + if (sizeof(uchar4) != 4) { + printf("***Bad uchar4 size***\n"); + exit(EXIT_SUCCESS); } - for (x = 0; x < (4 - (3 * infoHdr.width) % 4) % 4; x++) { - fgetc(fd); + if (!(fd = fopen(name, "rb"))) { + printf("***BMP load error: file access denied***\n"); + exit(EXIT_SUCCESS); } - } - if (ferror(fd)) { - printf("***Unknown BMP load error.***\n"); - free(*dst); - exit(EXIT_SUCCESS); - } else { - printf("BMP file loaded successfully!\n"); - } + fread(&hdr, sizeof(hdr), 1, fd); - fclose(fd); + if (hdr.type != 0x4D42) { + printf("***BMP load error: bad file format***\n"); + exit(EXIT_SUCCESS); + } + + fread(&infoHdr, sizeof(infoHdr), 1, fd); + + if (infoHdr.bitsPerPixel != 24) { + printf("***BMP load error: invalid color depth***\n"); + exit(EXIT_SUCCESS); + } + + if (infoHdr.compression) { + printf("***BMP load error: compressed image***\n"); + exit(EXIT_SUCCESS); + } + + *width = infoHdr.width; + *height = infoHdr.height; + *dst = (uchar4 *)malloc(*width * *height * 4); + + printf("BMP width: %u\n", infoHdr.width); + printf("BMP height: %u\n", infoHdr.height); + + fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR); + + for (y = 0; y < infoHdr.height; y++) { + for (x = 0; x < infoHdr.width; x++) { + (*dst)[(y * infoHdr.width + x)].w = 0; + (*dst)[(y * infoHdr.width + x)].z = fgetc(fd); + (*dst)[(y * infoHdr.width + x)].y = fgetc(fd); + (*dst)[(y * infoHdr.width + x)].x = fgetc(fd); + } + + for (x = 0; x < (4 - (3 * infoHdr.width) % 4) % 4; x++) { + fgetc(fd); + } + } + + if (ferror(fd)) { + printf("***Unknown BMP load error.***\n"); + free(*dst); + exit(EXIT_SUCCESS); + } + else { + printf("BMP file loaded successfully!\n"); + } + + fclose(fd); } diff --git a/Samples/5_Domain_Specific/binomialOptions/binomialOptions.cpp b/Samples/5_Domain_Specific/binomialOptions/binomialOptions.cpp index c3b1f441..d904e89c 100644 --- a/Samples/5_Domain_Specific/binomialOptions/binomialOptions.cpp +++ b/Samples/5_Domain_Specific/binomialOptions/binomialOptions.cpp @@ -31,14 +31,13 @@ * See supplied whitepaper for more explanations. 
*/ -#include -#include -#include -#include #include - -#include #include +#include +#include +#include +#include +#include #include "binomialOptions_common.h" #include "realtype.h" @@ -57,132 +56,134 @@ extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData); //////////////////////////////////////////////////////////////////////////////// // Process an array of OptN options on GPU //////////////////////////////////////////////////////////////////////////////// -extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, - int optN); +extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, int optN); //////////////////////////////////////////////////////////////////////////////// // Helper function, returning uniformly distributed // random float in [low, high] range //////////////////////////////////////////////////////////////////////////////// -real randData(real low, real high) { - real t = (real)rand() / (real)RAND_MAX; - return ((real)1.0 - t) * low + t * high; +real randData(real low, real high) +{ + real t = (real)rand() / (real)RAND_MAX; + return ((real)1.0 - t) * low + t * high; } //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("[%s] - Starting...\n", argv[0]); +int main(int argc, char **argv) +{ + printf("[%s] - Starting...\n", argv[0]); - int devID = findCudaDevice(argc, (const char **)argv); + int devID = findCudaDevice(argc, (const char **)argv); - const int OPT_N = MAX_OPTIONS; + const int OPT_N = MAX_OPTIONS; - TOptionData optionData[MAX_OPTIONS]; - real callValueBS[MAX_OPTIONS], callValueGPU[MAX_OPTIONS], - callValueCPU[MAX_OPTIONS]; + TOptionData optionData[MAX_OPTIONS]; + real callValueBS[MAX_OPTIONS], callValueGPU[MAX_OPTIONS], callValueCPU[MAX_OPTIONS]; - real sumDelta, sumRef, gpuTime, errorVal; + real sumDelta, sumRef, gpuTime, errorVal; - StopWatchInterface *hTimer = NULL; - int i; + StopWatchInterface *hTimer = NULL; + int i; - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Generating input data...\n"); - // Generate options set - srand(123); + printf("Generating input data...\n"); + // Generate options set + srand(123); - for (i = 0; i < OPT_N; i++) { - optionData[i].S = randData(5.0f, 30.0f); - optionData[i].X = randData(1.0f, 100.0f); - optionData[i].T = randData(0.25f, 10.0f); - optionData[i].R = 0.06f; - optionData[i].V = 0.10f; - BlackScholesCall(callValueBS[i], optionData[i]); - } + for (i = 0; i < OPT_N; i++) { + optionData[i].S = randData(5.0f, 30.0f); + optionData[i].X = randData(1.0f, 100.0f); + optionData[i].T = randData(0.25f, 10.0f); + optionData[i].R = 0.06f; + optionData[i].V = 0.10f; + BlackScholesCall(callValueBS[i], optionData[i]); + } - printf("Running GPU binomial tree...\n"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + printf("Running GPU binomial tree...\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - binomialOptionsGPU(callValueGPU, optionData, OPT_N); + binomialOptionsGPU(callValueGPU, optionData, OPT_N); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer); - printf("Options count : %i \n", OPT_N); - printf("Time steps : %i \n", NUM_STEPS); - printf("binomialOptionsGPU() time: %f msec\n", gpuTime); - printf("Options per second : %f \n", 
OPT_N / (gpuTime * 0.001)); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer); + printf("Options count : %i \n", OPT_N); + printf("Time steps : %i \n", NUM_STEPS); + printf("binomialOptionsGPU() time: %f msec\n", gpuTime); + printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); - printf("Running CPU binomial tree...\n"); + printf("Running CPU binomial tree...\n"); - for (i = 0; i < OPT_N; i++) { - binomialOptionsCPU(callValueCPU[i], optionData[i]); - } + for (i = 0; i < OPT_N; i++) { + binomialOptionsCPU(callValueCPU[i], optionData[i]); + } - printf("Comparing the results...\n"); - sumDelta = 0; - sumRef = 0; - printf("GPU binomial vs. Black-Scholes\n"); + printf("Comparing the results...\n"); + sumDelta = 0; + sumRef = 0; + printf("GPU binomial vs. Black-Scholes\n"); - for (i = 0; i < OPT_N; i++) { - sumDelta += fabs(callValueBS[i] - callValueGPU[i]); - sumRef += fabs(callValueBS[i]); - } + for (i = 0; i < OPT_N; i++) { + sumDelta += fabs(callValueBS[i] - callValueGPU[i]); + sumRef += fabs(callValueBS[i]); + } - if (sumRef > 1E-5) { - printf("L1 norm: %E\n", (double)(sumDelta / sumRef)); - } else { - printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); - } + if (sumRef > 1E-5) { + printf("L1 norm: %E\n", (double)(sumDelta / sumRef)); + } + else { + printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); + } - printf("CPU binomial vs. Black-Scholes\n"); - sumDelta = 0; - sumRef = 0; + printf("CPU binomial vs. Black-Scholes\n"); + sumDelta = 0; + sumRef = 0; - for (i = 0; i < OPT_N; i++) { - sumDelta += fabs(callValueBS[i] - callValueCPU[i]); - sumRef += fabs(callValueBS[i]); - } + for (i = 0; i < OPT_N; i++) { + sumDelta += fabs(callValueBS[i] - callValueCPU[i]); + sumRef += fabs(callValueBS[i]); + } - if (sumRef > 1E-5) { - printf("L1 norm: %E\n", sumDelta / sumRef); - } else { - printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); - } + if (sumRef > 1E-5) { + printf("L1 norm: %E\n", sumDelta / sumRef); + } + else { + printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); + } - printf("CPU binomial vs. GPU binomial\n"); - sumDelta = 0; - sumRef = 0; + printf("CPU binomial vs. GPU binomial\n"); + sumDelta = 0; + sumRef = 0; - for (i = 0; i < OPT_N; i++) { - sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); - sumRef += callValueCPU[i]; - } + for (i = 0; i < OPT_N; i++) { + sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); + sumRef += callValueCPU[i]; + } - if (sumRef > 1E-5) { - printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); - } else { - printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); - } + if (sumRef > 1E-5) { + printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); + } + else { + printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); + } - printf("Shutting down...\n"); + printf("Shutting down...\n"); - sdkDeleteTimer(&hTimer); + sdkDeleteTimer(&hTimer); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); - if (errorVal > 5e-4) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (errorVal > 5e-4) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/binomialOptions/binomialOptions_common.h b/Samples/5_Domain_Specific/binomialOptions/binomialOptions_common.h index 84e22954..1922aa8b 100644 --- a/Samples/5_Domain_Specific/binomialOptions/binomialOptions_common.h +++ b/Samples/5_Domain_Specific/binomialOptions/binomialOptions_common.h @@ -33,12 +33,13 @@ //////////////////////////////////////////////////////////////////////////////// // Global types //////////////////////////////////////////////////////////////////////////////// -typedef struct { - real S; - real X; - real T; - real R; - real V; +typedef struct +{ + real S; + real X; + real T; + real R; + real V; } TOptionData; //////////////////////////////////////////////////////////////////////////////// diff --git a/Samples/5_Domain_Specific/binomialOptions/binomialOptions_gold.cpp b/Samples/5_Domain_Specific/binomialOptions/binomialOptions_gold.cpp index 4847ab39..fd343a8f 100644 --- a/Samples/5_Domain_Specific/binomialOptions/binomialOptions_gold.cpp +++ b/Samples/5_Domain_Specific/binomialOptions/binomialOptions_gold.cpp @@ -25,96 +25,102 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include +#include + #include "binomialOptions_common.h" #include "realtype.h" /////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function /////////////////////////////////////////////////////////////////////////////// -static real CND(real d) { - const real A1 = (real)0.31938153; - const real A2 = (real)-0.356563782; - const real A3 = (real)1.781477937; - const real A4 = (real)-1.821255978; - const real A5 = (real)1.330274429; - const real RSQRT2PI = (real)0.39894228040143267793994605993438; +static real CND(real d) +{ + const real A1 = (real)0.31938153; + const real A2 = (real)-0.356563782; + const real A3 = (real)1.781477937; + const real A4 = (real)-1.821255978; + const real A5 = (real)1.330274429; + const real RSQRT2PI = (real)0.39894228040143267793994605993438; - real K = (real)(1.0 / (1.0 + 0.2316419 * (real)fabs(d))); + real K = (real)(1.0 / (1.0 + 0.2316419 * (real)fabs(d))); - real cnd = (real)RSQRT2PI * (real)exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + real cnd = (real)RSQRT2PI * (real)exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = (real)1.0 - cnd; + if (d > 0) + cnd = (real)1.0 - cnd; - return cnd; + return cnd; } -extern "C" void BlackScholesCall(real &callResult, TOptionData optionData) { - real S = optionData.S; - real X = optionData.X; - real T = optionData.T; - real R = optionData.R; - real V = optionData.V; +extern "C" void BlackScholesCall(real &callResult, TOptionData optionData) +{ + real S = optionData.S; + real X = optionData.X; + real T = optionData.T; + real R = optionData.R; + real V = optionData.V; - real sqrtT = (real)sqrt(T); - real d1 = (real)(log(S / X) + (R + (real)0.5 * V * V) * T) / (V * sqrtT); - real d2 = d1 - V * sqrtT; - real CNDD1 = CND(d1); - real CNDD2 = CND(d2); + real sqrtT = (real)sqrt(T); + real d1 = (real)(log(S / X) + (R + (real)0.5 * V * V) * T) / (V * sqrtT); + real d2 = d1 - V * sqrtT; + real CNDD1 
= CND(d1); + real CNDD2 = CND(d2); - // Calculate Call and Put simultaneously - real expRT = (real)exp(-R * T); - callResult = (real)(S * CNDD1 - X * expRT * CNDD2); + // Calculate Call and Put simultaneously + real expRT = (real)exp(-R * T); + callResult = (real)(S * CNDD1 - X * expRT * CNDD2); } //////////////////////////////////////////////////////////////////////////////// // Process an array of OptN options on CPU // Note that CPU code is for correctness testing only and not for benchmarking. //////////////////////////////////////////////////////////////////////////////// -static real expiryCallValue(real S, real X, real vDt, int i) { - real d = S * (real)exp(vDt * (real)(2 * i - NUM_STEPS)) - X; - return (d > (real)0) ? d : (real)0; +static real expiryCallValue(real S, real X, real vDt, int i) +{ + real d = S * (real)exp(vDt * (real)(2 * i - NUM_STEPS)) - X; + return (d > (real)0) ? d : (real)0; } -extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData) { - static real Call[NUM_STEPS + 1]; +extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData) +{ + static real Call[NUM_STEPS + 1]; - const real S = optionData.S; - const real X = optionData.X; - const real T = optionData.T; - const real R = optionData.R; - const real V = optionData.V; + const real S = optionData.S; + const real X = optionData.X; + const real T = optionData.T; + const real R = optionData.R; + const real V = optionData.V; - const real dt = T / (real)NUM_STEPS; - const real vDt = (real)V * (real)sqrt(dt); - const real rDt = R * dt; - // Per-step interest and discount factors - const real If = (real)exp(rDt); - const real Df = (real)exp(-rDt); - // Values and pseudoprobabilities of upward and downward moves - const real u = (real)exp(vDt); - const real d = (real)exp(-vDt); - const real pu = (If - d) / (u - d); - const real pd = (real)1.0 - pu; - const real puByDf = pu * Df; - const real pdByDf = pd * Df; + const real dt = T / (real)NUM_STEPS; + const real vDt = (real)V * (real)sqrt(dt); + const real rDt = R * dt; + // Per-step interest and discount factors + const real If = (real)exp(rDt); + const real Df = (real)exp(-rDt); + // Values and pseudoprobabilities of upward and downward moves + const real u = (real)exp(vDt); + const real d = (real)exp(-vDt); + const real pu = (If - d) / (u - d); + const real pd = (real)1.0 - pu; + const real puByDf = pu * Df; + const real pdByDf = pd * Df; - /////////////////////////////////////////////////////////////////////// - // Compute values at expiration date: - // call option value at period end is V(T) = S(T) - X - // if S(T) is greater than X, or zero otherwise. - // The computation is similar for put options. - /////////////////////////////////////////////////////////////////////// - for (int i = 0; i <= NUM_STEPS; i++) Call[i] = expiryCallValue(S, X, vDt, i); + /////////////////////////////////////////////////////////////////////// + // Compute values at expiration date: + // call option value at period end is V(T) = S(T) - X + // if S(T) is greater than X, or zero otherwise. + // The computation is similar for put options. 
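// ---------------------------------------------------------------------------
// For reference, a minimal standalone sketch of the Cox-Ross-Rubinstein
// backward induction that binomialOptionsCPU performs; the step count and
// option parameters below are illustrative placeholders, not values taken
// from the sample.
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int    steps = 2048;                // placeholder for NUM_STEPS
    const double S = 10.0, X = 9.0;           // assumed spot and strike
    const double T = 1.0, R = 0.06, V = 0.10; // assumed expiry, rate, volatility

    const double dt  = T / steps;
    const double vDt = V * std::sqrt(dt);
    const double If  = std::exp(R * dt);      // per-step interest factor
    const double Df  = std::exp(-R * dt);     // per-step discount factor
    const double u   = std::exp(vDt);         // up-move factor
    const double d   = std::exp(-vDt);        // down-move factor
    const double pu  = (If - d) / (u - d);    // pseudoprobability of an up-move
    const double pd  = 1.0 - pu;

    // Values at the expiration date: V(T) = max(S(T) - X, 0), where node i
    // corresponds to i up-moves and (steps - i) down-moves, so
    // S(T) = S * u^i * d^(steps - i) = S * exp(vDt * (2i - steps)).
    std::vector<double> call(steps + 1);
    for (int i = 0; i <= steps; i++) {
        const double ST = S * std::exp(vDt * (2.0 * i - steps));
        call[i]         = (ST > X) ? ST - X : 0.0;
    }

    // Walk backwards up the tree, discounting each level by Df.
    for (int i = steps; i > 0; i--)
        for (int j = 0; j < i; j++)
            call[j] = pu * Df * call[j + 1] + pd * Df * call[j];

    std::printf("CRR call value: %f\n", call[0]);
    return 0;
}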
+ /////////////////////////////////////////////////////////////////////// + for (int i = 0; i <= NUM_STEPS; i++) + Call[i] = expiryCallValue(S, X, vDt, i); - //////////////////////////////////////////////////////////////////////// - // Walk backwards up binomial tree - //////////////////////////////////////////////////////////////////////// - for (int i = NUM_STEPS; i > 0; i--) - for (int j = 0; j <= i - 1; j++) - Call[j] = puByDf * Call[j + 1] + pdByDf * Call[j]; + //////////////////////////////////////////////////////////////////////// + // Walk backwards up binomial tree + //////////////////////////////////////////////////////////////////////// + for (int i = NUM_STEPS; i > 0; i--) + for (int j = 0; j <= i - 1; j++) + Call[j] = puByDf * Call[j + 1] + pdByDf * Call[j]; - callResult = (real)Call[0]; + callResult = (real)Call[0]; } diff --git a/Samples/5_Domain_Specific/binomialOptions/binomialOptions_kernel.cu b/Samples/5_Domain_Specific/binomialOptions/binomialOptions_kernel.cu index 3b1e8111..0818f0b9 100644 --- a/Samples/5_Domain_Specific/binomialOptions/binomialOptions_kernel.cu +++ b/Samples/5_Domain_Specific/binomialOptions/binomialOptions_kernel.cu @@ -28,40 +28,43 @@ //////////////////////////////////////////////////////////////////////////////// // Global types and parameters //////////////////////////////////////////////////////////////////////////////// +#include #include #include -#include namespace cg = cooperative_groups; #include + #include "binomialOptions_common.h" #include "realtype.h" // Preprocessed input option data -typedef struct { - real S; - real X; - real vDt; - real puByDf; - real pdByDf; +typedef struct +{ + real S; + real X; + real vDt; + real puByDf; + real pdByDf; } __TOptionData; static __constant__ __TOptionData d_OptionData[MAX_OPTIONS]; -static __device__ real d_CallValue[MAX_OPTIONS]; +static __device__ real d_CallValue[MAX_OPTIONS]; //////////////////////////////////////////////////////////////////////////////// // Overloaded shortcut functions for different precision modes //////////////////////////////////////////////////////////////////////////////// #ifndef DOUBLE_PRECISION -__device__ inline float expiryCallValue(float S, float X, float vDt, int i) { - float d = S * __expf(vDt * (2.0f * i - NUM_STEPS)) - X; - return (d > 0.0F) ? d : 0.0F; +__device__ inline float expiryCallValue(float S, float X, float vDt, int i) +{ + float d = S * __expf(vDt * (2.0f * i - NUM_STEPS)) - X; + return (d > 0.0F) ? d : 0.0F; } #else -__device__ inline double expiryCallValue(double S, double X, double vDt, - int i) { - double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X; - return (d > 0.0) ? d : 0.0; +__device__ inline double expiryCallValue(double S, double X, double vDt, int i) +{ + double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X; + return (d > 0.0) ? 
d : 0.0; } #endif @@ -74,84 +77,83 @@ __device__ inline double expiryCallValue(double S, double X, double vDt, #error Bad constants #endif -__global__ void binomialOptionsKernel() { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ real call_exchange[THREADBLOCK_SIZE + 1]; +__global__ void binomialOptionsKernel() +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ real call_exchange[THREADBLOCK_SIZE + 1]; - const int tid = threadIdx.x; - const real S = d_OptionData[blockIdx.x].S; - const real X = d_OptionData[blockIdx.x].X; - const real vDt = d_OptionData[blockIdx.x].vDt; - const real puByDf = d_OptionData[blockIdx.x].puByDf; - const real pdByDf = d_OptionData[blockIdx.x].pdByDf; + const int tid = threadIdx.x; + const real S = d_OptionData[blockIdx.x].S; + const real X = d_OptionData[blockIdx.x].X; + const real vDt = d_OptionData[blockIdx.x].vDt; + const real puByDf = d_OptionData[blockIdx.x].puByDf; + const real pdByDf = d_OptionData[blockIdx.x].pdByDf; - real call[ELEMS_PER_THREAD + 1]; + real call[ELEMS_PER_THREAD + 1]; #pragma unroll - for (int i = 0; i < ELEMS_PER_THREAD; ++i) - call[i] = expiryCallValue(S, X, vDt, tid * ELEMS_PER_THREAD + i); + for (int i = 0; i < ELEMS_PER_THREAD; ++i) + call[i] = expiryCallValue(S, X, vDt, tid * ELEMS_PER_THREAD + i); - if (tid == 0) - call_exchange[THREADBLOCK_SIZE] = expiryCallValue(S, X, vDt, NUM_STEPS); + if (tid == 0) + call_exchange[THREADBLOCK_SIZE] = expiryCallValue(S, X, vDt, NUM_STEPS); - int final_it = max(0, tid * ELEMS_PER_THREAD - 1); + int final_it = max(0, tid * ELEMS_PER_THREAD - 1); #pragma unroll 16 - for (int i = NUM_STEPS; i > 0; --i) { - call_exchange[tid] = call[0]; - cg::sync(cta); - call[ELEMS_PER_THREAD] = call_exchange[tid + 1]; - cg::sync(cta); + for (int i = NUM_STEPS; i > 0; --i) { + call_exchange[tid] = call[0]; + cg::sync(cta); + call[ELEMS_PER_THREAD] = call_exchange[tid + 1]; + cg::sync(cta); - if (i > final_it) { + if (i > final_it) { #pragma unroll - for (int j = 0; j < ELEMS_PER_THREAD; ++j) - call[j] = puByDf * call[j + 1] + pdByDf * call[j]; + for (int j = 0; j < ELEMS_PER_THREAD; ++j) + call[j] = puByDf * call[j + 1] + pdByDf * call[j]; + } } - } - if (tid == 0) { - d_CallValue[blockIdx.x] = call[0]; - } + if (tid == 0) { + d_CallValue[blockIdx.x] = call[0]; + } } //////////////////////////////////////////////////////////////////////////////// // Host-side interface to GPU binomialOptions //////////////////////////////////////////////////////////////////////////////// -extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, - int optN) { - __TOptionData h_OptionData[MAX_OPTIONS]; +extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, int optN) +{ + __TOptionData h_OptionData[MAX_OPTIONS]; - for (int i = 0; i < optN; i++) { - const real T = optionData[i].T; - const real R = optionData[i].R; - const real V = optionData[i].V; + for (int i = 0; i < optN; i++) { + const real T = optionData[i].T; + const real R = optionData[i].R; + const real V = optionData[i].V; - const real dt = T / (real)NUM_STEPS; - const real vDt = V * sqrt(dt); - const real rDt = R * dt; - // Per-step interest and discount factors - const real If = exp(rDt); - const real Df = exp(-rDt); - // Values and pseudoprobabilities of upward and downward moves - const real u = exp(vDt); - const real d = exp(-vDt); - const real pu = (If - d) / (u - d); - const real pd = (real)1.0 - pu; - const real puByDf = pu * Df; 
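// ---------------------------------------------------------------------------
// The host loop above collapses each option's (T, R, V) into three per-step
// constants so the kernel's hot loop is pure multiply-add. A sketch of that
// preprocessing as a helper; __TOptionDataDemo mirrors __TOptionData, and the
// Demo suffix marks it as an illustrative copy rather than the patch's type.
// ---------------------------------------------------------------------------
#include <cmath>

struct __TOptionDataDemo {
    float S, X, vDt, puByDf, pdByDf;
};

static __TOptionDataDemo preprocessOption(float S, float X, float T, float R, float V, int numSteps)
{
    const float dt  = T / (float)numSteps;
    const float vDt = V * std::sqrt(dt);
    const float If  = std::exp(R * dt);   // per-step interest factor
    const float Df  = std::exp(-R * dt);  // per-step discount factor
    const float u   = std::exp(vDt);
    const float d   = std::exp(-vDt);
    const float pu  = (If - d) / (u - d); // risk-neutral up pseudoprobability
    // Folding the discount factor Df into the probabilities saves one
    // multiply per tree node inside the kernel.
    return {S, X, vDt, pu * Df, (1.0f - pu) * Df};
}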
-  const real pdByDf = pd * Df;
+        const real dt     = T / (real)NUM_STEPS;
+        const real vDt    = V * sqrt(dt);
+        const real rDt    = R * dt;
+        // Per-step interest and discount factors
+        const real If     = exp(rDt);
+        const real Df     = exp(-rDt);
+        // Values and pseudoprobabilities of upward and downward moves
+        const real u      = exp(vDt);
+        const real d      = exp(-vDt);
+        const real pu     = (If - d) / (u - d);
+        const real pd     = (real)1.0 - pu;
+        const real puByDf = pu * Df;
+        const real pdByDf = pd * Df;

-    h_OptionData[i].S = (real)optionData[i].S;
-    h_OptionData[i].X = (real)optionData[i].X;
-    h_OptionData[i].vDt = (real)vDt;
-    h_OptionData[i].puByDf = (real)puByDf;
-    h_OptionData[i].pdByDf = (real)pdByDf;
-  }
+        h_OptionData[i].S      = (real)optionData[i].S;
+        h_OptionData[i].X      = (real)optionData[i].X;
+        h_OptionData[i].vDt    = (real)vDt;
+        h_OptionData[i].puByDf = (real)puByDf;
+        h_OptionData[i].pdByDf = (real)pdByDf;
+    }

-  checkCudaErrors(cudaMemcpyToSymbol(d_OptionData, h_OptionData,
-                                     optN * sizeof(__TOptionData)));
-  binomialOptionsKernel<<<optN, THREADBLOCK_SIZE>>>();
-  getLastCudaError("binomialOptionsKernel() execution failed.\n");
-  checkCudaErrors(
-      cudaMemcpyFromSymbol(callValue, d_CallValue, optN * sizeof(real)));
+    checkCudaErrors(cudaMemcpyToSymbol(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)));
+    binomialOptionsKernel<<<optN, THREADBLOCK_SIZE>>>();
+    getLastCudaError("binomialOptionsKernel() execution failed.\n");
+    checkCudaErrors(cudaMemcpyFromSymbol(callValue, d_CallValue, optN * sizeof(real)));
 }
diff --git a/Samples/5_Domain_Specific/binomialOptions/realtype.h b/Samples/5_Domain_Specific/binomialOptions/realtype.h
index 64d6f799..11ca533d 100644
--- a/Samples/5_Domain_Specific/binomialOptions/realtype.h
+++ b/Samples/5_Domain_Specific/binomialOptions/realtype.h
@@ -28,7 +28,7 @@
 #ifndef REALTYPE_H
 #define REALTYPE_H

-//#define DOUBLE_PRECISION
+// #define DOUBLE_PRECISION

 #ifndef DOUBLE_PRECISION
 typedef float real;
diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions.cpp b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions.cpp
index eaecb69d..6567e0e6 100644
--- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions.cpp
+++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions.cpp
@@ -31,15 +31,13 @@
 * See supplied whitepaper for more explanations.
*/ -#include -#include -#include -#include - -#include #include - +#include #include +#include +#include +#include +#include #include "binomialOptions_common.h" #include "realtype.h" @@ -61,138 +59,140 @@ extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData); // Process an array of OptN options on GPU //////////////////////////////////////////////////////////////////////////////// -extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, - int optN, int argc, char **argv); +extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, int optN, int argc, char **argv); //////////////////////////////////////////////////////////////////////////////// // Helper function, returning uniformly distributed // random float in [low, high] range //////////////////////////////////////////////////////////////////////////////// -real randData(real low, real high) { - real t = (real)rand() / (real)RAND_MAX; - return ((real)1.0 - t) * low + t * high; +real randData(real low, real high) +{ + real t = (real)rand() / (real)RAND_MAX; + return ((real)1.0 - t) * low + t * high; } //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - printf("[%s] - Starting...\n", argv[0]); +int main(int argc, char **argv) +{ + printf("[%s] - Starting...\n", argv[0]); - const int OPT_N = MAX_OPTIONS; + const int OPT_N = MAX_OPTIONS; - TOptionData optionData[MAX_OPTIONS]; - real callValueBS[MAX_OPTIONS], callValueGPU[MAX_OPTIONS], - callValueCPU[MAX_OPTIONS]; + TOptionData optionData[MAX_OPTIONS]; + real callValueBS[MAX_OPTIONS], callValueGPU[MAX_OPTIONS], callValueCPU[MAX_OPTIONS]; - real sumDelta, sumRef, gpuTime, errorVal; + real sumDelta, sumRef, gpuTime, errorVal; - StopWatchInterface *hTimer = NULL; + StopWatchInterface *hTimer = NULL; - int i; + int i; - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Generating input data...\n"); + printf("Generating input data...\n"); - // Generate options set - srand(123); + // Generate options set + srand(123); - for (i = 0; i < OPT_N; i++) { - optionData[i].S = randData(5.0f, 30.0f); - optionData[i].X = randData(1.0f, 100.0f); - optionData[i].T = randData(0.25f, 10.0f); - optionData[i].R = 0.06f; - optionData[i].V = 0.10f; + for (i = 0; i < OPT_N; i++) { + optionData[i].S = randData(5.0f, 30.0f); + optionData[i].X = randData(1.0f, 100.0f); + optionData[i].T = randData(0.25f, 10.0f); + optionData[i].R = 0.06f; + optionData[i].V = 0.10f; - BlackScholesCall(callValueBS[i], optionData[i]); - } + BlackScholesCall(callValueBS[i], optionData[i]); + } - printf("Running GPU binomial tree...\n"); + printf("Running GPU binomial tree...\n"); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - binomialOptionsGPU(callValueGPU, optionData, OPT_N, argc, argv); + binomialOptionsGPU(callValueGPU, optionData, OPT_N, argc, argv); - sdkStopTimer(&hTimer); + sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer); - printf("Options count : %i \n", OPT_N); - printf("Time steps : %i \n", NUM_STEPS); - printf("binomialOptionsGPU() time: %f msec\n", gpuTime); - printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001)); + printf("Options count : %i \n", OPT_N); + printf("Time steps : %i \n", NUM_STEPS); + printf("binomialOptionsGPU() time: %f msec\n", gpuTime); + printf("Options 
per second : %f \n", OPT_N / (gpuTime * 0.001)); - printf("Running CPU binomial tree...\n"); + printf("Running CPU binomial tree...\n"); - for (i = 0; i < OPT_N; i++) { - binomialOptionsCPU(callValueCPU[i], optionData[i]); - } + for (i = 0; i < OPT_N; i++) { + binomialOptionsCPU(callValueCPU[i], optionData[i]); + } - printf("Comparing the results...\n"); + printf("Comparing the results...\n"); - sumDelta = 0; - sumRef = 0; - printf("GPU binomial vs. Black-Scholes\n"); + sumDelta = 0; + sumRef = 0; + printf("GPU binomial vs. Black-Scholes\n"); - for (i = 0; i < OPT_N; i++) { - sumDelta += fabs(callValueBS[i] - callValueGPU[i]); - sumRef += fabs(callValueBS[i]); - } + for (i = 0; i < OPT_N; i++) { + sumDelta += fabs(callValueBS[i] - callValueGPU[i]); + sumRef += fabs(callValueBS[i]); + } - if (sumRef > 1E-5) { - printf("L1 norm: %E\n", (double)(sumDelta / sumRef)); - } else { - printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); - } + if (sumRef > 1E-5) { + printf("L1 norm: %E\n", (double)(sumDelta / sumRef)); + } + else { + printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); + } - printf("CPU binomial vs. Black-Scholes\n"); - sumDelta = 0; - sumRef = 0; + printf("CPU binomial vs. Black-Scholes\n"); + sumDelta = 0; + sumRef = 0; - for (i = 0; i < OPT_N; i++) { - sumDelta += fabs(callValueBS[i] - callValueCPU[i]); - sumRef += fabs(callValueBS[i]); - } + for (i = 0; i < OPT_N; i++) { + sumDelta += fabs(callValueBS[i] - callValueCPU[i]); + sumRef += fabs(callValueBS[i]); + } - if (sumRef > 1E-5) { - printf("L1 norm: %E\n", sumDelta / sumRef); - } else { - printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); - } + if (sumRef > 1E-5) { + printf("L1 norm: %E\n", sumDelta / sumRef); + } + else { + printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); + } - printf("CPU binomial vs. GPU binomial\n"); - sumDelta = 0; - sumRef = 0; + printf("CPU binomial vs. GPU binomial\n"); + sumDelta = 0; + sumRef = 0; - for (i = 0; i < OPT_N; i++) { - sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); - sumRef += callValueCPU[i]; - } + for (i = 0; i < OPT_N; i++) { + sumDelta += fabs(callValueGPU[i] - callValueCPU[i]); + sumRef += callValueCPU[i]; + } - if (sumRef > 1E-5) { - printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); - } else { - printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); - } + if (sumRef > 1E-5) { + printf("L1 norm: %E\n", errorVal = sumDelta / sumRef); + } + else { + printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N)); + } - printf("Shutting down...\n"); + printf("Shutting down...\n"); - sdkDeleteTimer(&hTimer); + sdkDeleteTimer(&hTimer); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); - if (errorVal > 5e-4) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (errorVal > 5e-4) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed\n"); + printf("Test passed\n"); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_common.h b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_common.h index f79a112e..1b4d9dfe 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_common.h +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_common.h @@ -32,12 +32,13 @@ // Global types //////////////////////////////////////////////////////////////////////////////// -typedef struct { - float S; - float X; - float T; - float R; - float V; +typedef struct +{ + float S; + float X; + float T; + float R; + float V; } TOptionData; //////////////////////////////////////////////////////////////////////////////// diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gold.cpp b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gold.cpp index e3a74f92..934b70cd 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gold.cpp +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gold.cpp @@ -25,50 +25,53 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include +#include + #include "binomialOptions_common.h" /////////////////////////////////////////////////////////////////////////////// // Polynomial approximation of cumulative normal distribution function /////////////////////////////////////////////////////////////////////////////// -static double CND(double d) { - const double A1 = 0.31938153; - const double A2 = -0.356563782; - const double A3 = 1.781477937; - const double A4 = -1.821255978; - const double A5 = 1.330274429; - const double RSQRT2PI = 0.39894228040143267793994605993438; +static double CND(double d) +{ + const double A1 = 0.31938153; + const double A2 = -0.356563782; + const double A3 = 1.781477937; + const double A4 = -1.821255978; + const double A5 = 1.330274429; + const double RSQRT2PI = 0.39894228040143267793994605993438; - double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); + double K = 1.0 / (1.0 + 0.2316419 * fabs(d)); - double cnd = RSQRT2PI * exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + double cnd = RSQRT2PI * exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0 - cnd; + if (d > 0) + cnd = 1.0 - cnd; - return cnd; + return cnd; } -extern "C" void BlackScholesCall(float &callResult, TOptionData optionData) { - double S = optionData.S; - double X = optionData.X; - double T = optionData.T; - double R = optionData.R; - double V = optionData.V; - double sqrtT = sqrt(T); +extern "C" void BlackScholesCall(float &callResult, TOptionData optionData) +{ + double S = optionData.S; + double X = optionData.X; + double T = optionData.T; + double R = optionData.R; + double V = optionData.V; + double sqrtT = sqrt(T); - double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); - double d2 = d1 - V * sqrtT; + double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; - double CNDD1 = CND(d1); - double CNDD2 = CND(d2); + double CNDD1 = CND(d1); + double CNDD2 = CND(d2); - // Calculate Call and Put simultaneously - double expRT = exp(-R * T); + // Calculate Call and Put 
simultaneously + double expRT = exp(-R * T); - callResult = (float)(S * CNDD1 - X * expRT * CNDD2); + callResult = (float)(S * CNDD1 - X * expRT * CNDD2); } //////////////////////////////////////////////////////////////////////////////// @@ -76,51 +79,54 @@ extern "C" void BlackScholesCall(float &callResult, TOptionData optionData) { // Note that CPU code is for correctness testing only and not for benchmarking. //////////////////////////////////////////////////////////////////////////////// -static double expiryCallValue(double S, double X, double vDt, int i) { - double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X; - return (d > 0) ? d : 0; +static double expiryCallValue(double S, double X, double vDt, int i) +{ + double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X; + return (d > 0) ? d : 0; } -extern "C" void binomialOptionsCPU(float &callResult, TOptionData optionData) { - static double Call[NUM_STEPS + 1]; - const double S = optionData.S; - const double X = optionData.X; - const double T = optionData.T; - const double R = optionData.R; - const double V = optionData.V; +extern "C" void binomialOptionsCPU(float &callResult, TOptionData optionData) +{ + static double Call[NUM_STEPS + 1]; + const double S = optionData.S; + const double X = optionData.X; + const double T = optionData.T; + const double R = optionData.R; + const double V = optionData.V; - const double dt = T / (double)NUM_STEPS; - const double vDt = V * sqrt(dt); - const double rDt = R * dt; + const double dt = T / (double)NUM_STEPS; + const double vDt = V * sqrt(dt); + const double rDt = R * dt; - // Per-step interest and discount factors - const double If = exp(rDt); - const double Df = exp(-rDt); + // Per-step interest and discount factors + const double If = exp(rDt); + const double Df = exp(-rDt); - // Values and pseudoprobabilities of upward and downward moves - const double u = exp(vDt); - const double d = exp(-vDt); - const double pu = (If - d) / (u - d); - const double pd = 1.0 - pu; - const double puByDf = pu * Df; - const double pdByDf = pd * Df; + // Values and pseudoprobabilities of upward and downward moves + const double u = exp(vDt); + const double d = exp(-vDt); + const double pu = (If - d) / (u - d); + const double pd = 1.0 - pu; + const double puByDf = pu * Df; + const double pdByDf = pd * Df; - /////////////////////////////////////////////////////////////////////// - // Compute values at expiration date: - // call option value at period end is V(T) = S(T) - X - // if S(T) is greater than X, or zero otherwise. - // The computation is similar for put options. - /////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + // Compute values at expiration date: + // call option value at period end is V(T) = S(T) - X + // if S(T) is greater than X, or zero otherwise. + // The computation is similar for put options. 
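// ---------------------------------------------------------------------------
// The comment above notes that the put computation is analogous; equivalently,
// once the call is priced, the European put follows from put-call parity,
// P = C - S + X * exp(-R * T). A short sketch under assumed demo inputs;
// putFromCall is an illustrative helper, not part of the sample.
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstdio>

static double putFromCall(double call, double S, double X, double R, double T)
{
    return call - S + X * std::exp(-R * T);
}

int main()
{
    const double S = 10.0, X = 9.0, T = 1.0, R = 0.06; // assumed inputs
    const double call = 1.70;                          // assumed call price
    std::printf("put via parity: %f\n", putFromCall(call, S, X, R, T));
    return 0;
}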
+ /////////////////////////////////////////////////////////////////////// - for (int i = 0; i <= NUM_STEPS; i++) Call[i] = expiryCallValue(S, X, vDt, i); + for (int i = 0; i <= NUM_STEPS; i++) + Call[i] = expiryCallValue(S, X, vDt, i); - //////////////////////////////////////////////////////////////////////// - // Walk backwards up binomial tree - //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + // Walk backwards up binomial tree + //////////////////////////////////////////////////////////////////////// - for (int i = NUM_STEPS; i > 0; i--) - for (int j = 0; j <= i - 1; j++) - Call[j] = puByDf * Call[j + 1] + pdByDf * Call[j]; + for (int i = NUM_STEPS; i > 0; i--) + for (int j = 0; j <= i - 1; j++) + Call[j] = puByDf * Call[j + 1] + pdByDf * Call[j]; - callResult = (float)Call[0]; + callResult = (float)Call[0]; } diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gpu.cpp b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gpu.cpp index ec5aa7c2..0fb12d13 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gpu.cpp +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_gpu.cpp @@ -29,99 +29,104 @@ // Global types and parameters //////////////////////////////////////////////////////////////////////////////// +#include #include #include -#include +// Other helpers #include #include + +// CUDA runtime #include #include "binomialOptions_common.h" - #include "common_gpu_header.h" #include "realtype.h" + // Preprocessed input option data -typedef struct { - real S; - real X; - real vDt; - real puByDf; - real pdByDf; +typedef struct +{ + real S; + real X; + real vDt; + real puByDf; + real pdByDf; } __TOptionData; static bool moduleLoaded = false; -char *cubin, *kernel_file; -size_t cubinSize; -CUmodule module; +char *cubin, *kernel_file; +size_t cubinSize; +CUmodule module; //////////////////////////////////////////////////////////////////////////////// // Host-side interface to GPU binomialOptions //////////////////////////////////////////////////////////////////////////////// -extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, - int optN, int argc, char **argv) { - if (!moduleLoaded) { - kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - module = loadCUBIN(cubin, argc, argv); - moduleLoaded = true; - } +extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData, int optN, int argc, char **argv) +{ + if (!moduleLoaded) { + kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + module = loadCUBIN(cubin, argc, argv); + moduleLoaded = true; + } - __TOptionData h_OptionData[MAX_OPTIONS]; + __TOptionData h_OptionData[MAX_OPTIONS]; - for (int i = 0; i < optN; i++) { - const real T = optionData[i].T; - const real R = optionData[i].R; - const real V = optionData[i].V; + for (int i = 0; i < optN; i++) { + const real T = optionData[i].T; + const real R = optionData[i].R; + const real V = optionData[i].V; - const real dt = T / (real)NUM_STEPS; - const real vDt = V * sqrt(dt); - const real rDt = R * dt; - // Per-step interest and discount factors - const real If = exp(rDt); - const real Df = exp(-rDt); - // Values and pseudoprobabilities of upward and downward moves - const real u = exp(vDt); - const real d = 
exp(-vDt); - const real pu = (If - d) / (u - d); - const real pd = (real)1.0 - pu; - const real puByDf = pu * Df; - const real pdByDf = pd * Df; + const real dt = T / (real)NUM_STEPS; + const real vDt = V * sqrt(dt); + const real rDt = R * dt; + // Per-step interest and discount factors + const real If = exp(rDt); + const real Df = exp(-rDt); + // Values and pseudoprobabilities of upward and downward moves + const real u = exp(vDt); + const real d = exp(-vDt); + const real pu = (If - d) / (u - d); + const real pd = (real)1.0 - pu; + const real puByDf = pu * Df; + const real pdByDf = pd * Df; - h_OptionData[i].S = (real)optionData[i].S; - h_OptionData[i].X = (real)optionData[i].X; - h_OptionData[i].vDt = (real)vDt; - h_OptionData[i].puByDf = (real)puByDf; - h_OptionData[i].pdByDf = (real)pdByDf; - } + h_OptionData[i].S = (real)optionData[i].S; + h_OptionData[i].X = (real)optionData[i].X; + h_OptionData[i].vDt = (real)vDt; + h_OptionData[i].puByDf = (real)puByDf; + h_OptionData[i].pdByDf = (real)pdByDf; + } - CUfunction kernel_addr; - checkCudaErrors( - cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel")); + CUfunction kernel_addr; + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel")); - CUdeviceptr d_OptionData; - checkCudaErrors( - cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData")); - checkCudaErrors( - cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData))); + CUdeviceptr d_OptionData; + checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData")); + checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData))); - dim3 cudaBlockSize(128, 1, 1); - dim3 cudaGridSize(optN, 1, 1); + dim3 cudaBlockSize(128, 1, 1); + dim3 cudaGridSize(optN, 1, 1); - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - cudaBlockSize.x, cudaBlockSize.y, - cudaBlockSize.z, /* block dim */ - 0, 0, /* shared mem, stream */ - NULL, /* arguments */ - 0)); + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + cudaBlockSize.x, + cudaBlockSize.y, + cudaBlockSize.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + NULL, /* arguments */ + 0)); - checkCudaErrors(cuCtxSynchronize()); + checkCudaErrors(cuCtxSynchronize()); - CUdeviceptr d_CallValue; - checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue")); - checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN * sizeof(real))); + CUdeviceptr d_CallValue; + checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue")); + checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN * sizeof(real))); } diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_kernel.cu b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_kernel.cu index 2a1da887..9d50ca82 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_kernel.cu +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/binomialOptions_kernel.cu @@ -25,20 +25,21 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "common_gpu_header.h" #include "binomialOptions_common.h" +#include "common_gpu_header.h" #include "realtype.h" // Preprocessed input option data -typedef struct { - real S; - real X; - real vDt; - real puByDf; - real pdByDf; +typedef struct +{ + real S; + real X; + real vDt; + real puByDf; + real pdByDf; } __TOptionData; static __constant__ __TOptionData d_OptionData[MAX_OPTIONS]; -__device__ real d_CallValue[MAX_OPTIONS]; +__device__ real d_CallValue[MAX_OPTIONS]; #define THREADBLOCK_SIZE 128 #define ELEMS_PER_THREAD (NUM_STEPS / THREADBLOCK_SIZE) @@ -51,57 +52,59 @@ __device__ real d_CallValue[MAX_OPTIONS]; //////////////////////////////////////////////////////////////////////////////// #ifndef DOUBLE_PRECISION -__device__ inline float expiryCallValue(float S, float X, float vDt, int i) { - float d = S * __expf(vDt * (2.0f * i - NUM_STEPS)) - X; - return (d > 0.0F) ? d : 0.0F; +__device__ inline float expiryCallValue(float S, float X, float vDt, int i) +{ + float d = S * __expf(vDt * (2.0f * i - NUM_STEPS)) - X; + return (d > 0.0F) ? d : 0.0F; } #else -__device__ inline double expiryCallValue(double S, double X, double vDt, - int i) { - double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X; - return (d > 0.0) ? d : 0.0; +__device__ inline double expiryCallValue(double S, double X, double vDt, int i) +{ + double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X; + return (d > 0.0) ? d : 0.0; } #endif //////////////////////////////////////////////////////////////////////////////// // GPU kernel //////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ void binomialOptionsKernel() { - __shared__ real call_exchange[THREADBLOCK_SIZE + 1]; +extern "C" __global__ void binomialOptionsKernel() +{ + __shared__ real call_exchange[THREADBLOCK_SIZE + 1]; - const int tid = threadIdx.x; - const real S = d_OptionData[blockIdx.x].S; - const real X = d_OptionData[blockIdx.x].X; - const real vDt = d_OptionData[blockIdx.x].vDt; - const real puByDf = d_OptionData[blockIdx.x].puByDf; - const real pdByDf = d_OptionData[blockIdx.x].pdByDf; + const int tid = threadIdx.x; + const real S = d_OptionData[blockIdx.x].S; + const real X = d_OptionData[blockIdx.x].X; + const real vDt = d_OptionData[blockIdx.x].vDt; + const real puByDf = d_OptionData[blockIdx.x].puByDf; + const real pdByDf = d_OptionData[blockIdx.x].pdByDf; - real call[ELEMS_PER_THREAD + 1]; + real call[ELEMS_PER_THREAD + 1]; #pragma unroll - for (int i = 0; i < ELEMS_PER_THREAD; ++i) - call[i] = expiryCallValue(S, X, vDt, tid * ELEMS_PER_THREAD + i); + for (int i = 0; i < ELEMS_PER_THREAD; ++i) + call[i] = expiryCallValue(S, X, vDt, tid * ELEMS_PER_THREAD + i); - if (tid == 0) - call_exchange[THREADBLOCK_SIZE] = expiryCallValue(S, X, vDt, NUM_STEPS); + if (tid == 0) + call_exchange[THREADBLOCK_SIZE] = expiryCallValue(S, X, vDt, NUM_STEPS); - int final_it = max(0, tid * ELEMS_PER_THREAD - 1); + int final_it = max(0, tid * ELEMS_PER_THREAD - 1); #pragma unroll 16 - for (int i = NUM_STEPS; i > 0; --i) { - call_exchange[tid] = call[0]; - __syncthreads(); - call[ELEMS_PER_THREAD] = call_exchange[tid + 1]; - __syncthreads(); + for (int i = NUM_STEPS; i > 0; --i) { + call_exchange[tid] = call[0]; + __syncthreads(); + call[ELEMS_PER_THREAD] = call_exchange[tid + 1]; + __syncthreads(); - if (i > final_it) { + if (i > final_it) { #pragma unroll - for (int j = 0; j < ELEMS_PER_THREAD; ++j) - call[j] = puByDf * call[j + 1] + pdByDf * call[j]; + for (int j = 0; j < ELEMS_PER_THREAD; ++j) + call[j] = puByDf * 
call[j + 1] + pdByDf * call[j]; + } } - } - if (tid == 0) { - d_CallValue[blockIdx.x] = call[0]; - } + if (tid == 0) { + d_CallValue[blockIdx.x] = call[0]; + } } diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/common_gpu_header.h b/Samples/5_Domain_Specific/binomialOptions_nvrtc/common_gpu_header.h index 1bcd639a..88dc2936 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/common_gpu_header.h +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/common_gpu_header.h @@ -8,7 +8,7 @@ * is strictly prohibited. * */ - + #if !defined(__COMMON_GPU_HEADER_H) #define __COMMON_GPU_HEADER_H @@ -16,16 +16,16 @@ // Internal GPU-side constants and data structures //////////////////////////////////////////////////////////////////////////////// -#define TIME_STEPS 16 +#define TIME_STEPS 16 #define CACHE_DELTA (2 * TIME_STEPS) -#define CACHE_SIZE (256) +#define CACHE_SIZE (256) -#define CACHE_STEP (CACHE_SIZE - CACHE_DELTA) +#define CACHE_STEP (CACHE_SIZE - CACHE_DELTA) #if NUM_STEPS % CACHE_DELTA #error Bad constants #endif -#endif \ No newline at end of file +#endif diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/realtype.h b/Samples/5_Domain_Specific/binomialOptions_nvrtc/realtype.h index ab066211..031d7b59 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/realtype.h +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/realtype.h @@ -30,7 +30,7 @@ // To use double precision uncomment the macro DOUBLE_PRECISION below, default // is single precision. -//#define DOUBLE_PRECISION +// #define DOUBLE_PRECISION #ifndef DOUBLE_PRECISION typedef float real; diff --git a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cu b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cu index d6b22e65..7d006d7b 100644 --- a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cu +++ b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cu @@ -29,98 +29,124 @@ #include #include #include + +// Other helpers #include -#include "convolutionFFT2D_common.h" + +// Project includes #include "convolutionFFT2D.cuh" +#include "convolutionFFT2D_common.h" //////////////////////////////////////////////////////////////////////////////// /// Position convolution kernel center at (0, 0) in the image //////////////////////////////////////////////////////////////////////////////// -extern "C" void padKernel(float *d_Dst, float *d_Src, int fftH, int fftW, - int kernelH, int kernelW, int kernelY, int kernelX) { - assert(d_Src != d_Dst); - dim3 threads(32, 8); - dim3 grid(iDivUp(kernelW, threads.x), iDivUp(kernelH, threads.y)); +extern "C" void +padKernel(float *d_Dst, float *d_Src, int fftH, int fftW, int kernelH, int kernelW, int kernelY, int kernelX) +{ + assert(d_Src != d_Dst); + dim3 threads(32, 8); + dim3 grid(iDivUp(kernelW, threads.x), iDivUp(kernelH, threads.y)); - SET_FLOAT_BASE; + SET_FLOAT_BASE; #if (USE_TEXTURE) - cudaTextureObject_t texFloat; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texFloat; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_Src; - texRes.res.linear.sizeInBytes = sizeof(float) * kernelH * kernelW; - texRes.res.linear.desc = cudaCreateChannelDesc(); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_Src; + texRes.res.linear.sizeInBytes = sizeof(float) * kernelH * kernelW; + texRes.res.linear.desc = cudaCreateChannelDesc(); - cudaTextureDesc texDescr; - 
memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&texFloat, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texFloat, &texRes, &texDescr, NULL)); #endif - padKernel_kernel<<>>(d_Dst, d_Src, fftH, fftW, kernelH, - kernelW, kernelY, kernelX + padKernel_kernel<<>>(d_Dst, + d_Src, + fftH, + fftW, + kernelH, + kernelW, + kernelY, + kernelX #if (USE_TEXTURE) - , - texFloat + , + texFloat #endif - ); - getLastCudaError("padKernel_kernel<<<>>> execution failed\n"); + ); + getLastCudaError("padKernel_kernel<<<>>> execution failed\n"); #if (USE_TEXTURE) - checkCudaErrors(cudaDestroyTextureObject(texFloat)); + checkCudaErrors(cudaDestroyTextureObject(texFloat)); #endif } //////////////////////////////////////////////////////////////////////////////// // Prepare data for "pad to border" addressing mode //////////////////////////////////////////////////////////////////////////////// -extern "C" void padDataClampToBorder(float *d_Dst, float *d_Src, int fftH, - int fftW, int dataH, int dataW, - int kernelW, int kernelH, int kernelY, - int kernelX) { - assert(d_Src != d_Dst); - dim3 threads(32, 8); - dim3 grid(iDivUp(fftW, threads.x), iDivUp(fftH, threads.y)); +extern "C" void padDataClampToBorder(float *d_Dst, + float *d_Src, + int fftH, + int fftW, + int dataH, + int dataW, + int kernelW, + int kernelH, + int kernelY, + int kernelX) +{ + assert(d_Src != d_Dst); + dim3 threads(32, 8); + dim3 grid(iDivUp(fftW, threads.x), iDivUp(fftH, threads.y)); #if (USE_TEXTURE) - cudaTextureObject_t texFloat; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texFloat; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_Src; - texRes.res.linear.sizeInBytes = sizeof(float) * dataH * dataW; - texRes.res.linear.desc = cudaCreateChannelDesc(); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_Src; + texRes.res.linear.sizeInBytes = sizeof(float) * dataH * dataW; + texRes.res.linear.desc = cudaCreateChannelDesc(); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&texFloat, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texFloat, &texRes, &texDescr, NULL)); #endif - padDataClampToBorder_kernel<<>>( - d_Dst, d_Src, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX + padDataClampToBorder_kernel<<>>(d_Dst, + d_Src, + fftH, + fftW, + dataH, + dataW, + kernelH, + kernelW, + kernelY, + kernelX #if (USE_TEXTURE) - , - texFloat + , + texFloat #endif - 
); - getLastCudaError("padDataClampToBorder_kernel<<<>>> execution failed\n"); + ); + getLastCudaError("padDataClampToBorder_kernel<<<>>> execution failed\n"); #if (USE_TEXTURE) - checkCudaErrors(cudaDestroyTextureObject(texFloat)); + checkCudaErrors(cudaDestroyTextureObject(texFloat)); #endif } @@ -128,192 +154,201 @@ extern "C" void padDataClampToBorder(float *d_Dst, float *d_Src, int fftH, // Modulate Fourier image of padded data by Fourier image of padded kernel // and normalize by FFT size //////////////////////////////////////////////////////////////////////////////// -extern "C" void modulateAndNormalize(fComplex *d_Dst, fComplex *d_Src, int fftH, - int fftW, int padding) { - assert(fftW % 2 == 0); - const int dataSize = fftH * (fftW / 2 + padding); +extern "C" void modulateAndNormalize(fComplex *d_Dst, fComplex *d_Src, int fftH, int fftW, int padding) +{ + assert(fftW % 2 == 0); + const int dataSize = fftH * (fftW / 2 + padding); - modulateAndNormalize_kernel<<>>( - d_Dst, d_Src, dataSize, 1.0f / (float)(fftW * fftH)); - getLastCudaError("modulateAndNormalize() execution failed\n"); + modulateAndNormalize_kernel<<>>(d_Dst, d_Src, dataSize, 1.0f / (float)(fftW * fftH)); + getLastCudaError("modulateAndNormalize() execution failed\n"); } //////////////////////////////////////////////////////////////////////////////// // 2D R2C / C2R post/preprocessing kernels //////////////////////////////////////////////////////////////////////////////// -static const double PI = 3.1415926535897932384626433832795; -static const uint BLOCKDIM = 256; +static const double PI = 3.1415926535897932384626433832795; +static const uint BLOCKDIM = 256; -extern "C" void spPostprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, - uint padding, int dir) { - assert(d_Src != d_Dst); - assert(DX % 2 == 0); +extern "C" void spPostprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, uint padding, int dir) +{ + assert(d_Src != d_Dst); + assert(DX % 2 == 0); #if (POWER_OF_TWO) - uint log2DX, log2DY; - uint factorizationRemX = factorRadix2(log2DX, DX); - uint factorizationRemY = factorRadix2(log2DY, DY); - assert(factorizationRemX == 1 && factorizationRemY == 1); + uint log2DX, log2DY; + uint factorizationRemX = factorRadix2(log2DX, DX); + uint factorizationRemY = factorRadix2(log2DY, DY); + assert(factorizationRemX == 1 && factorizationRemY == 1); #endif - const uint threadCount = DY * (DX / 2); - const double phaseBase = dir * PI / (double)DX; + const uint threadCount = DY * (DX / 2); + const double phaseBase = dir * PI / (double)DX; #if (USE_TEXTURE) - cudaTextureObject_t texComplex; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texComplex; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_Src; - texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * (DX + padding); - texRes.res.linear.desc = cudaCreateChannelDesc(); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_Src; + texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * (DX + padding); + texRes.res.linear.desc = cudaCreateChannelDesc(); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + 
texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texComplex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texComplex, &texRes, &texDescr, NULL)); #endif - spPostprocess2D_kernel<<>>( - (fComplex *)d_Dst, (fComplex *)d_Src, DY, DX, threadCount, padding, - (float)phaseBase + spPostprocess2D_kernel<<>>((fComplex *)d_Dst, + (fComplex *)d_Src, + DY, + DX, + threadCount, + padding, + (float)phaseBase #if (USE_TEXTURE) - , - texComplex + , + texComplex #endif - ); - getLastCudaError("spPostprocess2D_kernel<<<>>> execution failed\n"); + ); + getLastCudaError("spPostprocess2D_kernel<<<>>> execution failed\n"); #if (USE_TEXTURE) - checkCudaErrors(cudaDestroyTextureObject(texComplex)); + checkCudaErrors(cudaDestroyTextureObject(texComplex)); #endif } -extern "C" void spPreprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, - uint padding, int dir) { - assert(d_Src != d_Dst); - assert(DX % 2 == 0); +extern "C" void spPreprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, uint padding, int dir) +{ + assert(d_Src != d_Dst); + assert(DX % 2 == 0); #if (POWER_OF_TWO) - uint log2DX, log2DY; - uint factorizationRemX = factorRadix2(log2DX, DX); - uint factorizationRemY = factorRadix2(log2DY, DY); - assert(factorizationRemX == 1 && factorizationRemY == 1); + uint log2DX, log2DY; + uint factorizationRemX = factorRadix2(log2DX, DX); + uint factorizationRemY = factorRadix2(log2DY, DY); + assert(factorizationRemX == 1 && factorizationRemY == 1); #endif - const uint threadCount = DY * (DX / 2); - const double phaseBase = -dir * PI / (double)DX; + const uint threadCount = DY * (DX / 2); + const double phaseBase = -dir * PI / (double)DX; #if (USE_TEXTURE) - cudaTextureObject_t texComplex; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texComplex; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_Src; - texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * (DX + padding); - texRes.res.linear.desc = cudaCreateChannelDesc(); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_Src; + texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * (DX + padding); + texRes.res.linear.desc = cudaCreateChannelDesc(); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texComplex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texComplex, &texRes, &texDescr, NULL)); #endif - spPreprocess2D_kernel<<>>( - (fComplex *)d_Dst, (fComplex *)d_Src, DY, DX, threadCount, padding, - (float)phaseBase + spPreprocess2D_kernel<<>>((fComplex *)d_Dst, + (fComplex *)d_Src, + DY, + DX, + threadCount, + padding, + (float)phaseBase #if (USE_TEXTURE) - , - texComplex + , + texComplex #endif - ); - getLastCudaError("spPreprocess2D_kernel<<<>>> execution failed\n"); 
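// ---------------------------------------------------------------------------
// All of these host wrappers use the same launch shape: one thread per output
// element, rounded up to whole blocks of BLOCKDIM threads. A sketch of the
// arithmetic; iDivUpDemo stands in for the helper-header iDivUp used above,
// and the sizes are assumed demo values.
// ---------------------------------------------------------------------------
#include <cstdio>

static int iDivUpDemo(int a, int b) { return (a + b - 1) / b; }

int main()
{
    const int DY = 1024, DX = 1024;        // assumed FFT dimensions
    const int threadCount = DY * (DX / 2); // one thread per packed complex pair
    const int BLOCKDIM    = 256;
    // Tail threads past threadCount are trimmed inside the kernel by
    // `if (threadId >= threadCount) return;`, so rounding up is safe.
    std::printf("%d blocks of %d threads\n", iDivUpDemo(threadCount, BLOCKDIM), BLOCKDIM);
    return 0;
}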
+ ); + getLastCudaError("spPreprocess2D_kernel<<<>>> execution failed\n"); #if (USE_TEXTURE) - checkCudaErrors(cudaDestroyTextureObject(texComplex)); + checkCudaErrors(cudaDestroyTextureObject(texComplex)); #endif } //////////////////////////////////////////////////////////////////////////////// // Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D //////////////////////////////////////////////////////////////////////////////// -extern "C" void spProcess2D(void *d_Dst, void *d_SrcA, void *d_SrcB, uint DY, - uint DX, int dir) { - assert(DY % 2 == 0); +extern "C" void spProcess2D(void *d_Dst, void *d_SrcA, void *d_SrcB, uint DY, uint DX, int dir) +{ + assert(DY % 2 == 0); #if (POWER_OF_TWO) - uint log2DX, log2DY; - uint factorizationRemX = factorRadix2(log2DX, DX); - uint factorizationRemY = factorRadix2(log2DY, DY); - assert(factorizationRemX == 1 && factorizationRemY == 1); + uint log2DX, log2DY; + uint factorizationRemX = factorRadix2(log2DX, DX); + uint factorizationRemY = factorRadix2(log2DY, DY); + assert(factorizationRemX == 1 && factorizationRemY == 1); #endif - const uint threadCount = (DY / 2) * DX; - const double phaseBase = dir * PI / (double)DX; + const uint threadCount = (DY / 2) * DX; + const double phaseBase = dir * PI / (double)DX; #if (USE_TEXTURE) - cudaTextureObject_t texComplexA, texComplexB; - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaTextureObject_t texComplexA, texComplexB; + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_SrcA; - texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * DX; - texRes.res.linear.desc = cudaCreateChannelDesc(); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_SrcA; + texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * DX; + texRes.res.linear.desc = cudaCreateChannelDesc(); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texComplexA, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texComplexA, &texRes, &texDescr, NULL)); - memset(&texRes, 0, sizeof(cudaResourceDesc)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_SrcB; - texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * DX; - texRes.res.linear.desc = cudaCreateChannelDesc(); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_SrcB; + texRes.res.linear.sizeInBytes = sizeof(fComplex) * DY * DX; + texRes.res.linear.desc = cudaCreateChannelDesc(); - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = 
cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&texComplexB, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texComplexB, &texRes, &texDescr, NULL)); #endif - spProcess2D_kernel<<>>( - (fComplex *)d_Dst, (fComplex *)d_SrcA, (fComplex *)d_SrcB, DY, DX, - threadCount, (float)phaseBase, 0.5f / (float)(DY * DX) + spProcess2D_kernel<<>>((fComplex *)d_Dst, + (fComplex *)d_SrcA, + (fComplex *)d_SrcB, + DY, + DX, + threadCount, + (float)phaseBase, + 0.5f / (float)(DY * DX) #if (USE_TEXTURE) - , - texComplexA, texComplexB + , + texComplexA, + texComplexB #endif - ); - getLastCudaError("spProcess2D_kernel<<<>>> execution failed\n"); + ); + getLastCudaError("spProcess2D_kernel<<<>>> execution failed\n"); #if (USE_TEXTURE) - checkCudaErrors(cudaDestroyTextureObject(texComplexA)); - checkCudaErrors(cudaDestroyTextureObject(texComplexB)); + checkCudaErrors(cudaDestroyTextureObject(texComplexA)); + checkCudaErrors(cudaDestroyTextureObject(texComplexB)); #endif } diff --git a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cuh b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cuh index f61bd268..e94980c4 100644 --- a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cuh +++ b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D.cuh @@ -25,7 +25,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#define USE_TEXTURE 1 +#define USE_TEXTURE 1 #define POWER_OF_TWO 1 #if (USE_TEXTURE) @@ -36,116 +36,131 @@ #define SET_FLOAT_BASE #endif +#include "convolutionFFT2D_common.h" + //////////////////////////////////////////////////////////////////////////////// /// Position convolution kernel center at (0, 0) in the image //////////////////////////////////////////////////////////////////////////////// -__global__ void padKernel_kernel(float *d_Dst, float *d_Src, int fftH, int fftW, - int kernelH, int kernelW, int kernelY, - int kernelX +__global__ void padKernel_kernel(float *d_Dst, + float *d_Src, + int fftH, + int fftW, + int kernelH, + int kernelW, + int kernelY, + int kernelX #if (USE_TEXTURE) , cudaTextureObject_t texFloat #endif - ) { - const int y = blockDim.y * blockIdx.y + threadIdx.y; - const int x = blockDim.x * blockIdx.x + threadIdx.x; +) +{ + const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int x = blockDim.x * blockIdx.x + threadIdx.x; - if (y < kernelH && x < kernelW) { - int ky = y - kernelY; + if (y < kernelH && x < kernelW) { + int ky = y - kernelY; - if (ky < 0) { - ky += fftH; + if (ky < 0) { + ky += fftH; + } + + int kx = x - kernelX; + + if (kx < 0) { + kx += fftW; + } + + d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x); } - - int kx = x - kernelX; - - if (kx < 0) { - kx += fftW; - } - - d_Dst[ky * fftW + kx] = LOAD_FLOAT(y * kernelW + x); - } } //////////////////////////////////////////////////////////////////////////////// // Prepare data for "pad to border" addressing mode //////////////////////////////////////////////////////////////////////////////// -__global__ void padDataClampToBorder_kernel(float *d_Dst, float *d_Src, - int fftH, int fftW, int dataH, - int dataW, int kernelH, int kernelW, - int kernelY, int kernelX +__global__ void padDataClampToBorder_kernel(float *d_Dst, + float *d_Src, + int fftH, + int fftW, + int dataH, + int dataW, + int kernelH, + int kernelW, + int kernelY, + int kernelX #if (USE_TEXTURE) , cudaTextureObject_t texFloat #endif - ) { - const int y = blockDim.y * blockIdx.y + threadIdx.y; - const int x = blockDim.x * blockIdx.x + 
threadIdx.x; - const int borderH = dataH + kernelY; - const int borderW = dataW + kernelX; +) +{ + const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int borderH = dataH + kernelY; + const int borderW = dataW + kernelX; - if (y < fftH && x < fftW) { - int dy, dx; + if (y < fftH && x < fftW) { + int dy, dx; - if (y < dataH) { - dy = y; + if (y < dataH) { + dy = y; + } + + if (x < dataW) { + dx = x; + } + + if (y >= dataH && y < borderH) { + dy = dataH - 1; + } + + if (x >= dataW && x < borderW) { + dx = dataW - 1; + } + + if (y >= borderH) { + dy = 0; + } + + if (x >= borderW) { + dx = 0; + } + + d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx); } - - if (x < dataW) { - dx = x; - } - - if (y >= dataH && y < borderH) { - dy = dataH - 1; - } - - if (x >= dataW && x < borderW) { - dx = dataW - 1; - } - - if (y >= borderH) { - dy = 0; - } - - if (x >= borderW) { - dx = 0; - } - - d_Dst[y * fftW + x] = LOAD_FLOAT(dy * dataW + dx); - } } //////////////////////////////////////////////////////////////////////////////// // Modulate Fourier image of padded data by Fourier image of padded kernel // and normalize by FFT size //////////////////////////////////////////////////////////////////////////////// -inline __device__ void mulAndScale(fComplex &a, const fComplex &b, - const float &c) { - fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; - a = t; +inline __device__ void mulAndScale(fComplex &a, const fComplex &b, const float &c) +{ + fComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; + a = t; } -__global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src, - int dataSize, float c) { - const int i = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src, int dataSize, float c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= dataSize) { - return; - } + if (i >= dataSize) { + return; + } - fComplex a = d_Src[i]; - fComplex b = d_Dst[i]; + fComplex a = d_Src[i]; + fComplex b = d_Dst[i]; - mulAndScale(a, b, c); + mulAndScale(a, b, c); - d_Dst[i] = a; + d_Dst[i] = a; } //////////////////////////////////////////////////////////////////////////////// // 2D R2C / C2R post/preprocessing kernels //////////////////////////////////////////////////////////////////////////////// #if (USE_TEXTURE) -#define LOAD_FCOMPLEX(i) tex1Dfetch(texComplex, i) +#define LOAD_FCOMPLEX(i) tex1Dfetch(texComplex, i) #define LOAD_FCOMPLEX_A(i) tex1Dfetch(texComplexA, i) #define LOAD_FCOMPLEX_B(i) tex1Dfetch(texComplexB, i) @@ -153,7 +168,7 @@ __global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src, #define SET_FCOMPLEX_BASE_A #define SET_FCOMPLEX_BASE_B #else -#define LOAD_FCOMPLEX(i) d_Src[i] +#define LOAD_FCOMPLEX(i) d_Src[i] #define LOAD_FCOMPLEX_A(i) d_SrcA[i] #define LOAD_FCOMPLEX_B(i) d_SrcB[i] @@ -162,260 +177,278 @@ __global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src, #define SET_FCOMPLEX_BASE_B #endif -inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2, - const fComplex &twiddle) { - float A1 = 0.5f * (D1.x + D2.x); - float B1 = 0.5f * (D1.y - D2.y); - float A2 = 0.5f * (D1.y + D2.y); - float B2 = 0.5f * (D1.x - D2.x); +inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle) +{ + float A1 = 0.5f * (D1.x + D2.x); + float B1 = 0.5f * (D1.y - D2.y); + float A2 = 0.5f * (D1.y + D2.y); + float B2 = 0.5f * (D1.x - D2.x); - 
 ////////////////////////////////////////////////////////////////////////////////
 // 2D R2C / C2R post/preprocessing kernels
 ////////////////////////////////////////////////////////////////////////////////
 #if (USE_TEXTURE)
-#define LOAD_FCOMPLEX(i) tex1Dfetch<fComplex>(texComplex, i)
+#define LOAD_FCOMPLEX(i)   tex1Dfetch<fComplex>(texComplex, i)
 #define LOAD_FCOMPLEX_A(i) tex1Dfetch<fComplex>(texComplexA, i)
 #define LOAD_FCOMPLEX_B(i) tex1Dfetch<fComplex>(texComplexB, i)
@@ -153,7 +168,7 @@ __global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src,
 #define SET_FCOMPLEX_BASE_A
 #define SET_FCOMPLEX_BASE_B
 #else
-#define LOAD_FCOMPLEX(i) d_Src[i]
+#define LOAD_FCOMPLEX(i)   d_Src[i]
 #define LOAD_FCOMPLEX_A(i) d_SrcA[i]
 #define LOAD_FCOMPLEX_B(i) d_SrcB[i]
@@ -162,260 +177,278 @@ __global__ void modulateAndNormalize_kernel(fComplex *d_Dst, fComplex *d_Src,
 #define SET_FCOMPLEX_BASE_B
 #endif
 
-inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2,
-                                        const fComplex &twiddle) {
-  float A1 = 0.5f * (D1.x + D2.x);
-  float B1 = 0.5f * (D1.y - D2.y);
-  float A2 = 0.5f * (D1.y + D2.y);
-  float B2 = 0.5f * (D1.x - D2.x);
+inline __device__ void spPostprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle)
+{
+    float A1 = 0.5f * (D1.x + D2.x);
+    float B1 = 0.5f * (D1.y - D2.y);
+    float A2 = 0.5f * (D1.y + D2.y);
+    float B2 = 0.5f * (D1.x - D2.x);
 
-  D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
-  D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
-  D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
-  D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
+    D1.x = A1 + (A2 * twiddle.x + B2 * twiddle.y);
+    D1.y = (A2 * twiddle.y - B2 * twiddle.x) + B1;
+    D2.x = A1 - (A2 * twiddle.x + B2 * twiddle.y);
+    D2.y = (A2 * twiddle.y - B2 * twiddle.x) - B1;
 }
 
 // Premultiply by 2 to account for 1.0 / (DZ * DY * DX) normalization
-inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2,
-                                       const fComplex &twiddle) {
-  float A1 = /* 0.5f * */ (D1.x + D2.x);
-  float B1 = /* 0.5f * */ (D1.y - D2.y);
-  float A2 = /* 0.5f * */ (D1.y + D2.y);
-  float B2 = /* 0.5f * */ (D1.x - D2.x);
+inline __device__ void spPreprocessC2C(fComplex &D1, fComplex &D2, const fComplex &twiddle)
+{
+    float A1 = /* 0.5f * */ (D1.x + D2.x);
+    float B1 = /* 0.5f * */ (D1.y - D2.y);
+    float A2 = /* 0.5f * */ (D1.y + D2.y);
+    float B2 = /* 0.5f * */ (D1.x - D2.x);
 
-  D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
-  D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
-  D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
-  D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
+    D1.x = A1 - (A2 * twiddle.x - B2 * twiddle.y);
+    D1.y = (B2 * twiddle.x + A2 * twiddle.y) + B1;
+    D2.x = A1 + (A2 * twiddle.x - B2 * twiddle.y);
+    D2.y = (B2 * twiddle.x + A2 * twiddle.y) - B1;
 }
 
-inline __device__ void getTwiddle(fComplex &twiddle, float phase) {
-  __sincosf(phase, &twiddle.y, &twiddle.x);
+inline __device__ void getTwiddle(fComplex &twiddle, float phase) { __sincosf(phase, &twiddle.y, &twiddle.x); }
+
+inline __device__ uint mod(uint a, uint DA)
+{
+    //(DA - a) % DA, assuming a <= DA
+    return a ? (DA - a) : a;
 }
 
-inline __device__ uint mod(uint a, uint DA) {
-  //(DA - a) % DA, assuming a <= DA
-  return a ? (DA - a) : a;
+static inline uint factorRadix2(uint &log2N, uint n)
+{
+    if (!n) {
+        log2N = 0;
+        return 0;
+    }
+    else {
+        for (log2N = 0; n % 2 == 0; n /= 2, log2N++)
+            ;
+
+        return n;
+    }
 }
 
-static inline uint factorRadix2(uint &log2N, uint n) {
-  if (!n) {
-    log2N = 0;
-    return 0;
-  } else {
-    for (log2N = 0; n % 2 == 0; n /= 2, log2N++)
-      ;
-
-    return n;
-  }
-}
-
-inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem) {
+inline __device__ void udivmod(uint &dividend, uint divisor, uint &rem)
+{
 #if (!POWER_OF_TWO)
-  rem = dividend % divisor;
-  dividend /= divisor;
+    rem = dividend % divisor;
+    dividend /= divisor;
 #else
-  rem = dividend & (divisor - 1);
-  dividend >>= (__ffs(divisor) - 1);
+    rem = dividend & (divisor - 1);
+    dividend >>= (__ffs(divisor) - 1);
 #endif
 }
 
-__global__ void spPostprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src,
-                                       uint DY, uint DX, uint threadCount,
-                                       uint padding, float phaseBase
+__global__ void spPostprocess2D_kernel(fComplex *d_Dst,
+                                       fComplex *d_Src,
+                                       uint DY,
+                                       uint DX,
+                                       uint threadCount,
+                                       uint padding,
+                                       float phaseBase
 #if (USE_TEXTURE)
                                        ,
                                        cudaTextureObject_t texComplex
 #endif
-                                       ) {
-  const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
+)
+{
+    const uint threadId = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if (threadId >= threadCount) {
-    return;
-  }
+    if (threadId >= threadCount) {
+        return;
+    }
 
-  uint x, y, i = threadId;
-  udivmod(i, DX / 2, x);
-  udivmod(i, DY, y);
+    uint x, y, i = threadId;
+    udivmod(i, DX / 2, x);
+    udivmod(i, DY, y);
 
-  // Avoid overwrites in columns DX / 2 by different threads
-  if ((x == 0) && (y > DY / 2)) {
-    return;
-  }
+    // Avoid overwrites in columns DX / 2 by different threads
+    if ((x == 0) && (y > DY / 2)) {
+        return;
+    }
 
-  const uint srcOffset = i * DY * DX;
-  const uint dstOffset = i * DY * (DX + padding);
+    const uint srcOffset = i * DY * DX;
+    const uint dstOffset = i * DY * (DX + padding);
 
-  // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX]
-  {
-    const uint loadPos1 = srcOffset + y * DX + x;
-    const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX);
-    const uint storePos1 = dstOffset + y * (DX + padding) + x;
-    const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x);

+    // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. 
DX] + { + const uint loadPos1 = srcOffset + y * DX + x; + const uint loadPos2 = srcOffset + mod(y, DY) * DX + mod(x, DX); + const uint storePos1 = dstOffset + y * (DX + padding) + x; + const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + (DX - x); - fComplex D1 = LOAD_FCOMPLEX(loadPos1); - fComplex D2 = LOAD_FCOMPLEX(loadPos2); + fComplex D1 = LOAD_FCOMPLEX(loadPos1); + fComplex D2 = LOAD_FCOMPLEX(loadPos2); - fComplex twiddle; - getTwiddle(twiddle, phaseBase * (float)x); - spPostprocessC2C(D1, D2, twiddle); + fComplex twiddle; + getTwiddle(twiddle, phaseBase * (float)x); + spPostprocessC2C(D1, D2, twiddle); - d_Dst[storePos1] = D1; - d_Dst[storePos2] = D2; - } + d_Dst[storePos1] = D1; + d_Dst[storePos2] = D2; + } - // Process x = DX / 2 - if (x == 0) { - const uint loadPos1 = srcOffset + y * DX + DX / 2; - const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2; - const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2; - const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2; + // Process x = DX / 2 + if (x == 0) { + const uint loadPos1 = srcOffset + y * DX + DX / 2; + const uint loadPos2 = srcOffset + mod(y, DY) * DX + DX / 2; + const uint storePos1 = dstOffset + y * (DX + padding) + DX / 2; + const uint storePos2 = dstOffset + mod(y, DY) * (DX + padding) + DX / 2; - fComplex D1 = LOAD_FCOMPLEX(loadPos1); - fComplex D2 = LOAD_FCOMPLEX(loadPos2); + fComplex D1 = LOAD_FCOMPLEX(loadPos1); + fComplex D2 = LOAD_FCOMPLEX(loadPos2); - // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2) - fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f}; - spPostprocessC2C(D1, D2, twiddle); + // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(dir * j * PI / 2) + fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f}; + spPostprocessC2C(D1, D2, twiddle); - d_Dst[storePos1] = D1; - d_Dst[storePos2] = D2; - } + d_Dst[storePos1] = D1; + d_Dst[storePos2] = D2; + } } -__global__ void spPreprocess2D_kernel(fComplex *d_Dst, fComplex *d_Src, uint DY, - uint DX, uint threadCount, uint padding, - float phaseBase +__global__ void spPreprocess2D_kernel(fComplex *d_Dst, + fComplex *d_Src, + uint DY, + uint DX, + uint threadCount, + uint padding, + float phaseBase #if (USE_TEXTURE) , cudaTextureObject_t texComplex #endif - ) { - const uint threadId = blockIdx.x * blockDim.x + threadIdx.x; +) +{ + const uint threadId = blockIdx.x * blockDim.x + threadIdx.x; - if (threadId >= threadCount) { - return; - } + if (threadId >= threadCount) { + return; + } - uint x, y, i = threadId; - udivmod(i, DX / 2, x); - udivmod(i, DY, y); + uint x, y, i = threadId; + udivmod(i, DX / 2, x); + udivmod(i, DY, y); - // Avoid overwrites in columns 0 and DX / 2 by different threads (lower and - // upper halves) - if ((x == 0) && (y > DY / 2)) { - return; - } + // Avoid overwrites in columns 0 and DX / 2 by different threads (lower and + // upper halves) + if ((x == 0) && (y > DY / 2)) { + return; + } - const uint srcOffset = i * DY * (DX + padding); - const uint dstOffset = i * DY * DX; + const uint srcOffset = i * DY * (DX + padding); + const uint dstOffset = i * DY * DX; - // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. DX] - { - const uint loadPos1 = srcOffset + y * (DX + padding) + x; - const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x); - const uint storePos1 = dstOffset + y * DX + x; - const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX); + // Process x = [0 .. DX / 2 - 1] U [DX / 2 + 1 .. 
DX] + { + const uint loadPos1 = srcOffset + y * (DX + padding) + x; + const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + (DX - x); + const uint storePos1 = dstOffset + y * DX + x; + const uint storePos2 = dstOffset + mod(y, DY) * DX + mod(x, DX); - fComplex D1 = LOAD_FCOMPLEX(loadPos1); - fComplex D2 = LOAD_FCOMPLEX(loadPos2); + fComplex D1 = LOAD_FCOMPLEX(loadPos1); + fComplex D2 = LOAD_FCOMPLEX(loadPos2); - fComplex twiddle; - getTwiddle(twiddle, phaseBase * (float)x); - spPreprocessC2C(D1, D2, twiddle); + fComplex twiddle; + getTwiddle(twiddle, phaseBase * (float)x); + spPreprocessC2C(D1, D2, twiddle); - d_Dst[storePos1] = D1; - d_Dst[storePos2] = D2; - } + d_Dst[storePos1] = D1; + d_Dst[storePos2] = D2; + } - // Process x = DX / 2 - if (x == 0) { - const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2; - const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2; - const uint storePos1 = dstOffset + y * DX + DX / 2; - const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2; + // Process x = DX / 2 + if (x == 0) { + const uint loadPos1 = srcOffset + y * (DX + padding) + DX / 2; + const uint loadPos2 = srcOffset + mod(y, DY) * (DX + padding) + DX / 2; + const uint storePos1 = dstOffset + y * DX + DX / 2; + const uint storePos2 = dstOffset + mod(y, DY) * DX + DX / 2; - fComplex D1 = LOAD_FCOMPLEX(loadPos1); - fComplex D2 = LOAD_FCOMPLEX(loadPos2); + fComplex D1 = LOAD_FCOMPLEX(loadPos1); + fComplex D2 = LOAD_FCOMPLEX(loadPos2); - // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2) - fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f}; - spPreprocessC2C(D1, D2, twiddle); + // twiddle = getTwiddle(phaseBase * (DX / 2)) = exp(-dir * j * PI / 2) + fComplex twiddle = {0, (phaseBase > 0) ? 1.0f : -1.0f}; + spPreprocessC2C(D1, D2, twiddle); - d_Dst[storePos1] = D1; - d_Dst[storePos2] = D2; - } + d_Dst[storePos1] = D1; + d_Dst[storePos2] = D2; + } } //////////////////////////////////////////////////////////////////////////////// // Combined spPostprocess2D + modulateAndNormalize + spPreprocess2D //////////////////////////////////////////////////////////////////////////////// -__global__ void spProcess2D_kernel(fComplex *d_Dst, fComplex *d_SrcA, - fComplex *d_SrcB, uint DY, uint DX, - uint threadCount, float phaseBase, float c +__global__ void spProcess2D_kernel(fComplex *d_Dst, + fComplex *d_SrcA, + fComplex *d_SrcB, + uint DY, + uint DX, + uint threadCount, + float phaseBase, + float c #if (USE_TEXTURE) , cudaTextureObject_t texComplexA, cudaTextureObject_t texComplexB #endif - ) { - const uint threadId = blockIdx.x * blockDim.x + threadIdx.x; +) +{ + const uint threadId = blockIdx.x * blockDim.x + threadIdx.x; - if (threadId >= threadCount) { - return; - } + if (threadId >= threadCount) { + return; + } - uint x, y, i = threadId; - udivmod(i, DX, x); - udivmod(i, DY / 2, y); + uint x, y, i = threadId; + udivmod(i, DX, x); + udivmod(i, DY / 2, y); - const uint offset = i * DY * DX; + const uint offset = i * DY * DX; - // Avoid overwrites in rows 0 and DY / 2 by different threads (left and right - // halves) Otherwise correctness for in-place transformations is affected - if ((y == 0) && (x > DX / 2)) { - return; - } + // Avoid overwrites in rows 0 and DY / 2 by different threads (left and right + // halves) Otherwise correctness for in-place transformations is affected + if ((y == 0) && (x > DX / 2)) { + return; + } - fComplex twiddle; + fComplex twiddle; - // Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. 
DY - 1] - { - const uint pos1 = offset + y * DX + x; - const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX); + // Process y = [0 .. DY / 2 - 1] U [DY - (DY / 2) + 1 .. DY - 1] + { + const uint pos1 = offset + y * DX + x; + const uint pos2 = offset + mod(y, DY) * DX + mod(x, DX); - fComplex D1 = LOAD_FCOMPLEX_A(pos1); - fComplex D2 = LOAD_FCOMPLEX_A(pos2); - fComplex K1 = LOAD_FCOMPLEX_B(pos1); - fComplex K2 = LOAD_FCOMPLEX_B(pos2); - getTwiddle(twiddle, phaseBase * (float)x); + fComplex D1 = LOAD_FCOMPLEX_A(pos1); + fComplex D2 = LOAD_FCOMPLEX_A(pos2); + fComplex K1 = LOAD_FCOMPLEX_B(pos1); + fComplex K2 = LOAD_FCOMPLEX_B(pos2); + getTwiddle(twiddle, phaseBase * (float)x); - spPostprocessC2C(D1, D2, twiddle); - spPostprocessC2C(K1, K2, twiddle); - mulAndScale(D1, K1, c); - mulAndScale(D2, K2, c); - spPreprocessC2C(D1, D2, twiddle); + spPostprocessC2C(D1, D2, twiddle); + spPostprocessC2C(K1, K2, twiddle); + mulAndScale(D1, K1, c); + mulAndScale(D2, K2, c); + spPreprocessC2C(D1, D2, twiddle); - d_Dst[pos1] = D1; - d_Dst[pos2] = D2; - } + d_Dst[pos1] = D1; + d_Dst[pos2] = D2; + } - if (y == 0) { - const uint pos1 = offset + (DY / 2) * DX + x; - const uint pos2 = offset + (DY / 2) * DX + mod(x, DX); + if (y == 0) { + const uint pos1 = offset + (DY / 2) * DX + x; + const uint pos2 = offset + (DY / 2) * DX + mod(x, DX); - fComplex D1 = LOAD_FCOMPLEX_A(pos1); - fComplex D2 = LOAD_FCOMPLEX_A(pos2); - fComplex K1 = LOAD_FCOMPLEX_B(pos1); - fComplex K2 = LOAD_FCOMPLEX_B(pos2); + fComplex D1 = LOAD_FCOMPLEX_A(pos1); + fComplex D2 = LOAD_FCOMPLEX_A(pos2); + fComplex K1 = LOAD_FCOMPLEX_B(pos1); + fComplex K2 = LOAD_FCOMPLEX_B(pos2); - spPostprocessC2C(D1, D2, twiddle); - spPostprocessC2C(K1, K2, twiddle); - mulAndScale(D1, K1, c); - mulAndScale(D2, K2, c); - spPreprocessC2C(D1, D2, twiddle); + spPostprocessC2C(D1, D2, twiddle); + spPostprocessC2C(K1, K2, twiddle); + mulAndScale(D1, K1, c); + mulAndScale(D2, K2, c); + spPreprocessC2C(D1, D2, twiddle); - d_Dst[pos1] = D1; - d_Dst[pos2] = D2; - } + d_Dst[pos1] = D1; + d_Dst[pos2] = D2; + } } diff --git a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_common.h b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_common.h index 78ad9c8b..7cc2ddaa 100644 --- a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_common.h +++ b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_common.h @@ -33,9 +33,10 @@ typedef unsigned int uint; #ifdef __CUDACC__ typedef float2 fComplex; #else -typedef struct { - float x; - float y; +typedef struct +{ + float x; + float y; } fComplex; #endif @@ -48,30 +49,42 @@ inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } // Align a to nearest higher multiple of b inline int iAlignUp(int a, int b) { return (a % b != 0) ? 
(a - a % b + b) : a; } -extern "C" void convolutionClampToBorderCPU(float *h_Result, float *h_Data, - float *h_Kernel, int dataH, - int dataW, int kernelH, int kernelW, - int kernelY, int kernelX); +extern "C" void convolutionClampToBorderCPU(float *h_Result, + float *h_Data, + float *h_Kernel, + int dataH, + int dataW, + int kernelH, + int kernelW, + int kernelY, + int kernelX); -extern "C" void padKernel(float *d_PaddedKernel, float *d_Kernel, int fftH, - int fftW, int kernelH, int kernelW, int kernelY, - int kernelX); +extern "C" void padKernel(float *d_PaddedKernel, + float *d_Kernel, + int fftH, + int fftW, + int kernelH, + int kernelW, + int kernelY, + int kernelX); -extern "C" void padDataClampToBorder(float *d_PaddedData, float *d_Data, - int fftH, int fftW, int dataH, int dataW, - int kernelH, int kernelW, int kernelY, - int kernelX); +extern "C" void padDataClampToBorder(float *d_PaddedData, + float *d_Data, + int fftH, + int fftW, + int dataH, + int dataW, + int kernelH, + int kernelW, + int kernelY, + int kernelX); -extern "C" void modulateAndNormalize(fComplex *d_Dst, fComplex *d_Src, int fftH, - int fftW, int padding); +extern "C" void modulateAndNormalize(fComplex *d_Dst, fComplex *d_Src, int fftH, int fftW, int padding); -extern "C" void spPostprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, - uint padding, int dir); +extern "C" void spPostprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, uint padding, int dir); -extern "C" void spPreprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, - uint padding, int dir); +extern "C" void spPreprocess2D(void *d_Dst, void *d_Src, uint DY, uint DX, uint padding, int dir); -extern "C" void spProcess2D(void *d_Data, void *d_Data0, void *d_Kernel0, - uint DY, uint DX, int dir); +extern "C" void spProcess2D(void *d_Data, void *d_Data0, void *d_Kernel0, uint DY, uint DX, int dir); -#endif // CONVOLUTIONFFT2D_COMMON_H +#endif // CONVOLUTIONFFT2D_COMMON_H diff --git a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_gold.cpp b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_gold.cpp index 9bf7664e..4f01808c 100644 --- a/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_gold.cpp +++ b/Samples/5_Domain_Specific/convolutionFFT2D/convolutionFFT2D_gold.cpp @@ -26,36 +26,46 @@ */ #include + #include "convolutionFFT2D_common.h" //////////////////////////////////////////////////////////////////////////////// // Reference straightforward CPU convolution //////////////////////////////////////////////////////////////////////////////// -extern "C" void convolutionClampToBorderCPU(float *h_Result, float *h_Data, - float *h_Kernel, int dataH, - int dataW, int kernelH, int kernelW, - int kernelY, int kernelX) { - for (int y = 0; y < dataH; y++) - for (int x = 0; x < dataW; x++) { - double sum = 0; +extern "C" void convolutionClampToBorderCPU(float *h_Result, + float *h_Data, + float *h_Kernel, + int dataH, + int dataW, + int kernelH, + int kernelW, + int kernelY, + int kernelX) +{ + for (int y = 0; y < dataH; y++) + for (int x = 0; x < dataW; x++) { + double sum = 0; - for (int ky = -(kernelH - kernelY - 1); ky <= kernelY; ky++) - for (int kx = -(kernelW - kernelX - 1); kx <= kernelX; kx++) { - int dy = y + ky; - int dx = x + kx; + for (int ky = -(kernelH - kernelY - 1); ky <= kernelY; ky++) + for (int kx = -(kernelW - kernelX - 1); kx <= kernelX; kx++) { + int dy = y + ky; + int dx = x + kx; - if (dy < 0) dy = 0; + if (dy < 0) + dy = 0; - if (dx < 0) dx = 0; + if (dx < 0) + dx = 0; - if (dy >= dataH) dy = dataH - 
1; + if (dy >= dataH) + dy = dataH - 1; - if (dx >= dataW) dx = dataW - 1; + if (dx >= dataW) + dx = dataW - 1; - sum += h_Data[dy * dataW + dx] * - h_Kernel[(kernelY - ky) * kernelW + (kernelX - kx)]; + sum += h_Data[dy * dataW + dx] * h_Kernel[(kernelY - ky) * kernelW + (kernelX - kx)]; + } + + h_Result[y * dataW + x] = (float)sum; } - - h_Result[y * dataW + x] = (float)sum; - } } diff --git a/Samples/5_Domain_Specific/convolutionFFT2D/main.cpp b/Samples/5_Domain_Specific/convolutionFFT2D/main.cpp index c6eed611..e41b275c 100644 --- a/Samples/5_Domain_Specific/convolutionFFT2D/main.cpp +++ b/Samples/5_Domain_Specific/convolutionFFT2D/main.cpp @@ -42,545 +42,489 @@ #include // Helper functions for CUDA -#include #include +#include #include "convolutionFFT2D_common.h" //////////////////////////////////////////////////////////////////////////////// // Helper functions //////////////////////////////////////////////////////////////////////////////// -int snapTransformSize(int dataSize) { - int hiBit; - unsigned int lowPOT, hiPOT; +int snapTransformSize(int dataSize) +{ + int hiBit; + unsigned int lowPOT, hiPOT; - dataSize = iAlignUp(dataSize, 16); + dataSize = iAlignUp(dataSize, 16); - for (hiBit = 31; hiBit >= 0; hiBit--) - if (dataSize & (1U << hiBit)) { - break; + for (hiBit = 31; hiBit >= 0; hiBit--) + if (dataSize & (1U << hiBit)) { + break; + } + + lowPOT = 1U << hiBit; + + if (lowPOT == (unsigned int)dataSize) { + return dataSize; } - lowPOT = 1U << hiBit; + hiPOT = 1U << (hiBit + 1); - if (lowPOT == (unsigned int)dataSize) { - return dataSize; - } - - hiPOT = 1U << (hiBit + 1); - - if (hiPOT <= 1024) { - return hiPOT; - } else { - return iAlignUp(dataSize, 512); - } + if (hiPOT <= 1024) { + return hiPOT; + } + else { + return iAlignUp(dataSize, 512); + } } float getRand(void) { return (float)(rand() % 16); } -bool test0(void) { - float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; +bool test0(void) +{ + float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; - float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel; + float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel; - fComplex *d_DataSpectrum, *d_KernelSpectrum; + fComplex *d_DataSpectrum, *d_KernelSpectrum; - cufftHandle fftPlanFwd, fftPlanInv; + cufftHandle fftPlanFwd, fftPlanInv; - bool bRetVal; - StopWatchInterface *hTimer = NULL; - sdkCreateTimer(&hTimer); + bool bRetVal; + StopWatchInterface *hTimer = NULL; + sdkCreateTimer(&hTimer); - printf("Testing built-in R2C / C2R FFT-based convolution\n"); - const int kernelH = 7; - const int kernelW = 6; - const int kernelY = 3; - const int kernelX = 4; - const int dataH = 2000; - const int dataW = 2000; - const int fftH = snapTransformSize(dataH + kernelH - 1); - const int fftW = snapTransformSize(dataW + kernelW - 1); + printf("Testing built-in R2C / C2R FFT-based convolution\n"); + const int kernelH = 7; + const int kernelW = 6; + const int kernelY = 3; + const int kernelX = 4; + const int dataH = 2000; + const int dataW = 2000; + const int fftH = snapTransformSize(dataH + kernelH - 1); + const int fftW = snapTransformSize(dataW + kernelW - 1); - printf("...allocating memory\n"); - h_Data = (float *)malloc(dataH * dataW * sizeof(float)); - h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); - h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); - h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); + printf("...allocating memory\n"); + h_Data = (float *)malloc(dataH * dataW * sizeof(float)); + h_Kernel = (float *)malloc(kernelH * kernelW * 
sizeof(float)); + h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); + h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); - checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, - fftH * (fftW / 2 + 1) * sizeof(fComplex))); - checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, - fftH * (fftW / 2 + 1) * sizeof(fComplex))); - checkCudaErrors(cudaMemset(d_KernelSpectrum, 0, - fftH * (fftW / 2 + 1) * sizeof(fComplex))); + checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); + checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); + checkCudaErrors(cudaMemset(d_KernelSpectrum, 0, fftH * (fftW / 2 + 1) * sizeof(fComplex))); - printf("...generating random input data\n"); - srand(2010); + printf("...generating random input data\n"); + srand(2010); - for (int i = 0; i < dataH * dataW; i++) { - h_Data[i] = getRand(); - } - - for (int i = 0; i < kernelH * kernelW; i++) { - h_Kernel[i] = getRand(); - } - - printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW); - checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); - checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); - - printf("...uploading to GPU and padding convolution kernel and input data\n"); - checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, - kernelH * kernelW * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); - checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); - - padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, - kernelX); - - padDataClampToBorder(d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, - kernelW, kernelY, kernelX); - - // Not including kernel transformation into time measurement, - // since convolution kernel is not changed very frequently - printf("...transforming convolution kernel\n"); - checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, - (cufftComplex *)d_KernelSpectrum)); - - printf("...running GPU FFT convolution: "); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, - (cufftComplex *)d_DataSpectrum)); - modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1); - checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, - (cufftReal *)d_PaddedData)); - - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double gpuTime = sdkGetTimerValue(&hTimer); - printf("%f MPix/s (%f ms)\n", - (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); - - printf("...reading back GPU convolution 
results\n"); - checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, - fftH * fftW * sizeof(float), - cudaMemcpyDeviceToHost)); - - printf("...running reference CPU convolution\n"); - convolutionClampToBorderCPU(h_ResultCPU, h_Data, h_Kernel, dataH, dataW, - kernelH, kernelW, kernelY, kernelX); - - printf("...comparing the results: "); - double sum_delta2 = 0; - double sum_ref2 = 0; - double max_delta_ref = 0; - - for (int y = 0; y < dataH; y++) - for (int x = 0; x < dataW; x++) { - double rCPU = (double)h_ResultCPU[y * dataW + x]; - double rGPU = (double)h_ResultGPU[y * fftW + x]; - double delta = (rCPU - rGPU) * (rCPU - rGPU); - double ref = rCPU * rCPU + rCPU * rCPU; - - if ((delta / ref) > max_delta_ref) { - max_delta_ref = delta / ref; - } - - sum_delta2 += delta; - sum_ref2 += ref; + for (int i = 0; i < dataH * dataW; i++) { + h_Data[i] = getRand(); } - double L2norm = sqrt(sum_delta2 / sum_ref2); - printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); - bRetVal = (L2norm < 1e-6) ? true : false; - printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); - - printf("...shutting down\n"); - sdkDeleteTimer(&hTimer); - - checkCudaErrors(cufftDestroy(fftPlanInv)); - checkCudaErrors(cufftDestroy(fftPlanFwd)); - - checkCudaErrors(cudaFree(d_DataSpectrum)); - checkCudaErrors(cudaFree(d_KernelSpectrum)); - checkCudaErrors(cudaFree(d_PaddedData)); - checkCudaErrors(cudaFree(d_PaddedKernel)); - checkCudaErrors(cudaFree(d_Data)); - checkCudaErrors(cudaFree(d_Kernel)); - - free(h_ResultGPU); - free(h_ResultCPU); - free(h_Data); - free(h_Kernel); - - return bRetVal; -} - -bool test1(void) { - float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; - - float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; - - fComplex *d_DataSpectrum0, *d_KernelSpectrum0, *d_DataSpectrum, - *d_KernelSpectrum; - - cufftHandle fftPlan; - - bool bRetVal; - StopWatchInterface *hTimer = NULL; - sdkCreateTimer(&hTimer); - - printf("Testing custom R2C / C2R FFT-based convolution\n"); - const uint fftPadding = 16; - const int kernelH = 7; - const int kernelW = 6; - const int kernelY = 3; - const int kernelX = 4; - const int dataH = 2000; - const int dataW = 2000; - const int fftH = snapTransformSize(dataH + kernelH - 1); - const int fftW = snapTransformSize(dataW + kernelW - 1); - - printf("...allocating memory\n"); - h_Data = (float *)malloc(dataH * dataW * sizeof(float)); - h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); - h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); - h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); - - checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); - - checkCudaErrors( - cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); - - checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, - fftH * (fftW / 2) * sizeof(fComplex))); - checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, - fftH * (fftW / 2) * sizeof(fComplex))); - checkCudaErrors( - cudaMalloc((void **)&d_DataSpectrum, - fftH * (fftW / 2 + fftPadding) * sizeof(fComplex))); - checkCudaErrors( - cudaMalloc((void **)&d_KernelSpectrum, - fftH * (fftW / 2 + fftPadding) * sizeof(fComplex))); - - printf("...generating random input data\n"); - srand(2010); - - for (int i = 0; i < dataH * dataW; i++) { - h_Data[i] = getRand(); - } - - for (int i = 0; i < kernelH * 
kernelW; i++) { - h_Kernel[i] = getRand(); - } - - printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); - checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); - - printf("...uploading to GPU and padding convolution kernel and input data\n"); - checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, - kernelH * kernelW * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); - checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); - - padDataClampToBorder(d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, - kernelW, kernelY, kernelX); - - padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, - kernelX); - - // CUFFT_INVERSE works just as well... - const int FFT_DIR = CUFFT_FORWARD; - - // Not including kernel transformation into time measurement, - // since convolution kernel is not changed very frequently - printf("...transforming convolution kernel\n"); - checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, - (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); - spPostprocess2D(d_KernelSpectrum, d_KernelSpectrum0, fftH, fftW / 2, - fftPadding, FFT_DIR); - - printf("...running GPU FFT convolution: "); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - - checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, - (cufftComplex *)d_DataSpectrum0, FFT_DIR)); - - spPostprocess2D(d_DataSpectrum, d_DataSpectrum0, fftH, fftW / 2, fftPadding, - FFT_DIR); - modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, - fftPadding); - spPreprocess2D(d_DataSpectrum0, d_DataSpectrum, fftH, fftW / 2, fftPadding, - -FFT_DIR); - - checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, - (cufftComplex *)d_PaddedData, -FFT_DIR)); - - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double gpuTime = sdkGetTimerValue(&hTimer); - printf("%f MPix/s (%f ms)\n", - (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); - - printf("...reading back GPU FFT results\n"); - checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, - fftH * fftW * sizeof(float), - cudaMemcpyDeviceToHost)); - - printf("...running reference CPU convolution\n"); - convolutionClampToBorderCPU(h_ResultCPU, h_Data, h_Kernel, dataH, dataW, - kernelH, kernelW, kernelY, kernelX); - - printf("...comparing the results: "); - double sum_delta2 = 0; - double sum_ref2 = 0; - double max_delta_ref = 0; - - for (int y = 0; y < dataH; y++) - for (int x = 0; x < dataW; x++) { - double rCPU = (double)h_ResultCPU[y * dataW + x]; - double rGPU = (double)h_ResultGPU[y * fftW + x]; - double delta = (rCPU - rGPU) * (rCPU - rGPU); - double ref = rCPU * rCPU + rCPU * rCPU; - - if ((delta / ref) > max_delta_ref) { - max_delta_ref = delta / ref; - } - - sum_delta2 += delta; - sum_ref2 += ref; + for (int i = 0; i < kernelH * kernelW; i++) { + h_Kernel[i] = getRand(); } - double L2norm = sqrt(sum_delta2 / sum_ref2); - printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); - bRetVal = (L2norm < 1e-6) ? true : false; - printf(bRetVal ? 
"L2norm Error OK\n" : "L2norm Error too high!\n"); + printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW); + checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); + checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); - printf("...shutting down\n"); - sdkDeleteTimer(&hTimer); - checkCudaErrors(cufftDestroy(fftPlan)); + printf("...uploading to GPU and padding convolution kernel and input data\n"); + checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); - checkCudaErrors(cudaFree(d_KernelSpectrum)); - checkCudaErrors(cudaFree(d_DataSpectrum)); - checkCudaErrors(cudaFree(d_KernelSpectrum0)); - checkCudaErrors(cudaFree(d_DataSpectrum0)); - checkCudaErrors(cudaFree(d_PaddedKernel)); - checkCudaErrors(cudaFree(d_PaddedData)); - checkCudaErrors(cudaFree(d_Kernel)); - checkCudaErrors(cudaFree(d_Data)); + padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX); - free(h_ResultGPU); - free(h_ResultCPU); - free(h_Kernel); - free(h_Data); + padDataClampToBorder(d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX); - return bRetVal; + // Not including kernel transformation into time measurement, + // since convolution kernel is not changed very frequently + printf("...transforming convolution kernel\n"); + checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum)); + + printf("...running GPU FFT convolution: "); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum)); + modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1); + checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData)); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double gpuTime = sdkGetTimerValue(&hTimer); + printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); + + printf("...reading back GPU convolution results\n"); + checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); + + printf("...running reference CPU convolution\n"); + convolutionClampToBorderCPU(h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX); + + printf("...comparing the results: "); + double sum_delta2 = 0; + double sum_ref2 = 0; + double max_delta_ref = 0; + + for (int y = 0; y < dataH; y++) + for (int x = 0; x < dataW; x++) { + double rCPU = (double)h_ResultCPU[y * dataW + x]; + double rGPU = (double)h_ResultGPU[y * fftW + x]; + double delta = (rCPU - rGPU) * (rCPU - rGPU); + double ref = rCPU * rCPU + rCPU * rCPU; + + if ((delta / ref) > max_delta_ref) { + max_delta_ref = delta / ref; + } + + sum_delta2 += delta; + sum_ref2 += ref; + } + + double L2norm = sqrt(sum_delta2 / sum_ref2); + printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); + bRetVal = (L2norm < 1e-6) ? true : false; + printf(bRetVal ? 
"L2norm Error OK\n" : "L2norm Error too high!\n"); + + printf("...shutting down\n"); + sdkDeleteTimer(&hTimer); + + checkCudaErrors(cufftDestroy(fftPlanInv)); + checkCudaErrors(cufftDestroy(fftPlanFwd)); + + checkCudaErrors(cudaFree(d_DataSpectrum)); + checkCudaErrors(cudaFree(d_KernelSpectrum)); + checkCudaErrors(cudaFree(d_PaddedData)); + checkCudaErrors(cudaFree(d_PaddedKernel)); + checkCudaErrors(cudaFree(d_Data)); + checkCudaErrors(cudaFree(d_Kernel)); + + free(h_ResultGPU); + free(h_ResultCPU); + free(h_Data); + free(h_Kernel); + + return bRetVal; } -bool test2(void) { - float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; +bool test1(void) +{ + float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; - float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; + float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; - fComplex *d_DataSpectrum0, *d_KernelSpectrum0; + fComplex *d_DataSpectrum0, *d_KernelSpectrum0, *d_DataSpectrum, *d_KernelSpectrum; - cufftHandle fftPlan; + cufftHandle fftPlan; - bool bRetVal; - StopWatchInterface *hTimer = NULL; - sdkCreateTimer(&hTimer); + bool bRetVal; + StopWatchInterface *hTimer = NULL; + sdkCreateTimer(&hTimer); - printf("Testing updated custom R2C / C2R FFT-based convolution\n"); - const int kernelH = 7; - const int kernelW = 6; - const int kernelY = 3; - const int kernelX = 4; - const int dataH = 2000; - const int dataW = 2000; - const int fftH = snapTransformSize(dataH + kernelH - 1); - const int fftW = snapTransformSize(dataW + kernelW - 1); + printf("Testing custom R2C / C2R FFT-based convolution\n"); + const uint fftPadding = 16; + const int kernelH = 7; + const int kernelW = 6; + const int kernelY = 3; + const int kernelX = 4; + const int dataH = 2000; + const int dataW = 2000; + const int fftH = snapTransformSize(dataH + kernelH - 1); + const int fftW = snapTransformSize(dataW + kernelW - 1); - printf("...allocating memory\n"); - h_Data = (float *)malloc(dataH * dataW * sizeof(float)); - h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); - h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); - h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); + printf("...allocating memory\n"); + h_Data = (float *)malloc(dataH * dataW * sizeof(float)); + h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); + h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); + h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); - checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); - checkCudaErrors( - cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); - checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, - fftH * (fftW / 2) * sizeof(fComplex))); - checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, - fftH * (fftW / 2) * sizeof(fComplex))); + checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); + checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); + 
checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + fftPadding) * sizeof(fComplex))); + checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + fftPadding) * sizeof(fComplex))); - printf("...generating random input data\n"); - srand(2010); + printf("...generating random input data\n"); + srand(2010); - for (int i = 0; i < dataH * dataW; i++) { - h_Data[i] = getRand(); - } - - for (int i = 0; i < kernelH * kernelW; i++) { - h_Kernel[i] = getRand(); - } - - printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); - checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); - - printf("...uploading to GPU and padding convolution kernel and input data\n"); - checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, - kernelH * kernelW * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); - checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); - - padDataClampToBorder(d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, - kernelW, kernelY, kernelX); - - padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, - kernelX); - - // CUFFT_INVERSE works just as well... - const int FFT_DIR = CUFFT_FORWARD; - - // Not including kernel transformation into time measurement, - // since convolution kernel is not changed very frequently - printf("...transforming convolution kernel\n"); - checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, - (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); - - printf("...running GPU FFT convolution: "); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - - checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, - (cufftComplex *)d_DataSpectrum0, FFT_DIR)); - spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, - fftW / 2, FFT_DIR); - checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, - (cufftComplex *)d_PaddedData, -FFT_DIR)); - - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double gpuTime = sdkGetTimerValue(&hTimer); - printf("%f MPix/s (%f ms)\n", - (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); - - printf("...reading back GPU FFT results\n"); - checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, - fftH * fftW * sizeof(float), - cudaMemcpyDeviceToHost)); - - printf("...running reference CPU convolution\n"); - convolutionClampToBorderCPU(h_ResultCPU, h_Data, h_Kernel, dataH, dataW, - kernelH, kernelW, kernelY, kernelX); - - printf("...comparing the results: "); - double sum_delta2 = 0; - double sum_ref2 = 0; - double max_delta_ref = 0; - - for (int y = 0; y < dataH; y++) { - for (int x = 0; x < dataW; x++) { - double rCPU = (double)h_ResultCPU[y * dataW + x]; - double rGPU = (double)h_ResultGPU[y * fftW + x]; - double delta = (rCPU - rGPU) * (rCPU - rGPU); - double ref = rCPU * rCPU + rCPU * rCPU; - - if ((delta / ref) > max_delta_ref) { - max_delta_ref = delta / ref; - } - - sum_delta2 += delta; - sum_ref2 += ref; + for (int i = 0; i < dataH * dataW; i++) { + h_Data[i] = getRand(); } - } - double L2norm = sqrt(sum_delta2 / sum_ref2); - printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); - bRetVal = (L2norm < 1e-6) ? true : false; - printf(bRetVal ? 
"L2norm Error OK\n" : "L2norm Error too high!\n"); + for (int i = 0; i < kernelH * kernelW; i++) { + h_Kernel[i] = getRand(); + } - printf("...shutting down\n"); - sdkDeleteTimer(&hTimer); - checkCudaErrors(cufftDestroy(fftPlan)); + printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); + checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); - checkCudaErrors(cudaFree(d_KernelSpectrum0)); - checkCudaErrors(cudaFree(d_DataSpectrum0)); - checkCudaErrors(cudaFree(d_PaddedKernel)); - checkCudaErrors(cudaFree(d_PaddedData)); - checkCudaErrors(cudaFree(d_Kernel)); - checkCudaErrors(cudaFree(d_Data)); + printf("...uploading to GPU and padding convolution kernel and input data\n"); + checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); - free(h_ResultGPU); - free(h_ResultCPU); - free(h_Kernel); - free(h_Data); + padDataClampToBorder(d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX); - return bRetVal; + padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX); + + // CUFFT_INVERSE works just as well... + const int FFT_DIR = CUFFT_FORWARD; + + // Not including kernel transformation into time measurement, + // since convolution kernel is not changed very frequently + printf("...transforming convolution kernel\n"); + checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); + spPostprocess2D(d_KernelSpectrum, d_KernelSpectrum0, fftH, fftW / 2, fftPadding, FFT_DIR); + + printf("...running GPU FFT convolution: "); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + + checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR)); + + spPostprocess2D(d_DataSpectrum, d_DataSpectrum0, fftH, fftW / 2, fftPadding, FFT_DIR); + modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, fftPadding); + spPreprocess2D(d_DataSpectrum0, d_DataSpectrum, fftH, fftW / 2, fftPadding, -FFT_DIR); + + checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR)); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double gpuTime = sdkGetTimerValue(&hTimer); + printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); + + printf("...reading back GPU FFT results\n"); + checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); + + printf("...running reference CPU convolution\n"); + convolutionClampToBorderCPU(h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX); + + printf("...comparing the results: "); + double sum_delta2 = 0; + double sum_ref2 = 0; + double max_delta_ref = 0; + + for (int y = 0; y < dataH; y++) + for (int x = 0; x < dataW; x++) { + double rCPU = (double)h_ResultCPU[y * dataW + x]; + double rGPU = (double)h_ResultGPU[y * fftW + x]; + double delta = (rCPU - rGPU) * (rCPU - rGPU); + double ref = rCPU * rCPU + rCPU * rCPU; + + if ((delta / ref) > max_delta_ref) { + max_delta_ref = delta / ref; + } + + sum_delta2 += delta; + sum_ref2 += ref; + } + + double L2norm = 
sqrt(sum_delta2 / sum_ref2); + printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); + bRetVal = (L2norm < 1e-6) ? true : false; + printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); + + printf("...shutting down\n"); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cufftDestroy(fftPlan)); + + checkCudaErrors(cudaFree(d_KernelSpectrum)); + checkCudaErrors(cudaFree(d_DataSpectrum)); + checkCudaErrors(cudaFree(d_KernelSpectrum0)); + checkCudaErrors(cudaFree(d_DataSpectrum0)); + checkCudaErrors(cudaFree(d_PaddedKernel)); + checkCudaErrors(cudaFree(d_PaddedData)); + checkCudaErrors(cudaFree(d_Kernel)); + checkCudaErrors(cudaFree(d_Data)); + + free(h_ResultGPU); + free(h_ResultCPU); + free(h_Kernel); + free(h_Data); + + return bRetVal; } -int main(int argc, char **argv) { - printf("[%s] - Starting...\n", argv[0]); +bool test2(void) +{ + float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; - // Use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; - int nFailures = 0; + fComplex *d_DataSpectrum0, *d_KernelSpectrum0; - if (!test0()) { - nFailures++; - } + cufftHandle fftPlan; - if (!test1()) { - nFailures++; - } + bool bRetVal; + StopWatchInterface *hTimer = NULL; + sdkCreateTimer(&hTimer); - if (!test2()) { - nFailures++; - } + printf("Testing updated custom R2C / C2R FFT-based convolution\n"); + const int kernelH = 7; + const int kernelW = 6; + const int kernelY = 3; + const int kernelX = 4; + const int dataH = 2000; + const int dataW = 2000; + const int fftH = snapTransformSize(dataH + kernelH - 1); + const int fftW = snapTransformSize(dataW + kernelW - 1); - printf("Test Summary: %d errors\n", nFailures); + printf("...allocating memory\n"); + h_Data = (float *)malloc(dataH * dataW * sizeof(float)); + h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); + h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); + h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); - if (nFailures > 0) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); - printf("Test passed\n"); - exit(EXIT_SUCCESS); + checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); + checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); + + printf("...generating random input data\n"); + srand(2010); + + for (int i = 0; i < dataH * dataW; i++) { + h_Data[i] = getRand(); + } + + for (int i = 0; i < kernelH * kernelW; i++) { + h_Kernel[i] = getRand(); + } + + printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); + checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); + + printf("...uploading to GPU and padding convolution kernel and input data\n"); + checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); + checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); + 
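Before the padded buffers are filled below, every test path sizes them with snapTransformSize from the top of this file, which rounds dataH + kernelH - 1 up to a power of two (or, above 1024, to a multiple of 512) so cuFFT gets a friendly transform size. A standalone sketch (not part of the patch) that re-states that helper together with iAlignUp from convolutionFFT2D_common.h and checks the extents used by all three tests:

#include <cstdio>

// Re-stated from convolutionFFT2D_common.h / main.cpp in this patch
static int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; }

static int snapTransformSize(int dataSize)
{
    dataSize = iAlignUp(dataSize, 16);

    int hiBit;
    for (hiBit = 31; hiBit >= 0; hiBit--)
        if (dataSize & (1U << hiBit))
            break;

    unsigned int lowPOT = 1U << hiBit;
    if (lowPOT == (unsigned int)dataSize)
        return dataSize;

    unsigned int hiPOT = 1U << (hiBit + 1);
    return (hiPOT <= 1024) ? (int)hiPOT : iAlignUp(dataSize, 512);
}

int main()
{
    // dataH + kernelH - 1 = 2000 + 7 - 1 = 2006 -> 2016 (16-aligned) -> 2048
    printf("fftH = %d\n", snapTransformSize(2000 + 7 - 1));
    // dataW + kernelW - 1 = 2000 + 6 - 1 = 2005 -> 2016 (16-aligned) -> 2048
    printf("fftW = %d\n", snapTransformSize(2000 + 6 - 1));
    return 0;
}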
+ padDataClampToBorder(d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX); + + padKernel(d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX); + + // CUFFT_INVERSE works just as well... + const int FFT_DIR = CUFFT_FORWARD; + + // Not including kernel transformation into time measurement, + // since convolution kernel is not changed very frequently + printf("...transforming convolution kernel\n"); + checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); + + printf("...running GPU FFT convolution: "); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + + checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR)); + spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR); + checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR)); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double gpuTime = sdkGetTimerValue(&hTimer); + printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); + + printf("...reading back GPU FFT results\n"); + checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); + + printf("...running reference CPU convolution\n"); + convolutionClampToBorderCPU(h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX); + + printf("...comparing the results: "); + double sum_delta2 = 0; + double sum_ref2 = 0; + double max_delta_ref = 0; + + for (int y = 0; y < dataH; y++) { + for (int x = 0; x < dataW; x++) { + double rCPU = (double)h_ResultCPU[y * dataW + x]; + double rGPU = (double)h_ResultGPU[y * fftW + x]; + double delta = (rCPU - rGPU) * (rCPU - rGPU); + double ref = rCPU * rCPU + rCPU * rCPU; + + if ((delta / ref) > max_delta_ref) { + max_delta_ref = delta / ref; + } + + sum_delta2 += delta; + sum_ref2 += ref; + } + } + + double L2norm = sqrt(sum_delta2 / sum_ref2); + printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); + bRetVal = (L2norm < 1e-6) ? true : false; + printf(bRetVal ? 
"L2norm Error OK\n" : "L2norm Error too high!\n"); + + printf("...shutting down\n"); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cufftDestroy(fftPlan)); + + checkCudaErrors(cudaFree(d_KernelSpectrum0)); + checkCudaErrors(cudaFree(d_DataSpectrum0)); + checkCudaErrors(cudaFree(d_PaddedKernel)); + checkCudaErrors(cudaFree(d_PaddedData)); + checkCudaErrors(cudaFree(d_Kernel)); + checkCudaErrors(cudaFree(d_Data)); + + free(h_ResultGPU); + free(h_ResultCPU); + free(h_Kernel); + free(h_Data); + + return bRetVal; +} + +int main(int argc, char **argv) +{ + printf("[%s] - Starting...\n", argv[0]); + + // Use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); + + int nFailures = 0; + + if (!test0()) { + nFailures++; + } + + if (!test1()) { + nFailures++; + } + + if (!test2()) { + nFailures++; + } + + printf("Test Summary: %d errors\n", nFailures); + + if (nFailures > 0) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D.cu b/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D.cu index c8bbec24..b6d98d2d 100644 --- a/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D.cu +++ b/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D.cu @@ -84,20 +84,20 @@ decomposition. #endif // includes, system -#include -#include -#include -#include #include +#include +#include +#include +#include // includes, project -#include #include +#include // constants which are used in host and device code #define INV_SQRT_2 0.70710678118654752440f; const unsigned int LOG_NUM_BANKS = 4; -const unsigned int NUM_BANKS = 16; +const unsigned int NUM_BANKS = 16; //////////////////////////////////////////////////////////////////////////////// // includes, kernels @@ -111,256 +111,252 @@ bool getLevels(unsigned int len, unsigned int *levels); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - // run test - runTest(argc, argv); +int main(int argc, char **argv) +{ + // run test + runTest(argc, argv); } //////////////////////////////////////////////////////////////////////////////// //! 
Perform the wavelet decomposition //////////////////////////////////////////////////////////////////////////////// -void runTest(int argc, char **argv) { - bool bResult = false; // flag for final validation of the results +void runTest(int argc, char **argv) +{ + bool bResult = false; // flag for final validation of the results - char *s_fname = NULL, *r_gold_fname = NULL; - char r_fname[256]; - const char usage[] = { - "\nUsage:\n" - " dwtHaar1D --signal= --result= " - "--gold=\n\n" - " Input file containing the signal\n" - " Output file storing the result of the wavelet " - "decomposition\n" - " Input file containing the reference result of the " - "wavelet decomposition\n" - "\nExample:\n" - " ./dwtHaar1D\n" - " --signal=signal.dat\n" - " --result=result.dat\n" - " --gold=regression.gold.dat\n"}; + char *s_fname = NULL, *r_gold_fname = NULL; + char r_fname[256]; + const char usage[] = {"\nUsage:\n" + " dwtHaar1D --signal= --result= " + "--gold=\n\n" + " Input file containing the signal\n" + " Output file storing the result of the wavelet " + "decomposition\n" + " Input file containing the reference result of the " + "wavelet decomposition\n" + "\nExample:\n" + " ./dwtHaar1D\n" + " --signal=signal.dat\n" + " --result=result.dat\n" + " --gold=regression.gold.dat\n"}; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - // file names, either specified as cmd line args or use default - if (argc == 4) { - char *tmp_sfname, *tmp_rfname, *tmp_goldfname; + // file names, either specified as cmd line args or use default + if (argc == 4) { + char *tmp_sfname, *tmp_rfname, *tmp_goldfname; - if ((getCmdLineArgumentString(argc, (const char **)argv, "signal", - &tmp_sfname) != true) || - (getCmdLineArgumentString(argc, (const char **)argv, "result", - &tmp_rfname) != true) || - (getCmdLineArgumentString(argc, (const char **)argv, "gold", - &tmp_goldfname) != true)) { - fprintf(stderr, "Invalid input syntax.\n%s", usage); - exit(EXIT_FAILURE); + if ((getCmdLineArgumentString(argc, (const char **)argv, "signal", &tmp_sfname) != true) + || (getCmdLineArgumentString(argc, (const char **)argv, "result", &tmp_rfname) != true) + || (getCmdLineArgumentString(argc, (const char **)argv, "gold", &tmp_goldfname) != true)) { + fprintf(stderr, "Invalid input syntax.\n%s", usage); + exit(EXIT_FAILURE); + } + + s_fname = sdkFindFilePath(tmp_sfname, argv[0]); + r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]); + strcpy(r_fname, tmp_rfname); + } + else { + s_fname = sdkFindFilePath("signal.dat", argv[0]); + r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]); + strcpy(r_fname, "result.dat"); } - s_fname = sdkFindFilePath(tmp_sfname, argv[0]); - r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]); - strcpy(r_fname, tmp_rfname); - } else { - s_fname = sdkFindFilePath("signal.dat", argv[0]); - r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]); - strcpy(r_fname, "result.dat"); - } + printf("source file = \"%s\"\n", s_fname); + printf("reference file = \"%s\"\n", r_fname); + printf("gold file = \"%s\"\n", r_gold_fname); - printf("source file = \"%s\"\n", s_fname); - printf("reference file = \"%s\"\n", r_fname); - printf("gold file = \"%s\"\n", r_gold_fname); + // read in signal + unsigned 
+    float *signal = NULL;

-  // read in signal
-  unsigned int slength = 0;
-  float *signal = NULL;
+    if (s_fname == NULL) {
+        fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);

-  if (s_fname == NULL) {
-    fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);
+        exit(EXIT_FAILURE);
+    }

-    exit(EXIT_FAILURE);
-  }
+    if (sdkReadFile(s_fname, &signal, &slength, false) == true) {
+        printf("Reading signal from \"%s\"\n", s_fname);
+    }
+    else {
+        exit(EXIT_FAILURE);
+    }

-  if (sdkReadFile(s_fname, &signal, &slength, false) == true) {
-    printf("Reading signal from \"%s\"\n", s_fname);
-  } else {
-    exit(EXIT_FAILURE);
-  }
+    // get the number of decompositions necessary to perform a full decomposition
+    unsigned int dlevels_complete = 0;

-  // get the number of decompositions necessary to perform a full decomposition
-  unsigned int dlevels_complete = 0;
+    if (true != getLevels(slength, &dlevels_complete)) {
+        // error message
+        fprintf(stderr, "Signal length not supported.\n");
+        // cleanup and abort
+        free(signal);
+        exit(EXIT_FAILURE);
+    }
+
+    // device in data
+    float *d_idata = NULL;
+    // device out data
+    float *d_odata = NULL;
+    // device approx_final data
+    float *approx_final = NULL;
+    // The very final approximation coefficient has to be written to the output
+    // data, all others are reused as input data in the next global step and
+    // therefore have to be written to the input data again.
+    // The following flag indicates where to copy approx_final data
+    // - 0 is input, 1 is output
+    int approx_is_input;
+
+    // allocate device mem
+    const unsigned int smem_size = sizeof(float) * slength;
+    checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size));
+    checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size));
+    checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size));
+    // copy input data to device
+    checkCudaErrors(cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice));
+
+    // total number of threads
+    // in the first decomposition step always one thread computes the average and
+    // detail signal for one pair of adjacent values
+    unsigned int num_threads_total_left = slength / 2;
+    // decomposition levels performed in the current / next step
+    unsigned int dlevels_step = dlevels_complete;
+
+    // 1D signal so the arrangement of elements is also 1D
+    dim3 block_size;
+    dim3 grid_size;
+
+    // number of decomposition levels left after one iteration on the device
+    unsigned int dlevels_left = dlevels_complete;
+
+    // if the signal has 1K elements or fewer, the data can be processed in one
+    // block; this avoids the Wait-For-Idle (WFI) on the host side which is
+    // necessary if the computation is split across multiple SMs
+    if (dlevels_complete <= 10) {
+        // decomposition can be performed at once
+        block_size.x = num_threads_total_left;
+        approx_is_input = 0;
+    }
+    else {
+        // 512 threads per block
+        grid_size.x = (num_threads_total_left / 512);
+        block_size.x = 512;
+
+        // 512 threads corresponds to 10 decomposition steps
+        dlevels_step = 10;
+        dlevels_left -= 10;
+
+        approx_is_input = 1;
+    }
+
+    // Initialize d_odata to 0.0f
+    initValue<<<grid_size, block_size>>>(d_odata, 0.0f);
+
+    // do until full decomposition is accomplished
+    while (0 != num_threads_total_left) {
+        // two floats per thread, expressed in bytes
+        unsigned int mem_shared = (2 * block_size.x) * sizeof(float);
+        // extra memory requirements to avoid bank conflicts
+        mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float);
+
+        // run kernel
+        dwtHaar1D<<<grid_size, block_size, mem_shared>>>(
+            d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left, block_size.x);
+
+        // Copy approx_final to appropriate location
+        if (approx_is_input) {
+            checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4, cudaMemcpyDeviceToDevice));
+        }
+        else {
+            checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4, cudaMemcpyDeviceToDevice));
+        }
+
+        // update level variables
+        if (dlevels_left < 10) {
+            // approx_final = d_odata;
+            approx_is_input = 0;
+        }
+
+        // more global steps necessary
+        dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
+        dlevels_left -= 10;
+
+        // after each step only half the threads are used any longer
+        // therefore after 10 steps 2^10 less threads
+        num_threads_total_left = num_threads_total_left >> 10;
+
+        // update block and grid size; the conditional must be parenthesized,
+        // otherwise the whole sum feeds the ?: condition
+        grid_size.x = (num_threads_total_left / 512) + ((0 != (num_threads_total_left % 512)) ? 1 : 0);
+
+        if (grid_size.x <= 1) {
+            block_size.x = num_threads_total_left;
+        }
+    }
+
+    // get the result back from the device
+    // allocate mem for the result
+    float *odata = (float *)malloc(smem_size);
+    checkCudaErrors(cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost));
+
+    // post processing
+    // write file for regression test
+    if (r_fname == NULL) {
+        fprintf(stderr,
+                "Cannot write the output file storing the result of the wavelet "
+                "decomposition.\n%s",
+                usage);
+        exit(EXIT_FAILURE);
+    }
+
+    if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) {
+        printf("Writing result to \"%s\"\n", r_fname);
+    }
+    else {
+        exit(EXIT_FAILURE);
+    }
+
+    // load the reference solution
+    unsigned int len_reference = 0;
+    float *reference = NULL;
+
+    if (r_gold_fname == NULL) {
+        fprintf(stderr,
+                "Cannot read the file containing the reference result of the "
+                "wavelet decomposition.\n%s",
+                usage);
+
+        exit(EXIT_FAILURE);
+    }
+
+    if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) {
+        printf("Reading reference result from \"%s\"\n", r_gold_fname);
+    }
+    else {
+        exit(EXIT_FAILURE);
+    }
+
+    assert(slength == len_reference);
+
+    // compare the computed solution and the reference
+    bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f);
+    free(reference);
+
+    // free allocated host and device memory
+    checkCudaErrors(cudaFree(d_odata));
+    checkCudaErrors(cudaFree(d_idata));
+    checkCudaErrors(cudaFree(approx_final));
-  if (true != getLevels(slength, &dlevels_complete)) {
-    // error message
-    fprintf(stderr, "Signal length not supported.\n");
-    // cleanup and abort
     free(signal);
-    exit(EXIT_FAILURE);
-  }
+    free(odata);
+    free(s_fname);
+    free(r_gold_fname);

-  // device in data
-  float *d_idata = NULL;
-  // device out data
-  float *d_odata = NULL;
-  // device approx_final data
-  float *approx_final = NULL;
-  // The very final approximation coefficient has to be written to the output
-  // data, all others are reused as input data in the next global step and
-  // therefore have to be written to the input data again.
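For reference, the padded shared-memory size computed before the kernel launch above (two floats per thread plus one extra float per NUM_BANKS elements) can be checked in isolation. A minimal host-side sketch, not part of the patch, assuming NUM_BANKS is 16 (LOG_NUM_BANKS 4) as the sample's kernel header suggests; the padding keeps the kernel's strided shared[atid] writes on distinct banks:

    // Host-side sketch of the shared-memory sizing used by runTest().
    #include <stdio.h>

    #define NUM_BANKS 16 // assumed to match dwtHaar1D_kernel.cuh

    static unsigned int paddedSharedBytes(unsigned int block_threads)
    {
        unsigned int elems = 2 * block_threads; // two floats per thread
        // one padding float per NUM_BANKS elements to dodge bank conflicts
        return elems * sizeof(float) + (elems / NUM_BANKS) * sizeof(float);
    }

    int main(void)
    {
        printf("%u\n", paddedSharedBytes(512)); // 4096 + 256 = 4352 bytes
        return 0;
    }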
-  // The following flag indicates where to copy approx_final data
-  // - 0 is input, 1 is output
-  int approx_is_input;
-
-  // allocate device mem
-  const unsigned int smem_size = sizeof(float) * slength;
-  checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size));
-  checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size));
-  checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size));
-  // copy input data to device
-  checkCudaErrors(
-      cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice));
-
-  // total number of threads
-  // in the first decomposition step always one thread computes the average and
-  // detail signal for one pair of adjacent values
-  unsigned int num_threads_total_left = slength / 2;
-  // decomposition levels performed in the current / next step
-  unsigned int dlevels_step = dlevels_complete;
-
-  // 1D signal so the arrangement of elements is also 1D
-  dim3 block_size;
-  dim3 grid_size;
-
-  // number of decomposition levels left after one iteration on the device
-  unsigned int dlevels_left = dlevels_complete;
-
-  // if less or equal 1k elements, then the data can be processed in one block,
-  // this avoids the Wait-For-Idle (WFI) on host side which is necessary if the
-  // computation is split across multiple SM's if enough input data
-  if (dlevels_complete <= 10) {
-    // decomposition can be performed at once
-    block_size.x = num_threads_total_left;
-    approx_is_input = 0;
-  } else {
-    // 512 threads per block
-    grid_size.x = (num_threads_total_left / 512);
-    block_size.x = 512;
-
-    // 512 threads corresponds to 10 decomposition steps
-    dlevels_step = 10;
-    dlevels_left -= 10;
-
-    approx_is_input = 1;
-  }
-
-  // Initialize d_odata to 0.0f
-  initValue<<<grid_size, block_size>>>(d_odata, 0.0f);
-
-  // do until full decomposition is accomplished
-  while (0 != num_threads_total_left) {
-    // double the number of threads as bytes
-    unsigned int mem_shared = (2 * block_size.x) * sizeof(float);
-    // extra memory requirements to avoid bank conflicts
-    mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float);
-
-    // run kernel
-    dwtHaar1D<<<grid_size, block_size, mem_shared>>>(
-        d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left,
-        block_size.x);
-
-    // Copy approx_final to appropriate location
-    if (approx_is_input) {
-      checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4,
-                                 cudaMemcpyDeviceToDevice));
-    } else {
-      checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4,
-                                 cudaMemcpyDeviceToDevice));
-    }
-
-    // update level variables
-    if (dlevels_left < 10) {
-      // approx_final = d_odata;
-      approx_is_input = 0;
-    }
-
-    // more global steps necessary
-    dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
-    dlevels_left -= 10;
-
-    // after each step only half the threads are used any longer
-    // therefore after 10 steps 2^10 less threads
-    num_threads_total_left = num_threads_total_left >> 10;
-
-    // update block and grid size
-    grid_size.x =
-        (num_threads_total_left / 512) + (0 != (num_threads_total_left % 512))
-            ?
1 - : 0; - - if (grid_size.x <= 1) { - block_size.x = num_threads_total_left; - } - } - - // get the result back from the server - // allocate mem for the result - float *odata = (float *)malloc(smem_size); - checkCudaErrors( - cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost)); - - // post processing - // write file for regression test - if (r_fname == NULL) { - fprintf(stderr, - "Cannot write the output file storing the result of the wavelet " - "decomposition.\n%s", - usage); - exit(EXIT_FAILURE); - } - - if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) { - printf("Writing result to \"%s\"\n", r_fname); - } else { - exit(EXIT_FAILURE); - } - - // load the reference solution - unsigned int len_reference = 0; - float *reference = NULL; - - if (r_gold_fname == NULL) { - fprintf(stderr, - "Cannot read the file containing the reference result of the " - "wavelet decomposition.\n%s", - usage); - - exit(EXIT_FAILURE); - } - - if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) { - printf("Reading reference result from \"%s\"\n", r_gold_fname); - } else { - exit(EXIT_FAILURE); - } - - assert(slength == len_reference); - - // compare the computed solution and the reference - bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f); - free(reference); - - // free allocated host and device memory - checkCudaErrors(cudaFree(d_odata)); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(approx_final)); - - free(signal); - free(odata); - free(s_fname); - free(r_gold_fname); - - printf(bResult ? "Test success!\n" : "Test failure!\n"); + printf(bResult ? "Test success!\n" : "Test failure!\n"); } //////////////////////////////////////////////////////////////////////////////// @@ -373,17 +369,18 @@ void runTest(int argc, char **argv) { //! @param levels number of decomposition levels necessary to perform a full //! decomposition //////////////////////////////////////////////////////////////////////////////// -bool getLevels(unsigned int len, unsigned int *levels) { - bool retval = false; +bool getLevels(unsigned int len, unsigned int *levels) +{ + bool retval = false; - // currently signals up to a length of 2^20 supported - for (unsigned int i = 0; i < 20; ++i) { - if (len == (1 << i)) { - *levels = i; - retval = true; - break; + // currently signals up to a length of 2^20 supported + for (unsigned int i = 0; i < 20; ++i) { + if (len == (1 << i)) { + *levels = i; + retval = true; + break; + } } - } - return retval; + return retval; } diff --git a/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D_kernel.cuh b/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D_kernel.cuh index e1cf98bd..3bf1e658 100644 --- a/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D_kernel.cuh +++ b/Samples/5_Domain_Specific/dwtHaar1D/dwtHaar1D_kernel.cuh @@ -91,16 +91,17 @@ namespace cg = cooperative_groups; //! @param od output data //! 
@param value //////////////////////////////////////////////////////////////////////////////// -__global__ void initValue(float *od, float value) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - // position of write into global memory - unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void initValue(float *od, float value) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + // position of write into global memory + unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; - od[index] = value; + od[index] = value; - // sync after each decomposition step - cg::sync(cta); + // sync after each decomposition step + cg::sync(cta); } //////////////////////////////////////////////////////////////////////////////// @@ -118,127 +119,130 @@ __global__ void initValue(float *od, float value) { //! global memory //! @param bdim block dimension //////////////////////////////////////////////////////////////////////////////// -__global__ void dwtHaar1D(float *id, float *od, float *approx_final, +__global__ void dwtHaar1D(float *id, + float *od, + float *approx_final, const unsigned int dlevels, const unsigned int slength_step_half, - const int bdim) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); + const int bdim) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - // shared memory for part of the signal - extern __shared__ float shared[]; + // shared memory for part of the signal + extern __shared__ float shared[]; - // thread runtime environment, 1D parametrization - const int gdim = gridDim.x; - // const int bdim = blockDim.x; - const int bid = blockIdx.x; - const int tid = threadIdx.x; + // thread runtime environment, 1D parametrization + const int gdim = gridDim.x; + // const int bdim = blockDim.x; + const int bid = blockIdx.x; + const int tid = threadIdx.x; - // global thread id (w.r.t. to total data set) - const int tid_global = (bid * bdim) + tid; - unsigned int idata = (bid * (2 * bdim)) + tid; + // global thread id (w.r.t. 
the total data set)
+    const int tid_global = (bid * bdim) + tid;
+    unsigned int idata = (bid * (2 * bdim)) + tid;

-  // read data from global memory
-  shared[tid] = id[idata];
-  shared[tid + bdim] = id[idata + bdim];
-  cg::sync(cta);
+    // read data from global memory
+    shared[tid] = id[idata];
+    shared[tid + bdim] = id[idata + bdim];
+    cg::sync(cta);

-  // this operation has a two way bank conflicts for all threads, this are two
-  // additional cycles for each warp -- all alternatives to avoid this bank
-  // conflict are more expensive than the one cycle introduced by serialization
-  float data0 = shared[2 * tid];
-  float data1 = shared[(2 * tid) + 1];
-  cg::sync(cta);
+    // this operation has a two-way bank conflict for all threads; these are two
+    // additional cycles for each warp -- all alternatives to avoid this bank
+    // conflict are more expensive than the one cycle introduced by serialization
+    float data0 = shared[2 * tid];
+    float data1 = shared[(2 * tid) + 1];
+    cg::sync(cta);

-  // detail coefficient, not further referenced so directly store in
-  // global memory
-  od[tid_global + slength_step_half] = (data0 - data1) * INV_SQRT_2;
+    // detail coefficient, not further referenced so directly store in
+    // global memory
+    od[tid_global + slength_step_half] = (data0 - data1) * INV_SQRT_2;

-  // offset to avoid bank conflicts
-  // see the scan example for a more detailed description
-  unsigned int atid = tid + (tid >> LOG_NUM_BANKS);
+    // offset to avoid bank conflicts
+    // see the scan example for a more detailed description
+    unsigned int atid = tid + (tid >> LOG_NUM_BANKS);

-  // approximation coefficient
-  // store in shared memory for further decomposition steps in this global step
-  shared[atid] = (data0 + data1) * INV_SQRT_2;
+    // approximation coefficient
+    // store in shared memory for further decomposition steps in this global step
+    shared[atid] = (data0 + data1) * INV_SQRT_2;

-  // all threads have to write approximation coefficient to shared memory before
-  // next steps can take place
-  cg::sync(cta);
+    // all threads have to write the approximation coefficient to shared memory
+    // before the next steps can take place
+    cg::sync(cta);

-  // early out if possible
-  // the compiler removes this part from the source because dlevels is
-  // a constant shader input
-  // note: syncthreads in bodies of branches can lead to dead-locks unless
-  // the condition evaluates the same way for ALL threads of a block, as in
-  // this case
-  if (dlevels > 1) {
-    // offset to second element in shared element which has to be used for the
-    // decomposition, effectively 2^(i - 1)
-    unsigned int offset_neighbor = 1;
-    // number of active threads per decomposition level
-    // identical to the offset for the detail coefficients
-    unsigned int num_threads = bdim >> 1;
+    // early out if possible
+    // the compiler removes this part from the source because dlevels is
+    // a constant shader input
+    // note: syncthreads in bodies of branches can lead to deadlocks unless
+    // the condition evaluates the same way for ALL threads of a block, as in
+    // this case
+    if (dlevels > 1) {
+        // offset to the second element in shared memory which has to be used for
+        // the decomposition, effectively 2^(i - 1)
+        unsigned int offset_neighbor = 1;
+        // number of active threads per decomposition level
+        // identical to the offset for the detail coefficients
+        unsigned int num_threads = bdim >> 1;

-    // index for the first element of the pair to process
-    // the representation is still compact (and therefore still tid * 2)
-    //
because the first step operated on registers and only the result has been - // written to shared memory - unsigned int idata0 = tid * 2; + // index for the first element of the pair to process + // the representation is still compact (and therefore still tid * 2) + // because the first step operated on registers and only the result has been + // written to shared memory + unsigned int idata0 = tid * 2; - // offset levels to make the loop more efficient - for (unsigned int i = 1; i < dlevels; ++i) { - // Non-coalesced writes occur if the number of active threads becomes - // less than 16 for a block because the start address for the first - // block is not always aligned with 64 byte which is necessary for - // coalesced access. However, the problem only occurs at high levels - // with only a small number of active threads so that the total number of - // non-coalesced access is rather small and does not justify the - // computations which are necessary to avoid these uncoalesced writes - // (this has been tested and verified) - if (tid < num_threads) { - // update stride, with each decomposition level the stride grows by a - // factor of 2 - unsigned int idata1 = idata0 + offset_neighbor; + // offset levels to make the loop more efficient + for (unsigned int i = 1; i < dlevels; ++i) { + // Non-coalesced writes occur if the number of active threads becomes + // less than 16 for a block because the start address for the first + // block is not always aligned with 64 byte which is necessary for + // coalesced access. However, the problem only occurs at high levels + // with only a small number of active threads so that the total number of + // non-coalesced access is rather small and does not justify the + // computations which are necessary to avoid these uncoalesced writes + // (this has been tested and verified) + if (tid < num_threads) { + // update stride, with each decomposition level the stride grows by a + // factor of 2 + unsigned int idata1 = idata0 + offset_neighbor; - // position of write into global memory - unsigned int g_wpos = (num_threads * gdim) + (bid * num_threads) + tid; + // position of write into global memory + unsigned int g_wpos = (num_threads * gdim) + (bid * num_threads) + tid; - // compute wavelet decomposition step + // compute wavelet decomposition step - // offset to avoid bank conflicts - unsigned int c_idata0 = idata0 + (idata0 >> LOG_NUM_BANKS); - unsigned int c_idata1 = idata1 + (idata1 >> LOG_NUM_BANKS); + // offset to avoid bank conflicts + unsigned int c_idata0 = idata0 + (idata0 >> LOG_NUM_BANKS); + unsigned int c_idata1 = idata1 + (idata1 >> LOG_NUM_BANKS); - // detail coefficient, not further modified so directly store - // in global memory - od[g_wpos] = (shared[c_idata0] - shared[c_idata1]) * INV_SQRT_2; + // detail coefficient, not further modified so directly store + // in global memory + od[g_wpos] = (shared[c_idata0] - shared[c_idata1]) * INV_SQRT_2; - // approximation coefficient - // note that the representation in shared memory becomes rather sparse - // (with a lot of holes inbetween) but the storing scheme in global - // memory guarantees that the common representation (approx, detail_0, - // detail_1, ...) 
- // is achieved - shared[c_idata0] = (shared[c_idata0] + shared[c_idata1]) * INV_SQRT_2; + // approximation coefficient + // note that the representation in shared memory becomes rather sparse + // (with a lot of holes inbetween) but the storing scheme in global + // memory guarantees that the common representation (approx, detail_0, + // detail_1, ...) + // is achieved + shared[c_idata0] = (shared[c_idata0] + shared[c_idata1]) * INV_SQRT_2; - // update storage offset for details - num_threads = num_threads >> 1; // div 2 - offset_neighbor <<= 1; // mul 2 - idata0 = idata0 << 1; // mul 2 - } + // update storage offset for details + num_threads = num_threads >> 1; // div 2 + offset_neighbor <<= 1; // mul 2 + idata0 = idata0 << 1; // mul 2 + } - // sync after each decomposition step - cg::sync(cta); - } + // sync after each decomposition step + cg::sync(cta); + } - // write the top most level element for the next decomposition steps - // which are performed after an interlock synchronization on host side - if (0 == tid) { - approx_final[bid] = shared[0]; - } + // write the top most level element for the next decomposition steps + // which are performed after an interlock synchronization on host side + if (0 == tid) { + approx_final[bid] = shared[0]; + } - } // end early out if possible + } // end early out if possible } -#endif // #ifndef _DWTHAAR1D_KERNEL_H_ +#endif // #ifndef _DWTHAAR1D_KERNEL_H_ diff --git a/Samples/5_Domain_Specific/dxtc/CudaMath.h b/Samples/5_Domain_Specific/dxtc/CudaMath.h index 58b41227..c1ae0ac7 100644 --- a/Samples/5_Domain_Specific/dxtc/CudaMath.h +++ b/Samples/5_Domain_Specific/dxtc/CudaMath.h @@ -36,71 +36,72 @@ namespace cg = cooperative_groups; // Use power method to find the first eigenvector. // https://en.wikipedia.org/wiki/Power_iteration -inline __device__ __host__ float3 firstEigenVector(float matrix[6]) { - // 8 iterations seems to be more than enough. +inline __device__ __host__ float3 firstEigenVector(float matrix[6]) +{ + // 8 iterations seems to be more than enough. - float3 v = make_float3(1.0f, 1.0f, 1.0f); + float3 v = make_float3(1.0f, 1.0f, 1.0f); - for (int i = 0; i < 8; i++) { - float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; - float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; - float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; - float m = max(max(x, y), z); - float iv = 1.0f / m; - v = make_float3(x * iv, y * iv, z * iv); - } - - return v; -} - -inline __device__ void colorSums(const float3 *colors, float3 *sums, - cg::thread_group tile) { - const int idx = threadIdx.x; - - sums[idx] = colors[idx]; - cg::sync(tile); - sums[idx] += sums[idx ^ 8]; - cg::sync(tile); - sums[idx] += sums[idx ^ 4]; - cg::sync(tile); - sums[idx] += sums[idx ^ 2]; - cg::sync(tile); - sums[idx] += sums[idx ^ 1]; -} - -inline __device__ float3 bestFitLine(const float3 *colors, float3 color_sum, - cg::thread_group tile) { - // Compute covariance matrix of the given colors. - const int idx = threadIdx.x; - - float3 diff = colors[idx] - color_sum * (1.0f / 16.0f); - - // @@ Eliminate two-way bank conflicts here. - // @@ It seems that doing that and unrolling the reduction doesn't help... 
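The firstEigenVector() routine above is a textbook power iteration: repeatedly multiply a start vector by the covariance matrix and renormalize, converging to the dominant eigenvector. A standalone C sketch of the same idea, illustration only and not part of the patch; the six-element array is the upper triangle of the symmetric 3x3 matrix, as in the device code, and normalization uses the largest component rather than the 2-norm since only the direction matters:

    #include <stdio.h>

    // Power iteration on a symmetric 3x3 matrix stored as its upper
    // triangle {m00, m01, m02, m11, m12, m22}.
    static void powerIteration(const float m[6], float v[3], int iters)
    {
        v[0] = v[1] = v[2] = 1.0f; // arbitrary nonzero start vector
        for (int i = 0; i < iters; i++) {
            float x = v[0] * m[0] + v[1] * m[1] + v[2] * m[2];
            float y = v[0] * m[1] + v[1] * m[3] + v[2] * m[4];
            float z = v[0] * m[2] + v[1] * m[4] + v[2] * m[5];
            float mx = x; // normalize by the largest component, as the kernel does
            if (y > mx) mx = y;
            if (z > mx) mx = z;
            v[0] = x / mx;
            v[1] = y / mx;
            v[2] = z / mx;
        }
    }

    int main(void)
    {
        const float cov[6] = {4.0f, 1.0f, 0.0f, 3.0f, 0.0f, 1.0f};
        float v[3];
        powerIteration(cov, v, 8); // 8 iterations, matching the kernel
        printf("%f %f %f\n", v[0], v[1], v[2]);
        return 0;
    }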
- __shared__ float covariance[16 * 6]; - - covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0 - covariance[6 * idx + 1] = diff.x * diff.y; - covariance[6 * idx + 2] = diff.x * diff.z; - covariance[6 * idx + 3] = diff.y * diff.y; - covariance[6 * idx + 4] = diff.y * diff.z; - covariance[6 * idx + 5] = diff.z * diff.z; - - cg::sync(tile); - for (int d = 8; d > 0; d >>= 1) { - if (idx < d) { - covariance[6 * idx + 0] += covariance[6 * (idx + d) + 0]; - covariance[6 * idx + 1] += covariance[6 * (idx + d) + 1]; - covariance[6 * idx + 2] += covariance[6 * (idx + d) + 2]; - covariance[6 * idx + 3] += covariance[6 * (idx + d) + 3]; - covariance[6 * idx + 4] += covariance[6 * (idx + d) + 4]; - covariance[6 * idx + 5] += covariance[6 * (idx + d) + 5]; + for (int i = 0; i < 8; i++) { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + float m = max(max(x, y), z); + float iv = 1.0f / m; + v = make_float3(x * iv, y * iv, z * iv); } - cg::sync(tile); - } - // Compute first eigen vector. - return firstEigenVector(covariance); + return v; } -#endif // CUDAMATH_H +inline __device__ void colorSums(const float3 *colors, float3 *sums, cg::thread_group tile) +{ + const int idx = threadIdx.x; + + sums[idx] = colors[idx]; + cg::sync(tile); + sums[idx] += sums[idx ^ 8]; + cg::sync(tile); + sums[idx] += sums[idx ^ 4]; + cg::sync(tile); + sums[idx] += sums[idx ^ 2]; + cg::sync(tile); + sums[idx] += sums[idx ^ 1]; +} + +inline __device__ float3 bestFitLine(const float3 *colors, float3 color_sum, cg::thread_group tile) +{ + // Compute covariance matrix of the given colors. + const int idx = threadIdx.x; + + float3 diff = colors[idx] - color_sum * (1.0f / 16.0f); + + // @@ Eliminate two-way bank conflicts here. + // @@ It seems that doing that and unrolling the reduction doesn't help... + __shared__ float covariance[16 * 6]; + + covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0 + covariance[6 * idx + 1] = diff.x * diff.y; + covariance[6 * idx + 2] = diff.x * diff.z; + covariance[6 * idx + 3] = diff.y * diff.y; + covariance[6 * idx + 4] = diff.y * diff.z; + covariance[6 * idx + 5] = diff.z * diff.z; + + cg::sync(tile); + for (int d = 8; d > 0; d >>= 1) { + if (idx < d) { + covariance[6 * idx + 0] += covariance[6 * (idx + d) + 0]; + covariance[6 * idx + 1] += covariance[6 * (idx + d) + 1]; + covariance[6 * idx + 2] += covariance[6 * (idx + d) + 2]; + covariance[6 * idx + 3] += covariance[6 * (idx + d) + 3]; + covariance[6 * idx + 4] += covariance[6 * (idx + d) + 4]; + covariance[6 * idx + 5] += covariance[6 * (idx + d) + 5]; + } + cg::sync(tile); + } + + // Compute first eigen vector. 
+    return firstEigenVector(covariance);
+}
+
+#endif // CUDAMATH_H
diff --git a/Samples/5_Domain_Specific/dxtc/dds.h b/Samples/5_Domain_Specific/dxtc/dds.h
index f433f2b9..af058be8 100644
--- a/Samples/5_Domain_Specific/dxtc/dds.h
+++ b/Samples/5_Domain_Specific/dxtc/dds.h
@@ -29,56 +29,58 @@
 #define DDS_H
 
 #if !defined(MAKEFOURCC)
-#define MAKEFOURCC(ch0, ch1, ch2, ch3)                \
-  ((unsigned int)(ch0) | ((unsigned int)(ch1) << 8) | \
-   ((unsigned int)(ch2) << 16) | ((unsigned int)(ch3) << 24))
+#define MAKEFOURCC(ch0, ch1, ch2, ch3)                                                                                 \
+    ((unsigned int)(ch0) | ((unsigned int)(ch1) << 8) | ((unsigned int)(ch2) << 16) | ((unsigned int)(ch3) << 24))
 #endif
 
-typedef unsigned int uint;
+typedef unsigned int   uint;
 typedef unsigned short ushort;
 
-struct DDSPixelFormat {
-  uint size;
-  uint flags;
-  uint fourcc;
-  uint bitcount;
-  uint rmask;
-  uint gmask;
-  uint bmask;
-  uint amask;
+struct DDSPixelFormat
+{
+    uint size;
+    uint flags;
+    uint fourcc;
+    uint bitcount;
+    uint rmask;
+    uint gmask;
+    uint bmask;
+    uint amask;
 };
 
-struct DDSCaps {
-  uint caps1;
-  uint caps2;
-  uint caps3;
-  uint caps4;
+struct DDSCaps
+{
+    uint caps1;
+    uint caps2;
+    uint caps3;
+    uint caps4;
 };
 
 /// DDS file header.
-struct DDSHeader {
-  uint fourcc;
-  uint size;
-  uint flags;
-  uint height;
-  uint width;
-  uint pitch;
-  uint depth;
-  uint mipmapcount;
-  uint reserved[11];
-  DDSPixelFormat pf;
-  DDSCaps caps;
-  uint notused;
+struct DDSHeader
+{
+    uint fourcc;
+    uint size;
+    uint flags;
+    uint height;
+    uint width;
+    uint pitch;
+    uint depth;
+    uint mipmapcount;
+    uint reserved[11];
+    DDSPixelFormat pf;
+    DDSCaps caps;
+    uint notused;
 };
 
-static const uint FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' ');
-static const uint FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1');
-static const uint DDSD_WIDTH = 0x00000004U;
-static const uint DDSD_HEIGHT = 0x00000002U;
-static const uint DDSD_CAPS = 0x00000001U;
+static const uint FOURCC_DDS        = MAKEFOURCC('D', 'D', 'S', ' ');
+static const uint FOURCC_DXT1       = MAKEFOURCC('D', 'X', 'T', '1');
+static const uint DDSD_WIDTH        = 0x00000004U;
+static const uint DDSD_HEIGHT       = 0x00000002U;
+static const uint DDSD_CAPS         = 0x00000001U;
 static const uint DDSD_PIXELFORMAT = 0x00001000U;
-static const uint DDSCAPS_TEXTURE = 0x00001000U;
-static const uint DDPF_FOURCC = 0x00000004U;
-static const uint DDSD_LINEARSIZE = 0x00080000U;
+static const uint DDSCAPS_TEXTURE  = 0x00001000U;
+static const uint DDPF_FOURCC      = 0x00000004U;
+static const uint DDSD_LINEARSIZE  = 0x00080000U;
 
-#endif  // DDS_H
+#endif // DDS_H
diff --git a/Samples/5_Domain_Specific/dxtc/dxtc.cu b/Samples/5_Domain_Specific/dxtc/dxtc.cu
index 36baa737..546506b2 100644
--- a/Samples/5_Domain_Specific/dxtc/dxtc.cu
+++ b/Samples/5_Domain_Specific/dxtc/dxtc.cu
@@ -30,31 +30,30 @@
 namespace cg = cooperative_groups;
 
-#include <helper_functions.h>
+#include <float.h> // for FLT_MAX
 #include <helper_cuda.h>
-
+#include <helper_functions.h>
 #include <helper_math.h>
-#include <float.h>  // for FLT_MAX
 
 #include "CudaMath.h"
 #include "dds.h"
 #include "permutations.h"
 
 // Definitions
-#define INPUT_IMAGE "teapot512_std.ppm"
+#define INPUT_IMAGE     "teapot512_std.ppm"
 #define REFERENCE_IMAGE "teapot512_ref.dds"
 
 #define ERROR_THRESHOLD 0.02f
 
-#define NUM_THREADS 64  // Number of threads per block.
+#define NUM_THREADS 64 // Number of threads per block.
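For orientation, the BlockDXT1 struct that dds.h models (and that the compress kernel below emits via a uint2) is 64 bits per 4x4 tile: two RGB565 endpoint colors followed by sixteen 2-bit palette indices. A host-side packing sketch, illustration only and not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    // Pack one DXT1 block: col0/col1 are RGB565 endpoints, idx[i] selects
    // one of four palette entries for texel i. The endpoints sit in the low
    // 32 bits, the selectors in the high 32 bits, matching how the kernel
    // writes result[bid].x and result[bid].y.
    static uint64_t packDXT1(uint16_t col0, uint16_t col1, const int idx[16])
    {
        uint32_t indices = 0;
        for (int i = 0; i < 16; i++) {
            indices |= (uint32_t)(idx[i] & 3) << (2 * i); // 2 bits per texel
        }
        return (uint64_t)indices << 32 | (uint32_t)col1 << 16 | col0;
    }

    int main(void)
    {
        int idx[16] = {0};
        idx[5] = 3; // texel 5 uses palette entry 3
        printf("%016llx\n", (unsigned long long)packDXT1(0xF800, 0x001F, idx));
        return 0;
    }

Whether col0 compares greater than col1 also selects the decode mode: four interpolated colors when col0 > col1, otherwise three colors plus transparent black, as the validation code further down in this file shows.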
#define __debugsync()
 
-template <class T>
-__device__ inline void swap(T &a, T &b) {
-  T tmp = a;
-  a = b;
-  b = tmp;
+template <class T> __device__ inline void swap(T &a, T &b)
+{
+    T tmp = a;
+    a = b;
+    b = tmp;
 }
 
 //__constant__ float3 kColorMetric = { 0.2126f, 0.7152f, 0.0722f };
@@ -63,399 +62,400 @@ __constant__ float3 kColorMetric = {1.0f, 1.0f, 1.0f};
 ////////////////////////////////////////////////////////////////////////////////
 // Sort colors
 ////////////////////////////////////////////////////////////////////////////////
-__device__ void sortColors(const float *values, int *ranks,
-                           cg::thread_group tile) {
-  const int tid = threadIdx.x;
+__device__ void sortColors(const float *values, int *ranks, cg::thread_group tile)
+{
+    const int tid = threadIdx.x;
 
-  int rank = 0;
+    int rank = 0;
 
 #pragma unroll
 
-  for (int i = 0; i < 16; i++) {
-    rank += (values[i] < values[tid]);
-  }
-
-  ranks[tid] = rank;
-
-  cg::sync(tile);
-
-  // Resolve elements with the same index.
-  for (int i = 0; i < 15; i++) {
-    if (tid > i && ranks[tid] == ranks[i]) {
-      ++ranks[tid];
+    for (int i = 0; i < 16; i++) {
+        rank += (values[i] < values[tid]);
    }
+
+    ranks[tid] = rank;
+
     cg::sync(tile);
-  }
+
+    // Resolve elements with the same index.
+    for (int i = 0; i < 15; i++) {
+        if (tid > i && ranks[tid] == ranks[i]) {
+            ++ranks[tid];
+        }
+        cg::sync(tile);
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////
-__device__ void loadColorBlock(const uint *image, float3 colors[16],
-                               float3 sums[16], int xrefs[16], int blockOffset,
-                               cg::thread_block cta) {
-  const int bid = blockIdx.x + blockOffset;
-  const int idx = threadIdx.x;
+__device__ void loadColorBlock(const uint *image,
+                               float3 colors[16],
+                               float3 sums[16],
+                               int xrefs[16],
+                               int blockOffset,
+                               cg::thread_block cta)
+{
+    const int bid = blockIdx.x + blockOffset;
+    const int idx = threadIdx.x;
 
-  __shared__ float dps[16];
+    __shared__ float dps[16];
 
-  float3 tmp;
+    float3 tmp;
 
-  cg::thread_group tile = cg::tiled_partition(cta, 16);
+    cg::thread_group tile = cg::tiled_partition(cta, 16);
 
-  if (idx < 16) {
-    // Read color and copy to shared mem.
-    uint c = image[(bid)*16 + idx];
+    if (idx < 16) {
+        // Read color and copy to shared mem.
+        uint c = image[(bid) * 16 + idx];
 
-    colors[idx].x = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
-    colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
-    colors[idx].z = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
+        colors[idx].x = ((c >> 0) & 0xFF) * (1.0f / 255.0f);
+        colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
+        colors[idx].z = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
 
-    cg::sync(tile);
-    // Sort colors along the best fit line.
-    colorSums(colors, sums, tile);
+        cg::sync(tile);
+        // Sort colors along the best fit line.
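The sortColors() routine above is a small rank sort: each of the 16 threads counts how many values are smaller than its own, then equal ranks are disambiguated by thread index so the resulting permutation (xrefs) is a bijection. The same logic in serial C, for illustration only:

    #include <stdio.h>

    // Rank sort: ranks[t] = number of values smaller than values[t],
    // with ties broken in index order so every rank is unique.
    static void rankSort(const float *values, int *ranks, int n)
    {
        for (int t = 0; t < n; t++) {
            int rank = 0;
            for (int i = 0; i < n; i++) {
                rank += (values[i] < values[t]);
            }
            ranks[t] = rank;
        }
        // Resolve duplicates: a later element with an equal rank moves up.
        for (int i = 0; i < n; i++) {
            for (int t = i + 1; t < n; t++) {
                if (ranks[t] == ranks[i]) {
                    ++ranks[t];
                }
            }
        }
    }

    int main(void)
    {
        float dots[4] = {0.3f, 0.1f, 0.3f, 0.0f};
        int xrefs[4];
        rankSort(dots, xrefs, 4);
        printf("%d %d %d %d\n", xrefs[0], xrefs[1], xrefs[2], xrefs[3]); // 2 1 3 0
        return 0;
    }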
+ colorSums(colors, sums, tile); - cg::sync(tile); + cg::sync(tile); - float3 axis = bestFitLine(colors, sums[0], tile); + float3 axis = bestFitLine(colors, sums[0], tile); - cg::sync(tile); + cg::sync(tile); - dps[idx] = dot(colors[idx], axis); + dps[idx] = dot(colors[idx], axis); - cg::sync(tile); + cg::sync(tile); - sortColors(dps, xrefs, tile); + sortColors(dps, xrefs, tile); - cg::sync(tile); + cg::sync(tile); - tmp = colors[idx]; + tmp = colors[idx]; - cg::sync(tile); + cg::sync(tile); - colors[xrefs[idx]] = tmp; - } + colors[xrefs[idx]] = tmp; + } } //////////////////////////////////////////////////////////////////////////////// // Round color to RGB565 and expand //////////////////////////////////////////////////////////////////////////////// -inline __device__ float3 roundAndExpand(float3 v, ushort *w) { - v.x = rintf(__saturatef(v.x) * 31.0f); - v.y = rintf(__saturatef(v.y) * 63.0f); - v.z = rintf(__saturatef(v.z) * 31.0f); +inline __device__ float3 roundAndExpand(float3 v, ushort *w) +{ + v.x = rintf(__saturatef(v.x) * 31.0f); + v.y = rintf(__saturatef(v.y) * 63.0f); + v.z = rintf(__saturatef(v.z) * 31.0f); - *w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z; - v.x *= 0.03227752766457f; // approximate integer bit expansion. - v.y *= 0.01583151765563f; - v.z *= 0.03227752766457f; - return v; + *w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z; + v.x *= 0.03227752766457f; // approximate integer bit expansion. + v.y *= 0.01583151765563f; + v.z *= 0.03227752766457f; + return v; } -__constant__ float alphaTable4[4] = {9.0f, 0.0f, 6.0f, 3.0f}; -__constant__ float alphaTable3[4] = {4.0f, 0.0f, 2.0f, 2.0f}; -__constant__ const int prods4[4] = {0x090000, 0x000900, 0x040102, 0x010402}; -__constant__ const int prods3[4] = {0x040000, 0x000400, 0x040101, 0x010401}; +__constant__ float alphaTable4[4] = {9.0f, 0.0f, 6.0f, 3.0f}; +__constant__ float alphaTable3[4] = {4.0f, 0.0f, 2.0f, 2.0f}; +__constant__ const int prods4[4] = {0x090000, 0x000900, 0x040102, 0x010402}; +__constant__ const int prods3[4] = {0x040000, 0x000400, 0x040101, 0x010401}; #define USE_TABLES 1 //////////////////////////////////////////////////////////////////////////////// // Evaluate permutations //////////////////////////////////////////////////////////////////////////////// -static __device__ float evalPermutation4(const float3 *colors, uint permutation, - ushort *start, ushort *end, - float3 color_sum) { +static __device__ float +evalPermutation4(const float3 *colors, uint permutation, ushort *start, ushort *end, float3 color_sum) +{ // Compute endpoints using least squares. #if USE_TABLES - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - int akku = 0; + int akku = 0; - // Compute alpha & beta for this permutation. - for (int i = 0; i < 16; i++) { - const uint bits = permutation >> (2 * i); + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) { + const uint bits = permutation >> (2 * i); - alphax_sum += alphaTable4[bits & 3] * colors[i]; - akku += prods4[bits & 3]; - } - - float alpha2_sum = float(akku >> 16); - float beta2_sum = float((akku >> 8) & 0xff); - float alphabeta_sum = float((akku >> 0) & 0xff); - float3 betax_sum = (9.0f * color_sum) - alphax_sum; -#else - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - - // Compute alpha & beta for this permutation. 
- for (int i = 0; i < 16; i++) { - const uint bits = permutation >> (2 * i); - - float beta = (bits & 1); - - if (bits & 2) { - beta = (1 + beta) * (1.0f / 3.0f); + alphax_sum += alphaTable4[bits & 3] * colors[i]; + akku += prods4[bits & 3]; } - float alpha = 1.0f - beta; + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float((akku >> 0) & 0xff); + float3 betax_sum = (9.0f * color_sum) - alphax_sum; +#else + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i]; - } + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) { + const uint bits = permutation >> (2 * i); - float3 betax_sum = color_sum - alphax_sum; + float beta = (bits & 1); + + if (bits & 2) { + beta = (1 + beta) * (1.0f / 3.0f); + } + + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + } + + float3 betax_sum = color_sum - alphax_sum; #endif - // alpha2, beta2, alphabeta and factor could be precomputed for each - // permutation, but it's faster to recompute them. - const float factor = - 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + // alpha2, beta2, alphabeta and factor could be precomputed for each + // permutation, but it's faster to recompute them. + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand(a, start); - b = roundAndExpand(b, end); + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand(a, start); + b = roundAndExpand(b, end); - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + - 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - return (0.111111111111f) * dot(e, kColorMetric); + return (0.111111111111f) * dot(e, kColorMetric); } -static __device__ float evalPermutation3(const float3 *colors, uint permutation, - ushort *start, ushort *end, - float3 color_sum) { +static __device__ float +evalPermutation3(const float3 *colors, uint permutation, ushort *start, ushort *end, float3 color_sum) +{ // Compute endpoints using least squares. #if USE_TABLES - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - int akku = 0; + int akku = 0; - // Compute alpha & beta for this permutation. - for (int i = 0; i < 16; i++) { - const uint bits = permutation >> (2 * i); + // Compute alpha & beta for this permutation. 
+ for (int i = 0; i < 16; i++) { + const uint bits = permutation >> (2 * i); - alphax_sum += alphaTable3[bits & 3] * colors[i]; - akku += prods3[bits & 3]; - } - - float alpha2_sum = float(akku >> 16); - float beta2_sum = float((akku >> 8) & 0xff); - float alphabeta_sum = float((akku >> 0) & 0xff); - float3 betax_sum = (4.0f * color_sum) - alphax_sum; -#else - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - - // Compute alpha & beta for this permutation. - for (int i = 0; i < 16; i++) { - const uint bits = permutation >> (2 * i); - - float beta = (bits & 1); - - if (bits & 2) { - beta = 0.5f; + alphax_sum += alphaTable3[bits & 3] * colors[i]; + akku += prods3[bits & 3]; } - float alpha = 1.0f - beta; + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float((akku >> 0) & 0xff); + float3 betax_sum = (4.0f * color_sum) - alphax_sum; +#else + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i]; - } + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) { + const uint bits = permutation >> (2 * i); - float3 betax_sum = color_sum - alphax_sum; + float beta = (bits & 1); + + if (bits & 2) { + beta = 0.5f; + } + + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + } + + float3 betax_sum = color_sum - alphax_sum; #endif - const float factor = - 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand(a, start); - b = roundAndExpand(b, end); + // Round a, b to the closest 5-6-5 color and expand... 
+ a = roundAndExpand(a, start); + b = roundAndExpand(b, end); - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + - 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - return (0.25f) * dot(e, kColorMetric); + return (0.25f) * dot(e, kColorMetric); } -__device__ void evalAllPermutations(const float3 *colors, - const uint *permutations, ushort &bestStart, - ushort &bestEnd, uint &bestPermutation, - float *errors, float3 color_sum, - cg::thread_block cta) { - const int idx = threadIdx.x; +__device__ void evalAllPermutations(const float3 *colors, + const uint *permutations, + ushort &bestStart, + ushort &bestEnd, + uint &bestPermutation, + float *errors, + float3 color_sum, + cg::thread_block cta) +{ + const int idx = threadIdx.x; - float bestError = FLT_MAX; + float bestError = FLT_MAX; - __shared__ uint s_permutations[160]; + __shared__ uint s_permutations[160]; - for (int i = 0; i < 16; i++) { - int pidx = idx + NUM_THREADS * i; + for (int i = 0; i < 16; i++) { + int pidx = idx + NUM_THREADS * i; - if (pidx >= 992) { - break; + if (pidx >= 992) { + break; + } + + ushort start, end; + uint permutation = permutations[pidx]; + + if (pidx < 160) { + s_permutations[pidx] = permutation; + } + + float error = evalPermutation4(colors, permutation, &start, &end, color_sum); + + if (error < bestError) { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } } - ushort start, end; - uint permutation = permutations[pidx]; - - if (pidx < 160) { - s_permutations[pidx] = permutation; - } - - float error = - evalPermutation4(colors, permutation, &start, &end, color_sum); - - if (error < bestError) { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - } - } - - if (bestStart < bestEnd) { - swap(bestEnd, bestStart); - bestPermutation ^= 0x55555555; // Flip indices. - } - - cg::sync(cta); // Sync here to ensure s_permutations is valid going forward - - for (int i = 0; i < 3; i++) { - int pidx = idx + NUM_THREADS * i; - - if (pidx >= 160) { - break; - } - - ushort start, end; - uint permutation = s_permutations[pidx]; - float error = - evalPermutation3(colors, permutation, &start, &end, color_sum); - - if (error < bestError) { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - - if (bestStart > bestEnd) { + if (bestStart < bestEnd) { swap(bestEnd, bestStart); - bestPermutation ^= - (~bestPermutation >> 1) & 0x55555555; // Flip indices. - } + bestPermutation ^= 0x55555555; // Flip indices. } - } - errors[idx] = bestError; + cg::sync(cta); // Sync here to ensure s_permutations is valid going forward + + for (int i = 0; i < 3; i++) { + int pidx = idx + NUM_THREADS * i; + + if (pidx >= 160) { + break; + } + + ushort start, end; + uint permutation = s_permutations[pidx]; + float error = evalPermutation3(colors, permutation, &start, &end, color_sum); + + if (error < bestError) { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + + if (bestStart > bestEnd) { + swap(bestEnd, bestStart); + bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices. 
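The two "Flip indices" lines in evalAllPermutations keep the encoding canonical when the endpoints are swapped: XOR with 0x55555555 swaps selector 0<->1 and 2<->3 (four-color mode), while XOR with ((~p >> 1) & 0x55555555) swaps only 0<->1 and leaves the midpoint code 2 fixed (three-color mode). A quick host-side check of both maps, illustration only:

    #include <stdio.h>

    // Four-color remap: flips the low bit of every 2-bit selector pairwise.
    static unsigned flip4(unsigned p) { return p ^ 0x55555555u; }

    // Three-color remap: flips the low bit only where the high bit of the
    // selector is 0, so codes 0 and 1 swap and code 2 stays put.
    static unsigned flip3(unsigned p) { return p ^ ((~p >> 1) & 0x55555555u); }

    int main(void)
    {
        for (unsigned code = 0; code < 4; code++) {
            printf("4-color: %u -> %u   3-color: %u -> %u\n",
                   code, flip4(code) & 3, code, flip3(code) & 3);
        }
        return 0; // prints 0->1,1->0,2->3,3->2 and 0->1,1->0,2->2,3->3
    }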
+ } + } + } + + errors[idx] = bestError; } //////////////////////////////////////////////////////////////////////////////// // Find index with minimum error //////////////////////////////////////////////////////////////////////////////// -__device__ int findMinError(float *errors, cg::thread_block cta) { - const int idx = threadIdx.x; - __shared__ int indices[NUM_THREADS]; - indices[idx] = idx; - - cg::sync(cta); - - for (int d = NUM_THREADS / 2; d > 0; d >>= 1) { - float err0 = errors[idx]; - float err1 = (idx + d) < NUM_THREADS ? errors[idx + d] : FLT_MAX; - int index1 = (idx + d) < NUM_THREADS ? indices[idx + d] : 0; +__device__ int findMinError(float *errors, cg::thread_block cta) +{ + const int idx = threadIdx.x; + __shared__ int indices[NUM_THREADS]; + indices[idx] = idx; cg::sync(cta); - if (err1 < err0) { - errors[idx] = err1; - indices[idx] = index1; + for (int d = NUM_THREADS / 2; d > 0; d >>= 1) { + float err0 = errors[idx]; + float err1 = (idx + d) < NUM_THREADS ? errors[idx + d] : FLT_MAX; + int index1 = (idx + d) < NUM_THREADS ? indices[idx + d] : 0; + + cg::sync(cta); + + if (err1 < err0) { + errors[idx] = err1; + indices[idx] = index1; + } + + cg::sync(cta); } - cg::sync(cta); - } - - return indices[0]; + return indices[0]; } //////////////////////////////////////////////////////////////////////////////// // Save DXT block //////////////////////////////////////////////////////////////////////////////// -__device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, - int xrefs[16], uint2 *result, int blockOffset) { - const int bid = blockIdx.x + blockOffset; +__device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 *result, int blockOffset) +{ + const int bid = blockIdx.x + blockOffset; - if (start == end) { - permutation = 0; - } + if (start == end) { + permutation = 0; + } - // Reorder permutation. - uint indices = 0; + // Reorder permutation. + uint indices = 0; - for (int i = 0; i < 16; i++) { - int ref = xrefs[i]; - indices |= ((permutation >> (2 * ref)) & 3) << (2 * i); - } + for (int i = 0; i < 16; i++) { + int ref = xrefs[i]; + indices |= ((permutation >> (2 * ref)) & 3) << (2 * i); + } - // Write endpoints. - result[bid].x = (end << 16) | start; + // Write endpoints. + result[bid].x = (end << 16) | start; - // Write palette indices. - result[bid].y = indices; + // Write palette indices. 
+ result[bid].y = indices; } //////////////////////////////////////////////////////////////////////////////// // Compress color block //////////////////////////////////////////////////////////////////////////////// -__global__ void compress(const uint *permutations, const uint *image, - uint2 *result, int blockOffset) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void compress(const uint *permutations, const uint *image, uint2 *result, int blockOffset) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - const int idx = threadIdx.x; + const int idx = threadIdx.x; - __shared__ float3 colors[16]; - __shared__ float3 sums[16]; - __shared__ int xrefs[16]; + __shared__ float3 colors[16]; + __shared__ float3 sums[16]; + __shared__ int xrefs[16]; - loadColorBlock(image, colors, sums, xrefs, blockOffset, cta); + loadColorBlock(image, colors, sums, xrefs, blockOffset, cta); - cg::sync(cta); + cg::sync(cta); - ushort bestStart, bestEnd; - uint bestPermutation; + ushort bestStart, bestEnd; + uint bestPermutation; - __shared__ float errors[NUM_THREADS]; + __shared__ float errors[NUM_THREADS]; - evalAllPermutations(colors, permutations, bestStart, bestEnd, bestPermutation, - errors, sums[0], cta); + evalAllPermutations(colors, permutations, bestStart, bestEnd, bestPermutation, errors, sums[0], cta); - // Use a parallel reduction to find minimum error. - const int minIdx = findMinError(errors, cta); + // Use a parallel reduction to find minimum error. + const int minIdx = findMinError(errors, cta); - cg::sync(cta); + cg::sync(cta); - // Only write the result of the winner thread. - if (idx == minIdx) { - saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result, - blockOffset); - } + // Only write the result of the winner thread. + if (idx == minIdx) { + saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result, blockOffset); + } } // Helper structs and functions to validate the output of the compressor. @@ -463,324 +463,330 @@ __global__ void compress(const uint *permutations, const uint *image, // different // results for different targets due to floating point arithmetic. -union Color32 { - struct { - unsigned char b, g, r, a; - }; - unsigned int u; +union Color32 +{ + struct + { + unsigned char b, g, r, a; + }; + unsigned int u; }; -union Color16 { - struct { - unsigned short b : 5; - unsigned short g : 6; - unsigned short r : 5; - }; - unsigned short u; +union Color16 +{ + struct + { + unsigned short b : 5; + unsigned short g : 6; + unsigned short r : 5; + }; + unsigned short u; }; -struct BlockDXT1 { - Color16 col0; - Color16 col1; - union { - unsigned char row[4]; - unsigned int indices; - }; +struct BlockDXT1 +{ + Color16 col0; + Color16 col1; + union + { + unsigned char row[4]; + unsigned int indices; + }; - void decompress(Color32 colors[16]) const; + void decompress(Color32 colors[16]) const; }; -void BlockDXT1::decompress(Color32 *colors) const { - Color32 palette[4]; +void BlockDXT1::decompress(Color32 *colors) const +{ + Color32 palette[4]; - // Does bit expansion before interpolation. - palette[0].b = (col0.b << 3) | (col0.b >> 2); - palette[0].g = (col0.g << 2) | (col0.g >> 4); - palette[0].r = (col0.r << 3) | (col0.r >> 2); - palette[0].a = 0xFF; + // Does bit expansion before interpolation. 
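The "bit expansion" in decompress() below widens each 5- or 6-bit channel by replicating its top bits into the vacated low bits. Unlike a plain left shift (which tops out at 248 or 252), this maps the range endpoints exactly: 0 -> 0, 31 -> 255, 63 -> 255. Isolated for illustration only:

    #include <stdio.h>

    // RGB565 -> RGB888 channel expansion by high-bit replication.
    static int expand5(int c) { return ((c << 3) | (c >> 2)) & 0xFF; }
    static int expand6(int c) { return ((c << 2) | (c >> 4)) & 0xFF; }

    int main(void)
    {
        printf("5-bit: 0 -> %d, 31 -> %d\n", expand5(0), expand5(31)); // 0, 255
        printf("6-bit: 0 -> %d, 63 -> %d\n", expand6(0), expand6(63)); // 0, 255
        return 0;
    }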
+ palette[0].b = (col0.b << 3) | (col0.b >> 2); + palette[0].g = (col0.g << 2) | (col0.g >> 4); + palette[0].r = (col0.r << 3) | (col0.r >> 2); + palette[0].a = 0xFF; - palette[1].r = (col1.r << 3) | (col1.r >> 2); - palette[1].g = (col1.g << 2) | (col1.g >> 4); - palette[1].b = (col1.b << 3) | (col1.b >> 2); - palette[1].a = 0xFF; + palette[1].r = (col1.r << 3) | (col1.r >> 2); + palette[1].g = (col1.g << 2) | (col1.g >> 4); + palette[1].b = (col1.b << 3) | (col1.b >> 2); + palette[1].a = 0xFF; - if (col0.u > col1.u) { - // Four-color block: derive the other two colors. - palette[2].r = (2 * palette[0].r + palette[1].r) / 3; - palette[2].g = (2 * palette[0].g + palette[1].g) / 3; - palette[2].b = (2 * palette[0].b + palette[1].b) / 3; - palette[2].a = 0xFF; + if (col0.u > col1.u) { + // Four-color block: derive the other two colors. + palette[2].r = (2 * palette[0].r + palette[1].r) / 3; + palette[2].g = (2 * palette[0].g + palette[1].g) / 3; + palette[2].b = (2 * palette[0].b + palette[1].b) / 3; + palette[2].a = 0xFF; - palette[3].r = (2 * palette[1].r + palette[0].r) / 3; - palette[3].g = (2 * palette[1].g + palette[0].g) / 3; - palette[3].b = (2 * palette[1].b + palette[0].b) / 3; - palette[3].a = 0xFF; - } else { - // Three-color block: derive the other color. - palette[2].r = (palette[0].r + palette[1].r) / 2; - palette[2].g = (palette[0].g + palette[1].g) / 2; - palette[2].b = (palette[0].b + palette[1].b) / 2; - palette[2].a = 0xFF; + palette[3].r = (2 * palette[1].r + palette[0].r) / 3; + palette[3].g = (2 * palette[1].g + palette[0].g) / 3; + palette[3].b = (2 * palette[1].b + palette[0].b) / 3; + palette[3].a = 0xFF; + } + else { + // Three-color block: derive the other color. + palette[2].r = (palette[0].r + palette[1].r) / 2; + palette[2].g = (palette[0].g + palette[1].g) / 2; + palette[2].b = (palette[0].b + palette[1].b) / 2; + palette[2].a = 0xFF; - palette[3].r = 0x00; - palette[3].g = 0x00; - palette[3].b = 0x00; - palette[3].a = 0x00; - } + palette[3].r = 0x00; + palette[3].g = 0x00; + palette[3].b = 0x00; + palette[3].a = 0x00; + } - for (int i = 0; i < 16; i++) { - colors[i] = palette[(indices >> (2 * i)) & 0x3]; - } + for (int i = 0; i < 16; i++) { + colors[i] = palette[(indices >> (2 * i)) & 0x3]; + } } -static int compareColors(const Color32 *b0, const Color32 *b1) { - int sum = 0; +static int compareColors(const Color32 *b0, const Color32 *b1) +{ + int sum = 0; - for (int i = 0; i < 16; i++) { - int r = (b0[i].r - b1[i].r); - int g = (b0[i].g - b1[i].g); - int b = (b0[i].b - b1[i].b); - sum += r * r + g * g + b * b; - } + for (int i = 0; i < 16; i++) { + int r = (b0[i].r - b1[i].r); + int g = (b0[i].g - b1[i].g); + int b = (b0[i].b - b1[i].b); + sum += r * r + g * g + b * b; + } - return sum; + return sum; } -static int compareBlock(const BlockDXT1 *b0, const BlockDXT1 *b1) { - Color32 colors0[16]; - Color32 colors1[16]; +static int compareBlock(const BlockDXT1 *b0, const BlockDXT1 *b1) +{ + Color32 colors0[16]; + Color32 colors1[16]; - if (memcmp(b0, b1, sizeof(BlockDXT1)) == 0) { - return 0; - } else { - b0->decompress(colors0); - b1->decompress(colors1); + if (memcmp(b0, b1, sizeof(BlockDXT1)) == 0) { + return 0; + } + else { + b0->decompress(colors0); + b1->decompress(colors1); - return compareColors(colors0, colors1); - } + return compareColors(colors0, colors1); + } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int 
argc, char **argv) { - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - // Load input image. - unsigned char *data = NULL; - uint W, H; + // Load input image. + unsigned char *data = NULL; + uint W, H; - char *image_path = sdkFindFilePath(INPUT_IMAGE, argv[0]); + char *image_path = sdkFindFilePath(INPUT_IMAGE, argv[0]); - if (image_path == 0) { - printf("Error, unable to find source image <%s>\n", image_path); - exit(EXIT_FAILURE); - } - - if (!sdkLoadPPM4ub(image_path, &data, &W, &H)) { - printf("Error, unable to open source image file <%s>\n", image_path); - - exit(EXIT_FAILURE); - } - - uint w = W, h = H; - - printf("Image Loaded '%s', %d x %d pixels\n\n", image_path, w, h); - - // Allocate input image. - const uint memSize = w * h * 4; - assert(0 != memSize); - uint *block_image = (uint *)malloc(memSize); - - // Convert linear image to block linear. - for (uint by = 0; by < h / 4; by++) { - for (uint bx = 0; bx < w / 4; bx++) { - for (int i = 0; i < 16; i++) { - const int x = i & 3; - const int y = i / 4; - block_image[(by * w / 4 + bx) * 16 + i] = - ((uint *)data)[(by * 4 + y) * 4 * (W / 4) + bx * 4 + x]; - } - } - } - - // copy into global mem - uint *d_data = NULL; - checkCudaErrors(cudaMalloc((void **)&d_data, memSize)); - - // Result - uint *d_result = NULL; - const uint compressedSize = (w / 4) * (h / 4) * 8; - checkCudaErrors(cudaMalloc((void **)&d_result, compressedSize)); - uint *h_result = (uint *)malloc(compressedSize); - - // Compute permutations. - uint permutations[1024]; - computePermutations(permutations); - - // Copy permutations host to devie. 
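The nested loops in main() above regroup the row-major input so that each 4x4 tile occupies 16 consecutive words, which lets one compress() block fetch its texels with consecutive indices. The same shuffle in isolation, assuming width and height are multiples of 4 as with the teapot512 input; illustration only, not part of the patch:

    #include <stdlib.h>

    // Linear -> block-linear: texel (x, y) of tile (bx, by) moves to
    // (by * (w / 4) + bx) * 16 + (y * 4 + x).
    static void toBlockLinear(const unsigned int *src, unsigned int *dst,
                              unsigned int w, unsigned int h)
    {
        for (unsigned int by = 0; by < h / 4; by++)
            for (unsigned int bx = 0; bx < w / 4; bx++)
                for (int i = 0; i < 16; i++) {
                    const int x = i & 3; // column within the 4x4 tile
                    const int y = i / 4; // row within the 4x4 tile
                    dst[(by * w / 4 + bx) * 16 + i] = src[(by * 4 + y) * w + bx * 4 + x];
                }
    }

    int main(void)
    {
        enum { W = 8, H = 8 };
        unsigned int src[W * H], dst[W * H];
        for (int i = 0; i < W * H; i++)
            src[i] = (unsigned int)i;
        toBlockLinear(src, dst, W, H);
        // Tile (1,0) starts at texel (4,0), i.e. source word 4.
        return dst[16] == 4 ? EXIT_SUCCESS : EXIT_FAILURE;
    }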
- uint *d_permutations = NULL; - checkCudaErrors(cudaMalloc((void **)&d_permutations, 1024 * sizeof(uint))); - checkCudaErrors(cudaMemcpy(d_permutations, permutations, 1024 * sizeof(uint), - cudaMemcpyHostToDevice)); - - // create a timer - StopWatchInterface *timer = NULL; - sdkCreateTimer(&timer); - - // Copy image from host to device - checkCudaErrors( - cudaMemcpy(d_data, block_image, memSize, cudaMemcpyHostToDevice)); - - // Determine launch configuration and run timed computation numIterations - // times - uint blocks = ((w + 3) / 4) * - ((h + 3) / 4); // rounds up by 1 block in each dim if %4 != 0 - - int devID; - cudaDeviceProp deviceProp; - - // get number of SMs on this GPU - checkCudaErrors(cudaGetDevice(&devID)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - // Restrict the numbers of blocks to launch on low end GPUs to avoid kernel - // timeout - int blocksPerLaunch = min(blocks, 768 * deviceProp.multiProcessorCount); - - printf("Running DXT Compression on %u x %u image...\n", w, h); - printf("\n%u Blocks, %u Threads per Block, %u Threads in Grid...\n\n", blocks, - NUM_THREADS, blocks * NUM_THREADS); - int numIterations = 1; - - for (int i = -1; i < numIterations; ++i) { - if (i == 0) { - checkCudaErrors(cudaDeviceSynchronize()); - sdkStartTimer(&timer); + if (image_path == 0) { + printf("Error, unable to find source image <%s>\n", image_path); + exit(EXIT_FAILURE); } - for (int j = 0; j < (int)blocks; j += blocksPerLaunch) { - compress<<<min(blocksPerLaunch, blocks - j), NUM_THREADS>>>( - d_permutations, d_data, (uint2 *)d_result, j); + if (!sdkLoadPPM4ub(image_path, &data, &W, &H)) { + printf("Error, unable to open source image file <%s>\n", image_path); + + exit(EXIT_FAILURE); } - } - getLastCudaError("compress"); + uint w = W, h = H; - // sync to host, stop timer, record perf - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer); - double dAvgTime = 1.0e-3 * sdkGetTimerValue(&timer) / (double)numIterations; - printf( - "dxtc, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, " - "NumDevsUsed = %i, Workgroup = %d\n", - (1.0e-6 * (double)(W * H) / dAvgTime), dAvgTime, (W * H), 1, NUM_THREADS); + printf("Image Loaded '%s', %d x %d pixels\n\n", image_path, w, h); - // copy result data from device to host - checkCudaErrors( - cudaMemcpy(h_result, d_result, compressedSize, cudaMemcpyDeviceToHost)); + // Allocate input image.
+ const uint memSize = w * h * 4; + assert(0 != memSize); + uint *block_image = (uint *)malloc(memSize); - // Write out result data to DDS file - char output_filename[1024]; - strcpy(output_filename, image_path); - strcpy(output_filename + strlen(image_path) - 3, "dds"); - FILE *fp = fopen(output_filename, "wb"); - - if (fp == 0) { - printf("Error, unable to open output image <%s>\n", output_filename); - exit(EXIT_FAILURE); - } - - DDSHeader header; - header.fourcc = FOURCC_DDS; - header.size = 124; - header.flags = (DDSD_WIDTH | DDSD_HEIGHT | DDSD_CAPS | DDSD_PIXELFORMAT | - DDSD_LINEARSIZE); - header.height = h; - header.width = w; - header.pitch = compressedSize; - header.depth = 0; - header.mipmapcount = 0; - memset(header.reserved, 0, sizeof(header.reserved)); - header.pf.size = 32; - header.pf.flags = DDPF_FOURCC; - header.pf.fourcc = FOURCC_DXT1; - header.pf.bitcount = 0; - header.pf.rmask = 0; - header.pf.gmask = 0; - header.pf.bmask = 0; - header.pf.amask = 0; - header.caps.caps1 = DDSCAPS_TEXTURE; - header.caps.caps2 = 0; - header.caps.caps3 = 0; - header.caps.caps4 = 0; - header.notused = 0; - fwrite(&header, sizeof(DDSHeader), 1, fp); - fwrite(h_result, compressedSize, 1, fp); - fclose(fp); - - // Make sure the generated image is correct. - const char *reference_image_path = sdkFindFilePath(REFERENCE_IMAGE, argv[0]); - - if (reference_image_path == 0) { - printf("Error, unable to find reference image\n"); - - exit(EXIT_FAILURE); - } - - fp = fopen(reference_image_path, "rb"); - - if (fp == 0) { - printf("Error, unable to open reference image\n"); - - exit(EXIT_FAILURE); - } - - fseek(fp, sizeof(DDSHeader), SEEK_SET); - uint referenceSize = (W / 4) * (H / 4) * 8; - uint *reference = (uint *)malloc(referenceSize); - fread(reference, referenceSize, 1, fp); - fclose(fp); - - printf("\nChecking accuracy...\n"); - float rms = 0; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - uint referenceBlockIdx = ((y / 4) * (W / 4) + (x / 4)); - uint resultBlockIdx = ((y / 4) * (w / 4) + (x / 4)); - - int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, - ((BlockDXT1 *)reference) + referenceBlockIdx); - - if (cmp != 0.0f) { - printf("Deviation at (%4d,%4d):\t%f rms\n", x / 4, y / 4, - float(cmp) / 16 / 3); - } - - rms += cmp; + // Convert linear image to block linear. + for (uint by = 0; by < h / 4; by++) { + for (uint bx = 0; bx < w / 4; bx++) { + for (int i = 0; i < 16; i++) { + const int x = i & 3; + const int y = i / 4; + block_image[(by * w / 4 + bx) * 16 + i] = ((uint *)data)[(by * 4 + y) * 4 * (W / 4) + bx * 4 + x]; + } + } } - } - rms /= w * h * 3; + // copy into global mem + uint *d_data = NULL; + checkCudaErrors(cudaMalloc((void **)&d_data, memSize)); - // Free allocated resources and exit - checkCudaErrors(cudaFree(d_permutations)); - checkCudaErrors(cudaFree(d_data)); - checkCudaErrors(cudaFree(d_result)); - free(image_path); - free(data); - free(block_image); - free(h_result); - free(reference); - sdkDeleteTimer(&timer); + // Result + uint *d_result = NULL; + const uint compressedSize = (w / 4) * (h / 4) * 8; + checkCudaErrors(cudaMalloc((void **)&d_result, compressedSize)); + uint *h_result = (uint *)malloc(compressedSize); - printf("RMS(reference, result) = %f\n\n", rms); - printf(rms <= ERROR_THRESHOLD ? "Test passed\n" : "Test failed!\n"); - /* Return zero if test passed, one otherwise */ - return rms > ERROR_THRESHOLD; + // Compute permutations. 
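// Index check for the linear -> block-linear conversion above (standalone
// sketch with a toy 8x8 image; W and w coincide here just as in the sample
// once the PPM is loaded).
#include <cassert>

int main()
{
    const unsigned w = 8, W = 8, h = 8;
    for (unsigned by = 0; by < h / 4; by++)
        for (unsigned bx = 0; bx < w / 4; bx++)
            for (int i = 0; i < 16; i++) {
                const int x = i & 3; // texel column within the 4x4 block
                const int y = i / 4; // texel row within the 4x4 block
                // Destination: block (bx,by) owns 16 consecutive words, so
                // one DXT block reads its texels with contiguous fetches.
                unsigned dst = (by * w / 4 + bx) * 16 + i;
                // Source: 4 * (W / 4) is just W when W % 4 == 0, i.e. plain
                // row-major addressing of pixel (bx*4 + x, by*4 + y).
                unsigned src = (by * 4 + y) * 4 * (W / 4) + bx * 4 + x;
                assert(src == (by * 4 + y) * W + (bx * 4 + x));
                assert(dst < w * h && src < w * h);
            }
    return 0;
}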
+ uint permutations[1024]; + computePermutations(permutations); + + // Copy permutations host to device. + uint *d_permutations = NULL; + checkCudaErrors(cudaMalloc((void **)&d_permutations, 1024 * sizeof(uint))); + checkCudaErrors(cudaMemcpy(d_permutations, permutations, 1024 * sizeof(uint), cudaMemcpyHostToDevice)); + + // create a timer + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + + // Copy image from host to device + checkCudaErrors(cudaMemcpy(d_data, block_image, memSize, cudaMemcpyHostToDevice)); + + // Determine launch configuration and run timed computation numIterations + // times + uint blocks = ((w + 3) / 4) * ((h + 3) / 4); // rounds up by 1 block in each dim if %4 != 0 + + int devID; + cudaDeviceProp deviceProp; + + // get number of SMs on this GPU + checkCudaErrors(cudaGetDevice(&devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + + // Restrict the number of blocks to launch on low end GPUs to avoid kernel + // timeout + int blocksPerLaunch = min(blocks, 768 * deviceProp.multiProcessorCount); + + printf("Running DXT Compression on %u x %u image...\n", w, h); + printf("\n%u Blocks, %u Threads per Block, %u Threads in Grid...\n\n", blocks, NUM_THREADS, blocks * NUM_THREADS); + int numIterations = 1; + + for (int i = -1; i < numIterations; ++i) { + if (i == 0) { + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); + } + + for (int j = 0; j < (int)blocks; j += blocksPerLaunch) { + compress<<<min(blocksPerLaunch, blocks - j), NUM_THREADS>>>(d_permutations, d_data, (uint2 *)d_result, j); + } + } + + getLastCudaError("compress"); + + // sync to host, stop timer, record perf + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer); + double dAvgTime = 1.0e-3 * sdkGetTimerValue(&timer) / (double)numIterations; + printf("dxtc, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, " + "NumDevsUsed = %i, Workgroup = %d\n", + (1.0e-6 * (double)(W * H) / dAvgTime), + dAvgTime, + (W * H), + 1, + NUM_THREADS); + + // copy result data from device to host + checkCudaErrors(cudaMemcpy(h_result, d_result, compressedSize, cudaMemcpyDeviceToHost)); + + // Write out result data to DDS file + char output_filename[1024]; + strcpy(output_filename, image_path); + strcpy(output_filename + strlen(image_path) - 3, "dds"); + FILE *fp = fopen(output_filename, "wb"); + + if (fp == 0) { + printf("Error, unable to open output image <%s>\n", output_filename); + exit(EXIT_FAILURE); + } + + DDSHeader header; + header.fourcc = FOURCC_DDS; + header.size = 124; + header.flags = (DDSD_WIDTH | DDSD_HEIGHT | DDSD_CAPS | DDSD_PIXELFORMAT | DDSD_LINEARSIZE); + header.height = h; + header.width = w; + header.pitch = compressedSize; + header.depth = 0; + header.mipmapcount = 0; + memset(header.reserved, 0, sizeof(header.reserved)); + header.pf.size = 32; + header.pf.flags = DDPF_FOURCC; + header.pf.fourcc = FOURCC_DXT1; + header.pf.bitcount = 0; + header.pf.rmask = 0; + header.pf.gmask = 0; + header.pf.bmask = 0; + header.pf.amask = 0; + header.caps.caps1 = DDSCAPS_TEXTURE; + header.caps.caps2 = 0; + header.caps.caps3 = 0; + header.caps.caps4 = 0; + header.notused = 0; + fwrite(&header, sizeof(DDSHeader), 1, fp); + fwrite(h_result, compressedSize, 1, fp); + fclose(fp); + + // Make sure the generated image is correct.
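// The compress loop above never launches more than blocksPerLaunch blocks
// at once so low-end display GPUs do not trip the driver watchdog. A
// generic sketch of the same pattern (workKernel and the sizes are
// illustrative, not from the sample):
#include <algorithm>
#include <cuda_runtime.h>

__global__ void workKernel(float *out, int firstBlock)
{
    int idx = (firstBlock + blockIdx.x) * blockDim.x + threadIdx.x;
    out[idx] += 1.0f;
}

void launchInChunks(float *d_out, int totalBlocks, int blocksPerLaunch, int threads)
{
    for (int j = 0; j < totalBlocks; j += blocksPerLaunch) {
        // Clamp the last chunk and tell the kernel where its chunk starts,
        // mirroring the offset argument passed to compress above.
        int grid = std::min(blocksPerLaunch, totalBlocks - j);
        workKernel<<<grid, threads>>>(d_out, j);
    }
    cudaDeviceSynchronize(); // one sync after all chunks, as above
}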
+ const char *reference_image_path = sdkFindFilePath(REFERENCE_IMAGE, argv[0]); + + if (reference_image_path == 0) { + printf("Error, unable to find reference image\n"); + + exit(EXIT_FAILURE); + } + + fp = fopen(reference_image_path, "rb"); + + if (fp == 0) { + printf("Error, unable to open reference image\n"); + + exit(EXIT_FAILURE); + } + + fseek(fp, sizeof(DDSHeader), SEEK_SET); + uint referenceSize = (W / 4) * (H / 4) * 8; + uint *reference = (uint *)malloc(referenceSize); + fread(reference, referenceSize, 1, fp); + fclose(fp); + + printf("\nChecking accuracy...\n"); + float rms = 0; + + for (uint y = 0; y < h; y += 4) { + for (uint x = 0; x < w; x += 4) { + uint referenceBlockIdx = ((y / 4) * (W / 4) + (x / 4)); + uint resultBlockIdx = ((y / 4) * (w / 4) + (x / 4)); + + int cmp = + compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); + + if (cmp != 0.0f) { + printf("Deviation at (%4d,%4d):\t%f rms\n", x / 4, y / 4, float(cmp) / 16 / 3); + } + + rms += cmp; + } + } + + rms /= w * h * 3; + + // Free allocated resources and exit + checkCudaErrors(cudaFree(d_permutations)); + checkCudaErrors(cudaFree(d_data)); + checkCudaErrors(cudaFree(d_result)); + free(image_path); + free(data); + free(block_image); + free(h_result); + free(reference); + sdkDeleteTimer(&timer); + + printf("RMS(reference, result) = %f\n\n", rms); + printf(rms <= ERROR_THRESHOLD ? "Test passed\n" : "Test failed!\n"); + /* Return zero if test passed, one otherwise */ + return rms > ERROR_THRESHOLD; } diff --git a/Samples/5_Domain_Specific/dxtc/permutations.h b/Samples/5_Domain_Specific/dxtc/permutations.h index 96a096bf..c47e11e8 100644 --- a/Samples/5_Domain_Specific/dxtc/permutations.h +++ b/Samples/5_Domain_Specific/dxtc/permutations.h @@ -28,116 +28,118 @@ #ifndef PERMUTATIONS_H #define PERMUTATIONS_H -#include <assert.h> // assert +#include <assert.h> // assert -static void computePermutations(uint permutations[1024]) { - int indices[16]; - int num = 0; +static void computePermutations(uint permutations[1024]) +{ + int indices[16]; + int num = 0; - // 3 element permutations: + // 3 element permutations: - // first cluster [0,i) is at the start - for (int m = 0; m < 16; ++m) { - indices[m] = 0; - } - - const int imax = 15; - - for (int i = imax; i >= 0; --i) { - // second cluster [i,j) is half along - for (int m = i; m < 16; ++m) { - indices[m] = 2; + // first cluster [0,i) is at the start + for (int m = 0; m < 16; ++m) { + indices[m] = 0; } - const int jmax = (i == 0) ? 15 : 16; + const int imax = 15; - for (int j = jmax; j >= i; --j) { - // last cluster [j,k) is at the end - if (j < 16) { - indices[j] = 1; - } - - uint permutation = 0; - - for (int p = 0; p < 16; p++) { - permutation |= indices[p] << (p * 2); - // permutation |= indices[15-p] << (p * 2); - } - - permutations[num] = permutation; - - num++; - } - } - - assert(num == 151); - - for (int i = 0; i < 9; i++) { - permutations[num] = 0x000AA555; - num++; - } - - assert(num == 160); - - // Append 4 element permutations: - - // first cluster [0,i) is at the start - for (int m = 0; m < 16; ++m) { - indices[m] = 0; - } - - for (int i = imax; i >= 0; --i) { - // second cluster [i,j) is one third along - for (int m = i; m < 16; ++m) { - indices[m] = 2; - } - - const int jmax = (i == 0) ? 15 : 16; - - for (int j = jmax; j >= i; --j) { - // third cluster [j,k) is two thirds along - for (int m = j; m < 16; ++m) { - indices[m] = 3; - } - - int kmax = (j == 0) ?
15 : 16; - - for (int k = kmax; k >= j; --k) { - // last cluster [k,n) is at the end - if (k < 16) { - indices[k] = 1; + for (int i = imax; i >= 0; --i) { + // second cluster [i,j) is half along + for (int m = i; m < 16; ++m) { + indices[m] = 2; } - uint permutation = 0; + const int jmax = (i == 0) ? 15 : 16; - bool hasThree = false; + for (int j = jmax; j >= i; --j) { + // last cluster [j,k) is at the end + if (j < 16) { + indices[j] = 1; + } - for (int p = 0; p < 16; p++) { - permutation |= indices[p] << (p * 2); - // permutation |= indices[15-p] << (p * 2); + uint permutation = 0; - if (indices[p] == 3) hasThree = true; + for (int p = 0; p < 16; p++) { + permutation |= indices[p] << (p * 2); + // permutation |= indices[15-p] << (p * 2); + } + + permutations[num] = permutation; + + num++; } - - if (hasThree) { - permutations[num] = permutation; - num++; - } - } } - } - assert(num == 975); + assert(num == 151); - // 1024 - 969 - 7 = 48 extra elements + for (int i = 0; i < 9; i++) { + permutations[num] = 0x000AA555; + num++; + } - // It would be nice to set these extra elements with better values... - for (int i = 0; i < 49; i++) { - permutations[num] = 0x00AAFF55; - num++; - } + assert(num == 160); - assert(num == 1024); + // Append 4 element permutations: + + // first cluster [0,i) is at the start + for (int m = 0; m < 16; ++m) { + indices[m] = 0; + } + + for (int i = imax; i >= 0; --i) { + // second cluster [i,j) is one third along + for (int m = i; m < 16; ++m) { + indices[m] = 2; + } + + const int jmax = (i == 0) ? 15 : 16; + + for (int j = jmax; j >= i; --j) { + // third cluster [j,k) is two thirds along + for (int m = j; m < 16; ++m) { + indices[m] = 3; + } + + int kmax = (j == 0) ? 15 : 16; + + for (int k = kmax; k >= j; --k) { + // last cluster [k,n) is at the end + if (k < 16) { + indices[k] = 1; + } + + uint permutation = 0; + + bool hasThree = false; + + for (int p = 0; p < 16; p++) { + permutation |= indices[p] << (p * 2); + // permutation |= indices[15-p] << (p * 2); + + if (indices[p] == 3) + hasThree = true; + } + + if (hasThree) { + permutations[num] = permutation; + num++; + } + } + } + } + + assert(num == 975); + + // 1024 - 969 - 7 = 48 extra elements + + // It would be nice to set these extra elements with better values... 
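// Standalone check of the counts asserted in computePermutations() above,
// reproducing only the loop bounds (no table is built).
#include <cassert>

int main()
{
    int num = 0;
    for (int i = 15; i >= 0; --i) {
        const int jmax = (i == 0) ? 15 : 16;
        for (int j = jmax; j >= i; --j)
            num++; // one 3-cluster permutation per (i, j) pair
    }
    assert(num == 151); // matches assert(num == 151) above
    num += 9;           // the 9 copies of 0x000AA555
    assert(num == 160); // matches assert(num == 160) above
    // The 4-element pass brings the total to 975, and the filler loop then
    // runs 49 times: 975 + 49 == 1024, filling the whole table.
    assert(975 + 49 == 1024);
    return 0;
}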
+ for (int i = 0; i < 49; i++) { + permutations[num] = 0x00AAFF55; + num++; + } + + assert(num == 1024); } -#endif // PERMUTATIONS_H +#endif // PERMUTATIONS_H diff --git a/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform.cu b/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform.cu index c82ceaf4..7b00bf49 100644 --- a/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform.cu +++ b/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform.cu @@ -38,20 +38,18 @@ * Victor Podlozhnyuk (vpodlozhnyuk@nvidia.com) */ +#include <helper_cuda.h> +#include <helper_functions.h> #include <math.h> #include <stdio.h> #include <stdlib.h> -#include <helper_functions.h> -#include <helper_cuda.h> //////////////////////////////////////////////////////////////////////////////// // Reference CPU FWT //////////////////////////////////////////////////////////////////////////////// extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N); extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N); -extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data, - float *h_Kernel, int log2dataN, - int log2kernelN); +extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data, float *h_Kernel, int log2dataN, int log2kernelN); //////////////////////////////////////////////////////////////////////////////// // GPU FWT //////////////////////////////////////////////////////////////////////////////// @@ -62,12 +60,12 @@ extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data, // Data configuration //////////////////////////////////////////////////////////////////////////////// const int log2Kernel = 7; -const int log2Data = 23; +const int log2Data = 23; -const int dataN = 1 << log2Data; +const int dataN = 1 << log2Data; const int kernelN = 1 << log2Kernel; -const int DATA_SIZE = dataN * sizeof(float); +const int DATA_SIZE = dataN * sizeof(float); const int KERNEL_SIZE = kernelN * sizeof(float); const double NOPS = 3.0 * (double)dataN * (double)log2Data / 2.0; @@ -75,95 +73,92 @@ const double NOPS = 3.0 * (double)dataN * (double)log2Data / 2.0; //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char *argv[]) { - float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; +int main(int argc, char *argv[]) +{ + float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; - float *d_Data, *d_Kernel; + float *d_Data, *d_Kernel; - double delta, ref, sum_delta2, sum_ref2, L2norm, gpuTime; + double delta, ref, sum_delta2, sum_ref2, L2norm, gpuTime; - StopWatchInterface *hTimer = NULL; - int i; + StopWatchInterface *hTimer = NULL; + int i; - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Initializing data...\n"); - printf("...allocating CPU memory\n"); - h_Kernel = (float *)malloc(KERNEL_SIZE); - h_Data = (float *)malloc(DATA_SIZE); - h_ResultCPU = (float *)malloc(DATA_SIZE); - h_ResultGPU = (float *)malloc(DATA_SIZE); - printf("...allocating GPU memory\n"); - checkCudaErrors(cudaMalloc((void **)&d_Kernel, DATA_SIZE)); - checkCudaErrors(cudaMalloc((void **)&d_Data, DATA_SIZE)); + printf("Initializing data...\n"); + printf("...allocating CPU memory\n"); + h_Kernel = (float *)malloc(KERNEL_SIZE); + h_Data = (float
*)malloc(DATA_SIZE); + h_ResultCPU = (float *)malloc(DATA_SIZE); + h_ResultGPU = (float *)malloc(DATA_SIZE); + printf("...allocating GPU memory\n"); + checkCudaErrors(cudaMalloc((void **)&d_Kernel, DATA_SIZE)); + checkCudaErrors(cudaMalloc((void **)&d_Data, DATA_SIZE)); - printf("...generating data\n"); - printf("Data length: %i; kernel length: %i\n", dataN, kernelN); - srand(2007); + printf("...generating data\n"); + printf("Data length: %i; kernel length: %i\n", dataN, kernelN); + srand(2007); - for (i = 0; i < kernelN; i++) { - h_Kernel[i] = (float)rand() / (float)RAND_MAX; - } + for (i = 0; i < kernelN; i++) { + h_Kernel[i] = (float)rand() / (float)RAND_MAX; + } - for (i = 0; i < dataN; i++) { - h_Data[i] = (float)rand() / (float)RAND_MAX; - } + for (i = 0; i < dataN; i++) { + h_Data[i] = (float)rand() / (float)RAND_MAX; + } - checkCudaErrors(cudaMemset(d_Kernel, 0, DATA_SIZE)); - checkCudaErrors( - cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_Kernel, 0, DATA_SIZE)); + checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice)); - printf("Running GPU dyadic convolution using Fast Walsh Transform...\n"); - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); - fwtBatchGPU(d_Data, 1, log2Data); - fwtBatchGPU(d_Kernel, 1, log2Data); - modulateGPU(d_Data, d_Kernel, dataN); - fwtBatchGPU(d_Data, 1, log2Data); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer); - printf("GPU time: %f ms; GOP/s: %f\n", gpuTime, - NOPS / (gpuTime * 0.001 * 1E+9)); + printf("Running GPU dyadic convolution using Fast Walsh Transform...\n"); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + fwtBatchGPU(d_Data, 1, log2Data); + fwtBatchGPU(d_Kernel, 1, log2Data); + modulateGPU(d_Data, d_Kernel, dataN); + fwtBatchGPU(d_Data, 1, log2Data); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer); + printf("GPU time: %f ms; GOP/s: %f\n", gpuTime, NOPS / (gpuTime * 0.001 * 1E+9)); - printf("Reading back GPU results...\n"); - checkCudaErrors( - cudaMemcpy(h_ResultGPU, d_Data, DATA_SIZE, cudaMemcpyDeviceToHost)); + printf("Reading back GPU results...\n"); + checkCudaErrors(cudaMemcpy(h_ResultGPU, d_Data, DATA_SIZE, cudaMemcpyDeviceToHost)); - printf("Running straightforward CPU dyadic convolution...\n"); - dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel); + printf("Running straightforward CPU dyadic convolution...\n"); + dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel); - printf("Comparing the results...\n"); - sum_delta2 = 0; - sum_ref2 = 0; + printf("Comparing the results...\n"); + sum_delta2 = 0; + sum_ref2 = 0; - for (i = 0; i < dataN; i++) { - delta = h_ResultCPU[i] - h_ResultGPU[i]; - ref = h_ResultCPU[i]; - sum_delta2 += delta * delta; - sum_ref2 += ref * ref; - } + for (i = 0; i < dataN; i++) { + delta = h_ResultCPU[i] - h_ResultGPU[i]; + ref = h_ResultCPU[i]; + sum_delta2 += delta * delta; + sum_ref2 += ref * ref; + } - L2norm = sqrt(sum_delta2 / sum_ref2); + L2norm = sqrt(sum_delta2 / sum_ref2); - printf("Shutting down...\n"); - sdkDeleteTimer(&hTimer); - checkCudaErrors(cudaFree(d_Data)); - checkCudaErrors(cudaFree(d_Kernel)); 
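// Why main() above transforms, modulates, and transforms again with no
// separate inverse: the Walsh-Hadamard transform is self-inverse up to a
// factor of N, so dyadic convolution c[i] = sum_j a[i ^ j] * b[j] equals
// WHT(WHT(a) .* WHT(b)) / N. Standalone length-4 check (wht4 mirrors the
// fwtCPU butterfly structure; the 1/N factor lives in the modulate step,
// exactly as in modulateGPU):
#include <cassert>

static void wht4(double v[4])
{
    for (int stride = 2; stride >= 1; stride >>= 1)
        for (int base = 0; base < 4; base += 2 * stride)
            for (int j = 0; j < stride; j++) {
                double t1 = v[base + j], t2 = v[base + j + stride];
                v[base + j] = t1 + t2;
                v[base + j + stride] = t1 - t2;
            }
}

int main()
{
    double a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4];
    for (int i = 0; i < 4; i++) { // direct dyadic convolution
        c[i] = 0;
        for (int j = 0; j < 4; j++)
            c[i] += a[i ^ j] * b[j];
    }
    wht4(a);
    wht4(b);
    for (int i = 0; i < 4; i++)
        a[i] = a[i] * b[i] / 4.0; // modulate, with the 1/N factor
    wht4(a);                      // second forward pass = inverse / N
    for (int i = 0; i < 4; i++)
        assert(a[i] == c[i]); // {70, 68, 62, 60}
    return 0;
}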
- free(h_ResultGPU); - free(h_ResultCPU); - free(h_Data); - free(h_Kernel); + printf("Shutting down...\n"); + sdkDeleteTimer(&hTimer); + checkCudaErrors(cudaFree(d_Data)); + checkCudaErrors(cudaFree(d_Kernel)); + free(h_ResultGPU); + free(h_ResultCPU); + free(h_Data); + free(h_Kernel); - printf("L2 norm: %E\n", L2norm); - printf(L2norm < 1e-6 ? "Test passed\n" : "Test failed!\n"); + printf("L2 norm: %E\n", L2norm); + printf(L2norm < 1e-6 ? "Test passed\n" : "Test failed!\n"); } diff --git a/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_gold.cpp b/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_gold.cpp index 38127ca4..c8b33ef0 100644 --- a/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_gold.cpp +++ b/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_gold.cpp @@ -28,68 +28,72 @@ /////////////////////////////////////////////////////////////////////////////// // CPU Fast Walsh Transform /////////////////////////////////////////////////////////////////////////////// -extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N) { - const int N = 1 << log2N; +extern "C" void fwtCPU(float *h_Output, float *h_Input, int log2N) +{ + const int N = 1 << log2N; - for (int pos = 0; pos < N; pos++) h_Output[pos] = h_Input[pos]; + for (int pos = 0; pos < N; pos++) + h_Output[pos] = h_Input[pos]; - // Cycle through stages with different butterfly strides - for (int stride = N / 2; stride >= 1; stride >>= 1) { - // Cycle through subvectors of (2 * stride) elements - for (int base = 0; base < N; base += 2 * stride) + // Cycle through stages with different butterfly strides + for (int stride = N / 2; stride >= 1; stride >>= 1) { + // Cycle through subvectors of (2 * stride) elements + for (int base = 0; base < N; base += 2 * stride) - // Butterfly index within subvector of (2 * stride) size - for (int j = 0; j < stride; j++) { - int i0 = base + j + 0; - int i1 = base + j + stride; + // Butterfly index within subvector of (2 * stride) size + for (int j = 0; j < stride; j++) { + int i0 = base + j + 0; + int i1 = base + j + stride; - float T1 = h_Output[i0]; - float T2 = h_Output[i1]; - h_Output[i0] = T1 + T2; - h_Output[i1] = T1 - T2; - } - } + float T1 = h_Output[i0]; + float T2 = h_Output[i1]; + h_Output[i0] = T1 + T2; + h_Output[i1] = T1 - T2; + } + } } /////////////////////////////////////////////////////////////////////////////// // Straightforward Walsh Transform: used to test both CPU and GPU FWT // Slow. Uses doubles because of straightforward accumulation /////////////////////////////////////////////////////////////////////////////// -extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N) { - const int N = 1 << log2N; +extern "C" void slowWTcpu(float *h_Output, float *h_Input, int log2N) +{ + const int N = 1 << log2N; - for (int i = 0; i < N; i++) { - double sum = 0; + for (int i = 0; i < N; i++) { + double sum = 0; - for (int j = 0; j < N; j++) { - // Walsh-Hadamard quotient - double q = 1.0; + for (int j = 0; j < N; j++) { + // Walsh-Hadamard quotient + double q = 1.0; - for (int t = i & j; t != 0; t >>= 1) - if (t & 1) q = -q; + for (int t = i & j; t != 0; t >>= 1) + if (t & 1) + q = -q; - sum += q * h_Input[j]; + sum += q * h_Input[j]; + } + + h_Output[i] = (float)sum; } - - h_Output[i] = (float)sum; - } } //////////////////////////////////////////////////////////////////////////////// // Reference CPU dyadic convolution. 
// Extremely slow because of non-linear memory access patterns (cache thrashing) //////////////////////////////////////////////////////////////////////////////// -extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data, - float *h_Kernel, int log2dataN, - int log2kernelN) { - const int dataN = 1 << log2dataN; - const int kernelN = 1 << log2kernelN; +extern "C" void dyadicConvolutionCPU(float *h_Result, float *h_Data, float *h_Kernel, int log2dataN, int log2kernelN) +{ + const int dataN = 1 << log2dataN; + const int kernelN = 1 << log2kernelN; - for (int i = 0; i < dataN; i++) { - double sum = 0; + for (int i = 0; i < dataN; i++) { + double sum = 0; - for (int j = 0; j < kernelN; j++) sum += h_Data[i ^ j] * h_Kernel[j]; + for (int j = 0; j < kernelN; j++) + sum += h_Data[i ^ j] * h_Kernel[j]; - h_Result[i] = (float)sum; - } + h_Result[i] = (float)sum; + } } diff --git a/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_kernel.cuh b/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_kernel.cuh index 4f64117d..22735378 100644 --- a/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_kernel.cuh +++ b/Samples/5_Domain_Specific/fastWalshTransform/fastWalshTransform_kernel.cuh @@ -40,146 +40,148 @@ namespace cg = cooperative_groups; /////////////////////////////////////////////////////////////////////////////// #define ELEMENTARY_LOG2SIZE 11 -__global__ void fwtBatch1Kernel(float *d_Output, float *d_Input, int log2N) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - const int N = 1 << log2N; - const int base = blockIdx.x << log2N; +__global__ void fwtBatch1Kernel(float *d_Output, float *d_Input, int log2N) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + const int N = 1 << log2N; + const int base = blockIdx.x << log2N; - //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 - extern __shared__ float s_data[]; - float *d_Src = d_Input + base; - float *d_Dst = d_Output + base; + //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 + extern __shared__ float s_data[]; + float *d_Src = d_Input + base; + float *d_Dst = d_Output + base; - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - s_data[pos] = d_Src[pos]; - } - - // Main radix-4 stages - const int pos = threadIdx.x; - - for (int stride = N >> 2; stride > 0; stride >>= 2) { - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - cg::sync(cta); - float D0 = s_data[i0]; - float D1 = s_data[i1]; - float D2 = s_data[i2]; - float D3 = s_data[i3]; - - float T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - s_data[i0] = D0 + D1; - s_data[i1] = T - D1; - T = D2; - s_data[i2] = D2 + D3; - s_data[i3] = T - D3; - } - - // Do single radix-2 stage for odd power of two - if (log2N & 1) { - cg::sync(cta); - - for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { - int i0 = pos << 1; - int i1 = i0 + 1; - - float D0 = s_data[i0]; - float D1 = s_data[i1]; - s_data[i0] = D0 + D1; - s_data[i1] = D0 - D1; + for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { + s_data[pos] = d_Src[pos]; } - } - cg::sync(cta); + // Main radix-4 stages + const int pos = threadIdx.x; - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - d_Dst[pos] = s_data[pos]; - } + for (int stride = N >> 2; stride > 0; stride >>= 2) { + int lo = pos & (stride - 1); + int i0 = ((pos - lo) << 2) + 
lo; + int i1 = i0 + stride; + int i2 = i1 + stride; + int i3 = i2 + stride; + + cg::sync(cta); + float D0 = s_data[i0]; + float D1 = s_data[i1]; + float D2 = s_data[i2]; + float D3 = s_data[i3]; + + float T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + cg::sync(cta); + + for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + int i0 = pos << 1; + int i1 = i0 + 1; + + float D0 = s_data[i0]; + float D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cg::sync(cta); + + for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } } //////////////////////////////////////////////////////////////////////////////// // Single in-global memory radix-4 Fast Walsh Transform pass // (for strides exceeding elementary vector size) //////////////////////////////////////////////////////////////////////////////// -__global__ void fwtBatch2Kernel(float *d_Output, float *d_Input, int stride) { - const int pos = blockIdx.x * blockDim.x + threadIdx.x; - const int N = blockDim.x * gridDim.x * 4; +__global__ void fwtBatch2Kernel(float *d_Output, float *d_Input, int stride) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + const int N = blockDim.x * gridDim.x * 4; - float *d_Src = d_Input + blockIdx.y * N; - float *d_Dst = d_Output + blockIdx.y * N; + float *d_Src = d_Input + blockIdx.y * N; + float *d_Dst = d_Output + blockIdx.y * N; - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; + int lo = pos & (stride - 1); + int i0 = ((pos - lo) << 2) + lo; + int i1 = i0 + stride; + int i2 = i1 + stride; + int i3 = i2 + stride; - float D0 = d_Src[i0]; - float D1 = d_Src[i1]; - float D2 = d_Src[i2]; - float D3 = d_Src[i3]; + float D0 = d_Src[i0]; + float D1 = d_Src[i1]; + float D2 = d_Src[i2]; + float D3 = d_Src[i3]; - float T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - d_Dst[i0] = D0 + D1; - d_Dst[i1] = T - D1; - T = D2; - d_Dst[i2] = D2 + D3; - d_Dst[i3] = T - D3; + float T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + d_Dst[i0] = D0 + D1; + d_Dst[i1] = T - D1; + T = D2; + d_Dst[i2] = D2 + D3; + d_Dst[i3] = T - D3; } //////////////////////////////////////////////////////////////////////////////// // Put everything together: batched Fast Walsh Transform CPU front-end //////////////////////////////////////////////////////////////////////////////// -void fwtBatchGPU(float *d_Data, int M, int log2N) { - const int THREAD_N = 256; +void fwtBatchGPU(float *d_Data, int M, int log2N) +{ + const int THREAD_N = 256; - int N = 1 << log2N; - dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); + int N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { - fwtBatch2Kernel<<<grid, THREAD_N>>>(d_Data, d_Data, N / 4); - getLastCudaError("fwtBatch2Kernel() execution failed\n"); - } + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + fwtBatch2Kernel<<<grid, THREAD_N>>>(d_Data, d_Data, N / 4); + getLastCudaError("fwtBatch2Kernel() execution failed\n"); + } - fwtBatch1Kernel<<<M, N / 4, N * sizeof(float)>>>(d_Data, d_Data, log2N); - getLastCudaError("fwtBatch1Kernel() execution failed\n"); + fwtBatch1Kernel<<<M, N / 4, N * sizeof(float)>>>(d_Data, d_Data,
log2N); + getLastCudaError("fwtBatch1Kernel() execution failed\n"); } //////////////////////////////////////////////////////////////////////////////// // Modulate two arrays //////////////////////////////////////////////////////////////////////////////// -__global__ void modulateKernel(float *d_A, float *d_B, int N) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int numThreads = blockDim.x * gridDim.x; - float rcpN = 1.0f / (float)N; +__global__ void modulateKernel(float *d_A, float *d_B, int N) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int numThreads = blockDim.x * gridDim.x; + float rcpN = 1.0f / (float)N; - for (int pos = tid; pos < N; pos += numThreads) { - d_A[pos] *= d_B[pos] * rcpN; - } + for (int pos = tid; pos < N; pos += numThreads) { + d_A[pos] *= d_B[pos] * rcpN; + } } // Interface to modulateKernel() -void modulateGPU(float *d_A, float *d_B, int N) { - modulateKernel<<<128, 256>>>(d_A, d_B, N); -} +void modulateGPU(float *d_A, float *d_B, int N) { modulateKernel<<<128, 256>>>(d_A, d_B, N); } #endif #endif diff --git a/Samples/5_Domain_Specific/fluidsGL/defines.h b/Samples/5_Domain_Specific/fluidsGL/defines.h index af58c64b..e1f5e8a8 100644 --- a/Samples/5_Domain_Specific/fluidsGL/defines.h +++ b/Samples/5_Domain_Specific/fluidsGL/defines.h @@ -28,21 +28,20 @@ #ifndef DEFINES_H #define DEFINES_H -#define DIM 512 // Square size of solver domain -#define DS (DIM * DIM) // Total domain size -#define CPADW (DIM / 2 + 1) // Padded width for real->complex in-place FFT -#define RPADW \ - (2 * (DIM / 2 + 1)) // Padded width for real->complex in-place FFT -#define PDS (DIM * CPADW) // Padded total domain size +#define DIM 512 // Square size of solver domain +#define DS (DIM * DIM) // Total domain size +#define CPADW (DIM / 2 + 1) // Padded width for real->complex in-place FFT +#define RPADW (2 * (DIM / 2 + 1)) // Padded width for real->complex in-place FFT +#define PDS (DIM * CPADW) // Padded total domain size -#define DT 0.09f // Delta T for iterative solver -#define VIS 0.0025f // Viscosity constant -#define FORCE (5.8f * DIM) // Force scale factor -#define FR 4 // Force update radius +#define DT 0.09f // Delta T for iterative solver +#define VIS 0.0025f // Viscosity constant +#define FORCE (5.8f * DIM) // Force scale factor +#define FR 4 // Force update radius -#define TILEX 64 // Tile width -#define TILEY 64 // Tile height -#define TIDSX 64 // Tids in X -#define TIDSY 4 // Tids in Y +#define TILEX 64 // Tile width +#define TILEY 64 // Tile height +#define TIDSX 64 // Tids in X +#define TIDSY 4 // Tids in Y #endif diff --git a/Samples/5_Domain_Specific/fluidsGL/fluidsGL.cpp b/Samples/5_Domain_Specific/fluidsGL/fluidsGL.cpp index fe99106f..6379f729 100644 --- a/Samples/5_Domain_Specific/fluidsGL/fluidsGL.cpp +++ b/Samples/5_Domain_Specific/fluidsGL/fluidsGL.cpp @@ -39,21 +39,21 @@ #endif // Includes -#include <stdlib.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> // CUDA standard includes -#include <cuda_runtime.h> #include <cuda_gl_interop.h> +#include <cuda_runtime.h> // CUDA FFT Libraries #include <cufft.h> // CUDA helper functions +#include <helper_cuda.h> #include <helper_functions.h> #include <rendercheck_gl.h> -#include <helper_cuda.h> #include "defines.h" #include "fluidsGL_kernels.h" @@ -73,34 +73,34 @@ void cleanup(void); void reshape(int x, int y); // CUFFT plan handle -cufftHandle planr2c; -cufftHandle planc2r; +cufftHandle planr2c; +cufftHandle planc2r; static cData *vxfield = NULL; static cData *vyfield = NULL; -cData *hvfield = NULL; -cData *dvfield = NULL; -static int wWidth = MAX(512, DIM); +cData *hvfield = NULL; +cData *dvfield = NULL; +static int wWidth = MAX(512,
DIM); -static int clicked = 0; -static int fpsCount = 0; -static int fpsLimit = 1; -StopWatchInterface *timer = NULL; +static int clicked = 0; +static int fpsCount = 0; +static int fpsLimit = 1; +StopWatchInterface *timer = NULL; // Particle data -GLuint vbo = 0; // OpenGL vertex buffer object -struct cudaGraphicsResource *cuda_vbo_resource; // handles OpenGL-CUDA exchange -static cData *particles = NULL; // particle positions in host memory -static int lastx = 0, lasty = 0; +GLuint vbo = 0; // OpenGL vertex buffer object +struct cudaGraphicsResource *cuda_vbo_resource; // handles OpenGL-CUDA exchange +static cData *particles = NULL; // particle positions in host memory +static int lastx = 0, lasty = 0; // Texture pitch -size_t tPitch = 0; // Now this is compatible with gcc in 64-bit +size_t tPitch = 0; // Now this is compatible with gcc in 64-bit -char *ref_file = NULL; -bool g_bQAAddTestForce = true; -int g_iFrameToCompare = 100; -int g_TotalErrors = 0; +char *ref_file = NULL; +bool g_bQAAddTestForce = true; +int g_iFrameToCompare = 100; +int g_TotalErrors = 0; bool g_bExitESC = false; @@ -109,389 +109,388 @@ CheckRender *g_CheckRender = NULL; void autoTest(char **); -extern "C" void addForces(cData *v, int dx, int dy, int spx, int spy, float fx, - float fy, int r); -extern "C" void advectVelocity(cData *v, float *vx, float *vy, int dx, int pdx, - int dy, float dt); -extern "C" void diffuseProject(cData *vx, cData *vy, int dx, int dy, float dt, - float visc); -extern "C" void updateVelocity(cData *v, float *vx, float *vy, int dx, int pdx, - int dy); +extern "C" void addForces(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r); +extern "C" void advectVelocity(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt); +extern "C" void diffuseProject(cData *vx, cData *vy, int dx, int dy, float dt, float visc); +extern "C" void updateVelocity(cData *v, float *vx, float *vy, int dx, int pdx, int dy); extern "C" void advectParticles(GLuint vbo, cData *v, int dx, int dy, float dt); -void simulateFluids(void) { - // simulate fluid - advectVelocity(dvfield, (float *)vxfield, (float *)vyfield, DIM, RPADW, DIM, - DT); - diffuseProject(vxfield, vyfield, CPADW, DIM, DT, VIS); - updateVelocity(dvfield, (float *)vxfield, (float *)vyfield, DIM, RPADW, DIM); - advectParticles(vbo, dvfield, DIM, DIM, DT); +void simulateFluids(void) +{ + // simulate fluid + advectVelocity(dvfield, (float *)vxfield, (float *)vyfield, DIM, RPADW, DIM, DT); + diffuseProject(vxfield, vyfield, CPADW, DIM, DT, VIS); + updateVelocity(dvfield, (float *)vxfield, (float *)vyfield, DIM, RPADW, DIM); + advectParticles(vbo, dvfield, DIM, DIM, DT); } -void display(void) { - if (!ref_file) { - sdkStartTimer(&timer); - simulateFluids(); - } - - // render points from vertex buffer - glClear(GL_COLOR_BUFFER_BIT); - glColor4f(0, 1, 0, 0.5f); - glPointSize(1); - glEnable(GL_POINT_SMOOTH); - glEnable(GL_BLEND); - glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); - glEnableClientState(GL_VERTEX_ARRAY); - glDisable(GL_DEPTH_TEST); - glDisable(GL_CULL_FACE); - glBindBuffer(GL_ARRAY_BUFFER, vbo); - glVertexPointer(2, GL_FLOAT, 0, NULL); - glDrawArrays(GL_POINTS, 0, DS); - glBindBuffer(GL_ARRAY_BUFFER, 0); - glDisableClientState(GL_VERTEX_ARRAY); - glDisableClientState(GL_TEXTURE_COORD_ARRAY); - glDisable(GL_TEXTURE_2D); - - if (ref_file) { - return; - } - - // Finish timing before swap buffers to avoid refresh sync - sdkStopTimer(&timer); - glutSwapBuffers(); - - fpsCount++; - - if (fpsCount == fpsLimit) { - char 
fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); - glutSetWindowTitle(fps); - fpsCount = 0; - fpsLimit = (int)MAX(ifps, 1.f); - sdkResetTimer(&timer); - } - - glutPostRedisplay(); -} - -void autoTest(char **argv) { - CFrameBufferObject *fbo = - new CFrameBufferObject(wWidth, wHeight, 4, false, GL_TEXTURE_2D); - g_CheckRender = new CheckFBO(wWidth, wHeight, 4, fbo); - g_CheckRender->setPixelFormat(GL_RGBA); - g_CheckRender->setExecPath(argv[0]); - g_CheckRender->EnableQAReadback(true); - - fbo->bindRenderPath(); - - reshape(wWidth, wHeight); - - for (int count = 0; count < g_iFrameToCompare; count++) { - simulateFluids(); - - // add in a little force so the automated testing is interesting. - if (ref_file) { - int x = wWidth / (count + 1); - int y = wHeight / (count + 1); - float fx = (x / (float)wWidth); - float fy = (y / (float)wHeight); - int nx = (int)(fx * DIM); - int ny = (int)(fy * DIM); - - int ddx = 35; - int ddy = 35; - fx = ddx / (float)wWidth; - fy = ddy / (float)wHeight; - int spy = ny - FR; - int spx = nx - FR; - - addForces(dvfield, DIM, DIM, spx, spy, FORCE * DT * fx, FORCE * DT * fy, - FR); - lastx = x; - lasty = y; +void display(void) +{ + if (!ref_file) { + sdkStartTimer(&timer); + simulateFluids(); } - } - display(); + // render points from vertex buffer + glClear(GL_COLOR_BUFFER_BIT); + glColor4f(0, 1, 0, 0.5f); + glPointSize(1); + glEnable(GL_POINT_SMOOTH); + glEnable(GL_BLEND); + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + glEnableClientState(GL_VERTEX_ARRAY); + glDisable(GL_DEPTH_TEST); + glDisable(GL_CULL_FACE); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glVertexPointer(2, GL_FLOAT, 0, NULL); + glDrawArrays(GL_POINTS, 0, DS); + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDisableClientState(GL_VERTEX_ARRAY); + glDisableClientState(GL_TEXTURE_COORD_ARRAY); + glDisable(GL_TEXTURE_2D); - fbo->unbindRenderPath(); + if (ref_file) { + return; + } - // compare to official reference image, printing PASS or FAIL. - printf("> (Frame %d) Readback BackBuffer\n", 100); - g_CheckRender->readback(wWidth, wHeight); - g_CheckRender->savePPM("fluidsGL.ppm", true, NULL); + // Finish timing before swap buffers to avoid refresh sync + sdkStopTimer(&timer); + glutSwapBuffers(); - if (!g_CheckRender->PPMvsPPM("fluidsGL.ppm", ref_file, MAX_EPSILON_ERROR, - 0.25f)) { - g_TotalErrors++; - } + fpsCount++; + + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); + glutSetWindowTitle(fps); + fpsCount = 0; + fpsLimit = (int)MAX(ifps, 1.f); + sdkResetTimer(&timer); + } + + glutPostRedisplay(); +} + +void autoTest(char **argv) +{ + CFrameBufferObject *fbo = new CFrameBufferObject(wWidth, wHeight, 4, false, GL_TEXTURE_2D); + g_CheckRender = new CheckFBO(wWidth, wHeight, 4, fbo); + g_CheckRender->setPixelFormat(GL_RGBA); + g_CheckRender->setExecPath(argv[0]); + g_CheckRender->EnableQAReadback(true); + + fbo->bindRenderPath(); + + reshape(wWidth, wHeight); + + for (int count = 0; count < g_iFrameToCompare; count++) { + simulateFluids(); + + // add in a little force so the automated testing is interesting. 
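// One step of the von Neumann middle-square update used by myrand() below,
// worked out with the sample's own constants (standalone sketch; the
// squaring wraps a 32-bit int, and it is that consistent wrap-around that
// makes the middle digits of "%010d" a stable test sequence in practice).
#include <cstdio>
#include <cstdlib>

int main()
{
    int seed = 72191;
    char sq[22];
    seed *= seed;               // 72191^2 = 5211540481 wraps to 916573185
    sprintf(sq, "%010d", seed); // "0916573185"
    sq[8] = 0;                  // truncate, then...
    seed = atoi(&sq[3]);        // ...keep the middle five digits: 65731
    printf("next seed %d -> sample %f\n", seed, seed / 99999.f);
    return 0;
}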
+ if (ref_file) { + int x = wWidth / (count + 1); + int y = wHeight / (count + 1); + float fx = (x / (float)wWidth); + float fy = (y / (float)wHeight); + int nx = (int)(fx * DIM); + int ny = (int)(fy * DIM); + + int ddx = 35; + int ddy = 35; + fx = ddx / (float)wWidth; + fy = ddy / (float)wHeight; + int spy = ny - FR; + int spx = nx - FR; + + addForces(dvfield, DIM, DIM, spx, spy, FORCE * DT * fx, FORCE * DT * fy, FR); + lastx = x; + lasty = y; + } + } + + display(); + + fbo->unbindRenderPath(); + + // compare to official reference image, printing PASS or FAIL. + printf("> (Frame %d) Readback BackBuffer\n", 100); + g_CheckRender->readback(wWidth, wHeight); + g_CheckRender->savePPM("fluidsGL.ppm", true, NULL); + + if (!g_CheckRender->PPMvsPPM("fluidsGL.ppm", ref_file, MAX_EPSILON_ERROR, 0.25f)) { + g_TotalErrors++; + } } // very simple von neumann middle-square prng. can't use rand() in -qatest // mode because its implementation varies across platforms which makes testing // for consistency in the important parts of this program difficult. -float myrand(void) { - static int seed = 72191; - char sq[22]; +float myrand(void) +{ + static int seed = 72191; + char sq[22]; - if (ref_file) { - seed *= seed; - sprintf(sq, "%010d", seed); - // pull the middle 5 digits out of sq - sq[8] = 0; - seed = atoi(&sq[3]); + if (ref_file) { + seed *= seed; + sprintf(sq, "%010d", seed); + // pull the middle 5 digits out of sq + sq[8] = 0; + seed = atoi(&sq[3]); - return seed / 99999.f; - } else { - return rand() / (float)RAND_MAX; - } -} - -void initParticles(cData *p, int dx, int dy) { - int i, j; - - for (i = 0; i < dy; i++) { - for (j = 0; j < dx; j++) { - p[i * dx + j].x = (j + 0.5f + (myrand() - 0.5f)) / dx; - p[i * dx + j].y = (i + 0.5f + (myrand() - 0.5f)) / dy; + return seed / 99999.f; + } + else { + return rand() / (float)RAND_MAX; } - } } -void keyboard(unsigned char key, int x, int y) { - switch (key) { +void initParticles(cData *p, int dx, int dy) +{ + int i, j; + + for (i = 0; i < dy; i++) { + for (j = 0; j < dx; j++) { + p[i * dx + j].x = (j + 0.5f + (myrand() - 0.5f)) / dx; + p[i * dx + j].y = (i + 0.5f + (myrand() - 0.5f)) / dy; + } + } +} + +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { case 27: - g_bExitESC = true; + g_bExitESC = true; #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case 'r': - memset(hvfield, 0, sizeof(cData) * DS); - cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); + memset(hvfield, 0, sizeof(cData) * DS); + cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); - initParticles(particles, DIM, DIM); + initParticles(particles, DIM, DIM); - cudaGraphicsUnregisterResource(cuda_vbo_resource); + cudaGraphicsUnregisterResource(cuda_vbo_resource); - getLastCudaError("cudaGraphicsUnregisterBuffer failed"); + getLastCudaError("cudaGraphicsUnregisterBuffer failed"); - glBindBuffer(GL_ARRAY_BUFFER, vbo); - glBufferData(GL_ARRAY_BUFFER, sizeof(cData) * DS, particles, - GL_DYNAMIC_DRAW_ARB); - glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); + glBindBuffer(GL_ARRAY_BUFFER, 0); - cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, - cudaGraphicsMapFlagsNone); + cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone); - 
getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); - break; + getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); + break; default: - break; - } + break; + } } -void click(int button, int updown, int x, int y) { - lastx = x; - lasty = y; - clicked = !clicked; +void click(int button, int updown, int x, int y) +{ + lastx = x; + lasty = y; + clicked = !clicked; } -void motion(int x, int y) { - // Convert motion coordinates to domain - float fx = (lastx / (float)wWidth); - float fy = (lasty / (float)wHeight); - int nx = (int)(fx * DIM); - int ny = (int)(fy * DIM); +void motion(int x, int y) +{ + // Convert motion coordinates to domain + float fx = (lastx / (float)wWidth); + float fy = (lasty / (float)wHeight); + int nx = (int)(fx * DIM); + int ny = (int)(fy * DIM); - if (clicked && nx < DIM - FR && nx > FR - 1 && ny < DIM - FR && ny > FR - 1) { - int ddx = x - lastx; - int ddy = y - lasty; - fx = ddx / (float)wWidth; - fy = ddy / (float)wHeight; - int spy = ny - FR; - int spx = nx - FR; - addForces(dvfield, DIM, DIM, spx, spy, FORCE * DT * fx, FORCE * DT * fy, - FR); - lastx = x; - lasty = y; - } + if (clicked && nx < DIM - FR && nx > FR - 1 && ny < DIM - FR && ny > FR - 1) { + int ddx = x - lastx; + int ddy = y - lasty; + fx = ddx / (float)wWidth; + fy = ddy / (float)wHeight; + int spy = ny - FR; + int spx = nx - FR; + addForces(dvfield, DIM, DIM, spx, spy, FORCE * DT * fx, FORCE * DT * fy, FR); + lastx = x; + lasty = y; + } - glutPostRedisplay(); + glutPostRedisplay(); } -void reshape(int x, int y) { - wWidth = x; - wHeight = y; - glViewport(0, 0, x, y); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0, 1, 1, 0, 0, 1); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - glutPostRedisplay(); +void reshape(int x, int y) +{ + wWidth = x; + wHeight = y; + glViewport(0, 0, x, y); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0, 1, 1, 0, 0, 1); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + glutPostRedisplay(); } -void cleanup(void) { - cudaGraphicsUnregisterResource(cuda_vbo_resource); +void cleanup(void) +{ + cudaGraphicsUnregisterResource(cuda_vbo_resource); - deleteTexture(); + deleteTexture(); - // Free all host and device resources - free(hvfield); - free(particles); - cudaFree(dvfield); - cudaFree(vxfield); - cudaFree(vyfield); - cufftDestroy(planr2c); - cufftDestroy(planc2r); + // Free all host and device resources + free(hvfield); + free(particles); + cudaFree(dvfield); + cudaFree(vxfield); + cudaFree(vyfield); + cufftDestroy(planr2c); + cufftDestroy(planc2r); - glBindBuffer(GL_ARRAY_BUFFER, 0); - glDeleteBuffers(1, &vbo); + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDeleteBuffers(1, &vbo); - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); } -int initGL(int *argc, char **argv) { - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); - glutInitWindowSize(wWidth, wHeight); - glutCreateWindow("Compute Stable Fluids"); - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutMouseFunc(click); - glutMotionFunc(motion); - glutReshapeFunc(reshape); +int initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(wWidth, wHeight); + glutCreateWindow("Compute Stable Fluids"); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMouseFunc(click); + glutMotionFunc(motion); + glutReshapeFunc(reshape); - if (!isGLVersionSupported(1, 5)) { - fprintf(stderr, "ERROR: Support for OpenGL 1.5 is missing"); - fflush(stderr); - return false; - } + if 
(!isGLVersionSupported(1, 5)) { + fprintf(stderr, "ERROR: Support for OpenGL 1.5 is missing"); + fflush(stderr); + return false; + } - if (!areGLExtensionsSupported("GL_ARB_vertex_buffer_object")) { - fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); - fflush(stderr); - return false; - } + if (!areGLExtensionsSupported("GL_ARB_vertex_buffer_object")) { + fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); + fflush(stderr); + return false; + } - return true; + return true; } -int main(int argc, char **argv) { - int devID; - cudaDeviceProp deviceProps; +int main(int argc, char **argv) +{ + int devID; + cudaDeviceProp deviceProps; #if defined(__linux__) - char *Xstatus = getenv("DISPLAY"); - if (Xstatus == NULL) { - printf("Waiving execution as X server is not running\n"); - exit(EXIT_WAIVED); - } - setenv("DISPLAY", ":0", 0); + char *Xstatus = getenv("DISPLAY"); + if (Xstatus == NULL) { + printf("Waiving execution as X server is not running\n"); + exit(EXIT_WAIVED); + } + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", sSDKname); + printf("%s Starting...\n\n", sSDKname); - printf( - "NOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); + printf("NOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n\n"); - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with OpenGL/CUDA - // interop. - if (false == initGL(&argc, argv)) { - exit(EXIT_SUCCESS); - } + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA + // interop. 
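// Why GL is initialized before picking the CUDA device above: the particle
// VBO created later is shared with CUDA through the graphics-interop API.
// A minimal sketch of that round trip (error checks elided; the function
// name is illustrative, and GL headers are assumed to be included already,
// as they are in this sample):
#include <cuda_gl_interop.h>
#include <cuda_runtime.h>

void mapVboAndRun(struct cudaGraphicsResource *res, cudaStream_t stream)
{
    // Map around each CUDA use to borrow a device pointer to the GL buffer.
    float2 *d_particles = NULL;
    size_t  bytes = 0;
    cudaGraphicsMapResources(1, &res, stream);
    cudaGraphicsResourceGetMappedPointer((void **)&d_particles, &bytes, res);
    // ... launch kernels on d_particles here (advectParticles does this) ...
    cudaGraphicsUnmapResources(1, &res, stream);
}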
+ if (false == initGL(&argc, argv)) { + exit(EXIT_SUCCESS); + } - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - devID = findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + devID = findCudaDevice(argc, (const char **)argv); - // get number of SMs on this GPU - checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, - deviceProps.multiProcessorCount); + // get number of SMs on this GPU + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); - // automated build testing harness - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - } + // automated build testing harness + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + } - // Allocate and initialize host data - GLint bsize; + // Allocate and initialize host data + GLint bsize; - sdkCreateTimer(&timer); - sdkResetTimer(&timer); + sdkCreateTimer(&timer); + sdkResetTimer(&timer); - hvfield = (cData *)malloc(sizeof(cData) * DS); - memset(hvfield, 0, sizeof(cData) * DS); + hvfield = (cData *)malloc(sizeof(cData) * DS); + memset(hvfield, 0, sizeof(cData) * DS); - // Allocate and initialize device data - cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData) * DIM, DIM); + // Allocate and initialize device data + cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData) * DIM, DIM); - cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); - // Temporary complex velocity field data - cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); - cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); + cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); + // Temporary complex velocity field data + cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); + cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); - setupTexture(DIM, DIM); + setupTexture(DIM, DIM); - // Create particle array - particles = (cData *)malloc(sizeof(cData) * DS); - memset(particles, 0, sizeof(cData) * DS); + // Create particle array + particles = (cData *)malloc(sizeof(cData) * DS); + memset(particles, 0, sizeof(cData) * DS); - initParticles(particles, DIM, DIM); + initParticles(particles, DIM, DIM); - // Create CUFFT transform plan configuration - checkCudaErrors(cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C)); - checkCudaErrors(cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R)); + // Create CUFFT transform plan configuration + checkCudaErrors(cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C)); + checkCudaErrors(cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R)); - glGenBuffers(1, &vbo); - glBindBuffer(GL_ARRAY_BUFFER, vbo); - glBufferData(GL_ARRAY_BUFFER, sizeof(cData) * DS, particles, - GL_DYNAMIC_DRAW_ARB); + glGenBuffers(1, &vbo); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); - glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, &bsize); + glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, &bsize); - if (bsize != (sizeof(cData) * DS)) goto EXTERR; + if (bsize != (sizeof(cData) * DS)) + goto EXTERR; - glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindBuffer(GL_ARRAY_BUFFER, 0); - 
checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, - cudaGraphicsMapFlagsNone)); - getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); + getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); - if (ref_file) { - autoTest(argv); - cleanup(); + if (ref_file) { + autoTest(argv); + cleanup(); - printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - - } else { + printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + } + else { #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - glutMainLoop(); - } + glutMainLoop(); + } - if (!ref_file) { - exit(EXIT_SUCCESS); - } + if (!ref_file) { + exit(EXIT_SUCCESS); + } - return 0; + return 0; EXTERR: - printf("Failed to initialize GL extensions.\n"); + printf("Failed to initialize GL extensions.\n"); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cu b/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cu index 592de619..a9224170 100644 --- a/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cu +++ b/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cu @@ -25,13 +25,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <cuda_runtime.h> +#include <cufft.h> // CUDA FFT Libraries +#include <helper_cuda.h> // Helper functions for CUDA Error handling #include <stdio.h> #include <stdlib.h> -#include <cuda_runtime.h> -#include <cufft.h> // CUDA FFT Libraries -#include <helper_cuda.h> // Helper functions for CUDA Error handling - // OpenGL Graphics includes #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION #include <helper_gl.h> @@ -41,51 +40,52 @@ // Texture object for reading velocity field cudaTextureObject_t texObj; -static cudaArray *array = NULL; +static cudaArray *array = NULL; // Particle data -extern GLuint vbo; // OpenGL vertex buffer object -extern struct cudaGraphicsResource - *cuda_vbo_resource; // handles OpenGL-CUDA exchange +extern GLuint vbo; // OpenGL vertex buffer object +extern struct cudaGraphicsResource *cuda_vbo_resource; // handles OpenGL-CUDA exchange // Texture pitch -extern size_t tPitch; +extern size_t tPitch; extern cufftHandle planr2c; extern cufftHandle planc2r; -cData *vxfield = NULL; -cData *vyfield = NULL; +cData *vxfield = NULL; +cData *vyfield = NULL; -void setupTexture(int x, int y) { - cudaChannelFormatDesc desc = cudaCreateChannelDesc<float2>(); +void setupTexture(int x, int y) +{ + cudaChannelFormatDesc desc = cudaCreateChannelDesc<float2>(); - cudaMallocArray(&array, &desc, y, x); - getLastCudaError("cudaMalloc failed"); + cudaMallocArray(&array, &desc, y, x); + getLastCudaError("cudaMalloc failed"); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = array; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = array; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode =
cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&texObj, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texObj, &texRes, &texDescr, NULL)); } -void updateTexture(cData *data, size_t wib, size_t h, size_t pitch) { - checkCudaErrors(cudaMemcpy2DToArray(array, 0, 0, data, pitch, wib, h, - cudaMemcpyDeviceToDevice)); +void updateTexture(cData *data, size_t wib, size_t h, size_t pitch) +{ + checkCudaErrors(cudaMemcpy2DToArray(array, 0, 0, data, pitch, wib, h, cudaMemcpyDeviceToDevice)); } -void deleteTexture(void) { - checkCudaErrors(cudaDestroyTextureObject(texObj)); - checkCudaErrors(cudaFreeArray(array)); +void deleteTexture(void) +{ + checkCudaErrors(cudaDestroyTextureObject(texObj)); + checkCudaErrors(cudaFreeArray(array)); } // Note that these kernels are designed to work with arbitrary @@ -99,54 +99,61 @@ void deleteTexture(void) { // This method adds constant force vectors to the velocity field // stored in 'v' according to v(x,t+1) = v(x,t) + dt * f. -__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, - float fx, float fy, int r, size_t pitch) { - int tx = threadIdx.x; - int ty = threadIdx.y; - cData *fj = (cData *)((char *)v + (ty + spy) * pitch) + tx + spx; +__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch) +{ + int tx = threadIdx.x; + int ty = threadIdx.y; + cData *fj = (cData *)((char *)v + (ty + spy) * pitch) + tx + spx; - cData vterm = *fj; - tx -= r; - ty -= r; - float s = 1.f / (1.f + tx * tx * tx * tx + ty * ty * ty * ty); - vterm.x += s * fx; - vterm.y += s * fy; - *fj = vterm; + cData vterm = *fj; + tx -= r; + ty -= r; + float s = 1.f / (1.f + tx * tx * tx * tx + ty * ty * ty * ty); + vterm.x += s * fx; + vterm.y += s * fy; + *fj = vterm; } // This method performs the velocity advection step, where we // trace velocity vectors back in time to update each grid cell. // That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear // interpolation in the velocity space. 
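The advection scheme described in the comment above is semi-Lagrangian: each cell reads its own velocity, traces the cell center backward by dt along that velocity, and bilinearly samples the field at the traced point. A host-side sketch of the same update, with explicit wrap addressing standing in for the kernel's texture fetch (an illustrative reimplementation under those assumptions, not code from this sample):

#include <math.h>         // floorf
#include <vector_types.h> // CUDA's float2; any struct { float x, y; } would do

// Wrap an integer index into [0, n), as cudaAddressModeWrap does.
static int wrapi(int i, int n) { return ((i % n) + n) % n; }

// Bilinear sample of a dx-by-dy float2 field at texel coordinates (px, py).
static float2 sampleBilinear(const float2 *v, int dx, int dy, float px, float py)
{
    float fx = px - 0.5f, fy = py - 0.5f; // texel centers sit at i + 0.5
    int   x0 = (int)floorf(fx), y0 = (int)floorf(fy);
    float ax = fx - x0, ay = fy - y0; // interpolation weights in [0, 1)
    float2 a = v[wrapi(y0, dy) * dx + wrapi(x0, dx)];
    float2 b = v[wrapi(y0, dy) * dx + wrapi(x0 + 1, dx)];
    float2 c = v[wrapi(y0 + 1, dy) * dx + wrapi(x0, dx)];
    float2 d = v[wrapi(y0 + 1, dy) * dx + wrapi(x0 + 1, dx)];
    float2 r;
    r.x = (1 - ay) * ((1 - ax) * a.x + ax * b.x) + ay * ((1 - ax) * c.x + ax * d.x);
    r.y = (1 - ay) * ((1 - ax) * a.y + ax * b.y) + ay * ((1 - ax) * c.y + ax * d.y);
    return r;
}

// v_new(x, y) = v_old(p), where p = (x + 0.5, y + 0.5) - dt * v_old(x, y) * (dx, dy).
static float2 advectCell(const float2 *v, int dx, int dy, int x, int y, float dt)
{
    float2 vterm = v[y * dx + x]; // velocity at this cell
    float  px    = (x + 0.5f) - dt * vterm.x * dx;
    float  py    = (y + 0.5f) - dt * vterm.y * dy;
    return sampleBilinear(v, dx, dy, px, py); // the texture unit does this filtering in hardware
}

The kernel below performs the same step per thread, but reads through a texture object so the hardware handles filtering and wrapping, and writes the results into the planar vx/vy arrays that feed the FFT.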
-__global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, float dt, int lb, - cudaTextureObject_t texObject) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void advectVelocity_k(cData *v, + float *vx, + float *vy, + int dx, + int pdx, + int dy, + float dt, + int lb, + cudaTextureObject_t texObject) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - cData vterm, ploc; - float vxterm, vyterm; + cData vterm, ploc; + float vxterm, vyterm; - // gtidx is the domain location in x for this thread - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + // gtidx is the domain location in x for this thread + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fj = fi * pdx + gtidx; - vterm = tex2D<cData>(texObject, (float)gtidx, (float)fi); - ploc.x = (gtidx + 0.5f) - (dt * vterm.x * dx); - ploc.y = (fi + 0.5f) - (dt * vterm.y * dy); - vterm = tex2D<cData>(texObject, ploc.x, ploc.y); - vxterm = vterm.x; - vyterm = vterm.y; - vx[fj] = vxterm; - vy[fj] = vyterm; - } + if (fi < dy) { + int fj = fi * pdx + gtidx; + vterm = tex2D<cData>(texObject, (float)gtidx, (float)fi); + ploc.x = (gtidx + 0.5f) - (dt * vterm.x * dx); + ploc.y = (fi + 0.5f) - (dt * vterm.y * dy); + vterm = tex2D<cData>(texObject, ploc.x, ploc.y); + vxterm = vterm.x; + vyterm = vterm.y; + vx[fj] = vxterm; + vy[fj] = vyterm; + } + } } - } } // This method performs velocity diffusion and forces mass conservation @@ -157,204 +164,197 @@ __global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, // and k is the wavenumber. The projection step forces the Fourier // velocity vectors to be orthogonal to the wave vectors for each // wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t)) * k) / k^2. -__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, - float visc, int lb) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, float visc, int lb) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - cData xterm, yterm; + cData xterm, yterm; - // gtidx is the domain location in x for this thread - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + // gtidx is the domain location in x for this thread + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fj = fi * dx + gtidx; - xterm = vx[fj]; - yterm = vy[fj]; + if (fi < dy) { + int fj = fi * dx + gtidx; + xterm = vx[fj]; + yterm = vy[fj]; - // Compute the index of the wavenumber based on the - // data order produced by a standard NN FFT. - int iix = gtidx; - int iiy = (fi > dy / 2) ? (fi - (dy)) : fi; + // Compute the index of the wavenumber based on the + // data order produced by a standard NN FFT. + int iix = gtidx; + int iiy = (fi > dy / 2) ? 
(fi - (dy)) : fi; - // Velocity diffusion - float kk = (float)(iix * iix + iiy * iiy); // k^2 - float diff = 1.f / (1.f + visc * dt * kk); - xterm.x *= diff; - xterm.y *= diff; - yterm.x *= diff; - yterm.y *= diff; + // Velocity diffusion + float kk = (float)(iix * iix + iiy * iiy); // k^2 + float diff = 1.f / (1.f + visc * dt * kk); + xterm.x *= diff; + xterm.y *= diff; + yterm.x *= diff; + yterm.y *= diff; - // Velocity projection - if (kk > 0.f) { - float rkk = 1.f / kk; - // Real portion of velocity projection - float rkp = (iix * xterm.x + iiy * yterm.x); - // Imaginary portion of velocity projection - float ikp = (iix * xterm.y + iiy * yterm.y); - xterm.x -= rkk * rkp * iix; - xterm.y -= rkk * ikp * iix; - yterm.x -= rkk * rkp * iiy; - yterm.y -= rkk * ikp * iiy; + // Velocity projection + if (kk > 0.f) { + float rkk = 1.f / kk; + // Real portion of velocity projection + float rkp = (iix * xterm.x + iiy * yterm.x); + // Imaginary portion of velocity projection + float ikp = (iix * xterm.y + iiy * yterm.y); + xterm.x -= rkk * rkp * iix; + xterm.y -= rkk * ikp * iix; + yterm.x -= rkk * rkp * iiy; + yterm.y -= rkk * ikp * iiy; + } + + vx[fj] = xterm; + vy[fj] = yterm; + } } - - vx[fj] = xterm; - vy[fj] = yterm; - } } - } } // This method updates the velocity field 'v' using the two complex // arrays from the previous step: 'vx' and 'vy'. Here we scale the // real components by 1/(dx*dy) to account for an unnormalized FFT. -__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, int lb, size_t pitch) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, int lb, size_t pitch) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - float vxterm, vyterm; - cData nvterm; + float vxterm, vyterm; + cData nvterm; - // gtidx is the domain location in x for this thread - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + // gtidx is the domain location in x for this thread + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fjr = fi * pdx + gtidx; - vxterm = vx[fjr]; - vyterm = vy[fjr]; + if (fi < dy) { + int fjr = fi * pdx + gtidx; + vxterm = vx[fjr]; + vyterm = vy[fjr]; - // Normalize the result of the inverse FFT - float scale = 1.f / (dx * dy); - nvterm.x = vxterm * scale; - nvterm.y = vyterm * scale; + // Normalize the result of the inverse FFT + float scale = 1.f / (dx * dy); + nvterm.x = vxterm * scale; + nvterm.y = vyterm * scale; - cData *fj = (cData *)((char *)v + fi * pitch) + gtidx; - *fj = nvterm; - } - } // If this thread is inside the domain in Y - } // If this thread is inside the domain in X + cData *fj = (cData *)((char *)v + fi * pitch) + gtidx; + *fj = nvterm; + } + } // If this thread is inside the domain in Y + } // If this thread is inside the domain in X } // This method updates the particles by moving particle positions // according to the velocity field and time step. That is, for each // particle: p(t+1) = p(t) + dt * v(p(t)). 
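One detail worth calling out in the particle kernel that follows: positions live in [0, 1), and the wrap-around is done with two truncation steps rather than fmodf, so slightly negative coordinates also fold back into range. The same arithmetic on the host (illustrative only, mirroring the kernel's statements):

// Wrap a coordinate into [0, 1) after an Euler step, as advectParticles_k does.
static float wrap01(float p)
{
    p = p - (int)p;    // fractional part, sign preserved: now in (-1, 1)
    p += 1.f;          // shift into (0, 2)
    return p - (int)p; // truncate again: final range [0, 1)
}

// p(t+1) = p(t) + dt * v(p(t)), component-wise, with wrap-around.
static void advectParticle(float *px, float *py, float vx, float vy, float dt)
{
    *px = wrap01(*px + dt * vx);
    *py = wrap01(*py + dt * vy);
}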
-__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, - float dt, int lb, size_t pitch) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, float dt, int lb, size_t pitch) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - // gtidx is the domain location in x for this thread - cData pterm, vterm; + // gtidx is the domain location in x for this thread + cData pterm, vterm; - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fj = fi * dx + gtidx; - pterm = part[fj]; + if (fi < dy) { + int fj = fi * dx + gtidx; + pterm = part[fj]; - int xvi = ((int)(pterm.x * dx)); - int yvi = ((int)(pterm.y * dy)); - vterm = *((cData *)((char *)v + yvi * pitch) + xvi); + int xvi = ((int)(pterm.x * dx)); + int yvi = ((int)(pterm.y * dy)); + vterm = *((cData *)((char *)v + yvi * pitch) + xvi); - pterm.x += dt * vterm.x; - pterm.x = pterm.x - (int)pterm.x; - pterm.x += 1.f; - pterm.x = pterm.x - (int)pterm.x; - pterm.y += dt * vterm.y; - pterm.y = pterm.y - (int)pterm.y; - pterm.y += 1.f; - pterm.y = pterm.y - (int)pterm.y; + pterm.x += dt * vterm.x; + pterm.x = pterm.x - (int)pterm.x; + pterm.x += 1.f; + pterm.x = pterm.x - (int)pterm.x; + pterm.y += dt * vterm.y; + pterm.y = pterm.y - (int)pterm.y; + pterm.y += 1.f; + pterm.y = pterm.y - (int)pterm.y; - part[fj] = pterm; - } - } // If this thread is inside the domain in Y - } // If this thread is inside the domain in X + part[fj] = pterm; + } + } // If this thread is inside the domain in Y + } // If this thread is inside the domain in X } // These are the external function calls necessary for launching fluid // simulation -extern "C" void addForces(cData *v, int dx, int dy, int spx, int spy, float fx, - float fy, int r) { - dim3 tids(2 * r + 1, 2 * r + 1); +extern "C" void addForces(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r) +{ + dim3 tids(2 * r + 1, 2 * r + 1); - addForces_k<<<1, tids>>>(v, dx, dy, spx, spy, fx, fy, r, tPitch); - getLastCudaError("addForces_k failed."); + addForces_k<<<1, tids>>>(v, dx, dy, spx, spy, fx, fy, r, tPitch); + getLastCudaError("addForces_k failed."); } -extern "C" void advectVelocity(cData *v, float *vx, float *vy, int dx, int pdx, - int dy, float dt) { - dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); +extern "C" void advectVelocity(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt) +{ + dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 
0 : 1)); - dim3 tids(TIDSX, TIDSY); + dim3 tids(TIDSX, TIDSY); - updateTexture(v, DIM * sizeof(cData), DIM, tPitch); - advectVelocity_k<<<grid, tids>>>(v, vx, vy, dx, pdx, dy, dt, TILEY / TIDSY, - texObj); + updateTexture(v, DIM * sizeof(cData), DIM, tPitch); + advectVelocity_k<<<grid, tids>>>(v, vx, vy, dx, pdx, dy, dt, TILEY / TIDSY, texObj); - getLastCudaError("advectVelocity_k failed."); + getLastCudaError("advectVelocity_k failed."); } -extern "C" void diffuseProject(cData *vx, cData *vy, int dx, int dy, float dt, - float visc) { - // Forward FFT - checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vx, (cufftComplex *)vx)); - checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vy, (cufftComplex *)vy)); +extern "C" void diffuseProject(cData *vx, cData *vy, int dx, int dy, float dt, float visc) +{ + // Forward FFT + checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vx, (cufftComplex *)vx)); + checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vy, (cufftComplex *)vy)); - uint3 grid = make_uint3((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1), 1); - uint3 tids = make_uint3(TIDSX, TIDSY, 1); + uint3 grid = make_uint3((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 0 : 1), 1); + uint3 tids = make_uint3(TIDSX, TIDSY, 1); - diffuseProject_k<<<grid, tids>>>(vx, vy, dx, dy, dt, visc, TILEY / TIDSY); - getLastCudaError("diffuseProject_k failed."); + diffuseProject_k<<<grid, tids>>>(vx, vy, dx, dy, dt, visc, TILEY / TIDSY); + getLastCudaError("diffuseProject_k failed."); - // Inverse FFT - checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vx, (cufftReal *)vx)); - checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vy, (cufftReal *)vy)); + // Inverse FFT + checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vx, (cufftReal *)vx)); + checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vy, (cufftReal *)vy)); } -extern "C" void updateVelocity(cData *v, float *vx, float *vy, int dx, int pdx, - int dy) { - dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); - dim3 tids(TIDSX, TIDSY); +extern "C" void updateVelocity(cData *v, float *vx, float *vy, int dx, int pdx, int dy) +{ + dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); + dim3 tids(TIDSX, TIDSY); - updateVelocity_k<<<grid, tids>>>(v, vx, vy, dx, pdx, dy, TILEY / TIDSY, - tPitch); - getLastCudaError("updateVelocity_k failed."); + updateVelocity_k<<<grid, tids>>>(v, vx, vy, dx, pdx, dy, TILEY / TIDSY, tPitch); + getLastCudaError("updateVelocity_k failed."); } -extern "C" void advectParticles(GLuint vbo, cData *v, int dx, int dy, - float dt) { - dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); - dim3 tids(TIDSX, TIDSY); +extern "C" void advectParticles(GLuint vbo, cData *v, int dx, int dy, float dt) +{ + dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 
0 : 1)); + dim3 tids(TIDSX, TIDSY); - cData *p; - cudaGraphicsMapResources(1, &cuda_vbo_resource, 0); - getLastCudaError("cudaGraphicsMapResources failed"); + cData *p; + cudaGraphicsMapResources(1, &cuda_vbo_resource, 0); + getLastCudaError("cudaGraphicsMapResources failed"); - size_t num_bytes; - cudaGraphicsResourceGetMappedPointer((void **)&p, &num_bytes, - cuda_vbo_resource); - getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); + size_t num_bytes; + cudaGraphicsResourceGetMappedPointer((void **)&p, &num_bytes, cuda_vbo_resource); + getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); - advectParticles_k<<<grid, tids>>>(p, v, dx, dy, dt, TILEY / TIDSY, tPitch); - getLastCudaError("advectParticles_k failed."); + advectParticles_k<<<grid, tids>>>(p, v, dx, dy, dt, TILEY / TIDSY, tPitch); + getLastCudaError("advectParticles_k failed."); - cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0); - getLastCudaError("cudaGraphicsUnmapResources failed"); + cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0); + getLastCudaError("cudaGraphicsUnmapResources failed"); } diff --git a/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cuh b/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cuh index 6c677ec2..e7485fe3 100644 --- a/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cuh +++ b/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.cuh @@ -39,16 +39,14 @@ void deleteTexture(void); // This method adds constant force vectors to the velocity field // stored in 'v' according to v(x,t+1) = v(x,t) + dt * f. -__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, - float fx, float fy, int r, size_t pitch); +__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch); // This method performs the velocity advection step, where we // trace velocity vectors back in time to update each grid cell. // That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear // interpolation in the velocity space. -__global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, float dt, int lb, - cudaTextureObject_t tex); +__global__ void +advectVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt, int lb, cudaTextureObject_t tex); // This method performs velocity diffusion and forces mass conservation // in the frequency domain. The inputs 'vx' and 'vy' are complex-valued @@ -58,19 +56,16 @@ __global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, // and k is the wavenumber. The projection step forces the Fourier // velocity vectors to be orthogonal to the wave vectors for each // wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t)) * k) / k^2. -__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, - float visc, int lb); +__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, float visc, int lb); // This method updates the velocity field 'v' using the two complex // arrays from the previous step: 'vx' and 'vy'. Here we scale the // real components by 1/(dx*dy) to account for an unnormalized FFT. -__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, int lb, size_t pitch); +__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, int lb, size_t pitch); // This method updates the particles by moving particle positions // according to the velocity field and time step. That is, for each // particle: p(t+1) = p(t) + dt * v(p(t)). 
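Before the header declarations continue, a note on the launch configuration used by all of the host wrappers above: the expression (dx / TILEX) + (!(dx % TILEX) ? 0 : 1) is ceiling division, i.e. enough TILEX-by-TILEY tiles to cover the domain, with each thread then looping over TILEY / TIDSY rows (the lb parameter). A more conventional spelling of the same computation (a sketch, not the sample's code):

// ceilDiv(n, tile) == (n / tile) + (n % tile == 0 ? 0 : 1)
static unsigned int ceilDiv(unsigned int n, unsigned int tile)
{
    return (n + tile - 1) / tile;
}

// For example, dx = 512 with TILEX = 64 gives 8 blocks in x; dx = 500 also
// gives 8, and the kernels' bounds checks (gtidx < dx, fi < dy) make the
// overhanging threads do nothing.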
-__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, - float dt, int lb, size_t pitch); +__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, float dt, int lb, size_t pitch); #endif diff --git a/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.h b/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.h index 412f325e..10becbfd 100644 --- a/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.h +++ b/Samples/5_Domain_Specific/fluidsGL/fluidsGL_kernels.h @@ -37,16 +37,14 @@ void deleteTexture(void); // This method adds constant force vectors to the velocity field // stored in 'v' according to v(x,t+1) = v(x,t) + dt * f. -__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, - float fx, float fy, int r, size_t pitch); +__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch); // This method performs the velocity advection step, where we // trace velocity vectors back in time to update each grid cell. // That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear // interpolation in the velocity space. -__global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, float dt, int lb, - cudaTextureObject_t tex); +__global__ void +advectVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt, int lb, cudaTextureObject_t tex); // This method performs velocity diffusion and forces mass conservation // in the frequency domain. The inputs 'vx' and 'vy' are complex-valued @@ -56,19 +54,16 @@ __global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, // and k is the wavenumber. The projection step forces the Fourier // velocity vectors to be orthogonal to the wave vectors for each // wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t)) * k) / k^2. -__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, - float visc, int lb); +__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, float visc, int lb); // This method updates the velocity field 'v' using the two complex // arrays from the previous step: 'vx' and 'vy'. Here we scale the // real components by 1/(dx*dy) to account for an unnormalized FFT. -__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, int lb, size_t pitch); +__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, int lb, size_t pitch); // This method updates the particles by moving particle positions // according to the velocity field and time step. That is, for each // particle: p(t+1) = p(t) + dt * v(p(t)). -__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, - float dt, int lb, size_t pitch); +__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, float dt, int lb, size_t pitch); #endif diff --git a/Samples/5_Domain_Specific/marchingCubes/README.md b/Samples/5_Domain_Specific/marchingCubes/README.md index e9095565..5f15eba4 100644 --- a/Samples/5_Domain_Specific/marchingCubes/README.md +++ b/Samples/5_Domain_Specific/marchingCubes/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/5_Domain_Specific/marchingCubes/defines.h b/Samples/5_Domain_Specific/marchingCubes/defines.h index 72506f3f..fd17faab 100644 --- a/Samples/5_Domain_Specific/marchingCubes/defines.h +++ b/Samples/5_Domain_Specific/marchingCubes/defines.h @@ -28,7 +28,7 @@ #ifndef _DEFINES_H_ #define _DEFINES_H_ -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; // if SAMPLE_VOLUME is 0, an implicit dataset is generated. If 1, a voxelized diff --git a/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp b/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp index 0e052750..555fdde6 100644 --- a/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp +++ b/Samples/5_Domain_Specific/marchingCubes/marchingCubes.cpp @@ -87,19 +87,17 @@ #endif // includes -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <math.h> -#include <iostream> - -#include <helper_gl.h> #include <GL/freeglut.h> -#include <helper_functions.h> -#include <cuda_gl_interop.h> - -#include <cuda_runtime.h> // includes cuda.h and cuda_runtime_api.h +#include <cuda_gl_interop.h> +#include <cuda_runtime.h> // includes cuda.h and cuda_runtime_api.h #include <helper_cuda.h> +#include <helper_functions.h> +#include <helper_gl.h> +#include <iostream> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "defines.h" @@ -110,38 +108,61 @@ #include #endif -extern "C" void launch_classifyVoxel(dim3 grid, dim3 threads, uint *voxelVerts, - uint *voxelOccupied, uchar *volume, - uint3 gridSize, uint3 gridSizeShift, - uint3 gridSizeMask, uint numVoxels, - float3 voxelSize, float isoValue); +extern "C" void launch_classifyVoxel(dim3 grid, + dim3 threads, + uint *voxelVerts, + uint *voxelOccupied, + uchar *volume, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + uint numVoxels, + float3 voxelSize, + float isoValue); -extern "C" void launch_compactVoxels(dim3 grid, dim3 threads, +extern "C" void launch_compactVoxels(dim3 grid, + dim3 threads, uint *compactedVoxelArray, uint *voxelOccupied, - uint *voxelOccupiedScan, uint numVoxels); + uint *voxelOccupiedScan, + uint numVoxels); -extern "C" void launch_generateTriangles( - dim3 grid, dim3 threads, float4 *pos, float4 *norm, - uint *compactedVoxelArray, uint *numVertsScanned, uint3 gridSize, - uint3 gridSizeShift, uint3 gridSizeMask, float3 voxelSize, float isoValue, - uint activeVoxels, uint maxVerts); +extern "C" void launch_generateTriangles(dim3 grid, + dim3 threads, + float4 *pos, + float4 *norm, + uint *compactedVoxelArray, + uint *numVertsScanned, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + float3 voxelSize, + float isoValue, + uint activeVoxels, + uint maxVerts); -extern "C" void launch_generateTriangles2( - dim3 grid, dim3 threads, float4 *pos, float4 *norm, - uint *compactedVoxelArray, uint *numVertsScanned, uchar *volume, - uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, float3 voxelSize, - float isoValue, uint activeVoxels, uint maxVerts); +extern "C" void launch_generateTriangles2(dim3 grid, + dim3 threads, + float4 *pos, + float4 *norm, + uint *compactedVoxelArray, + uint *numVertsScanned, + uchar *volume, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + float3 voxelSize, + float isoValue, + uint activeVoxels, + uint maxVerts); -extern "C" void allocateTextures(uint **d_edgeTable, uint **d_triTable, - uint **d_numVertsTable); +extern "C" void allocateTextures(uint **d_edgeTable, uint **d_triTable, uint **d_numVertsTable); extern "C" void createVolumeTexture(uchar *d_volume, size_t buffSize); extern "C" void destroyAllTextureObjects(); -extern "C" void ThrustScanWrapper(unsigned int *output, unsigned int *input, - unsigned int numElements); +extern "C" void 
ThrustScanWrapper(unsigned int *output, unsigned int *input, unsigned int numElements); // constants -const unsigned int window_width = 512; +const unsigned int window_width = 512; const unsigned int window_height = 512; const char *volumeFilename = "Bucky.raw"; @@ -152,49 +173,49 @@ uint3 gridSize; uint3 gridSizeMask; float3 voxelSize; -uint numVoxels = 0; -uint maxVerts = 0; -uint activeVoxels = 0; -uint totalVerts = 0; +uint numVoxels = 0; +uint maxVerts = 0; +uint activeVoxels = 0; +uint totalVerts = 0; -float isoValue = 0.2f; +float isoValue = 0.2f; float dIsoValue = 0.005f; // device data -GLuint posVbo, normalVbo; -GLint gl_Shader; +GLuint posVbo, normalVbo; +GLint gl_Shader; struct cudaGraphicsResource *cuda_posvbo_resource, - *cuda_normalvbo_resource; // handles OpenGL-CUDA exchange + *cuda_normalvbo_resource; // handles OpenGL-CUDA exchange float4 *d_pos = 0, *d_normal = 0; -uchar *d_volume = 0; -uint *d_voxelVerts = 0; -uint *d_voxelVertsScan = 0; -uint *d_voxelOccupied = 0; -uint *d_voxelOccupiedScan = 0; -uint *d_compVoxelArray; +uchar *d_volume = 0; +uint *d_voxelVerts = 0; +uint *d_voxelVertsScan = 0; +uint *d_voxelOccupied = 0; +uint *d_voxelOccupiedScan = 0; +uint *d_compVoxelArray; // tables uint *d_numVertsTable = 0; -uint *d_edgeTable = 0; -uint *d_triTable = 0; +uint *d_edgeTable = 0; +uint *d_triTable = 0; // mouse controls -int mouse_old_x, mouse_old_y; -int mouse_buttons = 0; -float3 rotate = make_float3(0.0, 0.0, 0.0); -float3 translate = make_float3(0.0, 0.0, -3.0); +int mouse_old_x, mouse_old_y; +int mouse_buttons = 0; +float3 rotate = make_float3(0.0, 0.0, 0.0); +float3 translate = make_float3(0.0, 0.0, -3.0); // toggles bool wireframe = false; -bool animate = true; -bool lighting = true; -bool render = true; -bool compute = true; +bool animate = true; +bool lighting = true; +bool render = true; +bool compute = true; #define MAX_EPSILON_ERROR 5.0f -#define REFRESH_DELAY 10 // ms +#define REFRESH_DELAY 10 // ms // Define the files that are to be saved and the reference images for validation const char *sOriginal[] = {"march_cubes.ppm", NULL}; @@ -204,14 +225,14 @@ const char *sReference[] = {"ref_march_cubes.ppm", NULL}; StopWatchInterface *timer = 0; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -unsigned int frameCount = 0; -bool g_bValidate = false; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +unsigned int frameCount = 0; +bool g_bValidate = false; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; // forward declarations @@ -221,8 +242,7 @@ void initMC(int argc, char **argv); void computeIsosurface(); void dumpFile(void *dData, int data_bytes, const char *file_name); -template <class T> -void dumpBuffer(T *d_buffer, int nelements, int size_element); +template <class T> void dumpBuffer(T *d_buffer, int nelements, int size_element); void cleanup(); @@ -239,336 +259,342 @@ void reshape(int w, int h); void mainMenu(int i); -#define EPSILON 5.0f +#define EPSILON 5.0f #define THRESHOLD 0.30f -void animation() { - if (animate) { - isoValue += dIsoValue; +void animation() +{ + if (animate) { + isoValue += dIsoValue; - if (isoValue < 0.1f) { - isoValue = 0.1f; - dIsoValue *= -1.0f; - } else if (isoValue > 0.9f) { - isoValue = 0.9f; - dIsoValue *= -1.0f; + if (isoValue < 0.1f) { + isoValue = 0.1f; + dIsoValue *= -1.0f; + } + else if (isoValue > 0.9f) { + isoValue 
= 0.9f; + dIsoValue *= -1.0f; + } } - } } -void timerEvent(int value) { - animation(); - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); +void timerEvent(int value) +{ + animation(); + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); } -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "CUDA Marching Cubes: %3.1f fps", ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "CUDA Marching Cubes: %3.1f fps", ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = ftoi(MAX(1.f, ifps)); - sdkResetTimer(&timer); - } + fpsLimit = ftoi(MAX(1.f, ifps)); + sdkResetTimer(&timer); + } } //////////////////////////////////////////////////////////////////////////////// // Load raw data from disk //////////////////////////////////////////////////////////////////////////////// -uchar *loadRawFile(char *filename, int size) { - FILE *fp = fopen(filename, "rb"); +uchar *loadRawFile(char *filename, int size) +{ + FILE *fp = fopen(filename, "rb"); - if (!fp) { - fprintf(stderr, "Error opening file '%s'\n", filename); - return 0; - } + if (!fp) { + fprintf(stderr, "Error opening file '%s'\n", filename); + return 0; + } - uchar *data = (uchar *)malloc(size); - size_t read = fread(data, 1, size, fp); - fclose(fp); + uchar *data = (uchar *)malloc(size); + size_t read = fread(data, 1, size, fp); + fclose(fp); - printf("Read '%s', %d bytes\n", filename, (int)read); + printf("Read '%s', %d bytes\n", filename, (int)read); - return data; + return data; } -void dumpFile(void *dData, int data_bytes, const char *file_name) { - void *hData = malloc(data_bytes); - checkCudaErrors(cudaMemcpy(hData, dData, data_bytes, cudaMemcpyDeviceToHost)); - sdkDumpBin(hData, data_bytes, file_name); - free(hData); +void dumpFile(void *dData, int data_bytes, const char *file_name) +{ + void *hData = malloc(data_bytes); + checkCudaErrors(cudaMemcpy(hData, dData, data_bytes, cudaMemcpyDeviceToHost)); + sdkDumpBin(hData, data_bytes, file_name); + free(hData); } -template <class T> -void dumpBuffer(T *d_buffer, int nelements, int size_element) { - uint bytes = nelements * size_element; - T *h_buffer = (T *)malloc(bytes); - checkCudaErrors( - cudaMemcpy(h_buffer, d_buffer, bytes, cudaMemcpyDeviceToHost)); +template <class T> void dumpBuffer(T *d_buffer, int nelements, int size_element) +{ + uint bytes = nelements * size_element; + T *h_buffer = (T *)malloc(bytes); + checkCudaErrors(cudaMemcpy(h_buffer, d_buffer, bytes, cudaMemcpyDeviceToHost)); - for (int i = 0; i < nelements; i++) { - printf("%d: %u\n", i, h_buffer[i]); - } + for (int i = 0; i < nelements; i++) { + printf("%d: %u\n", i, h_buffer[i]); + } - printf("\n"); - free(h_buffer); + printf("\n"); + free(h_buffer); } -void runAutoTest(int argc, char **argv) { - findCudaDevice(argc, (const char **)argv); +void runAutoTest(int argc, char **argv) +{ + findCudaDevice(argc, (const char **)argv); - // Initialize CUDA buffers for Marching Cubes - initMC(argc, argv); + // Initialize CUDA buffers for Marching Cubes + initMC(argc, argv); - computeIsosurface(); + computeIsosurface(); - char *ref_file = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); 
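The dump options below write out buffers produced by ThrustScanWrapper, whose declaration appears earlier in this file's hunk. In the sample this is a thin extern "C" shim over thrust::exclusive_scan on device pointers; a sketch under that assumption (not necessarily byte-for-byte the sample's implementation):

#include <thrust/device_ptr.h>
#include <thrust/scan.h>

// Exclusive scan over numElements uints in device memory:
// output[0] = 0 and output[i] = input[0] + ... + input[i-1].
// Hence the idiom used later in computeIsosurface: the total count is
// output[numElements - 1] + input[numElements - 1].
extern "C" void ThrustScanWrapper(unsigned int *output, unsigned int *input, unsigned int numElements)
{
    thrust::exclusive_scan(thrust::device_ptr<unsigned int>(input),
                           thrust::device_ptr<unsigned int>(input + numElements),
                           thrust::device_ptr<unsigned int>(output));
}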
- enum DUMP_TYPE { DUMP_POS = 0, DUMP_NORMAL, DUMP_VOXEL }; - int dump_option = getCmdLineArgumentInt(argc, (const char **)argv, "dump"); + enum DUMP_TYPE { DUMP_POS = 0, DUMP_NORMAL, DUMP_VOXEL }; + int dump_option = getCmdLineArgumentInt(argc, (const char **)argv, "dump"); - bool bTestResult = true; + bool bTestResult = true; - switch (dump_option) { + switch (dump_option) { case DUMP_POS: - dumpFile((void *)d_pos, sizeof(float4) * maxVerts, - "marchCube_posArray.bin"); - bTestResult = sdkCompareBin2BinFloat( - "marchCube_posArray.bin", "posArray.bin", - maxVerts * sizeof(float) * 4, EPSILON, THRESHOLD, argv[0]); - break; + dumpFile((void *)d_pos, sizeof(float4) * maxVerts, "marchCube_posArray.bin"); + bTestResult = sdkCompareBin2BinFloat( + "marchCube_posArray.bin", "posArray.bin", maxVerts * sizeof(float) * 4, EPSILON, THRESHOLD, argv[0]); + break; case DUMP_NORMAL: - dumpFile((void *)d_normal, sizeof(float4) * maxVerts, - "marchCube_normalArray.bin"); - bTestResult = sdkCompareBin2BinFloat( - "marchCube_normalArray.bin", "normalArray.bin", - maxVerts * sizeof(float) * 4, EPSILON, THRESHOLD, argv[0]); - break; + dumpFile((void *)d_normal, sizeof(float4) * maxVerts, "marchCube_normalArray.bin"); + bTestResult = sdkCompareBin2BinFloat( + "marchCube_normalArray.bin", "normalArray.bin", maxVerts * sizeof(float) * 4, EPSILON, THRESHOLD, argv[0]); + break; case DUMP_VOXEL: - dumpFile((void *)d_compVoxelArray, sizeof(uint) * numVoxels, - "marchCube_compVoxelArray.bin"); - bTestResult = sdkCompareBin2BinFloat( - "marchCube_compVoxelArray.bin", "compVoxelArray.bin", - numVoxels * sizeof(uint), EPSILON, THRESHOLD, argv[0]); - break; + dumpFile((void *)d_compVoxelArray, sizeof(uint) * numVoxels, "marchCube_compVoxelArray.bin"); + bTestResult = sdkCompareBin2BinFloat("marchCube_compVoxelArray.bin", + "compVoxelArray.bin", + numVoxels * sizeof(uint), + EPSILON, + THRESHOLD, + argv[0]); + break; default: - printf("Invalid validation flag!\n"); - printf("-dump=0 <check position>\n"); - printf("-dump=1 <check normal>\n"); - printf("-dump=2 <check voxel>\n"); - exit(EXIT_SUCCESS); - } + printf("Invalid validation flag!\n"); + printf("-dump=0 <check position>\n"); + printf("-dump=1 <check normal>\n"); + printf("-dump=2 <check voxel>\n"); + exit(EXIT_SUCCESS); + } - exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); + exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("[%s] - Starting...\n", argv[0]); + printf("[%s] - Starting...\n", argv[0]); - if (checkCmdLineFlag(argc, (const char **)argv, "file") && - checkCmdLineFlag(argc, (const char **)argv, "dump")) { - animate = false; - fpsLimit = frameCheckNumber; - g_bValidate = true; - runAutoTest(argc, argv); - } else { - runGraphicsTest(argc, argv); - } + if (checkCmdLineFlag(argc, (const char **)argv, "file") && checkCmdLineFlag(argc, (const char **)argv, "dump")) { + animate = false; + fpsLimit = frameCheckNumber; + g_bValidate = true; + runAutoTest(argc, argv); + } + else { + runGraphicsTest(argc, argv); + } - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// // initialize marching cubes //////////////////////////////////////////////////////////////////////////////// -void initMC(int argc, char **argv) { - // parse command line arguments - int n; +void initMC(int argc, char **argv) +{ + // parse command line arguments + int n; - if (checkCmdLineFlag(argc, (const char **)argv, "grid")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "grid"); - gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "grid")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "grid"); + gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n; + } - if (checkCmdLineFlag(argc, (const char **)argv, "gridx")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "gridx"); - gridSizeLog2.x = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "gridx")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "gridx"); + gridSizeLog2.x = n; + } - if (checkCmdLineFlag(argc, (const char **)argv, "gridy")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "gridy"); - gridSizeLog2.y = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "gridy")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "gridy"); + gridSizeLog2.y = n; + } - if (checkCmdLineFlag(argc, (const char **)argv, "gridz")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "gridz"); - gridSizeLog2.z = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "gridz")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "gridz"); + gridSizeLog2.z = n; + } - char *filename; + char *filename; - if (getCmdLineArgumentString(argc, (const char **)argv, "file", &filename)) { - volumeFilename = filename; - } + if (getCmdLineArgumentString(argc, (const char **)argv, "file", &filename)) { + volumeFilename = filename; + } - gridSize = - make_uint3(1 << gridSizeLog2.x, 1 << gridSizeLog2.y, 1 << gridSizeLog2.z); - gridSizeMask = make_uint3(gridSize.x - 1, gridSize.y - 1, gridSize.z - 1); - gridSizeShift = - make_uint3(0, gridSizeLog2.x, gridSizeLog2.x + gridSizeLog2.y); + gridSize = make_uint3(1 << gridSizeLog2.x, 1 << gridSizeLog2.y, 1 << gridSizeLog2.z); + gridSizeMask = make_uint3(gridSize.x - 1, gridSize.y - 1, gridSize.z - 1); + gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x + gridSizeLog2.y); - numVoxels = gridSize.x * gridSize.y * gridSize.z; - 
voxelSize = - make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z); - maxVerts = gridSize.x * gridSize.y * 100; + numVoxels = gridSize.x * gridSize.y * gridSize.z; + voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z); + maxVerts = gridSize.x * gridSize.y * 100; - printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, - numVoxels); - printf("max verts = %d\n", maxVerts); + printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels); + printf("max verts = %d\n", maxVerts); #if SAMPLE_VOLUME - // load volume data - char *path = sdkFindFilePath(volumeFilename, argv[0]); + // load volume data + char *path = sdkFindFilePath(volumeFilename, argv[0]); - if (path == NULL) { - fprintf(stderr, "Error finding file '%s'\n", volumeFilename); + if (path == NULL) { + fprintf(stderr, "Error finding file '%s'\n", volumeFilename); - exit(EXIT_FAILURE); - } + exit(EXIT_FAILURE); + } - int size = gridSize.x * gridSize.y * gridSize.z * sizeof(uchar); - uchar *volume = loadRawFile(path, size); - checkCudaErrors(cudaMalloc((void **)&d_volume, size)); - checkCudaErrors(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice)); - free(volume); + int size = gridSize.x * gridSize.y * gridSize.z * sizeof(uchar); + uchar *volume = loadRawFile(path, size); + checkCudaErrors(cudaMalloc((void **)&d_volume, size)); + checkCudaErrors(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice)); + free(volume); - createVolumeTexture(d_volume, size); + createVolumeTexture(d_volume, size); #endif - if (g_bValidate) { - cudaMalloc((void **)&(d_pos), maxVerts * sizeof(float) * 4); - cudaMalloc((void **)&(d_normal), maxVerts * sizeof(float) * 4); - } else { - // create VBOs - createVBO(&posVbo, maxVerts * sizeof(float) * 4); - // DEPRECATED: checkCudaErrors( cudaGLRegisterBufferObject(posVbo) ); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_posvbo_resource, posVbo, cudaGraphicsMapFlagsWriteDiscard)); + if (g_bValidate) { + cudaMalloc((void **)&(d_pos), maxVerts * sizeof(float) * 4); + cudaMalloc((void **)&(d_normal), maxVerts * sizeof(float) * 4); + } + else { + // create VBOs + createVBO(&posVbo, maxVerts * sizeof(float) * 4); + // DEPRECATED: checkCudaErrors( cudaGLRegisterBufferObject(posVbo) ); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, cudaGraphicsMapFlagsWriteDiscard)); - createVBO(&normalVbo, maxVerts * sizeof(float) * 4); - // DEPRECATED: checkCudaErrors(cudaGLRegisterBufferObject(normalVbo)); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_normalvbo_resource, normalVbo, cudaGraphicsMapFlagsWriteDiscard)); - } + createVBO(&normalVbo, maxVerts * sizeof(float) * 4); + // DEPRECATED: checkCudaErrors(cudaGLRegisterBufferObject(normalVbo)); + checkCudaErrors( + cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, cudaGraphicsMapFlagsWriteDiscard)); + } - // allocate textures - allocateTextures(&d_edgeTable, &d_triTable, &d_numVertsTable); + // allocate textures + allocateTextures(&d_edgeTable, &d_triTable, &d_numVertsTable); - // allocate device memory - unsigned int memSize = sizeof(uint) * numVoxels; - checkCudaErrors(cudaMalloc((void **)&d_voxelVerts, memSize)); - checkCudaErrors(cudaMalloc((void **)&d_voxelVertsScan, memSize)); - checkCudaErrors(cudaMalloc((void **)&d_voxelOccupied, memSize)); - checkCudaErrors(cudaMalloc((void **)&d_voxelOccupiedScan, memSize)); - checkCudaErrors(cudaMalloc((void **)&d_compVoxelArray, memSize)); + // allocate device memory + 
unsigned int memSize = sizeof(uint) * numVoxels; + checkCudaErrors(cudaMalloc((void **)&d_voxelVerts, memSize)); + checkCudaErrors(cudaMalloc((void **)&d_voxelVertsScan, memSize)); + checkCudaErrors(cudaMalloc((void **)&d_voxelOccupied, memSize)); + checkCudaErrors(cudaMalloc((void **)&d_voxelOccupiedScan, memSize)); + checkCudaErrors(cudaMalloc((void **)&d_compVoxelArray, memSize)); } -void cleanup() { - if (g_bValidate) { - cudaFree(d_pos); - cudaFree(d_normal); - } else { - sdkDeleteTimer(&timer); +void cleanup() +{ + if (g_bValidate) { + cudaFree(d_pos); + cudaFree(d_normal); + } + else { + sdkDeleteTimer(&timer); - deleteVBO(&posVbo, &cuda_posvbo_resource); - deleteVBO(&normalVbo, &cuda_normalvbo_resource); - } - destroyAllTextureObjects(); - checkCudaErrors(cudaFree(d_edgeTable)); - checkCudaErrors(cudaFree(d_triTable)); - checkCudaErrors(cudaFree(d_numVertsTable)); + deleteVBO(&posVbo, &cuda_posvbo_resource); + deleteVBO(&normalVbo, &cuda_normalvbo_resource); + } + destroyAllTextureObjects(); + checkCudaErrors(cudaFree(d_edgeTable)); + checkCudaErrors(cudaFree(d_triTable)); + checkCudaErrors(cudaFree(d_numVertsTable)); - checkCudaErrors(cudaFree(d_voxelVerts)); - checkCudaErrors(cudaFree(d_voxelVertsScan)); - checkCudaErrors(cudaFree(d_voxelOccupied)); - checkCudaErrors(cudaFree(d_voxelOccupiedScan)); - checkCudaErrors(cudaFree(d_compVoxelArray)); + checkCudaErrors(cudaFree(d_voxelVerts)); + checkCudaErrors(cudaFree(d_voxelVertsScan)); + checkCudaErrors(cudaFree(d_voxelOccupied)); + checkCudaErrors(cudaFree(d_voxelOccupiedScan)); + checkCudaErrors(cudaFree(d_compVoxelArray)); - if (d_volume) { - checkCudaErrors(cudaFree(d_volume)); - } + if (d_volume) { + checkCudaErrors(cudaFree(d_volume)); + } } -void initMenus() { - glutCreateMenu(mainMenu); - glutAddMenuEntry("Toggle animation [ ]", ' '); - glutAddMenuEntry("Increment isovalue [+]", '+'); - glutAddMenuEntry("Decrement isovalue [-]", '-'); - glutAddMenuEntry("Toggle computation [c]", 'c'); - glutAddMenuEntry("Toggle rendering [r]", 'r'); - glutAddMenuEntry("Toggle lighting [l]", 'l'); - glutAddMenuEntry("Toggle wireframe [w]", 'w'); - glutAddMenuEntry("Quit (esc)", '\033'); - glutAttachMenu(GLUT_RIGHT_BUTTON); +void initMenus() +{ + glutCreateMenu(mainMenu); + glutAddMenuEntry("Toggle animation [ ]", ' '); + glutAddMenuEntry("Increment isovalue [+]", '+'); + glutAddMenuEntry("Decrement isovalue [-]", '-'); + glutAddMenuEntry("Toggle computation [c]", 'c'); + glutAddMenuEntry("Toggle rendering [r]", 'r'); + glutAddMenuEntry("Toggle lighting [l]", 'l'); + glutAddMenuEntry("Toggle wireframe [w]", 'w'); + glutAddMenuEntry("Quit (esc)", '\033'); + glutAttachMenu(GLUT_RIGHT_BUTTON); } -void runGraphicsTest(int argc, char **argv) { - printf("MarchingCubes\n"); +void runGraphicsTest(int argc, char **argv) +{ + printf("MarchingCubes\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n in OpenGL mode\n"); - printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); - printf(" > %s -device=n -file= -dump=<0/1/2>\n", argv[0]); - exit(EXIT_SUCCESS); - } + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n in OpenGL mode\n"); + printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); + printf(" > %s -device=n -file= -dump=<0/1/2>\n", argv[0]); + exit(EXIT_SUCCESS); + } - // First initialize OpenGL context, so we can properly set the GL 
for CUDA. - // This is necessary in order to achieve optimal performance with OpenGL/CUDA - // interop. - if (false == initGL(&argc, argv)) { - return; - } + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA + // interop. + if (false == initGL(&argc, argv)) { + return; + } - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - // register callbacks - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutMouseFunc(mouse); - glutMotionFunc(motion); - glutReshapeFunc(reshape); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - initMenus(); + // register callbacks + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutReshapeFunc(reshape); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + initMenus(); - // Initialize CUDA buffers for Marching Cubes - initMC(argc, argv); + // Initialize CUDA buffers for Marching Cubes + initMC(argc, argv); - sdkCreateTimer(&timer); + sdkCreateTimer(&timer); - // start rendering mainloop - glutMainLoop(); + // start rendering mainloop + glutMainLoop(); } #define DEBUG_BUFFERS 0 @@ -576,401 +602,436 @@ void runGraphicsTest(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void computeIsosurface() { - int threads = 128; - dim3 grid(numVoxels / threads, 1, 1); +void computeIsosurface() +{ + int threads = 128; + dim3 grid(numVoxels / threads, 1, 1); - // get around maximum grid size of 65535 in each dimension - if (grid.x > 65535) { - grid.y = grid.x / 32768; - grid.x = 32768; - } + // get around maximum grid size of 65535 in each dimension + if (grid.x > 65535) { + grid.y = grid.x / 32768; + grid.x = 32768; + } - // calculate number of vertices needed per voxel - launch_classifyVoxel(grid, threads, d_voxelVerts, d_voxelOccupied, d_volume, - gridSize, gridSizeShift, gridSizeMask, numVoxels, - voxelSize, isoValue); + // calculate number of vertices needed per voxel + launch_classifyVoxel(grid, + threads, + d_voxelVerts, + d_voxelOccupied, + d_volume, + gridSize, + gridSizeShift, + gridSizeMask, + numVoxels, + voxelSize, + isoValue); #if DEBUG_BUFFERS - printf("voxelVerts:\n"); - dumpBuffer(d_voxelVerts, numVoxels, sizeof(uint)); + printf("voxelVerts:\n"); + dumpBuffer(d_voxelVerts, numVoxels, sizeof(uint)); #endif #if SKIP_EMPTY_VOXELS - // scan voxel occupied array - ThrustScanWrapper(d_voxelOccupiedScan, d_voxelOccupied, numVoxels); + // scan voxel occupied array + ThrustScanWrapper(d_voxelOccupiedScan, d_voxelOccupied, numVoxels); #if DEBUG_BUFFERS - printf("voxelOccupiedScan:\n"); - dumpBuffer(d_voxelOccupiedScan, numVoxels, sizeof(uint)); + printf("voxelOccupiedScan:\n"); + dumpBuffer(d_voxelOccupiedScan, numVoxels, sizeof(uint)); #endif - // read back values to calculate total number of non-empty voxels - // since we are using an exclusive scan, the total is the last value of - // the scan result plus the last value in the input array - { - uint lastElement, lastScanElement; - checkCudaErrors(cudaMemcpy((void *)&lastElement, - (void *)(d_voxelOccupied + numVoxels - 1), - sizeof(uint), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy((void *)&lastScanElement, - (void *)(d_voxelOccupiedScan + numVoxels - 1), - sizeof(uint), cudaMemcpyDeviceToHost)); - activeVoxels = lastElement + 
lastScanElement; - } + // read back values to calculate total number of non-empty voxels + // since we are using an exclusive scan, the total is the last value of + // the scan result plus the last value in the input array + { + uint lastElement, lastScanElement; + checkCudaErrors(cudaMemcpy( + (void *)&lastElement, (void *)(d_voxelOccupied + numVoxels - 1), sizeof(uint), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy((void *)&lastScanElement, + (void *)(d_voxelOccupiedScan + numVoxels - 1), + sizeof(uint), + cudaMemcpyDeviceToHost)); + activeVoxels = lastElement + lastScanElement; + } - if (activeVoxels == 0) { - // return if there are no full voxels - totalVerts = 0; - return; - } + if (activeVoxels == 0) { + // return if there are no full voxels + totalVerts = 0; + return; + } - // compact voxel index array - launch_compactVoxels(grid, threads, d_compVoxelArray, d_voxelOccupied, - d_voxelOccupiedScan, numVoxels); - getLastCudaError("compactVoxels failed"); + // compact voxel index array + launch_compactVoxels(grid, threads, d_compVoxelArray, d_voxelOccupied, d_voxelOccupiedScan, numVoxels); + getLastCudaError("compactVoxels failed"); -#endif // SKIP_EMPTY_VOXELS +#endif // SKIP_EMPTY_VOXELS - // scan voxel vertex count array - ThrustScanWrapper(d_voxelVertsScan, d_voxelVerts, numVoxels); + // scan voxel vertex count array + ThrustScanWrapper(d_voxelVertsScan, d_voxelVerts, numVoxels); #if DEBUG_BUFFERS - printf("voxelVertsScan:\n"); - dumpBuffer(d_voxelVertsScan, numVoxels, sizeof(uint)); + printf("voxelVertsScan:\n"); + dumpBuffer(d_voxelVertsScan, numVoxels, sizeof(uint)); #endif - // readback total number of vertices - { - uint lastElement, lastScanElement; - checkCudaErrors(cudaMemcpy((void *)&lastElement, - (void *)(d_voxelVerts + numVoxels - 1), - sizeof(uint), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy((void *)&lastScanElement, - (void *)(d_voxelVertsScan + numVoxels - 1), - sizeof(uint), cudaMemcpyDeviceToHost)); - totalVerts = lastElement + lastScanElement; - } + // readback total number of vertices + { + uint lastElement, lastScanElement; + checkCudaErrors(cudaMemcpy( + (void *)&lastElement, (void *)(d_voxelVerts + numVoxels - 1), sizeof(uint), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy((void *)&lastScanElement, + (void *)(d_voxelVertsScan + numVoxels - 1), + sizeof(uint), + cudaMemcpyDeviceToHost)); + totalVerts = lastElement + lastScanElement; + } - // generate triangles, writing to vertex buffers - if (!g_bValidate) { - size_t num_bytes; - // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_pos, - // posVbo)); - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_posvbo_resource, 0)); - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_pos, &num_bytes, cuda_posvbo_resource)); + // generate triangles, writing to vertex buffers + if (!g_bValidate) { + size_t num_bytes; + // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_pos, + // posVbo)); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_posvbo_resource, 0)); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_pos, &num_bytes, cuda_posvbo_resource)); - // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_normal, - // normalVbo)); - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_normalvbo_resource, 0)); - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_normal, &num_bytes, cuda_normalvbo_resource)); - } + // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_normal, + // normalVbo)); + 
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_normalvbo_resource, 0)); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_normal, &num_bytes, cuda_normalvbo_resource)); + } #if SKIP_EMPTY_VOXELS - dim3 grid2((int)ceil(activeVoxels / (float)NTHREADS), 1, 1); + dim3 grid2((int)ceil(activeVoxels / (float)NTHREADS), 1, 1); #else - dim3 grid2((int)ceil(numVoxels / (float)NTHREADS), 1, 1); + dim3 grid2((int)ceil(numVoxels / (float)NTHREADS), 1, 1); #endif - while (grid2.x > 65535) { - grid2.x /= 2; - grid2.y *= 2; - } + while (grid2.x > 65535) { + grid2.x /= 2; + grid2.y *= 2; + } #if SAMPLE_VOLUME - launch_generateTriangles2(grid2, NTHREADS, d_pos, d_normal, d_compVoxelArray, - d_voxelVertsScan, d_volume, gridSize, gridSizeShift, - gridSizeMask, voxelSize, isoValue, activeVoxels, - maxVerts); + launch_generateTriangles2(grid2, + NTHREADS, + d_pos, + d_normal, + d_compVoxelArray, + d_voxelVertsScan, + d_volume, + gridSize, + gridSizeShift, + gridSizeMask, + voxelSize, + isoValue, + activeVoxels, + maxVerts); #else - launch_generateTriangles(grid2, NTHREADS, d_pos, d_normal, d_compVoxelArray, - d_voxelVertsScan, gridSize, gridSizeShift, - gridSizeMask, voxelSize, isoValue, activeVoxels, - maxVerts); + launch_generateTriangles(grid2, + NTHREADS, + d_pos, + d_normal, + d_compVoxelArray, + d_voxelVertsScan, + gridSize, + gridSizeShift, + gridSizeMask, + voxelSize, + isoValue, + activeVoxels, + maxVerts); #endif - if (!g_bValidate) { - // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(normalVbo)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_normalvbo_resource, 0)); - // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(posVbo)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_posvbo_resource, 0)); - } + if (!g_bValidate) { + // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(normalVbo)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_normalvbo_resource, 0)); + // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(posVbo)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_posvbo_resource, 0)); + } } // shader for displaying floating-point texture -static const char *shader_code = - "!!ARBfp1.0\n" - "TEX result.color, fragment.texcoord, texture[0], 2D; \n" - "END"; +static const char *shader_code = "!!ARBfp1.0\n" + "TEX result.color, fragment.texcoord, texture[0], 2D; \n" + "END"; -GLuint compileASMShader(GLenum program_type, const char *code) { - GLuint program_id; - glGenProgramsARB(1, &program_id); - glBindProgramARB(program_type, program_id); - glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, - (GLsizei)strlen(code), (GLubyte *)code); +GLuint compileASMShader(GLenum program_type, const char *code) +{ + GLuint program_id; + glGenProgramsARB(1, &program_id); + glBindProgramARB(program_type, program_id); + glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); - GLint error_pos; - glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); + GLint error_pos; + glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); - if (error_pos != -1) { - const GLubyte *error_string; - error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); - fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, - error_string); - return 0; - } + if (error_pos != -1) { + const GLubyte *error_string; + error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); + fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string); + return 0; + } - return program_id; + return 
program_id; } //////////////////////////////////////////////////////////////////////////////// //! Initialize OpenGL //////////////////////////////////////////////////////////////////////////////// -bool initGL(int *argc, char **argv) { - // Create GL context - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); - glutInitWindowSize(window_width, window_height); - glutCreateWindow("CUDA Marching Cubes"); +bool initGL(int *argc, char **argv) +{ + // Create GL context + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); + glutInitWindowSize(window_width, window_height); + glutCreateWindow("CUDA Marching Cubes"); - if (!isGLVersionSupported(2, 0)) { - fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); - fflush(stderr); - return false; - } + if (!isGLVersionSupported(2, 0)) { + fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); + fflush(stderr); + return false; + } - // default initialization - glClearColor(0.1f, 0.2f, 0.3f, 1.0f); - glEnable(GL_DEPTH_TEST); + // default initialization + glClearColor(0.1f, 0.2f, 0.3f, 1.0f); + glEnable(GL_DEPTH_TEST); - // good old-fashioned fixed function lighting - float black[] = {0.0f, 0.0f, 0.0f, 1.0f}; - float white[] = {1.0f, 1.0f, 1.0f, 1.0f}; - float ambient[] = {0.1f, 0.1f, 0.1f, 1.0f}; - float diffuse[] = {0.9f, 0.9f, 0.9f, 1.0f}; - float lightPos[] = {0.0f, 0.0f, 1.0f, 0.0f}; + // good old-fashioned fixed function lighting + float black[] = {0.0f, 0.0f, 0.0f, 1.0f}; + float white[] = {1.0f, 1.0f, 1.0f, 1.0f}; + float ambient[] = {0.1f, 0.1f, 0.1f, 1.0f}; + float diffuse[] = {0.9f, 0.9f, 0.9f, 1.0f}; + float lightPos[] = {0.0f, 0.0f, 1.0f, 0.0f}; - glMaterialfv(GL_FRONT_AND_BACK, GL_AMBIENT, ambient); - glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, diffuse); - glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, black); + glMaterialfv(GL_FRONT_AND_BACK, GL_AMBIENT, ambient); + glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, diffuse); + glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, black); - glLightfv(GL_LIGHT0, GL_AMBIENT, white); - glLightfv(GL_LIGHT0, GL_DIFFUSE, white); - glLightfv(GL_LIGHT0, GL_SPECULAR, white); - glLightfv(GL_LIGHT0, GL_POSITION, lightPos); + glLightfv(GL_LIGHT0, GL_AMBIENT, white); + glLightfv(GL_LIGHT0, GL_DIFFUSE, white); + glLightfv(GL_LIGHT0, GL_SPECULAR, white); + glLightfv(GL_LIGHT0, GL_POSITION, lightPos); - glLightModelfv(GL_LIGHT_MODEL_AMBIENT, black); + glLightModelfv(GL_LIGHT_MODEL_AMBIENT, black); - glEnable(GL_LIGHT0); - glEnable(GL_NORMALIZE); + glEnable(GL_LIGHT0); + glEnable(GL_NORMALIZE); - // load shader program - gl_Shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); + // load shader program + gl_Shader = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shader_code); - glutReportErrors(); + glutReportErrors(); - return true; + return true; } //////////////////////////////////////////////////////////////////////////////// //! 
Create VBO //////////////////////////////////////////////////////////////////////////////// -void createVBO(GLuint *vbo, unsigned int size) { - // create buffer object - glGenBuffers(1, vbo); - glBindBuffer(GL_ARRAY_BUFFER, *vbo); +void createVBO(GLuint *vbo, unsigned int size) +{ + // create buffer object + glGenBuffers(1, vbo); + glBindBuffer(GL_ARRAY_BUFFER, *vbo); - // initialize buffer object - glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); + // initialize buffer object + glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); - glutReportErrors(); + glutReportErrors(); } //////////////////////////////////////////////////////////////////////////////// //! Delete VBO //////////////////////////////////////////////////////////////////////////////// -void deleteVBO(GLuint *vbo, struct cudaGraphicsResource **cuda_resource) { - glBindBuffer(1, *vbo); - glDeleteBuffers(1, vbo); - // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(*vbo)); - cudaGraphicsUnregisterResource(*cuda_resource); +void deleteVBO(GLuint *vbo, struct cudaGraphicsResource **cuda_resource) +{ + glBindBuffer(1, *vbo); + glDeleteBuffers(1, vbo); + // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(*vbo)); + cudaGraphicsUnregisterResource(*cuda_resource); - *vbo = 0; + *vbo = 0; } //////////////////////////////////////////////////////////////////////////////// // Render isosurface geometry from the vertex buffers //////////////////////////////////////////////////////////////////////////////// -void renderIsosurface() { - glBindBuffer(GL_ARRAY_BUFFER, posVbo); - glVertexPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_VERTEX_ARRAY); +void renderIsosurface() +{ + glBindBuffer(GL_ARRAY_BUFFER, posVbo); + glVertexPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_VERTEX_ARRAY); - glBindBuffer(GL_ARRAY_BUFFER, normalVbo); - glNormalPointer(GL_FLOAT, sizeof(float) * 4, 0); - glEnableClientState(GL_NORMAL_ARRAY); + glBindBuffer(GL_ARRAY_BUFFER, normalVbo); + glNormalPointer(GL_FLOAT, sizeof(float) * 4, 0); + glEnableClientState(GL_NORMAL_ARRAY); - glColor3f(1.0, 0.0, 0.0); - glDrawArrays(GL_TRIANGLES, 0, totalVerts); - glDisableClientState(GL_VERTEX_ARRAY); - glDisableClientState(GL_NORMAL_ARRAY); + glColor3f(1.0, 0.0, 0.0); + glDrawArrays(GL_TRIANGLES, 0, totalVerts); + glDisableClientState(GL_VERTEX_ARRAY); + glDisableClientState(GL_NORMAL_ARRAY); - glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindBuffer(GL_ARRAY_BUFFER, 0); } //////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // run CUDA kernel to generate geometry - if (compute) { - computeIsosurface(); - } - - // Common display code path - { - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - - // set view matrix - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - glTranslatef(translate.x, translate.y, translate.z); - glRotatef(rotate.x, 1.0, 0.0, 0.0); - glRotatef(rotate.y, 0.0, 1.0, 0.0); - - glPolygonMode(GL_FRONT_AND_BACK, wireframe ? 
GL_LINE : GL_FILL); - - if (lighting) { - glEnable(GL_LIGHTING); + // run CUDA kernel to generate geometry + if (compute) { + computeIsosurface(); } - // render - if (render) { - glPushMatrix(); - glRotatef(180.0, 0.0, 1.0, 0.0); - glRotatef(90.0, 1.0, 0.0, 0.0); - renderIsosurface(); - glPopMatrix(); + // Common display code path + { + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + // set view matrix + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + glTranslatef(translate.x, translate.y, translate.z); + glRotatef(rotate.x, 1.0, 0.0, 0.0); + glRotatef(rotate.y, 0.0, 1.0, 0.0); + + glPolygonMode(GL_FRONT_AND_BACK, wireframe ? GL_LINE : GL_FILL); + + if (lighting) { + glEnable(GL_LIGHTING); + } + + // render + if (render) { + glPushMatrix(); + glRotatef(180.0, 0.0, 1.0, 0.0); + glRotatef(90.0, 1.0, 0.0, 0.0); + renderIsosurface(); + glPopMatrix(); + } + + glDisable(GL_LIGHTING); } - glDisable(GL_LIGHTING); - } + glutSwapBuffers(); + glutReportErrors(); - glutSwapBuffers(); - glutReportErrors(); + sdkStopTimer(&timer); - sdkStopTimer(&timer); - - computeFPS(); + computeFPS(); } //////////////////////////////////////////////////////////////////////////////// //! Keyboard events handler //////////////////////////////////////////////////////////////////////////////// -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case (27): - cleanup(); - exit(EXIT_SUCCESS); + cleanup(); + exit(EXIT_SUCCESS); case '=': - isoValue += 0.01f; - break; + isoValue += 0.01f; + break; case '-': - isoValue -= 0.01f; - break; + isoValue -= 0.01f; + break; case '+': - isoValue += 0.1f; - break; + isoValue += 0.1f; + break; case '_': - isoValue -= 0.1f; - break; + isoValue -= 0.1f; + break; case 'w': - wireframe = !wireframe; - break; + wireframe = !wireframe; + break; case ' ': - animate = !animate; - break; + animate = !animate; + break; case 'l': - lighting = !lighting; - break; + lighting = !lighting; + break; case 'r': - render = !render; - break; + render = !render; + break; case 'c': - compute = !compute; - break; - } + compute = !compute; + break; + } - printf("isoValue = %f\n", isoValue); - printf("voxels = %d\n", activeVoxels); - printf("verts = %d\n", totalVerts); - printf("occupancy: %d / %d = %.2f%%\n", activeVoxels, numVoxels, - activeVoxels * 100.0f / (float)numVoxels); + printf("isoValue = %f\n", isoValue); + printf("voxels = %d\n", activeVoxels); + printf("verts = %d\n", totalVerts); + printf("occupancy: %d / %d = %.2f%%\n", activeVoxels, numVoxels, activeVoxels * 100.0f / (float)numVoxels); - if (!compute) { - computeIsosurface(); - } + if (!compute) { + computeIsosurface(); + } } //////////////////////////////////////////////////////////////////////////////// //! 
Mouse event handlers //////////////////////////////////////////////////////////////////////////////// -void mouse(int button, int state, int x, int y) { - if (state == GLUT_DOWN) { - mouse_buttons |= 1 << button; - } else if (state == GLUT_UP) { - mouse_buttons = 0; - } +void mouse(int button, int state, int x, int y) +{ + if (state == GLUT_DOWN) { + mouse_buttons |= 1 << button; + } + else if (state == GLUT_UP) { + mouse_buttons = 0; + } - mouse_old_x = x; - mouse_old_y = y; + mouse_old_x = x; + mouse_old_y = y; } -void motion(int x, int y) { - float dx = (float)(x - mouse_old_x); - float dy = (float)(y - mouse_old_y); +void motion(int x, int y) +{ + float dx = (float)(x - mouse_old_x); + float dy = (float)(y - mouse_old_y); - if (mouse_buttons == 1) { - rotate.x += dy * 0.2f; - rotate.y += dx * 0.2f; - } else if (mouse_buttons == 2) { - translate.x += dx * 0.01f; - translate.y -= dy * 0.01f; - } else if (mouse_buttons == 3) { - translate.z += dy * 0.01f; - } + if (mouse_buttons == 1) { + rotate.x += dy * 0.2f; + rotate.y += dx * 0.2f; + } + else if (mouse_buttons == 2) { + translate.x += dx * 0.01f; + translate.y -= dy * 0.01f; + } + else if (mouse_buttons == 3) { + translate.z += dy * 0.01f; + } - mouse_old_x = x; - mouse_old_y = y; - glutPostRedisplay(); + mouse_old_x = x; + mouse_old_y = y; + glutPostRedisplay(); } -void idle() { - animation(); - glutPostRedisplay(); +void idle() +{ + animation(); + glutPostRedisplay(); } -void reshape(int w, int h) { - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (float)w / (float)h, 0.1, 10.0); +void reshape(int w, int h) +{ + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (float)w / (float)h, 0.1, 10.0); - glMatrixMode(GL_MODELVIEW); - glViewport(0, 0, w, h); + glMatrixMode(GL_MODELVIEW); + glViewport(0, 0, w, h); } void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); } diff --git a/Samples/5_Domain_Specific/marchingCubes/marchingCubes_kernel.cu b/Samples/5_Domain_Specific/marchingCubes/marchingCubes_kernel.cu index cf633b77..6387c50b 100644 --- a/Samples/5_Domain_Specific/marchingCubes/marchingCubes_kernel.cu +++ b/Samples/5_Domain_Specific/marchingCubes/marchingCubes_kernel.cu @@ -28,11 +28,11 @@ #ifndef _MARCHING_CUBES_KERNEL_CU_ #define _MARCHING_CUBES_KERNEL_CU_ +#include <cuda_runtime_api.h> +#include <helper_cuda.h> // includes for helper CUDA functions +#include <helper_math.h> #include <stdio.h> #include <string.h> -#include <helper_cuda.h> // includes for helper CUDA functions -#include <helper_math.h> -#include <cuda_runtime_api.h> #include "defines.h" #include "tables.h" @@ -46,93 +46,90 @@ cudaTextureObject_t numVertsTex; // volume data cudaTextureObject_t volumeTex; -extern "C" void allocateTextures(uint **d_edgeTable, uint **d_triTable, - uint **d_numVertsTable) { - checkCudaErrors(cudaMalloc((void **)d_edgeTable, 256 * sizeof(uint))); - checkCudaErrors(cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable, - 256 * sizeof(uint), cudaMemcpyHostToDevice)); - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned); +extern "C" void allocateTextures(uint **d_edgeTable, uint **d_triTable, uint **d_numVertsTable) +{ + checkCudaErrors(cudaMalloc((void **)d_edgeTable, 256 * sizeof(uint))); + checkCudaErrors(cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable, 256 * sizeof(uint), cudaMemcpyHostToDevice)); + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned); - checkCudaErrors(cudaMalloc((void **)d_triTable, 256 * 16 * sizeof(uint))); - checkCudaErrors(cudaMemcpy((void *)*d_triTable, (void *)triTable, - 256 * 16 * sizeof(uint),
cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)d_triTable, 256 * 16 * sizeof(uint))); + checkCudaErrors(cudaMemcpy((void *)*d_triTable, (void *)triTable, 256 * 16 * sizeof(uint), cudaMemcpyHostToDevice)); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = *d_triTable; - texRes.res.linear.sizeInBytes = 256 * 16 * sizeof(uint); - texRes.res.linear.desc = channelDesc; + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = *d_triTable; + texRes.res.linear.sizeInBytes = 256 * 16 * sizeof(uint); + texRes.res.linear.desc = channelDesc; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&triTex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&triTex, &texRes, &texDescr, NULL)); - checkCudaErrors(cudaMalloc((void **)d_numVertsTable, 256 * sizeof(uint))); - checkCudaErrors(cudaMemcpy((void *)*d_numVertsTable, (void *)numVertsTable, - 256 * sizeof(uint), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc((void **)d_numVertsTable, 256 * sizeof(uint))); + checkCudaErrors( + cudaMemcpy((void *)*d_numVertsTable, (void *)numVertsTable, 256 * sizeof(uint), cudaMemcpyHostToDevice)); - memset(&texRes, 0, sizeof(cudaResourceDesc)); + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = *d_numVertsTable; - texRes.res.linear.sizeInBytes = 256 * sizeof(uint); - texRes.res.linear.desc = channelDesc; + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = *d_numVertsTable; + texRes.res.linear.sizeInBytes = 256 * sizeof(uint); + texRes.res.linear.desc = channelDesc; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&numVertsTex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&numVertsTex, &texRes, &texDescr, NULL)); } -extern "C" void createVolumeTexture(uchar *d_volume, size_t buffSize) { - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); +extern "C" void createVolumeTexture(uchar *d_volume, size_t buffSize) +{ + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeLinear; - texRes.res.linear.devPtr = d_volume; - texRes.res.linear.sizeInBytes = buffSize; - texRes.res.linear.desc = - cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned); + texRes.resType = cudaResourceTypeLinear; + texRes.res.linear.devPtr = d_volume; + texRes.res.linear.sizeInBytes = buffSize; 
+ texRes.res.linear.desc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned); - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeNormalizedFloat; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeNormalizedFloat; - checkCudaErrors( - cudaCreateTextureObject(&volumeTex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&volumeTex, &texRes, &texDescr, NULL)); } -extern "C" void destroyAllTextureObjects() { - checkCudaErrors(cudaDestroyTextureObject(triTex)); - checkCudaErrors(cudaDestroyTextureObject(numVertsTex)); - checkCudaErrors(cudaDestroyTextureObject(volumeTex)); +extern "C" void destroyAllTextureObjects() +{ + checkCudaErrors(cudaDestroyTextureObject(triTex)); + checkCudaErrors(cudaDestroyTextureObject(numVertsTex)); + checkCudaErrors(cudaDestroyTextureObject(volumeTex)); } // an interesting field function -__device__ float tangle(float x, float y, float z) { - x *= 3.0f; - y *= 3.0f; - z *= 3.0f; - return (x * x * x * x - 5.0f * x * x + y * y * y * y - 5.0f * y * y + - z * z * z * z - 5.0f * z * z + 11.8f) * 0.2f + 0.5f; +__device__ float tangle(float x, float y, float z) +{ + x *= 3.0f; + y *= 3.0f; + z *= 3.0f; + return (x * x * x * x - 5.0f * x * x + y * y * y * y - 5.0f * y * y + z * z * z * z - 5.0f * z * z + 11.8f) * 0.2f + + 0.5f; } // evaluate field function at point @@ -140,528 +137,611 @@ __device__ float fieldFunc(float3 p) { return tangle(p.x, p.y, p.z); } // evaluate field function at a point // returns value and gradient in float4 -__device__ float4 fieldFunc4(float3 p) { - float v = tangle(p.x, p.y, p.z); - const float d = 0.001f; - float dx = tangle(p.x + d, p.y, p.z) - v; - float dy = tangle(p.x, p.y + d, p.z) - v; - float dz = tangle(p.x, p.y, p.z + d) - v; - return make_float4(dx, dy, dz, v); +__device__ float4 fieldFunc4(float3 p) +{ + float v = tangle(p.x, p.y, p.z); + const float d = 0.001f; + float dx = tangle(p.x + d, p.y, p.z) - v; + float dy = tangle(p.x, p.y + d, p.z) - v; + float dz = tangle(p.x, p.y, p.z + d) - v; + return make_float4(dx, dy, dz, v); } // sample volume data set at a point -__device__ float sampleVolume(cudaTextureObject_t volumeTex, uchar *data, - uint3 p, uint3 gridSize) { - p.x = min(p.x, gridSize.x - 1); - p.y = min(p.y, gridSize.y - 1); - p.z = min(p.z, gridSize.z - 1); - uint i = (p.z * gridSize.x * gridSize.y) + (p.y * gridSize.x) + p.x; - // return (float) data[i] / 255.0f; - return tex1Dfetch<float>(volumeTex, i); +__device__ float sampleVolume(cudaTextureObject_t volumeTex, uchar *data, uint3 p, uint3 gridSize) +{ + p.x = min(p.x, gridSize.x - 1); + p.y = min(p.y, gridSize.y - 1); + p.z = min(p.z, gridSize.z - 1); + uint i = (p.z * gridSize.x * gridSize.y) + (p.y * gridSize.x) + p.x; + // return (float) data[i] / 255.0f; + return tex1Dfetch<float>(volumeTex, i); } // compute position in 3d grid from 1d index // only works for power of 2 sizes -__device__ uint3 calcGridPos(uint i, uint3 gridSizeShift, uint3 gridSizeMask) { - uint3 gridPos; - gridPos.x = i & gridSizeMask.x; - gridPos.y = (i >> gridSizeShift.y) & gridSizeMask.y; - gridPos.z = (i >> gridSizeShift.z) & gridSizeMask.z; - return gridPos;
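The texture plumbing above (allocateTextures(), createVolumeTexture()) repeats one recipe: describe a linear device buffer with a cudaResourceDesc, pick a read mode with a cudaTextureDesc, and create a cudaTextureObject_t that kernels read via tex1Dfetch<T>(), as sampleVolume() does. A condensed sketch of the same recipe (function name and the uint payload are placeholders):

cudaTextureObject_t makeLinearTexObj(uint *d_buf, size_t numElems)
{
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType                = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr      = d_buf;
    resDesc.res.linear.sizeInBytes = numElems * sizeof(uint);
    resDesc.res.linear.desc        = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType; // cudaReadModeNormalizedFloat instead rescales uchar data to [0,1]

    cudaTextureObject_t tex = 0;
    checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    return tex; // device side: uint v = tex1Dfetch<uint>(tex, i);
}

+__device__ uint3 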
calcGridPos(uint i, uint3 gridSizeShift, uint3 gridSizeMask) +{ + uint3 gridPos; + gridPos.x = i & gridSizeMask.x; + gridPos.y = (i >> gridSizeShift.y) & gridSizeMask.y; + gridPos.z = (i >> gridSizeShift.z) & gridSizeMask.z; + return gridPos; } // classify voxel based on number of vertices it will generate // one thread per voxel -__global__ void classifyVoxel(uint *voxelVerts, uint *voxelOccupied, - uchar *volume, uint3 gridSize, - uint3 gridSizeShift, uint3 gridSizeMask, - uint numVoxels, float3 voxelSize, float isoValue, +__global__ void classifyVoxel(uint *voxelVerts, + uint *voxelOccupied, + uchar *volume, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + uint numVoxels, + float3 voxelSize, + float isoValue, cudaTextureObject_t numVertsTex, - cudaTextureObject_t volumeTex) { - uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; - uint i = __mul24(blockId, blockDim.x) + threadIdx.x; + cudaTextureObject_t volumeTex) +{ + uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; + uint i = __mul24(blockId, blockDim.x) + threadIdx.x; - uint3 gridPos = calcGridPos(i, gridSizeShift, gridSizeMask); + uint3 gridPos = calcGridPos(i, gridSizeShift, gridSizeMask); // read field values at neighbouring grid vertices #if SAMPLE_VOLUME - float field[8]; - field[0] = sampleVolume(volumeTex, volume, gridPos, gridSize); - field[1] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 0), gridSize); - field[2] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 0), gridSize); - field[3] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 0), gridSize); - field[4] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 0, 1), gridSize); - field[5] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 1), gridSize); - field[6] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 1), gridSize); - field[7] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 1), gridSize); + float field[8]; + field[0] = sampleVolume(volumeTex, volume, gridPos, gridSize); + field[1] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 0), gridSize); + field[2] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 0), gridSize); + field[3] = sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 0), gridSize); + field[4] = sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 0, 1), gridSize); + field[5] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 1), gridSize); + field[6] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 1), gridSize); + field[7] = sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 1), gridSize); #else - float3 p; - p.x = -1.0f + (gridPos.x * voxelSize.x); - p.y = -1.0f + (gridPos.y * voxelSize.y); - p.z = -1.0f + (gridPos.z * voxelSize.z); + float3 p; + p.x = -1.0f + (gridPos.x * voxelSize.x); + p.y = -1.0f + (gridPos.y * voxelSize.y); + p.z = -1.0f + (gridPos.z * voxelSize.z); - float field[8]; - field[0] = fieldFunc(p); - field[1] = fieldFunc(p + make_float3(voxelSize.x, 0, 0)); - field[2] = fieldFunc(p + make_float3(voxelSize.x, voxelSize.y, 0)); - field[3] = fieldFunc(p + make_float3(0, voxelSize.y, 0)); - field[4] = fieldFunc(p + make_float3(0, 0, voxelSize.z)); - field[5] = fieldFunc(p + make_float3(voxelSize.x, 0, voxelSize.z)); - field[6] = fieldFunc(p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z)); - field[7] = fieldFunc(p + make_float3(0, voxelSize.y, voxelSize.z)); + float field[8]; + field[0] = fieldFunc(p); + field[1] = fieldFunc(p + 
make_float3(voxelSize.x, 0, 0)); + field[2] = fieldFunc(p + make_float3(voxelSize.x, voxelSize.y, 0)); + field[3] = fieldFunc(p + make_float3(0, voxelSize.y, 0)); + field[4] = fieldFunc(p + make_float3(0, 0, voxelSize.z)); + field[5] = fieldFunc(p + make_float3(voxelSize.x, 0, voxelSize.z)); + field[6] = fieldFunc(p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z)); + field[7] = fieldFunc(p + make_float3(0, voxelSize.y, voxelSize.z)); #endif - // calculate flag indicating if each vertex is inside or outside isosurface - uint cubeindex; - cubeindex = uint(field[0] < isoValue); - cubeindex += uint(field[1] < isoValue) * 2; - cubeindex += uint(field[2] < isoValue) * 4; - cubeindex += uint(field[3] < isoValue) * 8; - cubeindex += uint(field[4] < isoValue) * 16; - cubeindex += uint(field[5] < isoValue) * 32; - cubeindex += uint(field[6] < isoValue) * 64; - cubeindex += uint(field[7] < isoValue) * 128; + // calculate flag indicating if each vertex is inside or outside isosurface + uint cubeindex; + cubeindex = uint(field[0] < isoValue); + cubeindex += uint(field[1] < isoValue) * 2; + cubeindex += uint(field[2] < isoValue) * 4; + cubeindex += uint(field[3] < isoValue) * 8; + cubeindex += uint(field[4] < isoValue) * 16; + cubeindex += uint(field[5] < isoValue) * 32; + cubeindex += uint(field[6] < isoValue) * 64; + cubeindex += uint(field[7] < isoValue) * 128; - // read number of vertices from texture - uint numVerts = tex1Dfetch<uint>(numVertsTex, cubeindex); + // read number of vertices from texture + uint numVerts = tex1Dfetch<uint>(numVertsTex, cubeindex); - if (i < numVoxels) { - voxelVerts[i] = numVerts; - voxelOccupied[i] = (numVerts > 0); - } + if (i < numVoxels) { + voxelVerts[i] = numVerts; + voxelOccupied[i] = (numVerts > 0); + } } -extern "C" void launch_classifyVoxel(dim3 grid, dim3 threads, uint *voxelVerts, - uint *voxelOccupied, uchar *volume, - uint3 gridSize, uint3 gridSizeShift, - uint3 gridSizeMask, uint numVoxels, - float3 voxelSize, float isoValue) { - // calculate number of vertices need per voxel - classifyVoxel<<<grid, threads>>>(voxelVerts, voxelOccupied, volume, gridSize, - gridSizeShift, gridSizeMask, numVoxels, - voxelSize, isoValue, numVertsTex, volumeTex); - getLastCudaError("classifyVoxel failed"); +extern "C" void launch_classifyVoxel(dim3 grid, + dim3 threads, + uint *voxelVerts, + uint *voxelOccupied, + uchar *volume, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + uint numVoxels, + float3 voxelSize, + float isoValue) +{ + // calculate number of vertices needed per voxel + classifyVoxel<<<grid, threads>>>(voxelVerts, + voxelOccupied, + volume, + gridSize, + gridSizeShift, + gridSizeMask, + numVoxels, + voxelSize, + isoValue, + numVertsTex, + volumeTex); + getLastCudaError("classifyVoxel failed"); } // compact voxel array -__global__ void compactVoxels(uint *compactedVoxelArray, uint *voxelOccupied, - uint *voxelOccupiedScan, uint numVoxels) { - uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; - uint i = __mul24(blockId, blockDim.x) + threadIdx.x; +__global__ void compactVoxels(uint *compactedVoxelArray, uint *voxelOccupied, uint *voxelOccupiedScan, uint numVoxels) +{ + uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; + uint i = __mul24(blockId, blockDim.x) + threadIdx.x; - if (voxelOccupied[i] && (i < numVoxels)) { - compactedVoxelArray[voxelOccupiedScan[i]] = i; - } + if (voxelOccupied[i] && (i < numVoxels)) { + compactedVoxelArray[voxelOccupiedScan[i]] = i; + }
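compactVoxels() above is the scatter half of a standard stream compaction: an exclusive scan of the occupancy flags (performed by ThrustScanWrapper() at the end of this file) gives every occupied voxel its destination slot. Both kernels in this hunk also rebuild a flat index with __mul24(blockIdx.y, gridDim.x) + blockIdx.x, the mirror image of the grid2.x/grid2.y folding loop in computeIsosurface() earlier in this patch, which halves grid.x and doubles grid.y until grid.x fits under the 65,535 launch limit. A worked example plus the host-side tail read-back (array values invented for illustration; the d_* names follow the sample's host code, which is not part of this hunk):

// voxelOccupied     = {1, 0, 1, 1, 0, 1}
// voxelOccupiedScan = {0, 1, 1, 2, 3, 3}   (exclusive prefix sum)
// compactVoxels scatters voxel ids {0, 2, 3, 5} into slots {0, 1, 2, 3}.
// The number of active voxels is the last flag plus the last scan value:
uint lastElement = 0, lastScanElement = 0;
checkCudaErrors(cudaMemcpy(&lastElement, d_voxelOccupied + numVoxels - 1, sizeof(uint), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(&lastScanElement, d_voxelOccupiedScan + numVoxels - 1, sizeof(uint), cudaMemcpyDeviceToHost));
uint activeVoxels = lastElement + lastScanElement; // = 1 + 3 = 4 in the example

} -extern "C" void launch_compactVoxels(dim3 grid, dim3 threads, +extern "C" void 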
launch_compactVoxels(dim3 grid, + dim3 threads, uint *compactedVoxelArray, uint *voxelOccupied, - uint *voxelOccupiedScan, uint numVoxels) { - compactVoxels<<<grid, threads>>>(compactedVoxelArray, voxelOccupied, - voxelOccupiedScan, numVoxels); - getLastCudaError("compactVoxels failed"); + uint *voxelOccupiedScan, + uint numVoxels) +{ + compactVoxels<<<grid, threads>>>(compactedVoxelArray, voxelOccupied, voxelOccupiedScan, numVoxels); + getLastCudaError("compactVoxels failed"); } // compute interpolated vertex along an edge -__device__ float3 vertexInterp(float isolevel, float3 p0, float3 p1, float f0, - float f1) { - float t = (isolevel - f0) / (f1 - f0); - return lerp(p0, p1, t); +__device__ float3 vertexInterp(float isolevel, float3 p0, float3 p1, float f0, float f1) +{ + float t = (isolevel - f0) / (f1 - f0); + return lerp(p0, p1, t); } // compute interpolated vertex position and normal along an edge -__device__ void vertexInterp2(float isolevel, float3 p0, float3 p1, float4 f0, - float4 f1, float3 &p, float3 &n) { - float t = (isolevel - f0.w) / (f1.w - f0.w); - p = lerp(p0, p1, t); - n.x = lerp(f0.x, f1.x, t); - n.y = lerp(f0.y, f1.y, t); - n.z = lerp(f0.z, f1.z, t); - // n = normalize(n); +__device__ void vertexInterp2(float isolevel, float3 p0, float3 p1, float4 f0, float4 f1, float3 &p, float3 &n) +{ + float t = (isolevel - f0.w) / (f1.w - f0.w); + p = lerp(p0, p1, t); + n.x = lerp(f0.x, f1.x, t); + n.y = lerp(f0.y, f1.y, t); + n.z = lerp(f0.z, f1.z, t); + // n = normalize(n); } // generate triangles for each voxel using marching cubes // interpolates normals from field function -__global__ void generateTriangles( - float4 *pos, float4 *norm, uint *compactedVoxelArray, uint *numVertsScanned, - uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, float3 voxelSize, - float isoValue, uint activeVoxels, uint maxVerts, - cudaTextureObject_t triTex, cudaTextureObject_t numVertsTex) { - uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; - uint i = __mul24(blockId, blockDim.x) + threadIdx.x; +__global__ void generateTriangles(float4 *pos, + float4 *norm, + uint *compactedVoxelArray, + uint *numVertsScanned, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + float3 voxelSize, + float isoValue, + uint activeVoxels, + uint maxVerts, + cudaTextureObject_t triTex, + cudaTextureObject_t numVertsTex) +{ + uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; + uint i = __mul24(blockId, blockDim.x) + threadIdx.x; - if (i > activeVoxels - 1) { - // can't return here because of syncthreads() - i = activeVoxels - 1; - } + if (i > activeVoxels - 1) { + // can't return here because of syncthreads() + i = activeVoxels - 1; + } #if SKIP_EMPTY_VOXELS - uint voxel = compactedVoxelArray[i]; + uint voxel = compactedVoxelArray[i]; #else - uint voxel = i; + uint voxel = i; #endif - // compute position in 3d grid - uint3 gridPos = calcGridPos(voxel, gridSizeShift, gridSizeMask); + // compute position in 3d grid + uint3 gridPos = calcGridPos(voxel, gridSizeShift, gridSizeMask); - float3 p; - p.x = -1.0f + (gridPos.x * voxelSize.x); - p.y = -1.0f + (gridPos.y * voxelSize.y); - p.z = -1.0f + (gridPos.z * voxelSize.z); + float3 p; + p.x = -1.0f + (gridPos.x * voxelSize.x); + p.y = -1.0f + (gridPos.y * voxelSize.y); + p.z = -1.0f + (gridPos.z * voxelSize.z); - // calculate cell vertex positions - float3 v[8]; - v[0] = p; - v[1] = p + make_float3(voxelSize.x, 0, 0); - v[2] = p + make_float3(voxelSize.x, voxelSize.y, 0); - v[3] = p + make_float3(0, voxelSize.y, 0); - v[4] = p + make_float3(0, 0, voxelSize.z);
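vertexInterp() above places the surface crossing by inverse interpolation of the field values at a cube edge's endpoints. A worked example with invented numbers:

// isolevel = 0.5, f0 = 0.2 at p0, f1 = 0.8 at p1:
//   t = (0.5 - 0.2) / (0.8 - 0.2) = 0.5   -> vertex lands halfway along the edge
float3 p = lerp(p0, p1, 0.5f); // lerp() comes from helper_math.h
// vertexInterp2() lerps the forward-difference gradient from fieldFunc4() with
// the same t; normalization is deferred (see the commented-out normalize(n)).

- 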
v[5] = p + make_float3(voxelSize.x, 0, voxelSize.z); - v[6] = p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z); - v[7] = p + make_float3(0, voxelSize.y, voxelSize.z); + // calculate cell vertex positions + float3 v[8]; + v[0] = p; + v[1] = p + make_float3(voxelSize.x, 0, 0); + v[2] = p + make_float3(voxelSize.x, voxelSize.y, 0); + v[3] = p + make_float3(0, voxelSize.y, 0); + v[4] = p + make_float3(0, 0, voxelSize.z); + v[5] = p + make_float3(voxelSize.x, 0, voxelSize.z); + v[6] = p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z); + v[7] = p + make_float3(0, voxelSize.y, voxelSize.z); - // evaluate field values - float4 field[8]; - field[0] = fieldFunc4(v[0]); - field[1] = fieldFunc4(v[1]); - field[2] = fieldFunc4(v[2]); - field[3] = fieldFunc4(v[3]); - field[4] = fieldFunc4(v[4]); - field[5] = fieldFunc4(v[5]); - field[6] = fieldFunc4(v[6]); - field[7] = fieldFunc4(v[7]); + // evaluate field values + float4 field[8]; + field[0] = fieldFunc4(v[0]); + field[1] = fieldFunc4(v[1]); + field[2] = fieldFunc4(v[2]); + field[3] = fieldFunc4(v[3]); + field[4] = fieldFunc4(v[4]); + field[5] = fieldFunc4(v[5]); + field[6] = fieldFunc4(v[6]); + field[7] = fieldFunc4(v[7]); - // recalculate flag - // (this is faster than storing it in global memory) - uint cubeindex; - cubeindex = uint(field[0].w < isoValue); - cubeindex += uint(field[1].w < isoValue) * 2; - cubeindex += uint(field[2].w < isoValue) * 4; - cubeindex += uint(field[3].w < isoValue) * 8; - cubeindex += uint(field[4].w < isoValue) * 16; - cubeindex += uint(field[5].w < isoValue) * 32; - cubeindex += uint(field[6].w < isoValue) * 64; - cubeindex += uint(field[7].w < isoValue) * 128; + // recalculate flag + // (this is faster than storing it in global memory) + uint cubeindex; + cubeindex = uint(field[0].w < isoValue); + cubeindex += uint(field[1].w < isoValue) * 2; + cubeindex += uint(field[2].w < isoValue) * 4; + cubeindex += uint(field[3].w < isoValue) * 8; + cubeindex += uint(field[4].w < isoValue) * 16; + cubeindex += uint(field[5].w < isoValue) * 32; + cubeindex += uint(field[6].w < isoValue) * 64; + cubeindex += uint(field[7].w < isoValue) * 128; -// find the vertices where the surface intersects the cube + // find the vertices where the surface intersects the cube #if USE_SHARED - // use partioned shared memory to avoid using local memory - __shared__ float3 vertlist[12 * NTHREADS]; - __shared__ float3 normlist[12 * NTHREADS]; + // use partitioned shared memory to avoid using local memory + __shared__ float3 vertlist[12 * NTHREADS]; + __shared__ float3 normlist[12 * NTHREADS]; - vertexInterp2(isoValue, v[0], v[1], field[0], field[1], vertlist[threadIdx.x], - normlist[threadIdx.x]); - vertexInterp2(isoValue, v[1], v[2], field[1], field[2], - vertlist[threadIdx.x + NTHREADS], - normlist[threadIdx.x + NTHREADS]); - vertexInterp2(isoValue, v[2], v[3], field[2], field[3], - vertlist[threadIdx.x + (NTHREADS * 2)], - normlist[threadIdx.x + (NTHREADS * 2)]); - vertexInterp2(isoValue, v[3], v[0], field[3], field[0], - vertlist[threadIdx.x + (NTHREADS * 3)], - normlist[threadIdx.x + (NTHREADS * 3)]); - vertexInterp2(isoValue, v[4], v[5], field[4], field[5], - vertlist[threadIdx.x + (NTHREADS * 4)], - normlist[threadIdx.x + (NTHREADS * 4)]); - vertexInterp2(isoValue, v[5], v[6], field[5], field[6], - vertlist[threadIdx.x + (NTHREADS * 5)], - normlist[threadIdx.x + (NTHREADS * 5)]); - vertexInterp2(isoValue, v[6], v[7], field[6], field[7], - vertlist[threadIdx.x + (NTHREADS * 6)], - normlist[threadIdx.x + (NTHREADS * 6)]);
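The 12 * NTHREADS shared arrays above are laid out edge-major: the entry for edge e of thread threadIdx.x lives at vertlist[e * NTHREADS + threadIdx.x]. A private float3 list[12] indexed by a runtime edge number would be forced into local memory, which is what the "avoid using local memory" comment refers to. A sketch of the access pattern (hypothetical helper, not part of the patch):

__device__ float3 edgeVertex(const float3 *vertlist, uint e)
{
    return vertlist[e * NTHREADS + threadIdx.x]; // column threadIdx.x of a [12][NTHREADS] matrix
}

- 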
vertexInterp2(isoValue, v[7], v[4], field[7], field[4], - vertlist[threadIdx.x + (NTHREADS * 7)], - normlist[threadIdx.x + (NTHREADS * 7)]); - vertexInterp2(isoValue, v[0], v[4], field[0], field[4], - vertlist[threadIdx.x + (NTHREADS * 8)], - normlist[threadIdx.x + (NTHREADS * 8)]); - vertexInterp2(isoValue, v[1], v[5], field[1], field[5], - vertlist[threadIdx.x + (NTHREADS * 9)], - normlist[threadIdx.x + (NTHREADS * 9)]); - vertexInterp2(isoValue, v[2], v[6], field[2], field[6], - vertlist[threadIdx.x + (NTHREADS * 10)], - normlist[threadIdx.x + (NTHREADS * 10)]); - vertexInterp2(isoValue, v[3], v[7], field[3], field[7], - vertlist[threadIdx.x + (NTHREADS * 11)], - normlist[threadIdx.x + (NTHREADS * 11)]); - __syncthreads(); + vertexInterp2(isoValue, v[0], v[1], field[0], field[1], vertlist[threadIdx.x], normlist[threadIdx.x]); + vertexInterp2( + isoValue, v[1], v[2], field[1], field[2], vertlist[threadIdx.x + NTHREADS], normlist[threadIdx.x + NTHREADS]); + vertexInterp2(isoValue, + v[2], + v[3], + field[2], + field[3], + vertlist[threadIdx.x + (NTHREADS * 2)], + normlist[threadIdx.x + (NTHREADS * 2)]); + vertexInterp2(isoValue, + v[3], + v[0], + field[3], + field[0], + vertlist[threadIdx.x + (NTHREADS * 3)], + normlist[threadIdx.x + (NTHREADS * 3)]); + vertexInterp2(isoValue, + v[4], + v[5], + field[4], + field[5], + vertlist[threadIdx.x + (NTHREADS * 4)], + normlist[threadIdx.x + (NTHREADS * 4)]); + vertexInterp2(isoValue, + v[5], + v[6], + field[5], + field[6], + vertlist[threadIdx.x + (NTHREADS * 5)], + normlist[threadIdx.x + (NTHREADS * 5)]); + vertexInterp2(isoValue, + v[6], + v[7], + field[6], + field[7], + vertlist[threadIdx.x + (NTHREADS * 6)], + normlist[threadIdx.x + (NTHREADS * 6)]); + vertexInterp2(isoValue, + v[7], + v[4], + field[7], + field[4], + vertlist[threadIdx.x + (NTHREADS * 7)], + normlist[threadIdx.x + (NTHREADS * 7)]); + vertexInterp2(isoValue, + v[0], + v[4], + field[0], + field[4], + vertlist[threadIdx.x + (NTHREADS * 8)], + normlist[threadIdx.x + (NTHREADS * 8)]); + vertexInterp2(isoValue, + v[1], + v[5], + field[1], + field[5], + vertlist[threadIdx.x + (NTHREADS * 9)], + normlist[threadIdx.x + (NTHREADS * 9)]); + vertexInterp2(isoValue, + v[2], + v[6], + field[2], + field[6], + vertlist[threadIdx.x + (NTHREADS * 10)], + normlist[threadIdx.x + (NTHREADS * 10)]); + vertexInterp2(isoValue, + v[3], + v[7], + field[3], + field[7], + vertlist[threadIdx.x + (NTHREADS * 11)], + normlist[threadIdx.x + (NTHREADS * 11)]); + __syncthreads(); #else - float3 vertlist[12]; - float3 normlist[12]; + float3 vertlist[12]; + float3 normlist[12]; - vertexInterp2(isoValue, v[0], v[1], field[0], field[1], vertlist[0], - normlist[0]); - vertexInterp2(isoValue, v[1], v[2], field[1], field[2], vertlist[1], - normlist[1]); - vertexInterp2(isoValue, v[2], v[3], field[2], field[3], vertlist[2], - normlist[2]); - vertexInterp2(isoValue, v[3], v[0], field[3], field[0], vertlist[3], - normlist[3]); + vertexInterp2(isoValue, v[0], v[1], field[0], field[1], vertlist[0], normlist[0]); + vertexInterp2(isoValue, v[1], v[2], field[1], field[2], vertlist[1], normlist[1]); + vertexInterp2(isoValue, v[2], v[3], field[2], field[3], vertlist[2], normlist[2]); + vertexInterp2(isoValue, v[3], v[0], field[3], field[0], vertlist[3], normlist[3]); - vertexInterp2(isoValue, v[4], v[5], field[4], field[5], vertlist[4], - normlist[4]); - vertexInterp2(isoValue, v[5], v[6], field[5], field[6], vertlist[5], - normlist[5]); - vertexInterp2(isoValue, v[6], v[7], field[6], field[7], vertlist[6], - normlist[6]); - 
vertexInterp2(isoValue, v[7], v[4], field[7], field[4], vertlist[7], - normlist[7]); + vertexInterp2(isoValue, v[4], v[5], field[4], field[5], vertlist[4], normlist[4]); + vertexInterp2(isoValue, v[5], v[6], field[5], field[6], vertlist[5], normlist[5]); + vertexInterp2(isoValue, v[6], v[7], field[6], field[7], vertlist[6], normlist[6]); + vertexInterp2(isoValue, v[7], v[4], field[7], field[4], vertlist[7], normlist[7]); - vertexInterp2(isoValue, v[0], v[4], field[0], field[4], vertlist[8], - normlist[8]); - vertexInterp2(isoValue, v[1], v[5], field[1], field[5], vertlist[9], - normlist[9]); - vertexInterp2(isoValue, v[2], v[6], field[2], field[6], vertlist[10], - normlist[10]); - vertexInterp2(isoValue, v[3], v[7], field[3], field[7], vertlist[11], - normlist[11]); + vertexInterp2(isoValue, v[0], v[4], field[0], field[4], vertlist[8], normlist[8]); + vertexInterp2(isoValue, v[1], v[5], field[1], field[5], vertlist[9], normlist[9]); + vertexInterp2(isoValue, v[2], v[6], field[2], field[6], vertlist[10], normlist[10]); + vertexInterp2(isoValue, v[3], v[7], field[3], field[7], vertlist[11], normlist[11]); #endif - // output triangle vertices - uint numVerts = tex1Dfetch<uint>(numVertsTex, cubeindex); + // output triangle vertices + uint numVerts = tex1Dfetch<uint>(numVertsTex, cubeindex); - for (int i = 0; i < numVerts; i++) { - uint edge = tex1Dfetch<uint>(triTex, cubeindex * 16 + i); + for (int i = 0; i < numVerts; i++) { + uint edge = tex1Dfetch<uint>(triTex, cubeindex * 16 + i); - uint index = numVertsScanned[voxel] + i; + uint index = numVertsScanned[voxel] + i; - if (index < maxVerts) { + if (index < maxVerts) { #if USE_SHARED - pos[index] = make_float4(vertlist[(edge * NTHREADS) + threadIdx.x], 1.0f); - norm[index] = - make_float4(normlist[(edge * NTHREADS) + threadIdx.x], 0.0f); + pos[index] = make_float4(vertlist[(edge * NTHREADS) + threadIdx.x], 1.0f); + norm[index] = make_float4(normlist[(edge * NTHREADS) + threadIdx.x], 0.0f); #else - pos[index] = make_float4(vertlist[edge], 1.0f); - norm[index] = make_float4(normlist[edge], 0.0f); + pos[index] = make_float4(vertlist[edge], 1.0f); + norm[index] = make_float4(normlist[edge], 0.0f); #endif + } } - } } -extern "C" void launch_generateTriangles( - dim3 grid, dim3 threads, float4 *pos, float4 *norm, - uint *compactedVoxelArray, uint *numVertsScanned, uint3 gridSize, - uint3 gridSizeShift, uint3 gridSizeMask, float3 voxelSize, float isoValue, - uint activeVoxels, uint maxVerts) { - generateTriangles<<<grid, threads>>>( - pos, norm, compactedVoxelArray, numVertsScanned, gridSize, gridSizeShift, - gridSizeMask, voxelSize, isoValue, activeVoxels, maxVerts, triTex, - numVertsTex); - getLastCudaError("generateTriangles failed"); +extern "C" void launch_generateTriangles(dim3 grid, + dim3 threads, + float4 *pos, + float4 *norm, + uint *compactedVoxelArray, + uint *numVertsScanned, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + float3 voxelSize, + float isoValue, + uint activeVoxels, + uint maxVerts) +{ + generateTriangles<<<grid, threads>>>(pos, + norm, + compactedVoxelArray, + numVertsScanned, + gridSize, + gridSizeShift, + gridSizeMask, + voxelSize, + isoValue, + activeVoxels, + maxVerts, + triTex, + numVertsTex); + getLastCudaError("generateTriangles failed"); } // calculate triangle normal 
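calcNormal() below returns the raw cross product of two triangle edges; generateTriangles2() assigns the same vector to all three vertices of a triangle, which is what yields flat (faceted) shading, and normalization is deferred to the vertex shader as the comment notes. A quick numeric check (values invented):

float3 a = make_float3(0.0f, 0.0f, 0.0f);
float3 b = make_float3(1.0f, 0.0f, 0.0f);
float3 c = make_float3(0.0f, 1.0f, 0.0f);
float3 n = cross(b - a, c - a); // = (0, 0, 1); cross() and operator- come from helper_math.h

-__device__ float3 calcNormal(float3 *v0, 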
float3 *v1, float3 *v2) +{ + float3 edge0 = *v1 - *v0; + float3 edge1 = *v2 - *v0; + // note - it's faster to perform normalization in vertex shader rather than + // here + return cross(edge0, edge1); } // version that calculates flat surface normal for each triangle -__global__ void generateTriangles2( - float4 *pos, float4 *norm, uint *compactedVoxelArray, uint *numVertsScanned, - uchar *volume, uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, - float3 voxelSize, float isoValue, uint activeVoxels, uint maxVerts, - cudaTextureObject_t triTex, cudaTextureObject_t numVertsTex, - cudaTextureObject_t volumeTex) { - uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; - uint i = __mul24(blockId, blockDim.x) + threadIdx.x; +__global__ void generateTriangles2(float4 *pos, + float4 *norm, + uint *compactedVoxelArray, + uint *numVertsScanned, + uchar *volume, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + float3 voxelSize, + float isoValue, + uint activeVoxels, + uint maxVerts, + cudaTextureObject_t triTex, + cudaTextureObject_t numVertsTex, + cudaTextureObject_t volumeTex) +{ + uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x; + uint i = __mul24(blockId, blockDim.x) + threadIdx.x; - if (i > activeVoxels - 1) { - i = activeVoxels - 1; - } + if (i > activeVoxels - 1) { + i = activeVoxels - 1; + } #if SKIP_EMPTY_VOXELS - uint voxel = compactedVoxelArray[i]; + uint voxel = compactedVoxelArray[i]; #else - uint voxel = i; + uint voxel = i; #endif - // compute position in 3d grid - uint3 gridPos = calcGridPos(voxel, gridSizeShift, gridSizeMask); + // compute position in 3d grid + uint3 gridPos = calcGridPos(voxel, gridSizeShift, gridSizeMask); - float3 p; - p.x = -1.0f + (gridPos.x * voxelSize.x); - p.y = -1.0f + (gridPos.y * voxelSize.y); - p.z = -1.0f + (gridPos.z * voxelSize.z); + float3 p; + p.x = -1.0f + (gridPos.x * voxelSize.x); + p.y = -1.0f + (gridPos.y * voxelSize.y); + p.z = -1.0f + (gridPos.z * voxelSize.z); - // calculate cell vertex positions - float3 v[8]; - v[0] = p; - v[1] = p + make_float3(voxelSize.x, 0, 0); - v[2] = p + make_float3(voxelSize.x, voxelSize.y, 0); - v[3] = p + make_float3(0, voxelSize.y, 0); - v[4] = p + make_float3(0, 0, voxelSize.z); - v[5] = p + make_float3(voxelSize.x, 0, voxelSize.z); - v[6] = p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z); - v[7] = p + make_float3(0, voxelSize.y, voxelSize.z); + // calculate cell vertex positions + float3 v[8]; + v[0] = p; + v[1] = p + make_float3(voxelSize.x, 0, 0); + v[2] = p + make_float3(voxelSize.x, voxelSize.y, 0); + v[3] = p + make_float3(0, voxelSize.y, 0); + v[4] = p + make_float3(0, 0, voxelSize.z); + v[5] = p + make_float3(voxelSize.x, 0, voxelSize.z); + v[6] = p + make_float3(voxelSize.x, voxelSize.y, voxelSize.z); + v[7] = p + make_float3(0, voxelSize.y, voxelSize.z); #if SAMPLE_VOLUME - float field[8]; - field[0] = sampleVolume(volumeTex, volume, gridPos, gridSize); - field[1] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 0), gridSize); - field[2] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 0), gridSize); - field[3] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 0), gridSize); - field[4] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 0, 1), gridSize); - field[5] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 1), gridSize); - field[6] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 1), gridSize); - field[7] = - sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 1), 
gridSize); + float field[8]; + field[0] = sampleVolume(volumeTex, volume, gridPos, gridSize); + field[1] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 0), gridSize); + field[2] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 0), gridSize); + field[3] = sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 0), gridSize); + field[4] = sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 0, 1), gridSize); + field[5] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 0, 1), gridSize); + field[6] = sampleVolume(volumeTex, volume, gridPos + make_uint3(1, 1, 1), gridSize); + field[7] = sampleVolume(volumeTex, volume, gridPos + make_uint3(0, 1, 1), gridSize); #else - // evaluate field values - float field[8]; - field[0] = fieldFunc(v[0]); - field[1] = fieldFunc(v[1]); - field[2] = fieldFunc(v[2]); - field[3] = fieldFunc(v[3]); - field[4] = fieldFunc(v[4]); - field[5] = fieldFunc(v[5]); - field[6] = fieldFunc(v[6]); - field[7] = fieldFunc(v[7]); + // evaluate field values + float field[8]; + field[0] = fieldFunc(v[0]); + field[1] = fieldFunc(v[1]); + field[2] = fieldFunc(v[2]); + field[3] = fieldFunc(v[3]); + field[4] = fieldFunc(v[4]); + field[5] = fieldFunc(v[5]); + field[6] = fieldFunc(v[6]); + field[7] = fieldFunc(v[7]); #endif - // recalculate flag - uint cubeindex; - cubeindex = uint(field[0] < isoValue); - cubeindex += uint(field[1] < isoValue) * 2; - cubeindex += uint(field[2] < isoValue) * 4; - cubeindex += uint(field[3] < isoValue) * 8; - cubeindex += uint(field[4] < isoValue) * 16; - cubeindex += uint(field[5] < isoValue) * 32; - cubeindex += uint(field[6] < isoValue) * 64; - cubeindex += uint(field[7] < isoValue) * 128; + // recalculate flag + uint cubeindex; + cubeindex = uint(field[0] < isoValue); + cubeindex += uint(field[1] < isoValue) * 2; + cubeindex += uint(field[2] < isoValue) * 4; + cubeindex += uint(field[3] < isoValue) * 8; + cubeindex += uint(field[4] < isoValue) * 16; + cubeindex += uint(field[5] < isoValue) * 32; + cubeindex += uint(field[6] < isoValue) * 64; + cubeindex += uint(field[7] < isoValue) * 128; -// find the vertices where the surface intersects the cube + // find the vertices where the surface intersects the cube #if USE_SHARED - // use shared memory to avoid using local - __shared__ float3 vertlist[12 * NTHREADS]; + // use shared memory to avoid using local + __shared__ float3 vertlist[12 * NTHREADS]; - vertlist[threadIdx.x] = - vertexInterp(isoValue, v[0], v[1], field[0], field[1]); - vertlist[NTHREADS + threadIdx.x] = - vertexInterp(isoValue, v[1], v[2], field[1], field[2]); - vertlist[(NTHREADS * 2) + threadIdx.x] = - vertexInterp(isoValue, v[2], v[3], field[2], field[3]); - vertlist[(NTHREADS * 3) + threadIdx.x] = - vertexInterp(isoValue, v[3], v[0], field[3], field[0]); - vertlist[(NTHREADS * 4) + threadIdx.x] = - vertexInterp(isoValue, v[4], v[5], field[4], field[5]); - vertlist[(NTHREADS * 5) + threadIdx.x] = - vertexInterp(isoValue, v[5], v[6], field[5], field[6]); - vertlist[(NTHREADS * 6) + threadIdx.x] = - vertexInterp(isoValue, v[6], v[7], field[6], field[7]); - vertlist[(NTHREADS * 7) + threadIdx.x] = - vertexInterp(isoValue, v[7], v[4], field[7], field[4]); - vertlist[(NTHREADS * 8) + threadIdx.x] = - vertexInterp(isoValue, v[0], v[4], field[0], field[4]); - vertlist[(NTHREADS * 9) + threadIdx.x] = - vertexInterp(isoValue, v[1], v[5], field[1], field[5]); - vertlist[(NTHREADS * 10) + threadIdx.x] = - vertexInterp(isoValue, v[2], v[6], field[2], field[6]); - vertlist[(NTHREADS * 11) + threadIdx.x] = - 
vertexInterp(isoValue, v[3], v[7], field[3], field[7]); - __syncthreads(); + vertlist[threadIdx.x] = vertexInterp(isoValue, v[0], v[1], field[0], field[1]); + vertlist[NTHREADS + threadIdx.x] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]); + vertlist[(NTHREADS * 2) + threadIdx.x] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]); + vertlist[(NTHREADS * 3) + threadIdx.x] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]); + vertlist[(NTHREADS * 4) + threadIdx.x] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]); + vertlist[(NTHREADS * 5) + threadIdx.x] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]); + vertlist[(NTHREADS * 6) + threadIdx.x] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]); + vertlist[(NTHREADS * 7) + threadIdx.x] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]); + vertlist[(NTHREADS * 8) + threadIdx.x] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]); + vertlist[(NTHREADS * 9) + threadIdx.x] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]); + vertlist[(NTHREADS * 10) + threadIdx.x] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]); + vertlist[(NTHREADS * 11) + threadIdx.x] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]); + __syncthreads(); #else - float3 vertlist[12]; + float3 vertlist[12]; - vertlist[0] = vertexInterp(isoValue, v[0], v[1], field[0], field[1]); - vertlist[1] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]); - vertlist[2] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]); - vertlist[3] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]); + vertlist[0] = vertexInterp(isoValue, v[0], v[1], field[0], field[1]); + vertlist[1] = vertexInterp(isoValue, v[1], v[2], field[1], field[2]); + vertlist[2] = vertexInterp(isoValue, v[2], v[3], field[2], field[3]); + vertlist[3] = vertexInterp(isoValue, v[3], v[0], field[3], field[0]); - vertlist[4] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]); - vertlist[5] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]); - vertlist[6] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]); - vertlist[7] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]); + vertlist[4] = vertexInterp(isoValue, v[4], v[5], field[4], field[5]); + vertlist[5] = vertexInterp(isoValue, v[5], v[6], field[5], field[6]); + vertlist[6] = vertexInterp(isoValue, v[6], v[7], field[6], field[7]); + vertlist[7] = vertexInterp(isoValue, v[7], v[4], field[7], field[4]); - vertlist[8] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]); - vertlist[9] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]); - vertlist[10] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]); - vertlist[11] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]); + vertlist[8] = vertexInterp(isoValue, v[0], v[4], field[0], field[4]); + vertlist[9] = vertexInterp(isoValue, v[1], v[5], field[1], field[5]); + vertlist[10] = vertexInterp(isoValue, v[2], v[6], field[2], field[6]); + vertlist[11] = vertexInterp(isoValue, v[3], v[7], field[3], field[7]); #endif - // output triangle vertices - uint numVerts = tex1Dfetch<uint>(numVertsTex, cubeindex); + // output triangle vertices + uint numVerts = tex1Dfetch<uint>(numVertsTex, cubeindex); - for (int i = 0; i < numVerts; i += 3) { - uint index = numVertsScanned[voxel] + i; + for (int i = 0; i < numVerts; i += 3) { + uint index = numVertsScanned[voxel] + i; - float3 *v[3]; - uint edge; - edge = tex1Dfetch<uint>(triTex, (cubeindex * 16) + i); + float3 *v[3]; + uint edge; + edge = tex1Dfetch<uint>(triTex, (cubeindex * 16) + i); #if 
USE_SHARED - v[0] = &vertlist[(edge * NTHREADS) + threadIdx.x]; + v[0] = &vertlist[(edge * NTHREADS) + threadIdx.x]; #else - v[0] = &vertlist[edge]; + v[0] = &vertlist[edge]; #endif - edge = tex1Dfetch<uint>(triTex, (cubeindex * 16) + i + 1); + edge = tex1Dfetch<uint>(triTex, (cubeindex * 16) + i + 1); #if USE_SHARED - v[1] = &vertlist[(edge * NTHREADS) + threadIdx.x]; + v[1] = &vertlist[(edge * NTHREADS) + threadIdx.x]; #else - v[1] = &vertlist[edge]; + v[1] = &vertlist[edge]; #endif - edge = tex1Dfetch<uint>(triTex, (cubeindex * 16) + i + 2); + edge = tex1Dfetch<uint>(triTex, (cubeindex * 16) + i + 2); #if USE_SHARED - v[2] = &vertlist[(edge * NTHREADS) + threadIdx.x]; + v[2] = &vertlist[(edge * NTHREADS) + threadIdx.x]; #else - v[2] = &vertlist[edge]; + v[2] = &vertlist[edge]; #endif - // calculate triangle surface normal - float3 n = calcNormal(v[0], v[1], v[2]); + // calculate triangle surface normal + float3 n = calcNormal(v[0], v[1], v[2]); - if (index < (maxVerts - 3)) { - pos[index] = make_float4(*v[0], 1.0f); - norm[index] = make_float4(n, 0.0f); + if (index < (maxVerts - 3)) { + pos[index] = make_float4(*v[0], 1.0f); + norm[index] = make_float4(n, 0.0f); - pos[index + 1] = make_float4(*v[1], 1.0f); - norm[index + 1] = make_float4(n, 0.0f); + pos[index + 1] = make_float4(*v[1], 1.0f); + norm[index + 1] = make_float4(n, 0.0f); - pos[index + 2] = make_float4(*v[2], 1.0f); - norm[index + 2] = make_float4(n, 0.0f); + pos[index + 2] = make_float4(*v[2], 1.0f); + norm[index + 2] = make_float4(n, 0.0f); + } } - } } -extern "C" void launch_generateTriangles2( - dim3 grid, dim3 threads, float4 *pos, float4 *norm, - uint *compactedVoxelArray, uint *numVertsScanned, uchar *volume, - uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, float3 voxelSize, - float isoValue, uint activeVoxels, uint maxVerts) { - generateTriangles2<<<grid, threads>>>( - pos, norm, compactedVoxelArray, numVertsScanned, volume, gridSize, - gridSizeShift, gridSizeMask, voxelSize, isoValue, activeVoxels, maxVerts, - triTex, numVertsTex, volumeTex); - getLastCudaError("generateTriangles2 failed"); +extern "C" void launch_generateTriangles2(dim3 grid, + dim3 threads, + float4 *pos, + float4 *norm, + uint *compactedVoxelArray, + uint *numVertsScanned, + uchar *volume, + uint3 gridSize, + uint3 gridSizeShift, + uint3 gridSizeMask, + float3 voxelSize, + float isoValue, + uint activeVoxels, + uint maxVerts) +{ + generateTriangles2<<<grid, threads>>>(pos, + norm, + compactedVoxelArray, + numVertsScanned, + volume, + gridSize, + gridSizeShift, + gridSizeMask, + voxelSize, + isoValue, + activeVoxels, + maxVerts, + triTex, + numVertsTex, + volumeTex); + getLastCudaError("generateTriangles2 failed"); } -extern "C" void ThrustScanWrapper(unsigned int *output, unsigned int *input, - unsigned int numElements) { - thrust::exclusive_scan(thrust::device_ptr<unsigned int>(input), - thrust::device_ptr<unsigned int>(input + numElements), - thrust::device_ptr<unsigned int>(output)); +extern "C" void ThrustScanWrapper(unsigned int *output, unsigned int *input, unsigned int numElements) +{ + thrust::exclusive_scan(thrust::device_ptr<unsigned int>(input), + thrust::device_ptr<unsigned int>(input + numElements), + thrust::device_ptr<unsigned int>(output)); } #endif 
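Before the tables that follow: numVertsTable[cubeindex] holds the vertex count the kernels above read through numVertsTex, triTable[cubeindex] lists which of the 12 cube edges carry those vertices (rows padded with the sentinel X = 255), and edgeTable maps the corner flags to a 12-bit edge mask (uploaded by allocateTextures() but not read by the kernels in this file). A sketch of the lookup loop the generateTriangles kernels perform (host-style, values and loop shape illustrative):

// corners 0 and 3 inside the isosurface -> cubeindex = 1 + 8 = 0x09
uint cubeindex = 0x09;
uint edges     = edgeTable[cubeindex];    // 12-bit mask of crossed edges (for reference)
for (int t = 0; t < 16 && triTable[cubeindex][t] != 255; t += 3) {
    uint e0 = triTable[cubeindex][t];     // three edge indices (0..11)
    uint e1 = triTable[cubeindex][t + 1]; // form one output triangle
    uint e2 = triTable[cubeindex][t + 2];
    // emit the interpolated vertices sitting on edges e0, e1, e2
}

diff --git a/Samples/5_Domain_Specific/marchingCubes/tables.h b/Samples/5_Domain_Specific/marchingCubes/tables.h index 3f128971..10d6b5ea 100644 --- a/Samples/5_Domain_Specific/marchingCubes/tables.h +++ b/Samples/5_Domain_Specific/marchingCubes/tables.h @@ -33,306 +33,166 @@ // edge table maps 8-bit flag representing which cube vertices are inside // the isosurface to 12-bit number indicating which edges are 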
intersected uint edgeTable[256] = { - 0x0, 0x109, 0x203, 0x30a, 0x406, 0x50f, 0x605, 0x70c, 0x80c, 0x905, 0xa0f, - 0xb06, 0xc0a, 0xd03, 0xe09, 0xf00, 0x190, 0x99, 0x393, 0x29a, 0x596, 0x49f, - 0x795, 0x69c, 0x99c, 0x895, 0xb9f, 0xa96, 0xd9a, 0xc93, 0xf99, 0xe90, 0x230, - 0x339, 0x33, 0x13a, 0x636, 0x73f, 0x435, 0x53c, 0xa3c, 0xb35, 0x83f, 0x936, - 0xe3a, 0xf33, 0xc39, 0xd30, 0x3a0, 0x2a9, 0x1a3, 0xaa, 0x7a6, 0x6af, 0x5a5, - 0x4ac, 0xbac, 0xaa5, 0x9af, 0x8a6, 0xfaa, 0xea3, 0xda9, 0xca0, 0x460, 0x569, - 0x663, 0x76a, 0x66, 0x16f, 0x265, 0x36c, 0xc6c, 0xd65, 0xe6f, 0xf66, 0x86a, - 0x963, 0xa69, 0xb60, 0x5f0, 0x4f9, 0x7f3, 0x6fa, 0x1f6, 0xff, 0x3f5, 0x2fc, - 0xdfc, 0xcf5, 0xfff, 0xef6, 0x9fa, 0x8f3, 0xbf9, 0xaf0, 0x650, 0x759, 0x453, - 0x55a, 0x256, 0x35f, 0x55, 0x15c, 0xe5c, 0xf55, 0xc5f, 0xd56, 0xa5a, 0xb53, - 0x859, 0x950, 0x7c0, 0x6c9, 0x5c3, 0x4ca, 0x3c6, 0x2cf, 0x1c5, 0xcc, 0xfcc, - 0xec5, 0xdcf, 0xcc6, 0xbca, 0xac3, 0x9c9, 0x8c0, 0x8c0, 0x9c9, 0xac3, 0xbca, - 0xcc6, 0xdcf, 0xec5, 0xfcc, 0xcc, 0x1c5, 0x2cf, 0x3c6, 0x4ca, 0x5c3, 0x6c9, - 0x7c0, 0x950, 0x859, 0xb53, 0xa5a, 0xd56, 0xc5f, 0xf55, 0xe5c, 0x15c, 0x55, - 0x35f, 0x256, 0x55a, 0x453, 0x759, 0x650, 0xaf0, 0xbf9, 0x8f3, 0x9fa, 0xef6, - 0xfff, 0xcf5, 0xdfc, 0x2fc, 0x3f5, 0xff, 0x1f6, 0x6fa, 0x7f3, 0x4f9, 0x5f0, - 0xb60, 0xa69, 0x963, 0x86a, 0xf66, 0xe6f, 0xd65, 0xc6c, 0x36c, 0x265, 0x16f, - 0x66, 0x76a, 0x663, 0x569, 0x460, 0xca0, 0xda9, 0xea3, 0xfaa, 0x8a6, 0x9af, - 0xaa5, 0xbac, 0x4ac, 0x5a5, 0x6af, 0x7a6, 0xaa, 0x1a3, 0x2a9, 0x3a0, 0xd30, - 0xc39, 0xf33, 0xe3a, 0x936, 0x83f, 0xb35, 0xa3c, 0x53c, 0x435, 0x73f, 0x636, - 0x13a, 0x33, 0x339, 0x230, 0xe90, 0xf99, 0xc93, 0xd9a, 0xa96, 0xb9f, 0x895, - 0x99c, 0x69c, 0x795, 0x49f, 0x596, 0x29a, 0x393, 0x99, 0x190, 0xf00, 0xe09, - 0xd03, 0xc0a, 0xb06, 0xa0f, 0x905, 0x80c, 0x70c, 0x605, 0x50f, 0x406, 0x30a, - 0x203, 0x109, 0x0}; + 0x0, 0x109, 0x203, 0x30a, 0x406, 0x50f, 0x605, 0x70c, 0x80c, 0x905, 0xa0f, 0xb06, 0xc0a, 0xd03, 0xe09, 0xf00, + 0x190, 0x99, 0x393, 0x29a, 0x596, 0x49f, 0x795, 0x69c, 0x99c, 0x895, 0xb9f, 0xa96, 0xd9a, 0xc93, 0xf99, 0xe90, + 0x230, 0x339, 0x33, 0x13a, 0x636, 0x73f, 0x435, 0x53c, 0xa3c, 0xb35, 0x83f, 0x936, 0xe3a, 0xf33, 0xc39, 0xd30, + 0x3a0, 0x2a9, 0x1a3, 0xaa, 0x7a6, 0x6af, 0x5a5, 0x4ac, 0xbac, 0xaa5, 0x9af, 0x8a6, 0xfaa, 0xea3, 0xda9, 0xca0, + 0x460, 0x569, 0x663, 0x76a, 0x66, 0x16f, 0x265, 0x36c, 0xc6c, 0xd65, 0xe6f, 0xf66, 0x86a, 0x963, 0xa69, 0xb60, + 0x5f0, 0x4f9, 0x7f3, 0x6fa, 0x1f6, 0xff, 0x3f5, 0x2fc, 0xdfc, 0xcf5, 0xfff, 0xef6, 0x9fa, 0x8f3, 0xbf9, 0xaf0, + 0x650, 0x759, 0x453, 0x55a, 0x256, 0x35f, 0x55, 0x15c, 0xe5c, 0xf55, 0xc5f, 0xd56, 0xa5a, 0xb53, 0x859, 0x950, + 0x7c0, 0x6c9, 0x5c3, 0x4ca, 0x3c6, 0x2cf, 0x1c5, 0xcc, 0xfcc, 0xec5, 0xdcf, 0xcc6, 0xbca, 0xac3, 0x9c9, 0x8c0, + 0x8c0, 0x9c9, 0xac3, 0xbca, 0xcc6, 0xdcf, 0xec5, 0xfcc, 0xcc, 0x1c5, 0x2cf, 0x3c6, 0x4ca, 0x5c3, 0x6c9, 0x7c0, + 0x950, 0x859, 0xb53, 0xa5a, 0xd56, 0xc5f, 0xf55, 0xe5c, 0x15c, 0x55, 0x35f, 0x256, 0x55a, 0x453, 0x759, 0x650, + 0xaf0, 0xbf9, 0x8f3, 0x9fa, 0xef6, 0xfff, 0xcf5, 0xdfc, 0x2fc, 0x3f5, 0xff, 0x1f6, 0x6fa, 0x7f3, 0x4f9, 0x5f0, + 0xb60, 0xa69, 0x963, 0x86a, 0xf66, 0xe6f, 0xd65, 0xc6c, 0x36c, 0x265, 0x16f, 0x66, 0x76a, 0x663, 0x569, 0x460, + 0xca0, 0xda9, 0xea3, 0xfaa, 0x8a6, 0x9af, 0xaa5, 0xbac, 0x4ac, 0x5a5, 0x6af, 0x7a6, 0xaa, 0x1a3, 0x2a9, 0x3a0, + 0xd30, 0xc39, 0xf33, 0xe3a, 0x936, 0x83f, 0xb35, 0xa3c, 0x53c, 0x435, 0x73f, 0x636, 0x13a, 0x33, 0x339, 0x230, + 0xe90, 0xf99, 0xc93, 0xd9a, 0xa96, 0xb9f, 0x895, 0x99c, 0x69c, 0x795, 0x49f, 0x596, 0x29a, 0x393, 0x99, 0x190, + 
0xf00, 0xe09, 0xd03, 0xc0a, 0xb06, 0xa0f, 0x905, 0x80c, 0x70c, 0x605, 0x50f, 0x406, 0x30a, 0x203, 0x109, 0x0}; // triangle table maps same cube vertex index to a list of up to 5 triangles // which are built from the interpolated edge vertices #define X 255 -uint triTable[256][16] = {{X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {0, 8, 3, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {0, 1, 9, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {1, 8, 3, 9, 8, 1, X, X, X, X, X, X, X, X, X, X}, - {1, 2, 10, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {0, 8, 3, 1, 2, 10, X, X, X, X, X, X, X, X, X, X}, - {9, 2, 10, 0, 2, 9, X, X, X, X, X, X, X, X, X, X}, - {2, 8, 3, 2, 10, 8, 10, 9, 8, X, X, X, X, X, X, X}, - {3, 11, 2, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {0, 11, 2, 8, 11, 0, X, X, X, X, X, X, X, X, X, X}, - {1, 9, 0, 2, 3, 11, X, X, X, X, X, X, X, X, X, X}, - {1, 11, 2, 1, 9, 11, 9, 8, 11, X, X, X, X, X, X, X}, - {3, 10, 1, 11, 10, 3, X, X, X, X, X, X, X, X, X, X}, - {0, 10, 1, 0, 8, 10, 8, 11, 10, X, X, X, X, X, X, X}, - {3, 9, 0, 3, 11, 9, 11, 10, 9, X, X, X, X, X, X, X}, - {9, 8, 10, 10, 8, 11, X, X, X, X, X, X, X, X, X, X}, - {4, 7, 8, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {4, 3, 0, 7, 3, 4, X, X, X, X, X, X, X, X, X, X}, - {0, 1, 9, 8, 4, 7, X, X, X, X, X, X, X, X, X, X}, - {4, 1, 9, 4, 7, 1, 7, 3, 1, X, X, X, X, X, X, X}, - {1, 2, 10, 8, 4, 7, X, X, X, X, X, X, X, X, X, X}, - {3, 4, 7, 3, 0, 4, 1, 2, 10, X, X, X, X, X, X, X}, - {9, 2, 10, 9, 0, 2, 8, 4, 7, X, X, X, X, X, X, X}, - {2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, X, X, X, X}, - {8, 4, 7, 3, 11, 2, X, X, X, X, X, X, X, X, X, X}, - {11, 4, 7, 11, 2, 4, 2, 0, 4, X, X, X, X, X, X, X}, - {9, 0, 1, 8, 4, 7, 2, 3, 11, X, X, X, X, X, X, X}, - {4, 7, 11, 9, 4, 11, 9, 11, 2, 9, 2, 1, X, X, X, X}, - {3, 10, 1, 3, 11, 10, 7, 8, 4, X, X, X, X, X, X, X}, - {1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, X, X, X, X}, - {4, 7, 8, 9, 0, 11, 9, 11, 10, 11, 0, 3, X, X, X, X}, - {4, 7, 11, 4, 11, 9, 9, 11, 10, X, X, X, X, X, X, X}, - {9, 5, 4, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {9, 5, 4, 0, 8, 3, X, X, X, X, X, X, X, X, X, X}, - {0, 5, 4, 1, 5, 0, X, X, X, X, X, X, X, X, X, X}, - {8, 5, 4, 8, 3, 5, 3, 1, 5, X, X, X, X, X, X, X}, - {1, 2, 10, 9, 5, 4, X, X, X, X, X, X, X, X, X, X}, - {3, 0, 8, 1, 2, 10, 4, 9, 5, X, X, X, X, X, X, X}, - {5, 2, 10, 5, 4, 2, 4, 0, 2, X, X, X, X, X, X, X}, - {2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, X, X, X, X}, - {9, 5, 4, 2, 3, 11, X, X, X, X, X, X, X, X, X, X}, - {0, 11, 2, 0, 8, 11, 4, 9, 5, X, X, X, X, X, X, X}, - {0, 5, 4, 0, 1, 5, 2, 3, 11, X, X, X, X, X, X, X}, - {2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, X, X, X, X}, - {10, 3, 11, 10, 1, 3, 9, 5, 4, X, X, X, X, X, X, X}, - {4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, X, X, X, X}, - {5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, X, X, X, X}, - {5, 4, 8, 5, 8, 10, 10, 8, 11, X, X, X, X, X, X, X}, - {9, 7, 8, 5, 7, 9, X, X, X, X, X, X, X, X, X, X}, - {9, 3, 0, 9, 5, 3, 5, 7, 3, X, X, X, X, X, X, X}, - {0, 7, 8, 0, 1, 7, 1, 5, 7, X, X, X, X, X, X, X}, - {1, 5, 3, 3, 5, 7, X, X, X, X, X, X, X, X, X, X}, - {9, 7, 8, 9, 5, 7, 10, 1, 2, X, X, X, X, X, X, X}, - {10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, X, X, X, X}, - {8, 0, 2, 8, 2, 5, 8, 5, 7, 10, 5, 2, X, X, X, X}, - {2, 10, 5, 2, 5, 3, 3, 5, 7, X, X, X, X, X, X, X}, - {7, 9, 5, 7, 8, 9, 3, 11, 2, X, X, X, X, X, X, X}, - {9, 5, 7, 9, 7, 2, 9, 2, 0, 2, 7, 11, X, X, X, X}, - {2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, X, X, X, X}, - {11, 2, 1, 11, 1, 7, 7, 1, 5, X, X, X, X, X, X, X}, - {9, 5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, X, X, X, X}, - {5, 7, 0, 5, 
0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, X}, - {11, 10, 0, 11, 0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, X}, - {11, 10, 5, 7, 11, 5, X, X, X, X, X, X, X, X, X, X}, - {10, 6, 5, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {0, 8, 3, 5, 10, 6, X, X, X, X, X, X, X, X, X, X}, - {9, 0, 1, 5, 10, 6, X, X, X, X, X, X, X, X, X, X}, - {1, 8, 3, 1, 9, 8, 5, 10, 6, X, X, X, X, X, X, X}, - {1, 6, 5, 2, 6, 1, X, X, X, X, X, X, X, X, X, X}, - {1, 6, 5, 1, 2, 6, 3, 0, 8, X, X, X, X, X, X, X}, - {9, 6, 5, 9, 0, 6, 0, 2, 6, X, X, X, X, X, X, X}, - {5, 9, 8, 5, 8, 2, 5, 2, 6, 3, 2, 8, X, X, X, X}, - {2, 3, 11, 10, 6, 5, X, X, X, X, X, X, X, X, X, X}, - {11, 0, 8, 11, 2, 0, 10, 6, 5, X, X, X, X, X, X, X}, - {0, 1, 9, 2, 3, 11, 5, 10, 6, X, X, X, X, X, X, X}, - {5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, X, X, X, X}, - {6, 3, 11, 6, 5, 3, 5, 1, 3, X, X, X, X, X, X, X}, - {0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, X, X, X, X}, - {3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, X, X, X, X}, - {6, 5, 9, 6, 9, 11, 11, 9, 8, X, X, X, X, X, X, X}, - {5, 10, 6, 4, 7, 8, X, X, X, X, X, X, X, X, X, X}, - {4, 3, 0, 4, 7, 3, 6, 5, 10, X, X, X, X, X, X, X}, - {1, 9, 0, 5, 10, 6, 8, 4, 7, X, X, X, X, X, X, X}, - {10, 6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, X, X, X, X}, - {6, 1, 2, 6, 5, 1, 4, 7, 8, X, X, X, X, X, X, X}, - {1, 2, 5, 5, 2, 6, 3, 0, 4, 3, 4, 7, X, X, X, X}, - {8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, X, X, X, X}, - {7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, X}, - {3, 11, 2, 7, 8, 4, 10, 6, 5, X, X, X, X, X, X, X}, - {5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, X, X, X, X}, - {0, 1, 9, 4, 7, 8, 2, 3, 11, 5, 10, 6, X, X, X, X}, - {9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, X}, - {8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, X, X, X, X}, - {5, 1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, X}, - {0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, X}, - {6, 5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, X, X, X, X}, - {10, 4, 9, 6, 4, 10, X, X, X, X, X, X, X, X, X, X}, - {4, 10, 6, 4, 9, 10, 0, 8, 3, X, X, X, X, X, X, X}, - {10, 0, 1, 10, 6, 0, 6, 4, 0, X, X, X, X, X, X, X}, - {8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, X, X, X, X}, - {1, 4, 9, 1, 2, 4, 2, 6, 4, X, X, X, X, X, X, X}, - {3, 0, 8, 1, 2, 9, 2, 4, 9, 2, 6, 4, X, X, X, X}, - {0, 2, 4, 4, 2, 6, X, X, X, X, X, X, X, X, X, X}, - {8, 3, 2, 8, 2, 4, 4, 2, 6, X, X, X, X, X, X, X}, - {10, 4, 9, 10, 6, 4, 11, 2, 3, X, X, X, X, X, X, X}, - {0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, X, X, X, X}, - {3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, X, X, X, X}, - {6, 4, 1, 6, 1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, X}, - {9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, X, X, X, X}, - {8, 11, 1, 8, 1, 0, 11, 6, 1, 9, 1, 4, 6, 4, 1, X}, - {3, 11, 6, 3, 6, 0, 0, 6, 4, X, X, X, X, X, X, X}, - {6, 4, 8, 11, 6, 8, X, X, X, X, X, X, X, X, X, X}, - {7, 10, 6, 7, 8, 10, 8, 9, 10, X, X, X, X, X, X, X}, - {0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, X, X, X, X}, - {10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, X, X, X, X}, - {10, 6, 7, 10, 7, 1, 1, 7, 3, X, X, X, X, X, X, X}, - {1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 6, 7, X, X, X, X}, - {2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7, 3, 9, X}, - {7, 8, 0, 7, 0, 6, 6, 0, 2, X, X, X, X, X, X, X}, - {7, 3, 2, 6, 7, 2, X, X, X, X, X, X, X, X, X, X}, - {2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, X, X, X, X}, - {2, 0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, X}, - {1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, X}, - {11, 2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, X, X, X, X}, - {8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, X}, - {0, 9, 1, 11, 6, 7, X, X, X, X, X, X, X, X, X, X}, - {7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, X, X, X, X}, - {7, 11, 6, X, X, X, X, X, X, 
X, X, X, X, X, X, X}, - {7, 6, 11, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {3, 0, 8, 11, 7, 6, X, X, X, X, X, X, X, X, X, X}, - {0, 1, 9, 11, 7, 6, X, X, X, X, X, X, X, X, X, X}, - {8, 1, 9, 8, 3, 1, 11, 7, 6, X, X, X, X, X, X, X}, - {10, 1, 2, 6, 11, 7, X, X, X, X, X, X, X, X, X, X}, - {1, 2, 10, 3, 0, 8, 6, 11, 7, X, X, X, X, X, X, X}, - {2, 9, 0, 2, 10, 9, 6, 11, 7, X, X, X, X, X, X, X}, - {6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, X, X, X, X}, - {7, 2, 3, 6, 2, 7, X, X, X, X, X, X, X, X, X, X}, - {7, 0, 8, 7, 6, 0, 6, 2, 0, X, X, X, X, X, X, X}, - {2, 7, 6, 2, 3, 7, 0, 1, 9, X, X, X, X, X, X, X}, - {1, 6, 2, 1, 8, 6, 1, 9, 8, 8, 7, 6, X, X, X, X}, - {10, 7, 6, 10, 1, 7, 1, 3, 7, X, X, X, X, X, X, X}, - {10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, X, X, X, X}, - {0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, X, X, X, X}, - {7, 6, 10, 7, 10, 8, 8, 10, 9, X, X, X, X, X, X, X}, - {6, 8, 4, 11, 8, 6, X, X, X, X, X, X, X, X, X, X}, - {3, 6, 11, 3, 0, 6, 0, 4, 6, X, X, X, X, X, X, X}, - {8, 6, 11, 8, 4, 6, 9, 0, 1, X, X, X, X, X, X, X}, - {9, 4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, X, X, X, X}, - {6, 8, 4, 6, 11, 8, 2, 10, 1, X, X, X, X, X, X, X}, - {1, 2, 10, 3, 0, 11, 0, 6, 11, 0, 4, 6, X, X, X, X}, - {4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, X, X, X, X}, - {10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, X}, - {8, 2, 3, 8, 4, 2, 4, 6, 2, X, X, X, X, X, X, X}, - {0, 4, 2, 4, 6, 2, X, X, X, X, X, X, X, X, X, X}, - {1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, X, X, X, X}, - {1, 9, 4, 1, 4, 2, 2, 4, 6, X, X, X, X, X, X, X}, - {8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, X, X, X, X}, - {10, 1, 0, 10, 0, 6, 6, 0, 4, X, X, X, X, X, X, X}, - {4, 6, 3, 4, 3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, X}, - {10, 9, 4, 6, 10, 4, X, X, X, X, X, X, X, X, X, X}, - {4, 9, 5, 7, 6, 11, X, X, X, X, X, X, X, X, X, X}, - {0, 8, 3, 4, 9, 5, 11, 7, 6, X, X, X, X, X, X, X}, - {5, 0, 1, 5, 4, 0, 7, 6, 11, X, X, X, X, X, X, X}, - {11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, X, X, X, X}, - {9, 5, 4, 10, 1, 2, 7, 6, 11, X, X, X, X, X, X, X}, - {6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, X, X, X, X}, - {7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, X, X, X, X}, - {3, 4, 8, 3, 5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, X}, - {7, 2, 3, 7, 6, 2, 5, 4, 9, X, X, X, X, X, X, X}, - {9, 5, 4, 0, 8, 6, 0, 6, 2, 6, 8, 7, X, X, X, X}, - {3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, X, X, X, X}, - {6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, X}, - {9, 5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, X, X, X, X}, - {1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, X}, - {4, 0, 10, 4, 10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, X}, - {7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, X, X, X, X}, - {6, 9, 5, 6, 11, 9, 11, 8, 9, X, X, X, X, X, X, X}, - {3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, X, X, X, X}, - {0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, X, X, X, X}, - {6, 11, 3, 6, 3, 5, 5, 3, 1, X, X, X, X, X, X, X}, - {1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, X, X, X, X}, - {0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1, 2, 10, X}, - {11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, X}, - {6, 11, 3, 6, 3, 5, 2, 10, 3, 10, 5, 3, X, X, X, X}, - {5, 8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, X, X, X, X}, - {9, 5, 6, 9, 6, 0, 0, 6, 2, X, X, X, X, X, X, X}, - {1, 5, 8, 1, 8, 0, 5, 6, 8, 3, 8, 2, 6, 2, 8, X}, - {1, 5, 6, 2, 1, 6, X, X, X, X, X, X, X, X, X, X}, - {1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, X}, - {10, 1, 0, 10, 0, 6, 9, 5, 0, 5, 6, 0, X, X, X, X}, - {0, 3, 8, 5, 6, 10, X, X, X, X, X, X, X, X, X, X}, - {10, 5, 6, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {11, 5, 10, 7, 5, 11, X, X, X, X, X, X, X, X, X, X}, - {11, 5, 10, 11, 7, 5, 8, 3, 0, X, X, X, X, X, X, X}, 
- {5, 11, 7, 5, 10, 11, 1, 9, 0, X, X, X, X, X, X, X}, - {10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, X, X, X, X}, - {11, 1, 2, 11, 7, 1, 7, 5, 1, X, X, X, X, X, X, X}, - {0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, X, X, X, X}, - {9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, X, X, X, X}, - {7, 5, 2, 7, 2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, X}, - {2, 5, 10, 2, 3, 5, 3, 7, 5, X, X, X, X, X, X, X}, - {8, 2, 0, 8, 5, 2, 8, 7, 5, 10, 2, 5, X, X, X, X}, - {9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, X, X, X, X}, - {9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, X}, - {1, 3, 5, 3, 7, 5, X, X, X, X, X, X, X, X, X, X}, - {0, 8, 7, 0, 7, 1, 1, 7, 5, X, X, X, X, X, X, X}, - {9, 0, 3, 9, 3, 5, 5, 3, 7, X, X, X, X, X, X, X}, - {9, 8, 7, 5, 9, 7, X, X, X, X, X, X, X, X, X, X}, - {5, 8, 4, 5, 10, 8, 10, 11, 8, X, X, X, X, X, X, X}, - {5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, X, X, X, X}, - {0, 1, 9, 8, 4, 10, 8, 10, 11, 10, 4, 5, X, X, X, X}, - {10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, X}, - {2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, X, X, X, X}, - {0, 4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, X}, - {0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, X}, - {9, 4, 5, 2, 11, 3, X, X, X, X, X, X, X, X, X, X}, - {2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, X, X, X, X}, - {5, 10, 2, 5, 2, 4, 4, 2, 0, X, X, X, X, X, X, X}, - {3, 10, 2, 3, 5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, X}, - {5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, X, X, X, X}, - {8, 4, 5, 8, 5, 3, 3, 5, 1, X, X, X, X, X, X, X}, - {0, 4, 5, 1, 0, 5, X, X, X, X, X, X, X, X, X, X}, - {8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, X, X, X, X}, - {9, 4, 5, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {4, 11, 7, 4, 9, 11, 9, 10, 11, X, X, X, X, X, X, X}, - {0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, X, X, X, X}, - {1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, X, X, X, X}, - {3, 1, 4, 3, 4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, X}, - {4, 11, 7, 9, 11, 4, 9, 2, 11, 9, 1, 2, X, X, X, X}, - {9, 7, 4, 9, 11, 7, 9, 1, 11, 2, 11, 1, 0, 8, 3, X}, - {11, 7, 4, 11, 4, 2, 2, 4, 0, X, X, X, X, X, X, X}, - {11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, X, X, X, X}, - {2, 9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, X, X, X, X}, - {9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, X}, - {3, 7, 10, 3, 10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, X}, - {1, 10, 2, 8, 7, 4, X, X, X, X, X, X, X, X, X, X}, - {4, 9, 1, 4, 1, 7, 7, 1, 3, X, X, X, X, X, X, X}, - {4, 9, 1, 4, 1, 7, 0, 8, 1, 8, 7, 1, X, X, X, X}, - {4, 0, 3, 7, 4, 3, X, X, X, X, X, X, X, X, X, X}, - {4, 8, 7, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {9, 10, 8, 10, 11, 8, X, X, X, X, X, X, X, X, X, X}, - {3, 0, 9, 3, 9, 11, 11, 9, 10, X, X, X, X, X, X, X}, - {0, 1, 10, 0, 10, 8, 8, 10, 11, X, X, X, X, X, X, X}, - {3, 1, 10, 11, 3, 10, X, X, X, X, X, X, X, X, X, X}, - {1, 2, 11, 1, 11, 9, 9, 11, 8, X, X, X, X, X, X, X}, - {3, 0, 9, 3, 9, 11, 1, 2, 9, 2, 11, 9, X, X, X, X}, - {0, 2, 11, 8, 0, 11, X, X, X, X, X, X, X, X, X, X}, - {3, 2, 11, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {2, 3, 8, 2, 8, 10, 10, 8, 9, X, X, X, X, X, X, X}, - {9, 10, 2, 0, 9, 2, X, X, X, X, X, X, X, X, X, X}, - {2, 3, 8, 2, 8, 10, 0, 1, 8, 1, 10, 8, X, X, X, X}, - {1, 10, 2, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {1, 3, 8, 9, 1, 8, X, X, X, X, X, X, X, X, X, X}, - {0, 9, 1, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {0, 3, 8, X, X, X, X, X, X, X, X, X, X, X, X, X}, - {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}}; +uint triTable[256][16] = { + {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}, {0, 8, 3, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {0, 1, 9, X, X, X, X, X, X, X, X, X, X, X, X, X}, {1, 8, 3, 9, 8, 1, X, X, X, X, X, X, X, 
X, X, X}, + {1, 2, 10, X, X, X, X, X, X, X, X, X, X, X, X, X}, {0, 8, 3, 1, 2, 10, X, X, X, X, X, X, X, X, X, X}, + {9, 2, 10, 0, 2, 9, X, X, X, X, X, X, X, X, X, X}, {2, 8, 3, 2, 10, 8, 10, 9, 8, X, X, X, X, X, X, X}, + {3, 11, 2, X, X, X, X, X, X, X, X, X, X, X, X, X}, {0, 11, 2, 8, 11, 0, X, X, X, X, X, X, X, X, X, X}, + {1, 9, 0, 2, 3, 11, X, X, X, X, X, X, X, X, X, X}, {1, 11, 2, 1, 9, 11, 9, 8, 11, X, X, X, X, X, X, X}, + {3, 10, 1, 11, 10, 3, X, X, X, X, X, X, X, X, X, X}, {0, 10, 1, 0, 8, 10, 8, 11, 10, X, X, X, X, X, X, X}, + {3, 9, 0, 3, 11, 9, 11, 10, 9, X, X, X, X, X, X, X}, {9, 8, 10, 10, 8, 11, X, X, X, X, X, X, X, X, X, X}, + {4, 7, 8, X, X, X, X, X, X, X, X, X, X, X, X, X}, {4, 3, 0, 7, 3, 4, X, X, X, X, X, X, X, X, X, X}, + {0, 1, 9, 8, 4, 7, X, X, X, X, X, X, X, X, X, X}, {4, 1, 9, 4, 7, 1, 7, 3, 1, X, X, X, X, X, X, X}, + {1, 2, 10, 8, 4, 7, X, X, X, X, X, X, X, X, X, X}, {3, 4, 7, 3, 0, 4, 1, 2, 10, X, X, X, X, X, X, X}, + {9, 2, 10, 9, 0, 2, 8, 4, 7, X, X, X, X, X, X, X}, {2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, X, X, X, X}, + {8, 4, 7, 3, 11, 2, X, X, X, X, X, X, X, X, X, X}, {11, 4, 7, 11, 2, 4, 2, 0, 4, X, X, X, X, X, X, X}, + {9, 0, 1, 8, 4, 7, 2, 3, 11, X, X, X, X, X, X, X}, {4, 7, 11, 9, 4, 11, 9, 11, 2, 9, 2, 1, X, X, X, X}, + {3, 10, 1, 3, 11, 10, 7, 8, 4, X, X, X, X, X, X, X}, {1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, X, X, X, X}, + {4, 7, 8, 9, 0, 11, 9, 11, 10, 11, 0, 3, X, X, X, X}, {4, 7, 11, 4, 11, 9, 9, 11, 10, X, X, X, X, X, X, X}, + {9, 5, 4, X, X, X, X, X, X, X, X, X, X, X, X, X}, {9, 5, 4, 0, 8, 3, X, X, X, X, X, X, X, X, X, X}, + {0, 5, 4, 1, 5, 0, X, X, X, X, X, X, X, X, X, X}, {8, 5, 4, 8, 3, 5, 3, 1, 5, X, X, X, X, X, X, X}, + {1, 2, 10, 9, 5, 4, X, X, X, X, X, X, X, X, X, X}, {3, 0, 8, 1, 2, 10, 4, 9, 5, X, X, X, X, X, X, X}, + {5, 2, 10, 5, 4, 2, 4, 0, 2, X, X, X, X, X, X, X}, {2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, X, X, X, X}, + {9, 5, 4, 2, 3, 11, X, X, X, X, X, X, X, X, X, X}, {0, 11, 2, 0, 8, 11, 4, 9, 5, X, X, X, X, X, X, X}, + {0, 5, 4, 0, 1, 5, 2, 3, 11, X, X, X, X, X, X, X}, {2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, X, X, X, X}, + {10, 3, 11, 10, 1, 3, 9, 5, 4, X, X, X, X, X, X, X}, {4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, X, X, X, X}, + {5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, X, X, X, X}, {5, 4, 8, 5, 8, 10, 10, 8, 11, X, X, X, X, X, X, X}, + {9, 7, 8, 5, 7, 9, X, X, X, X, X, X, X, X, X, X}, {9, 3, 0, 9, 5, 3, 5, 7, 3, X, X, X, X, X, X, X}, + {0, 7, 8, 0, 1, 7, 1, 5, 7, X, X, X, X, X, X, X}, {1, 5, 3, 3, 5, 7, X, X, X, X, X, X, X, X, X, X}, + {9, 7, 8, 9, 5, 7, 10, 1, 2, X, X, X, X, X, X, X}, {10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, X, X, X, X}, + {8, 0, 2, 8, 2, 5, 8, 5, 7, 10, 5, 2, X, X, X, X}, {2, 10, 5, 2, 5, 3, 3, 5, 7, X, X, X, X, X, X, X}, + {7, 9, 5, 7, 8, 9, 3, 11, 2, X, X, X, X, X, X, X}, {9, 5, 7, 9, 7, 2, 9, 2, 0, 2, 7, 11, X, X, X, X}, + {2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, X, X, X, X}, {11, 2, 1, 11, 1, 7, 7, 1, 5, X, X, X, X, X, X, X}, + {9, 5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, X, X, X, X}, {5, 7, 0, 5, 0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, X}, + {11, 10, 0, 11, 0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, X}, {11, 10, 5, 7, 11, 5, X, X, X, X, X, X, X, X, X, X}, + {10, 6, 5, X, X, X, X, X, X, X, X, X, X, X, X, X}, {0, 8, 3, 5, 10, 6, X, X, X, X, X, X, X, X, X, X}, + {9, 0, 1, 5, 10, 6, X, X, X, X, X, X, X, X, X, X}, {1, 8, 3, 1, 9, 8, 5, 10, 6, X, X, X, X, X, X, X}, + {1, 6, 5, 2, 6, 1, X, X, X, X, X, X, X, X, X, X}, {1, 6, 5, 1, 2, 6, 3, 0, 8, X, X, X, X, X, X, X}, + {9, 6, 5, 9, 0, 6, 0, 2, 6, X, X, X, X, X, X, X}, {5, 9, 8, 5, 8, 2, 5, 2, 
6, 3, 2, 8, X, X, X, X}, + {2, 3, 11, 10, 6, 5, X, X, X, X, X, X, X, X, X, X}, {11, 0, 8, 11, 2, 0, 10, 6, 5, X, X, X, X, X, X, X}, + {0, 1, 9, 2, 3, 11, 5, 10, 6, X, X, X, X, X, X, X}, {5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, X, X, X, X}, + {6, 3, 11, 6, 5, 3, 5, 1, 3, X, X, X, X, X, X, X}, {0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, X, X, X, X}, + {3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, X, X, X, X}, {6, 5, 9, 6, 9, 11, 11, 9, 8, X, X, X, X, X, X, X}, + {5, 10, 6, 4, 7, 8, X, X, X, X, X, X, X, X, X, X}, {4, 3, 0, 4, 7, 3, 6, 5, 10, X, X, X, X, X, X, X}, + {1, 9, 0, 5, 10, 6, 8, 4, 7, X, X, X, X, X, X, X}, {10, 6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, X, X, X, X}, + {6, 1, 2, 6, 5, 1, 4, 7, 8, X, X, X, X, X, X, X}, {1, 2, 5, 5, 2, 6, 3, 0, 4, 3, 4, 7, X, X, X, X}, + {8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, X, X, X, X}, {7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, X}, + {3, 11, 2, 7, 8, 4, 10, 6, 5, X, X, X, X, X, X, X}, {5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, X, X, X, X}, + {0, 1, 9, 4, 7, 8, 2, 3, 11, 5, 10, 6, X, X, X, X}, {9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, X}, + {8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, X, X, X, X}, {5, 1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, X}, + {0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, X}, {6, 5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, X, X, X, X}, + {10, 4, 9, 6, 4, 10, X, X, X, X, X, X, X, X, X, X}, {4, 10, 6, 4, 9, 10, 0, 8, 3, X, X, X, X, X, X, X}, + {10, 0, 1, 10, 6, 0, 6, 4, 0, X, X, X, X, X, X, X}, {8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, X, X, X, X}, + {1, 4, 9, 1, 2, 4, 2, 6, 4, X, X, X, X, X, X, X}, {3, 0, 8, 1, 2, 9, 2, 4, 9, 2, 6, 4, X, X, X, X}, + {0, 2, 4, 4, 2, 6, X, X, X, X, X, X, X, X, X, X}, {8, 3, 2, 8, 2, 4, 4, 2, 6, X, X, X, X, X, X, X}, + {10, 4, 9, 10, 6, 4, 11, 2, 3, X, X, X, X, X, X, X}, {0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, X, X, X, X}, + {3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, X, X, X, X}, {6, 4, 1, 6, 1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, X}, + {9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, X, X, X, X}, {8, 11, 1, 8, 1, 0, 11, 6, 1, 9, 1, 4, 6, 4, 1, X}, + {3, 11, 6, 3, 6, 0, 0, 6, 4, X, X, X, X, X, X, X}, {6, 4, 8, 11, 6, 8, X, X, X, X, X, X, X, X, X, X}, + {7, 10, 6, 7, 8, 10, 8, 9, 10, X, X, X, X, X, X, X}, {0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, X, X, X, X}, + {10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, X, X, X, X}, {10, 6, 7, 10, 7, 1, 1, 7, 3, X, X, X, X, X, X, X}, + {1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 6, 7, X, X, X, X}, {2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7, 3, 9, X}, + {7, 8, 0, 7, 0, 6, 6, 0, 2, X, X, X, X, X, X, X}, {7, 3, 2, 6, 7, 2, X, X, X, X, X, X, X, X, X, X}, + {2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, X, X, X, X}, {2, 0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, X}, + {1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, X}, {11, 2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, X, X, X, X}, + {8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, X}, {0, 9, 1, 11, 6, 7, X, X, X, X, X, X, X, X, X, X}, + {7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, X, X, X, X}, {7, 11, 6, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {7, 6, 11, X, X, X, X, X, X, X, X, X, X, X, X, X}, {3, 0, 8, 11, 7, 6, X, X, X, X, X, X, X, X, X, X}, + {0, 1, 9, 11, 7, 6, X, X, X, X, X, X, X, X, X, X}, {8, 1, 9, 8, 3, 1, 11, 7, 6, X, X, X, X, X, X, X}, + {10, 1, 2, 6, 11, 7, X, X, X, X, X, X, X, X, X, X}, {1, 2, 10, 3, 0, 8, 6, 11, 7, X, X, X, X, X, X, X}, + {2, 9, 0, 2, 10, 9, 6, 11, 7, X, X, X, X, X, X, X}, {6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, X, X, X, X}, + {7, 2, 3, 6, 2, 7, X, X, X, X, X, X, X, X, X, X}, {7, 0, 8, 7, 6, 0, 6, 2, 0, X, X, X, X, X, X, X}, + {2, 7, 6, 2, 3, 7, 0, 1, 9, X, X, X, X, X, X, X}, {1, 6, 
2, 1, 8, 6, 1, 9, 8, 8, 7, 6, X, X, X, X}, + {10, 7, 6, 10, 1, 7, 1, 3, 7, X, X, X, X, X, X, X}, {10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, X, X, X, X}, + {0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, X, X, X, X}, {7, 6, 10, 7, 10, 8, 8, 10, 9, X, X, X, X, X, X, X}, + {6, 8, 4, 11, 8, 6, X, X, X, X, X, X, X, X, X, X}, {3, 6, 11, 3, 0, 6, 0, 4, 6, X, X, X, X, X, X, X}, + {8, 6, 11, 8, 4, 6, 9, 0, 1, X, X, X, X, X, X, X}, {9, 4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, X, X, X, X}, + {6, 8, 4, 6, 11, 8, 2, 10, 1, X, X, X, X, X, X, X}, {1, 2, 10, 3, 0, 11, 0, 6, 11, 0, 4, 6, X, X, X, X}, + {4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, X, X, X, X}, {10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, X}, + {8, 2, 3, 8, 4, 2, 4, 6, 2, X, X, X, X, X, X, X}, {0, 4, 2, 4, 6, 2, X, X, X, X, X, X, X, X, X, X}, + {1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, X, X, X, X}, {1, 9, 4, 1, 4, 2, 2, 4, 6, X, X, X, X, X, X, X}, + {8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, X, X, X, X}, {10, 1, 0, 10, 0, 6, 6, 0, 4, X, X, X, X, X, X, X}, + {4, 6, 3, 4, 3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, X}, {10, 9, 4, 6, 10, 4, X, X, X, X, X, X, X, X, X, X}, + {4, 9, 5, 7, 6, 11, X, X, X, X, X, X, X, X, X, X}, {0, 8, 3, 4, 9, 5, 11, 7, 6, X, X, X, X, X, X, X}, + {5, 0, 1, 5, 4, 0, 7, 6, 11, X, X, X, X, X, X, X}, {11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, X, X, X, X}, + {9, 5, 4, 10, 1, 2, 7, 6, 11, X, X, X, X, X, X, X}, {6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, X, X, X, X}, + {7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, X, X, X, X}, {3, 4, 8, 3, 5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, X}, + {7, 2, 3, 7, 6, 2, 5, 4, 9, X, X, X, X, X, X, X}, {9, 5, 4, 0, 8, 6, 0, 6, 2, 6, 8, 7, X, X, X, X}, + {3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, X, X, X, X}, {6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, X}, + {9, 5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, X, X, X, X}, {1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, X}, + {4, 0, 10, 4, 10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, X}, {7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, X, X, X, X}, + {6, 9, 5, 6, 11, 9, 11, 8, 9, X, X, X, X, X, X, X}, {3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, X, X, X, X}, + {0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, X, X, X, X}, {6, 11, 3, 6, 3, 5, 5, 3, 1, X, X, X, X, X, X, X}, + {1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, X, X, X, X}, {0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1, 2, 10, X}, + {11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, X}, {6, 11, 3, 6, 3, 5, 2, 10, 3, 10, 5, 3, X, X, X, X}, + {5, 8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, X, X, X, X}, {9, 5, 6, 9, 6, 0, 0, 6, 2, X, X, X, X, X, X, X}, + {1, 5, 8, 1, 8, 0, 5, 6, 8, 3, 8, 2, 6, 2, 8, X}, {1, 5, 6, 2, 1, 6, X, X, X, X, X, X, X, X, X, X}, + {1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, X}, {10, 1, 0, 10, 0, 6, 9, 5, 0, 5, 6, 0, X, X, X, X}, + {0, 3, 8, 5, 6, 10, X, X, X, X, X, X, X, X, X, X}, {10, 5, 6, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {11, 5, 10, 7, 5, 11, X, X, X, X, X, X, X, X, X, X}, {11, 5, 10, 11, 7, 5, 8, 3, 0, X, X, X, X, X, X, X}, + {5, 11, 7, 5, 10, 11, 1, 9, 0, X, X, X, X, X, X, X}, {10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, X, X, X, X}, + {11, 1, 2, 11, 7, 1, 7, 5, 1, X, X, X, X, X, X, X}, {0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, X, X, X, X}, + {9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, X, X, X, X}, {7, 5, 2, 7, 2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, X}, + {2, 5, 10, 2, 3, 5, 3, 7, 5, X, X, X, X, X, X, X}, {8, 2, 0, 8, 5, 2, 8, 7, 5, 10, 2, 5, X, X, X, X}, + {9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, X, X, X, X}, {9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, X}, + {1, 3, 5, 3, 7, 5, X, X, X, X, X, X, X, X, X, X}, {0, 8, 7, 0, 7, 1, 1, 7, 5, X, X, X, X, X, X, X}, + {9, 0, 3, 9, 3, 5, 5, 3, 7, X, X, X, X, 
X, X, X}, {9, 8, 7, 5, 9, 7, X, X, X, X, X, X, X, X, X, X}, + {5, 8, 4, 5, 10, 8, 10, 11, 8, X, X, X, X, X, X, X}, {5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, X, X, X, X}, + {0, 1, 9, 8, 4, 10, 8, 10, 11, 10, 4, 5, X, X, X, X}, {10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, X}, + {2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, X, X, X, X}, {0, 4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, X}, + {0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, X}, {9, 4, 5, 2, 11, 3, X, X, X, X, X, X, X, X, X, X}, + {2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, X, X, X, X}, {5, 10, 2, 5, 2, 4, 4, 2, 0, X, X, X, X, X, X, X}, + {3, 10, 2, 3, 5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, X}, {5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, X, X, X, X}, + {8, 4, 5, 8, 5, 3, 3, 5, 1, X, X, X, X, X, X, X}, {0, 4, 5, 1, 0, 5, X, X, X, X, X, X, X, X, X, X}, + {8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, X, X, X, X}, {9, 4, 5, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {4, 11, 7, 4, 9, 11, 9, 10, 11, X, X, X, X, X, X, X}, {0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, X, X, X, X}, + {1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, X, X, X, X}, {3, 1, 4, 3, 4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, X}, + {4, 11, 7, 9, 11, 4, 9, 2, 11, 9, 1, 2, X, X, X, X}, {9, 7, 4, 9, 11, 7, 9, 1, 11, 2, 11, 1, 0, 8, 3, X}, + {11, 7, 4, 11, 4, 2, 2, 4, 0, X, X, X, X, X, X, X}, {11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, X, X, X, X}, + {2, 9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, X, X, X, X}, {9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, X}, + {3, 7, 10, 3, 10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, X}, {1, 10, 2, 8, 7, 4, X, X, X, X, X, X, X, X, X, X}, + {4, 9, 1, 4, 1, 7, 7, 1, 3, X, X, X, X, X, X, X}, {4, 9, 1, 4, 1, 7, 0, 8, 1, 8, 7, 1, X, X, X, X}, + {4, 0, 3, 7, 4, 3, X, X, X, X, X, X, X, X, X, X}, {4, 8, 7, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {9, 10, 8, 10, 11, 8, X, X, X, X, X, X, X, X, X, X}, {3, 0, 9, 3, 9, 11, 11, 9, 10, X, X, X, X, X, X, X}, + {0, 1, 10, 0, 10, 8, 8, 10, 11, X, X, X, X, X, X, X}, {3, 1, 10, 11, 3, 10, X, X, X, X, X, X, X, X, X, X}, + {1, 2, 11, 1, 11, 9, 9, 11, 8, X, X, X, X, X, X, X}, {3, 0, 9, 3, 9, 11, 1, 2, 9, 2, 11, 9, X, X, X, X}, + {0, 2, 11, 8, 0, 11, X, X, X, X, X, X, X, X, X, X}, {3, 2, 11, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {2, 3, 8, 2, 8, 10, 10, 8, 9, X, X, X, X, X, X, X}, {9, 10, 2, 0, 9, 2, X, X, X, X, X, X, X, X, X, X}, + {2, 3, 8, 2, 8, 10, 0, 1, 8, 1, 10, 8, X, X, X, X}, {1, 10, 2, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {1, 3, 8, 9, 1, 8, X, X, X, X, X, X, X, X, X, X}, {0, 9, 1, X, X, X, X, X, X, X, X, X, X, X, X, X}, + {0, 3, 8, X, X, X, X, X, X, X, X, X, X, X, X, X}, {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}}; #undef X // number of vertices for each case above uint numVertsTable[256] = { - 0, 3, 3, 6, 3, 6, 6, 9, 3, 6, 6, 9, 6, 9, 9, 6, 3, 6, 6, - 9, 6, 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 3, 6, 6, 9, 6, 9, - 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, 9, 9, 6, 9, 12, 12, 9, 9, - 12, 12, 9, 12, 15, 15, 6, 3, 6, 6, 9, 6, 9, 9, 12, 6, 9, 9, 12, - 9, 12, 12, 9, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, 15, 12, 15, 15, - 12, 6, 9, 9, 12, 9, 12, 6, 9, 9, 12, 12, 15, 12, 15, 9, 6, 9, 12, - 12, 9, 12, 15, 9, 6, 12, 15, 15, 12, 15, 6, 12, 3, 3, 6, 6, 9, 6, - 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, 9, 9, 12, 9, 12, 12, 15, - 9, 6, 12, 9, 12, 9, 15, 6, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, - 15, 12, 15, 15, 12, 9, 12, 12, 9, 12, 15, 15, 12, 12, 9, 15, 6, 15, 12, - 6, 3, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, 15, 6, 9, 9, 6, 9, - 12, 12, 15, 12, 15, 15, 6, 12, 9, 15, 12, 9, 6, 12, 3, 9, 12, 12, 15, - 12, 15, 9, 12, 12, 15, 15, 6, 9, 12, 6, 3, 6, 9, 9, 6, 9, 12, 6, - 
3, 9, 6, 12, 3, 6, 3, 3, 0, + 0, 3, 3, 6, 3, 6, 6, 9, 3, 6, 6, 9, 6, 9, 9, 6, 3, 6, 6, 9, 6, 9, 9, 12, 6, 9, 9, 12, 9, + 12, 12, 9, 3, 6, 6, 9, 6, 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, 9, 9, 6, 9, 12, 12, 9, 9, 12, + 12, 9, 12, 15, 15, 6, 3, 6, 6, 9, 6, 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, 9, 9, 12, 9, 12, 12, + 15, 9, 12, 12, 15, 12, 15, 15, 12, 6, 9, 9, 12, 9, 12, 6, 9, 9, 12, 12, 15, 12, 15, 9, 6, 9, 12, 12, 9, + 12, 15, 9, 6, 12, 15, 15, 12, 15, 6, 12, 3, 3, 6, 6, 9, 6, 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, + 9, 9, 12, 9, 12, 12, 15, 9, 6, 12, 9, 12, 9, 15, 6, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, 15, 12, 15, + 15, 12, 9, 12, 12, 9, 12, 15, 15, 12, 12, 9, 15, 6, 15, 12, 6, 3, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, + 15, 6, 9, 9, 6, 9, 12, 12, 15, 12, 15, 15, 6, 12, 9, 15, 12, 9, 6, 12, 3, 9, 12, 12, 15, 12, 15, 9, 12, + 12, 15, 15, 6, 9, 12, 6, 3, 6, 9, 9, 6, 9, 12, 6, 3, 9, 6, 12, 3, 6, 3, 3, 0, }; diff --git a/Samples/5_Domain_Specific/nbody/bodysystem.h b/Samples/5_Domain_Specific/nbody/bodysystem.h index 2d11c1ed..9455dc2b 100644 --- a/Samples/5_Domain_Specific/nbody/bodysystem.h +++ b/Samples/5_Domain_Specific/nbody/bodysystem.h @@ -30,257 +30,262 @@ #include -enum NBodyConfig { - NBODY_CONFIG_RANDOM, - NBODY_CONFIG_SHELL, - NBODY_CONFIG_EXPAND, - NBODY_NUM_CONFIGS -}; +enum NBodyConfig { NBODY_CONFIG_RANDOM, NBODY_CONFIG_SHELL, NBODY_CONFIG_EXPAND, NBODY_NUM_CONFIGS }; enum BodyArray { - BODYSYSTEM_POSITION, - BODYSYSTEM_VELOCITY, + BODYSYSTEM_POSITION, + BODYSYSTEM_VELOCITY, }; -template -struct vec3 { - typedef float Type; -}; // dummy -template <> -struct vec3 { - typedef float3 Type; +template struct vec3 +{ + typedef float Type; +}; // dummy +template <> struct vec3 +{ + typedef float3 Type; }; -template <> -struct vec3 { - typedef double3 Type; +template <> struct vec3 +{ + typedef double3 Type; }; -template -struct vec4 { - typedef float Type; -}; // dummy -template <> -struct vec4 { - typedef float4 Type; +template struct vec4 +{ + typedef float Type; +}; // dummy +template <> struct vec4 +{ + typedef float4 Type; }; -template <> -struct vec4 { - typedef double4 Type; +template <> struct vec4 +{ + typedef double4 Type; }; class string; // BodySystem abstract base class -template -class BodySystem { - public: // methods - BodySystem(int numBodies) {} - virtual ~BodySystem() {} +template class BodySystem +{ +public: // methods + BodySystem(int numBodies) {} + virtual ~BodySystem() {} - virtual void loadTipsyFile(const std::string &filename) = 0; + virtual void loadTipsyFile(const std::string &filename) = 0; - virtual void update(T deltaTime) = 0; + virtual void update(T deltaTime) = 0; - virtual void setSoftening(T softening) = 0; - virtual void setDamping(T damping) = 0; + virtual void setSoftening(T softening) = 0; + virtual void setDamping(T damping) = 0; - virtual T *getArray(BodyArray array) = 0; - virtual void setArray(BodyArray array, const T *data) = 0; + virtual T *getArray(BodyArray array) = 0; + virtual void setArray(BodyArray array, const T *data) = 0; - virtual unsigned int getCurrentReadBuffer() const = 0; + virtual unsigned int getCurrentReadBuffer() const = 0; - virtual unsigned int getNumBodies() const = 0; + virtual unsigned int getNumBodies() const = 0; - virtual void synchronizeThreads() const {}; + virtual void synchronizeThreads() const {}; - protected: // methods - BodySystem() {} // default constructor +protected: // methods + BodySystem() {} // default constructor - virtual void _initialize(int numBodies) = 0; - virtual void _finalize() = 0; 
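A brief aside on the vec3/vec4 trait templates reflowed in the bodysystem.h hunk above: they are how the templated body system selects the concrete CUDA built-in vector type for a given scalar precision. The sketch below restates the pattern in self-contained form; the posBufferBytes helper and the values printed are illustrative assumptions, not code from this patch.

#include <cstdio>
#include <cuda_runtime.h>

// Minimal re-statement of the trait pattern used above: the primary template
// is a dummy; explicit specializations map a scalar precision to the matching
// CUDA built-in vector type.
template <typename T> struct vec4 { typedef float Type; };              // dummy
template <> struct vec4<float>  { typedef float4  Type; };
template <> struct vec4<double> { typedef double4 Type; };

// Any templated code can then name the right storage type generically.
// This helper is an assumed example, not part of the patch.
template <typename T> size_t posBufferBytes(size_t numBodies)
{
    return numBodies * sizeof(typename vec4<T>::Type);
}

int main()
{
    printf("float4 buffer for 1024 bodies:  %zu bytes\n", posBufferBytes<float>(1024));
    printf("double4 buffer for 1024 bodies: %zu bytes\n", posBufferBytes<double>(1024));
    return 0;
}

The same trick lets kernels such as integrateBodies later in this patch take typename vec4<T>::Type pointers, so one template body serves both single and double precision.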
+ virtual void _initialize(int numBodies) = 0; + virtual void _finalize() = 0; }; -inline float3 scalevec(float3 &vector, float scalar) { - float3 rt = vector; - rt.x *= scalar; - rt.y *= scalar; - rt.z *= scalar; - return rt; +inline float3 scalevec(float3 &vector, float scalar) +{ + float3 rt = vector; + rt.x *= scalar; + rt.y *= scalar; + rt.z *= scalar; + return rt; } -inline float normalize(float3 &vector) { - float dist = - sqrtf(vector.x * vector.x + vector.y * vector.y + vector.z * vector.z); +inline float normalize(float3 &vector) +{ + float dist = sqrtf(vector.x * vector.x + vector.y * vector.y + vector.z * vector.z); - if (dist > 1e-6) { - vector.x /= dist; - vector.y /= dist; - vector.z /= dist; - } + if (dist > 1e-6) { + vector.x /= dist; + vector.y /= dist; + vector.z /= dist; + } - return dist; + return dist; } -inline float dot(float3 v0, float3 v1) { - return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; -} +inline float dot(float3 v0, float3 v1) { return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; } -inline float3 cross(float3 v0, float3 v1) { - float3 rt; - rt.x = v0.y * v1.z - v0.z * v1.y; - rt.y = v0.z * v1.x - v0.x * v1.z; - rt.z = v0.x * v1.y - v0.y * v1.x; - return rt; +inline float3 cross(float3 v0, float3 v1) +{ + float3 rt; + rt.x = v0.y * v1.z - v0.z * v1.y; + rt.y = v0.z * v1.x - v0.x * v1.z; + rt.z = v0.x * v1.y - v0.y * v1.x; + return rt; } // utility function template -void randomizeBodies(NBodyConfig config, T *pos, T *vel, float *color, - float clusterScale, float velocityScale, int numBodies, - bool vec4vel) { - switch (config) { +void randomizeBodies(NBodyConfig config, + T *pos, + T *vel, + float *color, + float clusterScale, + float velocityScale, + int numBodies, + bool vec4vel) +{ + switch (config) { default: case NBODY_CONFIG_RANDOM: { - float scale = clusterScale * std::max(1.0f, numBodies / (1024.0f)); - float vscale = velocityScale * scale; + float scale = clusterScale * std::max(1.0f, numBodies / (1024.0f)); + float vscale = velocityScale * scale; - int p = 0, v = 0; - int i = 0; + int p = 0, v = 0; + int i = 0; - while (i < numBodies) { - float3 point; - // const int scale = 16; - point.x = rand() / (float)RAND_MAX * 2 - 1; - point.y = rand() / (float)RAND_MAX * 2 - 1; - point.z = rand() / (float)RAND_MAX * 2 - 1; - float lenSqr = dot(point, point); + while (i < numBodies) { + float3 point; + // const int scale = 16; + point.x = rand() / (float)RAND_MAX * 2 - 1; + point.y = rand() / (float)RAND_MAX * 2 - 1; + point.z = rand() / (float)RAND_MAX * 2 - 1; + float lenSqr = dot(point, point); - if (lenSqr > 1) continue; + if (lenSqr > 1) + continue; - float3 velocity; - velocity.x = rand() / (float)RAND_MAX * 2 - 1; - velocity.y = rand() / (float)RAND_MAX * 2 - 1; - velocity.z = rand() / (float)RAND_MAX * 2 - 1; - lenSqr = dot(velocity, velocity); + float3 velocity; + velocity.x = rand() / (float)RAND_MAX * 2 - 1; + velocity.y = rand() / (float)RAND_MAX * 2 - 1; + velocity.z = rand() / (float)RAND_MAX * 2 - 1; + lenSqr = dot(velocity, velocity); - if (lenSqr > 1) continue; + if (lenSqr > 1) + continue; - pos[p++] = point.x * scale; // pos.x - pos[p++] = point.y * scale; // pos.y - pos[p++] = point.z * scale; // pos.z - pos[p++] = 1.0f; // mass + pos[p++] = point.x * scale; // pos.x + pos[p++] = point.y * scale; // pos.y + pos[p++] = point.z * scale; // pos.z + pos[p++] = 1.0f; // mass - vel[v++] = velocity.x * vscale; // pos.x - vel[v++] = velocity.y * vscale; // pos.x - vel[v++] = velocity.z * vscale; // pos.x + vel[v++] = velocity.x * vscale; // pos.x + 
vel[v++] = velocity.y * vscale; // pos.x + vel[v++] = velocity.z * vscale; // pos.x - if (vec4vel) vel[v++] = 1.0f; // inverse mass + if (vec4vel) + vel[v++] = 1.0f; // inverse mass - i++; - } + i++; + } } break; case NBODY_CONFIG_SHELL: { - float scale = clusterScale; - float vscale = scale * velocityScale; - float inner = 2.5f * scale; - float outer = 4.0f * scale; + float scale = clusterScale; + float vscale = scale * velocityScale; + float inner = 2.5f * scale; + float outer = 4.0f * scale; - int p = 0, v = 0; - int i = 0; + int p = 0, v = 0; + int i = 0; - while (i < numBodies) // for(int i=0; i < numBodies; i++) - { - float x, y, z; - x = rand() / (float)RAND_MAX * 2 - 1; - y = rand() / (float)RAND_MAX * 2 - 1; - z = rand() / (float)RAND_MAX * 2 - 1; + while (i < numBodies) // for(int i=0; i < numBodies; i++) + { + float x, y, z; + x = rand() / (float)RAND_MAX * 2 - 1; + y = rand() / (float)RAND_MAX * 2 - 1; + z = rand() / (float)RAND_MAX * 2 - 1; - float3 point = {x, y, z}; - float len = normalize(point); + float3 point = {x, y, z}; + float len = normalize(point); - if (len > 1) continue; + if (len > 1) + continue; - pos[p++] = - point.x * (inner + (outer - inner) * rand() / (float)RAND_MAX); - pos[p++] = - point.y * (inner + (outer - inner) * rand() / (float)RAND_MAX); - pos[p++] = - point.z * (inner + (outer - inner) * rand() / (float)RAND_MAX); - pos[p++] = 1.0f; + pos[p++] = point.x * (inner + (outer - inner) * rand() / (float)RAND_MAX); + pos[p++] = point.y * (inner + (outer - inner) * rand() / (float)RAND_MAX); + pos[p++] = point.z * (inner + (outer - inner) * rand() / (float)RAND_MAX); + pos[p++] = 1.0f; - x = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); - y = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); - z = 1.0f; // * (rand() / (float) RAND_MAX * 2 - 1); - float3 axis = {x, y, z}; - normalize(axis); + x = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); + y = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); + z = 1.0f; // * (rand() / (float) RAND_MAX * 2 - 1); + float3 axis = {x, y, z}; + normalize(axis); - if (1 - dot(point, axis) < 1e-6) { - axis.x = point.y; - axis.y = point.x; - normalize(axis); + if (1 - dot(point, axis) < 1e-6) { + axis.x = point.y; + axis.y = point.x; + normalize(axis); + } + + // if (point.y < 0) axis = scalevec(axis, -1); + float3 vv = {(float)pos[4 * i], (float)pos[4 * i + 1], (float)pos[4 * i + 2]}; + vv = cross(vv, axis); + vel[v++] = vv.x * vscale; + vel[v++] = vv.y * vscale; + vel[v++] = vv.z * vscale; + + if (vec4vel) + vel[v++] = 1.0f; + + i++; } - - // if (point.y < 0) axis = scalevec(axis, -1); - float3 vv = {(float)pos[4 * i], (float)pos[4 * i + 1], - (float)pos[4 * i + 2]}; - vv = cross(vv, axis); - vel[v++] = vv.x * vscale; - vel[v++] = vv.y * vscale; - vel[v++] = vv.z * vscale; - - if (vec4vel) vel[v++] = 1.0f; - - i++; - } } break; case NBODY_CONFIG_EXPAND: { - float scale = clusterScale * numBodies / (1024.f); + float scale = clusterScale * numBodies / (1024.f); - if (scale < 1.0f) scale = clusterScale; + if (scale < 1.0f) + scale = clusterScale; - float vscale = scale * velocityScale; + float vscale = scale * velocityScale; - int p = 0, v = 0; + int p = 0, v = 0; - for (int i = 0; i < numBodies;) { - float3 point; + for (int i = 0; i < numBodies;) { + float3 point; - point.x = rand() / (float)RAND_MAX * 2 - 1; - point.y = rand() / (float)RAND_MAX * 2 - 1; - point.z = rand() / (float)RAND_MAX * 2 - 1; + point.x = rand() / (float)RAND_MAX * 2 - 1; + point.y = rand() / (float)RAND_MAX * 2 - 1; + point.z = rand() / (float)RAND_MAX 
* 2 - 1; - float lenSqr = dot(point, point); + float lenSqr = dot(point, point); - if (lenSqr > 1) continue; + if (lenSqr > 1) + continue; - pos[p++] = point.x * scale; // pos.x - pos[p++] = point.y * scale; // pos.y - pos[p++] = point.z * scale; // pos.z - pos[p++] = 1.0f; // mass - vel[v++] = point.x * vscale; // pos.x - vel[v++] = point.y * vscale; // pos.x - vel[v++] = point.z * vscale; // pos.x + pos[p++] = point.x * scale; // pos.x + pos[p++] = point.y * scale; // pos.y + pos[p++] = point.z * scale; // pos.z + pos[p++] = 1.0f; // mass + vel[v++] = point.x * vscale; // pos.x + vel[v++] = point.y * vscale; // pos.x + vel[v++] = point.z * vscale; // pos.x - if (vec4vel) vel[v++] = 1.0f; // inverse mass + if (vec4vel) + vel[v++] = 1.0f; // inverse mass - i++; - } + i++; + } } break; - } - - if (color) { - int v = 0; - - for (int i = 0; i < numBodies; i++) { - // const int scale = 16; - color[v++] = rand() / (float)RAND_MAX; - color[v++] = rand() / (float)RAND_MAX; - color[v++] = rand() / (float)RAND_MAX; - color[v++] = 1.0f; } - } + + if (color) { + int v = 0; + + for (int i = 0; i < numBodies; i++) { + // const int scale = 16; + color[v++] = rand() / (float)RAND_MAX; + color[v++] = rand() / (float)RAND_MAX; + color[v++] = rand() / (float)RAND_MAX; + color[v++] = 1.0f; + } + } } -#endif // __BODYSYSTEM_H__ +#endif // __BODYSYSTEM_H__ diff --git a/Samples/5_Domain_Specific/nbody/bodysystemcpu.h b/Samples/5_Domain_Specific/nbody/bodysystemcpu.h index 700e385a..11553ffb 100644 --- a/Samples/5_Domain_Specific/nbody/bodysystemcpu.h +++ b/Samples/5_Domain_Specific/nbody/bodysystemcpu.h @@ -31,49 +31,47 @@ #include "bodysystem.h" // CPU Body System -template -class BodySystemCPU : public BodySystem { - public: - BodySystemCPU(int numBodies); - virtual ~BodySystemCPU(); +template class BodySystemCPU : public BodySystem +{ +public: + BodySystemCPU(int numBodies); + virtual ~BodySystemCPU(); - virtual void loadTipsyFile(const std::string &filename); + virtual void loadTipsyFile(const std::string &filename); - virtual void update(T deltaTime); + virtual void update(T deltaTime); - virtual void setSoftening(T softening) { - m_softeningSquared = softening * softening; - } - virtual void setDamping(T damping) { m_damping = damping; } + virtual void setSoftening(T softening) { m_softeningSquared = softening * softening; } + virtual void setDamping(T damping) { m_damping = damping; } - virtual T *getArray(BodyArray array); - virtual void setArray(BodyArray array, const T *data); + virtual T *getArray(BodyArray array); + virtual void setArray(BodyArray array, const T *data); - virtual unsigned int getCurrentReadBuffer() const { return 0; } + virtual unsigned int getCurrentReadBuffer() const { return 0; } - virtual unsigned int getNumBodies() const { return m_numBodies; } + virtual unsigned int getNumBodies() const { return m_numBodies; } - protected: // methods - BodySystemCPU() {} // default constructor +protected: // methods + BodySystemCPU() {} // default constructor - virtual void _initialize(int numBodies); - virtual void _finalize(); + virtual void _initialize(int numBodies); + virtual void _finalize(); - void _computeNBodyGravitation(); - void _integrateNBodySystem(T deltaTime); + void _computeNBodyGravitation(); + void _integrateNBodySystem(T deltaTime); - protected: // data - int m_numBodies; - bool m_bInitialized; +protected: // data + int m_numBodies; + bool m_bInitialized; - T *m_pos; - T *m_vel; - T *m_force; + T *m_pos; + T *m_vel; + T *m_force; - T m_softeningSquared; - T m_damping; + T 
m_softeningSquared; + T m_damping; }; #include "bodysystemcpu_impl.h" -#endif // __BODYSYSTEMCPU_H__ +#endif // __BODYSYSTEMCPU_H__ diff --git a/Samples/5_Domain_Specific/nbody/bodysystemcpu_impl.h b/Samples/5_Domain_Specific/nbody/bodysystemcpu_impl.h index 14130064..bf0c7437 100644 --- a/Samples/5_Domain_Specific/nbody/bodysystemcpu_impl.h +++ b/Samples/5_Domain_Specific/nbody/bodysystemcpu_impl.h @@ -25,15 +25,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "bodysystemcpu.h" - -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include + +#include "bodysystemcpu.h" #include "tipsy.h" #ifdef OPENMP @@ -42,239 +42,229 @@ template BodySystemCPU::BodySystemCPU(int numBodies) - : m_numBodies(numBodies), - m_bInitialized(false), - m_force(0), - m_softeningSquared(.00125f), - m_damping(0.995f) { - m_pos = 0; - m_vel = 0; + : m_numBodies(numBodies) + , m_bInitialized(false) + , m_force(0) + , m_softeningSquared(.00125f) + , m_damping(0.995f) +{ + m_pos = 0; + m_vel = 0; - _initialize(numBodies); + _initialize(numBodies); } -template -BodySystemCPU::~BodySystemCPU() { - _finalize(); - m_numBodies = 0; +template BodySystemCPU::~BodySystemCPU() +{ + _finalize(); + m_numBodies = 0; } -template -void BodySystemCPU::_initialize(int numBodies) { - assert(!m_bInitialized); +template void BodySystemCPU::_initialize(int numBodies) +{ + assert(!m_bInitialized); - m_numBodies = numBodies; + m_numBodies = numBodies; - m_pos = new T[m_numBodies * 4]; - m_vel = new T[m_numBodies * 4]; - m_force = new T[m_numBodies * 3]; + m_pos = new T[m_numBodies * 4]; + m_vel = new T[m_numBodies * 4]; + m_force = new T[m_numBodies * 3]; - memset(m_pos, 0, m_numBodies * 4 * sizeof(T)); - memset(m_vel, 0, m_numBodies * 4 * sizeof(T)); - memset(m_force, 0, m_numBodies * 3 * sizeof(T)); + memset(m_pos, 0, m_numBodies * 4 * sizeof(T)); + memset(m_vel, 0, m_numBodies * 4 * sizeof(T)); + memset(m_force, 0, m_numBodies * 3 * sizeof(T)); - m_bInitialized = true; + m_bInitialized = true; } -template -void BodySystemCPU::_finalize() { - assert(m_bInitialized); +template void BodySystemCPU::_finalize() +{ + assert(m_bInitialized); - delete[] m_pos; - delete[] m_vel; - delete[] m_force; + delete[] m_pos; + delete[] m_vel; + delete[] m_force; - m_bInitialized = false; + m_bInitialized = false; } -template -void BodySystemCPU::loadTipsyFile(const std::string &filename) { - if (m_bInitialized) _finalize(); +template void BodySystemCPU::loadTipsyFile(const std::string &filename) +{ + if (m_bInitialized) + _finalize(); - vector::Type> positions; - vector::Type> velocities; - vector ids; + vector::Type> positions; + vector::Type> velocities; + vector ids; - int nBodies = 0; - int nFirst = 0, nSecond = 0, nThird = 0; + int nBodies = 0; + int nFirst = 0, nSecond = 0, nThird = 0; - read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, - nSecond, nThird); + read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, nSecond, nThird); - _initialize(nBodies); + _initialize(nBodies); - memcpy(m_pos, &positions[0], sizeof(vec4) * nBodies); - memcpy(m_vel, &velocities[0], sizeof(vec4) * nBodies); + memcpy(m_pos, &positions[0], sizeof(vec4) * nBodies); + memcpy(m_vel, &velocities[0], sizeof(vec4) * nBodies); } -template -void BodySystemCPU::update(T deltaTime) { - assert(m_bInitialized); +template void BodySystemCPU::update(T deltaTime) +{ + assert(m_bInitialized); - _integrateNBodySystem(deltaTime); + 
_integrateNBodySystem(deltaTime); - // std::swap(m_currentRead, m_currentWrite); + // std::swap(m_currentRead, m_currentWrite); } -template -T *BodySystemCPU::getArray(BodyArray array) { - assert(m_bInitialized); +template T *BodySystemCPU::getArray(BodyArray array) +{ + assert(m_bInitialized); - T *data = 0; + T *data = 0; - switch (array) { + switch (array) { default: case BODYSYSTEM_POSITION: - data = m_pos; - break; + data = m_pos; + break; case BODYSYSTEM_VELOCITY: - data = m_vel; - break; - } - - return data; -} - -template -void BodySystemCPU::setArray(BodyArray array, const T *data) { - assert(m_bInitialized); - - T *target = 0; - - switch (array) { - default: - case BODYSYSTEM_POSITION: - target = m_pos; - break; - - case BODYSYSTEM_VELOCITY: - target = m_vel; - break; - } - - memcpy(target, data, m_numBodies * 4 * sizeof(T)); -} - -template -T sqrt_T(T x) { - return sqrt(x); -} - -template <> -float sqrt_T(float x) { - return sqrtf(x); -} - -template -void bodyBodyInteraction(T accel[3], T posMass0[4], T posMass1[4], - T softeningSquared) { - T r[3]; - - // r_01 [3 FLOPS] - r[0] = posMass1[0] - posMass0[0]; - r[1] = posMass1[1] - posMass0[1]; - r[2] = posMass1[2] - posMass0[2]; - - // d^2 + e^2 [6 FLOPS] - T distSqr = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; - distSqr += softeningSquared; - - // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] - T invDist = (T)1.0 / (T)sqrt((double)distSqr); - T invDistCube = invDist * invDist * invDist; - - // s = m_j * invDistCube [1 FLOP] - T s = posMass1[3] * invDistCube; - - // (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS] - accel[0] += r[0] * s; - accel[1] += r[1] * s; - accel[2] += r[2] * s; -} - -template -void BodySystemCPU::_computeNBodyGravitation() { -#ifdef OPENMP -#pragma omp parallel for -#endif - - for (int i = 0; i < m_numBodies; i++) { - int indexForce = 3 * i; - - T acc[3] = {0, 0, 0}; - - // We unroll this loop 4X for a small performance boost. 
- int j = 0; - - while (j < m_numBodies) { - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; + data = m_vel; + break; } - m_force[indexForce] = acc[0]; - m_force[indexForce + 1] = acc[1]; - m_force[indexForce + 2] = acc[2]; - } + return data; } -template -void BodySystemCPU::_integrateNBodySystem(T deltaTime) { - _computeNBodyGravitation(); +template void BodySystemCPU::setArray(BodyArray array, const T *data) +{ + assert(m_bInitialized); + + T *target = 0; + + switch (array) { + default: + case BODYSYSTEM_POSITION: + target = m_pos; + break; + + case BODYSYSTEM_VELOCITY: + target = m_vel; + break; + } + + memcpy(target, data, m_numBodies * 4 * sizeof(T)); +} + +template T sqrt_T(T x) { return sqrt(x); } + +template <> float sqrt_T(float x) { return sqrtf(x); } + +template void bodyBodyInteraction(T accel[3], T posMass0[4], T posMass1[4], T softeningSquared) +{ + T r[3]; + + // r_01 [3 FLOPS] + r[0] = posMass1[0] - posMass0[0]; + r[1] = posMass1[1] - posMass0[1]; + r[2] = posMass1[2] - posMass0[2]; + + // d^2 + e^2 [6 FLOPS] + T distSqr = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; + distSqr += softeningSquared; + + // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] + T invDist = (T)1.0 / (T)sqrt((double)distSqr); + T invDistCube = invDist * invDist * invDist; + + // s = m_j * invDistCube [1 FLOP] + T s = posMass1[3] * invDistCube; + + // (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS] + accel[0] += r[0] * s; + accel[1] += r[1] * s; + accel[2] += r[2] * s; +} + +template void BodySystemCPU::_computeNBodyGravitation() +{ +#ifdef OPENMP +#pragma omp parallel for +#endif + + for (int i = 0; i < m_numBodies; i++) { + int indexForce = 3 * i; + + T acc[3] = {0, 0, 0}; + + // We unroll this loop 4X for a small performance boost. 
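+ // NB: with the bounds check only at the top of the loop, this 4x unroll
+ // assumes numBodies is a multiple of 4; a trailing partial group would
+ // index past the last body in m_pos.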
+ int j = 0; + + while (j < m_numBodies) { + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + } + + m_force[indexForce] = acc[0]; + m_force[indexForce + 1] = acc[1]; + m_force[indexForce + 2] = acc[2]; + } +} + +template void BodySystemCPU::_integrateNBodySystem(T deltaTime) +{ + _computeNBodyGravitation(); #ifdef OPENMP #pragma omp parallel for #endif - for (int i = 0; i < m_numBodies; ++i) { - int index = 4 * i; - int indexForce = 3 * i; + for (int i = 0; i < m_numBodies; ++i) { + int index = 4 * i; + int indexForce = 3 * i; - T pos[3], vel[3], force[3]; - pos[0] = m_pos[index + 0]; - pos[1] = m_pos[index + 1]; - pos[2] = m_pos[index + 2]; - T invMass = m_pos[index + 3]; + T pos[3], vel[3], force[3]; + pos[0] = m_pos[index + 0]; + pos[1] = m_pos[index + 1]; + pos[2] = m_pos[index + 2]; + T invMass = m_pos[index + 3]; - vel[0] = m_vel[index + 0]; - vel[1] = m_vel[index + 1]; - vel[2] = m_vel[index + 2]; + vel[0] = m_vel[index + 0]; + vel[1] = m_vel[index + 1]; + vel[2] = m_vel[index + 2]; - force[0] = m_force[indexForce + 0]; - force[1] = m_force[indexForce + 1]; - force[2] = m_force[indexForce + 2]; + force[0] = m_force[indexForce + 0]; + force[1] = m_force[indexForce + 1]; + force[2] = m_force[indexForce + 2]; - // acceleration = force / mass; - // new velocity = old velocity + acceleration * deltaTime - vel[0] += (force[0] * invMass) * deltaTime; - vel[1] += (force[1] * invMass) * deltaTime; - vel[2] += (force[2] * invMass) * deltaTime; + // acceleration = force / mass; + // new velocity = old velocity + acceleration * deltaTime + vel[0] += (force[0] * invMass) * deltaTime; + vel[1] += (force[1] * invMass) * deltaTime; + vel[2] += (force[2] * invMass) * deltaTime; - vel[0] *= m_damping; - vel[1] *= m_damping; - vel[2] *= m_damping; + vel[0] *= m_damping; + vel[1] *= m_damping; + vel[2] *= m_damping; - // new position = old position + velocity * deltaTime - pos[0] += vel[0] * deltaTime; - pos[1] += vel[1] * deltaTime; - pos[2] += vel[2] * deltaTime; + // new position = old position + velocity * deltaTime + pos[0] += vel[0] * deltaTime; + pos[1] += vel[1] * deltaTime; + pos[2] += vel[2] * deltaTime; - m_pos[index + 0] = pos[0]; - m_pos[index + 1] = pos[1]; - m_pos[index + 2] = pos[2]; + m_pos[index + 0] = pos[0]; + m_pos[index + 1] = pos[1]; + m_pos[index + 2] = pos[2]; - m_vel[index + 0] = vel[0]; - m_vel[index + 1] = vel[1]; - m_vel[index + 2] = vel[2]; - } + m_vel[index + 0] = vel[0]; + m_vel[index + 1] = vel[1]; + m_vel[index + 2] = vel[2]; + } } diff --git a/Samples/5_Domain_Specific/nbody/bodysystemcuda.cu b/Samples/5_Domain_Specific/nbody/bodysystemcuda.cu index cb949786..0a0d9760 100644 --- a/Samples/5_Domain_Specific/nbody/bodysystemcuda.cu +++ b/Samples/5_Domain_Specific/nbody/bodysystemcuda.cu @@ -36,251 +36,247 @@ #endif // CUDA standard includes -#include -#include - #include +#include +#include namespace cg = cooperative_groups; #include "bodysystem.h" -__constant__ float softeningSquared; +__constant__ float softeningSquared; __constant__ double softeningSquared_fp64; -cudaError_t setSofteningSquared(float softeningSq) { - return cudaMemcpyToSymbol(softeningSquared, &softeningSq, sizeof(float), 0, - cudaMemcpyHostToDevice); +cudaError_t setSofteningSquared(float softeningSq) +{ + return 
cudaMemcpyToSymbol(softeningSquared, &softeningSq, sizeof(float), 0, cudaMemcpyHostToDevice); } -cudaError_t setSofteningSquared(double softeningSq) { - return cudaMemcpyToSymbol(softeningSquared_fp64, &softeningSq, sizeof(double), - 0, cudaMemcpyHostToDevice); +cudaError_t setSofteningSquared(double softeningSq) +{ + return cudaMemcpyToSymbol(softeningSquared_fp64, &softeningSq, sizeof(double), 0, cudaMemcpyHostToDevice); } -template -struct SharedMemory { - __device__ inline operator T *() { - extern __shared__ int __smem[]; - return (T *)__smem; - } +template struct SharedMemory +{ + __device__ inline operator T *() + { + extern __shared__ int __smem[]; + return (T *)__smem; + } - __device__ inline operator const T *() const { - extern __shared__ int __smem[]; - return (T *)__smem; - } + __device__ inline operator const T *() const + { + extern __shared__ int __smem[]; + return (T *)__smem; + } }; -template -__device__ T rsqrt_T(T x) { - return rsqrt(x); -} +template __device__ T rsqrt_T(T x) { return rsqrt(x); } -template <> -__device__ float rsqrt_T(float x) { - return rsqrtf(x); -} +template <> __device__ float rsqrt_T(float x) { return rsqrtf(x); } -template <> -__device__ double rsqrt_T(double x) { - return rsqrt(x); -} +template <> __device__ double rsqrt_T(double x) { return rsqrt(x); } // Macros to simplify shared memory addressing #define SX(i) sharedPos[i + blockDim.x * threadIdx.y] // This macro is only used when multithreadBodies is true (below) #define SX_SUM(i, j) sharedPos[i + blockDim.x * j] -template -__device__ T getSofteningSquared() { - return softeningSquared; -} -template <> -__device__ double getSofteningSquared() { - return softeningSquared_fp64; -} +template __device__ T getSofteningSquared() { return softeningSquared; } +template <> __device__ double getSofteningSquared() { return softeningSquared_fp64; } -template -struct DeviceData { - T *dPos[2]; // mapped host pointers - T *dVel; - cudaEvent_t event; - unsigned int offset; - unsigned int numBodies; +template struct DeviceData +{ + T *dPos[2]; // mapped host pointers + T *dVel; + cudaEvent_t event; + unsigned int offset; + unsigned int numBodies; }; template -__device__ typename vec3::Type bodyBodyInteraction( - typename vec3::Type ai, typename vec4::Type bi, - typename vec4::Type bj) { - typename vec3::Type r; +__device__ typename vec3::Type +bodyBodyInteraction(typename vec3::Type ai, typename vec4::Type bi, typename vec4::Type bj) +{ + typename vec3::Type r; - // r_ij [3 FLOPS] - r.x = bj.x - bi.x; - r.y = bj.y - bi.y; - r.z = bj.z - bi.z; + // r_ij [3 FLOPS] + r.x = bj.x - bi.x; + r.y = bj.y - bi.y; + r.z = bj.z - bi.z; - // distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS] - T distSqr = r.x * r.x + r.y * r.y + r.z * r.z; - distSqr += getSofteningSquared(); + // distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS] + T distSqr = r.x * r.x + r.y * r.y + r.z * r.z; + distSqr += getSofteningSquared(); - // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] - T invDist = rsqrt_T(distSqr); - T invDistCube = invDist * invDist * invDist; + // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] + T invDist = rsqrt_T(distSqr); + T invDistCube = invDist * invDist * invDist; - // s = m_j * invDistCube [1 FLOP] - T s = bj.w * invDistCube; + // s = m_j * invDistCube [1 FLOP] + T s = bj.w * invDistCube; - // a_i = a_i + s * r_ij [6 FLOPS] - ai.x += r.x * s; - ai.y += r.y * s; - ai.z += r.z * s; + // a_i = a_i + s * r_ij [6 FLOPS] + ai.x += r.x * s; + ai.y += r.y * s; + ai.z += r.z * s; - return ai; + return ai; } 
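The SharedMemory helper reflowed above exists because a dynamically sized extern __shared__ array may be declared under only one name and type per kernel; the templated conversion operator lets every instantiation alias that single allocation as T[]. Below is a self-contained sketch of the pattern in use; the reverseBlock kernel and its launch are assumptions for illustration, not part of the nbody sample.

#include <cstdio>
#include <cuda_runtime.h>

// Same trick as the SharedMemory struct above: one untyped dynamic
// shared-memory allocation, viewed as T[] through a conversion operator.
template <typename T> struct SharedMemory
{
    __device__ inline operator T *()
    {
        extern __shared__ int __smem[];
        return (T *)__smem;
    }
};

// Illustrative kernel: stage a block of data in shared memory, then write
// it back reversed.
template <typename T> __global__ void reverseBlock(T *data)
{
    T *tile = SharedMemory<T>(); // aliases the dynamic allocation as T[]

    tile[threadIdx.x] = data[threadIdx.x];
    __syncthreads();
    data[threadIdx.x] = tile[blockDim.x - 1 - threadIdx.x];
}

int main()
{
    const int n = 256;
    float    *d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemset(d, 0, n * sizeof(float));

    // The third launch parameter is the dynamic shared-memory size in bytes;
    // it must cover what SharedMemory<float> will alias (one float per thread).
    reverseBlock<float><<<1, n, n * sizeof(float)>>>(d);
    cudaDeviceSynchronize();

    cudaFree(d);
    return 0;
}

The nbody kernels use the identical mechanism to cast the allocation to typename vec4<T>::Type, which is why the launch sites size the dynamic allocation as blockSize * 4 * sizeof(T).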
template -__device__ typename vec3::Type computeBodyAccel( - typename vec4::Type bodyPos, typename vec4::Type *positions, - int numTiles, cg::thread_block cta) { - typename vec4::Type *sharedPos = SharedMemory::Type>(); +__device__ typename vec3::Type +computeBodyAccel(typename vec4::Type bodyPos, typename vec4::Type *positions, int numTiles, cg::thread_block cta) +{ + typename vec4::Type *sharedPos = SharedMemory::Type>(); - typename vec3::Type acc = {0.0f, 0.0f, 0.0f}; + typename vec3::Type acc = {0.0f, 0.0f, 0.0f}; - for (int tile = 0; tile < numTiles; tile++) { - sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x]; + for (int tile = 0; tile < numTiles; tile++) { + sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x]; - cg::sync(cta); + cg::sync(cta); // This is the "tile_calculation" from the GPUG3 article. #pragma unroll 128 - for (unsigned int counter = 0; counter < blockDim.x; counter++) { - acc = bodyBodyInteraction(acc, bodyPos, sharedPos[counter]); + for (unsigned int counter = 0; counter < blockDim.x; counter++) { + acc = bodyBodyInteraction(acc, bodyPos, sharedPos[counter]); + } + + cg::sync(cta); } - cg::sync(cta); - } - - return acc; + return acc; } template __global__ void integrateBodies(typename vec4::Type *__restrict__ newPos, typename vec4::Type *__restrict__ oldPos, typename vec4::Type *vel, - unsigned int deviceOffset, - unsigned int deviceNumBodies, float deltaTime, - float damping, int numTiles) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - int index = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int deviceOffset, + unsigned int deviceNumBodies, + float deltaTime, + float damping, + int numTiles) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= deviceNumBodies) { - return; - } + if (index >= deviceNumBodies) { + return; + } - typename vec4::Type position = oldPos[deviceOffset + index]; + typename vec4::Type position = oldPos[deviceOffset + index]; - typename vec3::Type accel = - computeBodyAccel(position, oldPos, numTiles, cta); + typename vec3::Type accel = computeBodyAccel(position, oldPos, numTiles, cta); - // acceleration = force / mass; - // new velocity = old velocity + acceleration * deltaTime - // note we factor out the body's mass from the equation, here and in - // bodyBodyInteraction - // (because they cancel out). Thus here force == acceleration - typename vec4::Type velocity = vel[deviceOffset + index]; + // acceleration = force / mass; + // new velocity = old velocity + acceleration * deltaTime + // note we factor out the body's mass from the equation, here and in + // bodyBodyInteraction + // (because they cancel out). 
Thus here force == acceleration
+    typename vec4<T>::Type velocity = vel[deviceOffset + index];
 
-  velocity.x += accel.x * deltaTime;
-  velocity.y += accel.y * deltaTime;
-  velocity.z += accel.z * deltaTime;
+    velocity.x += accel.x * deltaTime;
+    velocity.y += accel.y * deltaTime;
+    velocity.z += accel.z * deltaTime;
 
-  velocity.x *= damping;
-  velocity.y *= damping;
-  velocity.z *= damping;
+    velocity.x *= damping;
+    velocity.y *= damping;
+    velocity.z *= damping;
 
-  // new position = old position + velocity * deltaTime
-  position.x += velocity.x * deltaTime;
-  position.y += velocity.y * deltaTime;
-  position.z += velocity.z * deltaTime;
+    // new position = old position + velocity * deltaTime
+    position.x += velocity.x * deltaTime;
+    position.y += velocity.y * deltaTime;
+    position.z += velocity.z * deltaTime;
 
-  // store new position and velocity
-  newPos[deviceOffset + index] = position;
-  vel[deviceOffset + index] = velocity;
+    // store new position and velocity
+    newPos[deviceOffset + index] = position;
+    vel[deviceOffset + index] = velocity;
 }
 
 template <typename T>
-void integrateNbodySystem(DeviceData<T> *deviceData,
+void integrateNbodySystem(DeviceData<T> *deviceData,
                           cudaGraphicsResource **pgres,
-                          unsigned int currentRead, float deltaTime,
-                          float damping, unsigned int numBodies,
-                          unsigned int numDevices, int blockSize,
-                          bool bUsePBO) {
-  if (bUsePBO) {
-    checkCudaErrors(cudaGraphicsResourceSetMapFlags(
-        pgres[currentRead], cudaGraphicsMapFlagsReadOnly));
-    checkCudaErrors(cudaGraphicsResourceSetMapFlags(
-        pgres[1 - currentRead], cudaGraphicsMapFlagsWriteDiscard));
-    checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0));
-    size_t bytes;
-    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
-        (void **)&(deviceData[0].dPos[currentRead]), &bytes,
-        pgres[currentRead]));
-    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
-        (void **)&(deviceData[0].dPos[1 - currentRead]), &bytes,
-        pgres[1 - currentRead]));
-  }
-
-  for (unsigned int dev = 0; dev != numDevices; dev++) {
-    if (numDevices > 1) {
-      cudaSetDevice(dev);
+                          unsigned int currentRead,
+                          float deltaTime,
+                          float damping,
+                          unsigned int numBodies,
+                          unsigned int numDevices,
+                          int blockSize,
+                          bool bUsePBO)
+{
+    if (bUsePBO) {
+        checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres[currentRead], cudaGraphicsMapFlagsReadOnly));
+        checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres[1 - currentRead], cudaGraphicsMapFlagsWriteDiscard));
+        checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0));
+        size_t bytes;
+        checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
+            (void **)&(deviceData[0].dPos[currentRead]), &bytes, pgres[currentRead]));
+        checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
+            (void **)&(deviceData[0].dPos[1 - currentRead]), &bytes, pgres[1 - currentRead]));
     }
 
-  int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize;
-  int numTiles = (numBodies + blockSize - 1) / blockSize;
-  int sharedMemSize = blockSize * 4 * sizeof(T);  // 4 floats for pos
+    for (unsigned int dev = 0; dev != numDevices; dev++) {
+        if (numDevices > 1) {
+            cudaSetDevice(dev);
+        }
 
-    integrateBodies<T><<<numBlocks, blockSize, sharedMemSize>>>(
-        (typename vec4<T>::Type *)deviceData[dev].dPos[1 - currentRead],
-        (typename vec4<T>::Type *)deviceData[dev].dPos[currentRead],
-        (typename vec4<T>::Type *)deviceData[dev].dVel, deviceData[dev].offset,
-        deviceData[dev].numBodies, deltaTime, damping, numTiles);
+        int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize;
+        int numTiles = (numBodies + blockSize - 1) / blockSize;
+        int sharedMemSize = blockSize * 4 * sizeof(T); // 4 floats for pos
+
+        integrateBodies<T>
+            <<<numBlocks, blockSize, sharedMemSize>>>((typename vec4<T>::Type *)deviceData[dev].dPos[1 - currentRead],
+                                                      (typename vec4<T>::Type *)deviceData[dev].dPos[currentRead],
+                                                      (typename vec4<T>::Type *)deviceData[dev].dVel,
+                                                      deviceData[dev].offset,
+                                                      deviceData[dev].numBodies,
+                                                      deltaTime,
+                                                      damping,
+                                                      numTiles);
+
+        if (numDevices > 1) {
+            checkCudaErrors(cudaEventRecord(deviceData[dev].event));
+            // MJH: Hack on older driver versions to force kernel launches to flush!
+            cudaStreamQuery(0);
+        }
+
+        // check if kernel invocation generated an error
+        getLastCudaError("Kernel execution failed");
+    }
 
     if (numDevices > 1) {
-      checkCudaErrors(cudaEventRecord(deviceData[dev].event));
-      // MJH: Hack on older driver versions to force kernel launches to flush!
-      cudaStreamQuery(0);
+        for (unsigned int dev = 0; dev < numDevices; dev++) {
+            checkCudaErrors(cudaEventSynchronize(deviceData[dev].event));
+        }
     }
 
-    // check if kernel invocation generated an error
-    getLastCudaError("Kernel execution failed");
-  }
-
-  if (numDevices > 1) {
-    for (unsigned int dev = 0; dev < numDevices; dev++) {
-      checkCudaErrors(cudaEventSynchronize(deviceData[dev].event));
+    if (bUsePBO) {
+        checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0));
     }
-  }
-
-  if (bUsePBO) {
-    checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0));
-  }
 }
 
 // Explicit specializations needed to generate code
-template void integrateNbodySystem<float>(DeviceData<float> *deviceData,
+template void integrateNbodySystem<float>(DeviceData<float> *deviceData,
                                           cudaGraphicsResource **pgres,
-                                          unsigned int currentRead,
-                                          float deltaTime, float damping,
-                                          unsigned int numBodies,
-                                          unsigned int numDevices,
-                                          int blockSize, bool bUsePBO);
+                                          unsigned int currentRead,
+                                          float deltaTime,
+                                          float damping,
+                                          unsigned int numBodies,
+                                          unsigned int numDevices,
+                                          int blockSize,
+                                          bool bUsePBO);
 
-template void integrateNbodySystem<double>(DeviceData<double> *deviceData,
+template void integrateNbodySystem<double>(DeviceData<double> *deviceData,
                                            cudaGraphicsResource **pgres,
-                                           unsigned int currentRead,
-                                           float deltaTime, float damping,
-                                           unsigned int numBodies,
-                                           unsigned int numDevices,
-                                           int blockSize, bool bUsePBO);
+                                           unsigned int currentRead,
+                                           float deltaTime,
+                                           float damping,
+                                           unsigned int numBodies,
+                                           unsigned int numDevices,
+                                           int blockSize,
+                                           bool bUsePBO);
diff --git a/Samples/5_Domain_Specific/nbody/bodysystemcuda.h b/Samples/5_Domain_Specific/nbody/bodysystemcuda.h
index 70e5c7ea..a135e177 100644
--- a/Samples/5_Domain_Specific/nbody/bodysystemcuda.h
+++ b/Samples/5_Domain_Specific/nbody/bodysystemcuda.h
@@ -30,73 +30,75 @@
 
 #include "bodysystem.h"
 
-template <typename T>
-struct DeviceData {
-  T *dPos[2];  // mapped host pointers
-  T *dVel;
-  cudaEvent_t event;
-  unsigned int offset;
-  unsigned int numBodies;
+template <typename T> struct DeviceData
+{
+    T *dPos[2]; // mapped host pointers
+    T *dVel;
+    cudaEvent_t event;
+    unsigned int offset;
+    unsigned int numBodies;
 };
 
 // CUDA BodySystem: runs on the GPU
-template <typename T>
-class BodySystemCUDA : public BodySystem<T> {
- public:
-  BodySystemCUDA(unsigned int numBodies, unsigned int numDevices,
-                 unsigned int blockSize, bool usePBO, bool useSysMem = false,
-                 bool useP2P = true, int deviceId = 0);
-  virtual ~BodySystemCUDA();
+template <typename T> class BodySystemCUDA : public BodySystem<T>
+{
+public:
+    BodySystemCUDA(unsigned int numBodies,
+                   unsigned int numDevices,
+                   unsigned int blockSize,
+                   bool usePBO,
+                   bool useSysMem = false,
+                   bool useP2P = true,
+                   int deviceId = 0);
+    virtual ~BodySystemCUDA();
 
-  virtual void loadTipsyFile(const std::string &filename);
+    virtual void loadTipsyFile(const std::string &filename);
 
-  virtual void update(T deltaTime);
+    virtual void update(T deltaTime);
 
-  virtual void setSoftening(T softening);
-  virtual void setDamping(T damping);
+    virtual void setSoftening(T softening);
+    virtual void setDamping(T damping);
 
-  virtual T *getArray(BodyArray array);
-  virtual void setArray(BodyArray array, const T *data);
+    virtual T *getArray(BodyArray array);
+    virtual void setArray(BodyArray array, const T *data);
 
-  virtual unsigned int getCurrentReadBuffer() const {
-    return m_pbo[m_currentRead];
-  }
+    virtual unsigned int getCurrentReadBuffer() const { return m_pbo[m_currentRead]; }
 
-  virtual unsigned int getNumBodies() const { return m_numBodies; }
+    virtual unsigned int getNumBodies() const { return m_numBodies; }
 
- protected:  // methods
-  BodySystemCUDA() {}
+protected: // methods
+    BodySystemCUDA() {}
 
-  virtual void _initialize(int numBodies);
-  virtual void _finalize();
+    virtual void _initialize(int numBodies);
+    virtual void _finalize();
 
- protected:  // data
-  unsigned int m_numBodies;
-  unsigned int m_numDevices;
-  bool m_bInitialized;
-  int m_devID;
+protected: // data
+    unsigned int m_numBodies;
+    unsigned int m_numDevices;
+    bool m_bInitialized;
+    int m_devID;
 
-  // Host data
-  T *m_hPos[2];
-  T *m_hVel;
+    // Host data
+    T *m_hPos[2];
+    T *m_hVel;
 
-  DeviceData<T> *m_deviceData;
+    DeviceData<T> *m_deviceData;
 
-  bool m_bUsePBO;
-  bool m_bUseSysMem;
-  bool m_bUseP2P;
-  unsigned int m_SMVersion;
+    bool m_bUsePBO;
+    bool m_bUseSysMem;
+    bool m_bUseP2P;
+    unsigned int m_SMVersion;
 
-  T m_damping;
+    T m_damping;
 
-  unsigned int m_pbo[2];
-  cudaGraphicsResource *m_pGRes[2];
-  unsigned int m_currentRead;
-  unsigned int m_currentWrite;
+    unsigned int m_pbo[2];
+    cudaGraphicsResource *m_pGRes[2];
+    unsigned int m_currentRead;
+    unsigned int m_currentWrite;
 
-  unsigned int m_blockSize;
+    unsigned int m_blockSize;
 };
 
 #include "bodysystemcuda_impl.h"
 
-#endif  // __BODYSYSTEMCUDA_H__
+#endif // __BODYSYSTEMCUDA_H__
diff --git a/Samples/5_Domain_Specific/nbody/bodysystemcuda_impl.h b/Samples/5_Domain_Specific/nbody/bodysystemcuda_impl.h
index 239b018c..dc1b8902 100644
--- a/Samples/5_Domain_Specific/nbody/bodysystemcuda_impl.h
+++ b/Samples/5_Domain_Specific/nbody/bodysystemcuda_impl.h
@@ -25,24 +25,26 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ -#include - +#include #include -#include -#include #include #include -#include -#include - #include +#include +#include +#include +#include template -void integrateNbodySystem(DeviceData *deviceData, +void integrateNbodySystem(DeviceData *deviceData, cudaGraphicsResource **pgres, - unsigned int currentRead, float deltaTime, - float damping, unsigned int numBodies, - unsigned int numDevices, int blockSize, bool bUsePBO); + unsigned int currentRead, + float deltaTime, + float damping, + unsigned int numBodies, + unsigned int numDevices, + int blockSize, + bool bUsePBO); cudaError_t setSofteningSquared(float softeningSq); cudaError_t setSofteningSquared(double softeningSq); @@ -50,365 +52,367 @@ cudaError_t setSofteningSquared(double softeningSq); template BodySystemCUDA::BodySystemCUDA(unsigned int numBodies, unsigned int numDevices, - unsigned int blockSize, bool usePBO, - bool useSysMem, bool useP2P, int deviceId) - : m_numBodies(numBodies), - m_numDevices(numDevices), - m_bInitialized(false), - m_bUsePBO(usePBO), - m_bUseSysMem(useSysMem), - m_bUseP2P(useP2P), - m_currentRead(0), - m_currentWrite(1), - m_blockSize(blockSize), - m_devID(deviceId) { - m_hPos[0] = m_hPos[1] = 0; - m_hVel = 0; + unsigned int blockSize, + bool usePBO, + bool useSysMem, + bool useP2P, + int deviceId) + : m_numBodies(numBodies) + , m_numDevices(numDevices) + , m_bInitialized(false) + , m_bUsePBO(usePBO) + , m_bUseSysMem(useSysMem) + , m_bUseP2P(useP2P) + , m_currentRead(0) + , m_currentWrite(1) + , m_blockSize(blockSize) + , m_devID(deviceId) +{ + m_hPos[0] = m_hPos[1] = 0; + m_hVel = 0; - m_deviceData = 0; + m_deviceData = 0; - _initialize(numBodies); - setSoftening(0.00125f); - setDamping(0.995f); + _initialize(numBodies); + setSoftening(0.00125f); + setDamping(0.995f); } -template -BodySystemCUDA::~BodySystemCUDA() { - _finalize(); - m_numBodies = 0; +template BodySystemCUDA::~BodySystemCUDA() +{ + _finalize(); + m_numBodies = 0; } -template -void BodySystemCUDA::_initialize(int numBodies) { - assert(!m_bInitialized); +template void BodySystemCUDA::_initialize(int numBodies) +{ + assert(!m_bInitialized); - m_numBodies = numBodies; + m_numBodies = numBodies; - unsigned int memSize = sizeof(T) * 4 * numBodies; + unsigned int memSize = sizeof(T) * 4 * numBodies; - m_deviceData = new DeviceData[m_numDevices]; + m_deviceData = new DeviceData[m_numDevices]; - // divide up the workload amongst Devices - float *weights = new float[m_numDevices]; - int *numSms = new int[m_numDevices]; - float total = 0; - - for (unsigned int i = 0; i < m_numDevices; i++) { - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, i)); - - // Choose the weight based on the Compute Capability - // We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for - // this application (since a 15-SM GF100 is about 2X faster than a 30-SM - // GT200). - numSms[i] = props.multiProcessorCount; - weights[i] = numSms[i] * (props.major >= 2 ? 
4.f : 1.f); - total += weights[i]; - } - - unsigned int offset = 0; - unsigned int remaining = m_numBodies; - - for (unsigned int i = 0; i < m_numDevices; i++) { - unsigned int count = (int)((weights[i] / total) * m_numBodies); - // Rounding up to numSms[i]*256 leads to better GPU utilization _per_ GPU - // but when using multiple devices, it will lead to the last GPUs not having - // any work at all - // which means worse overall performance - // unsigned int round = numSms[i] * 256; - unsigned int round = 256; - - count = round * ((count + round - 1) / round); - if (count > remaining) { - count = remaining; - } - - remaining -= count; - m_deviceData[i].offset = offset; - m_deviceData[i].numBodies = count; - offset += count; - - if ((i == m_numDevices - 1) && (offset < m_numBodies - 1)) { - m_deviceData[i].numBodies += m_numBodies - offset; - } - } - - delete[] weights; - delete[] numSms; - - if (m_bUseSysMem) { - checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize, - cudaHostAllocMapped | cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize, - cudaHostAllocMapped | cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc((void **)&m_hVel, memSize, - cudaHostAllocMapped | cudaHostAllocPortable)); - - memset(m_hPos[0], 0, memSize); - memset(m_hPos[1], 0, memSize); - memset(m_hVel, 0, memSize); + // divide up the workload amongst Devices + float *weights = new float[m_numDevices]; + int *numSms = new int[m_numDevices]; + float total = 0; for (unsigned int i = 0; i < m_numDevices; i++) { - if (m_numDevices > 1) { - checkCudaErrors(cudaSetDevice(i)); - } + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, i)); - checkCudaErrors(cudaEventCreate(&m_deviceData[i].event)); - checkCudaErrors(cudaHostGetDevicePointer( - (void **)&m_deviceData[i].dPos[0], (void *)m_hPos[0], 0)); - checkCudaErrors(cudaHostGetDevicePointer( - (void **)&m_deviceData[i].dPos[1], (void *)m_hPos[1], 0)); - checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel, - (void *)m_hVel, 0)); - } - } else { - m_hPos[0] = new T[m_numBodies * 4]; - m_hVel = new T[m_numBodies * 4]; - - memset(m_hPos[0], 0, memSize); - memset(m_hVel, 0, memSize); - - checkCudaErrors(cudaSetDevice(m_devID)); - checkCudaErrors(cudaEventCreate(&m_deviceData[0].event)); - - if (m_bUsePBO) { - // create the position pixel buffer objects for rendering - // we will actually compute directly from this memory in CUDA too - glGenBuffers(2, (GLuint *)m_pbo); - - for (int i = 0; i < 2; ++i) { - glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]); - glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW); - - int size = 0; - glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size); - - if ((unsigned)size != memSize) { - fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!n"); - } - - glBindBuffer(GL_ARRAY_BUFFER, 0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i], m_pbo[i], - cudaGraphicsMapFlagsNone)); - } - } else { - checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize)); - checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize)); + // Choose the weight based on the Compute Capability + // We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for + // this application (since a 15-SM GF100 is about 2X faster than a 30-SM + // GT200). + numSms[i] = props.multiProcessorCount; + weights[i] = numSms[i] * (props.major >= 2 ? 
4.f : 1.f); + total += weights[i]; } - checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize)); - - // At this point we already know P2P is supported - if (m_bUseP2P) { - for (unsigned int i = 1; i < m_numDevices; i++) { - int access = 0; - cudaError_t error; - - // Enable access for gpu_i to memory owned by gpu0 - checkCudaErrors(cudaSetDevice(i)); - if ((error = cudaDeviceEnablePeerAccess(0, 0)) != - cudaErrorPeerAccessAlreadyEnabled) { - checkCudaErrors(error); - } else { - // We might have already enabled P2P, so catch this and reset error - // code... - cudaGetLastError(); - } - - checkCudaErrors(cudaEventCreate(&m_deviceData[i].event)); - - // Point all GPUs to the memory allocated on gpu0 - m_deviceData[i].dPos[0] = m_deviceData[0].dPos[0]; - m_deviceData[i].dPos[1] = m_deviceData[0].dPos[1]; - m_deviceData[i].dVel = m_deviceData[0].dVel; - } - } - } - - m_bInitialized = true; -} - -template -void BodySystemCUDA::_finalize() { - assert(m_bInitialized); - - if (m_bUseSysMem) { - checkCudaErrors(cudaFreeHost(m_hPos[0])); - checkCudaErrors(cudaFreeHost(m_hPos[1])); - checkCudaErrors(cudaFreeHost(m_hVel)); + unsigned int offset = 0; + unsigned int remaining = m_numBodies; for (unsigned int i = 0; i < m_numDevices; i++) { - cudaEventDestroy(m_deviceData[i].event); - } - } else { - delete[] m_hPos[0]; - delete[] m_hPos[1]; - delete[] m_hVel; + unsigned int count = (int)((weights[i] / total) * m_numBodies); + // Rounding up to numSms[i]*256 leads to better GPU utilization _per_ GPU + // but when using multiple devices, it will lead to the last GPUs not having + // any work at all + // which means worse overall performance + // unsigned int round = numSms[i] * 256; + unsigned int round = 256; - checkCudaErrors(cudaFree((void **)m_deviceData[0].dVel)); - - if (m_bUsePBO) { - checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[0])); - checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[1])); - glDeleteBuffers(2, (const GLuint *)m_pbo); - } else { - checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[0])); - checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[1])); - - checkCudaErrors(cudaEventDestroy(m_deviceData[0].event)); - - if (m_bUseP2P) { - for (unsigned int i = 1; i < m_numDevices; i++) { - checkCudaErrors(cudaEventDestroy(m_deviceData[i].event)); + count = round * ((count + round - 1) / round); + if (count > remaining) { + count = remaining; } - } - } - } - delete[] m_deviceData; + remaining -= count; + m_deviceData[i].offset = offset; + m_deviceData[i].numBodies = count; + offset += count; - m_bInitialized = false; -} - -template -void BodySystemCUDA::loadTipsyFile(const std::string &filename) { - if (m_bInitialized) _finalize(); - - std::vector::Type> positions; - std::vector::Type> velocities; - std::vector ids; - - int nBodies = 0; - int nFirst = 0, nSecond = 0, nThird = 0; - - read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, - nSecond, nThird); - - _initialize(nBodies); - - setArray(BODYSYSTEM_POSITION, (T *)&positions[0]); - setArray(BODYSYSTEM_VELOCITY, (T *)&velocities[0]); -} - -template -void BodySystemCUDA::setSoftening(T softening) { - T softeningSq = softening * softening; - - for (unsigned int i = 0; i < m_numDevices; i++) { - if (m_numDevices > 1) { - checkCudaErrors(cudaSetDevice(i)); + if ((i == m_numDevices - 1) && (offset < m_numBodies - 1)) { + m_deviceData[i].numBodies += m_numBodies - offset; + } } - checkCudaErrors(setSofteningSquared(softeningSq)); - } + delete[] weights; + delete[] numSms; + + if (m_bUseSysMem) { 
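// Illustration: the branch below relies on zero-copy, mapped pinned host
// memory. cudaHostAllocMapped makes the allocation visible to the device,
// and cudaHostAllocPortable makes it usable from every CUDA context, which
// matters in the multi-device path. A minimal self-contained sketch of the
// same pattern (hostBuf, devPtr, and the size are hypothetical names, not
// taken from the sample):
//
//     size_t bytes = 4 * sizeof(float) * 16384;
//     float *hostBuf = nullptr, *devPtr = nullptr;
//     checkCudaErrors(cudaHostAlloc((void **)&hostBuf, bytes,
//                                   cudaHostAllocMapped | cudaHostAllocPortable));
//     checkCudaErrors(cudaHostGetDevicePointer((void **)&devPtr, hostBuf, 0));
//     // kernels can now dereference devPtr; the host reads the same bytes
//     checkCudaErrors(cudaFreeHost(hostBuf));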
+        checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize, cudaHostAllocMapped | cudaHostAllocPortable));
+        checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize, cudaHostAllocMapped | cudaHostAllocPortable));
+        checkCudaErrors(cudaHostAlloc((void **)&m_hVel, memSize, cudaHostAllocMapped | cudaHostAllocPortable));
+
+        memset(m_hPos[0], 0, memSize);
+        memset(m_hPos[1], 0, memSize);
+        memset(m_hVel, 0, memSize);
+
+        for (unsigned int i = 0; i < m_numDevices; i++) {
+            if (m_numDevices > 1) {
+                checkCudaErrors(cudaSetDevice(i));
+            }
+
+            checkCudaErrors(cudaEventCreate(&m_deviceData[i].event));
+            checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dPos[0], (void *)m_hPos[0], 0));
+            checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dPos[1], (void *)m_hPos[1], 0));
+            checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel, (void *)m_hVel, 0));
+        }
+    }
+    else {
+        m_hPos[0] = new T[m_numBodies * 4];
+        m_hVel = new T[m_numBodies * 4];
+
+        memset(m_hPos[0], 0, memSize);
+        memset(m_hVel, 0, memSize);
+
+        checkCudaErrors(cudaSetDevice(m_devID));
+        checkCudaErrors(cudaEventCreate(&m_deviceData[0].event));
+
+        if (m_bUsePBO) {
+            // create the position pixel buffer objects for rendering
+            // we will actually compute directly from this memory in CUDA too
+            glGenBuffers(2, (GLuint *)m_pbo);
+
+            for (int i = 0; i < 2; ++i) {
+                glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]);
+                glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW);
+
+                int size = 0;
+                glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);
+
+                if ((unsigned)size != memSize) {
+                    fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!\n");
+                }
+
+                glBindBuffer(GL_ARRAY_BUFFER, 0);
+                checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i], m_pbo[i], cudaGraphicsMapFlagsNone));
+            }
+        }
+        else {
+            checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize));
+            checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize));
+        }
+
+        checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize));
+
+        // At this point we already know P2P is supported
+        if (m_bUseP2P) {
+            for (unsigned int i = 1; i < m_numDevices; i++) {
+                int access = 0;
+                cudaError_t error;
+
+                // Enable access for gpu_i to memory owned by gpu0
+                checkCudaErrors(cudaSetDevice(i));
+                if ((error = cudaDeviceEnablePeerAccess(0, 0)) != cudaErrorPeerAccessAlreadyEnabled) {
+                    checkCudaErrors(error);
+                }
+                else {
+                    // We might have already enabled P2P, so catch this and reset error
+                    // code...
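// Illustration: cudaDeviceEnablePeerAccess() returns
// cudaErrorPeerAccessAlreadyEnabled when access was turned on earlier, and
// that status also lands in the runtime's sticky last-error slot, so the
// code below pops it with cudaGetLastError() before the next
// checkCudaErrors call can trip over it. The idiom in isolation (peerDev is
// a hypothetical device id):
//
//     cudaError_t err = cudaDeviceEnablePeerAccess(peerDev, 0);
//     if (err == cudaErrorPeerAccessAlreadyEnabled) {
//         cudaGetLastError();   // clear the sticky error and carry on
//     } else {
//         checkCudaErrors(err); // any other status is a real failure
//     }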
+ cudaGetLastError(); + } + + checkCudaErrors(cudaEventCreate(&m_deviceData[i].event)); + + // Point all GPUs to the memory allocated on gpu0 + m_deviceData[i].dPos[0] = m_deviceData[0].dPos[0]; + m_deviceData[i].dPos[1] = m_deviceData[0].dPos[1]; + m_deviceData[i].dVel = m_deviceData[0].dVel; + } + } + } + + m_bInitialized = true; } -template -void BodySystemCUDA::setDamping(T damping) { - m_damping = damping; +template void BodySystemCUDA::_finalize() +{ + assert(m_bInitialized); + + if (m_bUseSysMem) { + checkCudaErrors(cudaFreeHost(m_hPos[0])); + checkCudaErrors(cudaFreeHost(m_hPos[1])); + checkCudaErrors(cudaFreeHost(m_hVel)); + + for (unsigned int i = 0; i < m_numDevices; i++) { + cudaEventDestroy(m_deviceData[i].event); + } + } + else { + delete[] m_hPos[0]; + delete[] m_hPos[1]; + delete[] m_hVel; + + checkCudaErrors(cudaFree((void **)m_deviceData[0].dVel)); + + if (m_bUsePBO) { + checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[0])); + checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[1])); + glDeleteBuffers(2, (const GLuint *)m_pbo); + } + else { + checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[0])); + checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[1])); + + checkCudaErrors(cudaEventDestroy(m_deviceData[0].event)); + + if (m_bUseP2P) { + for (unsigned int i = 1; i < m_numDevices; i++) { + checkCudaErrors(cudaEventDestroy(m_deviceData[i].event)); + } + } + } + } + + delete[] m_deviceData; + + m_bInitialized = false; } -template -void BodySystemCUDA::update(T deltaTime) { - assert(m_bInitialized); +template void BodySystemCUDA::loadTipsyFile(const std::string &filename) +{ + if (m_bInitialized) + _finalize(); - integrateNbodySystem(m_deviceData, m_pGRes, m_currentRead, - (float)deltaTime, (float)m_damping, m_numBodies, - m_numDevices, m_blockSize, m_bUsePBO); + std::vector::Type> positions; + std::vector::Type> velocities; + std::vector ids; - std::swap(m_currentRead, m_currentWrite); + int nBodies = 0; + int nFirst = 0, nSecond = 0, nThird = 0; + + read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, nSecond, nThird); + + _initialize(nBodies); + + setArray(BODYSYSTEM_POSITION, (T *)&positions[0]); + setArray(BODYSYSTEM_VELOCITY, (T *)&velocities[0]); } -template -T *BodySystemCUDA::getArray(BodyArray array) { - assert(m_bInitialized); +template void BodySystemCUDA::setSoftening(T softening) +{ + T softeningSq = softening * softening; - T *hdata = 0; - T *ddata = 0; + for (unsigned int i = 0; i < m_numDevices; i++) { + if (m_numDevices > 1) { + checkCudaErrors(cudaSetDevice(i)); + } - cudaGraphicsResource *pgres = NULL; + checkCudaErrors(setSofteningSquared(softeningSq)); + } +} - int currentReadHost = m_bUseSysMem ? m_currentRead : 0; +template void BodySystemCUDA::setDamping(T damping) { m_damping = damping; } - switch (array) { +template void BodySystemCUDA::update(T deltaTime) +{ + assert(m_bInitialized); + + integrateNbodySystem(m_deviceData, + m_pGRes, + m_currentRead, + (float)deltaTime, + (float)m_damping, + m_numBodies, + m_numDevices, + m_blockSize, + m_bUsePBO); + + std::swap(m_currentRead, m_currentWrite); +} + +template T *BodySystemCUDA::getArray(BodyArray array) +{ + assert(m_bInitialized); + + T *hdata = 0; + T *ddata = 0; + + cudaGraphicsResource *pgres = NULL; + + int currentReadHost = m_bUseSysMem ? 
m_currentRead : 0; + + switch (array) { default: case BODYSYSTEM_POSITION: - hdata = m_hPos[currentReadHost]; - ddata = m_deviceData[0].dPos[m_currentRead]; + hdata = m_hPos[currentReadHost]; + ddata = m_deviceData[0].dPos[m_currentRead]; - if (m_bUsePBO) { - pgres = m_pGRes[m_currentRead]; - } - - break; - - case BODYSYSTEM_VELOCITY: - hdata = m_hVel; - ddata = m_deviceData[0].dVel; - break; - } - - if (!m_bUseSysMem) { - if (pgres) { - checkCudaErrors( - cudaGraphicsResourceSetMapFlags(pgres, cudaGraphicsMapFlagsReadOnly)); - checkCudaErrors(cudaGraphicsMapResources(1, &pgres, 0)); - size_t bytes; - checkCudaErrors( - cudaGraphicsResourceGetMappedPointer((void **)&ddata, &bytes, pgres)); - } - - checkCudaErrors(cudaMemcpy(hdata, ddata, m_numBodies * 4 * sizeof(T), - cudaMemcpyDeviceToHost)); - - if (pgres) { - checkCudaErrors(cudaGraphicsUnmapResources(1, &pgres, 0)); - } - } - - return hdata; -} - -template -void BodySystemCUDA::setArray(BodyArray array, const T *data) { - assert(m_bInitialized); - - m_currentRead = 0; - m_currentWrite = 1; - - switch (array) { - default: - case BODYSYSTEM_POSITION: { - if (m_bUsePBO) { - glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]); - glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data); - - int size = 0; - glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size); - - if ((unsigned)size != 4 * (sizeof(T) * m_numBodies)) { - fprintf(stderr, "WARNING: Pixel Buffer Object download failed!n"); + if (m_bUsePBO) { + pgres = m_pGRes[m_currentRead]; } - glBindBuffer(GL_ARRAY_BUFFER, 0); - } else { - if (m_bUseSysMem) { - memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T)); - } else - checkCudaErrors(cudaMemcpy(m_deviceData[0].dPos[m_currentRead], data, - m_numBodies * 4 * sizeof(T), - cudaMemcpyHostToDevice)); - } + break; + + case BODYSYSTEM_VELOCITY: + hdata = m_hVel; + ddata = m_deviceData[0].dVel; + break; + } + + if (!m_bUseSysMem) { + if (pgres) { + checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres, cudaGraphicsMapFlagsReadOnly)); + checkCudaErrors(cudaGraphicsMapResources(1, &pgres, 0)); + size_t bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&ddata, &bytes, pgres)); + } + + checkCudaErrors(cudaMemcpy(hdata, ddata, m_numBodies * 4 * sizeof(T), cudaMemcpyDeviceToHost)); + + if (pgres) { + checkCudaErrors(cudaGraphicsUnmapResources(1, &pgres, 0)); + } + } + + return hdata; +} + +template void BodySystemCUDA::setArray(BodyArray array, const T *data) +{ + assert(m_bInitialized); + + m_currentRead = 0; + m_currentWrite = 1; + + switch (array) { + default: + case BODYSYSTEM_POSITION: { + if (m_bUsePBO) { + glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]); + glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data); + + int size = 0; + glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size); + + if ((unsigned)size != 4 * (sizeof(T) * m_numBodies)) { + fprintf(stderr, "WARNING: Pixel Buffer Object download failed!n"); + } + + glBindBuffer(GL_ARRAY_BUFFER, 0); + } + else { + if (m_bUseSysMem) { + memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T)); + } + else + checkCudaErrors(cudaMemcpy( + m_deviceData[0].dPos[m_currentRead], data, m_numBodies * 4 * sizeof(T), cudaMemcpyHostToDevice)); + } } break; case BODYSYSTEM_VELOCITY: - if (m_bUseSysMem) { - memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T)); - } else - checkCudaErrors(cudaMemcpy(m_deviceData[0].dVel, data, - m_numBodies * 4 * sizeof(T), - cudaMemcpyHostToDevice)); + if 
(m_bUseSysMem) { + memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T)); + } + else + checkCudaErrors( + cudaMemcpy(m_deviceData[0].dVel, data, m_numBodies * 4 * sizeof(T), cudaMemcpyHostToDevice)); - break; - } + break; + } } diff --git a/Samples/5_Domain_Specific/nbody/nbody.cpp b/Samples/5_Domain_Specific/nbody/nbody.cpp index e01e6345..1347a000 100644 --- a/Samples/5_Domain_Specific/nbody/nbody.cpp +++ b/Samples/5_Domain_Specific/nbody/nbody.cpp @@ -37,50 +37,48 @@ #include #endif -#include -#include -#include #include #include -#include - -#include +#include +#include #include +#include #include #include +#include +#include -#include "bodysystemcuda.h" #include "bodysystemcpu.h" -#include "render_particles.h" +#include "bodysystemcuda.h" #include "cuda_runtime.h" +#include "render_particles.h" // view params -int ox = 0, oy = 0; -int buttonState = 0; -float camera_trans[] = {0, -2, -150}; -float camera_rot[] = {0, 0, 0}; -float camera_trans_lag[] = {0, -2, -150}; -float camera_rot_lag[] = {0, 0, 0}; -const float inertia = 0.1f; +int ox = 0, oy = 0; +int buttonState = 0; +float camera_trans[] = {0, -2, -150}; +float camera_rot[] = {0, 0, 0}; +float camera_trans_lag[] = {0, -2, -150}; +float camera_rot_lag[] = {0, 0, 0}; +const float inertia = 0.1f; -ParticleRenderer::DisplayMode displayMode = - ParticleRenderer::PARTICLE_SPRITES_COLOR; +ParticleRenderer::DisplayMode displayMode = ParticleRenderer::PARTICLE_SPRITES_COLOR; -bool benchmark = false; -bool compareToCPU = false; -bool QATest = false; -int blockSize = 256; -bool useHostMem = false; -bool useP2P = true; // this is always optimal to use P2P path when available -bool fp64 = false; -bool useCpu = false; -int numDevsRequested = 1; -bool displayEnabled = true; -bool bPause = false; -bool bFullscreen = false; -bool bDispInteractions = false; -bool bSupportDouble = false; -int flopsPerInteraction = 20; +bool benchmark = false; +bool compareToCPU = false; +bool QATest = false; +int blockSize = 256; +bool useHostMem = false; +bool useP2P = true; // this is always optimal to use P2P path when available +bool fp64 = false; +bool useCpu = false; +int numDevsRequested = 1; +bool displayEnabled = true; +bool bPause = false; +bool bFullscreen = false; +bool bDispInteractions = false; +bool bSupportDouble = false; +int flopsPerInteraction = 20; char deviceName[100]; @@ -90,39 +88,48 @@ int numBodies = 16384; std::string tipsyFile = ""; -int numIterations = 0; // run until exit +int numIterations = 0; // run until exit -void computePerfStats(double &interactionsPerSecond, double &gflops, - float milliseconds, int iterations) { - // double precision uses intrinsic operation followed by refinement, - // resulting in higher operation count per interaction. - // (Note Astrophysicists use 38 flops per interaction no matter what, - // based on "historical precedent", but they are using FLOP/s as a - // measure of "science throughput". We are using it as a measure of - // hardware throughput. They should really use interactions/s... - // const int flopsPerInteraction = fp64 ? 30 : 20; - interactionsPerSecond = (float)numBodies * (float)numBodies; - interactionsPerSecond *= 1e-9 * iterations * 1000 / milliseconds; - gflops = interactionsPerSecond * (float)flopsPerInteraction; +void computePerfStats(double &interactionsPerSecond, double &gflops, float milliseconds, int iterations) +{ + // double precision uses intrinsic operation followed by refinement, + // resulting in higher operation count per interaction. 
+ // (Note Astrophysicists use 38 flops per interaction no matter what, + // based on "historical precedent", but they are using FLOP/s as a + // measure of "science throughput". We are using it as a measure of + // hardware throughput. They should really use interactions/s... + // const int flopsPerInteraction = fp64 ? 30 : 20; + interactionsPerSecond = (float)numBodies * (float)numBodies; + interactionsPerSecond *= 1e-9 * iterations * 1000 / milliseconds; + gflops = interactionsPerSecond * (float)flopsPerInteraction; } //////////////////////////////////////// // Demo Parameters //////////////////////////////////////// -struct NBodyParams { - float m_timestep; - float m_clusterScale; - float m_velocityScale; - float m_softening; - float m_damping; - float m_pointSize; - float m_x, m_y, m_z; +struct NBodyParams +{ + float m_timestep; + float m_clusterScale; + float m_velocityScale; + float m_softening; + float m_damping; + float m_pointSize; + float m_x, m_y, m_z; - void print() { - printf("{ %f, %f, %f, %f, %f, %f, %f, %f, %f },\n", m_timestep, - m_clusterScale, m_velocityScale, m_softening, m_damping, m_pointSize, - m_x, m_y, m_z); - } + void print() + { + printf("{ %f, %f, %f, %f, %f, %f, %f, %f, %f },\n", + m_timestep, + m_clusterScale, + m_velocityScale, + m_softening, + m_damping, + m_pointSize, + m_x, + m_y, + m_z); + } }; NBodyParams demoParams[] = { @@ -132,14 +139,13 @@ NBodyParams demoParams[] = { {0.0006f, 0.16f, 1000.0f, 1.0f, 1.0f, 0.07f, 0, 0, -1.5f}, {0.0019f, 0.32f, 276.0f, 1.0f, 1.0f, 0.07f, 0, 0, -5}, {0.0016f, 0.32f, 272.0f, 0.145f, 1.0f, 0.08f, 0, 0, -5}, - {0.016000f, 6.040000f, 0.000000f, 1.000000f, 1.000000f, 0.760000f, 0, 0, - -50}, + {0.016000f, 6.040000f, 0.000000f, 1.000000f, 1.000000f, 0.760000f, 0, 0, -50}, }; -int numDemos = sizeof(demoParams) / sizeof(NBodyParams); -bool cycleDemo = true; -int activeDemo = 0; -float demoTime = 10000.0f; // ms +int numDemos = sizeof(demoParams) / sizeof(NBodyParams); +bool cycleDemo = true; +int activeDemo = 0; +float demoTime = 10000.0f; // ms StopWatchInterface *demoTimer = NULL, *timer = NULL; // run multiple iterations to compute an average sort time @@ -147,1157 +153,1209 @@ StopWatchInterface *demoTimer = NULL, *timer = NULL; NBodyParams activeParams = demoParams[activeDemo]; // The UI. 
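// Worked example for computePerfStats() above, assuming the defaults
// numBodies = 16384 and flopsPerInteraction = 20 (single precision): one
// step evaluates numBodies^2 = 16384^2, roughly 0.268e9 pairwise
// interactions, so 10 iterations finishing in 100 ms give
//
//     interactionsPerSecond = 16384^2 * 1e-9 * 10 * 1000 / 100  ~ 26.8  (billions/s)
//     gflops                = 26.8 * 20                         ~ 537   (GFLOP/s)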
-ParamListGL *paramlist; // parameter list -bool bShowSliders = true; +ParamListGL *paramlist; // parameter list +bool bShowSliders = true; // fps -static int fpsCount = 0; -static int fpsLimit = 5; +static int fpsCount = 0; +static int fpsLimit = 5; cudaEvent_t startEvent, stopEvent; cudaEvent_t hostMemSyncEvent; -template -class NBodyDemo { - public: - static void Create() { m_singleton = new NBodyDemo; } - static void Destroy() { delete m_singleton; } - - static void init(int numBodies, int numDevices, int blockSize, bool usePBO, - bool useHostMem, bool useP2P, bool useCpu, int devID) { - m_singleton->_init(numBodies, numDevices, blockSize, usePBO, useHostMem, - useP2P, useCpu, devID); - } - - static void reset(int numBodies, NBodyConfig config) { - m_singleton->_reset(numBodies, config); - } - - static void selectDemo(int index) { m_singleton->_selectDemo(index); } - - static bool compareResults(int numBodies) { - return m_singleton->_compareResults(numBodies); - } - - static void runBenchmark(int iterations) { - m_singleton->_runBenchmark(iterations); - } - - static void updateParams() { - m_singleton->m_nbody->setSoftening(activeParams.m_softening); - m_singleton->m_nbody->setDamping(activeParams.m_damping); - } - - static void updateSimulation() { - m_singleton->m_nbody->update(activeParams.m_timestep); - } - - static void display() { - m_singleton->m_renderer->setSpriteSize(activeParams.m_pointSize); - - if (useHostMem) { - // This event sync is required because we are rendering from the host - // memory that CUDA is - // writing. If we don't wait until CUDA is done updating it, we will - // render partially - // updated data, resulting in a jerky frame rate. - if (!useCpu) { - cudaEventSynchronize(hostMemSyncEvent); - } - - m_singleton->m_renderer->setPositions( - m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION), - m_singleton->m_nbody->getNumBodies()); - } else { - m_singleton->m_renderer->setPBO( - m_singleton->m_nbody->getCurrentReadBuffer(), - m_singleton->m_nbody->getNumBodies(), (sizeof(T) > 4)); - } - - // display particles - m_singleton->m_renderer->display(displayMode); - } - - static void getArrays(T *pos, T *vel) { - T *_pos = m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION); - T *_vel = m_singleton->m_nbody->getArray(BODYSYSTEM_VELOCITY); - memcpy(pos, _pos, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); - memcpy(vel, _vel, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); - } - - static void setArrays(const T *pos, const T *vel) { - if (pos != m_singleton->m_hPos) { - memcpy(m_singleton->m_hPos, pos, numBodies * 4 * sizeof(T)); - } - - if (vel != m_singleton->m_hVel) { - memcpy(m_singleton->m_hVel, vel, numBodies * 4 * sizeof(T)); - } - - m_singleton->m_nbody->setArray(BODYSYSTEM_POSITION, m_singleton->m_hPos); - m_singleton->m_nbody->setArray(BODYSYSTEM_VELOCITY, m_singleton->m_hVel); - - if (!benchmark && !useCpu && !compareToCPU) { - m_singleton->_resetRenderer(); - } - } - - private: - static NBodyDemo *m_singleton; - - BodySystem *m_nbody; - BodySystemCUDA *m_nbodyCuda; - BodySystemCPU *m_nbodyCpu; - - ParticleRenderer *m_renderer; - - T *m_hPos; - T *m_hVel; - float *m_hColor; - - private: - NBodyDemo() - : m_nbody(0), - m_nbodyCuda(0), - m_nbodyCpu(0), - m_renderer(0), - m_hPos(0), - m_hVel(0), - m_hColor(0) {} - - ~NBodyDemo() { - if (m_nbodyCpu) { - delete m_nbodyCpu; - } - - if (m_nbodyCuda) { - delete m_nbodyCuda; - } - - if (m_hPos) { - delete[] m_hPos; - } - - if (m_hVel) { - delete[] m_hVel; - } - - if (m_hColor) { - delete[] m_hColor; 
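// Illustration: NBodyDemo keeps one static instance per precision so the
// plain-C GLUT callbacks can reach the active simulation, and the global
// fp64 flag picks NBodyDemo<float> or NBodyDemo<double> at each call site.
// A reduced sketch of that dispatch pattern (Demo is a hypothetical
// stand-in, not the sample's class):
//
//     template <typename T> class Demo {
//     public:
//         static void Create()  { s_instance = new Demo<T>; }
//         static void Destroy() { delete s_instance; s_instance = nullptr; }
//         static void step()    { s_instance->advance(); }
//     private:
//         static Demo<T> *s_instance;
//         void advance() { /* run one simulation step */ }
//     };
//     template <typename T> Demo<T> *Demo<T>::s_instance = nullptr;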
- } - - sdkDeleteTimer(&demoTimer); - - if (!benchmark && !compareToCPU) delete m_renderer; - } - - void _init(int numBodies, int numDevices, int blockSize, bool bUsePBO, - bool useHostMem, bool useP2P, bool useCpu, int devID) { - if (useCpu) { - m_nbodyCpu = new BodySystemCPU(numBodies); - m_nbody = m_nbodyCpu; - m_nbodyCuda = 0; - } else { - m_nbodyCuda = new BodySystemCUDA(numBodies, numDevices, blockSize, - bUsePBO, useHostMem, useP2P, devID); - m_nbody = m_nbodyCuda; - m_nbodyCpu = 0; - } - - // allocate host memory - m_hPos = new T[numBodies * 4]; - m_hVel = new T[numBodies * 4]; - m_hColor = new float[numBodies * 4]; - - m_nbody->setSoftening(activeParams.m_softening); - m_nbody->setDamping(activeParams.m_damping); - - if (useCpu) { - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - } else { - checkCudaErrors(cudaEventCreate(&startEvent)); - checkCudaErrors(cudaEventCreate(&stopEvent)); - checkCudaErrors(cudaEventCreate(&hostMemSyncEvent)); - } - - if (!benchmark && !compareToCPU) { - m_renderer = new ParticleRenderer; - _resetRenderer(); - } - - sdkCreateTimer(&demoTimer); - sdkStartTimer(&demoTimer); - } - - void _reset(int numBodies, NBodyConfig config) { - if (tipsyFile == "") { - randomizeBodies(config, m_hPos, m_hVel, m_hColor, - activeParams.m_clusterScale, activeParams.m_velocityScale, - numBodies, true); - setArrays(m_hPos, m_hVel); - } else { - m_nbody->loadTipsyFile(tipsyFile); - ::numBodies = m_nbody->getNumBodies(); - } - } - - void _resetRenderer() { - if (fp64) { - float color[4] = {0.4f, 0.8f, 0.1f, 1.0f}; - m_renderer->setBaseColor(color); - } else { - float color[4] = {1.0f, 0.6f, 0.3f, 1.0f}; - m_renderer->setBaseColor(color); - } - - m_renderer->setColors(m_hColor, m_nbody->getNumBodies()); - m_renderer->setSpriteSize(activeParams.m_pointSize); - } - - void _selectDemo(int index) { - assert(index < numDemos); - - activeParams = demoParams[index]; - camera_trans[0] = camera_trans_lag[0] = activeParams.m_x; - camera_trans[1] = camera_trans_lag[1] = activeParams.m_y; - camera_trans[2] = camera_trans_lag[2] = activeParams.m_z; - reset(numBodies, NBODY_CONFIG_SHELL); - sdkResetTimer(&demoTimer); - } - - bool _compareResults(int numBodies) { - assert(m_nbodyCuda); - - bool passed = true; - - m_nbody->update(0.001f); +template class NBodyDemo +{ +public: + static void Create() { m_singleton = new NBodyDemo; } + static void Destroy() { delete m_singleton; } + static void init(int numBodies, + int numDevices, + int blockSize, + bool usePBO, + bool useHostMem, + bool useP2P, + bool useCpu, + int devID) { - m_nbodyCpu = new BodySystemCPU(numBodies); + m_singleton->_init(numBodies, numDevices, blockSize, usePBO, useHostMem, useP2P, useCpu, devID); + } - m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos); - m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel); + static void reset(int numBodies, NBodyConfig config) { m_singleton->_reset(numBodies, config); } - m_nbodyCpu->update(0.001f); + static void selectDemo(int index) { m_singleton->_selectDemo(index); } - T *cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION); - T *cpuPos = m_nbodyCpu->getArray(BODYSYSTEM_POSITION); + static bool compareResults(int numBodies) { return m_singleton->_compareResults(numBodies); } - T tolerance = 0.0005f; + static void runBenchmark(int iterations) { m_singleton->_runBenchmark(iterations); } - for (int i = 0; i < numBodies; i++) { - if (fabs(cpuPos[i] - cudaPos[i]) > tolerance) { - passed = false; - printf("Error: (host)%f != (device)%f\n", cpuPos[i], cudaPos[i]); + static void updateParams() 
+ { + m_singleton->m_nbody->setSoftening(activeParams.m_softening); + m_singleton->m_nbody->setDamping(activeParams.m_damping); + } + + static void updateSimulation() { m_singleton->m_nbody->update(activeParams.m_timestep); } + + static void display() + { + m_singleton->m_renderer->setSpriteSize(activeParams.m_pointSize); + + if (useHostMem) { + // This event sync is required because we are rendering from the host + // memory that CUDA is + // writing. If we don't wait until CUDA is done updating it, we will + // render partially + // updated data, resulting in a jerky frame rate. + if (!useCpu) { + cudaEventSynchronize(hostMemSyncEvent); + } + + m_singleton->m_renderer->setPositions(m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION), + m_singleton->m_nbody->getNumBodies()); + } + else { + m_singleton->m_renderer->setPBO( + m_singleton->m_nbody->getCurrentReadBuffer(), m_singleton->m_nbody->getNumBodies(), (sizeof(T) > 4)); } - } - } - if (passed) { - printf(" OK\n"); - } - return passed; - } - void _runBenchmark(int iterations) { - // once without timing to prime the device - if (!useCpu) { - m_nbody->update(activeParams.m_timestep); + // display particles + m_singleton->m_renderer->display(displayMode); } - if (useCpu) { - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - } else { - checkCudaErrors(cudaEventRecord(startEvent, 0)); + static void getArrays(T *pos, T *vel) + { + T *_pos = m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION); + T *_vel = m_singleton->m_nbody->getArray(BODYSYSTEM_VELOCITY); + memcpy(pos, _pos, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); + memcpy(vel, _vel, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); } - for (int i = 0; i < iterations; ++i) { - m_nbody->update(activeParams.m_timestep); + static void setArrays(const T *pos, const T *vel) + { + if (pos != m_singleton->m_hPos) { + memcpy(m_singleton->m_hPos, pos, numBodies * 4 * sizeof(T)); + } + + if (vel != m_singleton->m_hVel) { + memcpy(m_singleton->m_hVel, vel, numBodies * 4 * sizeof(T)); + } + + m_singleton->m_nbody->setArray(BODYSYSTEM_POSITION, m_singleton->m_hPos); + m_singleton->m_nbody->setArray(BODYSYSTEM_VELOCITY, m_singleton->m_hVel); + + if (!benchmark && !useCpu && !compareToCPU) { + m_singleton->_resetRenderer(); + } } - float milliseconds = 0; +private: + static NBodyDemo *m_singleton; - if (useCpu) { - sdkStopTimer(&timer); - milliseconds = sdkGetTimerValue(&timer); - sdkStartTimer(&timer); - } else { - checkCudaErrors(cudaEventRecord(stopEvent, 0)); - checkCudaErrors(cudaEventSynchronize(stopEvent)); - checkCudaErrors( - cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + BodySystem *m_nbody; + BodySystemCUDA *m_nbodyCuda; + BodySystemCPU *m_nbodyCpu; + + ParticleRenderer *m_renderer; + + T *m_hPos; + T *m_hVel; + float *m_hColor; + +private: + NBodyDemo() + : m_nbody(0) + , m_nbodyCuda(0) + , m_nbodyCpu(0) + , m_renderer(0) + , m_hPos(0) + , m_hVel(0) + , m_hColor(0) + { } - double interactionsPerSecond = 0; - double gflops = 0; - computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); + ~NBodyDemo() + { + if (m_nbodyCpu) { + delete m_nbodyCpu; + } - printf("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, - iterations, milliseconds); - printf("= %.3f billion interactions per second\n", interactionsPerSecond); - printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, - (sizeof(T) > 4) ? 
"double" : "single", flopsPerInteraction); - } + if (m_nbodyCuda) { + delete m_nbodyCuda; + } + + if (m_hPos) { + delete[] m_hPos; + } + + if (m_hVel) { + delete[] m_hVel; + } + + if (m_hColor) { + delete[] m_hColor; + } + + sdkDeleteTimer(&demoTimer); + + if (!benchmark && !compareToCPU) + delete m_renderer; + } + + void _init(int numBodies, + int numDevices, + int blockSize, + bool bUsePBO, + bool useHostMem, + bool useP2P, + bool useCpu, + int devID) + { + if (useCpu) { + m_nbodyCpu = new BodySystemCPU(numBodies); + m_nbody = m_nbodyCpu; + m_nbodyCuda = 0; + } + else { + m_nbodyCuda = new BodySystemCUDA(numBodies, numDevices, blockSize, bUsePBO, useHostMem, useP2P, devID); + m_nbody = m_nbodyCuda; + m_nbodyCpu = 0; + } + + // allocate host memory + m_hPos = new T[numBodies * 4]; + m_hVel = new T[numBodies * 4]; + m_hColor = new float[numBodies * 4]; + + m_nbody->setSoftening(activeParams.m_softening); + m_nbody->setDamping(activeParams.m_damping); + + if (useCpu) { + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventCreate(&startEvent)); + checkCudaErrors(cudaEventCreate(&stopEvent)); + checkCudaErrors(cudaEventCreate(&hostMemSyncEvent)); + } + + if (!benchmark && !compareToCPU) { + m_renderer = new ParticleRenderer; + _resetRenderer(); + } + + sdkCreateTimer(&demoTimer); + sdkStartTimer(&demoTimer); + } + + void _reset(int numBodies, NBodyConfig config) + { + if (tipsyFile == "") { + randomizeBodies(config, + m_hPos, + m_hVel, + m_hColor, + activeParams.m_clusterScale, + activeParams.m_velocityScale, + numBodies, + true); + setArrays(m_hPos, m_hVel); + } + else { + m_nbody->loadTipsyFile(tipsyFile); + ::numBodies = m_nbody->getNumBodies(); + } + } + + void _resetRenderer() + { + if (fp64) { + float color[4] = {0.4f, 0.8f, 0.1f, 1.0f}; + m_renderer->setBaseColor(color); + } + else { + float color[4] = {1.0f, 0.6f, 0.3f, 1.0f}; + m_renderer->setBaseColor(color); + } + + m_renderer->setColors(m_hColor, m_nbody->getNumBodies()); + m_renderer->setSpriteSize(activeParams.m_pointSize); + } + + void _selectDemo(int index) + { + assert(index < numDemos); + + activeParams = demoParams[index]; + camera_trans[0] = camera_trans_lag[0] = activeParams.m_x; + camera_trans[1] = camera_trans_lag[1] = activeParams.m_y; + camera_trans[2] = camera_trans_lag[2] = activeParams.m_z; + reset(numBodies, NBODY_CONFIG_SHELL); + sdkResetTimer(&demoTimer); + } + + bool _compareResults(int numBodies) + { + assert(m_nbodyCuda); + + bool passed = true; + + m_nbody->update(0.001f); + + { + m_nbodyCpu = new BodySystemCPU(numBodies); + + m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos); + m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel); + + m_nbodyCpu->update(0.001f); + + T *cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION); + T *cpuPos = m_nbodyCpu->getArray(BODYSYSTEM_POSITION); + + T tolerance = 0.0005f; + + for (int i = 0; i < numBodies; i++) { + if (fabs(cpuPos[i] - cudaPos[i]) > tolerance) { + passed = false; + printf("Error: (host)%f != (device)%f\n", cpuPos[i], cudaPos[i]); + } + } + } + if (passed) { + printf(" OK\n"); + } + return passed; + } + + void _runBenchmark(int iterations) + { + // once without timing to prime the device + if (!useCpu) { + m_nbody->update(activeParams.m_timestep); + } + + if (useCpu) { + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(startEvent, 0)); + } + + for (int i = 0; i < iterations; ++i) { + m_nbody->update(activeParams.m_timestep); + } + + float milliseconds = 0; + + if (useCpu) { + 
sdkStopTimer(&timer); + milliseconds = sdkGetTimerValue(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(stopEvent, 0)); + checkCudaErrors(cudaEventSynchronize(stopEvent)); + checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + } + + double interactionsPerSecond = 0; + double gflops = 0; + computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); + + printf("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, iterations, milliseconds); + printf("= %.3f billion interactions per second\n", interactionsPerSecond); + printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", + gflops, + (sizeof(T) > 4) ? "double" : "single", + flopsPerInteraction); + } }; -void finalize() { - if (!useCpu) { - checkCudaErrors(cudaEventDestroy(startEvent)); - checkCudaErrors(cudaEventDestroy(stopEvent)); - checkCudaErrors(cudaEventDestroy(hostMemSyncEvent)); - } +void finalize() +{ + if (!useCpu) { + checkCudaErrors(cudaEventDestroy(startEvent)); + checkCudaErrors(cudaEventDestroy(stopEvent)); + checkCudaErrors(cudaEventDestroy(hostMemSyncEvent)); + } - NBodyDemo::Destroy(); + NBodyDemo::Destroy(); - if (bSupportDouble) NBodyDemo::Destroy(); + if (bSupportDouble) + NBodyDemo::Destroy(); } -template <> -NBodyDemo *NBodyDemo::m_singleton = 0; -template <> -NBodyDemo *NBodyDemo::m_singleton = 0; +template <> NBodyDemo *NBodyDemo::m_singleton = 0; +template <> NBodyDemo *NBodyDemo::m_singleton = 0; -template -void switchDemoPrecision() { - cudaDeviceSynchronize(); +template void switchDemoPrecision() +{ + cudaDeviceSynchronize(); - fp64 = !fp64; - flopsPerInteraction = fp64 ? 30 : 20; + fp64 = !fp64; + flopsPerInteraction = fp64 ? 30 : 20; - T_old *oldPos = new T_old[numBodies * 4]; - T_old *oldVel = new T_old[numBodies * 4]; + T_old *oldPos = new T_old[numBodies * 4]; + T_old *oldVel = new T_old[numBodies * 4]; - NBodyDemo::getArrays(oldPos, oldVel); + NBodyDemo::getArrays(oldPos, oldVel); - // convert float to double - T_new *newPos = new T_new[numBodies * 4]; - T_new *newVel = new T_new[numBodies * 4]; + // convert float to double + T_new *newPos = new T_new[numBodies * 4]; + T_new *newVel = new T_new[numBodies * 4]; - for (int i = 0; i < numBodies * 4; i++) { - newPos[i] = (T_new)oldPos[i]; - newVel[i] = (T_new)oldVel[i]; - } + for (int i = 0; i < numBodies * 4; i++) { + newPos[i] = (T_new)oldPos[i]; + newVel[i] = (T_new)oldVel[i]; + } - NBodyDemo::setArrays(newPos, newVel); + NBodyDemo::setArrays(newPos, newVel); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); - delete[] oldPos; - delete[] oldVel; - delete[] newPos; - delete[] newVel; + delete[] oldPos; + delete[] oldVel; + delete[] newPos; + delete[] newVel; } // check for OpenGL errors -inline void checkGLErrors(const char *s) { - GLenum error; +inline void checkGLErrors(const char *s) +{ + GLenum error; - while ((error = glGetError()) != GL_NO_ERROR) { - fprintf(stderr, "%s: error - %s\n", s, (char *)gluErrorString(error)); - } -} - -void initGL(int *argc, char **argv) { - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with OpenGL/CUDA - // interop. 
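// Illustration: the comment above states the ordering constraint this
// sample follows for GL interop: create the OpenGL context before the CUDA
// context so that buffers can be shared efficiently. Reduced to a sketch
// (devID is a hypothetical device id):
//
//     initGL(&argc, argv);                   // GLUT window + GL context first
//     checkCudaErrors(cudaSetDevice(devID)); // CUDA context second
//     // later cudaGraphicsGLRegisterBuffer() calls can then map GL buffers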
- glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); - glutInitWindowSize(720, 480); - glutCreateWindow("CUDA n-body system"); - - if (bFullscreen) { - glutFullScreen(); - } - - else if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_multitexture " - "GL_ARB_vertex_buffer_object")) { - fprintf(stderr, "Required OpenGL extensions missing."); - exit(EXIT_FAILURE); - } else { -#if defined(WIN32) - wglSwapIntervalEXT(0); -#elif defined(LINUX) - glxSwapIntervalSGI(0); -#endif - } - - glEnable(GL_DEPTH_TEST); - glClearColor(0.0, 0.0, 0.0, 1.0); - - checkGLErrors("initGL"); -} - -void initParameters() { - // create a new parameter list - paramlist = new ParamListGL("sliders"); - paramlist->SetBarColorInner(0.8f, 0.8f, 0.0f); - - // add some parameters to the list - - // Point Size - paramlist->AddParam(new Param("Point Size", activeParams.m_pointSize, - 0.001f, 10.0f, 0.01f, - &activeParams.m_pointSize)); - - // Velocity Damping - paramlist->AddParam(new Param("Velocity Damping", - activeParams.m_damping, 0.5f, 1.0f, - .0001f, &(activeParams.m_damping))); - // Softening Factor - paramlist->AddParam(new Param("Softening Factor", - activeParams.m_softening, 0.001f, 1.0f, - .0001f, &(activeParams.m_softening))); - // Time step size - paramlist->AddParam(new Param("Time Step", activeParams.m_timestep, - 0.0f, 1.0f, .0001f, - &(activeParams.m_timestep))); - // Cluster scale (only affects starting configuration - paramlist->AddParam(new Param("Cluster Scale", - activeParams.m_clusterScale, 0.0f, 10.0f, - 0.01f, &(activeParams.m_clusterScale))); - - // Velocity scale (only affects starting configuration) - paramlist->AddParam( - new Param("Velocity Scale", activeParams.m_velocityScale, 0.0f, - 1000.0f, 0.1f, &activeParams.m_velocityScale)); -} - -void selectDemo(int activeDemo) { - if (fp64) { - NBodyDemo::selectDemo(activeDemo); - } else { - NBodyDemo::selectDemo(activeDemo); - } -} - -void updateSimulation() { - if (fp64) { - NBodyDemo::updateSimulation(); - } else { - NBodyDemo::updateSimulation(); - } -} - -void displayNBodySystem() { - if (fp64) { - NBodyDemo::display(); - } else { - NBodyDemo::display(); - } -} - -void display() { - static double gflops = 0; - static double ifps = 0; - static double interactionsPerSecond = 0; - - // update the simulation - if (!bPause) { - if (cycleDemo && (sdkGetTimerValue(&demoTimer) > demoTime)) { - activeDemo = (activeDemo + 1) % numDemos; - selectDemo(activeDemo); + while ((error = glGetError()) != GL_NO_ERROR) { + fprintf(stderr, "%s: error - %s\n", s, (char *)gluErrorString(error)); } +} - updateSimulation(); - - if (!useCpu) { - cudaEventRecord(hostMemSyncEvent, - 0); // insert an event to wait on before rendering - } - } - - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - - if (displayEnabled) { - // view transform - { - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - - for (int c = 0; c < 3; ++c) { - camera_trans_lag[c] += - (camera_trans[c] - camera_trans_lag[c]) * inertia; - camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; - } - - glTranslatef(camera_trans_lag[0], camera_trans_lag[1], - camera_trans_lag[2]); - glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); - glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); - } - - displayNBodySystem(); - - // display user interface - if (bShowSliders) { - glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color - glEnable(GL_BLEND); - paramlist->Render(0, 0); - glDisable(GL_BLEND); - } +void initGL(int *argc, char **argv) +{ + // First 
initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA + // interop. + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE); + glutInitWindowSize(720, 480); + glutCreateWindow("CUDA n-body system"); if (bFullscreen) { - beginWinCoords(); - char msg0[256], msg1[256], msg2[256]; - - if (bDispInteractions) { - sprintf(msg1, "%0.2f billion interactions per second", - interactionsPerSecond); - } else { - sprintf(msg1, "%0.2f GFLOP/s", gflops); - } - - sprintf(msg0, "%s", deviceName); - sprintf(msg2, "%0.2f FPS [%s | %d bodies]", ifps, - fp64 ? "double precision" : "single precision", numBodies); - - glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color - glEnable(GL_BLEND); - glColor3f(0.46f, 0.73f, 0.0f); - glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 122, msg0, - GLUT_BITMAP_TIMES_ROMAN_24); - glColor3f(1.0f, 1.0f, 1.0f); - glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 96, msg2, - GLUT_BITMAP_TIMES_ROMAN_24); - glColor3f(1.0f, 1.0f, 1.0f); - glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 70, msg1, - GLUT_BITMAP_TIMES_ROMAN_24); - glDisable(GL_BLEND); - - endWinCoords(); + glutFullScreen(); } - glutSwapBuffers(); - } - - fpsCount++; - - // this displays the frame rate updated every second (independent of frame - // rate) - if (fpsCount >= fpsLimit) { - char fps[256]; - - float milliseconds = 1; - - // stop timer - if (useCpu) { - milliseconds = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - } else { - checkCudaErrors(cudaEventRecord(stopEvent, 0)); - checkCudaErrors(cudaEventSynchronize(stopEvent)); - checkCudaErrors( - cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + else if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_multitexture " + "GL_ARB_vertex_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(EXIT_FAILURE); + } + else { +#if defined(WIN32) + wglSwapIntervalEXT(0); +#elif defined(LINUX) + glxSwapIntervalSGI(0); +#endif } - milliseconds /= (float)fpsCount; - computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); + glEnable(GL_DEPTH_TEST); + glClearColor(0.0, 0.0, 0.0, 1.0); - ifps = 1.f / (milliseconds / 1000.f); - sprintf(fps, - "CUDA N-Body (%d bodies): " - "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", - numBodies, ifps, interactionsPerSecond, gflops, - fp64 ? "double precision" : "single precision"); - - glutSetWindowTitle(fps); - fpsCount = 0; - fpsLimit = (ifps > 1.f) ? 
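// Illustration: with useHostMem the renderer reads the same mapped host
// buffers the GPU writes, so the update path records hostMemSyncEvent
// (below) and NBodyDemo::display() waits on it with cudaEventSynchronize()
// before touching the positions; otherwise a frame could render from
// half-updated data. The producer/consumer pattern in isolation (syncEvent
// is a hypothetical event):
//
//     cudaEventRecord(syncEvent, 0);   // producer: queued after the async update
//     /* ... */
//     cudaEventSynchronize(syncEvent); // consumer: block until the update lands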
(int)ifps : 1; - - if (bPause) { - fpsLimit = 0; - } - - // restart timer - if (!useCpu) { - checkCudaErrors(cudaEventRecord(startEvent, 0)); - } - } - - glutReportErrors(); + checkGLErrors("initGL"); } -void reshape(int w, int h) { - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (float)w / (float)h, 0.1, 1000.0); +void initParameters() +{ + // create a new parameter list + paramlist = new ParamListGL("sliders"); + paramlist->SetBarColorInner(0.8f, 0.8f, 0.0f); - glMatrixMode(GL_MODELVIEW); - glViewport(0, 0, w, h); + // add some parameters to the list + + // Point Size + paramlist->AddParam( + new Param("Point Size", activeParams.m_pointSize, 0.001f, 10.0f, 0.01f, &activeParams.m_pointSize)); + + // Velocity Damping + paramlist->AddParam( + new Param("Velocity Damping", activeParams.m_damping, 0.5f, 1.0f, .0001f, &(activeParams.m_damping))); + // Softening Factor + paramlist->AddParam(new Param( + "Softening Factor", activeParams.m_softening, 0.001f, 1.0f, .0001f, &(activeParams.m_softening))); + // Time step size + paramlist->AddParam( + new Param("Time Step", activeParams.m_timestep, 0.0f, 1.0f, .0001f, &(activeParams.m_timestep))); + // Cluster scale (only affects starting configuration + paramlist->AddParam(new Param( + "Cluster Scale", activeParams.m_clusterScale, 0.0f, 10.0f, 0.01f, &(activeParams.m_clusterScale))); + + // Velocity scale (only affects starting configuration) + paramlist->AddParam(new Param( + "Velocity Scale", activeParams.m_velocityScale, 0.0f, 1000.0f, 0.1f, &activeParams.m_velocityScale)); } -void updateParams() { - if (fp64) { - NBodyDemo::updateParams(); - } else { - NBodyDemo::updateParams(); - } -} - -void mouse(int button, int state, int x, int y) { - if (bShowSliders) { - // call list mouse function - if (paramlist->Mouse(x, y, button, state)) { - updateParams(); +void selectDemo(int activeDemo) +{ + if (fp64) { + NBodyDemo::selectDemo(activeDemo); + } + else { + NBodyDemo::selectDemo(activeDemo); } - } - - int mods; - - if (state == GLUT_DOWN) { - buttonState |= 1 << button; - } else if (state == GLUT_UP) { - buttonState = 0; - } - - mods = glutGetModifiers(); - - if (mods & GLUT_ACTIVE_SHIFT) { - buttonState = 2; - } else if (mods & GLUT_ACTIVE_CTRL) { - buttonState = 3; - } - - ox = x; - oy = y; - - glutPostRedisplay(); } -void motion(int x, int y) { - if (bShowSliders) { - // call parameter list motion function - if (paramlist->Motion(x, y)) { - updateParams(); - glutPostRedisplay(); - return; +void updateSimulation() +{ + if (fp64) { + NBodyDemo::updateSimulation(); } - } + else { + NBodyDemo::updateSimulation(); + } +} - float dx = (float)(x - ox); - float dy = (float)(y - oy); +void displayNBodySystem() +{ + if (fp64) { + NBodyDemo::display(); + } + else { + NBodyDemo::display(); + } +} - if (buttonState == 3) { - // left+middle = zoom - camera_trans[2] += (dy / 100.0f) * 0.5f * fabs(camera_trans[2]); - } else if (buttonState & 2) { - // middle = translate - camera_trans[0] += dx / 100.0f; - camera_trans[1] -= dy / 100.0f; - } else if (buttonState & 1) { - // left = rotate - camera_rot[0] += dy / 5.0f; - camera_rot[1] += dx / 5.0f; - } +void display() +{ + static double gflops = 0; + static double ifps = 0; + static double interactionsPerSecond = 0; - ox = x; - oy = y; - glutPostRedisplay(); + // update the simulation + if (!bPause) { + if (cycleDemo && (sdkGetTimerValue(&demoTimer) > demoTime)) { + activeDemo = (activeDemo + 1) % numDemos; + selectDemo(activeDemo); + } + + updateSimulation(); + + if (!useCpu) { + 
cudaEventRecord(hostMemSyncEvent, + 0); // insert an event to wait on before rendering + } + } + + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + if (displayEnabled) { + // view transform + { + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + + for (int c = 0; c < 3; ++c) { + camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; + camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; + } + + glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); + glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); + glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); + } + + displayNBodySystem(); + + // display user interface + if (bShowSliders) { + glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color + glEnable(GL_BLEND); + paramlist->Render(0, 0); + glDisable(GL_BLEND); + } + + if (bFullscreen) { + beginWinCoords(); + char msg0[256], msg1[256], msg2[256]; + + if (bDispInteractions) { + sprintf(msg1, "%0.2f billion interactions per second", interactionsPerSecond); + } + else { + sprintf(msg1, "%0.2f GFLOP/s", gflops); + } + + sprintf(msg0, "%s", deviceName); + sprintf( + msg2, "%0.2f FPS [%s | %d bodies]", ifps, fp64 ? "double precision" : "single precision", numBodies); + + glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color + glEnable(GL_BLEND); + glColor3f(0.46f, 0.73f, 0.0f); + glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 122, msg0, GLUT_BITMAP_TIMES_ROMAN_24); + glColor3f(1.0f, 1.0f, 1.0f); + glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 96, msg2, GLUT_BITMAP_TIMES_ROMAN_24); + glColor3f(1.0f, 1.0f, 1.0f); + glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 70, msg1, GLUT_BITMAP_TIMES_ROMAN_24); + glDisable(GL_BLEND); + + endWinCoords(); + } + + glutSwapBuffers(); + } + + fpsCount++; + + // this displays the frame rate updated every second (independent of frame + // rate) + if (fpsCount >= fpsLimit) { + char fps[256]; + + float milliseconds = 1; + + // stop timer + if (useCpu) { + milliseconds = sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(stopEvent, 0)); + checkCudaErrors(cudaEventSynchronize(stopEvent)); + checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + } + + milliseconds /= (float)fpsCount; + computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); + + ifps = 1.f / (milliseconds / 1000.f); + sprintf(fps, + "CUDA N-Body (%d bodies): " + "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", + numBodies, + ifps, + interactionsPerSecond, + gflops, + fp64 ? "double precision" : "single precision"); + + glutSetWindowTitle(fps); + fpsCount = 0; + fpsLimit = (ifps > 1.f) ? 
(int)ifps : 1;
+
+ if (bPause) {
+ fpsLimit = 0;
+ }
+
+ // restart timer
+ if (!useCpu) {
+ checkCudaErrors(cudaEventRecord(startEvent, 0));
+ }
+ }
+
+ glutReportErrors();
+}
+
+void reshape(int w, int h)
+{
+ glMatrixMode(GL_PROJECTION);
+ glLoadIdentity();
+ gluPerspective(60.0, (float)w / (float)h, 0.1, 1000.0);
+
+ glMatrixMode(GL_MODELVIEW);
+ glViewport(0, 0, w, h);
+}
+
+void updateParams()
+{
+ if (fp64) {
+ NBodyDemo<double>::updateParams();
+ }
+ else {
+ NBodyDemo<float>::updateParams();
+ }
+}
+
+void mouse(int button, int state, int x, int y)
+{
+ if (bShowSliders) {
+ // call list mouse function
+ if (paramlist->Mouse(x, y, button, state)) {
+ updateParams();
+ }
+ }
+
+ int mods;
+
+ if (state == GLUT_DOWN) {
+ buttonState |= 1 << button;
+ }
+ else if (state == GLUT_UP) {
+ buttonState = 0;
+ }
+
+ mods = glutGetModifiers();
+
+ if (mods & GLUT_ACTIVE_SHIFT) {
+ buttonState = 2;
+ }
+ else if (mods & GLUT_ACTIVE_CTRL) {
+ buttonState = 3;
+ }
+
+ ox = x;
+ oy = y;
+
+ glutPostRedisplay();
+}
+
+void motion(int x, int y)
+{
+ if (bShowSliders) {
+ // call parameter list motion function
+ if (paramlist->Motion(x, y)) {
+ updateParams();
+ glutPostRedisplay();
+ return;
+ }
+ }
+
+ float dx = (float)(x - ox);
+ float dy = (float)(y - oy);
+
+ if (buttonState == 3) {
+ // left+middle = zoom
+ camera_trans[2] += (dy / 100.0f) * 0.5f * fabs(camera_trans[2]);
+ }
+ else if (buttonState & 2) {
+ // middle = translate
+ camera_trans[0] += dx / 100.0f;
+ camera_trans[1] -= dy / 100.0f;
+ }
+ else if (buttonState & 1) {
+ // left = rotate
+ camera_rot[0] += dy / 5.0f;
+ camera_rot[1] += dx / 5.0f;
+ }
+
+ ox = x;
+ oy = y;
+ glutPostRedisplay();
}

// commented out to remove unused parameter warnings in Linux
-void key(unsigned char key, int /*x*/, int /*y*/) {
- switch (key) {
+void key(unsigned char key, int /*x*/, int /*y*/)
+{
+ switch (key) {
case ' ':
- bPause = !bPause;
- break;
+ bPause = !bPause;
+ break;

- case 27: // escape
+ case 27: // escape
case 'q':
case 'Q':
- finalize();
- exit(EXIT_SUCCESS);
- break;
+ finalize();
+ exit(EXIT_SUCCESS);
+ break;

- case 13: // return
- if (bSupportDouble) {
- if (fp64) {
- switchDemoPrecision<float, double>();
- } else {
- switchDemoPrecision<double, float>();
+ case 13: // return
+ if (bSupportDouble) {
+ if (fp64) {
+ switchDemoPrecision<float, double>();
+ }
+ else {
+ switchDemoPrecision<double, float>();
+ }
+
+ printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single");
}
- printf("> %s precision floating point simulation\n",
- fp64 ? "Double" : "Single");
- }
-
- break;
+ break;

case '`':
- bShowSliders = !bShowSliders;
- break;
+ bShowSliders = !bShowSliders;
+ break;

case 'g':
case 'G':
- bDispInteractions = !bDispInteractions;
- break;
+ bDispInteractions = !bDispInteractions;
+ break;

case 'p':
case 'P':
- displayMode = (ParticleRenderer::DisplayMode)(
- (displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES);
- break;
+ displayMode = (ParticleRenderer::DisplayMode)((displayMode + 1) % ParticleRenderer::PARTICLE_NUM_MODES);
+ break;

case 'c':
case 'C':
- cycleDemo = !cycleDemo;
- printf("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF");
- break;
+ cycleDemo = !cycleDemo;
+ printf("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF");
+ break;

case '[':
- activeDemo =
- (activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos;
- selectDemo(activeDemo);
- break;
+ activeDemo = (activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos;
+ selectDemo(activeDemo);
+ break;

case ']':
- activeDemo = (activeDemo + 1) % numDemos;
- selectDemo(activeDemo);
- break;
+ activeDemo = (activeDemo + 1) % numDemos;
+ selectDemo(activeDemo);
+ break;

case 'd':
case 'D':
- displayEnabled = !displayEnabled;
- break;
+ displayEnabled = !displayEnabled;
+ break;

case 'o':
case 'O':
- activeParams.print();
- break;
+ activeParams.print();
+ break;

case '1':
- if (fp64) {
- NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
- } else {
- NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
- }
+ if (fp64) {
+ NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
+ }
+ else {
+ NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
+ }

- break;
+ break;

case '2':
- if (fp64) {
- NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_RANDOM);
- } else {
- NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_RANDOM);
- }
+ if (fp64) {
+ NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_RANDOM);
+ }
+ else {
+ NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_RANDOM);
+ }

- break;
+ break;

case '3':
- if (fp64) {
- NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_EXPAND);
- } else {
- NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_EXPAND);
- }
+ if (fp64) {
+ NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_EXPAND);
+ }
+ else {
+ NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_EXPAND);
+ }

- break;
- }
+ break;
+ }

- glutPostRedisplay();
+ glutPostRedisplay();
}

-void special(int key, int x, int y) {
- paramlist->Special(key, x, y);
- glutPostRedisplay();
+void special(int key, int x, int y)
+{
+ paramlist->Special(key, x, y);
+ glutPostRedisplay();
}

void idle(void) { glutPostRedisplay(); }

-void showHelp() {
- printf("\t-fullscreen (run n-body simulation in fullscreen mode)\n");
- printf(
- "\t-fp64 (use double precision floating point values for "
- "simulation)\n");
- printf("\t-hostmem (stores simulation data in host memory)\n");
- printf("\t-benchmark (run benchmark to measure performance) \n");
- printf(
- "\t-numbodies=<N> (number of bodies (>= 1) to run in simulation) \n");
- printf(
- "\t-device=<d> (where d=0,1,2.... for the CUDA device to use)\n");
- printf(
- "\t-numdevices=<i> (where i=(number of CUDA devices > 0) to use for "
- "simulation)\n");
- printf(
- "\t-compare (compares simulation results running once on the "
- "default GPU and once on the CPU)\n");
- printf("\t-cpu (run n-body simulation on the CPU)\n");
- printf("\t-tipsy=<file.bin> (load a tipsy model file for simulation)\n\n");
+void showHelp()
+{
+ printf("\t-fullscreen (run n-body simulation in fullscreen mode)\n");
+ printf("\t-fp64 (use double precision floating point values for " "simulation)\n");
+ printf("\t-hostmem (stores simulation data in host memory)\n");
+ printf("\t-benchmark (run benchmark to measure performance) \n");
+ printf("\t-numbodies=<N> (number of bodies (>= 1) to run in simulation) \n");
+ printf("\t-device=<d> (where d=0,1,2.... for the CUDA device to use)\n");
+ printf("\t-numdevices=<i> (where i=(number of CUDA devices > 0) to use for "
+ "simulation)\n");
+ printf("\t-compare (compares simulation results running once on the "
+ "default GPU and once on the CPU)\n");
+ printf("\t-cpu (run n-body simulation on the CPU)\n");
+ printf("\t-tipsy=<file.bin> (load a tipsy model file for simulation)\n\n");
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
- bool bTestResults = true;
+int main(int argc, char **argv)
+{
+ bool bTestResults = true;

#if defined(__linux__)
- setenv("DISPLAY", ":0", 0);
+ setenv("DISPLAY", ":0", 0);
#endif

- if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
- printf("\n> Command line options\n");
+ if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+ printf("\n> Command line options\n");
+ showHelp();
+ return 0;
+ }
+
+ printf("Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure "
+ "performance.\n");
showHelp();
- return 0;
- }

- printf(
- "Run \"nbody -benchmark [-numbodies=<numBodies>]\" to measure "
- "performance.\n");
- showHelp();
+ printf("NOTE: The CUDA Samples are not meant for performance measurements. "
+ "Results may vary when GPU Boost is enabled.\n\n");

- printf(
- "NOTE: The CUDA Samples are not meant for performance measurements. "
- "Results may vary when GPU Boost is enabled.\n\n");
+ bFullscreen = (checkCmdLineFlag(argc, (const char **)argv, "fullscreen") != 0);

- bFullscreen =
- (checkCmdLineFlag(argc, (const char **)argv, "fullscreen") != 0);
-
- if (bFullscreen) {
- bShowSliders = false;
- }
-
- benchmark = (checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0);
-
- compareToCPU =
- ((checkCmdLineFlag(argc, (const char **)argv, "compare") != 0) ||
- (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0));
-
- QATest = (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0);
- useHostMem = (checkCmdLineFlag(argc, (const char **)argv, "hostmem") != 0);
- fp64 = (checkCmdLineFlag(argc, (const char **)argv, "fp64") != 0);
-
- flopsPerInteraction = fp64 ? 30 : 20;
-
- useCpu = (checkCmdLineFlag(argc, (const char **)argv, "cpu") != 0);
-
- if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) {
- numDevsRequested =
- getCmdLineArgumentInt(argc, (const char **)argv, "numdevices");
-
- if (numDevsRequested < 1) {
- printf(
- "Error: \"number of CUDA devices\" specified %d is invalid. Value "
- "should be >= 1\n",
- numDevsRequested);
- exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
- } else {
- printf("number of CUDA devices = %d\n", numDevsRequested);
+ if (bFullscreen) {
+ bShowSliders = false;
}
- }

- int numDevsAvailable = 0;
- bool customGPU = false;
- cudaGetDeviceCount(&numDevsAvailable);
+ benchmark = (checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0);

- if (numDevsAvailable < numDevsRequested) {
- printf("Error: only %d Devices available, %d requested. Exiting.\n",
- numDevsAvailable, numDevsRequested);
- exit(EXIT_FAILURE);
- }
+ compareToCPU = ((checkCmdLineFlag(argc, (const char **)argv, "compare") != 0)
+ || (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0));

- if (numDevsRequested > 1) {
- // If user did not explicitly request host memory to be used, we default to
- // P2P.
- // We fallback to host memory, if any of GPUs does not support P2P.
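// Aside: a minimal sketch of what the peer-access probe below boils down to,
// reduced to one hypothetical pair (device 1 accessing device 0). All calls
// are standard CUDA runtime API, checked the same way as elsewhere in this sample:
int canAccessPeer = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeer, 1 /* device */, 0 /* peerDevice */));
if (canAccessPeer) {
    checkCudaErrors(cudaSetDevice(1));
    checkCudaErrors(cudaDeviceEnablePeerAccess(0, 0)); // flags must be 0
}
else {
    useHostMem = true; // no P2P path: stage the simulation through host memory instead
}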
- bool allGPUsSupportP2P = true; - if (!useHostMem) { - // Enable P2P only in one direction, as every peer will access gpu0 - for (int i = 1; i < numDevsRequested; ++i) { - int canAccessPeer; - checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeer, i, 0)); + QATest = (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0); + useHostMem = (checkCmdLineFlag(argc, (const char **)argv, "hostmem") != 0); + fp64 = (checkCmdLineFlag(argc, (const char **)argv, "fp64") != 0); - if (canAccessPeer != 1) { - allGPUsSupportP2P = false; + flopsPerInteraction = fp64 ? 30 : 20; + + useCpu = (checkCmdLineFlag(argc, (const char **)argv, "cpu") != 0); + + if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) { + numDevsRequested = getCmdLineArgumentInt(argc, (const char **)argv, "numdevices"); + + if (numDevsRequested < 1) { + printf("Error: \"number of CUDA devices\" specified %d is invalid. Value " + "should be >= 1\n", + numDevsRequested); + exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); + } + else { + printf("number of CUDA devices = %d\n", numDevsRequested); } - } - - if (!allGPUsSupportP2P) { - useHostMem = true; - useP2P = false; - } } - } - printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); - printf("> Simulation data stored in %s memory\n", - useHostMem ? "system" : "video"); - printf("> %s precision floating point simulation\n", - fp64 ? "Double" : "Single"); - printf("> %d Devices used for simulation\n", numDevsRequested); + int numDevsAvailable = 0; + bool customGPU = false; + cudaGetDeviceCount(&numDevsAvailable); - int devID; - cudaDeviceProp props; + if (numDevsAvailable < numDevsRequested) { + printf("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); + exit(EXIT_FAILURE); + } - if (useCpu) { - useHostMem = true; - compareToCPU = false; - bSupportDouble = true; + if (numDevsRequested > 1) { + // If user did not explicitly request host memory to be used, we default to + // P2P. + // We fallback to host memory, if any of GPUs does not support P2P. + bool allGPUsSupportP2P = true; + if (!useHostMem) { + // Enable P2P only in one direction, as every peer will access gpu0 + for (int i = 1; i < numDevsRequested; ++i) { + int canAccessPeer; + checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeer, i, 0)); + + if (canAccessPeer != 1) { + allGPUsSupportP2P = false; + } + } + + if (!allGPUsSupportP2P) { + useHostMem = true; + useP2P = false; + } + } + } + + printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); + printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video"); + printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); + printf("> %d Devices used for simulation\n", numDevsRequested); + + int devID; + cudaDeviceProp props; + + if (useCpu) { + useHostMem = true; + compareToCPU = false; + bSupportDouble = true; #ifdef OPENMP - printf("> Simulation with CPU using OpenMP\n"); + printf("> Simulation with CPU using OpenMP\n"); #else - printf("> Simulation with CPU\n"); + printf("> Simulation with CPU\n"); #endif - } - - // Initialize GL and GLUT if necessary - if (!benchmark && !compareToCPU) { - initGL(&argc, argv); - initParameters(); - } - - if (!useCpu) { - // Now choose the CUDA Device - // Either without GL interop: - if (benchmark || compareToCPU || useHostMem) { - // Note if we are using host memory for the body system, we - // don't use CUDA-GL interop. 
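// Aside: a minimal, self-contained sketch of the zero-copy setup that the
// "-hostmem" path relies on instead of CUDA-GL interop. The names h_buf,
// d_buf and nBytes are hypothetical; the calls are standard CUDA runtime,
// and cudaSetDeviceFlags must run before the device context is created:
float *h_buf = 0, *d_buf = 0;
size_t nBytes = 4096 * 4 * sizeof(float); // e.g. one float4 per body
checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
checkCudaErrors(cudaHostAlloc((void **)&h_buf, nBytes, cudaHostAllocMapped));
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_buf, (void *)h_buf, 0));
// kernels may now dereference d_buf; the data stays resident in host memory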
-
- if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
- customGPU = true;
- }
-
- devID = findCudaDevice(argc, (const char **)argv);
- } else // or with GL interop:
- {
- if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
- customGPU = true;
- }
-
- devID = findCudaDevice(argc, (const char **)argv);
}

- checkCudaErrors(cudaGetDevice(&devID));
- checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+ // Initialize GL and GLUT if necessary
+ if (!benchmark && !compareToCPU) {
+ initGL(&argc, argv);
+ initParameters();
+ }

- bSupportDouble = true;
+ if (!useCpu) {
+ // Now choose the CUDA Device
+ // Either without GL interop:
+ if (benchmark || compareToCPU || useHostMem) {
+ // Note if we are using host memory for the body system, we
+ // don't use CUDA-GL interop.
+
+ if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
+ customGPU = true;
+ }
+
+ devID = findCudaDevice(argc, (const char **)argv);
+ }
+ else // or with GL interop:
+ {
+ if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
+ customGPU = true;
+ }
+
+ devID = findCudaDevice(argc, (const char **)argv);
+ }
+
+ checkCudaErrors(cudaGetDevice(&devID));
+ checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+
+ bSupportDouble = true;

#if CUDART_VERSION < 4000
- if (numDevsRequested > 1) {
- printf("MultiGPU n-body requires CUDA 4.0 or later\n");
- exit(EXIT_SUCCESS);
- }
+ if (numDevsRequested > 1) {
+ printf("MultiGPU n-body requires CUDA 4.0 or later\n");
+ exit(EXIT_SUCCESS);
+ }
#endif

- // Initialize devices
- if (numDevsRequested > 1 && customGPU) {
- printf("You can't use --numdevices and --device at the same time.\n");
- exit(EXIT_SUCCESS);
- }
+ // Initialize devices
+ if (numDevsRequested > 1 && customGPU) {
+ printf("You can't use --numdevices and --device at the same time.\n");
+ exit(EXIT_SUCCESS);
+ }

- if (customGPU || numDevsRequested == 1) {
- cudaDeviceProp props;
- checkCudaErrors(cudaGetDeviceProperties(&props, devID));
- printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor,
- props.name);
- } else {
- for (int i = 0; i < numDevsRequested; i++) {
- cudaDeviceProp props;
- checkCudaErrors(cudaGetDeviceProperties(&props, i));
+ if (customGPU || numDevsRequested == 1) {
+ cudaDeviceProp props;
+ checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+ printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);
+ }
+ else {
+ for (int i = 0; i < numDevsRequested; i++) {
+ cudaDeviceProp props;
+ checkCudaErrors(cudaGetDeviceProperties(&props, i));

- printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor,
- props.name);
+ printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name);

- if (useHostMem) {
+ if (useHostMem) {
#if CUDART_VERSION >= 2020
- if (!props.canMapHostMemory) {
- fprintf(stderr, "Device %d cannot map host memory!\n", devID);
- exit(EXIT_SUCCESS);
- }
+ if (!props.canMapHostMemory) {
+ fprintf(stderr, "Device %d cannot map host memory!\n", devID);
+ exit(EXIT_SUCCESS);
+ }

- if (numDevsRequested > 1) {
- checkCudaErrors(cudaSetDevice(i));
- }
+ if (numDevsRequested > 1) {
+ checkCudaErrors(cudaSetDevice(i));
+ }

- checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
+ checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
- fprintf(stderr,
- "This CUDART version does not support "
- "<cudaDeviceProp.canMapHostMemory> field\n");
- exit(EXIT_SUCCESS);
+ fprintf(stderr,
+ "This CUDART version does not support "
+ "<cudaDeviceProp.canMapHostMemory> field\n");
+ exit(EXIT_SUCCESS);
#endif
+ }
+ }
+
+ // CC 1.2 and earlier do not
support double precision + if (props.major * 10 + props.minor <= 12) { + bSupportDouble = false; + } } - } - // CC 1.2 and earlier do not support double precision - if (props.major * 10 + props.minor <= 12) { - bSupportDouble = false; - } + // if(numDevsRequested > 1) + // checkCudaErrors(cudaSetDevice(devID)); + + if (fp64 && !bSupportDouble) { + fprintf(stderr, + "One or more of the requested devices does not support double " + "precision floating-point\n"); + exit(EXIT_SUCCESS); + } } - // if(numDevsRequested > 1) - // checkCudaErrors(cudaSetDevice(devID)); + numIterations = 0; + blockSize = 0; - if (fp64 && !bSupportDouble) { - fprintf(stderr, - "One or more of the requested devices does not support double " - "precision floating-point\n"); - exit(EXIT_SUCCESS); + if (checkCmdLineFlag(argc, (const char **)argv, "i")) { + numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); } - } - numIterations = 0; - blockSize = 0; + if (checkCmdLineFlag(argc, (const char **)argv, "blockSize")) { + blockSize = getCmdLineArgumentInt(argc, (const char **)argv, "blockSize"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "i")) { - numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); - } + if (blockSize == 0) // blockSize not set on command line + blockSize = 256; - if (checkCmdLineFlag(argc, (const char **)argv, "blockSize")) { - blockSize = getCmdLineArgumentInt(argc, (const char **)argv, "blockSize"); - } - - if (blockSize == 0) // blockSize not set on command line - blockSize = 256; - - // default number of bodies is #SMs * 4 * CTA size - if (useCpu) + // default number of bodies is #SMs * 4 * CTA size + if (useCpu) #ifdef OPENMP - numBodies = 8192; + numBodies = 8192; #else - numBodies = 4096; + numBodies = 4096; #endif - else if (numDevsRequested == 1) { - numBodies = compareToCPU ? 4096 : blockSize * 4 * props.multiProcessorCount; - } else { - numBodies = 0; - - for (int i = 0; i < numDevsRequested; i++) { - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, i)); - numBodies += - blockSize * (props.major >= 2 ? 4 : 1) * props.multiProcessorCount; + else if (numDevsRequested == 1) { + numBodies = compareToCPU ? 4096 : blockSize * 4 * props.multiProcessorCount; } - } + else { + numBodies = 0; - if (checkCmdLineFlag(argc, (const char **)argv, "numbodies")) { - numBodies = getCmdLineArgumentInt(argc, (const char **)argv, "numbodies"); - - if (numBodies < 1) { - printf( - "Error: \"number of bodies\" specified %d is invalid. Value should " - "be >= 1\n", - numBodies); - exit(bTestResults ? 
EXIT_SUCCESS : EXIT_FAILURE);
- } else if (numBodies % blockSize) {
- int newNumBodies = ((numBodies / blockSize) + 1) * blockSize;
- printf(
- "Warning: \"number of bodies\" specified %d is not a multiple of "
- "%d.\n",
- numBodies, blockSize);
- printf("Rounding up to the nearest multiple: %d.\n", newNumBodies);
- numBodies = newNumBodies;
- } else {
- printf("number of bodies = %d\n", numBodies);
- }
- }
-
- char *fname;
-
- if (getCmdLineArgumentString(argc, (const char **)argv, "tipsy", &fname)) {
- tipsyFile.assign(fname, strlen(fname));
- cycleDemo = false;
- bShowSliders = false;
- }
-
- if (numBodies <= 1024) {
- activeParams.m_clusterScale = 1.52f;
- activeParams.m_velocityScale = 2.f;
- } else if (numBodies <= 2048) {
- activeParams.m_clusterScale = 1.56f;
- activeParams.m_velocityScale = 2.64f;
- } else if (numBodies <= 4096) {
- activeParams.m_clusterScale = 1.68f;
- activeParams.m_velocityScale = 2.98f;
- } else if (numBodies <= 8192) {
- activeParams.m_clusterScale = 1.98f;
- activeParams.m_velocityScale = 2.9f;
- } else if (numBodies <= 16384) {
- activeParams.m_clusterScale = 1.54f;
- activeParams.m_velocityScale = 8.f;
- } else if (numBodies <= 32768) {
- activeParams.m_clusterScale = 1.44f;
- activeParams.m_velocityScale = 11.f;
- }
-
- // Create the demo -- either double (fp64) or float (fp32, default)
- // implementation
- NBodyDemo<float>::Create();
-
- NBodyDemo<float>::init(numBodies, numDevsRequested, blockSize,
- !(benchmark || compareToCPU || useHostMem), useHostMem,
- useP2P, useCpu, devID);
- NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
-
- if (bSupportDouble) {
- NBodyDemo<double>::Create();
- NBodyDemo<double>::init(numBodies, numDevsRequested, blockSize,
- !(benchmark || compareToCPU || useHostMem),
- useHostMem, useP2P, useCpu, devID);
- NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
- }
-
- if (fp64) {
- if (benchmark) {
- if (numIterations <= 0) {
- numIterations = 10;
- } else if (numIterations > 10) {
- printf("Advisory: setting a high number of iterations\n");
- printf("in benchmark mode may cause failure on Windows\n");
- printf("Vista and Win7. On these OSes, set iterations <= 10\n");
- }
-
- NBodyDemo<double>::runBenchmark(numIterations);
- } else if (compareToCPU) {
- bTestResults = NBodyDemo<double>::compareResults(numBodies);
- } else {
- glutDisplayFunc(display);
- glutReshapeFunc(reshape);
- glutMouseFunc(mouse);
- glutMotionFunc(motion);
- glutKeyboardFunc(key);
- glutSpecialFunc(special);
- glutIdleFunc(idle);
-
- if (!useCpu) {
- checkCudaErrors(cudaEventRecord(startEvent, 0));
- }
-
- glutMainLoop();
+ for (int i = 0; i < numDevsRequested; i++) {
+ cudaDeviceProp props;
+ checkCudaErrors(cudaGetDeviceProperties(&props, i));
+ numBodies += blockSize * (props.major >= 2 ? 4 : 1) * props.multiProcessorCount;
+ }
}
- } else {
- if (benchmark) {
- if (numIterations <= 0) {
- numIterations = 10;
- }
+ if (checkCmdLineFlag(argc, (const char **)argv, "numbodies")) {
+ numBodies = getCmdLineArgumentInt(argc, (const char **)argv, "numbodies");

- NBodyDemo<float>::runBenchmark(numIterations);
- } else if (compareToCPU) {
- bTestResults = NBodyDemo<float>::compareResults(numBodies);
- } else {
- glutDisplayFunc(display);
- glutReshapeFunc(reshape);
- glutMouseFunc(mouse);
- glutMotionFunc(motion);
- glutKeyboardFunc(key);
- glutSpecialFunc(special);
- glutIdleFunc(idle);
-
- if (!useCpu) {
- checkCudaErrors(cudaEventRecord(startEvent, 0));
- }
-
- glutMainLoop();
+ if (numBodies < 1) {
+ printf("Error: \"number of bodies\" specified %d is invalid. Value should "
+ "be >= 1\n",
+ numBodies);
+ exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
+ }
+ else if (numBodies % blockSize) {
+ int newNumBodies = ((numBodies / blockSize) + 1) * blockSize;
+ printf("Warning: \"number of bodies\" specified %d is not a multiple of "
+ "%d.\n",
+ numBodies,
+ blockSize);
+ printf("Rounding up to the nearest multiple: %d.\n", newNumBodies);
+ numBodies = newNumBodies;
+ }
+ else {
+ printf("number of bodies = %d\n", numBodies);
+ }
}

- finalize();
- exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
+ char *fname;
+
+ if (getCmdLineArgumentString(argc, (const char **)argv, "tipsy", &fname)) {
+ tipsyFile.assign(fname, strlen(fname));
+ cycleDemo = false;
+ bShowSliders = false;
+ }
+
+ if (numBodies <= 1024) {
+ activeParams.m_clusterScale = 1.52f;
+ activeParams.m_velocityScale = 2.f;
+ }
+ else if (numBodies <= 2048) {
+ activeParams.m_clusterScale = 1.56f;
+ activeParams.m_velocityScale = 2.64f;
+ }
+ else if (numBodies <= 4096) {
+ activeParams.m_clusterScale = 1.68f;
+ activeParams.m_velocityScale = 2.98f;
+ }
+ else if (numBodies <= 8192) {
+ activeParams.m_clusterScale = 1.98f;
+ activeParams.m_velocityScale = 2.9f;
+ }
+ else if (numBodies <= 16384) {
+ activeParams.m_clusterScale = 1.54f;
+ activeParams.m_velocityScale = 8.f;
+ }
+ else if (numBodies <= 32768) {
+ activeParams.m_clusterScale = 1.44f;
+ activeParams.m_velocityScale = 11.f;
+ }
+
+ // Create the demo -- either double (fp64) or float (fp32, default)
+ // implementation
+ NBodyDemo<float>::Create();
+
+ NBodyDemo<float>::init(numBodies,
+ numDevsRequested,
+ blockSize,
+ !(benchmark || compareToCPU || useHostMem),
+ useHostMem,
+ useP2P,
+ useCpu,
+ devID);
+ NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL);
+
+ if (bSupportDouble) {
+ NBodyDemo<double>::Create();
+ NBodyDemo<double>::init(numBodies,
+ numDevsRequested,
+ blockSize,
+ !(benchmark || compareToCPU || useHostMem),
+ useHostMem,
+ useP2P,
+ useCpu,
+ devID);
+ NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL);
+ }
+
+ if (fp64) {
+ if (benchmark) {
+ if (numIterations <= 0) {
+ numIterations = 10;
+ }
+ else if (numIterations > 10) {
+ printf("Advisory: setting a high number of iterations\n");
+ printf("in benchmark mode may cause failure on Windows\n");
+ printf("Vista and Win7. On these OSes, set iterations <= 10\n");
+ }
+
+ NBodyDemo<double>::runBenchmark(numIterations);
+ }
+ else if (compareToCPU) {
+ bTestResults = NBodyDemo<double>::compareResults(numBodies);
+ }
+ else {
+ glutDisplayFunc(display);
+ glutReshapeFunc(reshape);
+ glutMouseFunc(mouse);
+ glutMotionFunc(motion);
+ glutKeyboardFunc(key);
+ glutSpecialFunc(special);
+ glutIdleFunc(idle);
+
+ if (!useCpu) {
+ checkCudaErrors(cudaEventRecord(startEvent, 0));
+ }
+
+ glutMainLoop();
+ }
+ }
+ else {
+ if (benchmark) {
+ if (numIterations <= 0) {
+ numIterations = 10;
+ }
+
+ NBodyDemo<float>::runBenchmark(numIterations);
+ }
+ else if (compareToCPU) {
+ bTestResults = NBodyDemo<float>::compareResults(numBodies);
+ }
+ else {
+ glutDisplayFunc(display);
+ glutReshapeFunc(reshape);
+ glutMouseFunc(mouse);
+ glutMotionFunc(motion);
+ glutKeyboardFunc(key);
+ glutSpecialFunc(special);
+ glutIdleFunc(idle);
+
+ if (!useCpu) {
+ checkCudaErrors(cudaEventRecord(startEvent, 0));
+ }
+
+ glutMainLoop();
+ }
+ }
+
+ finalize();
+ exit(bTestResults ?
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/nbody/render_particles.cpp b/Samples/5_Domain_Specific/nbody/render_particles.cpp index 0e9454e7..12432f9e 100644 --- a/Samples/5_Domain_Specific/nbody/render_particles.cpp +++ b/Samples/5_Domain_Specific/nbody/render_particles.cpp @@ -28,286 +28,288 @@ #include "render_particles.h" #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION -#include - -#include -#include - -#include - -#include #include +#include +#include +#include +#include +#include -#define GL_POINT_SPRITE_ARB 0x8861 -#define GL_COORD_REPLACE_ARB 0x8862 +#define GL_POINT_SPRITE_ARB 0x8861 +#define GL_COORD_REPLACE_ARB 0x8862 #define GL_VERTEX_PROGRAM_POINT_SIZE_NV 0x8642 ParticleRenderer::ParticleRenderer() - : m_pos(0), - m_numParticles(0), - m_pointSize(1.0f), - m_spriteSize(2.0f), - m_vertexShader(0), - m_vertexShaderPoints(0), - m_pixelShader(0), - m_programPoints(0), - m_programSprites(0), - m_texture(0), - m_pbo(0), - m_vboColor(0), - m_bFp64Positions(false) { - _initGL(); + : m_pos(0) + , m_numParticles(0) + , m_pointSize(1.0f) + , m_spriteSize(2.0f) + , m_vertexShader(0) + , m_vertexShaderPoints(0) + , m_pixelShader(0) + , m_programPoints(0) + , m_programSprites(0) + , m_texture(0) + , m_pbo(0) + , m_vboColor(0) + , m_bFp64Positions(false) +{ + _initGL(); } ParticleRenderer::~ParticleRenderer() { m_pos = 0; } void ParticleRenderer::resetPBO() { glDeleteBuffers(1, (GLuint *)&m_pbo); } -void ParticleRenderer::setPositions(float *pos, int numParticles) { - m_pos = pos; - m_numParticles = numParticles; +void ParticleRenderer::setPositions(float *pos, int numParticles) +{ + m_pos = pos; + m_numParticles = numParticles; - if (!m_pbo) { - glGenBuffers(1, (GLuint *)&m_pbo); - } - - glBindBuffer(GL_ARRAY_BUFFER, m_pbo); - glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), pos, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); - SDK_CHECK_ERROR_GL(); -} - -void ParticleRenderer::setPositions(double *pos, int numParticles) { - m_bFp64Positions = true; - m_pos_fp64 = pos; - m_numParticles = numParticles; - - if (!m_pbo) { - glGenBuffers(1, (GLuint *)&m_pbo); - } - - glBindBuffer(GL_ARRAY_BUFFER, m_pbo); - glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(double), pos, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); - SDK_CHECK_ERROR_GL(); -} - -void ParticleRenderer::setColors(float *color, int numParticles) { - glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); - glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), color, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); -} - -void ParticleRenderer::setBaseColor(float color[4]) { - for (int i = 0; i < 4; i++) m_baseColor[i] = color[i]; -} - -void ParticleRenderer::setPBO(unsigned int pbo, int numParticles, bool fp64) { - m_pbo = pbo; - m_numParticles = numParticles; - - if (fp64) m_bFp64Positions = true; -} - -void ParticleRenderer::_drawPoints(bool color) { - if (!m_pbo) { - glBegin(GL_POINTS); - { - int k = 0; - - for (int i = 0; i < m_numParticles; ++i) { - if (m_bFp64Positions) - glVertex3dv(&m_pos_fp64[k]); - else { - glVertex3fv(&m_pos[k]); - } - - k += 4; - } + if (!m_pbo) { + glGenBuffers(1, (GLuint *)&m_pbo); } - glEnd(); - } else { - glEnableClientState(GL_VERTEX_ARRAY); glBindBuffer(GL_ARRAY_BUFFER, m_pbo); - - if (m_bFp64Positions) - glVertexPointer(4, GL_DOUBLE, 0, 0); - else - glVertexPointer(4, GL_FLOAT, 0, 0); - - if (color) { - glEnableClientState(GL_COLOR_ARRAY); - glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); - // glActiveTexture(GL_TEXTURE1); - // 
glTexCoordPointer(4, GL_FLOAT, 0, 0); - glColorPointer(4, GL_FLOAT, 0, 0); - } - - glDrawArrays(GL_POINTS, 0, m_numParticles); + glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), pos, GL_STATIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); - glDisableClientState(GL_VERTEX_ARRAY); - glDisableClientState(GL_COLOR_ARRAY); - } + SDK_CHECK_ERROR_GL(); } -void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */) { - switch (mode) { +void ParticleRenderer::setPositions(double *pos, int numParticles) +{ + m_bFp64Positions = true; + m_pos_fp64 = pos; + m_numParticles = numParticles; + + if (!m_pbo) { + glGenBuffers(1, (GLuint *)&m_pbo); + } + + glBindBuffer(GL_ARRAY_BUFFER, m_pbo); + glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(double), pos, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); + SDK_CHECK_ERROR_GL(); +} + +void ParticleRenderer::setColors(float *color, int numParticles) +{ + glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); + glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), color, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); +} + +void ParticleRenderer::setBaseColor(float color[4]) +{ + for (int i = 0; i < 4; i++) + m_baseColor[i] = color[i]; +} + +void ParticleRenderer::setPBO(unsigned int pbo, int numParticles, bool fp64) +{ + m_pbo = pbo; + m_numParticles = numParticles; + + if (fp64) + m_bFp64Positions = true; +} + +void ParticleRenderer::_drawPoints(bool color) +{ + if (!m_pbo) { + glBegin(GL_POINTS); + { + int k = 0; + + for (int i = 0; i < m_numParticles; ++i) { + if (m_bFp64Positions) + glVertex3dv(&m_pos_fp64[k]); + else { + glVertex3fv(&m_pos[k]); + } + + k += 4; + } + } + glEnd(); + } + else { + glEnableClientState(GL_VERTEX_ARRAY); + + glBindBuffer(GL_ARRAY_BUFFER, m_pbo); + + if (m_bFp64Positions) + glVertexPointer(4, GL_DOUBLE, 0, 0); + else + glVertexPointer(4, GL_FLOAT, 0, 0); + + if (color) { + glEnableClientState(GL_COLOR_ARRAY); + glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); + // glActiveTexture(GL_TEXTURE1); + // glTexCoordPointer(4, GL_FLOAT, 0, 0); + glColorPointer(4, GL_FLOAT, 0, 0); + } + + glDrawArrays(GL_POINTS, 0, m_numParticles); + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDisableClientState(GL_VERTEX_ARRAY); + glDisableClientState(GL_COLOR_ARRAY); + } +} + +void ParticleRenderer::display(DisplayMode mode /* = PARTICLE_POINTS */) +{ + switch (mode) { case PARTICLE_POINTS: - glColor3f(1, 1, 1); - glPointSize(m_pointSize); - glUseProgram(m_programPoints); - _drawPoints(); - glUseProgram(0); - break; + glColor3f(1, 1, 1); + glPointSize(m_pointSize); + glUseProgram(m_programPoints); + _drawPoints(); + glUseProgram(0); + break; case PARTICLE_SPRITES: default: { - // setup point sprites - glEnable(GL_POINT_SPRITE_ARB); - glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE); - glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV); - glPointSize(m_spriteSize); - glBlendFunc(GL_SRC_ALPHA, GL_ONE); - glEnable(GL_BLEND); - glDepthMask(GL_FALSE); + // setup point sprites + glEnable(GL_POINT_SPRITE_ARB); + glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE); + glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV); + glPointSize(m_spriteSize); + glBlendFunc(GL_SRC_ALPHA, GL_ONE); + glEnable(GL_BLEND); + glDepthMask(GL_FALSE); - glUseProgram(m_programSprites); - GLuint texLoc = glGetUniformLocation(m_programSprites, "splatTexture"); - glUniform1i(texLoc, 0); + glUseProgram(m_programSprites); + GLuint texLoc = glGetUniformLocation(m_programSprites, "splatTexture"); + glUniform1i(texLoc, 0); - glActiveTexture(GL_TEXTURE0); - 
glBindTexture(GL_TEXTURE_2D, m_texture); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, m_texture); - glColor3f(1, 1, 1); - glSecondaryColor3fv(m_baseColor); + glColor3f(1, 1, 1); + glSecondaryColor3fv(m_baseColor); - _drawPoints(); + _drawPoints(); - glUseProgram(0); + glUseProgram(0); - glDisable(GL_POINT_SPRITE_ARB); - glDisable(GL_BLEND); - glDepthMask(GL_TRUE); + glDisable(GL_POINT_SPRITE_ARB); + glDisable(GL_BLEND); + glDepthMask(GL_TRUE); } break; case PARTICLE_SPRITES_COLOR: { - // setup point sprites - glEnable(GL_POINT_SPRITE_ARB); - glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE); - glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV); - glPointSize(m_spriteSize); - glBlendFunc(GL_SRC_ALPHA, GL_ONE); - glEnable(GL_BLEND); - glDepthMask(GL_FALSE); + // setup point sprites + glEnable(GL_POINT_SPRITE_ARB); + glTexEnvi(GL_POINT_SPRITE_ARB, GL_COORD_REPLACE_ARB, GL_TRUE); + glEnable(GL_VERTEX_PROGRAM_POINT_SIZE_NV); + glPointSize(m_spriteSize); + glBlendFunc(GL_SRC_ALPHA, GL_ONE); + glEnable(GL_BLEND); + glDepthMask(GL_FALSE); - glUseProgram(m_programSprites); - GLuint texLoc = glGetUniformLocation(m_programSprites, "splatTexture"); - glUniform1i(texLoc, 0); + glUseProgram(m_programSprites); + GLuint texLoc = glGetUniformLocation(m_programSprites, "splatTexture"); + glUniform1i(texLoc, 0); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, m_texture); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, m_texture); - glColor3f(1, 1, 1); - glSecondaryColor3fv(m_baseColor); + glColor3f(1, 1, 1); + glSecondaryColor3fv(m_baseColor); - _drawPoints(true); + _drawPoints(true); - glUseProgram(0); + glUseProgram(0); - glDisable(GL_POINT_SPRITE_ARB); - glDisable(GL_BLEND); - glDepthMask(GL_TRUE); + glDisable(GL_POINT_SPRITE_ARB); + glDisable(GL_BLEND); + glDepthMask(GL_TRUE); } break; - } + } - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } -const char vertexShaderPoints[] = { - "void main() \n" - "{ \n" - " vec4 vert = vec4(gl_Vertex.xyz, 1.0); " - " " - " \n" - " gl_Position = gl_ProjectionMatrix * gl_ModelViewMatrix * vert; " - " \n" - " gl_FrontColor = gl_Color; \n" - "} " - "\n"}; +const char vertexShaderPoints[] = {"void main() \n" + "{ \n" + " vec4 vert = vec4(gl_Vertex.xyz, 1.0); " + " " + " \n" + " gl_Position = gl_ProjectionMatrix * gl_ModelViewMatrix * vert; " + " \n" + " gl_FrontColor = gl_Color; \n" + "} " + "\n"}; -const char vertexShader[] = { - "void main() \n" - "{ \n" - " float pointSize = 500.0 * gl_Point.size; \n" - " vec4 vert = gl_Vertex; " - " \n" - " vert.w = 1.0; " - " " - " \n" - " vec3 pos_eye = vec3 (gl_ModelViewMatrix * vert); \n" - " gl_PointSize = max(1.0, pointSize / (1.0 - pos_eye.z)); \n" - " gl_TexCoord[0] = gl_MultiTexCoord0; \n" - //" gl_TexCoord[1] = gl_MultiTexCoord1; \n" - " gl_Position = gl_ProjectionMatrix * gl_ModelViewMatrix * vert; \n" - " gl_FrontColor = gl_Color; \n" - " gl_FrontSecondaryColor = gl_SecondaryColor; \n" - "} " - "\n"}; +const char vertexShader[] = {"void main() \n" + "{ \n" + " float pointSize = 500.0 * gl_Point.size; \n" + " vec4 vert = gl_Vertex; " + " \n" + " vert.w = 1.0; " + " " + " \n" + " vec3 pos_eye = vec3 (gl_ModelViewMatrix * vert); \n" + " gl_PointSize = max(1.0, pointSize / (1.0 - pos_eye.z)); \n" + " gl_TexCoord[0] = gl_MultiTexCoord0; \n" + //" gl_TexCoord[1] = gl_MultiTexCoord1; \n" + " gl_Position = gl_ProjectionMatrix * gl_ModelViewMatrix * vert; \n" + " gl_FrontColor = gl_Color; \n" + " gl_FrontSecondaryColor = gl_SecondaryColor; \n" + "} " + "\n"}; -const char 
pixelShader[] = { - "uniform sampler2D splatTexture; \n" +const char pixelShader[] = {"uniform sampler2D splatTexture; \n" - "void main() \n" - "{ \n" - " vec4 color2 = gl_SecondaryColor; \n" - " vec4 color = (0.6 + 0.4 * gl_Color) * texture2D(splatTexture, " - "gl_TexCoord[0].st); \n" - " gl_FragColor = \n" - " color * color2;\n" // mix(vec4(0.1, 0.0, 0.0, color.w), color2, - // color.w);\n" - "} " - "\n"}; + "void main() \n" + "{ \n" + " vec4 color2 = gl_SecondaryColor; \n" + " vec4 color = (0.6 + 0.4 * gl_Color) * texture2D(splatTexture, " + "gl_TexCoord[0].st); \n" + " gl_FragColor = \n" + " color * color2;\n" // mix(vec4(0.1, 0.0, 0.0, color.w), color2, + // color.w);\n" + "} " + "\n"}; -void ParticleRenderer::_initGL() { - m_vertexShader = glCreateShader(GL_VERTEX_SHADER); - m_vertexShaderPoints = glCreateShader(GL_VERTEX_SHADER); - m_pixelShader = glCreateShader(GL_FRAGMENT_SHADER); +void ParticleRenderer::_initGL() +{ + m_vertexShader = glCreateShader(GL_VERTEX_SHADER); + m_vertexShaderPoints = glCreateShader(GL_VERTEX_SHADER); + m_pixelShader = glCreateShader(GL_FRAGMENT_SHADER); - const char *v = vertexShader; - const char *p = pixelShader; - glShaderSource(m_vertexShader, 1, &v, 0); - glShaderSource(m_pixelShader, 1, &p, 0); - const char *vp = vertexShaderPoints; - glShaderSource(m_vertexShaderPoints, 1, &vp, 0); + const char *v = vertexShader; + const char *p = pixelShader; + glShaderSource(m_vertexShader, 1, &v, 0); + glShaderSource(m_pixelShader, 1, &p, 0); + const char *vp = vertexShaderPoints; + glShaderSource(m_vertexShaderPoints, 1, &vp, 0); - glCompileShader(m_vertexShader); - glCompileShader(m_vertexShaderPoints); - glCompileShader(m_pixelShader); + glCompileShader(m_vertexShader); + glCompileShader(m_vertexShaderPoints); + glCompileShader(m_pixelShader); - m_programSprites = glCreateProgram(); - glAttachShader(m_programSprites, m_vertexShader); - glAttachShader(m_programSprites, m_pixelShader); - glLinkProgram(m_programSprites); + m_programSprites = glCreateProgram(); + glAttachShader(m_programSprites, m_vertexShader); + glAttachShader(m_programSprites, m_pixelShader); + glLinkProgram(m_programSprites); - m_programPoints = glCreateProgram(); - glAttachShader(m_programPoints, m_vertexShaderPoints); - glLinkProgram(m_programPoints); + m_programPoints = glCreateProgram(); + glAttachShader(m_programPoints, m_vertexShaderPoints); + glLinkProgram(m_programPoints); - _createTexture(32); + _createTexture(32); - glGenBuffers(1, (GLuint *)&m_vboColor); - glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); - glBufferData(GL_ARRAY_BUFFER, m_numParticles * 4 * sizeof(float), 0, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); + glGenBuffers(1, (GLuint *)&m_vboColor); + glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); + glBufferData(GL_ARRAY_BUFFER, m_numParticles * 4 * sizeof(float), 0, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); } //------------------------------------------------------------------------------ @@ -315,54 +317,56 @@ void ParticleRenderer::_initGL() { // Description : //------------------------------------------------------------------------------ /** -* EvalHermite(float pA, float pB, float vA, float vB, float u) -* @brief Evaluates Hermite basis functions for the specified coefficients. 
-*/ -inline float evalHermite(float pA, float pB, float vA, float vB, float u) { - float u2 = (u * u), u3 = u2 * u; - float B0 = 2 * u3 - 3 * u2 + 1; - float B1 = -2 * u3 + 3 * u2; - float B2 = u3 - 2 * u2 + u; - float B3 = u3 - u; - return (B0 * pA + B1 * pB + B2 * vA + B3 * vB); + * EvalHermite(float pA, float pB, float vA, float vB, float u) + * @brief Evaluates Hermite basis functions for the specified coefficients. + */ +inline float evalHermite(float pA, float pB, float vA, float vB, float u) +{ + float u2 = (u * u), u3 = u2 * u; + float B0 = 2 * u3 - 3 * u2 + 1; + float B1 = -2 * u3 + 3 * u2; + float B2 = u3 - 2 * u2 + u; + float B3 = u3 - u; + return (B0 * pA + B1 * pB + B2 * vA + B3 * vB); } -unsigned char *createGaussianMap(int N) { - float *M = new float[2 * N * N]; - unsigned char *B = new unsigned char[4 * N * N]; - float X, Y, Y2, Dist; - float Incr = 2.0f / N; - int i = 0; - int j = 0; - Y = -1.0f; +unsigned char *createGaussianMap(int N) +{ + float *M = new float[2 * N * N]; + unsigned char *B = new unsigned char[4 * N * N]; + float X, Y, Y2, Dist; + float Incr = 2.0f / N; + int i = 0; + int j = 0; + Y = -1.0f; - // float mmax = 0; - for (int y = 0; y < N; y++, Y += Incr) { - Y2 = Y * Y; - X = -1.0f; + // float mmax = 0; + for (int y = 0; y < N; y++, Y += Incr) { + Y2 = Y * Y; + X = -1.0f; - for (int x = 0; x < N; x++, X += Incr, i += 2, j += 4) { - Dist = (float)sqrtf(X * X + Y2); + for (int x = 0; x < N; x++, X += Incr, i += 2, j += 4) { + Dist = (float)sqrtf(X * X + Y2); - if (Dist > 1) Dist = 1; + if (Dist > 1) + Dist = 1; - M[i + 1] = M[i] = evalHermite(1.0f, 0, 0, 0, Dist); - B[j + 3] = B[j + 2] = B[j + 1] = B[j] = (unsigned char)(M[i] * 255); + M[i + 1] = M[i] = evalHermite(1.0f, 0, 0, 0, Dist); + B[j + 3] = B[j + 2] = B[j + 1] = B[j] = (unsigned char)(M[i] * 255); + } } - } - delete[] M; - return (B); + delete[] M; + return (B); } -void ParticleRenderer::_createTexture(int resolution) { - unsigned char *data = createGaussianMap(resolution); - glGenTextures(1, (GLuint *)&m_texture); - glBindTexture(GL_TEXTURE_2D, m_texture); - glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP_SGIS, GL_TRUE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, - GL_LINEAR_MIPMAP_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, resolution, resolution, 0, GL_RGBA, - GL_UNSIGNED_BYTE, data); +void ParticleRenderer::_createTexture(int resolution) +{ + unsigned char *data = createGaussianMap(resolution); + glGenTextures(1, (GLuint *)&m_texture); + glBindTexture(GL_TEXTURE_2D, m_texture); + glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP_SGIS, GL_TRUE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, resolution, resolution, 0, GL_RGBA, GL_UNSIGNED_BYTE, data); } diff --git a/Samples/5_Domain_Specific/nbody/render_particles.h b/Samples/5_Domain_Specific/nbody/render_particles.h index 9b60a088..1e4a5744 100644 --- a/Samples/5_Domain_Specific/nbody/render_particles.h +++ b/Samples/5_Domain_Specific/nbody/render_particles.h @@ -28,56 +28,52 @@ #ifndef __RENDER_PARTICLES__ #define __RENDER_PARTICLES__ -class ParticleRenderer { - public: - ParticleRenderer(); - ~ParticleRenderer(); +class ParticleRenderer +{ +public: + ParticleRenderer(); + ~ParticleRenderer(); - void setPositions(float *pos, int numParticles); - void setPositions(double *pos, int numParticles); - void 
setBaseColor(float color[4]);
- void setColors(float *color, int numParticles);
- void setPBO(unsigned int pbo, int numParticles, bool fp64);
+ void setPositions(float *pos, int numParticles);
+ void setPositions(double *pos, int numParticles);
+ void setBaseColor(float color[4]);
+ void setColors(float *color, int numParticles);
+ void setPBO(unsigned int pbo, int numParticles, bool fp64);

- enum DisplayMode {
- PARTICLE_POINTS,
- PARTICLE_SPRITES,
- PARTICLE_SPRITES_COLOR,
- PARTICLE_NUM_MODES
- };
+ enum DisplayMode { PARTICLE_POINTS, PARTICLE_SPRITES, PARTICLE_SPRITES_COLOR, PARTICLE_NUM_MODES };

- void display(DisplayMode mode = PARTICLE_POINTS);
+ void display(DisplayMode mode = PARTICLE_POINTS);

- void setPointSize(float size) { m_pointSize = size; }
- void setSpriteSize(float size) { m_spriteSize = size; }
+ void setPointSize(float size) { m_pointSize = size; }
+ void setSpriteSize(float size) { m_spriteSize = size; }

- void resetPBO();
+ void resetPBO();

- protected: // methods
- void _initGL();
- void _createTexture(int resolution);
- void _drawPoints(bool color = false);
+protected: // methods
+ void _initGL();
+ void _createTexture(int resolution);
+ void _drawPoints(bool color = false);

- protected: // data
- float *m_pos;
- double *m_pos_fp64;
- int m_numParticles;
+protected: // data
+ float *m_pos;
+ double *m_pos_fp64;
+ int m_numParticles;

- float m_pointSize;
- float m_spriteSize;
+ float m_pointSize;
+ float m_spriteSize;

- unsigned int m_vertexShader;
- unsigned int m_vertexShaderPoints;
- unsigned int m_pixelShader;
- unsigned int m_programPoints;
- unsigned int m_programSprites;
- unsigned int m_texture;
- unsigned int m_pbo;
- unsigned int m_vboColor;
+ unsigned int m_vertexShader;
+ unsigned int m_vertexShaderPoints;
+ unsigned int m_pixelShader;
+ unsigned int m_programPoints;
+ unsigned int m_programSprites;
+ unsigned int m_texture;
+ unsigned int m_pbo;
+ unsigned int m_vboColor;

- float m_baseColor[4];
+ float m_baseColor[4];

- bool m_bFp64Positions;
+ bool m_bFp64Positions;
};

-#endif //__ RENDER_PARTICLES__
+#endif //__ RENDER_PARTICLES__
diff --git a/Samples/5_Domain_Specific/nbody/tipsy.h b/Samples/5_Domain_Specific/nbody/tipsy.h
index 99692a9c..fc1faa90 100644
--- a/Samples/5_Domain_Specific/nbody/tipsy.h
+++ b/Samples/5_Domain_Specific/nbody/tipsy.h
@@ -17,11 +17,11 @@ struct gas_particle
 Real rho;
 Real temp;
 Real hsmooth;
- Real metals ;
- Real phi ;
-} ;
+ Real metals;
+ Real phi;
+};

-//struct gas_particle *gas_particles;
+// struct gas_particle *gas_particles;

struct dark_particle
{
@@ -29,45 +29,45 @@ struct dark_particle
 Real pos[MAXDIM];
 Real vel[MAXDIM];
 Real eps;
- int phi ;
-} ;
+ int phi;
+};

-//struct dark_particle *dark_particles;
+// struct dark_particle *dark_particles;

struct star_particle
{
 Real mass;
 Real pos[MAXDIM];
 Real vel[MAXDIM];
- Real metals ;
- Real tform ;
+ Real metals;
+ Real tform;
 Real eps;
- int phi ;
-} ;
+ int phi;
+};

-//struct star_particle *star_particles;
+// struct star_particle *star_particles;

struct dump
{
- double time ;
- int nbodies ;
- int ndim ;
- int nsph ;
- int ndark ;
- int nstar ;
-} ;
+ double time;
+ int nbodies;
+ int ndim;
+ int nsph;
+ int ndark;
+ int nstar;
+};

-typedef struct dump header ;
+typedef struct dump header;

template <typename real4>
-void read_tipsy_file(vector<real4> &bodyPositions,
- vector<real4> &bodyVelocities,
- vector<int> &bodiesIDs,
+void read_tipsy_file(vector<real4> &bodyPositions,
+ vector<real4> &bodyVelocities,
+ vector<int> &bodiesIDs,
 const std::string &fileName,
- int &NTotal,
- int &NFirst,
- int &NSecond,
- int &NThird)
+ int &NTotal,
+ int &NFirst,
+ int &NSecond,
+ int &NThird)
{
 /* Read in our custom version of the tipsy file format written by
@@ -82,59 +82,55 @@ void read_tipsy_file(vector<real4> &bodyPositions,

 ifstream inputFile(fullFileName, ios::in | ios::binary);

- if (!inputFile.is_open())
- {
+ if (!inputFile.is_open()) {
 cout << "Can't open input file \n";
 exit(EXIT_SUCCESS);
 }

- dump h;
+ dump h;
 inputFile.read((char *)&h, sizeof(h));

- int idummy;
+ int idummy;
 real4 positions;
 real4 velocity;

- //Read tipsy header
- NTotal = h.nbodies;
- NFirst = h.ndark;
- NSecond = h.nstar;
- NThird = h.nsph;
+ // Read tipsy header
+ NTotal = h.nbodies;
+ NFirst = h.ndark;
+ NSecond = h.nstar;
+ NThird = h.nsph;

- //Start reading
+ // Start reading
 int particleCount = 0;

 dark_particle d;
 star_particle s;

- for (int i=0; i < NTotal; i++)
- {
- if (i < NFirst)
- {
+ for (int i = 0; i < NTotal; i++) {
+ if (i < NFirst) {
 inputFile.read((char *)&d, sizeof(d));
- velocity.w = d.eps;
- positions.w = d.mass;
- positions.x = d.pos[0];
- positions.y = d.pos[1];
- positions.z = d.pos[2];
- velocity.x = d.vel[0];
- velocity.y = d.vel[1];
- velocity.z = d.vel[2];
- idummy = d.phi;
+ velocity.w = d.eps;
+ positions.w = d.mass;
+ positions.x = d.pos[0];
+ positions.y = d.pos[1];
+ positions.z = d.pos[2];
+ velocity.x = d.vel[0];
+ velocity.y = d.vel[1];
+ velocity.z = d.vel[2];
+ idummy = d.phi;
 }
- else
- {
+ else {
 inputFile.read((char *)&s, sizeof(s));
- velocity.w = s.eps;
- positions.w = s.mass;
- positions.x = s.pos[0];
- positions.y = s.pos[1];
- positions.z = s.pos[2];
- velocity.x = s.vel[0];
- velocity.y = s.vel[1];
- velocity.z = s.vel[2];
- idummy = s.phi;
+ velocity.w = s.eps;
+ positions.w = s.mass;
+ positions.x = s.pos[0];
+ positions.y = s.pos[1];
+ positions.z = s.pos[2];
+ velocity.x = s.vel[0];
+ velocity.y = s.vel[1];
+ velocity.z = s.vel[2];
+ idummy = s.phi;
 }

 bodyPositions.push_back(positions);
@@ -142,18 +138,16 @@ void read_tipsy_file(vector<real4> &bodyPositions,
 bodiesIDs.push_back(idummy);

 particleCount++;
- }//end for
+ } // end for

 // round up to a multiple of 256 bodies since our kernel only supports that...
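// Aside: the round-up below is the usual integer idiom; on inputs that are
// not already a multiple of 256 it is equivalent to the branch-free form
//     newTotal = ((NTotal + 255) / 256) * 256;
// e.g. NTotal = 1000 rounds up to 1024, while NTotal = 1024 is left unchanged.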
int newTotal = NTotal; - if (NTotal % 256) - { + if (NTotal % 256) { newTotal = ((NTotal / 256) + 1) * 256; } - for (int i = NTotal; i < newTotal; i++) - { + for (int i = NTotal; i < newTotal; i++) { positions.w = positions.x = positions.y = positions.z = 0; velocity.x = velocity.y = velocity.z = 0; bodyPositions.push_back(positions); diff --git a/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index ee8fb3ed..29909dea 100644 --- a/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -26,670 +26,674 @@ */ #include -#include - #include #include +#include using namespace std; const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test"; typedef enum { - P2P_WRITE = 0, - P2P_READ = 1, + P2P_WRITE = 0, + P2P_READ = 1, } P2PDataTransfer; typedef enum { - CE = 0, - SM = 1, + CE = 0, + SM = 1, } P2PEngine; -P2PEngine p2p_mechanism = CE; // By default use Copy Engine +P2PEngine p2p_mechanism = CE; // By default use Copy Engine // Macro for checking cuda errors following a cuda launch or api call -#define cudaCheckError() \ - { \ - cudaError_t e = cudaGetLastError(); \ - if (e != cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \ - cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ - } -__global__ void delay(volatile int *flag, - unsigned long long timeout_clocks = 10000000) { - // Wait until the application notifies us that it has completed queuing up the - // experiment, or timeout and exit, allowing the application to make progress - long long int start_clock, sample_clock; - start_clock = clock64(); - - while (!*flag) { - sample_clock = clock64(); - - if (sample_clock - start_clock > timeout_clocks) { - break; +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } +__global__ void delay(volatile int *flag, unsigned long long timeout_clocks = 10000000) +{ + // Wait until the application notifies us that it has completed queuing up the + // experiment, or timeout and exit, allowing the application to make progress + long long int start_clock, sample_clock; + start_clock = clock64(); + + while (!*flag) { + sample_clock = clock64(); + + if (sample_clock - start_clock > timeout_clocks) { + break; + } } - } } // This kernel is for demonstration purposes only, not a performant kernel for // p2p transfers. 
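// Aside: how an SM-initiated peer copy with a kernel like copyp2p is driven
// from the host; the grid sizing mirrors the occupancy query used later in
// this file (dest, src, num_elems and stream are hypothetical names here):
int blockSize = 0;
int numBlocks = 0;
cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
// the int buffers are reinterpreted as int4 so each thread moves 16 bytes
copyp2p<<<numBlocks, blockSize, 0, stream>>>((int4 *)dest, (int4 *)src, num_elems / 4);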
-__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src,
- size_t num_elems) {
- size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
- size_t gridSize = blockDim.x * gridDim.x;
+__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src, size_t num_elems)
+{
+ size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
+ size_t gridSize = blockDim.x * gridDim.x;

#pragma unroll(5)
- for (size_t i = globalId; i < num_elems; i += gridSize) {
- dest[i] = src[i];
- }
+ for (size_t i = globalId; i < num_elems; i += gridSize) {
+ dest[i] = src[i];
+ }
}

///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
-void printHelp(void) {
- printf("Usage: p2pBandwidthLatencyTest [OPTION]...\n");
- printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
- printf("\n");
+void printHelp(void)
+{
+ printf("Usage: p2pBandwidthLatencyTest [OPTION]...\n");
+ printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
+ printf("\n");

- printf("Options:\n");
- printf("--help\t\tDisplay this help menu\n");
- printf(
- "--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "
- "corresponding results.\n \t\tDefault used is P2P write operation.\n");
- printf("--sm_copy Use SM initiated p2p transfers instead of Copy Engine\n");
- printf("--numElems=<NUM_OF_INT_ELEMS> Number of integer elements to be used in p2p copy.\n");
+ printf("Options:\n");
+ printf("--help\t\tDisplay this help menu\n");
+ printf("--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "
+ "corresponding results.\n \t\tDefault used is P2P write operation.\n");
+ printf("--sm_copy Use SM initiated p2p transfers instead of Copy Engine\n");
+ printf("--numElems=<NUM_OF_INT_ELEMS> Number of integer elements to be used in p2p copy.\n");
}

-void checkP2Paccess(int numGPUs) {
- for (int i = 0; i < numGPUs; i++) {
- cudaSetDevice(i);
- cudaCheckError();
-
- for (int j = 0; j < numGPUs; j++) {
- int access;
- if (i != j) {
- cudaDeviceCanAccessPeer(&access, i, j);
- cudaCheckError();
- printf("Device=%d %s Access Peer Device=%d\n", i,
- access ?
"CAN" : "CANNOT", j); - } - } - } - printf( - "\n***NOTE: In case a device doesn't have P2P access to other one, it " - "falls back to normal memcopy procedure.\nSo you can see lesser " - "Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n"); -} - -void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice, - int num_elems, int repeat, bool p2paccess, - cudaStream_t streamToRun) { - int blockSize = 0; - int numBlocks = 0; - - cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p); - cudaCheckError(); - - if (p2p_mechanism == SM && p2paccess) { - for (int r = 0; r < repeat; r++) { - copyp2p<<>>( - (int4 *)dest, (int4 *)src, num_elems / 4); - } - } else { - for (int r = 0; r < repeat; r++) { - cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice, - sizeof(int) * num_elems, streamToRun); - } - } -} - -void outputBandwidthMatrix(int numElems, int numGPUs, bool p2p, P2PDataTransfer p2p_method) { - int repeat = 5; - volatile int *flag = NULL; - vector buffers(numGPUs); - vector buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy - vector start(numGPUs); - vector stop(numGPUs); - vector stream(numGPUs); - - cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); - cudaCheckError(); - - for (int d = 0; d < numGPUs; d++) { - cudaSetDevice(d); - cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); - cudaMalloc(&buffers[d], numElems * sizeof(int)); - cudaCheckError(); - cudaMemset(buffers[d], 0, numElems * sizeof(int)); - cudaCheckError(); - cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); - cudaCheckError(); - cudaMemset(buffersD2D[d], 0, numElems * sizeof(int)); - cudaCheckError(); - cudaEventCreate(&start[d]); - cudaCheckError(); - cudaEventCreate(&stop[d]); - cudaCheckError(); - } - - vector bandwidthMatrix(numGPUs * numGPUs); - - for (int i = 0; i < numGPUs; i++) { - cudaSetDevice(i); - - for (int j = 0; j < numGPUs; j++) { - int access = 0; - if (p2p) { - cudaDeviceCanAccessPeer(&access, i, j); - if (access) { - cudaDeviceEnablePeerAccess(j, 0); - cudaCheckError(); - cudaSetDevice(j); - cudaCheckError(); - cudaDeviceEnablePeerAccess(i, 0); - cudaCheckError(); - cudaSetDevice(i); - cudaCheckError(); - } - } - - cudaStreamSynchronize(stream[i]); - cudaCheckError(); - - // Block the stream until all the work is queued up - // DANGER! - cudaMemcpy*Async may infinitely block waiting for - // room to push the operation, so keep the number of repeatitions - // relatively low. Higher repeatitions will cause the delay kernel - // to timeout and lead to unstable results. 
- *flag = 0; - delay<<<1, 1, 0, stream[i]>>>(flag); - cudaCheckError(); - cudaEventRecord(start[i], stream[i]); - cudaCheckError(); - - if (i == j) { - // Perform intra-GPU, D2D copies - performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, - access, stream[i]); - - } else { - if (p2p_method == P2P_WRITE) { - performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, - stream[i]); - } else { - performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, - stream[i]); - } - } - - cudaEventRecord(stop[i], stream[i]); - cudaCheckError(); - - // Release the queued events - *flag = 1; - cudaStreamSynchronize(stream[i]); - cudaCheckError(); - - float time_ms; - cudaEventElapsedTime(&time_ms, start[i], stop[i]); - double time_s = time_ms / 1e3; - - double gb = numElems * sizeof(int) * repeat / (double)1e9; - if (i == j) { - gb *= 2; // must count both the read and the write here - } - bandwidthMatrix[i * numGPUs + j] = gb / time_s; - if (p2p && access) { - cudaDeviceDisablePeerAccess(j); - cudaSetDevice(j); - cudaDeviceDisablePeerAccess(i); +void checkP2Paccess(int numGPUs) +{ + for (int i = 0; i < numGPUs; i++) { cudaSetDevice(i); cudaCheckError(); - } + + for (int j = 0; j < numGPUs; j++) { + int access; + if (i != j) { + cudaDeviceCanAccessPeer(&access, i, j); + cudaCheckError(); + printf("Device=%d %s Access Peer Device=%d\n", i, access ? "CAN" : "CANNOT", j); + } + } } - } - - printf(" D\\D"); - - for (int j = 0; j < numGPUs; j++) { - printf("%6d ", j); - } - - printf("\n"); - - for (int i = 0; i < numGPUs; i++) { - printf("%6d ", i); - - for (int j = 0; j < numGPUs; j++) { - printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]); - } - - printf("\n"); - } - - for (int d = 0; d < numGPUs; d++) { - cudaSetDevice(d); - cudaFree(buffers[d]); - cudaFree(buffersD2D[d]); - cudaCheckError(); - cudaEventDestroy(start[d]); - cudaCheckError(); - cudaEventDestroy(stop[d]); - cudaCheckError(); - cudaStreamDestroy(stream[d]); - cudaCheckError(); - } - - cudaFreeHost((void *)flag); - cudaCheckError(); + printf("\n***NOTE: In case a device doesn't have P2P access to other one, it " + "falls back to normal memcopy procedure.\nSo you can see lesser " + "Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n"); } -void outputBidirectionalBandwidthMatrix(int numElems, int numGPUs, bool p2p) { - int repeat = 5; - volatile int *flag = NULL; - vector buffers(numGPUs); - vector buffersD2D(numGPUs); - vector start(numGPUs); - vector stop(numGPUs); - vector stream0(numGPUs); - vector stream1(numGPUs); +void performP2PCopy(int *dest, + int destDevice, + int *src, + int srcDevice, + int num_elems, + int repeat, + bool p2paccess, + cudaStream_t streamToRun) +{ + int blockSize = 0; + int numBlocks = 0; - cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); - cudaCheckError(); + cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p); + cudaCheckError(); - for (int d = 0; d < numGPUs; d++) { - cudaSetDevice(d); - cudaMalloc(&buffers[d], numElems * sizeof(int)); - cudaMemset(buffers[d], 0, numElems * sizeof(int)); - cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); - cudaMemset(buffersD2D[d], 0, numElems * sizeof(int)); - cudaCheckError(); - cudaEventCreate(&start[d]); - cudaCheckError(); - cudaEventCreate(&stop[d]); - cudaCheckError(); - cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking); - cudaCheckError(); - cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking); - cudaCheckError(); - } - - vector bandwidthMatrix(numGPUs * numGPUs); 
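// ------------------------------------------------------------------------
// For reference, the bandwidth bookkeeping used by these matrices, worked
// through with the sample's defaults (numElems = 40000000 ints, repeat = 5;
// the 0.05 s timing is purely illustrative):
//
//   bytes per copy = 40000000 * sizeof(int) = 160 MB
//   total moved    = 160 MB * 5 repeats     = 0.8 GB
//   bandwidth      = 0.8 GB / 0.05 s        = 16 GB/s
//
// Intra-GPU (i == j) entries double the byte count because the same memory
// system services both the read and the write; the bidirectional matrix
// doubles it again because two copies run concurrently.
// ------------------------------------------------------------------------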
- - for (int i = 0; i < numGPUs; i++) { - cudaSetDevice(i); - - for (int j = 0; j < numGPUs; j++) { - int access = 0; - if (p2p) { - cudaDeviceCanAccessPeer(&access, i, j); - if (access) { - cudaSetDevice(i); - cudaDeviceEnablePeerAccess(j, 0); - cudaCheckError(); - cudaSetDevice(j); - cudaDeviceEnablePeerAccess(i, 0); - cudaCheckError(); + if (p2p_mechanism == SM && p2paccess) { + for (int r = 0; r < repeat; r++) { + copyp2p<<>>((int4 *)dest, (int4 *)src, num_elems / 4); } - } - - cudaSetDevice(i); - cudaStreamSynchronize(stream0[i]); - cudaStreamSynchronize(stream1[j]); - cudaCheckError(); - - // Block the stream until all the work is queued up - // DANGER! - cudaMemcpy*Async may infinitely block waiting for - // room to push the operation, so keep the number of repeatitions - // relatively low. Higher repeatitions will cause the delay kernel - // to timeout and lead to unstable results. - *flag = 0; - cudaSetDevice(i); - // No need to block stream1 since it'll be blocked on stream0's event - delay<<<1, 1, 0, stream0[i]>>>(flag); - cudaCheckError(); - - // Force stream1 not to start until stream0 does, in order to ensure - // the events on stream0 fully encompass the time needed for all - // operations - cudaEventRecord(start[i], stream0[i]); - cudaStreamWaitEvent(stream1[j], start[i], 0); - - if (i == j) { - // For intra-GPU perform 2 memcopies buffersD2D <-> buffers - performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, - access, stream0[i]); - performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat, - access, stream1[i]); - } else { - if (access && p2p_mechanism == SM) { - cudaSetDevice(j); - } - performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, - stream1[j]); - if (access && p2p_mechanism == SM) { - cudaSetDevice(i); - } - performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, - stream0[i]); - } - - // Notify stream0 that stream1 is complete and record the time of - // the total transaction - cudaEventRecord(stop[j], stream1[j]); - cudaStreamWaitEvent(stream0[i], stop[j], 0); - cudaEventRecord(stop[i], stream0[i]); - - // Release the queued operations - *flag = 1; - cudaStreamSynchronize(stream0[i]); - cudaStreamSynchronize(stream1[j]); - cudaCheckError(); - - float time_ms; - cudaEventElapsedTime(&time_ms, start[i], stop[i]); - double time_s = time_ms / 1e3; - - double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9; - if (i == j) { - gb *= 2; // must count both the read and the write here - } - bandwidthMatrix[i * numGPUs + j] = gb / time_s; - if (p2p && access) { - cudaSetDevice(i); - cudaDeviceDisablePeerAccess(j); - cudaSetDevice(j); - cudaDeviceDisablePeerAccess(i); - } } - } - - printf(" D\\D"); - - for (int j = 0; j < numGPUs; j++) { - printf("%6d ", j); - } - - printf("\n"); - - for (int i = 0; i < numGPUs; i++) { - printf("%6d ", i); - - for (int j = 0; j < numGPUs; j++) { - printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]); + else { + for (int r = 0; r < repeat; r++) { + cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice, sizeof(int) * num_elems, streamToRun); + } } - - printf("\n"); - } - - for (int d = 0; d < numGPUs; d++) { - cudaSetDevice(d); - cudaFree(buffers[d]); - cudaFree(buffersD2D[d]); - cudaCheckError(); - cudaEventDestroy(start[d]); - cudaCheckError(); - cudaEventDestroy(stop[d]); - cudaCheckError(); - cudaStreamDestroy(stream0[d]); - cudaCheckError(); - cudaStreamDestroy(stream1[d]); - cudaCheckError(); - } - - cudaFreeHost((void *)flag); - cudaCheckError(); } -void 
outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) { - int repeat = 100; - int numElems = 4; // perform 1-int4 transfer. - volatile int *flag = NULL; - StopWatchInterface *stopWatch = NULL; - vector buffers(numGPUs); - vector buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy - vector stream(numGPUs); - vector start(numGPUs); - vector stop(numGPUs); +void outputBandwidthMatrix(int numElems, int numGPUs, bool p2p, P2PDataTransfer p2p_method) +{ + int repeat = 5; + volatile int *flag = NULL; + vector buffers(numGPUs); + vector buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy + vector start(numGPUs); + vector stop(numGPUs); + vector stream(numGPUs); - cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); - cudaCheckError(); - - if (!sdkCreateTimer(&stopWatch)) { - printf("Failed to create stop watch\n"); - exit(EXIT_FAILURE); - } - sdkStartTimer(&stopWatch); - - for (int d = 0; d < numGPUs; d++) { - cudaSetDevice(d); - cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); - cudaMalloc(&buffers[d], sizeof(int) * numElems); - cudaMemset(buffers[d], 0, sizeof(int) * numElems); - cudaMalloc(&buffersD2D[d], sizeof(int) * numElems); - cudaMemset(buffersD2D[d], 0, sizeof(int) * numElems); + cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); cudaCheckError(); - cudaEventCreate(&start[d]); - cudaCheckError(); - cudaEventCreate(&stop[d]); - cudaCheckError(); - } - vector gpuLatencyMatrix(numGPUs * numGPUs); - vector cpuLatencyMatrix(numGPUs * numGPUs); - - for (int i = 0; i < numGPUs; i++) { - cudaSetDevice(i); - - for (int j = 0; j < numGPUs; j++) { - int access = 0; - if (p2p) { - cudaDeviceCanAccessPeer(&access, i, j); - if (access) { - cudaDeviceEnablePeerAccess(j, 0); - cudaCheckError(); - cudaSetDevice(j); - cudaDeviceEnablePeerAccess(i, 0); - cudaSetDevice(i); - cudaCheckError(); - } - } - cudaStreamSynchronize(stream[i]); - cudaCheckError(); - - // Block the stream until all the work is queued up - // DANGER! - cudaMemcpy*Async may infinitely block waiting for - // room to push the operation, so keep the number of repeatitions - // relatively low. Higher repeatitions will cause the delay kernel - // to timeout and lead to unstable results. 
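// ------------------------------------------------------------------------
// For reference: every matrix routine in this sample brackets its
// measurement with the same symmetric peer-access pattern, since
// cudaDeviceEnablePeerAccess only grants the current device access to the
// named peer. A minimal sketch, assuming devA and devB are valid device
// ordinals:
void enablePeerPair(int devA, int devB)
{
    int access = 0;
    cudaDeviceCanAccessPeer(&access, devA, devB); // query, never assume
    if (access) {
        cudaSetDevice(devA);
        cudaDeviceEnablePeerAccess(devB, 0); // devA may now map devB memory
        cudaSetDevice(devB);
        cudaDeviceEnablePeerAccess(devA, 0); // and the reverse direction
    }
}

void disablePeerPair(int devA, int devB)
{
    cudaSetDevice(devA);
    cudaDeviceDisablePeerAccess(devB);
    cudaSetDevice(devB);
    cudaDeviceDisablePeerAccess(devA);
}
// ------------------------------------------------------------------------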
- *flag = 0; - delay<<<1, 1, 0, stream[i]>>>(flag); - cudaCheckError(); - cudaEventRecord(start[i], stream[i]); - - sdkResetTimer(&stopWatch); - if (i == j) { - // Perform intra-GPU, D2D copies - performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, - access, stream[i]); - } else { - if (p2p_method == P2P_WRITE) { - performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, - stream[i]); - } else { - performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, - stream[i]); - } - } - float cpu_time_ms = sdkGetTimerValue(&stopWatch); - - cudaEventRecord(stop[i], stream[i]); - // Now that the work has been queued up, release the stream - *flag = 1; - cudaStreamSynchronize(stream[i]); - cudaCheckError(); - - float gpu_time_ms; - cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]); - - gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat; - cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat; - if (p2p && access) { - cudaDeviceDisablePeerAccess(j); - cudaSetDevice(j); - cudaDeviceDisablePeerAccess(i); - cudaSetDevice(i); + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); + cudaMalloc(&buffers[d], numElems * sizeof(int)); cudaCheckError(); - } - } - } - - printf(" GPU"); - - for (int j = 0; j < numGPUs; j++) { - printf("%6d ", j); - } - - printf("\n"); - - for (int i = 0; i < numGPUs; i++) { - printf("%6d ", i); - - for (int j = 0; j < numGPUs; j++) { - printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]); - } - - printf("\n"); - } - - printf("\n CPU"); - - for (int j = 0; j < numGPUs; j++) { - printf("%6d ", j); - } - - printf("\n"); - - for (int i = 0; i < numGPUs; i++) { - printf("%6d ", i); - - for (int j = 0; j < numGPUs; j++) { - printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]); - } - - printf("\n"); - } - - for (int d = 0; d < numGPUs; d++) { - cudaSetDevice(d); - cudaFree(buffers[d]); - cudaFree(buffersD2D[d]); - cudaCheckError(); - cudaEventDestroy(start[d]); - cudaCheckError(); - cudaEventDestroy(stop[d]); - cudaCheckError(); - cudaStreamDestroy(stream[d]); - cudaCheckError(); - } - - sdkDeleteTimer(&stopWatch); - - cudaFreeHost((void *)flag); - cudaCheckError(); -} - -int main(int argc, char **argv) { - int numGPUs, numElems = 40000000; - P2PDataTransfer p2p_method = P2P_WRITE; - - cudaGetDeviceCount(&numGPUs); - cudaCheckError(); - - // process command line args - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printHelp(); - return 0; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) { - p2p_method = P2P_READ; - } - - if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) { - p2p_mechanism = SM; - } - - // number of elements of int to be used in copy. 
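// ------------------------------------------------------------------------
// For reference: the flag handling below (and the options listed in
// printHelp) uses the shared sample helpers checkCmdLineFlag and
// getCmdLineArgumentInt. A minimal sketch of the same pattern in isolation;
// the "repeat" flag is illustrative, and the helpers are assumed to come in
// via the samples' common headers:
static int parseRepeat(int argc, char **argv)
{
    int repeat = 100; // default when --repeat=N is not on the command line
    if (checkCmdLineFlag(argc, (const char **)argv, "repeat")) {
        repeat = getCmdLineArgumentInt(argc, (const char **)argv, "repeat");
    }
    return repeat;
}
// ------------------------------------------------------------------------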
- if (checkCmdLineFlag(argc, (const char **)argv, "numElems")) { - numElems = getCmdLineArgumentInt(argc, (const char **)argv, "numElems"); - } - - printf("[%s]\n", sSampleName); - - // output devices - for (int i = 0; i < numGPUs; i++) { - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, i); - cudaCheckError(); - printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i, - prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID); - } - - checkP2Paccess(numGPUs); - - // Check peer-to-peer connectivity - printf("P2P Connectivity Matrix\n"); - printf(" D\\D"); - - for (int j = 0; j < numGPUs; j++) { - printf("%6d", j); - } - printf("\n"); - - for (int i = 0; i < numGPUs; i++) { - printf("%6d\t", i); - for (int j = 0; j < numGPUs; j++) { - if (i != j) { - int access; - cudaDeviceCanAccessPeer(&access, i, j); + cudaMemset(buffers[d], 0, numElems * sizeof(int)); cudaCheckError(); - printf("%6d", (access) ? 1 : 0); - } else { - printf("%6d", 1); - } + cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); + cudaCheckError(); + cudaMemset(buffersD2D[d], 0, numElems * sizeof(int)); + cudaCheckError(); + cudaEventCreate(&start[d]); + cudaCheckError(); + cudaEventCreate(&stop[d]); + cudaCheckError(); + } + + vector bandwidthMatrix(numGPUs * numGPUs); + + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + + for (int j = 0; j < numGPUs; j++) { + int access = 0; + if (p2p) { + cudaDeviceCanAccessPeer(&access, i, j); + if (access) { + cudaDeviceEnablePeerAccess(j, 0); + cudaCheckError(); + cudaSetDevice(j); + cudaCheckError(); + cudaDeviceEnablePeerAccess(i, 0); + cudaCheckError(); + cudaSetDevice(i); + cudaCheckError(); + } + } + + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + // Block the stream until all the work is queued up + // DANGER! - cudaMemcpy*Async may infinitely block waiting for + // room to push the operation, so keep the number of repeatitions + // relatively low. Higher repeatitions will cause the delay kernel + // to timeout and lead to unstable results. 
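// ------------------------------------------------------------------------
// For reference: "delay" is defined earlier in this file. The idiom is a
// kernel spinning on a host-visible flag (the cudaHostAlloc'd *flag above),
// so that everything queued behind it on the stream launches back-to-back
// once the host flips the flag. A representative sketch, with an
// illustrative timeout so a forgotten flag cannot hang the device:
__global__ void spinOnFlag(volatile int *flag, unsigned long long timeoutClocks)
{
    unsigned long long start = clock64();
    while (*flag == 0) {
        if (clock64() - start > timeoutClocks) {
            break; // give up rather than spin forever
        }
    }
}
// Host side: *flag = 0; spinOnFlag<<<1, 1, 0, stream>>>(flag, 10000000ULL);
// ...enqueue the timed copies on the same stream...; then *flag = 1;
// ------------------------------------------------------------------------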
+ *flag = 0; + delay<<<1, 1, 0, stream[i]>>>(flag); + cudaCheckError(); + cudaEventRecord(start[i], stream[i]); + cudaCheckError(); + + if (i == j) { + // Perform intra-GPU, D2D copies + performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream[i]); + } + else { + if (p2p_method == P2P_WRITE) { + performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream[i]); + } + else { + performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream[i]); + } + } + + cudaEventRecord(stop[i], stream[i]); + cudaCheckError(); + + // Release the queued events + *flag = 1; + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + float time_ms; + cudaEventElapsedTime(&time_ms, start[i], stop[i]); + double time_s = time_ms / 1e3; + + double gb = numElems * sizeof(int) * repeat / (double)1e9; + if (i == j) { + gb *= 2; // must count both the read and the write here + } + bandwidthMatrix[i * numGPUs + j] = gb / time_s; + if (p2p && access) { + cudaDeviceDisablePeerAccess(j); + cudaSetDevice(j); + cudaDeviceDisablePeerAccess(i); + cudaSetDevice(i); + cudaCheckError(); + } + } + } + + printf(" D\\D"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaFree(buffers[d]); + cudaFree(buffersD2D[d]); + cudaCheckError(); + cudaEventDestroy(start[d]); + cudaCheckError(); + cudaEventDestroy(stop[d]); + cudaCheckError(); + cudaStreamDestroy(stream[d]); + cudaCheckError(); + } + + cudaFreeHost((void *)flag); + cudaCheckError(); +} + +void outputBidirectionalBandwidthMatrix(int numElems, int numGPUs, bool p2p) +{ + int repeat = 5; + volatile int *flag = NULL; + vector buffers(numGPUs); + vector buffersD2D(numGPUs); + vector start(numGPUs); + vector stop(numGPUs); + vector stream0(numGPUs); + vector stream1(numGPUs); + + cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); + cudaCheckError(); + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaMalloc(&buffers[d], numElems * sizeof(int)); + cudaMemset(buffers[d], 0, numElems * sizeof(int)); + cudaMalloc(&buffersD2D[d], numElems * sizeof(int)); + cudaMemset(buffersD2D[d], 0, numElems * sizeof(int)); + cudaCheckError(); + cudaEventCreate(&start[d]); + cudaCheckError(); + cudaEventCreate(&stop[d]); + cudaCheckError(); + cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking); + cudaCheckError(); + cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking); + cudaCheckError(); + } + + vector bandwidthMatrix(numGPUs * numGPUs); + + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + + for (int j = 0; j < numGPUs; j++) { + int access = 0; + if (p2p) { + cudaDeviceCanAccessPeer(&access, i, j); + if (access) { + cudaSetDevice(i); + cudaDeviceEnablePeerAccess(j, 0); + cudaCheckError(); + cudaSetDevice(j); + cudaDeviceEnablePeerAccess(i, 0); + cudaCheckError(); + } + } + + cudaSetDevice(i); + cudaStreamSynchronize(stream0[i]); + cudaStreamSynchronize(stream1[j]); + cudaCheckError(); + + // Block the stream until all the work is queued up + // DANGER! - cudaMemcpy*Async may infinitely block waiting for + // room to push the operation, so keep the number of repeatitions + // relatively low. Higher repeatitions will cause the delay kernel + // to timeout and lead to unstable results. 
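// ------------------------------------------------------------------------
// For reference: the cross-stream event chain used below guarantees that the
// start[i]..stop[i] interval recorded on stream0 brackets the work on both
// streams. A minimal sketch of the same idea; busyWork is a placeholder
// kernel, not part of this sample:
__global__ void busyWork() {}

float timeTwoStreams(cudaStream_t s0, cudaStream_t s1)
{
    cudaEvent_t start, mid, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&mid);
    cudaEventCreate(&stop);

    cudaEventRecord(start, s0);        // timestamp on s0
    cudaStreamWaitEvent(s1, start, 0); // s1 may not begin before it
    busyWork<<<1, 1, 0, s0>>>();
    busyWork<<<1, 1, 0, s1>>>();
    cudaEventRecord(mid, s1);          // s1 has finished its work
    cudaStreamWaitEvent(s0, mid, 0);   // s0 waits for s1 before stopping
    cudaEventRecord(stop, s0);         // start..stop now covers both streams

    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(mid);
    cudaEventDestroy(stop);
    return ms;
}
// ------------------------------------------------------------------------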
+ *flag = 0; + cudaSetDevice(i); + // No need to block stream1 since it'll be blocked on stream0's event + delay<<<1, 1, 0, stream0[i]>>>(flag); + cudaCheckError(); + + // Force stream1 not to start until stream0 does, in order to ensure + // the events on stream0 fully encompass the time needed for all + // operations + cudaEventRecord(start[i], stream0[i]); + cudaStreamWaitEvent(stream1[j], start[i], 0); + + if (i == j) { + // For intra-GPU perform 2 memcopies buffersD2D <-> buffers + performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream0[i]); + performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat, access, stream1[i]); + } + else { + if (access && p2p_mechanism == SM) { + cudaSetDevice(j); + } + performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream1[j]); + if (access && p2p_mechanism == SM) { + cudaSetDevice(i); + } + performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream0[i]); + } + + // Notify stream0 that stream1 is complete and record the time of + // the total transaction + cudaEventRecord(stop[j], stream1[j]); + cudaStreamWaitEvent(stream0[i], stop[j], 0); + cudaEventRecord(stop[i], stream0[i]); + + // Release the queued operations + *flag = 1; + cudaStreamSynchronize(stream0[i]); + cudaStreamSynchronize(stream1[j]); + cudaCheckError(); + + float time_ms; + cudaEventElapsedTime(&time_ms, start[i], stop[i]); + double time_s = time_ms / 1e3; + + double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9; + if (i == j) { + gb *= 2; // must count both the read and the write here + } + bandwidthMatrix[i * numGPUs + j] = gb / time_s; + if (p2p && access) { + cudaSetDevice(i); + cudaDeviceDisablePeerAccess(j); + cudaSetDevice(j); + cudaDeviceDisablePeerAccess(i); + } + } + } + + printf(" D\\D"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaFree(buffers[d]); + cudaFree(buffersD2D[d]); + cudaCheckError(); + cudaEventDestroy(start[d]); + cudaCheckError(); + cudaEventDestroy(stop[d]); + cudaCheckError(); + cudaStreamDestroy(stream0[d]); + cudaCheckError(); + cudaStreamDestroy(stream1[d]); + cudaCheckError(); + } + + cudaFreeHost((void *)flag); + cudaCheckError(); +} + +void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) +{ + int repeat = 100; + int numElems = 4; // perform 1-int4 transfer. 
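// ------------------------------------------------------------------------
// For reference, the latency bookkeeping used below: each cell enqueues
// repeat = 100 back-to-back 16-byte copies (4 ints = one int4), so the
// elapsed time is dominated by per-copy overhead rather than data movement.
// Worked through with an illustrative elapsed time of 0.9 ms:
//
//   latency = 0.9 ms * 1e3 / 100 repeats = 9.0 us per copy
// ------------------------------------------------------------------------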
+ volatile int *flag = NULL; + StopWatchInterface *stopWatch = NULL; + vector buffers(numGPUs); + vector buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy + vector stream(numGPUs); + vector start(numGPUs); + vector stop(numGPUs); + + cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable); + cudaCheckError(); + + if (!sdkCreateTimer(&stopWatch)) { + printf("Failed to create stop watch\n"); + exit(EXIT_FAILURE); + } + sdkStartTimer(&stopWatch); + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking); + cudaMalloc(&buffers[d], sizeof(int) * numElems); + cudaMemset(buffers[d], 0, sizeof(int) * numElems); + cudaMalloc(&buffersD2D[d], sizeof(int) * numElems); + cudaMemset(buffersD2D[d], 0, sizeof(int) * numElems); + cudaCheckError(); + cudaEventCreate(&start[d]); + cudaCheckError(); + cudaEventCreate(&stop[d]); + cudaCheckError(); + } + + vector gpuLatencyMatrix(numGPUs * numGPUs); + vector cpuLatencyMatrix(numGPUs * numGPUs); + + for (int i = 0; i < numGPUs; i++) { + cudaSetDevice(i); + + for (int j = 0; j < numGPUs; j++) { + int access = 0; + if (p2p) { + cudaDeviceCanAccessPeer(&access, i, j); + if (access) { + cudaDeviceEnablePeerAccess(j, 0); + cudaCheckError(); + cudaSetDevice(j); + cudaDeviceEnablePeerAccess(i, 0); + cudaSetDevice(i); + cudaCheckError(); + } + } + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + // Block the stream until all the work is queued up + // DANGER! - cudaMemcpy*Async may infinitely block waiting for + // room to push the operation, so keep the number of repeatitions + // relatively low. Higher repeatitions will cause the delay kernel + // to timeout and lead to unstable results. + *flag = 0; + delay<<<1, 1, 0, stream[i]>>>(flag); + cudaCheckError(); + cudaEventRecord(start[i], stream[i]); + + sdkResetTimer(&stopWatch); + if (i == j) { + // Perform intra-GPU, D2D copies + performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream[i]); + } + else { + if (p2p_method == P2P_WRITE) { + performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream[i]); + } + else { + performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream[i]); + } + } + float cpu_time_ms = sdkGetTimerValue(&stopWatch); + + cudaEventRecord(stop[i], stream[i]); + // Now that the work has been queued up, release the stream + *flag = 1; + cudaStreamSynchronize(stream[i]); + cudaCheckError(); + + float gpu_time_ms; + cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]); + + gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat; + cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat; + if (p2p && access) { + cudaDeviceDisablePeerAccess(j); + cudaSetDevice(j); + cudaDeviceDisablePeerAccess(i); + cudaSetDevice(i); + cudaCheckError(); + } + } + } + + printf(" GPU"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + printf("\n CPU"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d ", j); + } + + printf("\n"); + + for (int i = 0; i < numGPUs; i++) { + printf("%6d ", i); + + for (int j = 0; j < numGPUs; j++) { + printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]); + } + + printf("\n"); + } + + for (int d = 0; d < numGPUs; d++) { + cudaSetDevice(d); + cudaFree(buffers[d]); + cudaFree(buffersD2D[d]); + 
cudaCheckError(); + cudaEventDestroy(start[d]); + cudaCheckError(); + cudaEventDestroy(stop[d]); + cudaCheckError(); + cudaStreamDestroy(stream[d]); + cudaCheckError(); + } + + sdkDeleteTimer(&stopWatch); + + cudaFreeHost((void *)flag); + cudaCheckError(); +} + +int main(int argc, char **argv) +{ + int numGPUs, numElems = 40000000; + P2PDataTransfer p2p_method = P2P_WRITE; + + cudaGetDeviceCount(&numGPUs); + cudaCheckError(); + + // process command line args + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printHelp(); + return 0; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) { + p2p_method = P2P_READ; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) { + p2p_mechanism = SM; + } + + // number of elements of int to be used in copy. + if (checkCmdLineFlag(argc, (const char **)argv, "numElems")) { + numElems = getCmdLineArgumentInt(argc, (const char **)argv, "numElems"); + } + + printf("[%s]\n", sSampleName); + + // output devices + for (int i = 0; i < numGPUs; i++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + cudaCheckError(); + printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", + i, + prop.name, + prop.pciBusID, + prop.pciDeviceID, + prop.pciDomainID); + } + + checkP2Paccess(numGPUs); + + // Check peer-to-peer connectivity + printf("P2P Connectivity Matrix\n"); + printf(" D\\D"); + + for (int j = 0; j < numGPUs; j++) { + printf("%6d", j); } printf("\n"); - } - printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n"); - outputBandwidthMatrix(numElems, numGPUs, false, P2P_WRITE); - printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n"); - outputBandwidthMatrix(numElems, numGPUs, true, P2P_WRITE); - if (p2p_method == P2P_READ) { - printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n"); - outputBandwidthMatrix(numElems, numGPUs, true, p2p_method); - } - printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n"); - outputBidirectionalBandwidthMatrix(numElems, numGPUs, false); - printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n"); - outputBidirectionalBandwidthMatrix(numElems, numGPUs, true); + for (int i = 0; i < numGPUs; i++) { + printf("%6d\t", i); + for (int j = 0; j < numGPUs; j++) { + if (i != j) { + int access; + cudaDeviceCanAccessPeer(&access, i, j); + cudaCheckError(); + printf("%6d", (access) ? 
1 : 0); + } + else { + printf("%6d", 1); + } + } + printf("\n"); + } - printf("P2P=Disabled Latency Matrix (us)\n"); - outputLatencyMatrix(numGPUs, false, P2P_WRITE); - printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n"); - outputLatencyMatrix(numGPUs, true, P2P_WRITE); - if (p2p_method == P2P_READ) { - printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n"); - outputLatencyMatrix(numGPUs, true, p2p_method); - } + printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n"); + outputBandwidthMatrix(numElems, numGPUs, false, P2P_WRITE); + printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n"); + outputBandwidthMatrix(numElems, numGPUs, true, P2P_WRITE); + if (p2p_method == P2P_READ) { + printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n"); + outputBandwidthMatrix(numElems, numGPUs, true, p2p_method); + } + printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n"); + outputBidirectionalBandwidthMatrix(numElems, numGPUs, false); + printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n"); + outputBidirectionalBandwidthMatrix(numElems, numGPUs, true); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); + printf("P2P=Disabled Latency Matrix (us)\n"); + outputLatencyMatrix(numGPUs, false, P2P_WRITE); + printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n"); + outputLatencyMatrix(numGPUs, true, P2P_WRITE); + if (p2p_method == P2P_READ) { + printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n"); + outputLatencyMatrix(numGPUs, true, p2p_method); + } - exit(EXIT_SUCCESS); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n"); + + exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/postProcessGL/README.md b/Samples/5_Domain_Specific/postProcessGL/README.md index 4abab5d2..effb247f 100644 --- a/Samples/5_Domain_Specific/postProcessGL/README.md +++ b/Samples/5_Domain_Specific/postProcessGL/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/5_Domain_Specific/postProcessGL/main.cpp b/Samples/5_Domain_Specific/postProcessGL/main.cpp index 0e69b7a2..35b46035 100644 --- a/Samples/5_Domain_Specific/postProcessGL/main.cpp +++ b/Samples/5_Domain_Specific/postProcessGL/main.cpp @@ -81,18 +81,17 @@ #endif // CUDA includes -#include #include +#include // CUDA utilities and system includes #include - #include #include // Shared Library Test Functions -#define MAX_EPSILON 10 -#define REFRESH_DELAY 10 // ms +#define MAX_EPSILON 10 +#define REFRESH_DELAY 10 // ms const char *sSDKname = "postProcessGL"; @@ -103,24 +102,24 @@ CheckRender *g_CheckRender = NULL; //////////////////////////////////////////////////////////////////////////////// // constants / global variables -unsigned int window_width = 512; -unsigned int window_height = 512; -unsigned int image_width = 512; -unsigned int image_height = 512; -int iGLUTWindowHandle = 0; // handle to the GLUT window +unsigned int window_width = 512; +unsigned int window_height = 512; +unsigned int image_width = 512; +unsigned int image_height = 512; +int iGLUTWindowHandle = 0; // handle to the GLUT window // pbo and fbo variables #ifdef USE_TEXSUBIMAGE2D -GLuint pbo_dest; +GLuint pbo_dest; struct cudaGraphicsResource *cuda_pbo_dest_resource; #else -unsigned int *cuda_dest_resource; -GLuint shDrawTex; // draws a texture +unsigned int *cuda_dest_resource; +GLuint shDrawTex; // draws a texture struct cudaGraphicsResource *cuda_tex_result_resource; #endif extern cudaTextureObject_t inTexObject; -GLuint fbo_source; +GLuint fbo_source; struct cudaGraphicsResource *cuda_tex_screen_resource; unsigned int size_tex_data; @@ -128,26 +127,26 @@ unsigned int num_texels; unsigned int num_values; // (offscreen) render target fbo variables -GLuint framebuffer; // to bind the proper targets -GLuint depth_buffer; // for proper depth test while rendering the scene -GLuint tex_screen; // where we render the image -GLuint tex_cudaResult; // where we will copy the CUDA result +GLuint framebuffer; // to bind the proper targets +GLuint depth_buffer; // for proper depth test while rendering the scene +GLuint tex_screen; // where we render the image +GLuint tex_cudaResult; // where we will copy the CUDA result float rotate[3]; -char *ref_file = NULL; -bool enable_cuda = true; -bool animate = true; -int blur_radius = 8; -int max_blur_radius = 16; +char *ref_file = NULL; +bool enable_cuda = true; +bool animate = true; +int blur_radius = 8; +int max_blur_radius = 16; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; // Timer -static int fpsCount = 0; -static int fpsLimit = 1; -StopWatchInterface *timer = NULL; +static int fpsCount = 0; +static int fpsLimit = 1; +StopWatchInterface *timer = NULL; #ifndef USE_TEXTURE_RGBA8UI #pragma message("Note: Using Texture fmt GL_RGBA16F_ARB") @@ -161,13 +160,20 @@ StopWatchInterface *timer = NULL; // output from 0-1 to 0-255. 
This is why we have some GLSL code, in this case #pragma message("Note: Using Texture RGBA8UI + GLSL for teapot rendering") #endif -GLuint shDrawPot; // colors the teapot +GLuint shDrawPot; // colors the teapot //////////////////////////////////////////////////////////////////////////////// -extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, - cudaArray *g_data, unsigned int *g_odata, - int imgw, int imgh, int tilew, int radius, - float threshold, float highlight); +extern "C" void launch_cudaProcess(dim3 grid, + dim3 block, + int sbytes, + cudaArray *g_data, + unsigned int *g_odata, + int imgw, + int imgh, + int tilew, + int radius, + float threshold, + float highlight); // Forward declarations void runStdProgram(int argc, char **argv); @@ -182,10 +188,8 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource); void deletePBO(GLuint *pbo); #endif -void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, - unsigned int size_y); -void createTextureSrc(GLuint *tex_screen, unsigned int size_x, - unsigned int size_y); +void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y); +void createTextureSrc(GLuint *tex_screen, unsigned int size_x, unsigned int size_y); void deleteTexture(GLuint *tex); void createDepthBuffer(GLuint *depth, unsigned int size_x, unsigned int size_y); void deleteDepthBuffer(GLuint *depth); @@ -202,98 +206,96 @@ void mainMenu(int i); //////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void process(int width, int height, int radius) { - cudaArray *in_array; - unsigned int *out_data; +void process(int width, int height, int radius) +{ + cudaArray *in_array; + unsigned int *out_data; #ifdef USE_TEXSUBIMAGE2D - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&out_data, &num_bytes, cuda_pbo_dest_resource)); + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&out_data, &num_bytes, cuda_pbo_dest_resource)); // printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n", // num_bytes, size_tex_data); #else - out_data = cuda_dest_resource; + out_data = cuda_dest_resource; #endif - // map buffer objects to get CUDA device pointers - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_screen_resource, 0)); - // printf("Mapping tex_in\n"); - checkCudaErrors(cudaGraphicsSubResourceGetMappedArray( - &in_array, cuda_tex_screen_resource, 0, 0)); + // map buffer objects to get CUDA device pointers + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_screen_resource, 0)); + // printf("Mapping tex_in\n"); + checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&in_array, cuda_tex_screen_resource, 0, 0)); - // calculate grid size - dim3 block(16, 16, 1); - // dim3 block(16, 16, 1); - dim3 grid(width / block.x, height / block.y, 1); - int sbytes = (block.x + (2 * radius)) * (block.y + (2 * radius)) * - sizeof(unsigned int); + // calculate grid size + dim3 block(16, 16, 1); + // dim3 block(16, 16, 1); + dim3 grid(width / block.x, height / block.y, 1); + int sbytes = (block.x + (2 * radius)) * (block.y + (2 * radius)) * sizeof(unsigned int); - // execute CUDA kernel - launch_cudaProcess(grid, block, sbytes, in_array, out_data, 
width, height, - block.x + (2 * radius), radius, 0.8f, 4.0f); + // execute CUDA kernel + launch_cudaProcess( + grid, block, sbytes, in_array, out_data, width, height, block.x + (2 * radius), radius, 0.8f, 4.0f); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_screen_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_screen_resource, 0)); #ifdef USE_TEXSUBIMAGE2D - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_dest_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_dest_resource, 0)); #endif - checkCudaErrors(cudaDestroyTextureObject(inTexObject)); + checkCudaErrors(cudaDestroyTextureObject(inTexObject)); } #ifdef USE_TEXSUBIMAGE2D //////////////////////////////////////////////////////////////////////////////// //! Create PBO //////////////////////////////////////////////////////////////////////////////// -void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) { - // set up vertex data parameter - num_texels = image_width * image_height; - num_values = num_texels * 4; - size_tex_data = sizeof(GLubyte) * num_values; - void *data = malloc(size_tex_data); +void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) +{ + // set up vertex data parameter + num_texels = image_width * image_height; + num_values = num_texels * 4; + size_tex_data = sizeof(GLubyte) * num_values; + void *data = malloc(size_tex_data); - // create buffer object - glGenBuffers(1, pbo); - glBindBuffer(GL_ARRAY_BUFFER, *pbo); - glBufferData(GL_ARRAY_BUFFER, size_tex_data, data, GL_DYNAMIC_DRAW); - free(data); + // create buffer object + glGenBuffers(1, pbo); + glBindBuffer(GL_ARRAY_BUFFER, *pbo); + glBufferData(GL_ARRAY_BUFFER, size_tex_data, data, GL_DYNAMIC_DRAW); + free(data); - glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindBuffer(GL_ARRAY_BUFFER, 0); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, - cudaGraphicsMapFlagsNone)); + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, cudaGraphicsMapFlagsNone)); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } -void deletePBO(GLuint *pbo) { - glDeleteBuffers(1, pbo); - SDK_CHECK_ERROR_GL(); - *pbo = 0; +void deletePBO(GLuint *pbo) +{ + glDeleteBuffers(1, pbo); + SDK_CHECK_ERROR_GL(); + *pbo = 0; } #endif -const GLenum fbo_targets[] = { - GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT, - GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT}; +const GLenum fbo_targets[] = {GL_COLOR_ATTACHMENT0_EXT, + GL_COLOR_ATTACHMENT1_EXT, + GL_COLOR_ATTACHMENT2_EXT, + GL_COLOR_ATTACHMENT3_EXT}; #ifndef USE_TEXSUBIMAGE2D -static const char *glsl_drawtex_vertshader_src = - "void main(void)\n" - "{\n" - " gl_Position = gl_Vertex;\n" - " gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n" - "}\n"; +static const char *glsl_drawtex_vertshader_src = "void main(void)\n" + "{\n" + " gl_Position = gl_Vertex;\n" + " gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n" + "}\n"; -static const char *glsl_drawtex_fragshader_src = - "#version 130\n" - "uniform usampler2D texImage;\n" - "void main()\n" - "{\n" - " vec4 c = texture(texImage, gl_TexCoord[0].xy);\n" - " gl_FragColor = c / 255.0;\n" - "}\n"; +static const char *glsl_drawtex_fragshader_src = "#version 130\n" + "uniform usampler2D texImage;\n" + "void main()\n" + "{\n" + " vec4 c = texture(texImage, gl_TexCoord[0].xy);\n" + " gl_FragColor = c / 255.0;\n" + "}\n"; #endif static const char *glsl_drawpot_fragshader_src = @@ -320,40 +322,42 @@ static const 
char *glsl_drawpot_fragshader_src = //////////////////////////////////////////////////////////////////////////////// //! render a simple 3D scene //////////////////////////////////////////////////////////////////////////////// -void renderScene(bool colorScale) { - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); +void renderScene(bool colorScale) +{ + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + if (colorScale) { + glUseProgram(shDrawPot); + glBindFragDataLocationEXT(shDrawPot, 0, "FragColor"); + SDK_CHECK_ERROR_GL(); + } + + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + glTranslatef(0.0, 0.0, -3.0); + glRotatef(rotate[0], 1.0, 0.0, 0.0); + glRotatef(rotate[1], 0.0, 1.0, 0.0); + glRotatef(rotate[2], 0.0, 0.0, 1.0); + + glViewport(0, 0, 512, 512); + + glEnable(GL_LIGHTING); + glEnable(GL_DEPTH_TEST); + + glutSolidTeapot(1.0); + + if (colorScale) { + glUseProgram(0); + } - if (colorScale) { - glUseProgram(shDrawPot); - glBindFragDataLocationEXT(shDrawPot, 0, "FragColor"); SDK_CHECK_ERROR_GL(); - } - - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - glTranslatef(0.0, 0.0, -3.0); - glRotatef(rotate[0], 1.0, 0.0, 0.0); - glRotatef(rotate[1], 0.0, 1.0, 0.0); - glRotatef(rotate[2], 0.0, 0.0, 1.0); - - glViewport(0, 0, 512, 512); - - glEnable(GL_LIGHTING); - glEnable(GL_DEPTH_TEST); - - glutSolidTeapot(1.0); - - if (colorScale) { - glUseProgram(0); - } - - SDK_CHECK_ERROR_GL(); } // copy image and process using CUDA -void processImage() { - // run the Cuda kernel - process(image_width, image_height, blur_radius); +void processImage() +{ + // run the Cuda kernel + process(image_width, image_height, blur_radius); // CUDA generated data in cuda memory or in a mapped PBO made of BGRA 8 bits // 2 solutions, here : @@ -361,217 +365,218 @@ void processImage() { // possible hidden conversion // - map the texture and blit the result thanks to CUDA API #ifdef USE_TEXSUBIMAGE2D - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest); - glBindTexture(GL_TEXTURE_2D, tex_cudaResult); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - SDK_CHECK_ERROR_GL(); - glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + glBindTexture(GL_TEXTURE_2D, tex_cudaResult); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + SDK_CHECK_ERROR_GL(); + glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); #else - // We want to copy cuda_dest_resource data to the texture - // map buffer objects to get CUDA device pointers - cudaArray *texture_ptr; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0)); - checkCudaErrors(cudaGraphicsSubResourceGetMappedArray( - &texture_ptr, cuda_tex_result_resource, 0, 0)); + // We want to copy cuda_dest_resource data to the texture + // map buffer objects to get CUDA device pointers + cudaArray *texture_ptr; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0)); + checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&texture_ptr, cuda_tex_result_resource, 0, 0)); - int num_texels = image_width * image_height; - int num_values = num_texels * 4; - int size_tex_data = sizeof(GLubyte) * num_values; - checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, - size_tex_data, cudaMemcpyDeviceToDevice)); + int num_texels = image_width * image_height; + int num_values = num_texels * 4; + int 
size_tex_data = sizeof(GLubyte) * num_values; + checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice)); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0)); #endif } // display image to the screen as textured quad -void displayImage(GLuint texture) { - glBindTexture(GL_TEXTURE_2D, texture); - glEnable(GL_TEXTURE_2D); - glDisable(GL_DEPTH_TEST); - glDisable(GL_LIGHTING); - glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); +void displayImage(GLuint texture) +{ + glBindTexture(GL_TEXTURE_2D, texture); + glEnable(GL_TEXTURE_2D); + glDisable(GL_DEPTH_TEST); + glDisable(GL_LIGHTING); + glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE); - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadIdentity(); - glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0); + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glViewport(0, 0, window_width, window_height); + glViewport(0, 0, window_width, window_height); // if the texture is a 8 bits UI, scale the fetch with a GLSL shader #ifndef USE_TEXSUBIMAGE2D - glUseProgram(shDrawTex); - GLint id = glGetUniformLocation(shDrawTex, "texImage"); - glUniform1i(id, 0); // texture unit 0 to "texImage" - SDK_CHECK_ERROR_GL(); + glUseProgram(shDrawTex); + GLint id = glGetUniformLocation(shDrawTex, "texImage"); + glUniform1i(id, 0); // texture unit 0 to "texImage" + SDK_CHECK_ERROR_GL(); #endif - glBegin(GL_QUADS); - glTexCoord2f(0.0, 0.0); - glVertex3f(-1.0, -1.0, 0.5); - glTexCoord2f(1.0, 0.0); - glVertex3f(1.0, -1.0, 0.5); - glTexCoord2f(1.0, 1.0); - glVertex3f(1.0, 1.0, 0.5); - glTexCoord2f(0.0, 1.0); - glVertex3f(-1.0, 1.0, 0.5); - glEnd(); + glBegin(GL_QUADS); + glTexCoord2f(0.0, 0.0); + glVertex3f(-1.0, -1.0, 0.5); + glTexCoord2f(1.0, 0.0); + glVertex3f(1.0, -1.0, 0.5); + glTexCoord2f(1.0, 1.0); + glVertex3f(1.0, 1.0, 0.5); + glTexCoord2f(0.0, 1.0); + glVertex3f(-1.0, 1.0, 0.5); + glEnd(); - glMatrixMode(GL_PROJECTION); - glPopMatrix(); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); - glDisable(GL_TEXTURE_2D); + glDisable(GL_TEXTURE_2D); #ifndef USE_TEXSUBIMAGE2D - glUseProgram(0); + glUseProgram(0); #endif - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - if (enable_cuda) { - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, framebuffer); + if (enable_cuda) { + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, framebuffer); #ifndef USE_TEXTURE_RGBA8UI - renderScene(false); + renderScene(false); #else - renderScene(true); // output of fragment * by 255 (for RGBA8UI texture) + renderScene(true); // output of fragment * by 255 (for RGBA8UI texture) #endif - processImage(); - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); - displayImage(tex_cudaResult); - } else { - renderScene(false); - } - - // NOTE: I needed to add this call so the timing is consistent. 
- // Need to investigate why - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - - // flip backbuffer - glutSwapBuffers(); - - // If specified, Check rendering against reference, - if (ref_file && g_CheckRender && g_CheckRender->IsQAReadback()) { - static int pass = 0; - - if (pass > 0) { - g_CheckRender->readback(window_width, window_height); - char currentOutputPPM[256]; - sprintf(currentOutputPPM, "teapot_%d.ppm", blur_radius); - g_CheckRender->savePPM(currentOutputPPM, true, NULL); - - if (!g_CheckRender->PPMvsPPM(currentOutputPPM, - sdkFindFilePath(ref_file, pArgv[0]), - MAX_EPSILON, 0.30f)) { - g_TotalErrors++; - } - - Cleanup((g_TotalErrors == 0) ? EXIT_SUCCESS : EXIT_FAILURE); + processImage(); + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); + displayImage(tex_cudaResult); + } + else { + renderScene(false); } - pass++; - } + // NOTE: I needed to add this call so the timing is consistent. + // Need to investigate why + cudaDeviceSynchronize(); + sdkStopTimer(&timer); - // Update fps counter, fps/title display and log - if (++fpsCount == fpsLimit) { - char cTitle[256]; - float fps = 1000.0f / sdkGetAverageTimerValue(&timer); - sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, - window_height, fps); - glutSetWindowTitle(cTitle); - // printf("%s\n", cTitle); - fpsCount = 0; - fpsLimit = (int)((fps > 1.0f) ? fps : 1.0f); - sdkResetTimer(&timer); - } + // flip backbuffer + glutSwapBuffers(); + + // If specified, Check rendering against reference, + if (ref_file && g_CheckRender && g_CheckRender->IsQAReadback()) { + static int pass = 0; + + if (pass > 0) { + g_CheckRender->readback(window_width, window_height); + char currentOutputPPM[256]; + sprintf(currentOutputPPM, "teapot_%d.ppm", blur_radius); + g_CheckRender->savePPM(currentOutputPPM, true, NULL); + + if (!g_CheckRender->PPMvsPPM(currentOutputPPM, sdkFindFilePath(ref_file, pArgv[0]), MAX_EPSILON, 0.30f)) { + g_TotalErrors++; + } + + Cleanup((g_TotalErrors == 0) ? EXIT_SUCCESS : EXIT_FAILURE); + } + + pass++; + } + + // Update fps counter, fps/title display and log + if (++fpsCount == fpsLimit) { + char cTitle[256]; + float fps = 1000.0f / sdkGetAverageTimerValue(&timer); + sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, window_height, fps); + glutSetWindowTitle(cTitle); + // printf("%s\n", cTitle); + fpsCount = 0; + fpsLimit = (int)((fps > 1.0f) ? fps : 1.0f); + sdkResetTimer(&timer); + } } -void timerEvent(int value) { - if (animate) { - rotate[0] += 0.2f; +void timerEvent(int value) +{ + if (animate) { + rotate[0] += 0.2f; - if (rotate[0] > 360.0f) { - rotate[0] -= 360.0f; + if (rotate[0] > 360.0f) { + rotate[0] -= 360.0f; + } + + rotate[1] += 0.6f; + + if (rotate[1] > 360.0f) { + rotate[1] -= 360.0f; + } + + rotate[2] += 1.0f; + + if (rotate[2] > 360.0f) { + rotate[2] -= 360.0f; + } } - rotate[1] += 0.6f; - - if (rotate[1] > 360.0f) { - rotate[1] -= 360.0f; - } - - rotate[2] += 1.0f; - - if (rotate[2] > 360.0f) { - rotate[2] -= 360.0f; - } - } - - glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); } //////////////////////////////////////////////////////////////////////////////// //! 
Keyboard events handler //////////////////////////////////////////////////////////////////////////////// -void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case (27): - Cleanup(EXIT_SUCCESS); - break; + Cleanup(EXIT_SUCCESS); + break; case ' ': - enable_cuda ^= 1; + enable_cuda ^= 1; #ifdef USE_TEXTURE_RGBA8UI - if (enable_cuda) { - glClearColorIuiEXT(128, 128, 128, 255); - } else { - glClearColor(0.5, 0.5, 0.5, 1.0); - } + if (enable_cuda) { + glClearColorIuiEXT(128, 128, 128, 255); + } + else { + glClearColor(0.5, 0.5, 0.5, 1.0); + } #endif - break; + break; case 'a': - animate ^= 1; - break; + animate ^= 1; + break; case '=': case '+': - if (blur_radius < 16) { - blur_radius++; - } + if (blur_radius < 16) { + blur_radius++; + } - printf("radius = %d\n", blur_radius); - break; + printf("radius = %d\n", blur_radius); + break; case '-': - if (blur_radius > 1) { - blur_radius--; - } + if (blur_radius > 1) { + blur_radius--; + } - printf("radius = %d\n", blur_radius); - break; - } + printf("radius = %d\n", blur_radius); + break; + } } -void reshape(int w, int h) { - window_width = w; - window_height = h; +void reshape(int w, int h) +{ + window_width = w; + window_height = h; } void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); } @@ -579,92 +584,86 @@ void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void createTextureSrc(GLuint *tex_screen, unsigned int size_x, - unsigned int size_y) { - // create a texture - glGenTextures(1, tex_screen); - glBindTexture(GL_TEXTURE_2D, *tex_screen); +void createTextureSrc(GLuint *tex_screen, unsigned int size_x, unsigned int size_y) +{ + // create a texture + glGenTextures(1, tex_screen); + glBindTexture(GL_TEXTURE_2D, *tex_screen); - // set basic parameters - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + // set basic parameters + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); // buffer data #ifndef USE_TEXTURE_RGBA8UI - printf("Creating a Texture render target GL_RGBA16F_ARB\n"); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, size_x, size_y, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); + printf("Creating a Texture render target GL_RGBA16F_ARB\n"); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); #else - printf("Creating a Texture render target GL_RGBA8UI_EXT\n"); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, - GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL); + printf("Creating a Texture render target GL_RGBA8UI_EXT\n"); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL); #endif - SDK_CHECK_ERROR_GL(); - // register this texture with CUDA - checkCudaErrors(cudaGraphicsGLRegisterImage(&cuda_tex_screen_resource, - *tex_screen, GL_TEXTURE_2D, - cudaGraphicsMapFlagsReadOnly)); + SDK_CHECK_ERROR_GL(); + 
// register this texture with CUDA + checkCudaErrors(cudaGraphicsGLRegisterImage( + &cuda_tex_screen_resource, *tex_screen, GL_TEXTURE_2D, cudaGraphicsMapFlagsReadOnly)); } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, - unsigned int size_y) { - // create a texture - glGenTextures(1, tex_cudaResult); - glBindTexture(GL_TEXTURE_2D, *tex_cudaResult); +void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y) +{ + // create a texture + glGenTextures(1, tex_cudaResult); + glBindTexture(GL_TEXTURE_2D, *tex_cudaResult); - // set basic parameters - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + // set basic parameters + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); #ifdef USE_TEXSUBIMAGE2D - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - SDK_CHECK_ERROR_GL(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + SDK_CHECK_ERROR_GL(); #else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, - GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL); - SDK_CHECK_ERROR_GL(); - // register this texture with CUDA - checkCudaErrors(cudaGraphicsGLRegisterImage( - &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, - cudaGraphicsMapFlagsWriteDiscard)); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL); + SDK_CHECK_ERROR_GL(); + // register this texture with CUDA + checkCudaErrors(cudaGraphicsGLRegisterImage( + &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard)); #endif } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void deleteTexture(GLuint *tex) { - glDeleteTextures(1, tex); - SDK_CHECK_ERROR_GL(); +void deleteTexture(GLuint *tex) +{ + glDeleteTextures(1, tex); + SDK_CHECK_ERROR_GL(); - *tex = 0; + *tex = 0; } //////////////////////////////////////////////////////////////////////////////// //! 
//////////////////////////////////////////////////////////////////////////////// -void createDepthBuffer(GLuint *depth, unsigned int size_x, - unsigned int size_y) { - // create a renderbuffer - glGenRenderbuffersEXT(1, depth); - glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, *depth); +void createDepthBuffer(GLuint *depth, unsigned int size_x, unsigned int size_y) +{ + // create a renderbuffer + glGenRenderbuffersEXT(1, depth); + glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, *depth); - // allocate storage - glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_DEPTH_COMPONENT24, size_x, - size_y); + // allocate storage + glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_DEPTH_COMPONENT24, size_x, size_y); - // clean up - glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0); + // clean up + glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// @@ -693,359 +692,364 @@ void createDepthBuffer(GLuint *depth, unsigned int size_x, //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void deleteDepthBuffer(GLuint *depth) { - glDeleteRenderbuffersEXT(1, depth); - SDK_CHECK_ERROR_GL(); +void deleteDepthBuffer(GLuint *depth) +{ + glDeleteRenderbuffersEXT(1, depth); + SDK_CHECK_ERROR_GL(); - *depth = 0; + *depth = 0; } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -void createFramebuffer(GLuint *fbo, GLuint color, GLuint depth) { - // create and bind a framebuffer - glGenFramebuffersEXT(1, fbo); - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, *fbo); +void createFramebuffer(GLuint *fbo, GLuint color, GLuint depth) +{ + // create and bind a framebuffer + glGenFramebuffersEXT(1, fbo); + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, *fbo); - // attach images - glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, - GL_TEXTURE_2D, color, 0); - // glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, - // GL_RENDERBUFFER_EXT, color); - glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_DEPTH_ATTACHMENT_EXT, - GL_RENDERBUFFER_EXT, depth); + // attach images + glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, color, 0); + // glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, + // GL_RENDERBUFFER_EXT, color); + glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_DEPTH_ATTACHMENT_EXT, GL_RENDERBUFFER_EXT, depth); - // clean up - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); + // clean up + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// //! 
//////////////////////////////////////////////////////////////////////////////// -void deleteFramebuffer(GLuint *fbo) { - glDeleteFramebuffersEXT(1, fbo); - SDK_CHECK_ERROR_GL(); +void deleteFramebuffer(GLuint *fbo) +{ + glDeleteFramebuffersEXT(1, fbo); + SDK_CHECK_ERROR_GL(); - *fbo = 0; + *fbo = 0; } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ #if defined(__linux__) - char *Xstatus = getenv("DISPLAY"); - if (Xstatus == NULL) { - printf("Waiving execution as X server is not running\n"); - exit(EXIT_WAIVED); - } - setenv("DISPLAY", ":0", 0); + char *Xstatus = getenv("DISPLAY"); + if (Xstatus == NULL) { + printf("Waiving execution as X server is not running\n"); + exit(EXIT_WAIVED); + } + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", argv[0]); + printf("%s Starting...\n\n", argv[0]); - if (checkCmdLineFlag(argc, (const char **)argv, "radius") && - checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - blur_radius = getCmdLineArgumentInt(argc, (const char **)argv, "radius"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "radius") && checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + blur_radius = getCmdLineArgumentInt(argc, (const char **)argv, "radius"); + } - pArgc = &argc; - pArgv = argv; + pArgc = &argc; + pArgv = argv; - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - printf("[%s]\n", argv[0]); - printf(" Does not explicitly support -device=n\n"); - printf( - " This sample requires OpenGL. Only -file= -radius= " - "are supported\n"); - printf("exiting...\n"); - exit(EXIT_WAIVED); - } + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (checkCmdLineFlag(argc, (const char **)argv, "device")) { + printf("[%s]\n", argv[0]); + printf(" Does not explicitly support -device=n\n"); + printf(" This sample requires OpenGL. Only -file= -radius= " + "are supported\n"); + printf("exiting...\n"); + exit(EXIT_WAIVED); + } - if (ref_file) { - printf("(Test with OpenGL verification)\n"); - animate = false; + if (ref_file) { + printf("(Test with OpenGL verification)\n"); + animate = false; - runStdProgram(argc, argv); - } else { - printf("(Interactive OpenGL Demo)\n"); - animate = true; + runStdProgram(argc, argv); + } + else { + printf("(Interactive OpenGL Demo)\n"); + animate = true; - runStdProgram(argc, argv); - } + runStdProgram(argc, argv); + } - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// //! 
//////////////////////////////////////////////////////////////////////////////// -void FreeResource() { - sdkDeleteTimer(&timer); +void FreeResource() +{ + sdkDeleteTimer(&timer); - // unregister this buffer object with CUDA - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_tex_screen_resource)); + // unregister this buffer object with CUDA + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_tex_screen_resource)); #ifdef USE_TEXSUBIMAGE2D - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_dest_resource)); - deletePBO(&pbo_dest); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_dest_resource)); + deletePBO(&pbo_dest); #else - cudaFree(cuda_dest_resource); + cudaFree(cuda_dest_resource); #endif - deleteTexture(&tex_screen); - deleteTexture(&tex_cudaResult); - deleteDepthBuffer(&depth_buffer); - deleteFramebuffer(&framebuffer); + deleteTexture(&tex_screen); + deleteTexture(&tex_cudaResult); + deleteDepthBuffer(&depth_buffer); + deleteFramebuffer(&framebuffer); - if (iGLUTWindowHandle) { - glutDestroyWindow(iGLUTWindowHandle); - } + if (iGLUTWindowHandle) { + glutDestroyWindow(iGLUTWindowHandle); + } - // finalize logs and leave - printf("postProcessGL.exe Exiting...\n"); + // finalize logs and leave + printf("postProcessGL.exe Exiting...\n"); } -void Cleanup(int iExitCode) { - FreeResource(); - printf("Images are %s\n", - (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching"); - exit(EXIT_SUCCESS); +void Cleanup(int iExitCode) +{ + FreeResource(); + printf("Images are %s\n", (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching"); + exit(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// //! //////////////////////////////////////////////////////////////////////////////// -GLuint compileGLSLprogram(const char *vertex_shader_src, - const char *fragment_shader_src) { - GLuint v, f, p = 0; +GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_shader_src) +{ + GLuint v, f, p = 0; - p = glCreateProgram(); + p = glCreateProgram(); - if (vertex_shader_src) { - v = glCreateShader(GL_VERTEX_SHADER); - glShaderSource(v, 1, &vertex_shader_src, NULL); - glCompileShader(v); + if (vertex_shader_src) { + v = glCreateShader(GL_VERTEX_SHADER); + glShaderSource(v, 1, &vertex_shader_src, NULL); + glCompileShader(v); - // check if shader compiled - GLint compiled = 0; - glGetShaderiv(v, GL_COMPILE_STATUS, &compiled); + // check if shader compiled + GLint compiled = 0; + glGetShaderiv(v, GL_COMPILE_STATUS, &compiled); - if (!compiled) { - //#ifdef NV_REPORT_COMPILE_ERRORS - char temp[256] = ""; - glGetShaderInfoLog(v, 256, NULL, temp); - printf("Vtx Compile failed:\n%s\n", temp); - //#endif - glDeleteShader(v); - return 0; - } else { - glAttachShader(p, v); + if (!compiled) { + // #ifdef NV_REPORT_COMPILE_ERRORS + char temp[256] = ""; + glGetShaderInfoLog(v, 256, NULL, temp); + printf("Vtx Compile failed:\n%s\n", temp); + // #endif + glDeleteShader(v); + return 0; + } + else { + glAttachShader(p, v); + } } - } - if (fragment_shader_src) { - f = glCreateShader(GL_FRAGMENT_SHADER); - glShaderSource(f, 1, &fragment_shader_src, NULL); - glCompileShader(f); + if (fragment_shader_src) { + f = glCreateShader(GL_FRAGMENT_SHADER); + glShaderSource(f, 1, &fragment_shader_src, NULL); + glCompileShader(f); - // check if shader compiled - GLint compiled = 0; - glGetShaderiv(f, GL_COMPILE_STATUS, &compiled); + // check if shader compiled + GLint compiled = 0; + glGetShaderiv(f, GL_COMPILE_STATUS, &compiled); - if (!compiled) { - 
//#ifdef NV_REPORT_COMPILE_ERRORS - char temp[256] = ""; - glGetShaderInfoLog(f, 256, NULL, temp); - printf("frag Compile failed:\n%s\n", temp); - //#endif - glDeleteShader(f); - return 0; - } else { - glAttachShader(p, f); + if (!compiled) { + // #ifdef NV_REPORT_COMPILE_ERRORS + char temp[256] = ""; + glGetShaderInfoLog(f, 256, NULL, temp); + printf("frag Compile failed:\n%s\n", temp); + // #endif + glDeleteShader(f); + return 0; + } + else { + glAttachShader(p, f); + } } - } - glLinkProgram(p); + glLinkProgram(p); - int infologLength = 0; - int charsWritten = 0; + int infologLength = 0; + int charsWritten = 0; - GLint linked = 0; - glGetProgramiv(p, GL_LINK_STATUS, &linked); + GLint linked = 0; + glGetProgramiv(p, GL_LINK_STATUS, &linked); - if (linked == 0) { - glGetProgramiv(p, GL_INFO_LOG_LENGTH, (GLint *)&infologLength); + if (linked == 0) { + glGetProgramiv(p, GL_INFO_LOG_LENGTH, (GLint *)&infologLength); - if (infologLength > 0) { - char *infoLog = (char *)malloc(infologLength); - glGetProgramInfoLog(p, infologLength, (GLsizei *)&charsWritten, infoLog); - printf("Shader compilation error: %s\n", infoLog); - free(infoLog); + if (infologLength > 0) { + char *infoLog = (char *)malloc(infologLength); + glGetProgramInfoLog(p, infologLength, (GLsizei *)&charsWritten, infoLog); + printf("Shader compilation error: %s\n", infoLog); + free(infoLog); + } } - } - return p; + return p; } //////////////////////////////////////////////////////////////////////////////// //! Allocate the "render target" of CUDA //////////////////////////////////////////////////////////////////////////////// #ifndef USE_TEXSUBIMAGE2D -void initCUDABuffers() { - // set up vertex data parameter - num_texels = image_width * image_height; - num_values = num_texels * 4; - size_tex_data = sizeof(GLubyte) * num_values; - checkCudaErrors(cudaMalloc((void **)&cuda_dest_resource, size_tex_data)); - // checkCudaErrors(cudaHostAlloc((void**)&cuda_dest_resource, size_tex_data, - // )); +void initCUDABuffers() +{ + // set up vertex data parameter + num_texels = image_width * image_height; + num_values = num_texels * 4; + size_tex_data = sizeof(GLubyte) * num_values; + checkCudaErrors(cudaMalloc((void **)&cuda_dest_resource, size_tex_data)); + // checkCudaErrors(cudaHostAlloc((void**)&cuda_dest_resource, size_tex_data, + // )); } #endif //////////////////////////////////////////////////////////////////////////////// //! 
//////////////////////////////////////////////////////////////////////////////// -void initGLBuffers() { +void initGLBuffers() +{ // create pbo #ifdef USE_TEXSUBIMAGE2D - createPBO(&pbo_dest, &cuda_pbo_dest_resource); + createPBO(&pbo_dest, &cuda_pbo_dest_resource); #endif - // create texture that will receive the result of CUDA - createTextureDst(&tex_cudaResult, image_width, image_height); + // create texture that will receive the result of CUDA + createTextureDst(&tex_cudaResult, image_width, image_height); - // create texture for blitting onto the screen - createTextureSrc(&tex_screen, image_width, image_height); - // createRenderBuffer(&tex_screen, image_width, image_height); // Doesn't work + // create texture for blitting onto the screen + createTextureSrc(&tex_screen, image_width, image_height); + // createRenderBuffer(&tex_screen, image_width, image_height); // Doesn't work - // create a depth buffer for offscreen rendering - createDepthBuffer(&depth_buffer, image_width, image_height); + // create a depth buffer for offscreen rendering + createDepthBuffer(&depth_buffer, image_width, image_height); - // create a framebuffer for offscreen rendering - createFramebuffer(&framebuffer, tex_screen, depth_buffer); + // create a framebuffer for offscreen rendering + createFramebuffer(&framebuffer, tex_screen, depth_buffer); - // load shader programs - shDrawPot = compileGLSLprogram(NULL, glsl_drawpot_fragshader_src); + // load shader programs + shDrawPot = compileGLSLprogram(NULL, glsl_drawpot_fragshader_src); #ifndef USE_TEXSUBIMAGE2D - shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, - glsl_drawtex_fragshader_src); + shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src); #endif - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); } //////////////////////////////////////////////////////////////////////////////// //! Run standard demo loop with or without GL verification //////////////////////////////////////////////////////////////////////////////// -void runStdProgram(int argc, char **argv) { - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with OpenGL/CUDA - // interop. - if (false == initGL(&argc, argv)) { - return; - } +void runStdProgram(int argc, char **argv) +{ + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA + // interop. 
+ if (false == initGL(&argc, argv)) { + return; + } - // Now initialize CUDA context - findCudaDevice(argc, (const char **)argv); + // Now initialize CUDA context + findCudaDevice(argc, (const char **)argv); - sdkCreateTimer(&timer); - sdkResetTimer(&timer); + sdkCreateTimer(&timer); + sdkResetTimer(&timer); - // register callbacks - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - glutTimerFunc(REFRESH_DELAY, timerEvent, 0); + // register callbacks + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); - // create menu - glutCreateMenu(mainMenu); - glutAddMenuEntry("Toggle CUDA Post Processing (on/off) [ ]", ' '); - glutAddMenuEntry("Toggle Animation (on/off) [a]", 'a'); - glutAddMenuEntry("Increase Blur Radius [=]", '='); - glutAddMenuEntry("Decrease Blur Radius [-]", '-'); - glutAddMenuEntry("Quit (esc)", '\033'); - glutAttachMenu(GLUT_RIGHT_BUTTON); + // create menu + glutCreateMenu(mainMenu); + glutAddMenuEntry("Toggle CUDA Post Processing (on/off) [ ]", ' '); + glutAddMenuEntry("Toggle Animation (on/off) [a]", 'a'); + glutAddMenuEntry("Increase Blur Radius [=]", '='); + glutAddMenuEntry("Decrease Blur Radius [-]", '-'); + glutAddMenuEntry("Quit (esc)", '\033'); + glutAttachMenu(GLUT_RIGHT_BUTTON); - initGLBuffers(); + initGLBuffers(); #ifndef USE_TEXSUBIMAGE2D - initCUDABuffers(); + initCUDABuffers(); #endif - // Creating the Auto-Validation Code - if (ref_file) { - g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); - g_CheckRender->setPixelFormat(GL_RGBA); - g_CheckRender->setExecPath(argv[0]); - g_CheckRender->EnableQAReadback(true); - } + // Creating the Auto-Validation Code + if (ref_file) { + g_CheckRender = new CheckBackBuffer(window_width, window_height, 4); + g_CheckRender->setPixelFormat(GL_RGBA); + g_CheckRender->setExecPath(argv[0]); + g_CheckRender->EnableQAReadback(true); + } - printf( - "\n" - "\tControls\n" - "\t(right click mouse button for Menu)\n" - "\t[ ] : Toggle CUDA Post Processing (on/off)\n" - "\t[a] : Toggle Animation (on/off)\n" - "\t[=] : Increase Blur Radius\n" - "\t[-] : Decrease Blur Radius\n" - "\t[esc] - Quit\n\n"); + printf("\n" + "\tControls\n" + "\t(right click mouse button for Menu)\n" + "\t[ ] : Toggle CUDA Post Processing (on/off)\n" + "\t[a] : Toggle Animation (on/off)\n" + "\t[=] : Increase Blur Radius\n" + "\t[-] : Decrease Blur Radius\n" + "\t[esc] - Quit\n\n"); - // start rendering mainloop - glutMainLoop(); + // start rendering mainloop + glutMainLoop(); - // Normally unused return path - Cleanup(EXIT_SUCCESS); + // Normally unused return path + Cleanup(EXIT_SUCCESS); } //////////////////////////////////////////////////////////////////////////////// //! 
Initialize GL //////////////////////////////////////////////////////////////////////////////// -bool initGL(int *argc, char **argv) { - // Create GL context - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); - glutInitWindowSize(window_width, window_height); - iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing"); +bool initGL(int *argc, char **argv) +{ + // Create GL context + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); + glutInitWindowSize(window_width, window_height); + iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing"); - // initialize necessary OpenGL extensions - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_pixel_buffer_object " - "GL_EXT_framebuffer_object")) { - printf("ERROR: Support for necessary OpenGL extensions missing."); - fflush(stderr); - return false; - } + // initialize necessary OpenGL extensions + if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object " + "GL_EXT_framebuffer_object")) { + printf("ERROR: Support for necessary OpenGL extensions missing."); + fflush(stderr); + return false; + } // default initialization #ifndef USE_TEXTURE_RGBA8UI - glClearColor(0.5, 0.5, 0.5, 1.0); + glClearColor(0.5, 0.5, 0.5, 1.0); #else - glClearColorIuiEXT(128, 128, 128, 255); + glClearColorIuiEXT(128, 128, 128, 255); #endif - glDisable(GL_DEPTH_TEST); + glDisable(GL_DEPTH_TEST); - // viewport - glViewport(0, 0, window_width, window_height); + // viewport + glViewport(0, 0, window_width, window_height); - // projection - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, - 10.0f); + // projection + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, 10.0f); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glEnable(GL_LIGHT0); - float red[] = {1.0f, 0.1f, 0.1f, 1.0f}; - float white[] = {1.0f, 1.0f, 1.0f, 1.0f}; - glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red); - glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white); - glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0f); + glEnable(GL_LIGHT0); + float red[] = {1.0f, 0.1f, 0.1f, 1.0f}; + float white[] = {1.0f, 1.0f, 1.0f, 1.0f}; + glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red); + glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white); + glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0f); - SDK_CHECK_ERROR_GL(); + SDK_CHECK_ERROR_GL(); - return true; + return true; } diff --git a/Samples/5_Domain_Specific/postProcessGL/postProcessGL.cu b/Samples/5_Domain_Specific/postProcessGL/postProcessGL.cu index 6bf4b87e..6c31fd36 100644 --- a/Samples/5_Domain_Specific/postProcessGL/postProcessGL.cu +++ b/Samples/5_Domain_Specific/postProcessGL/postProcessGL.cu @@ -41,27 +41,28 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); } __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); } // convert floating point rgb color to 8-bit integer -__device__ int rgbToInt(float r, float g, float b) { - r = clamp(r, 0.0f, 255.0f); - g = clamp(g, 0.0f, 255.0f); - b = clamp(b, 0.0f, 255.0f); - return (int(b) << 16) | (int(g) << 8) | int(r); +__device__ int rgbToInt(float r, float g, float b) +{ + r = clamp(r, 0.0f, 255.0f); + g = clamp(g, 0.0f, 255.0f); + b = clamp(b, 0.0f, 255.0f); + return (int(b) << 16) | (int(g) << 8) | int(r); } // get 
pixel from 2D image, with clamping to border -__device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) { +__device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) +{ #ifndef USE_TEXTURE_RGBA8UI - float4 res = tex2D<float4>(inTex, x, y); - uchar4 ucres = make_uchar4(res.x * 255.0f, res.y * 255.0f, res.z * 255.0f, - res.w * 255.0f); + float4 res = tex2D<float4>(inTex, x, y); + uchar4 ucres = make_uchar4(res.x * 255.0f, res.y * 255.0f, res.z * 255.0f, res.w * 255.0f); #else - uchar4 ucres = tex2D<uchar4>(inTex, x, y); + uchar4 ucres = tex2D<uchar4>(inTex, x, y); #endif - return ucres; + return ucres; } // macros to make indexing shared memory easier -#define SMEM(X, Y) sdata[(Y)*tilew + (X)] +#define SMEM(X, Y) sdata[(Y) * tilew + (X)] /* 2D convolution using shared memory @@ -80,132 +81,144 @@ __device__ uchar4 getPixel(int x, int y, cudaTextureObject_t inTex) { <----tilew----> */ -__global__ void cudaProcess(unsigned int *g_odata, int imgw, int imgh, - int tilew, int r, float threshold, float highlight, - cudaTextureObject_t inTex) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - extern __shared__ uchar4 sdata[]; +__global__ void cudaProcess(unsigned int *g_odata, + int imgw, + int imgh, + int tilew, + int r, + float threshold, + float highlight, + cudaTextureObject_t inTex) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + extern __shared__ uchar4 sdata[]; - int tx = threadIdx.x; - int ty = threadIdx.y; - int bw = blockDim.x; - int bh = blockDim.y; - int x = blockIdx.x * bw + tx; - int y = blockIdx.y * bh + ty; + int tx = threadIdx.x; + int ty = threadIdx.y; + int bw = blockDim.x; + int bh = blockDim.y; + int x = blockIdx.x * bw + tx; + int y = blockIdx.y * bh + ty; #if 0 uchar4 c4 = getPixel(x, y); g_odata[y*imgw+x] = rgbToInt(c4.z, c4.y, c4.x); #else - // copy tile to shared memory - // center region - SMEM(r + tx, r + ty) = getPixel(x, y, inTex); + // copy tile to shared memory + // center region + SMEM(r + tx, r + ty) = getPixel(x, y, inTex); - // borders - if (threadIdx.x < r) { - // left - SMEM(tx, r + ty) = getPixel(x - r, y, inTex); - // right - SMEM(r + bw + tx, r + ty) = getPixel(x + bw, y, inTex); - } + // borders + if (threadIdx.x < r) { + // left + SMEM(tx, r + ty) = getPixel(x - r, y, inTex); + // right + SMEM(r + bw + tx, r + ty) = getPixel(x + bw, y, inTex); + } - if (threadIdx.y < r) { - // top - SMEM(r + tx, ty) = getPixel(x, y - r, inTex); - // bottom - SMEM(r + tx, r + bh + ty) = getPixel(x, y + bh, inTex); - } + if (threadIdx.y < r) { + // top + SMEM(r + tx, ty) = getPixel(x, y - r, inTex); + // bottom + SMEM(r + tx, r + bh + ty) = getPixel(x, y + bh, inTex); + } - // load corners - if ((threadIdx.x < r) && (threadIdx.y < r)) { - // tl - SMEM(tx, ty) = getPixel(x - r, y - r, inTex); - // bl - SMEM(tx, r + bh + ty) = getPixel(x - r, y + bh, inTex); - // tr - SMEM(r + bw + tx, ty) = getPixel(x + bh, y - r, inTex); - // br - SMEM(r + bw + tx, r + bh + ty) = getPixel(x + bw, y + bh, inTex); - } + // load corners + if ((threadIdx.x < r) && (threadIdx.y < r)) { + // tl + SMEM(tx, ty) = getPixel(x - r, y - r, inTex); + // bl + SMEM(tx, r + bh + ty) = getPixel(x - r, y + bh, inTex); + // tr + SMEM(r + bw + tx, ty) = getPixel(x + bh, y - r, inTex); + // br + SMEM(r + bw + tx, r + bh + ty) = getPixel(x + bw, y + bh, inTex); + } - // wait for loads to complete - cg::sync(cta); + // wait for loads to complete + cg::sync(cta); - // perform convolution - float rsum = 0.0f; - float gsum = 0.0f; - float bsum = 0.0f; - 
float samples = 0.0f; + // perform convolution + float rsum = 0.0f; + float gsum = 0.0f; + float bsum = 0.0f; + float samples = 0.0f; - for (int dy = -r; dy <= r; dy++) { - for (int dx = -r; dx <= r; dx++) { + for (int dy = -r; dy <= r; dy++) { + for (int dx = -r; dx <= r; dx++) { #if 0 // try this to see the benefit of using shared memory uchar4 pixel = getPixel(x+dx, y+dy); #else - uchar4 pixel = SMEM(r + tx + dx, r + ty + dy); + uchar4 pixel = SMEM(r + tx + dx, r + ty + dy); #endif - // only sum pixels within disc-shaped kernel - float l = dx * dx + dy * dy; + // only sum pixels within disc-shaped kernel + float l = dx * dx + dy * dy; - if (l <= r * r) { - float r = float(pixel.x); - float g = float(pixel.y); - float b = float(pixel.z); + if (l <= r * r) { + float r = float(pixel.x); + float g = float(pixel.y); + float b = float(pixel.z); #if 1 - // brighten highlights - float lum = (r + g + b) / (255 * 3); + // brighten highlights + float lum = (r + g + b) / (255 * 3); - if (lum > threshold) { - r *= highlight; - g *= highlight; - b *= highlight; - } + if (lum > threshold) { + r *= highlight; + g *= highlight; + b *= highlight; + } #endif - rsum += r; - gsum += g; - bsum += b; - samples += 1.0f; - } + rsum += r; + gsum += g; + bsum += b; + samples += 1.0f; + } + } } - } - rsum /= samples; - gsum /= samples; - bsum /= samples; - // ABGR - g_odata[y * imgw + x] = rgbToInt(rsum, gsum, bsum); + rsum /= samples; + gsum /= samples; + bsum /= samples; + // ABGR + g_odata[y * imgw + x] = rgbToInt(rsum, gsum, bsum); // g_odata[y*imgw+x] = rgbToInt(x,y,0); #endif } -extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, - cudaArray *g_data_array, - unsigned int *g_odata, int imgw, int imgh, - int tilew, int radius, float threshold, - float highlight) { - struct cudaChannelFormatDesc desc; - checkCudaErrors(cudaGetChannelDesc(&desc, g_data_array)); +extern "C" void launch_cudaProcess(dim3 grid, + dim3 block, + int sbytes, + cudaArray *g_data_array, + unsigned int *g_odata, + int imgw, + int imgh, + int tilew, + int radius, + float threshold, + float highlight) +{ + struct cudaChannelFormatDesc desc; + checkCudaErrors(cudaGetChannelDesc(&desc, g_data_array)); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = g_data_array; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = g_data_array; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors( - cudaCreateTextureObject(&inTexObject, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&inTexObject, &texRes, &texDescr, NULL)); #if 0 printf("CUDA Array channel descriptor, bits per component:\n"); @@ -219,38 +232,40 @@ extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, // printf("\n"); #ifdef GPU_PROFILING - StopWatchInterface *timer = 0; - sdkCreateTimer(&timer); + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); - int nIter = 30; + int nIter = 
30; - for (int i = -1; i < nIter; ++i) { - if (i == 0) { - sdkStartTimer(&timer); + for (int i = -1; i < nIter; ++i) { + if (i == 0) { + sdkStartTimer(&timer); + } + +#endif + + cudaProcess<<<grid, block, sbytes>>>( + g_odata, imgw, imgh, block.x + (2 * radius), radius, 0.8f, 4.0f, inTexObject); + +#ifdef GPU_PROFILING + } + + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + double dSeconds = sdkGetTimerValue(&timer) / ((double)nIter * 1000.0); + double dNumTexels = (double)imgw * (double)imgh; + double mtexps = 1.0e-6 * dNumTexels / dSeconds; + + if (radius == 4) { + printf("\n"); + printf("postprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = " + "%.0f Texels, NumDevsUsed = %d, Workgroup = %u\n", + mtexps, + dSeconds, + dNumTexels, + 1, + block.x * block.y); } -#endif - - cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw, imgh, - block.x + (2 * radius), radius, 0.8f, - 4.0f, inTexObject); - -#ifdef GPU_PROFILING - } - - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - double dSeconds = sdkGetTimerValue(&timer) / ((double)nIter * 1000.0); - double dNumTexels = (double)imgw * (double)imgh; - double mtexps = 1.0e-6 * dNumTexels / dSeconds; - - if (radius == 4) { - printf("\n"); - printf( - "postprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = " - "%.0f Texels, NumDevsUsed = %d, Workgroup = %u\n", - mtexps, dSeconds, dNumTexels, 1, block.x * block.y); - } - #endif } diff --git a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator.cpp b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator.cpp index 1c426449..c32f3f60 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator.cpp +++ b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator.cpp @@ -29,19 +29,17 @@ #include <cuda_runtime.h> // Utilities and system includes -#include <helper_functions.h> #include <helper_cuda.h> +#include <helper_functions.h> #include "quasirandomGenerator_common.h" //////////////////////////////////////////////////////////////////////////////// // CPU code //////////////////////////////////////////////////////////////////////////////// -extern "C" void initQuasirandomGenerator( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]); +extern "C" void initQuasirandomGenerator(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]); -extern "C" float getQuasirandomValue( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim); +extern "C" float getQuasirandomValue(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim); extern "C" double getQuasirandomValue63(INT64 i, int dim); extern "C" double MoroInvCNDcpu(unsigned int p); @@ -49,135 +47,132 @@ extern "C" double MoroInvCNDcpu(unsigned int p); //////////////////////////////////////////////////////////////////////////////// // GPU code //////////////////////////////////////////////////////////////////////////////// -extern "C" void initTableGPU( - unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]); -extern "C" void quasirandomGeneratorGPU(float *d_Output, unsigned int seed, - unsigned int N); -extern "C" void inverseCNDgpu(float *d_Output, unsigned int *d_Input, - unsigned int N); +extern "C" void initTableGPU(unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]); +extern "C" void quasirandomGeneratorGPU(float *d_Output, unsigned int seed, unsigned int N); +extern "C" void inverseCNDgpu(float *d_Output, unsigned int *d_Input, unsigned int N); const int N = 1048576; -int main(int argc, char **argv) { - // Start logs - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + // Start logs + printf("%s Starting...\n\n", argv[0]); 
- unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; + unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; - float *h_OutputGPU, *d_Output; + float *h_OutputGPU, *d_Output; - int dim, pos; - double delta, ref, sumDelta, sumRef, L1norm, gpuTime; + int dim, pos; + double delta, ref, sumDelta, sumRef, L1norm, gpuTime; - StopWatchInterface *hTimer = NULL; + StopWatchInterface *hTimer = NULL; - if (sizeof(INT64) != 8) { - printf("sizeof(INT64) != 8\n"); - return 0; - } - - sdkCreateTimer(&hTimer); - - printf("Allocating GPU memory...\n"); - checkCudaErrors( - cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); - - printf("Allocating CPU memory...\n"); - h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); - - printf("Initializing QRNG tables...\n\n"); - initQuasirandomGenerator(tableCPU); - - initTableGPU(tableCPU); - - printf("Testing QRNG...\n\n"); - checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); - int numIterations = 20; - - for (int i = -1; i < numIterations; i++) { - if (i == 0) { - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + if (sizeof(INT64) != 8) { + printf("sizeof(INT64) != 8\n"); + return 0; } - quasirandomGeneratorGPU(d_Output, 0, N); - } + sdkCreateTimer(&hTimer); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; - printf( - "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size " - "= %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", - (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, - QRNG_DIMENSIONS * N, 1, 128 * QRNG_DIMENSIONS); + printf("Allocating GPU memory...\n"); + checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); - printf("\nReading GPU results...\n"); - checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, - QRNG_DIMENSIONS * N * sizeof(float), - cudaMemcpyDeviceToHost)); + printf("Allocating CPU memory...\n"); + h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); - printf("Comparing to the CPU results...\n\n"); - sumDelta = 0; - sumRef = 0; + printf("Initializing QRNG tables...\n\n"); + initQuasirandomGenerator(tableCPU); - for (dim = 0; dim < QRNG_DIMENSIONS; dim++) - for (pos = 0; pos < N; pos++) { - ref = getQuasirandomValue63(pos, dim); - delta = (double)h_OutputGPU[dim * N + pos] - ref; - sumDelta += fabs(delta); - sumRef += fabs(ref); + initTableGPU(tableCPU); + + printf("Testing QRNG...\n\n"); + checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); + int numIterations = 20; + + for (int i = -1; i < numIterations; i++) { + if (i == 0) { + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } + + quasirandomGeneratorGPU(d_Output, 0, N); } - printf("L1 norm: %E\n", sumDelta / sumRef); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; + printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size " + "= %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", + (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, + gpuTime, + QRNG_DIMENSIONS * N, + 1, + 128 * QRNG_DIMENSIONS); - printf("\nTesting inverseCNDgpu()...\n\n"); - checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); + printf("\nReading GPU results...\n"); + checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, 
QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); - for (int i = -1; i < numIterations; i++) { - if (i == 0) { - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + printf("Comparing to the CPU results...\n\n"); + sumDelta = 0; + sumRef = 0; + + for (dim = 0; dim < QRNG_DIMENSIONS; dim++) + for (pos = 0; pos < N; pos++) { + ref = getQuasirandomValue63(pos, dim); + delta = (double)h_OutputGPU[dim * N + pos] - ref; + sumDelta += fabs(delta); + sumRef += fabs(ref); + } + + printf("L1 norm: %E\n", sumDelta / sumRef); + + printf("\nTesting inverseCNDgpu()...\n\n"); + checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); + + for (int i = -1; i < numIterations; i++) { + if (i == 0) { + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } + + inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N); } - inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N); - } + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; + printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f " + "s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", + (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, + gpuTime, + QRNG_DIMENSIONS * N, + 1, + 128); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; - printf( - "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f " - "s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", - (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, - QRNG_DIMENSIONS * N, 1, 128); + printf("Reading GPU results...\n"); + checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); - printf("Reading GPU results...\n"); - checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, - QRNG_DIMENSIONS * N * sizeof(float), - cudaMemcpyDeviceToHost)); + printf("\nComparing to the CPU results...\n"); + sumDelta = 0; + sumRef = 0; + unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); - printf("\nComparing to the CPU results...\n"); - sumDelta = 0; - sumRef = 0; - unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); + for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { + unsigned int d = (pos + 1) * distance; + ref = MoroInvCNDcpu(d); + delta = (double)h_OutputGPU[pos] - ref; + sumDelta += fabs(delta); + sumRef += fabs(ref); + } - for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { - unsigned int d = (pos + 1) * distance; - ref = MoroInvCNDcpu(d); - delta = (double)h_OutputGPU[pos] - ref; - sumDelta += fabs(delta); - sumRef += fabs(ref); - } + printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); - printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); + printf("Shutting down...\n"); + sdkDeleteTimer(&hTimer); + free(h_OutputGPU); + checkCudaErrors(cudaFree(d_Output)); - printf("Shutting down...\n"); - sdkDeleteTimer(&hTimer); - free(h_OutputGPU); - checkCudaErrors(cudaFree(d_Output)); - - exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); + exit(L1norm < 1e-6 ? 
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_common.h b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_common.h index 067726cd..56c605a0 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_common.h +++ b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_common.h @@ -35,6 +35,6 @@ typedef long long int INT64; #define QRNG_DIMENSIONS 3 #define QRNG_RESOLUTION 31 -#define INT_SCALE (1.0f / (float)0x80000001U) +#define INT_SCALE (1.0f / (float)0x80000001U) #endif diff --git a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_gold.cpp b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_gold.cpp index bd35d021..0976258c 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_gold.cpp +++ b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_gold.cpp @@ -25,8 +25,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include +#include #include "quasirandomGenerator_common.h" @@ -36,69 +36,71 @@ // Internal 64(63)-bit table static INT64 cjn[63][QRNG_DIMENSIONS]; -static int GeneratePolynomials(int buffer[QRNG_DIMENSIONS], bool primitive) { - int i, j, n, p1, p2, l; - int e_p1, e_p2, e_b; +static int GeneratePolynomials(int buffer[QRNG_DIMENSIONS], bool primitive) +{ + int i, j, n, p1, p2, l; + int e_p1, e_p2, e_b; - // generate all polynomials to buffer - for (n = 1, buffer[0] = 0x2, p2 = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { - // search for the next irreducible polynomial - for (p1 = buffer[n - 1] + 1;; ++p1) { - // find degree of polynomial p1 - for (e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { - } + // generate all polynomials to buffer + for (n = 1, buffer[0] = 0x2, p2 = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { + // search for the next irreducible polynomial + for (p1 = buffer[n - 1] + 1;; ++p1) { + // find degree of polynomial p1 + for (e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { + } - // try to divide p1 by all polynomials in buffer - for (i = 0; i < n; ++i) { - // find the degree of buffer[i] - for (e_b = e_p1; (buffer[i] & (1 << e_b)) == 0; --e_b) { + // try to divide p1 by all polynomials in buffer + for (i = 0; i < n; ++i) { + // find the degree of buffer[i] + for (e_b = e_p1; (buffer[i] & (1 << e_b)) == 0; --e_b) { + } + + // divide p2 by buffer[i] until the end + for (p2 = (buffer[i] << ((e_p2 = e_p1) - e_b)) ^ p1; p2 >= buffer[i]; + p2 = (buffer[i] << (e_p2 - e_b)) ^ p2) { + for (; (p2 & (1 << e_p2)) == 0; --e_p2) { + } + } // compute new degree of p2 + + // division without remainder!!! p1 is not irreducible + if (p2 == 0) { + break; + } + } + + // all divisions were with remainder - p1 is irreducible + if (p2 != 0) { + e_p2 = 0; + + if (primitive) { + // check that p1 has only one cycle (i.e. 
is monic, or primitive) + j = ~(0xffffffff << (e_p1 + 1)); + e_b = (1 << e_p1) | 0x1; + + for (p2 = e_b, e_p2 = (1 << e_p1) - 2; e_p2 > 0; --e_p2) { + p2 <<= 1; + i = p2 & p1; + i = (i & 0x55555555) + ((i >> 1) & 0x55555555); + i = (i & 0x33333333) + ((i >> 2) & 0x33333333); + i = (i & 0x07070707) + ((i >> 4) & 0x07070707); + p2 |= (i % 255) & 1; + + if ((p2 & j) == e_b) + break; + } + } + + // it is monic - add it to the list of polynomials + if (e_p2 == 0) { + buffer[n] = p1; + l += e_p1; + break; + } + } } - - // divide p2 by buffer[i] until the end - for (p2 = (buffer[i] << ((e_p2 = e_p1) - e_b)) ^ p1; p2 >= buffer[i]; - p2 = (buffer[i] << (e_p2 - e_b)) ^ p2) { - for (; (p2 & (1 << e_p2)) == 0; --e_p2) { - } - } // compute new degree of p2 - - // division without remainder!!! p1 is not irreducible - if (p2 == 0) { - break; - } - } - - // all divisions were with remainder - p1 is irreducible - if (p2 != 0) { - e_p2 = 0; - - if (primitive) { - // check that p1 has only one cycle (i.e. is monic, or primitive) - j = ~(0xffffffff << (e_p1 + 1)); - e_b = (1 << e_p1) | 0x1; - - for (p2 = e_b, e_p2 = (1 << e_p1) - 2; e_p2 > 0; --e_p2) { - p2 <<= 1; - i = p2 & p1; - i = (i & 0x55555555) + ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - i = (i & 0x07070707) + ((i >> 4) & 0x07070707); - p2 |= (i % 255) & 1; - - if ((p2 & j) == e_b) break; - } - } - - // it is monic - add it to the list of polynomials - if (e_p2 == 0) { - buffer[n] = p1; - l += e_p1; - break; - } - } } - } - return l + 1; + return l + 1; } //////////////////////////////////////////////////////////////////////////////// @@ -112,214 +114,217 @@ static int GeneratePolynomials(int buffer[QRNG_DIMENSIONS], bool primitive) { // July 1992.", // year = "1992" } //////////////////////////////////////////////////////////////////////////////// -static void GenerateCJ() { - int buffer[QRNG_DIMENSIONS]; - int *polynomials; - int n, p1, l, e_p1; +static void GenerateCJ() +{ + int buffer[QRNG_DIMENSIONS]; + int *polynomials; + int n, p1, l, e_p1; - // Niederreiter (in contrast to Sobol) allows to use not primitive, but just - // irreducible polynomials - l = GeneratePolynomials(buffer, false); + // Niederreiter (in contrast to Sobol) allows to use not primitive, but just + // irreducible polynomials + l = GeneratePolynomials(buffer, false); - // convert all polynomials from buffer to polynomials table - polynomials = new int[l + 2 * QRNG_DIMENSIONS + 1]; + // convert all polynomials from buffer to polynomials table + polynomials = new int[l + 2 * QRNG_DIMENSIONS + 1]; - for (n = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { - // find degree of polynomial p1 - for (p1 = buffer[n], e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { - } - - // fill polynomials table with values for this polynomial - polynomials[l++] = 1; - - for (--e_p1; e_p1 >= 0; --e_p1) { - polynomials[l++] = (p1 >> e_p1) & 1; - } - - polynomials[l++] = -1; - } - - polynomials[l] = -1; - - // irreducible polynomial p - int *p = polynomials, e, d; - // polynomial b - int b_arr[1024], *b, m; - // v array - int v_arr[1024], *v; - // temporary polynomial, required to do multiplication of p and b - int t_arr[1024], *t; - // subsidiary variables - int i, j, u, m1, ip, it; - - // cycle over monic irreducible polynomials - for (d = 0; p[0] != -1; p += e + 2) { - // allocate memory for cj array for dimension (ip + 1) - for (i = 0; i < 63; ++i) { - cjn[i][d] = 0; - } - - // determine the power of irreducible polynomial - for (e = 0; p[e + 1] != -1; ++e) { - } - - // 
polynomial b in the beginning is just '1' - (b = b_arr + 1023)[m = 0] = 1; - // v array needs only (63 + e - 2) length - v = v_arr + 1023 - (63 + e - 2); - - // cycle over all coefficients - for (j = 63 - 1, u = e; j >= 0; --j, ++u) { - if (u == e) { - u = 0; - - // multiply b by p (polynomials multiplication) - for (i = 0, t = t_arr + 1023 - (m1 = m); i <= m; ++i) { - t[i] = b[i]; + for (n = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { + // find degree of polynomial p1 + for (p1 = buffer[n], e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { } - b = b_arr + 1023 - (m += e); + // fill polynomials table with values for this polynomial + polynomials[l++] = 1; - for (i = 0; i <= m; ++i) { - b[i] = 0; + for (--e_p1; e_p1 >= 0; --e_p1) { + polynomials[l++] = (p1 >> e_p1) & 1; + } - for (ip = e - (m - i), it = m1; ip <= e && it >= 0; ++ip, --it) { - if (ip >= 0) { - b[i] ^= p[ip] & t[it]; + polynomials[l++] = -1; + } + + polynomials[l] = -1; + + // irreducible polynomial p + int *p = polynomials, e, d; + // polynomial b + int b_arr[1024], *b, m; + // v array + int v_arr[1024], *v; + // temporary polynomial, required to do multiplication of p and b + int t_arr[1024], *t; + // subsidiary variables + int i, j, u, m1, ip, it; + + // cycle over monic irreducible polynomials + for (d = 0; p[0] != -1; p += e + 2) { + // allocate memory for cj array for dimension (ip + 1) + for (i = 0; i < 63; ++i) { + cjn[i][d] = 0; + } + + // determine the power of irreducible polynomial + for (e = 0; p[e + 1] != -1; ++e) { + } + + // polynomial b in the beginning is just '1' + (b = b_arr + 1023)[m = 0] = 1; + // v array needs only (63 + e - 2) length + v = v_arr + 1023 - (63 + e - 2); + + // cycle over all coefficients + for (j = 63 - 1, u = e; j >= 0; --j, ++u) { + if (u == e) { + u = 0; + + // multiply b by p (polynomials multiplication) + for (i = 0, t = t_arr + 1023 - (m1 = m); i <= m; ++i) { + t[i] = b[i]; + } + + b = b_arr + 1023 - (m += e); + + for (i = 0; i <= m; ++i) { + b[i] = 0; + + for (ip = e - (m - i), it = m1; ip <= e && it >= 0; ++ip, --it) { + if (ip >= 0) { + b[i] ^= p[ip] & t[it]; + } + } + } + + // multiplication of polynomials finished + + // calculate v + for (i = 0; i < m1; ++i) { + v[i] = 0; + } + + for (; i < m; ++i) { + v[i] = 1; + } + + for (; i <= 63 + e - 2; ++i) { + v[i] = 0; + + for (it = 1; it <= m; ++it) { + v[i] ^= v[i - it] & b[it]; + } + } + } + + // copy calculated v to cj + for (i = 0; i < 63; i++) { + cjn[i][d] |= (INT64)v[i + u] << j; } - } } - // multiplication of polynomials finished - - // calculate v - for (i = 0; i < m1; ++i) { - v[i] = 0; - } - - for (; i < m; ++i) { - v[i] = 1; - } - - for (; i <= 63 + e - 2; ++i) { - v[i] = 0; - - for (it = 1; it <= m; ++it) { - v[i] ^= v[i - it] & b[it]; - } - } - } - - // copy calculated v to cj - for (i = 0; i < 63; i++) { - cjn[i][d] |= (INT64)v[i + u] << j; - } + ++d; } - ++d; - } - - delete[] polynomials; + delete[] polynomials; } // Generate 63-bit quasirandom number for given index and dimension and // normalize -extern "C" double getQuasirandomValue63(INT64 i, int dim) { - const double INT63_SCALE = (1.0 / (double)0x8000000000000001ULL); - INT64 result = 0; +extern "C" double getQuasirandomValue63(INT64 i, int dim) +{ + const double INT63_SCALE = (1.0 / (double)0x8000000000000001ULL); + INT64 result = 0; - for (int bit = 0; bit < 63; bit++, i >>= 1) - if (i & 1) result ^= cjn[bit][dim]; + for (int bit = 0; bit < 63; bit++, i >>= 1) + if (i & 1) + result ^= cjn[bit][dim]; - return (double)(result + 1) * INT63_SCALE; + return (double)(result 
+ 1) * INT63_SCALE; } //////////////////////////////////////////////////////////////////////////////// // Initialization (table setup) //////////////////////////////////////////////////////////////////////////////// -extern "C" void initQuasirandomGenerator( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]) { - GenerateCJ(); +extern "C" void initQuasirandomGenerator(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]) +{ + GenerateCJ(); - for (int dim = 0; dim < QRNG_DIMENSIONS; dim++) - for (int bit = 0; bit < QRNG_RESOLUTION; bit++) - table[dim][bit] = (int)((cjn[bit][dim] >> 32) & 0x7FFFFFFF); + for (int dim = 0; dim < QRNG_DIMENSIONS; dim++) + for (int bit = 0; bit < QRNG_RESOLUTION; bit++) + table[dim][bit] = (int)((cjn[bit][dim] >> 32) & 0x7FFFFFFF); } //////////////////////////////////////////////////////////////////////////////// // Generate 31-bit quasirandom number for given index and dimension //////////////////////////////////////////////////////////////////////////////// -extern "C" float getQuasirandomValue( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim) { - int result = 0; +extern "C" float getQuasirandomValue(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim) +{ + int result = 0; - for (int bit = 0; bit < QRNG_RESOLUTION; bit++, i >>= 1) - if (i & 1) result ^= table[dim][bit]; + for (int bit = 0; bit < QRNG_RESOLUTION; bit++, i >>= 1) + if (i & 1) + result ^= table[dim][bit]; - return (float)(result + 1) * INT_SCALE; + return (float)(result + 1) * INT_SCALE; } //////////////////////////////////////////////////////////////////////////////// // Moro's Inverse Cumulative Normal Distribution function approximation //////////////////////////////////////////////////////////////////////////////// -extern "C" double MoroInvCNDcpu(unsigned int x) { - const double a1 = 2.50662823884; - const double a2 = -18.61500062529; - const double a3 = 41.39119773534; - const double a4 = -25.44106049637; - const double b1 = -8.4735109309; - const double b2 = 23.08336743743; - const double b3 = -21.06224101826; - const double b4 = 3.13082909833; - const double c1 = 0.337475482272615; - const double c2 = 0.976169019091719; - const double c3 = 0.160797971491821; - const double c4 = 2.76438810333863E-02; - const double c5 = 3.8405729373609E-03; - const double c6 = 3.951896511919E-04; - const double c7 = 3.21767881768E-05; - const double c8 = 2.888167364E-07; - const double c9 = 3.960315187E-07; +extern "C" double MoroInvCNDcpu(unsigned int x) +{ + const double a1 = 2.50662823884; + const double a2 = -18.61500062529; + const double a3 = 41.39119773534; + const double a4 = -25.44106049637; + const double b1 = -8.4735109309; + const double b2 = 23.08336743743; + const double b3 = -21.06224101826; + const double b4 = 3.13082909833; + const double c1 = 0.337475482272615; + const double c2 = 0.976169019091719; + const double c3 = 0.160797971491821; + const double c4 = 2.76438810333863E-02; + const double c5 = 3.8405729373609E-03; + const double c6 = 3.951896511919E-04; + const double c7 = 3.21767881768E-05; + const double c8 = 2.888167364E-07; + const double c9 = 3.960315187E-07; - double z; + double z; - bool negate = false; + bool negate = false; - // Ensure the conversion to floating point will give a value in the - // range (0,0.5] by restricting the input to the bottom half of the - // input domain. 
We will later reflect the result if the input was - // originally in the top half of the input domain - if (x >= 0x80000000UL) { - x = 0xffffffffUL - x; - negate = true; - } + // Ensure the conversion to floating point will give a value in the + // range (0,0.5] by restricting the input to the bottom half of the + // input domain. We will later reflect the result if the input was + // originally in the top half of the input domain + if (x >= 0x80000000UL) { + x = 0xffffffffUL - x; + negate = true; + } - // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) - // Convert to floating point in (0,0.5] - const double x1 = 1.0 / static_cast<double>(0xffffffffUL); - const double x2 = x1 / 2.0; - double p1 = x * x1 + x2; - // Convert to floating point in (-0.5,0] - double p2 = p1 - 0.5; + // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) + // Convert to floating point in (0,0.5] + const double x1 = 1.0 / static_cast<double>(0xffffffffUL); + const double x2 = x1 / 2.0; + double p1 = x * x1 + x2; + // Convert to floating point in (-0.5,0] + double p2 = p1 - 0.5; - // The input to the Moro inversion is p2 which is in the range - // (-0.5,0]. This means that our output will be the negative side - // of the bell curve (which we will reflect if "negate" is true). + // The input to the Moro inversion is p2 which is in the range + // (-0.5,0]. This means that our output will be the negative side + // of the bell curve (which we will reflect if "negate" is true). - // Main body of the bell curve for |p| < 0.42 - if (p2 > -0.42) { - z = p2 * p2; - z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / - ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0); - } - // Special case (Chebychev) for tail - else { - z = log(-log(p1)); - z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * - (c7 + z * (c8 + z * c9)))))))); - } + // Main body of the bell curve for |p| < 0.42 + if (p2 > -0.42) { + z = p2 * p2; + z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0); + } + // Special case (Chebychev) for tail + else { + z = log(-log(p1)); + z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z * (c8 + z * c9)))))))); + } - // If the original input (x) was in the top half of the range, reflect - // to get the positive side of the bell curve - return negate ? -z : z; + // If the original input (x) was in the top half of the range, reflect + // to get the positive side of the bell curve + return negate ? 
-z : z; } diff --git a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_kernel.cu b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_kernel.cu index 3425048b..95ebadeb 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_kernel.cu +++ b/Samples/5_Domain_Specific/quasirandomGenerator/quasirandomGenerator_kernel.cu @@ -28,9 +28,10 @@ #ifndef QUASIRANDOMGENERATOR_KERNEL_CUH #define QUASIRANDOMGENERATOR_KERNEL_CUH +#include <helper_cuda.h> #include <stdio.h> #include <stdlib.h> -#include <helper_cuda.h> + #include "quasirandomGenerator_common.h" // Fast integer multiplication @@ -41,138 +42,134 @@ //////////////////////////////////////////////////////////////////////////////// static __constant__ unsigned int c_Table[QRNG_DIMENSIONS][QRNG_RESOLUTION]; -static __global__ void quasirandomGeneratorKernel(float *d_Output, - unsigned int seed, - unsigned int N) { - unsigned int *dimBase = &c_Table[threadIdx.y][0]; - unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; - unsigned int threadN = MUL(blockDim.x, gridDim.x); +static __global__ void quasirandomGeneratorKernel(float *d_Output, unsigned int seed, unsigned int N) +{ + unsigned int *dimBase = &c_Table[threadIdx.y][0]; + unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; + unsigned int threadN = MUL(blockDim.x, gridDim.x); - for (unsigned int pos = tid; pos < N; pos += threadN) { - unsigned int result = 0; - unsigned int data = seed + pos; + for (unsigned int pos = tid; pos < N; pos += threadN) { + unsigned int result = 0; + unsigned int data = seed + pos; - for (int bit = 0; bit < QRNG_RESOLUTION; bit++, data >>= 1) - if (data & 1) { - result ^= dimBase[bit]; - } + for (int bit = 0; bit < QRNG_RESOLUTION; bit++, data >>= 1) + if (data & 1) { + result ^= dimBase[bit]; + } - d_Output[MUL(threadIdx.y, N) + pos] = (float)(result + 1) * INT_SCALE; - } + d_Output[MUL(threadIdx.y, N) + pos] = (float)(result + 1) * INT_SCALE; + } } // Table initialization routine -extern "C" void initTableGPU( - unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]) { - checkCudaErrors(cudaMemcpyToSymbol( - c_Table, tableCPU, - QRNG_DIMENSIONS * QRNG_RESOLUTION * sizeof(unsigned int))); +extern "C" void initTableGPU(unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_Table, tableCPU, QRNG_DIMENSIONS * QRNG_RESOLUTION * sizeof(unsigned int))); } // Host-side interface -extern "C" void quasirandomGeneratorGPU(float *d_Output, unsigned int seed, - unsigned int N) { - dim3 threads(128, QRNG_DIMENSIONS); - quasirandomGeneratorKernel<<<128, threads>>>(d_Output, seed, N); - getLastCudaError("quasirandomGeneratorKernel() execution failed.\n"); +extern "C" void quasirandomGeneratorGPU(float *d_Output, unsigned int seed, unsigned int N) +{ + dim3 threads(128, QRNG_DIMENSIONS); + quasirandomGeneratorKernel<<<128, threads>>>(d_Output, seed, N); + getLastCudaError("quasirandomGeneratorKernel() execution failed.\n"); } //////////////////////////////////////////////////////////////////////////////// // Moro's Inverse Cumulative Normal Distribution function approximation //////////////////////////////////////////////////////////////////////////////// -__device__ inline float MoroInvCNDgpu(unsigned int x) { - const float a1 = 2.50662823884f; - const float a2 = -18.61500062529f; - const float a3 = 41.39119773534f; - const float a4 = -25.44106049637f; - const float b1 = -8.4735109309f; - const float b2 = 23.08336743743f; - const float b3 = -21.06224101826f; - const float b4 = 3.13082909833f; - const 
float c1 = 0.337475482272615f; - const float c2 = 0.976169019091719f; - const float c3 = 0.160797971491821f; - const float c4 = 2.76438810333863E-02f; - const float c5 = 3.8405729373609E-03f; - const float c6 = 3.951896511919E-04f; - const float c7 = 3.21767881768E-05f; - const float c8 = 2.888167364E-07f; - const float c9 = 3.960315187E-07f; +__device__ inline float MoroInvCNDgpu(unsigned int x) +{ + const float a1 = 2.50662823884f; + const float a2 = -18.61500062529f; + const float a3 = 41.39119773534f; + const float a4 = -25.44106049637f; + const float b1 = -8.4735109309f; + const float b2 = 23.08336743743f; + const float b3 = -21.06224101826f; + const float b4 = 3.13082909833f; + const float c1 = 0.337475482272615f; + const float c2 = 0.976169019091719f; + const float c3 = 0.160797971491821f; + const float c4 = 2.76438810333863E-02f; + const float c5 = 3.8405729373609E-03f; + const float c6 = 3.951896511919E-04f; + const float c7 = 3.21767881768E-05f; + const float c8 = 2.888167364E-07f; + const float c9 = 3.960315187E-07f; - float z; + float z; - bool negate = false; + bool negate = false; - // Ensure the conversion to floating point will give a value in the - // range (0,0.5] by restricting the input to the bottom half of the - // input domain. We will later reflect the result if the input was - // originally in the top half of the input domain - if (x >= 0x80000000UL) { - x = 0xffffffffUL - x; - negate = true; - } + // Ensure the conversion to floating point will give a value in the + // range (0,0.5] by restricting the input to the bottom half of the + // input domain. We will later reflect the result if the input was + // originally in the top half of the input domain + if (x >= 0x80000000UL) { + x = 0xffffffffUL - x; + negate = true; + } - // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) - // Convert to floating point in (0,0.5] - const float x1 = 1.0f / static_cast<float>(0xffffffffUL); - const float x2 = x1 / 2.0f; - float p1 = x * x1 + x2; - // Convert to floating point in (-0.5,0] - float p2 = p1 - 0.5f; + // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) + // Convert to floating point in (0,0.5] + const float x1 = 1.0f / static_cast<float>(0xffffffffUL); + const float x2 = x1 / 2.0f; + float p1 = x * x1 + x2; + // Convert to floating point in (-0.5,0] + float p2 = p1 - 0.5f; - // The input to the Moro inversion is p2 which is in the range - // (-0.5,0]. This means that our output will be the negative side - // of the bell curve (which we will reflect if "negate" is true). + // The input to the Moro inversion is p2 which is in the range + // (-0.5,0]. This means that our output will be the negative side + // of the bell curve (which we will reflect if "negate" is true). 
- // Main body of the bell curve for |p| < 0.42 - if (p2 > -0.42f) { - z = p2 * p2; - z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / - ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0f); - } - // Special case (Chebychev) for tail - else { - z = __logf(-__logf(p1)); - z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z - * (c8 + z * c9)))))))); - } + // Main body of the bell curve for |p| < 0.42 + if (p2 > -0.42f) { + z = p2 * p2; + z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0f); + } + // Special case (Chebychev) for tail + else { + z = __logf(-__logf(p1)); + z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z * (c8 + z * c9)))))))); + } - // If the original input (x) was in the top half of the range, reflect - // to get the positive side of the bell curve - return negate ? -z : z; + // If the original input (x) was in the top half of the range, reflect + // to get the positive side of the bell curve + return negate ? -z : z; } //////////////////////////////////////////////////////////////////////////////// // Main kernel. Choose between transforming // input sequence and uniform ascending (0, 1) sequence //////////////////////////////////////////////////////////////////////////////// -static __global__ void inverseCNDKernel(float *d_Output, unsigned int *d_Input, - unsigned int pathN) { - unsigned int distance = ((unsigned int)-1) / (pathN + 1); - unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; - unsigned int threadN = MUL(blockDim.x, gridDim.x); +static __global__ void inverseCNDKernel(float *d_Output, unsigned int *d_Input, unsigned int pathN) +{ + unsigned int distance = ((unsigned int)-1) / (pathN + 1); + unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; + unsigned int threadN = MUL(blockDim.x, gridDim.x); - // Transform input number sequence if it's supplied - if (d_Input) { - for (unsigned int pos = tid; pos < pathN; pos += threadN) { - unsigned int d = d_Input[pos]; - d_Output[pos] = (float)MoroInvCNDgpu(d); + // Transform input number sequence if it's supplied + if (d_Input) { + for (unsigned int pos = tid; pos < pathN; pos += threadN) { + unsigned int d = d_Input[pos]; + d_Output[pos] = (float)MoroInvCNDgpu(d); + } } - } - // Else generate input uniformly placed samples on the fly - // and write to destination - else { - for (unsigned int pos = tid; pos < pathN; pos += threadN) { - unsigned int d = (pos + 1) * distance; - d_Output[pos] = (float)MoroInvCNDgpu(d); + // Else generate input uniformly placed samples on the fly + // and write to destination + else { + for (unsigned int pos = tid; pos < pathN; pos += threadN) { + unsigned int d = (pos + 1) * distance; + d_Output[pos] = (float)MoroInvCNDgpu(d); + } } - } } -extern "C" void inverseCNDgpu(float *d_Output, unsigned int *d_Input, - unsigned int N) { - inverseCNDKernel<<<128, 128>>>(d_Output, d_Input, N); - getLastCudaError("inverseCNDKernel() execution failed.\n"); +extern "C" void inverseCNDgpu(float *d_Output, unsigned int *d_Input, unsigned int N) +{ + inverseCNDKernel<<<128, 128>>>(d_Output, d_Input, N); + getLastCudaError("inverseCNDKernel() execution failed.\n"); } #endif diff --git a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator.cpp b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator.cpp index 43f96abb..3f4c0636 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator.cpp +++ 
b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator.cpp @@ -26,156 +26,155 @@ */ // CUDA Runtime -#include #include +#include // Utilities and system includes #include -#include "quasirandomGenerator_gpu.cuh" - #include "quasirandomGenerator_common.h" +#include "quasirandomGenerator_gpu.cuh" //////////////////////////////////////////////////////////////////////////////// // CPU code //////////////////////////////////////////////////////////////////////////////// -extern "C" void initQuasirandomGenerator( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]); +extern "C" void initQuasirandomGenerator(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]); -extern "C" float getQuasirandomValue( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim); +extern "C" float getQuasirandomValue(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim); extern "C" double getQuasirandomValue63(INT64 i, int dim); extern "C" double MoroInvCNDcpu(unsigned int p); const int N = 1048576; -int main(int argc, char **argv) { - // Start logs - printf("%s Starting...\n\n", argv[0]); +int main(int argc, char **argv) +{ + // Start logs + printf("%s Starting...\n\n", argv[0]); - // Compile the kernels - char *kernel_file = - sdkFindFilePath("quasirandomGenerator_kernel.cu", argv[0]); - compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); - module = loadCUBIN(cubin, argc, argv); + // Compile the kernels + char *kernel_file = sdkFindFilePath("quasirandomGenerator_kernel.cu", argv[0]); + compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0); + module = loadCUBIN(cubin, argc, argv); - unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; - float *h_OutputGPU; - CUdeviceptr d_Output; + unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; + float *h_OutputGPU; + CUdeviceptr d_Output; - int dim, pos; - double delta, ref, sumDelta, sumRef, L1norm, gpuTime; + int dim, pos; + double delta, ref, sumDelta, sumRef, L1norm, gpuTime; - StopWatchInterface *hTimer = NULL; + StopWatchInterface *hTimer = NULL; - if (sizeof(INT64) != 8) { - printf("sizeof(INT64) != 8\n"); - return 0; - } - - sdkCreateTimer(&hTimer); - - printf("Allocating GPU memory...\n"); - checkCudaErrors(cuMemAlloc(&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); - - printf("Allocating CPU memory...\n"); - h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); - - printf("Initializing QRNG tables...\n\n"); - initQuasirandomGenerator(tableCPU); - - initTableGPU(tableCPU); - - printf("Testing QRNG...\n\n"); - - checkCudaErrors(cuMemsetD8(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); - - int numIterations = 20; - - for (int i = -1; i < numIterations; i++) { - if (i == 0) { - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + if (sizeof(INT64) != 8) { + printf("sizeof(INT64) != 8\n"); + return 0; } - quasirandomGeneratorGPU(d_Output, 0, N); - } + sdkCreateTimer(&hTimer); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; - printf( - "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size " - "= %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", - (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, - QRNG_DIMENSIONS * N, 1, 128 * QRNG_DIMENSIONS); + printf("Allocating GPU memory...\n"); + checkCudaErrors(cuMemAlloc(&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); - printf("\nReading GPU results...\n"); + printf("Allocating CPU memory...\n"); + h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS 
* N * sizeof(float)); - checkCudaErrors( - cuMemcpyDtoH(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float))); + printf("Initializing QRNG tables...\n\n"); + initQuasirandomGenerator(tableCPU); - printf("Comparing to the CPU results...\n\n"); - sumDelta = 0; - sumRef = 0; + initTableGPU(tableCPU); - for (dim = 0; dim < QRNG_DIMENSIONS; dim++) - for (pos = 0; pos < N; pos++) { - ref = getQuasirandomValue63(pos, dim); - delta = (double)h_OutputGPU[dim * N + pos] - ref; - sumDelta += fabs(delta); - sumRef += fabs(ref); + printf("Testing QRNG...\n\n"); + + checkCudaErrors(cuMemsetD8(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); + + int numIterations = 20; + + for (int i = -1; i < numIterations; i++) { + if (i == 0) { + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } + + quasirandomGeneratorGPU(d_Output, 0, N); } - printf("L1 norm: %E\n", sumDelta / sumRef); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; + printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size " + "= %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", + (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, + gpuTime, + QRNG_DIMENSIONS * N, + 1, + 128 * QRNG_DIMENSIONS); - printf("\nTesting inverseCNDgpu()...\n\n"); + printf("\nReading GPU results...\n"); - checkCudaErrors(cuMemsetD8(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); + checkCudaErrors(cuMemcpyDtoH(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float))); - for (int i = -1; i < numIterations; i++) { - if (i == 0) { - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + printf("Comparing to the CPU results...\n\n"); + sumDelta = 0; + sumRef = 0; + + for (dim = 0; dim < QRNG_DIMENSIONS; dim++) + for (pos = 0; pos < N; pos++) { + ref = getQuasirandomValue63(pos, dim); + delta = (double)h_OutputGPU[dim * N + pos] - ref; + sumDelta += fabs(delta); + sumRef += fabs(ref); + } + + printf("L1 norm: %E\n", sumDelta / sumRef); + + printf("\nTesting inverseCNDgpu()...\n\n"); + + checkCudaErrors(cuMemsetD8(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); + + for (int i = -1; i < numIterations; i++) { + if (i == 0) { + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + } + + inverseCNDgpu(d_Output, QRNG_DIMENSIONS * N); } - inverseCNDgpu(d_Output, QRNG_DIMENSIONS * N); - } + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; + printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f " + "s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", + (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, + gpuTime, + QRNG_DIMENSIONS * N, + 1, + 128); - sdkStopTimer(&hTimer); - gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3; - printf( - "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f " - "s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", - (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, - QRNG_DIMENSIONS * N, 1, 128); + printf("Reading GPU results...\n"); + checkCudaErrors(cuMemcpyDtoH(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float))); - printf("Reading GPU results...\n"); - checkCudaErrors( - cuMemcpyDtoH(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float))); + printf("\nComparing to the CPU results...\n"); - printf("\nComparing to the CPU results...\n"); + sumDelta = 0; + sumRef = 0; + unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); - sumDelta = 0; - sumRef = 0; - unsigned int distance = ((unsigned int)-1) / 
(QRNG_DIMENSIONS * N + 1); + for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { + unsigned int d = (pos + 1) * distance; + ref = MoroInvCNDcpu(d); + delta = (double)h_OutputGPU[pos] - ref; + sumDelta += fabs(delta); + sumRef += fabs(ref); + } - for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { - unsigned int d = (pos + 1) * distance; - ref = MoroInvCNDcpu(d); - delta = (double)h_OutputGPU[pos] - ref; - sumDelta += fabs(delta); - sumRef += fabs(ref); - } + printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); + printf("Shutting down...\n"); - printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); - printf("Shutting down...\n"); + sdkDeleteTimer(&hTimer); + free(h_OutputGPU); - sdkDeleteTimer(&hTimer); - free(h_OutputGPU); + checkCudaErrors(cuMemFree(d_Output)); - checkCudaErrors(cuMemFree(d_Output)); - - exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); + exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_common.h b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_common.h index aad8f0f2..f6e1b124 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_common.h +++ b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_common.h @@ -36,6 +36,6 @@ typedef long long int INT64; #define QRNG_DIMENSIONS 3 #define QRNG_RESOLUTION 31 -#define INT_SCALE (1.0f / (float)0x80000001U) +#define INT_SCALE (1.0f / (float)0x80000001U) #endif diff --git a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gold.cpp b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gold.cpp index 918e7e85..5037967e 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gold.cpp +++ b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gold.cpp @@ -25,8 +25,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include #include +#include #include "quasirandomGenerator_common.h" @@ -37,69 +37,71 @@ // Internal 64(63)-bit table static INT64 cjn[63][QRNG_DIMENSIONS]; -static int GeneratePolynomials(int buffer[QRNG_DIMENSIONS], bool primitive) { - int i, j, n, p1, p2, l; - int e_p1, e_p2, e_b; +static int GeneratePolynomials(int buffer[QRNG_DIMENSIONS], bool primitive) +{ + int i, j, n, p1, p2, l; + int e_p1, e_p2, e_b; - // generate all polynomials to buffer - for (n = 1, buffer[0] = 0x2, p2 = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { - // search for the next irreducible polynomial - for (p1 = buffer[n - 1] + 1;; ++p1) { - // find degree of polynomial p1 - for (e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { - } + // generate all polynomials to buffer + for (n = 1, buffer[0] = 0x2, p2 = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { + // search for the next irreducible polynomial + for (p1 = buffer[n - 1] + 1;; ++p1) { + // find degree of polynomial p1 + for (e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { + } - // try to divide p1 by all polynomials in buffer - for (i = 0; i < n; ++i) { - // find the degree of buffer[i] - for (e_b = e_p1; (buffer[i] & (1 << e_b)) == 0; --e_b) { + // try to divide p1 by all polynomials in buffer + for (i = 0; i < n; ++i) { + // find the degree of buffer[i] + for (e_b = e_p1; (buffer[i] & (1 << e_b)) == 0; --e_b) { + } + + // divide p2 by buffer[i] until the end + for (p2 = (buffer[i] << ((e_p2 = e_p1) - e_b)) ^ p1; p2 >= buffer[i]; + p2 = (buffer[i] << (e_p2 - e_b)) ^ p2) { + for (; (p2 & (1 << e_p2)) == 0; --e_p2) { + } + } // compute new degree of p2 + + // division without remainder!!! p1 is not irreducible + if (p2 == 0) { + break; + } + } + + // all divisions were with remainder - p1 is irreducible + if (p2 != 0) { + e_p2 = 0; + + if (primitive) { + // check that p1 has only one cycle (i.e. is monic, or primitive) + j = ~(0xffffffff << (e_p1 + 1)); + e_b = (1 << e_p1) | 0x1; + + for (p2 = e_b, e_p2 = (1 << e_p1) - 2; e_p2 > 0; --e_p2) { + p2 <<= 1; + i = p2 & p1; + i = (i & 0x55555555) + ((i >> 1) & 0x55555555); + i = (i & 0x33333333) + ((i >> 2) & 0x33333333); + i = (i & 0x07070707) + ((i >> 4) & 0x07070707); + p2 |= (i % 255) & 1; + + if ((p2 & j) == e_b) + break; + } + } + + // it is monic - add it to the list of polynomials + if (e_p2 == 0) { + buffer[n] = p1; + l += e_p1; + break; + } + } } - - // divide p2 by buffer[i] until the end - for (p2 = (buffer[i] << ((e_p2 = e_p1) - e_b)) ^ p1; p2 >= buffer[i]; - p2 = (buffer[i] << (e_p2 - e_b)) ^ p2) { - for (; (p2 & (1 << e_p2)) == 0; --e_p2) { - } - } // compute new degree of p2 - - // division without remainder!!! p1 is not irreducible - if (p2 == 0) { - break; - } - } - - // all divisions were with remainder - p1 is irreducible - if (p2 != 0) { - e_p2 = 0; - - if (primitive) { - // check that p1 has only one cycle (i.e. 
is monic, or primitive) - j = ~(0xffffffff << (e_p1 + 1)); - e_b = (1 << e_p1) | 0x1; - - for (p2 = e_b, e_p2 = (1 << e_p1) - 2; e_p2 > 0; --e_p2) { - p2 <<= 1; - i = p2 & p1; - i = (i & 0x55555555) + ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - i = (i & 0x07070707) + ((i >> 4) & 0x07070707); - p2 |= (i % 255) & 1; - - if ((p2 & j) == e_b) break; - } - } - - // it is monic - add it to the list of polynomials - if (e_p2 == 0) { - buffer[n] = p1; - l += e_p1; - break; - } - } } - } - return l + 1; + return l + 1; } //////////////////////////////////////////////////////////////////////////////// @@ -114,221 +116,224 @@ static int GeneratePolynomials(int buffer[QRNG_DIMENSIONS], bool primitive) { // year = "1992" } //////////////////////////////////////////////////////////////////////////////// -static void GenerateCJ() { - int buffer[QRNG_DIMENSIONS]; - int *polynomials; - int n, p1, l, e_p1; +static void GenerateCJ() +{ + int buffer[QRNG_DIMENSIONS]; + int *polynomials; + int n, p1, l, e_p1; - // Niederreiter (in contrast to Sobol) allows to use not primitive, but just - // irreducible polynomials - l = GeneratePolynomials(buffer, false); + // Niederreiter (in contrast to Sobol) allows to use not primitive, but just + // irreducible polynomials + l = GeneratePolynomials(buffer, false); - // convert all polynomials from buffer to polynomials table - polynomials = new int[l + 2 * QRNG_DIMENSIONS + 1]; + // convert all polynomials from buffer to polynomials table + polynomials = new int[l + 2 * QRNG_DIMENSIONS + 1]; - for (n = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { - // find degree of polynomial p1 - for (p1 = buffer[n], e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { - } - - // fill polynomials table with values for this polynomial - polynomials[l++] = 1; - - for (--e_p1; e_p1 >= 0; --e_p1) { - polynomials[l++] = (p1 >> e_p1) & 1; - } - - polynomials[l++] = -1; - } - - polynomials[l] = -1; - - // irreducible polynomial p - int *p = polynomials, e, d; - - // polynomial b - int b_arr[1024], *b, m; - - // v array - int v_arr[1024], *v; - - // temporary polynomial, required to do multiplication of p and b - int t_arr[1024], *t; - - // subsidiary variables - int i, j, u, m1, ip, it; - - // cycle over monic irreducible polynomials - for (d = 0; p[0] != -1; p += e + 2) { - // allocate memory for cj array for dimension (ip + 1) - for (i = 0; i < 63; ++i) { - cjn[i][d] = 0; - } - - // determine the power of irreducible polynomial - for (e = 0; p[e + 1] != -1; ++e) { - } - - // polynomial b in the beginning is just '1' - (b = b_arr + 1023)[m = 0] = 1; - - // v array needs only (63 + e - 2) length - v = v_arr + 1023 - (63 + e - 2); - - // cycle over all coefficients - for (j = 63 - 1, u = e; j >= 0; --j, ++u) { - if (u == e) { - u = 0; - - // multiply b by p (polynomials multiplication) - for (i = 0, t = t_arr + 1023 - (m1 = m); i <= m; ++i) { - t[i] = b[i]; + for (n = 0, l = 0; n < QRNG_DIMENSIONS; ++n) { + // find degree of polynomial p1 + for (p1 = buffer[n], e_p1 = 30; (p1 & (1 << e_p1)) == 0; --e_p1) { } - b = b_arr + 1023 - (m += e); + // fill polynomials table with values for this polynomial + polynomials[l++] = 1; - for (i = 0; i <= m; ++i) { - b[i] = 0; + for (--e_p1; e_p1 >= 0; --e_p1) { + polynomials[l++] = (p1 >> e_p1) & 1; + } - for (ip = e - (m - i), it = m1; ip <= e && it >= 0; ++ip, --it) { - if (ip >= 0) { - b[i] ^= p[ip] & t[it]; + polynomials[l++] = -1; + } + + polynomials[l] = -1; + + // irreducible polynomial p + int *p = polynomials, e, d; + + // 
polynomial b + int b_arr[1024], *b, m; + + // v array + int v_arr[1024], *v; + + // temporary polynomial, required to do multiplication of p and b + int t_arr[1024], *t; + + // subsidiary variables + int i, j, u, m1, ip, it; + + // cycle over monic irreducible polynomials + for (d = 0; p[0] != -1; p += e + 2) { + // allocate memory for cj array for dimension (ip + 1) + for (i = 0; i < 63; ++i) { + cjn[i][d] = 0; + } + + // determine the power of irreducible polynomial + for (e = 0; p[e + 1] != -1; ++e) { + } + + // polynomial b in the beginning is just '1' + (b = b_arr + 1023)[m = 0] = 1; + + // v array needs only (63 + e - 2) length + v = v_arr + 1023 - (63 + e - 2); + + // cycle over all coefficients + for (j = 63 - 1, u = e; j >= 0; --j, ++u) { + if (u == e) { + u = 0; + + // multiply b by p (polynomials multiplication) + for (i = 0, t = t_arr + 1023 - (m1 = m); i <= m; ++i) { + t[i] = b[i]; + } + + b = b_arr + 1023 - (m += e); + + for (i = 0; i <= m; ++i) { + b[i] = 0; + + for (ip = e - (m - i), it = m1; ip <= e && it >= 0; ++ip, --it) { + if (ip >= 0) { + b[i] ^= p[ip] & t[it]; + } + } + } + + // multiplication of polynomials finished + // calculate v + for (i = 0; i < m1; ++i) { + v[i] = 0; + } + + for (; i < m; ++i) { + v[i] = 1; + } + + for (; i <= 63 + e - 2; ++i) { + v[i] = 0; + for (it = 1; it <= m; ++it) { + v[i] ^= v[i - it] & b[it]; + } + } + } + + // copy calculated v to cj + for (i = 0; i < 63; i++) { + cjn[i][d] |= (INT64)v[i + u] << j; } - } } - // multiplication of polynomials finished - // calculate v - for (i = 0; i < m1; ++i) { - v[i] = 0; - } - - for (; i < m; ++i) { - v[i] = 1; - } - - for (; i <= 63 + e - 2; ++i) { - v[i] = 0; - for (it = 1; it <= m; ++it) { - v[i] ^= v[i - it] & b[it]; - } - } - } - - // copy calculated v to cj - for (i = 0; i < 63; i++) { - cjn[i][d] |= (INT64)v[i + u] << j; - } + ++d; } - ++d; - } - - delete[] polynomials; + delete[] polynomials; } // Generate 63-bit quasirandom number for given index and dimension and // normalize -extern "C" double getQuasirandomValue63(INT64 i, int dim) { - const double INT63_SCALE = (1.0 / (double)0x8000000000000001ULL); - INT64 result = 0; +extern "C" double getQuasirandomValue63(INT64 i, int dim) +{ + const double INT63_SCALE = (1.0 / (double)0x8000000000000001ULL); + INT64 result = 0; - for (int bit = 0; bit < 63; bit++, i >>= 1) - if (i & 1) result ^= cjn[bit][dim]; + for (int bit = 0; bit < 63; bit++, i >>= 1) + if (i & 1) + result ^= cjn[bit][dim]; - return (double)(result + 1) * INT63_SCALE; + return (double)(result + 1) * INT63_SCALE; } //////////////////////////////////////////////////////////////////////////////// // Initialization (table setup) //////////////////////////////////////////////////////////////////////////////// -extern "C" void initQuasirandomGenerator( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]) { - GenerateCJ(); +extern "C" void initQuasirandomGenerator(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION]) +{ + GenerateCJ(); - for (int dim = 0; dim < QRNG_DIMENSIONS; dim++) - for (int bit = 0; bit < QRNG_RESOLUTION; bit++) - table[dim][bit] = (int)((cjn[bit][dim] >> 32) & 0x7FFFFFFF); + for (int dim = 0; dim < QRNG_DIMENSIONS; dim++) + for (int bit = 0; bit < QRNG_RESOLUTION; bit++) + table[dim][bit] = (int)((cjn[bit][dim] >> 32) & 0x7FFFFFFF); } //////////////////////////////////////////////////////////////////////////////// // Generate 31-bit quasirandom number for given index and dimension 
//////////////////////////////////////////////////////////////////////////////// -extern "C" float getQuasirandomValue( - unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim) { - int result = 0; +extern "C" float getQuasirandomValue(unsigned int table[QRNG_DIMENSIONS][QRNG_RESOLUTION], int i, int dim) +{ + int result = 0; - for (int bit = 0; bit < QRNG_RESOLUTION; bit++, i >>= 1) - if (i & 1) result ^= table[dim][bit]; + for (int bit = 0; bit < QRNG_RESOLUTION; bit++, i >>= 1) + if (i & 1) + result ^= table[dim][bit]; - return (float)(result + 1) * INT_SCALE; + return (float)(result + 1) * INT_SCALE; } //////////////////////////////////////////////////////////////////////////////// // Moro's Inverse Cumulative Normal Distribution function approximation //////////////////////////////////////////////////////////////////////////////// -extern "C" double MoroInvCNDcpu(unsigned int x) { - const double a1 = 2.50662823884; - const double a2 = -18.61500062529; - const double a3 = 41.39119773534; - const double a4 = -25.44106049637; - const double b1 = -8.4735109309; - const double b2 = 23.08336743743; - const double b3 = -21.06224101826; - const double b4 = 3.13082909833; - const double c1 = 0.337475482272615; - const double c2 = 0.976169019091719; - const double c3 = 0.160797971491821; - const double c4 = 2.76438810333863E-02; - const double c5 = 3.8405729373609E-03; - const double c6 = 3.951896511919E-04; - const double c7 = 3.21767881768E-05; - const double c8 = 2.888167364E-07; - const double c9 = 3.960315187E-07; +extern "C" double MoroInvCNDcpu(unsigned int x) +{ + const double a1 = 2.50662823884; + const double a2 = -18.61500062529; + const double a3 = 41.39119773534; + const double a4 = -25.44106049637; + const double b1 = -8.4735109309; + const double b2 = 23.08336743743; + const double b3 = -21.06224101826; + const double b4 = 3.13082909833; + const double c1 = 0.337475482272615; + const double c2 = 0.976169019091719; + const double c3 = 0.160797971491821; + const double c4 = 2.76438810333863E-02; + const double c5 = 3.8405729373609E-03; + const double c6 = 3.951896511919E-04; + const double c7 = 3.21767881768E-05; + const double c8 = 2.888167364E-07; + const double c9 = 3.960315187E-07; - double z; + double z; - bool negate = false; + bool negate = false; - // Ensure the conversion to floating point will give a value in the - // range (0,0.5] by restricting the input to the bottom half of the - // input domain. We will later reflect the result if the input was - // originally in the top half of the input domain + // Ensure the conversion to floating point will give a value in the + // range (0,0.5] by restricting the input to the bottom half of the + // input domain. We will later reflect the result if the input was + // originally in the top half of the input domain - if (x >= 0x80000000UL) { - x = 0xffffffffUL - x; - negate = true; - } + if (x >= 0x80000000UL) { + x = 0xffffffffUL - x; + negate = true; + } - // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) - // Convert to floating point in (0,0.5] - const double x1 = 1.0 / static_cast<double>(0xffffffffUL); - const double x2 = x1 / 2.0; - double p1 = x * x1 + x2; + // x is now in the range [0,0x80000000) (i.e. 
[0,0x7fffffff]) + // Convert to floating point in (0,0.5] + const double x1 = 1.0 / static_cast<double>(0xffffffffUL); + const double x2 = x1 / 2.0; + double p1 = x * x1 + x2; - // Convert to floating point in (-0.5,0] - double p2 = p1 - 0.5; + // Convert to floating point in (-0.5,0] + double p2 = p1 - 0.5; - // The input to the Moro inversion is p2 which is in the range - // (-0.5,0]. This means that our output will be the negative side - // of the bell curve (which we will reflect if "negate" is true). + // The input to the Moro inversion is p2 which is in the range + // (-0.5,0]. This means that our output will be the negative side + // of the bell curve (which we will reflect if "negate" is true). - // Main body of the bell curve for |p| < 0.42 - if (p2 > -0.42) { - z = p2 * p2; - z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / - ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0); - } - // Special case (Chebychev) for tail - else { - z = log(-log(p1)); - z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * - (c7 + z * (c8 + z * c9)))))))); - } + // Main body of the bell curve for |p| < 0.42 + if (p2 > -0.42) { + z = p2 * p2; + z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0); + } + // Special case (Chebychev) for tail + else { + z = log(-log(p1)); + z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z * (c8 + z * c9)))))))); + } - // If the original input (x) was in the top half of the range, reflect - // to get the positive side of the bell curve - return negate ? -z : z; + // If the original input (x) was in the top half of the range, reflect + // to get the positive side of the bell curve + return negate ? -z : z; } diff --git a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gpu.cuh b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gpu.cuh index 54474e45..ca100d30 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gpu.cuh +++ b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_gpu.cuh @@ -29,14 +29,15 @@ #define QUASIRANDOMGENERATOR_GPU_CUH #include + #include "quasirandomGenerator_common.h" // Fast integer multiplication #define MUL(a, b) __umul24(a, b) // Global variables for nvrtc outputs -char *cubin; -size_t cubinSize; +char *cubin; +size_t cubinSize; CUmodule module; //////////////////////////////////////////////////////////////////////////////// @@ -48,54 +49,60 @@ CUmodule module; //////////////////////////////////////////////////////////////////////////////// // Table initialization routine -void initTableGPU(unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]) { - CUdeviceptr c_Table; - checkCudaErrors(cuModuleGetGlobal(&c_Table, NULL, module, "c_Table")); - checkCudaErrors( - cuMemcpyHtoD(c_Table, tableCPU, - QRNG_DIMENSIONS * QRNG_RESOLUTION * sizeof(unsigned int))); +void initTableGPU(unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]) +{ + CUdeviceptr c_Table; + checkCudaErrors(cuModuleGetGlobal(&c_Table, NULL, module, "c_Table")); + checkCudaErrors(cuMemcpyHtoD(c_Table, tableCPU, QRNG_DIMENSIONS * QRNG_RESOLUTION * sizeof(unsigned int))); } // Host-side interface -void quasirandomGeneratorGPU(CUdeviceptr d_Output, unsigned int seed, - unsigned int N) { - dim3 threads(128, QRNG_DIMENSIONS); - dim3 cudaGridSize(128, 1, 1); +void quasirandomGeneratorGPU(CUdeviceptr d_Output, unsigned int seed, unsigned int N) +{ + dim3 threads(128, QRNG_DIMENSIONS); + dim3 cudaGridSize(128, 1, 1); - 
CUfunction kernel_addr; - checkCudaErrors( - cuModuleGetFunction(&kernel_addr, module, "quasirandomGeneratorKernel")); + CUfunction kernel_addr; + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "quasirandomGeneratorKernel")); - void *args[] = {(void *)&d_Output, (void *)&seed, (void *)&N}; - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - threads.x, threads.y, - threads.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &args[0], /* arguments */ - 0)); + void *args[] = {(void *)&d_Output, (void *)&seed, (void *)&N}; + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + threads.x, + threads.y, + threads.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &args[0], /* arguments */ + 0)); - checkCudaErrors(cuCtxSynchronize()); + checkCudaErrors(cuCtxSynchronize()); } -void inverseCNDgpu(CUdeviceptr d_Output, unsigned int N) { - dim3 threads(128, 1, 1); - dim3 cudaGridSize(128, 1, 1); +void inverseCNDgpu(CUdeviceptr d_Output, unsigned int N) +{ + dim3 threads(128, 1, 1); + dim3 cudaGridSize(128, 1, 1); - CUfunction kernel_addr; - checkCudaErrors( - cuModuleGetFunction(&kernel_addr, module, "inverseCNDKernel")); + CUfunction kernel_addr; + checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "inverseCNDKernel")); - void *args[] = {(void *)&d_Output, (void *)&N}; - checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, - cudaGridSize.z, /* grid dim */ - threads.x, threads.y, - threads.z, /* block dim */ - 0, 0, /* shared mem, stream */ - &args[0], /* arguments */ - 0)); + void *args[] = {(void *)&d_Output, (void *)&N}; + checkCudaErrors(cuLaunchKernel(kernel_addr, + cudaGridSize.x, + cudaGridSize.y, + cudaGridSize.z, /* grid dim */ + threads.x, + threads.y, + threads.z, /* block dim */ + 0, + 0, /* shared mem, stream */ + &args[0], /* arguments */ + 0)); - checkCudaErrors(cuCtxSynchronize()); + checkCudaErrors(cuCtxSynchronize()); } #endif diff --git a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_kernel.cu b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_kernel.cu index a9fb59a6..3b445c31 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_kernel.cu +++ b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/quasirandomGenerator_kernel.cu @@ -38,90 +38,88 @@ //////////////////////////////////////////////////////////////////////////////// __constant__ unsigned int c_Table[QRNG_DIMENSIONS][QRNG_RESOLUTION]; -extern "C" __global__ void quasirandomGeneratorKernel(float *d_Output, - unsigned int seed, - unsigned int N) { - unsigned int *dimBase = &c_Table[threadIdx.y][0]; - unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; - unsigned int threadN = MUL(blockDim.x, gridDim.x); +extern "C" __global__ void quasirandomGeneratorKernel(float *d_Output, unsigned int seed, unsigned int N) +{ + unsigned int *dimBase = &c_Table[threadIdx.y][0]; + unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; + unsigned int threadN = MUL(blockDim.x, gridDim.x); - for (unsigned int pos = tid; pos < N; pos += threadN) { - unsigned int result = 0; - unsigned int data = seed + pos; + for (unsigned int pos = tid; pos < N; pos += threadN) { + unsigned int result = 0; + unsigned int data = seed + pos; - for (int bit = 0; bit < QRNG_RESOLUTION; bit++, data >>= 1) - if (data & 1) { - result ^= dimBase[bit]; - } + for (int bit = 0; bit < QRNG_RESOLUTION; 
bit++, data >>= 1) + if (data & 1) { + result ^= dimBase[bit]; + } - d_Output[MUL(threadIdx.y, N) + pos] = (float)(result + 1) * INT_SCALE; - } + d_Output[MUL(threadIdx.y, N) + pos] = (float)(result + 1) * INT_SCALE; + } } //////////////////////////////////////////////////////////////////////////////// // Moro's Inverse Cumulative Normal Distribution function approximation //////////////////////////////////////////////////////////////////////////////// -__device__ inline float MoroInvCNDgpu(unsigned int x) { - const float a1 = 2.50662823884f; - const float a2 = -18.61500062529f; - const float a3 = 41.39119773534f; - const float a4 = -25.44106049637f; - const float b1 = -8.4735109309f; - const float b2 = 23.08336743743f; - const float b3 = -21.06224101826f; - const float b4 = 3.13082909833f; - const float c1 = 0.337475482272615f; - const float c2 = 0.976169019091719f; - const float c3 = 0.160797971491821f; - const float c4 = 2.76438810333863E-02f; - const float c5 = 3.8405729373609E-03f; - const float c6 = 3.951896511919E-04f; - const float c7 = 3.21767881768E-05f; - const float c8 = 2.888167364E-07f; - const float c9 = 3.960315187E-07f; +__device__ inline float MoroInvCNDgpu(unsigned int x) +{ + const float a1 = 2.50662823884f; + const float a2 = -18.61500062529f; + const float a3 = 41.39119773534f; + const float a4 = -25.44106049637f; + const float b1 = -8.4735109309f; + const float b2 = 23.08336743743f; + const float b3 = -21.06224101826f; + const float b4 = 3.13082909833f; + const float c1 = 0.337475482272615f; + const float c2 = 0.976169019091719f; + const float c3 = 0.160797971491821f; + const float c4 = 2.76438810333863E-02f; + const float c5 = 3.8405729373609E-03f; + const float c6 = 3.951896511919E-04f; + const float c7 = 3.21767881768E-05f; + const float c8 = 2.888167364E-07f; + const float c9 = 3.960315187E-07f; - float z; + float z; - bool negate = false; + bool negate = false; - // Ensure the conversion to floating point will give a value in the - // range (0,0.5] by restricting the input to the bottom half of the - // input domain. We will later reflect the result if the input was - // originally in the top half of the input domain - if (x >= 0x80000000UL) { - x = 0xffffffffUL - x; - negate = true; - } + // Ensure the conversion to floating point will give a value in the + // range (0,0.5] by restricting the input to the bottom half of the + // input domain. We will later reflect the result if the input was + // originally in the top half of the input domain + if (x >= 0x80000000UL) { + x = 0xffffffffUL - x; + negate = true; + } - // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) - // Convert to floating point in (0,0.5] - const float x1 = 1.0f / static_cast<float>(0xffffffffUL); - const float x2 = x1 / 2.0f; - float p1 = x * x1 + x2; - // Convert to floating point in (-0.5,0] - float p2 = p1 - 0.5f; + // x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff]) + // Convert to floating point in (0,0.5] + const float x1 = 1.0f / static_cast<float>(0xffffffffUL); + const float x2 = x1 / 2.0f; + float p1 = x * x1 + x2; + // Convert to floating point in (-0.5,0] + float p2 = p1 - 0.5f; - // The input to the Moro inversion is p2 which is in the range - // (-0.5,0]. This means that our output will be the negative side - // of the bell curve (which we will reflect if "negate" is true). + // The input to the Moro inversion is p2 which is in the range + // (-0.5,0]. This means that our output will be the negative side + // of the bell curve (which we will reflect if "negate" is true). 
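
One detail of this patch worth a worked check: when no input sequence is supplied, the inverseCNDKernel variants (one appears just below) generate their own uniform samples as (pos + 1) * distance with distance = UINT_MAX / (pathN + 1), so every sample lands strictly inside (0, UINT_MAX) and the Moro inversion never sees a degenerate endpoint. A small standalone C++ sketch of that spacing rule, with pathN chosen arbitrarily for illustration:

#include <cassert>
#include <cstdint>

int main()
{
    const unsigned int pathN = 1000u;
    // Same spacing rule as inverseCNDKernel: UINT_MAX / (pathN + 1).
    const unsigned int distance = static_cast<unsigned int>(-1) / (pathN + 1);

    for (unsigned int pos = 0; pos < pathN; ++pos) {
        const unsigned int d = (pos + 1) * distance;
        assert(d > 0u); // never the 0 endpoint
    }
    // The largest sample, pathN * distance, stays below UINT_MAX, so the
    // multiplication never wraps and the upper endpoint is never produced.
    assert(static_cast<uint64_t>(pathN) * distance < 0xffffffffULL);
    return 0;
}
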
- // Main body of the bell curve for |p| < 0.42 - if (p2 > -0.42f) { - z = p2 * p2; - z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / - ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0f); - } - // Special case (Chebychev) for tail - else { - z = __logf(-__logf(p1)); - z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * - (c7 + z * (c8 + z * c9)))))))); - } + // Main body of the bell curve for |p| < 0.42 + if (p2 > -0.42f) { + z = p2 * p2; + z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0f); + } + // Special case (Chebychev) for tail + else { + z = __logf(-__logf(p1)); + z = -(c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z * (c8 + z * c9)))))))); + } - // If the original input (x) was in the top half of the range, reflect - // to get the positive side of the bell curve + // If the original input (x) was in the top half of the range, reflect + // to get the positive side of the bell curve - return negate ? -z : z; + return negate ? -z : z; } //////////////////////////////////////////////////////////////////////////////// @@ -129,31 +127,31 @@ __device__ inline float MoroInvCNDgpu(unsigned int x) { // input sequence and uniform ascending (0, 1) sequence //////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ void inverseCNDKernel(float *d_Output, - unsigned int pathN) { - unsigned int distance = ((unsigned int)-1) / (pathN + 1); - unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; - unsigned int threadN = MUL(blockDim.x, gridDim.x); +extern "C" __global__ void inverseCNDKernel(float *d_Output, unsigned int pathN) +{ + unsigned int distance = ((unsigned int)-1) / (pathN + 1); + unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x; + unsigned int threadN = MUL(blockDim.x, gridDim.x); - // Transform input number sequence if it's supplied - if (0) // d_Input) - { - /* - for (unsigned int pos = tid; pos < pathN; pos += threadN) - { - unsigned int d = d_Input[pos]; - d_Output[pos] = (float)MoroInvCNDgpu(d); - } - */ - } - // Else generate input uniformly placed samples on the fly - // and write to destination - else { - for (unsigned int pos = tid; pos < pathN; pos += threadN) { - unsigned int d = (pos + 1) * distance; - d_Output[pos] = (float)MoroInvCNDgpu(d); + // Transform input number sequence if it's supplied + if (0) // d_Input) + { + /* + for (unsigned int pos = tid; pos < pathN; pos += threadN) + { + unsigned int d = d_Input[pos]; + d_Output[pos] = (float)MoroInvCNDgpu(d); + } + */ + } + // Else generate input uniformly placed samples on the fly + // and write to destination + else { + for (unsigned int pos = tid; pos < pathN; pos += threadN) { + unsigned int d = (pos + 1) * distance; + d_Output[pos] = (float)MoroInvCNDgpu(d); + } } - } } #endif diff --git a/Samples/5_Domain_Specific/recursiveGaussian/README.md b/Samples/5_Domain_Specific/recursiveGaussian/README.md index 6c9dc6dc..6600f99b 100644 --- a/Samples/5_Domain_Specific/recursiveGaussian/README.md +++ b/Samples/5_Domain_Specific/recursiveGaussian/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian.cpp b/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian.cpp index 42c0db3b..ee121299 100644 --- a/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian.cpp +++ b/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian.cpp @@ -64,472 +64,467 @@ #endif // CUDA includes and interop headers -#include #include +#include // CUDA utilities and system includes +#include // includes cuda.h and cuda_runtime_api.h #include -#include // includes cuda.h and cuda_runtime_api.h // Includes -#include -#include -#include #include +#include +#include +#include #define MAX(a, b) ((a > b) ? a : b) #define USE_SIMPLE_FILTER 0 #define MAX_EPSILON_ERROR 5.0f -#define THRESHOLD 0.15f +#define THRESHOLD 0.15f // Define the files that are to be save and the reference images for validation -const char *sOriginal[] = {"teapot512_10.ppm", "teapot512_14.ppm", "teapot512_18.ppm", - "teapot512_22.ppm", NULL}; +const char *sOriginal[] = {"teapot512_10.ppm", "teapot512_14.ppm", "teapot512_18.ppm", "teapot512_22.ppm", NULL}; -const char *sReference[] = {"ref_10.ppm", "ref_14.ppm", "ref_18.ppm", - "ref_22.ppm", NULL}; +const char *sReference[] = {"ref_10.ppm", "ref_14.ppm", "ref_18.ppm", "ref_22.ppm", NULL}; const char *image_filename = "teapot512.ppm"; -float sigma = 10.0f; -int order = 0; -int nthreads = 64; // number of threads per block +float sigma = 10.0f; +int order = 0; +int nthreads = 64; // number of threads per block -unsigned int width, height; -unsigned int *h_img = NULL; -unsigned int *d_img = NULL; +unsigned int width, height; +unsigned int *h_img = NULL; +unsigned int *d_img = NULL; unsigned int *d_temp = NULL; -GLuint pbo = 0; // OpenGL pixel buffer object -GLuint texid = 0; // texture +GLuint pbo = 0; // OpenGL pixel buffer object +GLuint texid = 0; // texture cudaGraphicsResource_t cuda_vbo_resource; StopWatchInterface *timer = 0; // Auto-Verification Code -const int frameCheckNumber = 4; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -unsigned int frameCount = 0; +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +unsigned int frameCount = 0; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; bool runBenchmark = false; const char *sSDKsample = "CUDA Recursive Gaussian"; -extern "C" void transpose(unsigned int *d_src, unsigned int *d_dest, - unsigned int width, int height); +extern "C" void transpose(unsigned int *d_src, unsigned int *d_dest, unsigned int width, int height); -extern "C" void gaussianFilterRGBA(unsigned int *d_src, unsigned int *d_dest, - unsigned int *d_temp, int width, int height, - float sigma, int order, int nthreads); +extern "C" void gaussianFilterRGBA(unsigned int *d_src, + unsigned int *d_dest, + unsigned int *d_temp, + int width, + int height, + float sigma, + int order, + int nthreads); void cleanup(); -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + 
glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = ftoi(MAX(ifps, 1.f)); - sdkResetTimer(&timer); - } + fpsLimit = ftoi(MAX(ifps, 1.f)); + sdkResetTimer(&timer); + } } // display results using OpenGL -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // execute filter, writing results to pbo - unsigned int *d_result; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_result, &num_bytes, cuda_vbo_resource)); - gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, - nthreads); + // execute filter, writing results to pbo + unsigned int *d_result; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_vbo_resource)); + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); - // unmap buffer object - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); + // unmap buffer object + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); - // load texture from pbo - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBindTexture(GL_TEXTURE_2D, texid); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, - GL_UNSIGNED_BYTE, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // load texture from pbo + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, texid); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // display results - glClear(GL_COLOR_BUFFER_BIT); + // display results + glClear(GL_COLOR_BUFFER_BIT); - glEnable(GL_TEXTURE_2D); - glDisable(GL_DEPTH_TEST); + glEnable(GL_TEXTURE_2D); + glDisable(GL_DEPTH_TEST); - glBegin(GL_QUADS); - glTexCoord2f(0, 1); - glVertex2f(0, 0); - glTexCoord2f(1, 1); - glVertex2f(1, 0); - glTexCoord2f(1, 0); - glVertex2f(1, 1); - glTexCoord2f(0, 0); - glVertex2f(0, 1); - glEnd(); + glBegin(GL_QUADS); + glTexCoord2f(0, 1); + glVertex2f(0, 0); + glTexCoord2f(1, 1); + glVertex2f(1, 0); + glTexCoord2f(1, 0); + glVertex2f(1, 1); + glTexCoord2f(0, 0); + glVertex2f(0, 1); + glEnd(); - glDisable(GL_TEXTURE_2D); - glutSwapBuffers(); + glDisable(GL_TEXTURE_2D); + glutSwapBuffers(); - sdkStopTimer(&timer); + sdkStopTimer(&timer); - computeFPS(); + computeFPS(); } void idle() { glutPostRedisplay(); } -void cleanup() { - sdkDeleteTimer(&timer); +void cleanup() +{ + sdkDeleteTimer(&timer); - checkCudaErrors(cudaFree(d_img)); - checkCudaErrors(cudaFree(d_temp)); + checkCudaErrors(cudaFree(d_img)); + checkCudaErrors(cudaFree(d_temp)); - if (!runBenchmark) { - if (pbo) { - // unregister this buffer object with CUDA - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); - glDeleteBuffers(1, &pbo); + if (!runBenchmark) { + if (pbo) { + // unregister this buffer object with CUDA + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); + glDeleteBuffers(1, &pbo); + } + + if (texid) { + glDeleteTextures(1, &texid); + } } - - if (texid) { - glDeleteTextures(1, &texid); - } - } } -void keyboard(unsigned char key, int x, int y) { - switch (key) { +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + 
exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case '=': - sigma += 0.1f; - break; + sigma += 0.1f; + break; case '-': - sigma -= 0.1f; + sigma -= 0.1f; - if (sigma < 0.0) { - sigma = 0.0f; - } + if (sigma < 0.0) { + sigma = 0.0f; + } - break; + break; case '+': - sigma += 1.0f; - break; + sigma += 1.0f; + break; case '_': - sigma -= 1.0f; + sigma -= 1.0f; - if (sigma < 0.0) { - sigma = 0.0f; - } + if (sigma < 0.0) { + sigma = 0.0f; + } - break; + break; case '0': - order = 0; - break; + order = 0; + break; case '1': - order = 1; - sigma = 0.5f; - break; + order = 1; + sigma = 0.5f; + break; case '2': - order = 2; - sigma = 0.5f; - break; + order = 2; + sigma = 0.5f; + break; default: - break; - } + break; + } - printf("sigma = %f\n", sigma); - glutPostRedisplay(); + printf("sigma = %f\n", sigma); + glutPostRedisplay(); } -void reshape(int x, int y) { - glViewport(0, 0, x, y); +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } -void initCudaBuffers() { - unsigned int size = width * height * sizeof(unsigned int); +void initCudaBuffers() +{ + unsigned int size = width * height * sizeof(unsigned int); - // allocate device memory - checkCudaErrors(cudaMalloc((void **)&d_img, size)); - checkCudaErrors(cudaMalloc((void **)&d_temp, size)); + // allocate device memory + checkCudaErrors(cudaMalloc((void **)&d_img, size)); + checkCudaErrors(cudaMalloc((void **)&d_temp, size)); - checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); - sdkCreateTimer(&timer); + sdkCreateTimer(&timer); } -void initGLBuffers() { - // create pixel buffer object to store final image - glGenBuffers(1, &pbo); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, - h_img, GL_STREAM_DRAW_ARB); +void initGLBuffers() +{ + // create pixel buffer object to store final image + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, h_img, GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_vbo_resource, pbo, cudaGraphicsRegisterFlagsWriteDiscard)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, pbo, cudaGraphicsRegisterFlagsWriteDiscard)); - // create texture for display - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); + // create texture for display + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + 
glBindTexture(GL_TEXTURE_2D, 0); } -void initGL(int *argc, char **argv) { - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); - glutInitWindowSize(width, height); - glutCreateWindow(sSDKsample); - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutReshapeFunc(reshape); - glutIdleFunc(idle); +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow(sSDKsample); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutIdleFunc(idle); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - printf("Press '+' and '-' to change filter width\n"); - printf("0, 1, 2 - change filter order\n"); + printf("Press '+' and '-' to change filter width\n"); + printf("0, 1, 2 - change filter order\n"); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported( - "GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { - fprintf(stderr, "Required OpenGL extensions missing."); - exit(EXIT_FAILURE); - } + if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(EXIT_FAILURE); + } } -void benchmark(int iterations) { - // allocate memory for result - unsigned int *d_result; - unsigned int size = width * height * sizeof(unsigned int); - checkCudaErrors(cudaMalloc((void **)&d_result, size)); +void benchmark(int iterations) +{ + // allocate memory for result + unsigned int *d_result; + unsigned int size = width * height * sizeof(unsigned int); + checkCudaErrors(cudaMalloc((void **)&d_result, size)); - // warm-up - gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, - nthreads); + // warm-up + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStartTimer(&timer); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); - // execute the kernel - for (int i = 0; i < iterations; i++) { - gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, - nthreads); - } + // execute the kernel + for (int i = 0; i < iterations; i++) { + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&timer); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer); - // check if kernel execution generated an error - getLastCudaError("Kernel execution failed"); + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - printf("%.2f Mpixels/sec\n", - (width * height * iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / - 1e6); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height * iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); - checkCudaErrors(cudaFree(d_result)); + checkCudaErrors(cudaFree(d_result)); } -bool runSingleTest(const char *ref_file, const char *exec_path) { - // allocate memory for result - int nTotalErrors = 0; - unsigned int *d_result; - unsigned int size = width * height * sizeof(unsigned int); - checkCudaErrors(cudaMalloc((void **)&d_result, size)); +bool runSingleTest(const char *ref_file, const char *exec_path) +{ + // 
allocate memory for result + int nTotalErrors = 0; + unsigned int *d_result; + unsigned int size = width * height * sizeof(unsigned int); + checkCudaErrors(cudaMalloc((void **)&d_result, size)); - // warm-up - gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, - nthreads); + // warm-up + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); - checkCudaErrors(cudaDeviceSynchronize()); - sdkStartTimer(&timer); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); - gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, - nthreads); - checkCudaErrors(cudaDeviceSynchronize()); - getLastCudaError("Kernel execution failed"); - sdkStopTimer(&timer); + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("Kernel execution failed"); + sdkStopTimer(&timer); - unsigned char *h_result = (unsigned char *)malloc(width * height * 4); - checkCudaErrors(cudaMemcpy(h_result, d_result, width * height * 4, - cudaMemcpyDeviceToHost)); + unsigned char *h_result = (unsigned char *)malloc(width * height * 4); + checkCudaErrors(cudaMemcpy(h_result, d_result, width * height * 4, cudaMemcpyDeviceToHost)); - char dump_file[1024]; - sprintf(dump_file, "teapot512_%02d.ppm", (int)sigma); - sdkSavePPM4ub(dump_file, h_result, width, height); + char dump_file[1024]; + sprintf(dump_file, "teapot512_%02d.ppm", (int)sigma); + sdkSavePPM4ub(dump_file, h_result, width, height); - if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), - MAX_EPSILON_ERROR, THRESHOLD, false)) { - nTotalErrors++; - } + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { + nTotalErrors++; + } - printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); - printf("%.2f Mpixels/sec\n", - (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); - checkCudaErrors(cudaFree(d_result)); - free(h_result); + checkCudaErrors(cudaFree(d_result)); + free(h_result); - printf("Summary: %d errors!\n", nTotalErrors); + printf("Summary: %d errors!\n", nTotalErrors); - printf(nTotalErrors == 0 ? "Test passed\n" : "Test failed!\n"); - return (nTotalErrors == 0); + printf(nTotalErrors == 0 ? "Test passed\n" : "Test failed!\n"); + return (nTotalErrors == 0); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; - char *ref_file = NULL; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; + char *ref_file = NULL; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("%s Starting...\n\n", sSDKsample); + printf("%s Starting...\n\n", sSDKsample); - printf( - "NOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n\n"); + printf("NOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - fpsLimit = frameCheckNumber; + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + fpsLimit = frameCheckNumber; + } } - } - // Get the path of the filename - char *filename; + // Get the path of the filename + char *filename; - if (getCmdLineArgumentString(argc, (const char **)argv, "image", &filename)) { - image_filename = filename; - } + if (getCmdLineArgumentString(argc, (const char **)argv, "image", &filename)) { + image_filename = filename; + } - // load image - char *image_path = sdkFindFilePath(image_filename, argv[0]); + // load image + char *image_path = sdkFindFilePath(image_filename, argv[0]); - if (image_path == NULL) { - fprintf(stderr, "Error unable to find and load image file: '%s'\n", - image_filename); - exit(EXIT_FAILURE); - } + if (image_path == NULL) { + fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); + exit(EXIT_FAILURE); + } - sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); + sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); - if (!h_img) { - printf("Error unable to load PPM file: '%s'\n", image_path); - exit(EXIT_FAILURE); - } + if (!h_img) { + printf("Error unable to load PPM file: '%s'\n", image_path); + exit(EXIT_FAILURE); + } - printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); + printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); - if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { - nthreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { + nthreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { - sigma = getCmdLineArgumentFloat(argc, (const char **)argv, "sigma"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { + sigma = getCmdLineArgumentFloat(argc, (const char **)argv, "sigma"); + } - runBenchmark = checkCmdLineFlag(argc, (const char **)argv, "benchmark"); + runBenchmark = checkCmdLineFlag(argc, (const char **)argv, "benchmark"); - int device; - struct cudaDeviceProp prop; - cudaGetDevice(&device); - cudaGetDeviceProperties(&prop, device); + int device; + struct cudaDeviceProp prop; + cudaGetDevice(&device); + cudaGetDeviceProperties(&prop, device); - if (!strncmp("Tesla", prop.name, 5)) { - printf( - "Tesla card detected, running the test in benchmark mode (no OpenGL " - "display)\n"); - // runBenchmark = true; - runBenchmark = true; - } + if (!strncmp("Tesla", prop.name, 5)) { + printf("Tesla card detected, running the test in benchmark mode (no OpenGL " + "display)\n"); + // runBenchmark = true; + runBenchmark = true; + } - // Benchmark or AutoTest mode detected, no OpenGL - if (runBenchmark == true || ref_file != NULL) { - findCudaDevice(argc, (const char **)argv); - } else { - // First initialize OpenGL context, and then select CUDA device. 
- initGL(&argc, argv); - findCudaDevice(argc, (const char **)argv); - } + // Benchmark or AutoTest mode detected, no OpenGL + if (runBenchmark == true || ref_file != NULL) { + findCudaDevice(argc, (const char **)argv); + } + else { + // First initialize OpenGL context, and then select CUDA device. + initGL(&argc, argv); + findCudaDevice(argc, (const char **)argv); + } - initCudaBuffers(); + initCudaBuffers(); - if (ref_file) { - printf("(Automated Testing)\n"); - bool testPassed = runSingleTest(ref_file, argv[0]); + if (ref_file) { + printf("(Automated Testing)\n"); + bool testPassed = runSingleTest(ref_file, argv[0]); - cleanup(); - exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); - } + cleanup(); + exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); + } - if (runBenchmark) { - printf("(Run Benchmark)\n"); - benchmark(100); + if (runBenchmark) { + printf("(Run Benchmark)\n"); + benchmark(100); + + cleanup(); + exit(EXIT_SUCCESS); + } + + initGLBuffers(); + glutMainLoop(); - cleanup(); exit(EXIT_SUCCESS); - } - - initGLBuffers(); - glutMainLoop(); - - exit(EXIT_SUCCESS); } diff --git a/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_cuda.cu b/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_cuda.cu index 88be07e9..b526b886 100644 --- a/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_cuda.cu +++ b/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_cuda.cu @@ -49,13 +49,12 @@ Thanks to David Tschumperlé and all the CImg contributors! */ -#include -#include -#include - #include #include #include +#include +#include +#include #include "recursiveGaussian_kernel.cuh" @@ -67,11 +66,12 @@ int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } /* Transpose a 2D array (see SDK transpose example) */ -extern "C" void transpose(uint *d_src, uint *d_dest, uint width, int height) { - dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1); - dim3 threads(BLOCK_DIM, BLOCK_DIM, 1); - d_transpose<<<grid, threads>>>(d_dest, d_src, width, height); - getLastCudaError("Kernel execution failed"); +extern "C" void transpose(uint *d_src, uint *d_dest, uint width, int height) +{ + dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1); + dim3 threads(BLOCK_DIM, BLOCK_DIM, 1); + d_transpose<<<grid, threads>>>(d_dest, d_src, width, height); + getLastCudaError("Kernel execution failed"); } /* @@ -88,73 +88,68 @@ extern "C" void transpose(uint *d_src, uint *d_dest, uint width, int height) { */ // 8-bit RGBA version -extern "C" void gaussianFilterRGBA(uint *d_src, uint *d_dest, uint *d_temp, - int width, int height, float sigma, - int order, int nthreads) { - // compute filter coefficients - const float nsigma = sigma < 0.1f ? 0.1f : sigma, alpha = 1.695f / nsigma, - ema = (float)std::exp(-alpha), ema2 = (float)std::exp(-2 * alpha), - b1 = -2 * ema, b2 = ema2; +extern "C" void +gaussianFilterRGBA(uint *d_src, uint *d_dest, uint *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + // compute filter coefficients + const float nsigma = sigma < 0.1f ?
0.1f : sigma, alpha = 1.695f / nsigma, ema = (float)std::exp(-alpha), + ema2 = (float)std::exp(-2 * alpha), b1 = -2 * ema, b2 = ema2; - float a0 = 0, a1 = 0, a2 = 0, a3 = 0, coefp = 0, coefn = 0; + float a0 = 0, a1 = 0, a2 = 0, a3 = 0, coefp = 0, coefn = 0; - switch (order) { + switch (order) { case 0: { - const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2); - a0 = k; - a1 = k * (alpha - 1) * ema; - a2 = k * (alpha + 1) * ema; - a3 = -k * ema2; + const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2); + a0 = k; + a1 = k * (alpha - 1) * ema; + a2 = k * (alpha + 1) * ema; + a3 = -k * ema2; } break; case 1: { - const float k = (1 - ema) * (1 - ema) / ema; - a0 = k * ema; - a1 = a3 = 0; - a2 = -a0; + const float k = (1 - ema) * (1 - ema) / ema; + a0 = k * ema; + a1 = a3 = 0; + a2 = -a0; } break; case 2: { - const float ea = (float)std::exp(-alpha), - k = -(ema2 - 1) / (2 * alpha * ema), - kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) / - (3 * ea + 1 + 3 * ea * ea + ea * ea * ea)); - a0 = kn; - a1 = -kn * (1 + k * alpha) * ema; - a2 = kn * (1 - k * alpha) * ema; - a3 = -kn * ema2; + const float ea = (float)std::exp(-alpha), k = -(ema2 - 1) / (2 * alpha * ema), + kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) / (3 * ea + 1 + 3 * ea * ea + ea * ea * ea)); + a0 = kn; + a1 = -kn * (1 + k * alpha) * ema; + a2 = kn * (1 - k * alpha) * ema; + a3 = -kn * ema2; } break; default: - fprintf(stderr, "gaussianFilter: invalid order parameter!\n"); - return; - } + fprintf(stderr, "gaussianFilter: invalid order parameter!\n"); + return; + } - coefp = (a0 + a1) / (1 + b1 + b2); - coefn = (a2 + a3) / (1 + b1 + b2); + coefp = (a0 + a1) / (1 + b1 + b2); + coefn = (a2 + a3) / (1 + b1 + b2); // process columns #if USE_SIMPLE_FILTER - d_simpleRecursive_rgba<<<iDivUp(width, nthreads), nthreads>>>( - d_src, d_temp, width, height, ema); + d_simpleRecursive_rgba<<<iDivUp(width, nthreads), nthreads>>>(d_src, d_temp, width, height, ema); #else - d_recursiveGaussian_rgba<<<iDivUp(width, nthreads), nthreads>>>( - d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); + d_recursiveGaussian_rgba<<<iDivUp(width, nthreads), nthreads>>>( + d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); #endif - getLastCudaError("Kernel execution failed"); + getLastCudaError("Kernel execution failed"); - transpose(d_temp, d_dest, width, height); - getLastCudaError("transpose: Kernel execution failed"); + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); // process rows #if USE_SIMPLE_FILTER - d_simpleRecursive_rgba<<<iDivUp(height, nthreads), nthreads>>>( - d_dest, d_temp, height, width, ema); + d_simpleRecursive_rgba<<<iDivUp(height, nthreads), nthreads>>>(d_dest, d_temp, height, width, ema); #else - d_recursiveGaussian_rgba<<<iDivUp(height, nthreads), nthreads>>>( - d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); + d_recursiveGaussian_rgba<<<iDivUp(height, nthreads), nthreads>>>( + d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); #endif - getLastCudaError("Kernel execution failed"); + getLastCudaError("Kernel execution failed"); - transpose(d_temp, d_dest, height, width); + transpose(d_temp, d_dest, height, width); } diff --git a/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_kernel.cuh b/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_kernel.cuh index f49bf0e4..9136b302 100644 --- a/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_kernel.cuh +++ b/Samples/5_Domain_Specific/recursiveGaussian/recursiveGaussian_kernel.cuh @@ -32,68 +32,70 @@ #ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_ #define _RECURSIVEGAUSSIAN_KERNEL_CU_ -#include -#include -#include #include +#include +#include +#include namespace cg =
cooperative_groups; #include #include -#define BLOCK_DIM 16 +#define BLOCK_DIM 16 #define CLAMP_TO_EDGE 1 // Transpose kernel (see transpose CUDA Sample for details) -__global__ void d_transpose(uint *odata, uint *idata, int width, int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); +__global__ void d_transpose(uint *odata, uint *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); - __shared__ uint block[BLOCK_DIM][BLOCK_DIM + 1]; + __shared__ uint block[BLOCK_DIM][BLOCK_DIM + 1]; - // read the matrix tile into shared memory - unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x; - unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y; + // read the matrix tile into shared memory + unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x; + unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y; - if ((xIndex < width) && (yIndex < height)) { - unsigned int index_in = yIndex * width + xIndex; - block[threadIdx.y][threadIdx.x] = idata[index_in]; - } + if ((xIndex < width) && (yIndex < height)) { + unsigned int index_in = yIndex * width + xIndex; + block[threadIdx.y][threadIdx.x] = idata[index_in]; + } - cg::sync(cta); + cg::sync(cta); - // write the transposed matrix tile to global memory - xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x; - yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y; + // write the transposed matrix tile to global memory + xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x; + yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y; - if ((xIndex < height) && (yIndex < width)) { - unsigned int index_out = yIndex * height + xIndex; - odata[index_out] = block[threadIdx.x][threadIdx.y]; - } + if ((xIndex < height) && (yIndex < width)) { + unsigned int index_out = yIndex * height + xIndex; + odata[index_out] = block[threadIdx.x][threadIdx.y]; + } } // RGBA version // reads from 32-bit uint array holding 8-bit RGBA // convert floating point rgba color to 32-bit integer -__device__ uint rgbaFloatToInt(float4 rgba) { - rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] - rgba.y = __saturatef(rgba.y); - rgba.z = __saturatef(rgba.z); - rgba.w = __saturatef(rgba.w); - return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | - (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); } // convert from 32-bit int to float4 -__device__ float4 rgbaIntToFloat(uint c) { - float4 rgba; - rgba.x = (c & 0xff) / 255.0f; - rgba.y = ((c >> 8) & 0xff) / 255.0f; - rgba.z = ((c >> 16) & 0xff) / 255.0f; - rgba.w = ((c >> 24) & 0xff) / 255.0f; - return rgba; +__device__ float4 rgbaIntToFloat(uint c) +{ + float4 rgba; + rgba.x = (c & 0xff) / 255.0f; + rgba.y = ((c >> 8) & 0xff) / 255.0f; + rgba.z = ((c >> 16) & 0xff) / 255.0f; + rgba.w = ((c >> 24) & 0xff) / 255.0f; + return rgba; } /* @@ -108,44 +110,44 @@ __device__ float4 rgbaIntToFloat(uint c) { a - blur parameter */ -__global__ void d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, - float a) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x >= w) return; + if (x 
>= w) + return; - id += x; // advance pointers to correct column - od += x; + id += x; // advance pointers to correct column + od += x; - // forward pass - float4 yp = rgbaIntToFloat(*id); // previous output + // forward pass + float4 yp = rgbaIntToFloat(*id); // previous output - for (int y = 0; y < h; y++) { - float4 xc = rgbaIntToFloat(*id); - float4 yc = - xc + a * (yp - xc); // simple lerp between current and previous value - *od = rgbaFloatToInt(yc); - id += w; - od += w; // move to next row - yp = yc; - } + for (int y = 0; y < h; y++) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a * (yp - xc); // simple lerp between current and previous value + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + yp = yc; + } - // reset pointers to point to last element in column - id -= w; - od -= w; - - // reverse pass - // ensures response is symmetrical - yp = rgbaIntToFloat(*id); - - for (int y = h - 1; y >= 0; y--) { - float4 xc = rgbaIntToFloat(*id); - float4 yc = xc + a * (yp - xc); - *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc) * 0.5f); + // reset pointers to point to last element in column id -= w; - od -= w; // move to previous row - yp = yc; - } + od -= w; + + // reverse pass + // ensures response is symmetrical + yp = rgbaIntToFloat(*id); + + for (int y = h - 1; y >= 0; y--) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a * (yp - xc); + *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc) * 0.5f); + id -= w; + od -= w; // move to previous row + yp = yc; + } } /* @@ -159,65 +161,75 @@ __global__ void d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, a0-a3, b1, b2, coefp, coefn - filter parameters */ -__global__ void d_recursiveGaussian_rgba(uint *id, uint *od, int w, int h, - float a0, float a1, float a2, float a3, - float b1, float b2, float coefp, - float coefn) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void d_recursiveGaussian_rgba(uint *id, + uint *od, + int w, + int h, + float a0, + float a1, + float a2, + float a3, + float b1, + float b2, + float coefp, + float coefn) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x >= w) return; + if (x >= w) + return; - id += x; // advance pointers to correct column - od += x; + id += x; // advance pointers to correct column + od += x; - // forward pass - float4 xp = make_float4(0.0f); // previous input - float4 yp = make_float4(0.0f); // previous output - float4 yb = make_float4(0.0f); // previous output by 2 + // forward pass + float4 xp = make_float4(0.0f); // previous input + float4 yp = make_float4(0.0f); // previous output + float4 yb = make_float4(0.0f); // previous output by 2 #if CLAMP_TO_EDGE - xp = rgbaIntToFloat(*id); - yb = coefp * xp; - yp = yb; + xp = rgbaIntToFloat(*id); + yb = coefp * xp; + yp = yb; #endif - for (int y = 0; y < h; y++) { - float4 xc = rgbaIntToFloat(*id); - float4 yc = a0 * xc + a1 * xp - b1 * yp - b2 * yb; - *od = rgbaFloatToInt(yc); - id += w; - od += w; // move to next row - xp = xc; - yb = yp; - yp = yc; - } + for (int y = 0; y < h; y++) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a0 * xc + a1 * xp - b1 * yp - b2 * yb; + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } - // reset pointers to point to last element in column - id -= w; - od -= w; - - // reverse pass - // ensures response is symmetrical - float4 xn = make_float4(0.0f); - float4 xa = make_float4(0.0f); - float4 yn = make_float4(0.0f); - float4 ya = make_float4(0.0f); -#if CLAMP_TO_EDGE - xn = 
xa = rgbaIntToFloat(*id); - yn = coefn * xn; - ya = yn; -#endif - - for (int y = h - 1; y >= 0; y--) { - float4 xc = rgbaIntToFloat(*id); - float4 yc = a2 * xn + a3 * xa - b1 * yn - b2 * ya; - xa = xn; - xn = xc; - ya = yn; - yn = yc; - *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc); + // reset pointers to point to last element in column id -= w; - od -= w; // move to previous row - } + od -= w; + + // reverse pass + // ensures response is symmetrical + float4 xn = make_float4(0.0f); + float4 xa = make_float4(0.0f); + float4 yn = make_float4(0.0f); + float4 ya = make_float4(0.0f); +#if CLAMP_TO_EDGE + xn = xa = rgbaIntToFloat(*id); + yn = coefn * xn; + ya = yn; +#endif + + for (int y = h - 1; y >= 0; y--) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a2 * xn + a3 * xa - b1 * yn - b2 * ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc); + id -= w; + od -= w; // move to previous row + } } -#endif // #ifndef _GAUSSIAN_KERNEL_H_ +#endif // #ifndef _GAUSSIAN_KERNEL_H_ diff --git a/Samples/5_Domain_Specific/simpleD3D11/README.md b/Samples/5_Domain_Specific/simpleD3D11/README.md index a3c7b2c6..efa4af24 100644 --- a/Samples/5_Domain_Specific/simpleD3D11/README.md +++ b/Samples/5_Domain_Specific/simpleD3D11/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/5_Domain_Specific/simpleD3D11/ShaderStructs.h b/Samples/5_Domain_Specific/simpleD3D11/ShaderStructs.h index de87055c..1e5caae7 100644 --- a/Samples/5_Domain_Specific/simpleD3D11/ShaderStructs.h +++ b/Samples/5_Domain_Specific/simpleD3D11/ShaderStructs.h @@ -27,17 +27,22 @@ #pragma once -//#include "stdafx.h" -#include +// #include "stdafx.h" #include +#include + #include "helper_cuda.h" using namespace DirectX; struct Vertex { - XMFLOAT3 position; - XMFLOAT4 color; + XMFLOAT3 position; + XMFLOAT4 color; }; -void RunSineWaveKernel(size_t mesh_width, size_t mesh_height, Vertex *cudaDevVertptr, cudaStream_t streamToRun, float AnimTime); \ No newline at end of file +void RunSineWaveKernel(size_t mesh_width, + size_t mesh_height, + Vertex *cudaDevVertptr, + cudaStream_t streamToRun, + float AnimTime); diff --git a/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp b/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp index 8998001a..dad4828e 100644 --- a/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp +++ b/Samples/5_Domain_Specific/simpleD3D11/simpleD3D11.cpp @@ -26,25 +26,26 @@ */ /* This example demonstrates how to use the CUDA-D3D11 External Resource Interoperability APIs -* to update D3D11 buffers from CUDA and synchronize between D3D11 and CUDA with Keyed Mutexes. + * to update D3D11 buffers from CUDA and synchronize between D3D11 and CUDA with Keyed Mutexes. 
*/ -#pragma warning(disable: 4312) +#pragma warning(disable : 4312) -#include #include +#include // This header includes all the necessary D3D11 and CUDA includes -#include -#include -#include #include +#include #include +#include +#include // includes, project -#include #include -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h +#include + #include "ShaderStructs.h" #include "sinewave_cuda.h" @@ -55,62 +56,59 @@ static char *SDK_name = "simpleD3D11"; //----------------------------------------------------------------------------- // Global variables //----------------------------------------------------------------------------- -IDXGIAdapter1 *g_pCudaCapableAdapter = NULL; // Adapter to use -ID3D11Device *g_pd3dDevice = NULL; // Our rendering device -ID3D11DeviceContext *g_pd3dDeviceContext = NULL; -IDXGISwapChain *g_pSwapChain = NULL; // The swap chain of the window -ID3D11RenderTargetView *g_pSwapChainRTV = NULL; //The Render target view on the swap chain ( used for clear) -ID3D11RasterizerState *g_pRasterState = NULL; -ID3D11InputLayout *g_pInputLayout = NULL; +IDXGIAdapter1 *g_pCudaCapableAdapter = NULL; // Adapter to use +ID3D11Device *g_pd3dDevice = NULL; // Our rendering device +ID3D11DeviceContext *g_pd3dDeviceContext = NULL; +IDXGISwapChain *g_pSwapChain = NULL; // The swap chain of the window +ID3D11RenderTargetView *g_pSwapChainRTV = NULL; // The Render target view on the swap chain (used for clear) +ID3D11RasterizerState *g_pRasterState = NULL; +ID3D11InputLayout *g_pInputLayout = NULL; ID3D11VertexShader *g_pVertexShader; ID3D11PixelShader *g_pPixelShader; ID3D11InputLayout *g_pLayout; ID3D11Buffer *g_VertexBuffer; IDXGIKeyedMutex *g_pKeyedMutex11; -Vertex *d_VertexBufPtr = NULL; -cudaExternalMemory_t extMemory; +Vertex *d_VertexBufPtr = NULL; +cudaExternalMemory_t extMemory; cudaExternalSemaphore_t extSemaphore; // // Vertex and Pixel shaders here : VSMain() & PSMain() // -static const char g_simpleShaders[] = -"struct PSInput\n" \ -"{ \n" \ -" float4 position : SV_POSITION;\n" \ -" float4 color : COLOR; \n" \ -"};\n" \ -"PSInput VSMain(float3 position : POSITION, float4 color : COLOR)\n" \ -"{ \n" \ -" PSInput result;\n" \ -" result.position = float4(position, 1.0f); \n" \ -" // Pass the color through without modification. \n" \ -" result.color = color; \n" \ -" return result; \n" \ -"} \n" \ -"float4 PSMain(PSInput input) : SV_TARGET \n" \ -"{ \n" \ -" return input.color; \n" \ -"} \n" \ -; +static const char g_simpleShaders[] = "struct PSInput\n" + "{ \n" + " float4 position : SV_POSITION;\n" + " float4 color : COLOR; \n" + "};\n" + "PSInput VSMain(float3 position : POSITION, float4 color : COLOR)\n" + "{ \n" + " PSInput result;\n" + " result.position = float4(position, 1.0f); \n" + " // Pass the color through without modification. \n" + " result.color = color; \n" + " return result; \n" + "} \n" + "float4 PSMain(PSInput input) : SV_TARGET \n" + "{ \n" + " return input.color; \n" + "} \n"; // testing/tracing function used pervasively in tests.
if the condition is unsatisfied // then spew and fail the function immediately (doing no cleanup) -#define AssertOrQuit(x) \ - if (!(x)) \ - { \ +#define AssertOrQuit(x) \ + if (!(x)) { \ fprintf(stdout, "Assert unsatisfied in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \ - return 1; \ + return 1; \ } bool g_bDone = false; bool g_bPassed = true; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; -const unsigned int g_WindowWidth = 720; +const unsigned int g_WindowWidth = 720; const unsigned int g_WindowHeight = 720; int g_iFrameToCompare = 10; @@ -128,7 +126,7 @@ void Render(); LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam); -#define NAME_LEN 512 +#define NAME_LEN 512 bool findCUDADevice() { @@ -136,13 +134,11 @@ bool findCUDADevice() // This function call returns 0 if there are no CUDA capable devices. checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - if (deviceCount == 0) - { + if (deviceCount == 0) { printf("> There are no device(s) supporting CUDA\n"); return false; } - else - { + else { printf("> Found %d CUDA Capable Device(s)\n", deviceCount); } @@ -151,41 +147,37 @@ bool findDXDevice(char *dev_name) { - HRESULT hr = S_OK; + HRESULT hr = S_OK; cudaError cuStatus; - int cuda_dev = -1; + int cuda_dev = -1; // Iterate through the candidate adapters IDXGIFactory1 *pFactory; hr = sFnPtr_CreateDXGIFactory(__uuidof(IDXGIFactory1), (void **)(&pFactory)); - if (! SUCCEEDED(hr)) - { + if (!SUCCEEDED(hr)) { printf("> No DXGI Factory created.\n"); return false; } UINT adapter = 0; - for (; !g_pCudaCapableAdapter; ++adapter) - { + for (; !g_pCudaCapableAdapter; ++adapter) { // Get a candidate DXGI adapter IDXGIAdapter1 *pAdapter = NULL; hr = pFactory->EnumAdapters1(adapter, &pAdapter); - if (FAILED(hr)) - { - break; // no compatible adapters found + if (FAILED(hr)) { + break; // no compatible adapters found } // Query to see if there exists a corresponding compute device int cuDevice; cuStatus = cudaD3D11GetDevice(&cuDevice, pAdapter); - printLastCudaError("cudaD3D11GetDevice failed"); //This prints and resets the cudaError to cudaSuccess + printLastCudaError("cudaD3D11GetDevice failed"); // This prints and resets the cudaError to cudaSuccess - if (cudaSuccess == cuStatus) - { + if (cudaSuccess == cuStatus) { // If so, mark it as the one against which to create our d3d11 device g_pCudaCapableAdapter = pAdapter; g_pCudaCapableAdapter->AddRef(); @@ -196,12 +188,11 @@ bool findDXDevice(char *dev_name) pAdapter->Release(); } - printf("> Found %d D3D11 Adapater(s).\n", (int) adapter); + printf("> Found %d D3D11 Adapter(s).\n", (int)adapter); pFactory->Release(); - if (!g_pCudaCapableAdapter) - { + if (!g_pCudaCapableAdapter) { printf("> Found 0 D3D11 Adapter(s) /w Compute capability.\n"); return false; } @@ -225,7 +216,7 @@ //////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { - char device_name[256]; + char device_name[256]; char *ref_file = NULL; pArgc = &argc; @@ -233,20 +224,20 @@ int main(int argc, char *argv[]) printf("[%s] - Starting...\n", SDK_name); - if (!findCUDADevice()) // Search for CUDA GPU + if (!findCUDADevice()) // Search for CUDA GPU { printf("> CUDA Device NOT found on \"%s\"..
Exiting.\n", device_name); exit(EXIT_SUCCESS); } - if (!dynlinkLoadD3D11API()) // Search for D3D API (locate drivers, does not mean device is found) + if (!dynlinkLoadD3D11API()) // Search for D3D API (locate drivers, does not mean device is found) { printf("> D3D11 API libraries NOT found on.. Exiting.\n"); dynlinkUnloadD3D11API(); exit(EXIT_SUCCESS); } - if (!findDXDevice(device_name)) // Search for D3D Hardware Device + if (!findDXDevice(device_name)) // Search for D3D Hardware Device { printf("> D3D11 Graphics Device NOT found.. Exiting.\n"); dynlinkUnloadD3D11API(); @@ -254,8 +245,7 @@ } // command line options - if (argc > 1) - { + if (argc > 1) { // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); @@ -265,26 +255,41 @@ // create window // // Register the window class - WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L, - GetModuleHandle(NULL), NULL, NULL, NULL, NULL, - "CUDA SDK", NULL - }; + WNDCLASSEX wc = {sizeof(WNDCLASSEX), + CS_CLASSDC, + MsgProc, + 0L, + 0L, + GetModuleHandle(NULL), + NULL, + NULL, + NULL, + NULL, + "CUDA SDK", + NULL}; RegisterClassEx(&wc); // Create the application's window - int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME); - int yMenu = ::GetSystemMetrics(SM_CYMENU); - int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME); - HWND hWnd = CreateWindow(wc.lpszClassName, "CUDA/D3D11 InterOP", - WS_OVERLAPPEDWINDOW, 0, 0, g_WindowWidth + 2*xBorder, g_WindowHeight+ 2*yBorder+yMenu, - NULL, NULL, wc.hInstance, NULL); + int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME); + int yMenu = ::GetSystemMetrics(SM_CYMENU); + int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME); + HWND hWnd = CreateWindow(wc.lpszClassName, + "CUDA/D3D11 InterOP", + WS_OVERLAPPEDWINDOW, + 0, + 0, + g_WindowWidth + 2 * xBorder, + g_WindowHeight + 2 * yBorder + yMenu, + NULL, + NULL, + wc.hInstance, + NULL); ShowWindow(hWnd, SW_SHOWDEFAULT); UpdateWindow(hWnd); // Initialize Direct3D - if (!SUCCEEDED(InitD3D(hWnd))) - { + if (!SUCCEEDED(InitD3D(hWnd))) { printf("InitD3D Failed..
Exiting..\n"); exit(EXIT_FAILURE); } @@ -292,8 +297,7 @@ int main(int argc, char *argv[]) // // the main loop // - while (false == g_bDone) - { + while (false == g_bDone) { Render(); // @@ -302,28 +306,23 @@ int main(int argc, char *argv[]) MSG msg; ZeroMemory(&msg, sizeof(msg)); - while (msg.message!=WM_QUIT) - { - if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) - { + while (msg.message != WM_QUIT) { + if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) { TranslateMessage(&msg); DispatchMessage(&msg); } - else - { + else { Render(); - if (ref_file) - { - for (int count=0; countGetBufferPointer(); printf(pStr); } @@ -454,7 +455,17 @@ HRESULT InitD3D(HWND hWnd) } // Pixel shader { - hr = D3DCompile(g_simpleShaders, strlen(g_simpleShaders), "Memory", NULL, NULL, "PSMain", "ps_4_0", 0/*Flags1*/, 0/*Flags2*/, &PS, &pErrorMsgs); + hr = D3DCompile(g_simpleShaders, + strlen(g_simpleShaders), + "Memory", + NULL, + NULL, + "PSMain", + "ps_4_0", + 0 /*Flags1*/, + 0 /*Flags2*/, + &PS, + &pErrorMsgs); AssertOrQuit(SUCCEEDED(hr)); hr = g_pd3dDevice->CreatePixelShader(PS->GetBufferPointer(), PS->GetBufferSize(), NULL, &g_pPixelShader); @@ -464,26 +475,24 @@ HRESULT InitD3D(HWND hWnd) } D3D11_BUFFER_DESC bufferDesc; - bufferDesc.Usage = D3D11_USAGE_DEFAULT; - bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight; - bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + bufferDesc.Usage = D3D11_USAGE_DEFAULT; + bufferDesc.ByteWidth = sizeof(Vertex) * g_WindowWidth * g_WindowHeight; + bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; bufferDesc.CPUAccessFlags = 0; - bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX; + bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX; hr = g_pd3dDevice->CreateBuffer(&bufferDesc, NULL, &g_VertexBuffer); AssertOrQuit(SUCCEEDED(hr)); - hr = g_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void**)&g_pKeyedMutex11); + hr = g_VertexBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (void **)&g_pKeyedMutex11); AssertOrQuit(SUCCEEDED(hr)); - D3D11_INPUT_ELEMENT_DESC inputElementDescs[] = - { - { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, - { "COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 1, 12, D3D11_INPUT_PER_VERTEX_DATA, 0 } - }; + D3D11_INPUT_ELEMENT_DESC inputElementDescs[] = { + {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 1, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}}; - hr = g_pd3dDevice->CreateInputLayout(inputElementDescs, 2, VS->GetBufferPointer(), VS->GetBufferSize(), &g_pLayout); + hr = g_pd3dDevice->CreateInputLayout(inputElementDescs, 2, VS->GetBufferPointer(), VS->GetBufferSize(), &g_pLayout); AssertOrQuit(SUCCEEDED(hr)); // Setup Input Layout g_pd3dDeviceContext->IASetInputLayout(g_pLayout); @@ -491,34 +500,33 @@ HRESULT InitD3D(HWND hWnd) g_pd3dDeviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST); AssertOrQuit(SUCCEEDED(hr)); - IDXGIResource1* pResource; - HANDLE sharedHandle; - g_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource); + IDXGIResource1 *pResource; + HANDLE sharedHandle; + g_VertexBuffer->QueryInterface(__uuidof(IDXGIResource1), (void **)&pResource); hr = pResource->GetSharedHandle(&sharedHandle); - if (!SUCCEEDED(hr)) - { + if (!SUCCEEDED(hr)) { std::cout << "Failed GetSharedHandle hr= " << hr << std::endl; } // Import the D3D11 Vertex Buffer into CUDA d_VertexBufPtr = cudaImportVertexBuffer(sharedHandle, extMemory, g_WindowWidth, g_WindowHeight); 
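+ // Reference note (editorial sketch, not in the original sources): GetSharedHandle()
+ // above yields the KMT shared handle for g_VertexBuffer; cudaImportVertexBuffer()
+ // (defined in sinewave_cuda.cu below) passes that handle to cudaImportExternalMemory()
+ // and cudaExternalMemoryGetMappedBuffer(), so d_VertexBufPtr is an ordinary CUDA
+ // device pointer into the same allocation that D3D11 draws from.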
pResource->Release(); - g_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void**)&pResource); + g_pKeyedMutex11->QueryInterface(__uuidof(IDXGIResource1), (void **)&pResource); pResource->GetSharedHandle(&sharedHandle); // Import the D3D11 Keyed Mutex into CUDA cudaImportKeyedMutex(sharedHandle, extSemaphore); pResource->Release(); D3D11_RASTERIZER_DESC rasterizerState; - rasterizerState.FillMode = D3D11_FILL_SOLID; - rasterizerState.CullMode = D3D11_CULL_FRONT; + rasterizerState.FillMode = D3D11_FILL_SOLID; + rasterizerState.CullMode = D3D11_CULL_FRONT; rasterizerState.FrontCounterClockwise = false; - rasterizerState.DepthBias = false; - rasterizerState.DepthBiasClamp = 0; - rasterizerState.SlopeScaledDepthBias = 0; - rasterizerState.DepthClipEnable = false; - rasterizerState.ScissorEnable = false; - rasterizerState.MultisampleEnable = false; + rasterizerState.DepthBias = false; + rasterizerState.DepthBiasClamp = 0; + rasterizerState.SlopeScaledDepthBias = 0; + rasterizerState.DepthClipEnable = false; + rasterizerState.ScissorEnable = false; + rasterizerState.MultisampleEnable = false; rasterizerState.AntialiasedLineEnable = false; g_pd3dDevice->CreateRasterizerState(&rasterizerState, &g_pRasterState); g_pd3dDeviceContext->RSSetState(g_pRasterState); @@ -535,7 +543,7 @@ bool DrawScene(uint64_t &key) HRESULT hr = S_OK; // Clear the backbuffer - float ClearColor[4] = { 0.5f, 0.5f, 0.6f, 1.0f }; + float ClearColor[4] = {0.5f, 0.5f, 0.6f, 1.0f}; g_pd3dDeviceContext->ClearRenderTargetView(g_pSwapChainRTV, ClearColor); @@ -544,7 +552,7 @@ bool DrawScene(uint64_t &key) UINT stride = sizeof(Vertex); UINT offset = 0; g_pd3dDeviceContext->IASetVertexBuffers(0, 1, &g_VertexBuffer, &stride, &offset); - g_pd3dDeviceContext->Draw(g_WindowHeight*g_WindowWidth, 0); + g_pd3dDeviceContext->Draw(g_WindowHeight * g_WindowWidth, 0); hr = g_pKeyedMutex11->ReleaseSync(key); AssertOrQuit(SUCCEEDED(hr)); @@ -567,38 +575,31 @@ void Cleanup() // clean up Direct3D // // release the resources we created - if (g_pInputLayout != NULL) - { + if (g_pInputLayout != NULL) { g_pInputLayout->Release(); } - if (g_pVertexShader) - { + if (g_pVertexShader) { g_pVertexShader->Release(); } - if (g_pPixelShader) - { + if (g_pPixelShader) { g_pPixelShader->Release(); } - if (g_VertexBuffer) - { + if (g_VertexBuffer) { g_VertexBuffer->Release(); } - if (g_pSwapChainRTV != NULL) - { + if (g_pSwapChainRTV != NULL) { g_pSwapChainRTV->Release(); } - if (g_pSwapChain != NULL) - { + if (g_pSwapChain != NULL) { g_pSwapChain->Release(); } - if (g_pd3dDevice != NULL) - { + if (g_pd3dDevice != NULL) { g_pd3dDevice->Release(); } } @@ -624,30 +625,27 @@ void Render() //----------------------------------------------------------------------------- static LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam) { - switch (msg) - { - case WM_KEYDOWN: - if (wParam==VK_ESCAPE) - { - g_bDone = true; - Cleanup(); - PostQuitMessage(0); - return 0; - } - - break; - - case WM_DESTROY: + switch (msg) { + case WM_KEYDOWN: + if (wParam == VK_ESCAPE) { g_bDone = true; Cleanup(); PostQuitMessage(0); return 0; + } - case WM_PAINT: - ValidateRect(hWnd, NULL); - return 0; + break; + + case WM_DESTROY: + g_bDone = true; + Cleanup(); + PostQuitMessage(0); + return 0; + + case WM_PAINT: + ValidateRect(hWnd, NULL); + return 0; } return DefWindowProc(hWnd, msg, wParam, lParam); } - diff --git a/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.cu b/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.cu index 925df766..83e13099 100644 --- 
a/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.cu +++ b/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.cu @@ -26,46 +26,46 @@ */ #include + #include "ShaderStructs.h" #include "helper_cuda.h" #include "sinewave_cuda.h" __global__ void sinewave_gen_kernel(Vertex *vertices, unsigned int width, unsigned int height, float time) { - unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; // calculate uv coordinates - float u = x / (float) width; - float v = y / (float) height; - u = u*2.0f - 1.0f; - v = v*2.0f - 1.0f; + float u = x / (float)width; + float v = y / (float)height; + u = u * 2.0f - 1.0f; + v = v * 2.0f - 1.0f; // calculate simple sine wave pattern float freq = 4.0f; - float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f; + float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; - if (y < height && x < width) - { + if (y < height && x < width) { // write output vertex - vertices[y*width+x].position.x = u; - vertices[y*width+x].position.y = w; - vertices[y*width+x].position.z = v; - vertices[y*width+x].color.x = 1.0f; - vertices[y*width+x].color.y = 0.0f; - vertices[y*width+x].color.z = 0.0f; - vertices[y*width + x].color.w = 0.0f; + vertices[y * width + x].position.x = u; + vertices[y * width + x].position.y = w; + vertices[y * width + x].position.z = v; + vertices[y * width + x].color.x = 1.0f; + vertices[y * width + x].color.y = 0.0f; + vertices[y * width + x].color.z = 0.0f; + vertices[y * width + x].color.w = 0.0f; } } -Vertex* cudaImportVertexBuffer(void*sharedHandle, cudaExternalMemory_t &externalMemory, int meshWidth, int meshHeight) +Vertex *cudaImportVertexBuffer(void *sharedHandle, cudaExternalMemory_t &externalMemory, int meshWidth, int meshHeight) { cudaExternalMemoryHandleDesc externalMemoryHandleDesc; memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D11ResourceKmt; - externalMemoryHandleDesc.size = sizeof(Vertex) * meshHeight * meshWidth; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D11ResourceKmt; + externalMemoryHandleDesc.size = sizeof(Vertex) * meshHeight * meshWidth; + externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; externalMemoryHandleDesc.handle.win32.handle = sharedHandle; checkCudaErrors(cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc)); @@ -73,31 +73,35 @@ Vertex* cudaImportVertexBuffer(void*sharedHandle, cudaExternalMemory_t &external cudaExternalMemoryBufferDesc externalMemoryBufferDesc; memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc)); externalMemoryBufferDesc.offset = 0; - externalMemoryBufferDesc.size = sizeof(Vertex) * meshHeight * meshWidth; - externalMemoryBufferDesc.flags = 0; + externalMemoryBufferDesc.size = sizeof(Vertex) * meshHeight * meshWidth; + externalMemoryBufferDesc.flags = 0; - Vertex* cudaDevVertptr = NULL; - checkCudaErrors(cudaExternalMemoryGetMappedBuffer((void**)&cudaDevVertptr, externalMemory, &externalMemoryBufferDesc)); + Vertex *cudaDevVertptr = NULL; + checkCudaErrors( + cudaExternalMemoryGetMappedBuffer((void **)&cudaDevVertptr, externalMemory, &externalMemoryBufferDesc)); return cudaDevVertptr; } -void cudaImportKeyedMutex(void*sharedHandle, cudaExternalSemaphore_t &extSemaphore) +void cudaImportKeyedMutex(void 
*sharedHandle, cudaExternalSemaphore_t &extSemaphore) { cudaExternalSemaphoreHandleDesc extSemaDesc; memset(&extSemaDesc, 0, sizeof(extSemaDesc)); - extSemaDesc.type = cudaExternalSemaphoreHandleTypeKeyedMutexKmt; + extSemaDesc.type = cudaExternalSemaphoreHandleTypeKeyedMutexKmt; extSemaDesc.handle.win32.handle = sharedHandle; - extSemaDesc.flags = 0; + extSemaDesc.flags = 0; checkCudaErrors(cudaImportExternalSemaphore(&extSemaphore, &extSemaDesc)); } -void cudaAcquireSync(cudaExternalSemaphore_t &extSemaphore, uint64_t key, unsigned int timeoutMs, cudaStream_t streamToRun) +void cudaAcquireSync(cudaExternalSemaphore_t &extSemaphore, + uint64_t key, + unsigned int timeoutMs, + cudaStream_t streamToRun) { cudaExternalSemaphoreWaitParams extSemWaitParams; memset(&extSemWaitParams, 0, sizeof(extSemWaitParams)); - extSemWaitParams.params.keyedMutex.key = key; + extSemWaitParams.params.keyedMutex.key = key; extSemWaitParams.params.keyedMutex.timeoutMs = timeoutMs; checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSemaphore, &extSemWaitParams, 1, streamToRun)); @@ -115,18 +119,22 @@ void cudaReleaseSync(cudaExternalSemaphore_t &extSemaphore, uint64_t key, cudaSt //////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void RunSineWaveKernel(cudaExternalSemaphore_t &extSemaphore, uint64_t &key, unsigned int timeoutMs, - size_t mesh_width, size_t mesh_height, Vertex *cudaDevVertptr, cudaStream_t streamToRun) +void RunSineWaveKernel(cudaExternalSemaphore_t &extSemaphore, + uint64_t &key, + unsigned int timeoutMs, + size_t mesh_width, + size_t mesh_height, + Vertex *cudaDevVertptr, + cudaStream_t streamToRun) { static float t = 0.0f; cudaAcquireSync(extSemaphore, key++, timeoutMs, streamToRun); dim3 block(16, 16, 1); dim3 grid(mesh_width / 16, mesh_height / 16, 1); - sinewave_gen_kernel<<< grid, block, 0, streamToRun >>>(cudaDevVertptr, mesh_width, mesh_height, t); + sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(cudaDevVertptr, mesh_width, mesh_height, t); getLastCudaError("sinewave_gen_kernel execution failed.\n"); cudaReleaseSync(extSemaphore, key, streamToRun); t += 0.01f; } - diff --git a/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.h b/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.h index 77c89214..16d1a35e 100644 --- a/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.h +++ b/Samples/5_Domain_Specific/simpleD3D11/sinewave_cuda.h @@ -29,11 +29,17 @@ #define SINEWAVE_CUDA_H #include + #include "ShaderStructs.h" #include "helper_cuda.h" -void RunSineWaveKernel(cudaExternalSemaphore_t &extSemaphore, uint64_t &key, unsigned int timeoutMs, - size_t mesh_width, size_t mesh_height, Vertex *cudaDevVertptr, cudaStream_t streamToRun); -Vertex* cudaImportVertexBuffer(void*sharedHandle, cudaExternalMemory_t &externalMemory, int meshWidth, int meshHeight); -void cudaImportKeyedMutex(void*sharedHandle, cudaExternalSemaphore_t &extSemaphore); -#endif // ! \ No newline at end of file +void RunSineWaveKernel(cudaExternalSemaphore_t &extSemaphore, + uint64_t &key, + unsigned int timeoutMs, + size_t mesh_width, + size_t mesh_height, + Vertex *cudaDevVertptr, + cudaStream_t streamToRun); +Vertex *cudaImportVertexBuffer(void *sharedHandle, cudaExternalMemory_t &externalMemory, int meshWidth, int meshHeight); +void cudaImportKeyedMutex(void *sharedHandle, cudaExternalSemaphore_t &extSemaphore); +#endif // !
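The simpleD3D11 diffs above are formatting-only, but the synchronization protocol they reformat is easy to lose in the noise: CUDA waits on the imported keyed mutex with the current key, overwrites the shared vertex buffer, then signals key + 1, which the D3D11 side acquires before drawing. The sketch below condenses RunSineWaveKernel's acquire/launch/release flow into one self-contained function. It is an illustration, not part of the patch: the function name renderOneFrame and the 5000 ms timeout are invented here, while Vertex, sinewave_gen_kernel, checkCudaErrors, and the import helpers are the ones defined in the files shown above.

// Sketch: one CUDA frame update guarded by the imported keyed mutex.
// Assumes d_verts and extSem were produced by cudaImportVertexBuffer()
// and cudaImportKeyedMutex() from this patch; names here are illustrative.
static void renderOneFrame(cudaExternalSemaphore_t &extSem,
                           Vertex                  *d_verts,
                           uint64_t                &key,
                           unsigned int             width,
                           unsigned int             height,
                           cudaStream_t             stream,
                           float                    t)
{
    // 1) Block the stream until the D3D11 side releases the buffer with `key`.
    cudaExternalSemaphoreWaitParams waitParams = {};
    waitParams.params.keyedMutex.key       = key;
    waitParams.params.keyedMutex.timeoutMs = 5000; // illustrative timeout
    checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSem, &waitParams, 1, stream));

    // 2) Overwrite the shared vertex buffer from CUDA.
    dim3 block(16, 16, 1);
    dim3 grid(width / 16, height / 16, 1);
    sinewave_gen_kernel<<<grid, block, 0, stream>>>(d_verts, width, height, t);
    getLastCudaError("sinewave_gen_kernel execution failed.\n");

    // 3) Hand the buffer back by signaling `key + 1`, the value the D3D11
    //    side acquires before it issues its Draw call.
    cudaExternalSemaphoreSignalParams signalParams = {};
    signalParams.params.keyedMutex.key = ++key;
    checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSem, &signalParams, 1, stream));
}

The monotonically increasing key is what makes the handshake safe: each side only ever acquires the value the other side last released, so neither CUDA nor D3D11 can touch the buffer out of turn.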
diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/README.md b/Samples/5_Domain_Specific/simpleD3D11Texture/README.md index 526e1e44..613b9531 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/README.md +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/d3dx11effect/d3dx11effect.h b/Samples/5_Domain_Specific/simpleD3D11Texture/d3dx11effect/d3dx11effect.h index 7540a2fd..d9341ae7 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/d3dx11effect/d3dx11effect.h +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/d3dx11effect/d3dx11effect.h @@ -119,8 +119,8 @@ typedef struct _D3DX11_STATE_BLOCK_MASK // //---------------------------------------------------------------------------- -#define D3DX11_EFFECT_OPTIMIZED (1 << 21) -#define D3DX11_EFFECT_CLONE (1 << 22) +#define D3DX11_EFFECT_OPTIMIZED (1 << 21) +#define D3DX11_EFFECT_CLONE (1 << 22) // These are the only valid parameter flags to D3DX11CreateEffect* #define D3DX11_EFFECT_RUNTIME_VALID_FLAGS (0) @@ -142,8 +142,8 @@ typedef struct _D3DX11_STATE_BLOCK_MASK // register keyword. //---------------------------------------------------------------------------- -#define D3DX11_EFFECT_VARIABLE_ANNOTATION (1 << 1) -#define D3DX11_EFFECT_VARIABLE_EXPLICIT_BIND_POINT (1 << 2) +#define D3DX11_EFFECT_VARIABLE_ANNOTATION (1 << 1) +#define D3DX11_EFFECT_VARIABLE_EXPLICIT_BIND_POINT (1 << 2) //---------------------------------------------------------------------------- // D3DX11_EFFECT_CLONE flags: @@ -156,7 +156,7 @@ typedef struct _D3DX11_STATE_BLOCK_MASK // own ID3D11Buffer's created in the cloned effect. //---------------------------------------------------------------------------- -#define D3DX11_EFFECT_CLONE_FORCE_NONSINGLE (1 << 0) +#define D3DX11_EFFECT_CLONE_FORCE_NONSINGLE (1 << 0) //---------------------------------------------------------------------------- // D3DX11_EFFECT_PASS flags: @@ -173,20 +173,18 @@ typedef struct _D3DX11_STATE_BLOCK_MASK // When applying a pass, do not set the state indicated in the flag name. 
//---------------------------------------------------------------------------- -#define D3DX11_EFFECT_PASS_COMMIT_CHANGES (1 << 0) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_SHADERS_AND_INTERFACES (1 << 1) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_STATE_OBJECTS (1 << 2) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_RTVS_AND_DSVS (1 << 3) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_SAMPLERS (1 << 4) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_CBS (1 << 5) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_SRVS (1 << 6) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_OMIT_UAVS (1 << 7) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_COMMIT_CHANGES (1 << 0) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_SHADERS_AND_INTERFACES (1 << 1) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_STATE_OBJECTS (1 << 2) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_RTVS_AND_DSVS (1 << 3) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_SAMPLERS (1 << 4) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_CBS (1 << 5) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_SRVS (1 << 6) // TODO: not yet implemented +#define D3DX11_EFFECT_PASS_OMIT_UAVS (1 << 7) // TODO: not yet implemented -#define D3DX11_EFFECT_PASS_ONLY_SET_SHADERS_AND_CBS ( D3DX11_EFFECT_PASS_OMIT_STATE_OBJECTS | \ - D3DX11_EFFECT_PASS_OMIT_RTVS_AND_DSVS | \ - D3DX11_EFFECT_PASS_OMIT_SAMPLERS | \ - D3DX11_EFFECT_PASS_OMIT_SRVS | \ - D3DX11_EFFECT_PASS_OMIT_UAVS ); +#define D3DX11_EFFECT_PASS_ONLY_SET_SHADERS_AND_CBS \ + (D3DX11_EFFECT_PASS_OMIT_STATE_OBJECTS | D3DX11_EFFECT_PASS_OMIT_RTVS_AND_DSVS | D3DX11_EFFECT_PASS_OMIT_SAMPLERS \ + | D3DX11_EFFECT_PASS_OMIT_SRVS | D3DX11_EFFECT_PASS_OMIT_UAVS); ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectType ////////////////////////////////////////////////////////// @@ -200,35 +198,34 @@ typedef struct _D3DX11_STATE_BLOCK_MASK typedef struct _D3DX11_EFFECT_TYPE_DESC { - LPCSTR TypeName; // Name of the type + LPCSTR TypeName; // Name of the type // (e.g. "float4" or "MyStruct") - D3D10_SHADER_VARIABLE_CLASS Class; // (e.g. scalar, vector, object, etc.) - D3D10_SHADER_VARIABLE_TYPE Type; // (e.g. float, texture, vertexshader, etc.) + D3D10_SHADER_VARIABLE_CLASS Class; // (e.g. scalar, vector, object, etc.) + D3D10_SHADER_VARIABLE_TYPE Type; // (e.g. float, texture, vertexshader, etc.) 
- UINT Elements; // Number of elements in this type + UINT Elements; // Number of elements in this type // (0 if not an array) - UINT Members; // Number of members + UINT Members; // Number of members // (0 if not a structure) - UINT Rows; // Number of rows in this type + UINT Rows; // Number of rows in this type // (0 if not a numeric primitive) - UINT Columns; // Number of columns in this type + UINT Columns; // Number of columns in this type // (0 if not a numeric primitive) - UINT PackedSize; // Number of bytes required to represent + UINT PackedSize; // Number of bytes required to represent // this data type, when tightly packed - UINT UnpackedSize; // Number of bytes occupied by this data + UINT UnpackedSize; // Number of bytes occupied by this data // type, when laid out in a constant buffer - UINT Stride; // Number of bytes to seek between elements, + UINT Stride; // Number of bytes to seek between elements, // when laid out in a constant buffer } D3DX11_EFFECT_TYPE_DESC; -typedef interface ID3DX11EffectType ID3DX11EffectType; +typedef interface ID3DX11EffectType ID3DX11EffectType; typedef interface ID3DX11EffectType *LPD3D11EFFECTTYPE; // {4250D721-D5E5-491F-B62B-587C43186285} -DEFINE_GUID(IID_ID3DX11EffectType, - 0x4250d721, 0xd5e5, 0x491f, 0xb6, 0x2b, 0x58, 0x7c, 0x43, 0x18, 0x62, 0x85); +DEFINE_GUID(IID_ID3DX11EffectType, 0x4250d721, 0xd5e5, 0x491f, 0xb6, 0x2b, 0x58, 0x7c, 0x43, 0x18, 0x62, 0x85); #undef INTERFACE #define INTERFACE ID3DX11EffectType @@ -236,7 +233,7 @@ DEFINE_GUID(IID_ID3DX11EffectType, DECLARE_INTERFACE(ID3DX11EffectType) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_TYPE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_TYPE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectType *, GetMemberTypeByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectType *, GetMemberTypeByName)(THIS_ LPCSTR Name) PURE; STDMETHOD_(ID3DX11EffectType *, GetMemberTypeBySemantic)(THIS_ LPCSTR Semantic) PURE; @@ -256,58 +253,57 @@ DECLARE_INTERFACE(ID3DX11EffectType) typedef struct _D3DX11_EFFECT_VARIABLE_DESC { - LPCSTR Name; // Name of this variable, annotation, + LPCSTR Name; // Name of this variable, annotation, // or structure member - LPCSTR Semantic; // Semantic string of this variable + LPCSTR Semantic; // Semantic string of this variable // or structure member (NULL for // annotations or if not present) - UINT Flags; // D3DX11_EFFECT_VARIABLE_* flags - UINT Annotations; // Number of annotations on this variable + UINT Flags; // D3DX11_EFFECT_VARIABLE_* flags + UINT Annotations; // Number of annotations on this variable // (always 0 for annotations) - UINT BufferOffset; // Offset into containing cbuffer or tbuffer + UINT BufferOffset; // Offset into containing cbuffer or tbuffer // (always 0 for annotations or variables // not in constant buffers) - UINT ExplicitBindPoint; // Used if the variable has been explicitly bound + UINT ExplicitBindPoint; // Used if the variable has been explicitly bound // using the register keyword. 
Check Flags for // D3DX11_EFFECT_VARIABLE_EXPLICIT_BIND_POINT; } D3DX11_EFFECT_VARIABLE_DESC; -typedef interface ID3DX11EffectVariable ID3DX11EffectVariable; +typedef interface ID3DX11EffectVariable ID3DX11EffectVariable; typedef interface ID3DX11EffectVariable *LPD3D11EFFECTVARIABLE; // {036A777D-B56E-4B25-B313-CC3DDAB71873} -DEFINE_GUID(IID_ID3DX11EffectVariable, - 0x036a777d, 0xb56e, 0x4b25, 0xb3, 0x13, 0xcc, 0x3d, 0xda, 0xb7, 0x18, 0x73); +DEFINE_GUID(IID_ID3DX11EffectVariable, 0x036a777d, 0xb56e, 0x4b25, 0xb3, 0x13, 0xcc, 0x3d, 0xda, 0xb7, 0x18, 0x73); #undef INTERFACE #define INTERFACE ID3DX11EffectVariable // Forward defines -typedef interface ID3DX11EffectScalarVariable ID3DX11EffectScalarVariable; -typedef interface ID3DX11EffectVectorVariable ID3DX11EffectVectorVariable; -typedef interface ID3DX11EffectMatrixVariable ID3DX11EffectMatrixVariable; -typedef interface ID3DX11EffectStringVariable ID3DX11EffectStringVariable; -typedef interface ID3DX11EffectClassInstanceVariable ID3DX11EffectClassInstanceVariable; -typedef interface ID3DX11EffectInterfaceVariable ID3DX11EffectInterfaceVariable; -typedef interface ID3DX11EffectShaderResourceVariable ID3DX11EffectShaderResourceVariable; +typedef interface ID3DX11EffectScalarVariable ID3DX11EffectScalarVariable; +typedef interface ID3DX11EffectVectorVariable ID3DX11EffectVectorVariable; +typedef interface ID3DX11EffectMatrixVariable ID3DX11EffectMatrixVariable; +typedef interface ID3DX11EffectStringVariable ID3DX11EffectStringVariable; +typedef interface ID3DX11EffectClassInstanceVariable ID3DX11EffectClassInstanceVariable; +typedef interface ID3DX11EffectInterfaceVariable ID3DX11EffectInterfaceVariable; +typedef interface ID3DX11EffectShaderResourceVariable ID3DX11EffectShaderResourceVariable; typedef interface ID3DX11EffectUnorderedAccessViewVariable ID3DX11EffectUnorderedAccessViewVariable; -typedef interface ID3DX11EffectRenderTargetViewVariable ID3DX11EffectRenderTargetViewVariable; -typedef interface ID3DX11EffectDepthStencilViewVariable ID3DX11EffectDepthStencilViewVariable; -typedef interface ID3DX11EffectConstantBuffer ID3DX11EffectConstantBuffer; -typedef interface ID3DX11EffectShaderVariable ID3DX11EffectShaderVariable; -typedef interface ID3DX11EffectBlendVariable ID3DX11EffectBlendVariable; -typedef interface ID3DX11EffectDepthStencilVariable ID3DX11EffectDepthStencilVariable; -typedef interface ID3DX11EffectRasterizerVariable ID3DX11EffectRasterizerVariable; -typedef interface ID3DX11EffectSamplerVariable ID3DX11EffectSamplerVariable; +typedef interface ID3DX11EffectRenderTargetViewVariable ID3DX11EffectRenderTargetViewVariable; +typedef interface ID3DX11EffectDepthStencilViewVariable ID3DX11EffectDepthStencilViewVariable; +typedef interface ID3DX11EffectConstantBuffer ID3DX11EffectConstantBuffer; +typedef interface ID3DX11EffectShaderVariable ID3DX11EffectShaderVariable; +typedef interface ID3DX11EffectBlendVariable ID3DX11EffectBlendVariable; +typedef interface ID3DX11EffectDepthStencilVariable ID3DX11EffectDepthStencilVariable; +typedef interface ID3DX11EffectRasterizerVariable ID3DX11EffectRasterizerVariable; +typedef interface ID3DX11EffectSamplerVariable ID3DX11EffectSamplerVariable; DECLARE_INTERFACE(ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT 
Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -345,12 +341,22 @@ DECLARE_INTERFACE(ID3DX11EffectVariable) // ID3DX11EffectScalarVariable //////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectScalarVariable ID3DX11EffectScalarVariable; +typedef interface ID3DX11EffectScalarVariable ID3DX11EffectScalarVariable; typedef interface ID3DX11EffectScalarVariable *LPD3D11EFFECTSCALARVARIABLE; // {921EF2E5-A65D-4E92-9FC6-4E9CC09A4ADE} DEFINE_GUID(IID_ID3DX11EffectScalarVariable, - 0x921ef2e5, 0xa65d, 0x4e92, 0x9f, 0xc6, 0x4e, 0x9c, 0xc0, 0x9a, 0x4a, 0xde); + 0x921ef2e5, + 0xa65d, + 0x4e92, + 0x9f, + 0xc6, + 0x4e, + 0x9c, + 0xc0, + 0x9a, + 0x4a, + 0xde); #undef INTERFACE #define INTERFACE ID3DX11EffectScalarVariable @@ -359,7 +365,7 @@ DECLARE_INTERFACE_(ID3DX11EffectScalarVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -405,22 +411,32 @@ DECLARE_INTERFACE_(ID3DX11EffectScalarVariable, ID3DX11EffectVariable) STDMETHOD(GetIntArray)(THIS_ int *pData, UINT Offset, UINT Count) PURE; STDMETHOD(SetBool)(THIS_ BOOL Value) PURE; - STDMETHOD(GetBool)(THIS_ BOOL *pValue) PURE; + STDMETHOD(GetBool)(THIS_ BOOL * pValue) PURE; - STDMETHOD(SetBoolArray)(THIS_ BOOL *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetBoolArray)(THIS_ BOOL *pData, UINT Offset, UINT Count) PURE; + STDMETHOD(SetBoolArray)(THIS_ BOOL * pData, UINT Offset, UINT Count) PURE; + STDMETHOD(GetBoolArray)(THIS_ BOOL * pData, UINT Offset, UINT Count) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectVectorVariable //////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectVectorVariable ID3DX11EffectVectorVariable; +typedef interface ID3DX11EffectVectorVariable ID3DX11EffectVectorVariable; typedef interface ID3DX11EffectVectorVariable *LPD3D11EFFECTVECTORVARIABLE; // {5E785D4A-D87B-48D8-B6E6-0F8CA7E7467A} DEFINE_GUID(IID_ID3DX11EffectVectorVariable, - 0x5e785d4a, 0xd87b, 0x48d8, 0xb6, 0xe6, 0x0f, 0x8c, 0xa7, 0xe7, 0x46, 0x7a); + 0x5e785d4a, + 0xd87b, + 0x48d8, + 0xb6, + 0xe6, + 0x0f, + 0x8c, + 0xa7, + 0xe7, + 0x46, + 0x7a); #undef INTERFACE #define INTERFACE ID3DX11EffectVectorVariable @@ -429,7 +445,7 @@ DECLARE_INTERFACE_(ID3DX11EffectVectorVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -462,19 +478,19 @@ DECLARE_INTERFACE_(ID3DX11EffectVectorVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT ByteOffset, UINT ByteCount) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT ByteOffset, UINT ByteCount) PURE; - STDMETHOD(SetBoolVector)(THIS_ BOOL *pData) 
PURE; + STDMETHOD(SetBoolVector)(THIS_ BOOL * pData) PURE; STDMETHOD(SetIntVector)(THIS_ int *pData) PURE; STDMETHOD(SetFloatVector)(THIS_ float *pData) PURE; - STDMETHOD(GetBoolVector)(THIS_ BOOL *pData) PURE; + STDMETHOD(GetBoolVector)(THIS_ BOOL * pData) PURE; STDMETHOD(GetIntVector)(THIS_ int *pData) PURE; STDMETHOD(GetFloatVector)(THIS_ float *pData) PURE; - STDMETHOD(SetBoolVectorArray)(THIS_ BOOL *pData, UINT Offset, UINT Count) PURE; + STDMETHOD(SetBoolVectorArray)(THIS_ BOOL * pData, UINT Offset, UINT Count) PURE; STDMETHOD(SetIntVectorArray)(THIS_ int *pData, UINT Offset, UINT Count) PURE; STDMETHOD(SetFloatVectorArray)(THIS_ float *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetBoolVectorArray)(THIS_ BOOL *pData, UINT Offset, UINT Count) PURE; + STDMETHOD(GetBoolVectorArray)(THIS_ BOOL * pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetIntVectorArray)(THIS_ int *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetFloatVectorArray)(THIS_ float *pData, UINT Offset, UINT Count) PURE; }; @@ -483,12 +499,22 @@ DECLARE_INTERFACE_(ID3DX11EffectVectorVariable, ID3DX11EffectVariable) // ID3DX11EffectMatrixVariable //////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectMatrixVariable ID3DX11EffectMatrixVariable; +typedef interface ID3DX11EffectMatrixVariable ID3DX11EffectMatrixVariable; typedef interface ID3DX11EffectMatrixVariable *LPD3D11EFFECTMATRIXVARIABLE; // {E1096CF4-C027-419A-8D86-D29173DC803E} DEFINE_GUID(IID_ID3DX11EffectMatrixVariable, - 0xe1096cf4, 0xc027, 0x419a, 0x8d, 0x86, 0xd2, 0x91, 0x73, 0xdc, 0x80, 0x3e); + 0xe1096cf4, + 0xc027, + 0x419a, + 0x8d, + 0x86, + 0xd2, + 0x91, + 0x73, + 0xdc, + 0x80, + 0x3e); #undef INTERFACE #define INTERFACE ID3DX11EffectMatrixVariable @@ -497,7 +523,7 @@ DECLARE_INTERFACE_(ID3DX11EffectMatrixVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -547,12 +573,22 @@ DECLARE_INTERFACE_(ID3DX11EffectMatrixVariable, ID3DX11EffectVariable) // ID3DX11EffectStringVariable //////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectStringVariable ID3DX11EffectStringVariable; +typedef interface ID3DX11EffectStringVariable ID3DX11EffectStringVariable; typedef interface ID3DX11EffectStringVariable *LPD3D11EFFECTSTRINGVARIABLE; // {F355C818-01BE-4653-A7CC-60FFFEDDC76D} DEFINE_GUID(IID_ID3DX11EffectStringVariable, - 0xf355c818, 0x01be, 0x4653, 0xa7, 0xcc, 0x60, 0xff, 0xfe, 0xdd, 0xc7, 0x6d); + 0xf355c818, + 0x01be, + 0x4653, + 0xa7, + 0xcc, + 0x60, + 0xff, + 0xfe, + 0xdd, + 0xc7, + 0x6d); #undef INTERFACE #define INTERFACE ID3DX11EffectStringVariable @@ -561,7 +597,7 @@ DECLARE_INTERFACE_(ID3DX11EffectStringVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; 
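    // Illustrative use of the scalar/vector variable interfaces above (a
    // sketch only; the pointer names are hypothetical, while GetVariableByName,
    // AsScalar/AsVector, SetInt and SetFloatVector are declared in this header):
    //   ID3DX11EffectScalarVariable *useCase  = pEffect->GetVariableByName("g_UseCase")->AsScalar();
    //   ID3DX11EffectVectorVariable *quadRect = pEffect->GetVariableByName("g_vQuadRect")->AsVector();
    //   useCase->SetInt(1);
    //   float rc[4] = {-1.0f, -1.0f, 2.0f, 2.0f};
    //   quadRect->SetFloatVector(rc);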
STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -594,20 +630,30 @@ DECLARE_INTERFACE_(ID3DX11EffectStringVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetString)(THIS_ LPCSTR *ppString) PURE; - STDMETHOD(GetStringArray)(THIS_ LPCSTR *ppStrings, UINT Offset, UINT Count) PURE; + STDMETHOD(GetString)(THIS_ LPCSTR * ppString) PURE; + STDMETHOD(GetStringArray)(THIS_ LPCSTR * ppStrings, UINT Offset, UINT Count) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectClassInstanceVariable //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectClassInstanceVariable ID3DX11EffectClassInstanceVariable; +typedef interface ID3DX11EffectClassInstanceVariable ID3DX11EffectClassInstanceVariable; typedef interface ID3DX11EffectClassInstanceVariable *LPD3D11EFFECTCLASSINSTANCEVARIABLE; // {926A8053-2A39-4DB4-9BDE-CF649ADEBDC1} DEFINE_GUID(IID_ID3DX11EffectClassInstanceVariable, - 0x926a8053, 0x2a39, 0x4db4, 0x9b, 0xde, 0xcf, 0x64, 0x9a, 0xde, 0xbd, 0xc1); + 0x926a8053, + 0x2a39, + 0x4db4, + 0x9b, + 0xde, + 0xcf, + 0x64, + 0x9a, + 0xde, + 0xbd, + 0xc1); #undef INTERFACE #define INTERFACE ID3DX11EffectClassInstanceVariable @@ -616,7 +662,7 @@ DECLARE_INTERFACE_(ID3DX11EffectClassInstanceVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -648,19 +694,29 @@ DECLARE_INTERFACE_(ID3DX11EffectClassInstanceVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetClassInstance)(ID3D11ClassInstance **ppClassInstance) PURE; + STDMETHOD(GetClassInstance)(ID3D11ClassInstance * *ppClassInstance) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectInterfaceVariable //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectInterfaceVariable ID3DX11EffectInterfaceVariable; +typedef interface ID3DX11EffectInterfaceVariable ID3DX11EffectInterfaceVariable; typedef interface ID3DX11EffectInterfaceVariable *LPD3D11EFFECTINTERFACEVARIABLE; // {516C8CD8-1C80-40A4-B19B-0688792F11A5} DEFINE_GUID(IID_ID3DX11EffectInterfaceVariable, - 0x516c8cd8, 0x1c80, 0x40a4, 0xb1, 0x9b, 0x06, 0x88, 0x79, 0x2f, 0x11, 0xa5); + 0x516c8cd8, + 0x1c80, + 0x40a4, + 0xb1, + 0x9b, + 0x06, + 0x88, + 0x79, + 0x2f, + 0x11, + 0xa5); #undef INTERFACE #define INTERFACE ID3DX11EffectInterfaceVariable @@ -669,7 +725,7 @@ DECLARE_INTERFACE_(ID3DX11EffectInterfaceVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT 
Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -701,20 +757,30 @@ DECLARE_INTERFACE_(ID3DX11EffectInterfaceVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(SetClassInstance)(ID3DX11EffectClassInstanceVariable *pEffectClassInstance) PURE; - STDMETHOD(GetClassInstance)(ID3DX11EffectClassInstanceVariable **ppEffectClassInstance) PURE; + STDMETHOD(SetClassInstance)(ID3DX11EffectClassInstanceVariable * pEffectClassInstance) PURE; + STDMETHOD(GetClassInstance)(ID3DX11EffectClassInstanceVariable * *ppEffectClassInstance) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectShaderResourceVariable //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectShaderResourceVariable ID3DX11EffectShaderResourceVariable; +typedef interface ID3DX11EffectShaderResourceVariable ID3DX11EffectShaderResourceVariable; typedef interface ID3DX11EffectShaderResourceVariable *LPD3D11EFFECTSHADERRESOURCEVARIABLE; // {350DB233-BBE0-485C-9BFE-C0026B844F89} DEFINE_GUID(IID_ID3DX11EffectShaderResourceVariable, - 0x350db233, 0xbbe0, 0x485c, 0x9b, 0xfe, 0xc0, 0x02, 0x6b, 0x84, 0x4f, 0x89); + 0x350db233, + 0xbbe0, + 0x485c, + 0x9b, + 0xfe, + 0xc0, + 0x02, + 0x6b, + 0x84, + 0x4f, + 0x89); #undef INTERFACE #define INTERFACE ID3DX11EffectShaderResourceVariable @@ -723,7 +789,7 @@ DECLARE_INTERFACE_(ID3DX11EffectShaderResourceVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -756,23 +822,33 @@ DECLARE_INTERFACE_(ID3DX11EffectShaderResourceVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(SetResource)(THIS_ ID3D11ShaderResourceView *pResource) PURE; - STDMETHOD(GetResource)(THIS_ ID3D11ShaderResourceView **ppResource) PURE; + STDMETHOD(SetResource)(THIS_ ID3D11ShaderResourceView * pResource) PURE; + STDMETHOD(GetResource)(THIS_ ID3D11ShaderResourceView * *ppResource) PURE; - STDMETHOD(SetResourceArray)(THIS_ ID3D11ShaderResourceView **ppResources, UINT Offset, UINT Count) PURE; - STDMETHOD(GetResourceArray)(THIS_ ID3D11ShaderResourceView **ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(SetResourceArray)(THIS_ ID3D11ShaderResourceView * *ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(GetResourceArray)(THIS_ ID3D11ShaderResourceView * *ppResources, UINT Offset, UINT Count) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectUnorderedAccessViewVariable //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectUnorderedAccessViewVariable ID3DX11EffectUnorderedAccessViewVariable; +typedef interface ID3DX11EffectUnorderedAccessViewVariable ID3DX11EffectUnorderedAccessViewVariable; typedef interface 
ID3DX11EffectUnorderedAccessViewVariable *LPD3D11EFFECTUNORDEREDACCESSVIEWVARIABLE; // {79B4AC8C-870A-47D2-B05A-8BD5CC3EE6C9} DEFINE_GUID(IID_ID3DX11EffectUnorderedAccessViewVariable, - 0x79b4ac8c, 0x870a, 0x47d2, 0xb0, 0x5a, 0x8b, 0xd5, 0xcc, 0x3e, 0xe6, 0xc9); + 0x79b4ac8c, + 0x870a, + 0x47d2, + 0xb0, + 0x5a, + 0x8b, + 0xd5, + 0xcc, + 0x3e, + 0xe6, + 0xc9); #undef INTERFACE #define INTERFACE ID3DX11EffectUnorderedAccessViewVariable @@ -781,7 +857,7 @@ DECLARE_INTERFACE_(ID3DX11EffectUnorderedAccessViewVariable, ID3DX11EffectVariab { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -814,23 +890,35 @@ DECLARE_INTERFACE_(ID3DX11EffectUnorderedAccessViewVariable, ID3DX11EffectVariab STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(SetUnorderedAccessView)(THIS_ ID3D11UnorderedAccessView *pResource) PURE; - STDMETHOD(GetUnorderedAccessView)(THIS_ ID3D11UnorderedAccessView **ppResource) PURE; + STDMETHOD(SetUnorderedAccessView)(THIS_ ID3D11UnorderedAccessView * pResource) PURE; + STDMETHOD(GetUnorderedAccessView)(THIS_ ID3D11UnorderedAccessView * *ppResource) PURE; - STDMETHOD(SetUnorderedAccessViewArray)(THIS_ ID3D11UnorderedAccessView **ppResources, UINT Offset, UINT Count) PURE; - STDMETHOD(GetUnorderedAccessViewArray)(THIS_ ID3D11UnorderedAccessView **ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(SetUnorderedAccessViewArray)(THIS_ ID3D11UnorderedAccessView * *ppResources, UINT Offset, UINT Count) + PURE; + STDMETHOD(GetUnorderedAccessViewArray)(THIS_ ID3D11UnorderedAccessView * *ppResources, UINT Offset, UINT Count) + PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectRenderTargetViewVariable ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectRenderTargetViewVariable ID3DX11EffectRenderTargetViewVariable; +typedef interface ID3DX11EffectRenderTargetViewVariable ID3DX11EffectRenderTargetViewVariable; typedef interface ID3DX11EffectRenderTargetViewVariable *LPD3D11EFFECTRENDERTARGETVIEWVARIABLE; // {D5066909-F40C-43F8-9DB5-057C2A208552} DEFINE_GUID(IID_ID3DX11EffectRenderTargetViewVariable, - 0xd5066909, 0xf40c, 0x43f8, 0x9d, 0xb5, 0x05, 0x7c, 0x2a, 0x20, 0x85, 0x52); + 0xd5066909, + 0xf40c, + 0x43f8, + 0x9d, + 0xb5, + 0x05, + 0x7c, + 0x2a, + 0x20, + 0x85, + 0x52); #undef INTERFACE #define INTERFACE ID3DX11EffectRenderTargetViewVariable @@ -839,7 +927,7 @@ DECLARE_INTERFACE_(ID3DX11EffectRenderTargetViewVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -872,23 +960,33 @@ DECLARE_INTERFACE_(ID3DX11EffectRenderTargetViewVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT 
Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(SetRenderTarget)(THIS_ ID3D11RenderTargetView *pResource) PURE; - STDMETHOD(GetRenderTarget)(THIS_ ID3D11RenderTargetView **ppResource) PURE; + STDMETHOD(SetRenderTarget)(THIS_ ID3D11RenderTargetView * pResource) PURE; + STDMETHOD(GetRenderTarget)(THIS_ ID3D11RenderTargetView * *ppResource) PURE; - STDMETHOD(SetRenderTargetArray)(THIS_ ID3D11RenderTargetView **ppResources, UINT Offset, UINT Count) PURE; - STDMETHOD(GetRenderTargetArray)(THIS_ ID3D11RenderTargetView **ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(SetRenderTargetArray)(THIS_ ID3D11RenderTargetView * *ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(GetRenderTargetArray)(THIS_ ID3D11RenderTargetView * *ppResources, UINT Offset, UINT Count) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectDepthStencilViewVariable ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectDepthStencilViewVariable ID3DX11EffectDepthStencilViewVariable; +typedef interface ID3DX11EffectDepthStencilViewVariable ID3DX11EffectDepthStencilViewVariable; typedef interface ID3DX11EffectDepthStencilViewVariable *LPD3D11EFFECTDEPTHSTENCILVIEWVARIABLE; // {33C648AC-2E9E-4A2E-9CD6-DE31ACC5B347} DEFINE_GUID(IID_ID3DX11EffectDepthStencilViewVariable, - 0x33c648ac, 0x2e9e, 0x4a2e, 0x9c, 0xd6, 0xde, 0x31, 0xac, 0xc5, 0xb3, 0x47); + 0x33c648ac, + 0x2e9e, + 0x4a2e, + 0x9c, + 0xd6, + 0xde, + 0x31, + 0xac, + 0xc5, + 0xb3, + 0x47); #undef INTERFACE #define INTERFACE ID3DX11EffectDepthStencilViewVariable @@ -897,7 +995,7 @@ DECLARE_INTERFACE_(ID3DX11EffectDepthStencilViewVariable, ID3DX11EffectVariable) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -930,23 +1028,33 @@ DECLARE_INTERFACE_(ID3DX11EffectDepthStencilViewVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(SetDepthStencil)(THIS_ ID3D11DepthStencilView *pResource) PURE; - STDMETHOD(GetDepthStencil)(THIS_ ID3D11DepthStencilView **ppResource) PURE; + STDMETHOD(SetDepthStencil)(THIS_ ID3D11DepthStencilView * pResource) PURE; + STDMETHOD(GetDepthStencil)(THIS_ ID3D11DepthStencilView * *ppResource) PURE; - STDMETHOD(SetDepthStencilArray)(THIS_ ID3D11DepthStencilView **ppResources, UINT Offset, UINT Count) PURE; - STDMETHOD(GetDepthStencilArray)(THIS_ ID3D11DepthStencilView **ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(SetDepthStencilArray)(THIS_ ID3D11DepthStencilView * *ppResources, UINT Offset, UINT Count) PURE; + STDMETHOD(GetDepthStencilArray)(THIS_ ID3D11DepthStencilView * *ppResources, UINT Offset, UINT Count) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectConstantBuffer //////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectConstantBuffer ID3DX11EffectConstantBuffer; +typedef interface 
ID3DX11EffectConstantBuffer ID3DX11EffectConstantBuffer; typedef interface ID3DX11EffectConstantBuffer *LPD3D11EFFECTCONSTANTBUFFER; // {2CB6C733-82D2-4000-B3DA-6219D9A99BF2} DEFINE_GUID(IID_ID3DX11EffectConstantBuffer, - 0x2cb6c733, 0x82d2, 0x4000, 0xb3, 0xda, 0x62, 0x19, 0xd9, 0xa9, 0x9b, 0xf2); + 0x2cb6c733, + 0x82d2, + 0x4000, + 0xb3, + 0xda, + 0x62, + 0x19, + 0xd9, + 0xa9, + 0x9b, + 0xf2); #undef INTERFACE #define INTERFACE ID3DX11EffectConstantBuffer @@ -954,7 +1062,7 @@ DEFINE_GUID(IID_ID3DX11EffectConstantBuffer, DECLARE_INTERFACE_(ID3DX11EffectConstantBuffer, ID3DX11EffectVariable) { STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -987,13 +1095,13 @@ DECLARE_INTERFACE_(ID3DX11EffectConstantBuffer, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(SetConstantBuffer)(THIS_ ID3D11Buffer *pConstantBuffer) PURE; + STDMETHOD(SetConstantBuffer)(THIS_ ID3D11Buffer * pConstantBuffer) PURE; STDMETHOD(UndoSetConstantBuffer)(THIS) PURE; - STDMETHOD(GetConstantBuffer)(THIS_ ID3D11Buffer **ppConstantBuffer) PURE; + STDMETHOD(GetConstantBuffer)(THIS_ ID3D11Buffer * *ppConstantBuffer) PURE; - STDMETHOD(SetTextureBuffer)(THIS_ ID3D11ShaderResourceView *pTextureBuffer) PURE; + STDMETHOD(SetTextureBuffer)(THIS_ ID3D11ShaderResourceView * pTextureBuffer) PURE; STDMETHOD(UndoSetTextureBuffer)(THIS) PURE; - STDMETHOD(GetTextureBuffer)(THIS_ ID3D11ShaderResourceView **ppTextureBuffer) PURE; + STDMETHOD(GetTextureBuffer)(THIS_ ID3D11ShaderResourceView * *ppTextureBuffer) PURE; }; ////////////////////////////////////////////////////////////////////////////// @@ -1008,32 +1116,42 @@ DECLARE_INTERFACE_(ID3DX11EffectConstantBuffer, ID3DX11EffectVariable) typedef struct _D3DX11_EFFECT_SHADER_DESC { - CONST BYTE *pInputSignature; // Passed into CreateInputLayout, + CONST BYTE *pInputSignature; // Passed into CreateInputLayout, // valid on VS and GS only - BOOL IsInline; // Is this an anonymous shader variable + BOOL IsInline; // Is this an anonymous shader variable // resulting from an inline shader assignment? 
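    // Illustrative retrieval of this descriptor (a sketch with hypothetical
    // names): ID3DX11EffectShaderVariable::GetShaderDesc(), declared below,
    // fills it in for a given shader array element:
    //   D3DX11_EFFECT_SHADER_DESC sd;
    //   pShaderVar->GetShaderDesc(0 /* ShaderIndex */, &sd);
    //   // sd.pInputSignature is the blob CreateInputLayout() consumes (VS/GS only)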
// -- The following fields are not valid after Optimize() -- - CONST BYTE *pBytecode; // Shader bytecode - UINT BytecodeLength; + CONST BYTE *pBytecode; // Shader bytecode + UINT BytecodeLength; - LPCSTR SODecls[D3D11_SO_STREAM_COUNT]; // Stream out declaration string (for GS with SO) - UINT RasterizedStream; + LPCSTR SODecls[D3D11_SO_STREAM_COUNT]; // Stream out declaration string (for GS with SO) + UINT RasterizedStream; - UINT NumInputSignatureEntries; // Number of entries in the input signature - UINT NumOutputSignatureEntries; // Number of entries in the output signature - UINT NumPatchConstantSignatureEntries; // Number of entries in the patch constant signature + UINT NumInputSignatureEntries; // Number of entries in the input signature + UINT NumOutputSignatureEntries; // Number of entries in the output signature + UINT NumPatchConstantSignatureEntries; // Number of entries in the patch constant signature } D3DX11_EFFECT_SHADER_DESC; -typedef interface ID3DX11EffectShaderVariable ID3DX11EffectShaderVariable; +typedef interface ID3DX11EffectShaderVariable ID3DX11EffectShaderVariable; typedef interface ID3DX11EffectShaderVariable *LPD3D11EFFECTSHADERVARIABLE; // {7508B344-020A-4EC7-9118-62CDD36C88D7} DEFINE_GUID(IID_ID3DX11EffectShaderVariable, - 0x7508b344, 0x020a, 0x4ec7, 0x91, 0x18, 0x62, 0xcd, 0xd3, 0x6c, 0x88, 0xd7); + 0x7508b344, + 0x020a, + 0x4ec7, + 0x91, + 0x18, + 0x62, + 0xcd, + 0xd3, + 0x6c, + 0x88, + 0xd7); #undef INTERFACE #define INTERFACE ID3DX11EffectShaderVariable @@ -1041,7 +1159,7 @@ DEFINE_GUID(IID_ID3DX11EffectShaderVariable, DECLARE_INTERFACE_(ID3DX11EffectShaderVariable, ID3DX11EffectVariable) { STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1074,30 +1192,32 @@ DECLARE_INTERFACE_(ID3DX11EffectShaderVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetShaderDesc)(THIS_ UINT ShaderIndex, D3DX11_EFFECT_SHADER_DESC *pDesc) PURE; + STDMETHOD(GetShaderDesc)(THIS_ UINT ShaderIndex, D3DX11_EFFECT_SHADER_DESC * pDesc) PURE; - STDMETHOD(GetVertexShader)(THIS_ UINT ShaderIndex, ID3D11VertexShader **ppVS) PURE; - STDMETHOD(GetGeometryShader)(THIS_ UINT ShaderIndex, ID3D11GeometryShader **ppGS) PURE; - STDMETHOD(GetPixelShader)(THIS_ UINT ShaderIndex, ID3D11PixelShader **ppPS) PURE; - STDMETHOD(GetHullShader)(THIS_ UINT ShaderIndex, ID3D11HullShader **ppPS) PURE; - STDMETHOD(GetDomainShader)(THIS_ UINT ShaderIndex, ID3D11DomainShader **ppPS) PURE; - STDMETHOD(GetComputeShader)(THIS_ UINT ShaderIndex, ID3D11ComputeShader **ppPS) PURE; + STDMETHOD(GetVertexShader)(THIS_ UINT ShaderIndex, ID3D11VertexShader * *ppVS) PURE; + STDMETHOD(GetGeometryShader)(THIS_ UINT ShaderIndex, ID3D11GeometryShader * *ppGS) PURE; + STDMETHOD(GetPixelShader)(THIS_ UINT ShaderIndex, ID3D11PixelShader * *ppPS) PURE; + STDMETHOD(GetHullShader)(THIS_ UINT ShaderIndex, ID3D11HullShader * *ppPS) PURE; + STDMETHOD(GetDomainShader)(THIS_ UINT ShaderIndex, ID3D11DomainShader * *ppPS) PURE; + STDMETHOD(GetComputeShader)(THIS_ UINT ShaderIndex, ID3D11ComputeShader * *ppPS) PURE; - STDMETHOD(GetInputSignatureElementDesc)(THIS_ UINT ShaderIndex, UINT Element, 
D3D11_SIGNATURE_PARAMETER_DESC *pDesc) PURE; - STDMETHOD(GetOutputSignatureElementDesc)(THIS_ UINT ShaderIndex, UINT Element, D3D11_SIGNATURE_PARAMETER_DESC *pDesc) PURE; - STDMETHOD(GetPatchConstantSignatureElementDesc)(THIS_ UINT ShaderIndex, UINT Element, D3D11_SIGNATURE_PARAMETER_DESC *pDesc) PURE; + STDMETHOD(GetInputSignatureElementDesc)( + THIS_ UINT ShaderIndex, UINT Element, D3D11_SIGNATURE_PARAMETER_DESC * pDesc) PURE; + STDMETHOD(GetOutputSignatureElementDesc)( + THIS_ UINT ShaderIndex, UINT Element, D3D11_SIGNATURE_PARAMETER_DESC * pDesc) PURE; + STDMETHOD(GetPatchConstantSignatureElementDesc)( + THIS_ UINT ShaderIndex, UINT Element, D3D11_SIGNATURE_PARAMETER_DESC * pDesc) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectBlendVariable ///////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectBlendVariable ID3DX11EffectBlendVariable; +typedef interface ID3DX11EffectBlendVariable ID3DX11EffectBlendVariable; typedef interface ID3DX11EffectBlendVariable *LPD3D11EFFECTBLENDVARIABLE; // {D664F4D7-3B81-4805-B277-C1DF58C39F53} -DEFINE_GUID(IID_ID3DX11EffectBlendVariable, - 0xd664f4d7, 0x3b81, 0x4805, 0xb2, 0x77, 0xc1, 0xdf, 0x58, 0xc3, 0x9f, 0x53); +DEFINE_GUID(IID_ID3DX11EffectBlendVariable, 0xd664f4d7, 0x3b81, 0x4805, 0xb2, 0x77, 0xc1, 0xdf, 0x58, 0xc3, 0x9f, 0x53); #undef INTERFACE #define INTERFACE ID3DX11EffectBlendVariable @@ -1105,7 +1225,7 @@ DEFINE_GUID(IID_ID3DX11EffectBlendVariable, DECLARE_INTERFACE_(ID3DX11EffectBlendVariable, ID3DX11EffectVariable) { STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1138,22 +1258,32 @@ DECLARE_INTERFACE_(ID3DX11EffectBlendVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetBlendState)(THIS_ UINT Index, ID3D11BlendState **ppBlendState) PURE; - STDMETHOD(SetBlendState)(THIS_ UINT Index, ID3D11BlendState *pBlendState) PURE; + STDMETHOD(GetBlendState)(THIS_ UINT Index, ID3D11BlendState * *ppBlendState) PURE; + STDMETHOD(SetBlendState)(THIS_ UINT Index, ID3D11BlendState * pBlendState) PURE; STDMETHOD(UndoSetBlendState)(THIS_ UINT Index) PURE; - STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_BLEND_DESC *pBlendDesc) PURE; + STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_BLEND_DESC * pBlendDesc) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectDepthStencilVariable ////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectDepthStencilVariable ID3DX11EffectDepthStencilVariable; +typedef interface ID3DX11EffectDepthStencilVariable ID3DX11EffectDepthStencilVariable; typedef interface ID3DX11EffectDepthStencilVariable *LPD3D11EFFECTDEPTHSTENCILVARIABLE; // {69B5751B-61A5-48E5-BD41-D93988111563} DEFINE_GUID(IID_ID3DX11EffectDepthStencilVariable, - 0x69b5751b, 0x61a5, 0x48e5, 0xbd, 0x41, 0xd9, 0x39, 0x88, 0x11, 0x15, 0x63); + 0x69b5751b, + 0x61a5, + 0x48e5, + 0xbd, + 0x41, + 0xd9, + 0x39, + 0x88, + 
0x11, + 0x15, + 0x63); #undef INTERFACE #define INTERFACE ID3DX11EffectDepthStencilVariable @@ -1161,7 +1291,7 @@ DEFINE_GUID(IID_ID3DX11EffectDepthStencilVariable, DECLARE_INTERFACE_(ID3DX11EffectDepthStencilVariable, ID3DX11EffectVariable) { STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1194,22 +1324,32 @@ DECLARE_INTERFACE_(ID3DX11EffectDepthStencilVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetDepthStencilState)(THIS_ UINT Index, ID3D11DepthStencilState **ppDepthStencilState) PURE; - STDMETHOD(SetDepthStencilState)(THIS_ UINT Index, ID3D11DepthStencilState *pDepthStencilState) PURE; + STDMETHOD(GetDepthStencilState)(THIS_ UINT Index, ID3D11DepthStencilState * *ppDepthStencilState) PURE; + STDMETHOD(SetDepthStencilState)(THIS_ UINT Index, ID3D11DepthStencilState * pDepthStencilState) PURE; STDMETHOD(UndoSetDepthStencilState)(THIS_ UINT Index) PURE; - STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_DEPTH_STENCIL_DESC *pDepthStencilDesc) PURE; + STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_DEPTH_STENCIL_DESC * pDepthStencilDesc) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectRasterizerVariable //////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectRasterizerVariable ID3DX11EffectRasterizerVariable; +typedef interface ID3DX11EffectRasterizerVariable ID3DX11EffectRasterizerVariable; typedef interface ID3DX11EffectRasterizerVariable *LPD3D11EFFECTRASTERIZERVARIABLE; // {53A262F6-5F74-4151-A132-E3DD19A62C9D} DEFINE_GUID(IID_ID3DX11EffectRasterizerVariable, - 0x53a262f6, 0x5f74, 0x4151, 0xa1, 0x32, 0xe3, 0xdd, 0x19, 0xa6, 0x2c, 0x9d); + 0x53a262f6, + 0x5f74, + 0x4151, + 0xa1, + 0x32, + 0xe3, + 0xdd, + 0x19, + 0xa6, + 0x2c, + 0x9d); #undef INTERFACE #define INTERFACE ID3DX11EffectRasterizerVariable @@ -1217,7 +1357,7 @@ DEFINE_GUID(IID_ID3DX11EffectRasterizerVariable, DECLARE_INTERFACE_(ID3DX11EffectRasterizerVariable, ID3DX11EffectVariable) { STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1250,22 +1390,32 @@ DECLARE_INTERFACE_(ID3DX11EffectRasterizerVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetRasterizerState)(THIS_ UINT Index, ID3D11RasterizerState **ppRasterizerState) PURE; - STDMETHOD(SetRasterizerState)(THIS_ UINT Index, ID3D11RasterizerState *pRasterizerState) PURE; + STDMETHOD(GetRasterizerState)(THIS_ UINT Index, ID3D11RasterizerState * *ppRasterizerState) PURE; + STDMETHOD(SetRasterizerState)(THIS_ UINT Index, ID3D11RasterizerState * pRasterizerState) PURE; STDMETHOD(UndoSetRasterizerState)(THIS_ UINT Index) PURE; - 
STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_RASTERIZER_DESC *pRasterizerDesc) PURE; + STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_RASTERIZER_DESC * pRasterizerDesc) PURE; }; ////////////////////////////////////////////////////////////////////////////// // ID3DX11EffectSamplerVariable /////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -typedef interface ID3DX11EffectSamplerVariable ID3DX11EffectSamplerVariable; +typedef interface ID3DX11EffectSamplerVariable ID3DX11EffectSamplerVariable; typedef interface ID3DX11EffectSamplerVariable *LPD3D11EFFECTSAMPLERVARIABLE; // {C6402E55-1095-4D95-8931-F92660513DD9} DEFINE_GUID(IID_ID3DX11EffectSamplerVariable, - 0xc6402e55, 0x1095, 0x4d95, 0x89, 0x31, 0xf9, 0x26, 0x60, 0x51, 0x3d, 0xd9); + 0xc6402e55, + 0x1095, + 0x4d95, + 0x89, + 0x31, + 0xf9, + 0x26, + 0x60, + 0x51, + 0x3d, + 0xd9); #undef INTERFACE #define INTERFACE ID3DX11EffectSamplerVariable @@ -1273,7 +1423,7 @@ DEFINE_GUID(IID_ID3DX11EffectSamplerVariable, DECLARE_INTERFACE_(ID3DX11EffectSamplerVariable, ID3DX11EffectVariable) { STDMETHOD_(ID3DX11EffectType *, GetType)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_VARIABLE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1306,10 +1456,10 @@ DECLARE_INTERFACE_(ID3DX11EffectSamplerVariable, ID3DX11EffectVariable) STDMETHOD(SetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; STDMETHOD(GetRawValue)(THIS_ void *pData, UINT Offset, UINT Count) PURE; - STDMETHOD(GetSampler)(THIS_ UINT Index, ID3D11SamplerState **ppSampler) PURE; - STDMETHOD(SetSampler)(THIS_ UINT Index, ID3D11SamplerState *pSampler) PURE; + STDMETHOD(GetSampler)(THIS_ UINT Index, ID3D11SamplerState * *ppSampler) PURE; + STDMETHOD(SetSampler)(THIS_ UINT Index, ID3D11SamplerState * pSampler) PURE; STDMETHOD(UndoSetSampler)(THIS_ UINT Index) PURE; - STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_SAMPLER_DESC *pSamplerDesc) PURE; + STDMETHOD(GetBackingStore)(THIS_ UINT Index, D3D11_SAMPLER_DESC * pSamplerDesc) PURE; }; ////////////////////////////////////////////////////////////////////////////// @@ -1324,16 +1474,16 @@ DECLARE_INTERFACE_(ID3DX11EffectSamplerVariable, ID3DX11EffectVariable) typedef struct _D3DX11_PASS_DESC { - LPCSTR Name; // Name of this pass (NULL if not anonymous) - UINT Annotations; // Number of annotations on this pass + LPCSTR Name; // Name of this pass (NULL if not anonymous) + UINT Annotations; // Number of annotations on this pass - BYTE *pIAInputSignature; // Signature from VS or GS (if there is no VS) + BYTE *pIAInputSignature; // Signature from VS or GS (if there is no VS) // or NULL if neither exists - SIZE_T IAInputSignatureSize; // Singature size in bytes + SIZE_T IAInputSignatureSize; // Signature size in bytes - UINT StencilRef; // Specified in SetDepthStencilState() - UINT SampleMask; // Specified in SetBlendState() - FLOAT BlendFactor[4]; // Specified in SetBlendState() + UINT StencilRef; // Specified in SetDepthStencilState() + UINT SampleMask; // Specified in SetBlendState() + FLOAT BlendFactor[4]; // Specified in SetBlendState() } D3DX11_PASS_DESC; //---------------------------------------------------------------------------- @@ -1344,7 +1494,7 @@ typedef struct _D3DX11_PASS_DESC typedef struct _D3DX11_PASS_SHADER_DESC { -
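    // A common (illustrative) pattern with D3DX11_PASS_DESC above: feed the IA
    // signature to ID3D11Device::CreateInputLayout(). Local names here are
    // hypothetical; GetDesc() is declared on ID3DX11EffectPass below:
    //   D3DX11_PASS_DESC pd;
    //   pPass->GetDesc(&pd);
    //   pDevice->CreateInputLayout(layoutDesc, numElements,
    //                              pd.pIAInputSignature, pd.IAInputSignatureSize,
    //                              &pInputLayout);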
ID3DX11EffectShaderVariable *pShaderVariable; // The variable that this shader came from. + ID3DX11EffectShaderVariable *pShaderVariable; // The variable that this shader came from. // If this is an inline shader assignment, // the returned interface will be an // anonymous shader variable, which is @@ -1355,16 +1505,15 @@ typedef struct _D3DX11_PASS_SHADER_DESC // the pass block, pShaderVariable != NULL, // but pShaderVariable->IsValid() == FALSE. - UINT ShaderIndex; // The element of pShaderVariable (if an array) + UINT ShaderIndex; // The element of pShaderVariable (if an array) // or 0 if not applicable } D3DX11_PASS_SHADER_DESC; -typedef interface ID3DX11EffectPass ID3DX11EffectPass; +typedef interface ID3DX11EffectPass ID3DX11EffectPass; typedef interface ID3DX11EffectPass *LPD3D11EFFECTPASS; // {3437CEC4-4AC1-4D87-8916-F4BD5A41380C} -DEFINE_GUID(IID_ID3DX11EffectPass, - 0x3437cec4, 0x4ac1, 0x4d87, 0x89, 0x16, 0xf4, 0xbd, 0x5a, 0x41, 0x38, 0x0c); +DEFINE_GUID(IID_ID3DX11EffectPass, 0x3437cec4, 0x4ac1, 0x4d87, 0x89, 0x16, 0xf4, 0xbd, 0x5a, 0x41, 0x38, 0x0c); #undef INTERFACE #define INTERFACE ID3DX11EffectPass @@ -1372,21 +1521,21 @@ DEFINE_GUID(IID_ID3DX11EffectPass, DECLARE_INTERFACE(ID3DX11EffectPass) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_PASS_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_PASS_DESC * pDesc) PURE; - STDMETHOD(GetVertexShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC *pDesc) PURE; - STDMETHOD(GetGeometryShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC *pDesc) PURE; - STDMETHOD(GetPixelShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC *pDesc) PURE; - STDMETHOD(GetHullShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC *pDesc) PURE; - STDMETHOD(GetDomainShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC *pDesc) PURE; - STDMETHOD(GetComputeShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC *pDesc) PURE; + STDMETHOD(GetVertexShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC * pDesc) PURE; + STDMETHOD(GetGeometryShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC * pDesc) PURE; + STDMETHOD(GetPixelShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC * pDesc) PURE; + STDMETHOD(GetHullShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC * pDesc) PURE; + STDMETHOD(GetDomainShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC * pDesc) PURE; + STDMETHOD(GetComputeShaderDesc)(THIS_ D3DX11_PASS_SHADER_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; - STDMETHOD(Apply)(THIS_ UINT Flags, ID3D11DeviceContext* pContext) PURE; + STDMETHOD(Apply)(THIS_ UINT Flags, ID3D11DeviceContext * pContext) PURE; - STDMETHOD(ComputeStateBlockMask)(THIS_ D3DX11_STATE_BLOCK_MASK *pStateBlockMask) PURE; + STDMETHOD(ComputeStateBlockMask)(THIS_ D3DX11_STATE_BLOCK_MASK * pStateBlockMask) PURE; }; ////////////////////////////////////////////////////////////////////////////// @@ -1401,17 +1550,16 @@ DECLARE_INTERFACE(ID3DX11EffectPass) typedef struct _D3DX11_TECHNIQUE_DESC { - LPCSTR Name; // Name of this technique (NULL if not anonymous) - UINT Passes; // Number of passes contained within - UINT Annotations; // Number of annotations on this technique + LPCSTR Name; // Name of this technique (NULL if not anonymous) + UINT Passes; // Number of passes contained within + UINT Annotations; // Number of annotations on this technique } D3DX11_TECHNIQUE_DESC; -typedef interface ID3DX11EffectTechnique ID3DX11EffectTechnique; +typedef interface ID3DX11EffectTechnique ID3DX11EffectTechnique; typedef interface ID3DX11EffectTechnique 
*LPD3D11EFFECTTECHNIQUE; // {51198831-1F1D-4F47-BD76-41CB0835B1DE} -DEFINE_GUID(IID_ID3DX11EffectTechnique, - 0x51198831, 0x1f1d, 0x4f47, 0xbd, 0x76, 0x41, 0xcb, 0x08, 0x35, 0xb1, 0xde); +DEFINE_GUID(IID_ID3DX11EffectTechnique, 0x51198831, 0x1f1d, 0x4f47, 0xbd, 0x76, 0x41, 0xcb, 0x08, 0x35, 0xb1, 0xde); #undef INTERFACE #define INTERFACE ID3DX11EffectTechnique @@ -1419,7 +1567,7 @@ DEFINE_GUID(IID_ID3DX11EffectTechnique, DECLARE_INTERFACE(ID3DX11EffectTechnique) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_TECHNIQUE_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_TECHNIQUE_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1427,7 +1575,7 @@ DECLARE_INTERFACE(ID3DX11EffectTechnique) STDMETHOD_(ID3DX11EffectPass *, GetPassByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectPass *, GetPassByName)(THIS_ LPCSTR Name) PURE; - STDMETHOD(ComputeStateBlockMask)(THIS_ D3DX11_STATE_BLOCK_MASK *pStateBlockMask) PURE; + STDMETHOD(ComputeStateBlockMask)(THIS_ D3DX11_STATE_BLOCK_MASK * pStateBlockMask) PURE; }; ////////////////////////////////////////////////////////////////////////////// @@ -1442,17 +1590,16 @@ DECLARE_INTERFACE(ID3DX11EffectTechnique) typedef struct _D3DX11_GROUP_DESC { - LPCSTR Name; // Name of this group (only NULL if global) - UINT Techniques; // Number of techniques contained within - UINT Annotations; // Number of annotations on this group + LPCSTR Name; // Name of this group (only NULL if global) + UINT Techniques; // Number of techniques contained within + UINT Annotations; // Number of annotations on this group } D3DX11_GROUP_DESC; -typedef interface ID3DX11EffectGroup ID3DX11EffectGroup; +typedef interface ID3DX11EffectGroup ID3DX11EffectGroup; typedef interface ID3DX11EffectGroup *LPD3D11EFFECTGROUP; // {03074acf-97de-485f-b201-cb775264afd6} -DEFINE_GUID(IID_ID3DX11EffectGroup, - 0x03074acf, 0x97de, 0x485f, 0xb2, 0x01, 0xcb, 0x77, 0x52, 0x64, 0xaf, 0xd6); +DEFINE_GUID(IID_ID3DX11EffectGroup, 0x03074acf, 0x97de, 0x485f, 0xb2, 0x01, 0xcb, 0x77, 0x52, 0x64, 0xaf, 0xd6); #undef INTERFACE #define INTERFACE ID3DX11EffectGroup @@ -1460,7 +1607,7 @@ DEFINE_GUID(IID_ID3DX11EffectGroup, DECLARE_INTERFACE(ID3DX11EffectGroup) { STDMETHOD_(BOOL, IsValid)(THIS) PURE; - STDMETHOD(GetDesc)(THIS_ D3DX11_GROUP_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_GROUP_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectVariable *, GetAnnotationByName)(THIS_ LPCSTR Name) PURE; @@ -1481,19 +1628,18 @@ DECLARE_INTERFACE(ID3DX11EffectGroup) typedef struct _D3DX11_EFFECT_DESC { - UINT ConstantBuffers; // Number of constant buffers in this effect - UINT GlobalVariables; // Number of global variables in this effect - UINT InterfaceVariables; // Number of global interfaces in this effect - UINT Techniques; // Number of techniques in this effect - UINT Groups; // Number of groups in this effect + UINT ConstantBuffers; // Number of constant buffers in this effect + UINT GlobalVariables; // Number of global variables in this effect + UINT InterfaceVariables; // Number of global interfaces in this effect + UINT Techniques; // Number of techniques in this effect + UINT Groups; // Number of groups in this effect } D3DX11_EFFECT_DESC; -typedef interface ID3DX11Effect ID3DX11Effect; +typedef interface ID3DX11Effect ID3DX11Effect; typedef interface ID3DX11Effect 
*LPD3D11EFFECT; // {FA61CA24-E4BA-4262-9DB8-B132E8CAE319} -DEFINE_GUID(IID_ID3DX11Effect, - 0xfa61ca24, 0xe4ba, 0x4262, 0x9d, 0xb8, 0xb1, 0x32, 0xe8, 0xca, 0xe3, 0x19); +DEFINE_GUID(IID_ID3DX11Effect, 0xfa61ca24, 0xe4ba, 0x4262, 0x9d, 0xb8, 0xb1, 0x32, 0xe8, 0xca, 0xe3, 0x19); #undef INTERFACE #define INTERFACE ID3DX11Effect @@ -1501,17 +1647,17 @@ DEFINE_GUID(IID_ID3DX11Effect, DECLARE_INTERFACE_(ID3DX11Effect, IUnknown) { // IUnknown - STDMETHOD(QueryInterface)(THIS_ REFIID iid, LPVOID *ppv) PURE; + STDMETHOD(QueryInterface)(THIS_ REFIID iid, LPVOID * ppv) PURE; STDMETHOD_(ULONG, AddRef)(THIS) PURE; STDMETHOD_(ULONG, Release)(THIS) PURE; STDMETHOD_(BOOL, IsValid)(THIS) PURE; // Managing D3D Device - STDMETHOD(GetDevice)(THIS_ ID3D11Device **ppDevice) PURE; + STDMETHOD(GetDevice)(THIS_ ID3D11Device * *ppDevice) PURE; // New Reflection APIs - STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_DESC *pDesc) PURE; + STDMETHOD(GetDesc)(THIS_ D3DX11_EFFECT_DESC * pDesc) PURE; STDMETHOD_(ID3DX11EffectConstantBuffer *, GetConstantBufferByIndex)(THIS_ UINT Index) PURE; STDMETHOD_(ID3DX11EffectConstantBuffer *, GetConstantBufferByName)(THIS_ LPCSTR Name) PURE; @@ -1528,7 +1674,7 @@ DECLARE_INTERFACE_(ID3DX11Effect, IUnknown) STDMETHOD_(ID3D11ClassLinkage *, GetClassLinkage)(THIS) PURE; - STDMETHOD(CloneEffect)(THIS_ UINT Flags, ID3DX11Effect **ppClonedEffect) PURE; + STDMETHOD(CloneEffect)(THIS_ UINT Flags, ID3DX11Effect * *ppClonedEffect) PURE; STDMETHOD(Optimize)(THIS) PURE; STDMETHOD_(BOOL, IsOptimized)(THIS) PURE; }; @@ -1538,41 +1684,45 @@ DECLARE_INTERFACE_(ID3DX11Effect, IUnknown) ////////////////////////////////////////////////////////////////////////////// #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif //__cplusplus -//---------------------------------------------------------------------------- -// D3DX11CreateEffectFromMemory: -// -------------------------- -// Creates an effect from a binary effect or file -// -// Parameters: -// -// [in] -// -// -// pData -// Blob of compiled effect data -// DataLength -// Length of the data blob -// FXFlags -// Compilation flags pertaining to Effect compilation, honored -// by the Effect compiler -// pDevice -// Pointer to the D3D11 device on which to create Effect resources -// -// [out] -// -// ppEffect -// Address of the newly created Effect interface -// -//---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- + // D3DX11CreateEffectFromMemory: + // -------------------------- + // Creates an effect from a binary effect or file + // + // Parameters: + // + // [in] + // + // + // pData + // Blob of compiled effect data + // DataLength + // Length of the data blob + // FXFlags + // Compilation flags pertaining to Effect compilation, honored + // by the Effect compiler + // pDevice + // Pointer to the D3D11 device on which to create Effect resources + // + // [out] + // + // ppEffect + // Address of the newly created Effect interface + // + //---------------------------------------------------------------------------- -HRESULT WINAPI D3DX11CreateEffectFromMemory(void *pData, SIZE_T DataLength, UINT FXFlags, ID3D11Device *pDevice, ID3DX11Effect **ppEffect); + HRESULT WINAPI D3DX11CreateEffectFromMemory(void *pData, + SIZE_T DataLength, + UINT FXFlags, + ID3D11Device *pDevice, + ID3DX11Effect **ppEffect); #ifdef __cplusplus } #endif //__cplusplus #endif //__D3DX11EFFECT_H__ - diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp 
b/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp index b606490f..ade0e2b1 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/simpleD3D11Texture.cpp @@ -31,19 +31,19 @@ #pragma warning(disable : 4312) -#include #include +#include // This header includes all the necessary D3D11 and CUDA includes -#include -#include #include +#include #include +#include // includes, project -#include #include -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h +#include #define MAX_EPSILON 10 @@ -52,245 +52,240 @@ static char *SDK_name = "simpleD3D11Texture"; //----------------------------------------------------------------------------- // Global variables //----------------------------------------------------------------------------- -IDXGIAdapter *g_pCudaCapableAdapter = NULL; // Adapter to use -ID3D11Device *g_pd3dDevice = NULL; // Our rendering device -ID3D11DeviceContext *g_pd3dDeviceContext = NULL; -IDXGISwapChain *g_pSwapChain = NULL; // The swap chain of the window -ID3D11RenderTargetView *g_pSwapChainRTV = - NULL; // The Render target view on the swap chain ( used for clear) -ID3D11RasterizerState *g_pRasterState = NULL; +IDXGIAdapter *g_pCudaCapableAdapter = NULL; // Adapter to use +ID3D11Device *g_pd3dDevice = NULL; // Our rendering device +ID3D11DeviceContext *g_pd3dDeviceContext = NULL; +IDXGISwapChain *g_pSwapChain = NULL; // The swap chain of the window +ID3D11RenderTargetView *g_pSwapChainRTV = NULL; // The Render target view on the swap chain ( used for clear) +ID3D11RasterizerState *g_pRasterState = NULL; ID3D11InputLayout *g_pInputLayout = NULL; #ifdef USEEFFECT -#pragma message( \ - ">>>> NOTE : Using Effect library (see DXSDK Utility folder for sources)") -#pragma message( \ - ">>>> WARNING : Currently only libs for vc9 are provided with the sample. See DXSDK for more...") -#pragma message( \ - ">>>> WARNING : The effect is currently failing... some strange internal error in Effect lib") -ID3DX11Effect *g_pSimpleEffect = NULL; -ID3DX11EffectTechnique *g_pSimpleTechnique = NULL; -ID3DX11EffectVectorVariable *g_pvQuadRect = NULL; -ID3DX11EffectScalarVariable *g_pUseCase = NULL; -ID3DX11EffectShaderResourceVariable *g_pTexture2D = NULL; -ID3DX11EffectShaderResourceVariable *g_pTexture3D = NULL; -ID3DX11EffectShaderResourceVariable *g_pTextureCube = NULL; +#pragma message(">>>> NOTE : Using Effect library (see DXSDK Utility folder for sources)") +#pragma message(">>>> WARNING : Currently only libs for vc9 are provided with the sample. See DXSDK for more...") +#pragma message(">>>> WARNING : The effect is currently failing...
some strange internal error in Effect lib") +ID3DX11Effect *g_pSimpleEffect = NULL; +ID3DX11EffectTechnique *g_pSimpleTechnique = NULL; +ID3DX11EffectVectorVariable *g_pvQuadRect = NULL; +ID3DX11EffectScalarVariable *g_pUseCase = NULL; +ID3DX11EffectShaderResourceVariable *g_pTexture2D = NULL; +ID3DX11EffectShaderResourceVariable *g_pTexture3D = NULL; +ID3DX11EffectShaderResourceVariable *g_pTextureCube = NULL; -static const char g_simpleEffectSrc[] = - "float4 g_vQuadRect; \n" - "int g_UseCase; \n" - "Texture2D g_Texture2D; \n" - "Texture3D g_Texture3D; \n" - "TextureCube g_TextureCube; \n" - "\n" - "SamplerState samLinear{ \n" - " Filter = MIN_MAG_LINEAR_MIP_POINT; \n" - "};\n" - "\n" - "struct Fragment{ \n" - " float4 Pos : SV_POSITION;\n" - " float3 Tex : TEXCOORD0; };\n" - "\n" - "Fragment VS( uint vertexId : SV_VertexID )\n" - "{\n" - " Fragment f;\n" - " f.Tex = float3( 0.f, 0.f, 0.f); \n" - " if (vertexId == 1) f.Tex.x = 1.f; \n" - " else if (vertexId == 2) f.Tex.y = 1.f; \n" - " else if (vertexId == 3) f.Tex.xy = float2(1.f, 1.f); \n" - " \n" - " f.Pos = float4( g_vQuadRect.xy + f.Tex * g_vQuadRect.zw, 0, 1);\n" - " \n" - " if (g_UseCase == 1) { \n" - " if (vertexId == 1) f.Tex.z = 0.5f; \n" - " else if (vertexId == 2) f.Tex.z = 0.5f; \n" - " else if (vertexId == 3) f.Tex.z = 1.f; \n" - " } \n" - " else if (g_UseCase >= 2) { \n" - " f.Tex.xy = f.Tex.xy * 2.f - 1.f; \n" - " } \n" - " return f;\n" - "}\n" - "\n" - "float4 PS( Fragment f ) : SV_Target\n" - "{\n" - " if (g_UseCase == 0) return g_Texture2D.Sample( samLinear, f.Tex.xy ); " - "\n" - " else if (g_UseCase == 1) return g_Texture3D.Sample( samLinear, f.Tex " - "); \n" - " else if (g_UseCase == 2) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.xy, 1.0) ); \n" - " else if (g_UseCase == 3) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.xy, -1.0) ); \n" - " else if (g_UseCase == 4) return g_TextureCube.Sample( samLinear, " - "float3(1.0, f.Tex.xy) ); \n" - " else if (g_UseCase == 5) return g_TextureCube.Sample( samLinear, " - "float3(-1.0, f.Tex.xy) ); \n" - " else if (g_UseCase == 6) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.x, 1.0, f.Tex.y) ); \n" - " else if (g_UseCase == 7) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.x, -1.0, f.Tex.y) ); \n" - " else return float4(f.Tex, 1);\n" - "}\n" - "\n" - "technique11 Render\n" - "{\n" - " pass P0\n" - " {\n" - " SetVertexShader( CompileShader( vs_5_0, VS() ) );\n" - " SetGeometryShader( NULL );\n" - " SetPixelShader( CompileShader( ps_5_0, PS() ) );\n" - " }\n" - "}\n" - "\n"; +static const char g_simpleEffectSrc[] = "float4 g_vQuadRect; \n" + "int g_UseCase; \n" + "Texture2D g_Texture2D; \n" + "Texture3D g_Texture3D; \n" + "TextureCube g_TextureCube; \n" + "\n" + "SamplerState samLinear{ \n" + " Filter = MIN_MAG_LINEAR_MIP_POINT; \n" + "};\n" + "\n" + "struct Fragment{ \n" + " float4 Pos : SV_POSITION;\n" + " float3 Tex : TEXCOORD0; };\n" + "\n" + "Fragment VS( uint vertexId : SV_VertexID )\n" + "{\n" + " Fragment f;\n" + " f.Tex = float3( 0.f, 0.f, 0.f); \n" + " if (vertexId == 1) f.Tex.x = 1.f; \n" + " else if (vertexId == 2) f.Tex.y = 1.f; \n" + " else if (vertexId == 3) f.Tex.xy = float2(1.f, 1.f); \n" + " \n" + " f.Pos = float4( g_vQuadRect.xy + f.Tex * g_vQuadRect.zw, 0, 1);\n" + " \n" + " if (g_UseCase == 1) { \n" + " if (vertexId == 1) f.Tex.z = 0.5f; \n" + " else if (vertexId == 2) f.Tex.z = 0.5f; \n" + " else if (vertexId == 3) f.Tex.z = 1.f; \n" + " } \n" + " else if (g_UseCase >= 2) { \n" + " f.Tex.xy = f.Tex.xy * 2.f - 
1.f; \n" + " } \n" + " return f;\n" + "}\n" + "\n" + "float4 PS( Fragment f ) : SV_Target\n" + "{\n" + " if (g_UseCase == 0) return g_Texture2D.Sample( samLinear, f.Tex.xy ); " + "\n" + " else if (g_UseCase == 1) return g_Texture3D.Sample( samLinear, f.Tex " + "); \n" + " else if (g_UseCase == 2) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.xy, 1.0) ); \n" + " else if (g_UseCase == 3) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.xy, -1.0) ); \n" + " else if (g_UseCase == 4) return g_TextureCube.Sample( samLinear, " + "float3(1.0, f.Tex.xy) ); \n" + " else if (g_UseCase == 5) return g_TextureCube.Sample( samLinear, " + "float3(-1.0, f.Tex.xy) ); \n" + " else if (g_UseCase == 6) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.x, 1.0, f.Tex.y) ); \n" + " else if (g_UseCase == 7) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.x, -1.0, f.Tex.y) ); \n" + " else return float4(f.Tex, 1);\n" + "}\n" + "\n" + "technique11 Render\n" + "{\n" + " pass P0\n" + " {\n" + " SetVertexShader( CompileShader( vs_5_0, VS() ) );\n" + " SetGeometryShader( NULL );\n" + " SetPixelShader( CompileShader( ps_5_0, PS() ) );\n" + " }\n" + "}\n" + "\n"; #else // // Vertex and Pixel shaders here : VS() & PS() // -static const char g_simpleShaders[] = - "cbuffer cbuf \n" - "{ \n" - " float4 g_vQuadRect; \n" - " int g_UseCase; \n" - "} \n" - "Texture2D g_Texture2D; \n" - "Texture3D g_Texture3D; \n" - "TextureCube g_TextureCube; \n" - "\n" - "SamplerState samLinear{ \n" - " Filter = MIN_MAG_LINEAR_MIP_POINT; \n" - "};\n" - "\n" - "struct Fragment{ \n" - " float4 Pos : SV_POSITION;\n" - " float3 Tex : TEXCOORD0; };\n" - "\n" - "Fragment VS( uint vertexId : SV_VertexID )\n" - "{\n" - " Fragment f;\n" - " f.Tex = float3( 0.f, 0.f, 0.f); \n" - " if (vertexId == 1) f.Tex.x = 1.f; \n" - " else if (vertexId == 2) f.Tex.y = 1.f; \n" - " else if (vertexId == 3) f.Tex.xy = float2(1.f, 1.f); \n" - " \n" - " f.Pos = float4( g_vQuadRect.xy + f.Tex * g_vQuadRect.zw, 0, 1);\n" - " \n" - " if (g_UseCase == 1) { \n" - " if (vertexId == 1) f.Tex.z = 0.5f; \n" - " else if (vertexId == 2) f.Tex.z = 0.5f; \n" - " else if (vertexId == 3) f.Tex.z = 1.f; \n" - " } \n" - " else if (g_UseCase >= 2) { \n" - " f.Tex.xy = f.Tex.xy * 2.f - 1.f; \n" - " } \n" - " return f;\n" - "}\n" - "\n" - "float4 PS( Fragment f ) : SV_Target\n" - "{\n" - " if (g_UseCase == 0) return g_Texture2D.Sample( samLinear, f.Tex.xy ); " - "\n" - " else if (g_UseCase == 1) return g_Texture3D.Sample( samLinear, f.Tex " - "); \n" - " else if (g_UseCase == 2) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.xy, 1.0) ); \n" - " else if (g_UseCase == 3) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.xy, -1.0) ); \n" - " else if (g_UseCase == 4) return g_TextureCube.Sample( samLinear, " - "float3(1.0, f.Tex.xy) ); \n" - " else if (g_UseCase == 5) return g_TextureCube.Sample( samLinear, " - "float3(-1.0, f.Tex.xy) ); \n" - " else if (g_UseCase == 6) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.x, 1.0, f.Tex.y) ); \n" - " else if (g_UseCase == 7) return g_TextureCube.Sample( samLinear, " - "float3(f.Tex.x, -1.0, f.Tex.y) ); \n" - " else return float4(f.Tex, 1);\n" - "}\n" - "\n"; +static const char g_simpleShaders[] = "cbuffer cbuf \n" + "{ \n" + " float4 g_vQuadRect; \n" + " int g_UseCase; \n" + "} \n" + "Texture2D g_Texture2D; \n" + "Texture3D g_Texture3D; \n" + "TextureCube g_TextureCube; \n" + "\n" + "SamplerState samLinear{ \n" + " Filter = MIN_MAG_LINEAR_MIP_POINT; \n" + "};\n" + "\n" + "struct 
Fragment{ \n" + " float4 Pos : SV_POSITION;\n" + " float3 Tex : TEXCOORD0; };\n" + "\n" + "Fragment VS( uint vertexId : SV_VertexID )\n" + "{\n" + " Fragment f;\n" + " f.Tex = float3( 0.f, 0.f, 0.f); \n" + " if (vertexId == 1) f.Tex.x = 1.f; \n" + " else if (vertexId == 2) f.Tex.y = 1.f; \n" + " else if (vertexId == 3) f.Tex.xy = float2(1.f, 1.f); \n" + " \n" + " f.Pos = float4( g_vQuadRect.xy + f.Tex * g_vQuadRect.zw, 0, 1);\n" + " \n" + " if (g_UseCase == 1) { \n" + " if (vertexId == 1) f.Tex.z = 0.5f; \n" + " else if (vertexId == 2) f.Tex.z = 0.5f; \n" + " else if (vertexId == 3) f.Tex.z = 1.f; \n" + " } \n" + " else if (g_UseCase >= 2) { \n" + " f.Tex.xy = f.Tex.xy * 2.f - 1.f; \n" + " } \n" + " return f;\n" + "}\n" + "\n" + "float4 PS( Fragment f ) : SV_Target\n" + "{\n" + " if (g_UseCase == 0) return g_Texture2D.Sample( samLinear, f.Tex.xy ); " + "\n" + " else if (g_UseCase == 1) return g_Texture3D.Sample( samLinear, f.Tex " + "); \n" + " else if (g_UseCase == 2) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.xy, 1.0) ); \n" + " else if (g_UseCase == 3) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.xy, -1.0) ); \n" + " else if (g_UseCase == 4) return g_TextureCube.Sample( samLinear, " + "float3(1.0, f.Tex.xy) ); \n" + " else if (g_UseCase == 5) return g_TextureCube.Sample( samLinear, " + "float3(-1.0, f.Tex.xy) ); \n" + " else if (g_UseCase == 6) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.x, 1.0, f.Tex.y) ); \n" + " else if (g_UseCase == 7) return g_TextureCube.Sample( samLinear, " + "float3(f.Tex.x, -1.0, f.Tex.y) ); \n" + " else return float4(f.Tex, 1);\n" + "}\n" + "\n"; -struct ConstantBuffer { - float vQuadRect[4]; - int UseCase; +struct ConstantBuffer +{ + float vQuadRect[4]; + int UseCase; }; ID3D11VertexShader *g_pVertexShader; -ID3D11PixelShader *g_pPixelShader; -ID3D11Buffer *g_pConstantBuffer; +ID3D11PixelShader *g_pPixelShader; +ID3D11Buffer *g_pConstantBuffer; ID3D11SamplerState *g_pSamplerState; #endif // testing/tracing function used pervasively in tests. 
if the condition is // unsatisfied // then spew and fail the function immediately (doing no cleanup) -#define AssertOrQuit(x) \ - if (!(x)) { \ - fprintf(stdout, "Assert unsatisfied in %s at %s:%d\n", __FUNCTION__, \ - __FILE__, __LINE__); \ - return 1; \ - } +#define AssertOrQuit(x) \ + if (!(x)) { \ + fprintf(stdout, "Assert unsatisfied in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \ + return 1; \ + } -bool g_bDone = false; +bool g_bDone = false; bool g_bPassed = true; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; -const unsigned int g_WindowWidth = 720; +const unsigned int g_WindowWidth = 720; const unsigned int g_WindowHeight = 720; int g_iFrameToCompare = 10; // Data structure for 2D texture shared between DX10 and CUDA -struct { - ID3D11Texture2D *pTexture; - ID3D11ShaderResourceView *pSRView; - cudaGraphicsResource *cudaResource; - void *cudaLinearMemory; - size_t pitch; - int width; - int height; +struct +{ + ID3D11Texture2D *pTexture; + ID3D11ShaderResourceView *pSRView; + cudaGraphicsResource *cudaResource; + void *cudaLinearMemory; + size_t pitch; + int width; + int height; #ifndef USEEFFECT - int offsetInShader; + int offsetInShader; #endif } g_texture_2d; // Data structure for volume textures shared between DX10 and CUDA -struct { - ID3D11Texture3D *pTexture; - ID3D11ShaderResourceView *pSRView; - cudaGraphicsResource *cudaResource; - void *cudaLinearMemory; - size_t pitch; - int width; - int height; - int depth; +struct +{ + ID3D11Texture3D *pTexture; + ID3D11ShaderResourceView *pSRView; + cudaGraphicsResource *cudaResource; + void *cudaLinearMemory; + size_t pitch; + int width; + int height; + int depth; #ifndef USEEFFECT - int offsetInShader; + int offsetInShader; #endif } g_texture_3d; // Data structure for cube texture shared between DX10 and CUDA -struct { - ID3D11Texture2D *pTexture; - ID3D11ShaderResourceView *pSRView; - cudaGraphicsResource *cudaResource; - void *cudaLinearMemory; - size_t pitch; - int size; +struct +{ + ID3D11Texture2D *pTexture; + ID3D11ShaderResourceView *pSRView; + cudaGraphicsResource *cudaResource; + void *cudaLinearMemory; + size_t pitch; + int size; #ifndef USEEFFECT - int offsetInShader; + int offsetInShader; #endif } g_texture_cube; // The CUDA kernel launchers that get called -extern "C" { -bool cuda_texture_2d(void *surface, size_t width, size_t height, size_t pitch, - float t); -bool cuda_texture_3d(void *surface, int width, int height, int depth, - size_t pitch, size_t pitchslice, float t); -bool cuda_texture_cube(void *surface, int width, int height, size_t pitch, - int face, float t); +extern "C" +{ + bool cuda_texture_2d(void *surface, size_t width, size_t height, size_t pitch, float t); + bool cuda_texture_3d(void *surface, int width, int height, int depth, size_t pitch, size_t pitchslice, float t); + bool cuda_texture_cube(void *surface, int width, int height, size_t pitch, int face, float t); } //----------------------------------------------------------------------------- @@ -308,939 +303,963 @@ LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam); #define NAME_LEN 512 -bool findCUDADevice() { - int nGraphicsGPU = 0; - int deviceCount = 0; - bool bFoundGraphics = false; - char devname[NAME_LEN]; +bool findCUDADevice() +{ + int nGraphicsGPU = 0; + int deviceCount = 0; + bool bFoundGraphics = false; + char devname[NAME_LEN]; - // This function call returns 0 if there are no CUDA capable devices. 
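The device discovery that findCUDADevice() performs below is a standard CUDA runtime idiom: cudaGetDeviceCount() reports zero devices without failing, and cudaGetDeviceProperties() names each one. Reduced to a standalone sketch (illustrative, not part of this patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        int deviceCount = 0;
        // Returns cudaSuccess with deviceCount == 0 when no CUDA device exists
        cudaError_t err = cudaGetDeviceCount(&deviceCount);
        if (err != cudaSuccess) {
            printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
            return 1;
        }
        for (int dev = 0; dev < deviceCount; ++dev) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, dev);
            printf("GPU %d: %s\n", dev, prop.name);
        }
        return (deviceCount > 0) ? 0 : 1;
    }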
-  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+    // This function call returns 0 if there are no CUDA capable devices.
+    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

-  if (error_id != cudaSuccess) {
-    printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
-           cudaGetErrorString(error_id));
-    exit(EXIT_FAILURE);
-  }
+    if (error_id != cudaSuccess) {
+        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
+        exit(EXIT_FAILURE);
+    }

-  if (deviceCount == 0) {
-    printf("> There are no device(s) supporting CUDA\n");
-    return false;
-  } else {
-    printf("> Found %d CUDA Capable Device(s)\n", deviceCount);
-  }
+    if (deviceCount == 0) {
+        printf("> There are no device(s) supporting CUDA\n");
+        return false;
+    }
+    else {
+        printf("> Found %d CUDA Capable Device(s)\n", deviceCount);
+    }

-  // Get CUDA device properties
-  cudaDeviceProp deviceProp;
+    // Get CUDA device properties
+    cudaDeviceProp deviceProp;

-  for (int dev = 0; dev < deviceCount; ++dev) {
-    cudaGetDeviceProperties(&deviceProp, dev);
-    STRCPY(devname, NAME_LEN, deviceProp.name);
-    printf("> GPU %d: %s\n", dev, devname);
-  }
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        cudaGetDeviceProperties(&deviceProp, dev);
+        STRCPY(devname, NAME_LEN, deviceProp.name);
+        printf("> GPU %d: %s\n", dev, devname);
+    }

-  return true;
+    return true;
 }

-bool findDXDevice(char *dev_name) {
-  HRESULT hr = S_OK;
-  cudaError cuStatus;
+bool findDXDevice(char *dev_name)
+{
+    HRESULT hr = S_OK;
+    cudaError cuStatus;

-  // Iterate through the candidate adapters
-  IDXGIFactory *pFactory;
-  hr = sFnPtr_CreateDXGIFactory(__uuidof(IDXGIFactory), (void **)(&pFactory));
+    // Iterate through the candidate adapters
+    IDXGIFactory *pFactory;
+    hr = sFnPtr_CreateDXGIFactory(__uuidof(IDXGIFactory), (void **)(&pFactory));

-  if (!SUCCEEDED(hr)) {
-    printf("> No DXGI Factory created.\n");
-    return false;
-  }
-
-  UINT adapter = 0;
-
-  for (; !g_pCudaCapableAdapter; ++adapter) {
-    // Get a candidate DXGI adapter
-    IDXGIAdapter *pAdapter = NULL;
-    hr = pFactory->EnumAdapters(adapter, &pAdapter);
-
-    if (FAILED(hr)) {
-      break;  // no compatible adapters found
+    if (!SUCCEEDED(hr)) {
+        printf("> No DXGI Factory created.\n");
+        return false;
     }

-    // Query to see if there exists a corresponding compute device
-    int cuDevice;
-    cuStatus = cudaD3D11GetDevice(&cuDevice, pAdapter);
-    printLastCudaError("cudaD3D11GetDevice failed");  // This prints and resets
-                                                      // the cudaError to
-                                                      // cudaSuccess
+    UINT adapter = 0;

-    if (cudaSuccess == cuStatus) {
-      // If so, mark it as the one against which to create our d3d10 device
-      g_pCudaCapableAdapter = pAdapter;
-      g_pCudaCapableAdapter->AddRef();
+    for (; !g_pCudaCapableAdapter; ++adapter) {
+        // Get a candidate DXGI adapter
+        IDXGIAdapter *pAdapter = NULL;
+        hr = pFactory->EnumAdapters(adapter, &pAdapter);
+
+        if (FAILED(hr)) {
+            break; // no compatible adapters found
+        }
+
+        // Query to see if there exists a corresponding compute device
+        int cuDevice;
+        cuStatus = cudaD3D11GetDevice(&cuDevice, pAdapter);
+        printLastCudaError("cudaD3D11GetDevice failed"); // This prints and resets
+                                                         // the cudaError to
+                                                         // cudaSuccess
+
+        if (cudaSuccess == cuStatus) {
+            // If so, mark it as the one against which to create our d3d11 device
+            g_pCudaCapableAdapter = pAdapter;
+            g_pCudaCapableAdapter->AddRef();
+        }
+
+        pAdapter->Release();
     }

-    pAdapter->Release();
-  }
+    printf("> Found %d D3D11 Adapter(s).\n", (int)adapter);

-  printf("> Found %d D3D11 Adapater(s).\n", (int)adapter);
+    pFactory->Release();

-  pFactory->Release();
+    if (!g_pCudaCapableAdapter) {
+        printf("> Found 0 D3D11 Adapter(s) /w Compute capability.\n");
+        return false;
+    }

-  if (!g_pCudaCapableAdapter) {
-    printf("> Found 0 D3D11 Adapater(s) /w Compute capability.\n");
-    return false;
-  }
+    DXGI_ADAPTER_DESC adapterDesc;
+    g_pCudaCapableAdapter->GetDesc(&adapterDesc);
+    wcstombs(dev_name, adapterDesc.Description, 128);

-  DXGI_ADAPTER_DESC adapterDesc;
-  g_pCudaCapableAdapter->GetDesc(&adapterDesc);
-  wcstombs(dev_name, adapterDesc.Description, 128);
+    printf("> Found 1 D3D11 Adapter(s) /w Compute capability.\n");
+    printf("> %s\n", dev_name);

-  printf("> Found 1 D3D11 Adapater(s) /w Compute capability.\n");
-  printf("> %s\n", dev_name);
-
-  return true;
+    return true;
 }

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  char device_name[256];
-  char *ref_file = NULL;
+int main(int argc, char *argv[])
+{
+    char device_name[256];
+    char *ref_file = NULL;

-  pArgc = &argc;
-  pArgv = argv;
+    pArgc = &argc;
+    pArgv = argv;

-  printf("[%s] - Starting...\n", SDK_name);
+    printf("[%s] - Starting...\n", SDK_name);

-  if (!findCUDADevice())  // Search for CUDA GPU
-  {
-    printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name);
-    exit(EXIT_SUCCESS);
-  }
+    if (!findCUDADevice()) // Search for CUDA GPU
+    {
+        printf("> CUDA Device NOT found on \"%s\".. Exiting.\n", device_name);
+        exit(EXIT_SUCCESS);
+    }

-  if (!dynlinkLoadD3D11API())  // Search for D3D API (locate drivers, does not
-                               // mean device is found)
-  {
-    printf("> D3D11 API libraries NOT found on.. Exiting.\n");
-    dynlinkUnloadD3D11API();
-    exit(EXIT_SUCCESS);
-  }
+    if (!dynlinkLoadD3D11API()) // Search for D3D API (locate drivers, does not
+                                // mean device is found)
+    {
+        printf("> D3D11 API libraries NOT found on.. Exiting.\n");
+        dynlinkUnloadD3D11API();
+        exit(EXIT_SUCCESS);
+    }

-  if (!findDXDevice(device_name))  // Search for D3D Hardware Device
-  {
-    printf("> D3D11 Graphics Device NOT found.. Exiting.\n");
-    dynlinkUnloadD3D11API();
-    exit(EXIT_SUCCESS);
-  }
+    if (!findDXDevice(device_name)) // Search for D3D Hardware Device
+    {
+        printf("> D3D11 Graphics Device NOT found.. Exiting.\n");
+        dynlinkUnloadD3D11API();
+        exit(EXIT_SUCCESS);
+    }

-  // command line options
-  if (argc > 1) {
-    // automatied build testing harness
-    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
-      getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
-  }
+    // command line options
+    if (argc > 1) {
+        // automated build testing harness
+        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
+            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
+    }

 //
 // create window
 //
 // Register the window class
 #if 1
-  WNDCLASSEX wc = {sizeof(WNDCLASSEX),
-                   CS_CLASSDC,
-                   MsgProc,
-                   0L,
-                   0L,
-                   GetModuleHandle(NULL),
-                   NULL,
-                   NULL,
-                   NULL,
-                   NULL,
-                   "CUDA SDK",
-                   NULL};
-  RegisterClassEx(&wc);
+    WNDCLASSEX wc = {sizeof(WNDCLASSEX),
+                     CS_CLASSDC,
+                     MsgProc,
+                     0L,
+                     0L,
+                     GetModuleHandle(NULL),
+                     NULL,
+                     NULL,
+                     NULL,
+                     NULL,
+                     "CUDA SDK",
+                     NULL};
+    RegisterClassEx(&wc);

-  // Create the application's window
-  int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME);
-  int yMenu = ::GetSystemMetrics(SM_CYMENU);
-  int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME);
-  HWND hWnd = CreateWindow(
-      wc.lpszClassName, "CUDA/D3D11 Texture InterOP", WS_OVERLAPPEDWINDOW, 0, 0,
-      g_WindowWidth + 2 * xBorder, g_WindowHeight + 2 * yBorder + yMenu, NULL,
-      NULL, wc.hInstance, NULL);
+    // Create the application's window
+    int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME);
+    int yMenu = ::GetSystemMetrics(SM_CYMENU);
+    int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME);
+    HWND hWnd = CreateWindow(wc.lpszClassName,
+                             "CUDA/D3D11 Texture InterOP",
+                             WS_OVERLAPPEDWINDOW,
+                             0,
+                             0,
+                             g_WindowWidth + 2 * xBorder,
+                             g_WindowHeight + 2 * yBorder + yMenu,
+                             NULL,
+                             NULL,
+                             wc.hInstance,
+                             NULL);
#else
-  static WNDCLASSEX wc = {
-      sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L,
-      GetModuleHandle(NULL), NULL, NULL, NULL, NULL,
-      "CudaD3D9Tex", NULL};
-  RegisterClassEx(&wc);
-  HWND hWnd = CreateWindow("CudaD3D9Tex", "CUDA D3D9 Texture Interop",
-                           WS_OVERLAPPEDWINDOW, 0, 0, 800, 320,
-                           GetDesktopWindow(), NULL, wc.hInstance, NULL);
+    static WNDCLASSEX wc = {sizeof(WNDCLASSEX),
+                            CS_CLASSDC,
+                            MsgProc,
+                            0L,
+                            0L,
+                            GetModuleHandle(NULL),
+                            NULL,
+                            NULL,
+                            NULL,
+                            NULL,
+                            "CudaD3D9Tex",
+                            NULL};
+    RegisterClassEx(&wc);
+    HWND hWnd = CreateWindow("CudaD3D9Tex",
+                             "CUDA D3D9 Texture Interop",
+                             WS_OVERLAPPEDWINDOW,
+                             0,
+                             0,
+                             800,
+                             320,
+                             GetDesktopWindow(),
+                             NULL,
+                             wc.hInstance,
+                             NULL);
#endif

-  ShowWindow(hWnd, SW_SHOWDEFAULT);
-  UpdateWindow(hWnd);
+    ShowWindow(hWnd, SW_SHOWDEFAULT);
+    UpdateWindow(hWnd);

-  // Initialize Direct3D
-  if (SUCCEEDED(InitD3D(hWnd)) && SUCCEEDED(InitTextures())) {
-    // 2D
-    // register the Direct3D resources that we'll use
-    // we'll read to and write from g_texture_2d, so don't set any special map
-    // flags for it
-    cudaGraphicsD3D11RegisterResource(&g_texture_2d.cudaResource,
-                                      g_texture_2d.pTexture,
-                                      cudaGraphicsRegisterFlagsNone);
-    getLastCudaError("cudaGraphicsD3D11RegisterResource (g_texture_2d) failed");
-    // cuda cannot write into the texture directly : the texture is seen as a
-    // cudaArray and can only be mapped as a texture
-    // Create a buffer so that cuda can write into it
-    // pixel fmt is DXGI_FORMAT_R32G32B32A32_FLOAT
-    cudaMallocPitch(&g_texture_2d.cudaLinearMemory, &g_texture_2d.pitch,
-                    g_texture_2d.width * sizeof(float) * 4,
-                    g_texture_2d.height);
-    getLastCudaError("cudaMallocPitch (g_texture_2d) failed");
-    cudaMemset(g_texture_2d.cudaLinearMemory, 1,
-               g_texture_2d.pitch * g_texture_2d.height);
+    // Initialize Direct3D
+    if (SUCCEEDED(InitD3D(hWnd)) && SUCCEEDED(InitTextures())) {
+        // 2D
+        // register the Direct3D resources that we'll use
+        // we'll read to and write from g_texture_2d, so don't set any special map
+        // flags for it
+        cudaGraphicsD3D11RegisterResource(
+            &g_texture_2d.cudaResource, g_texture_2d.pTexture, cudaGraphicsRegisterFlagsNone);
+        getLastCudaError("cudaGraphicsD3D11RegisterResource (g_texture_2d) failed");
+        // cuda cannot write into the texture directly : the texture is seen as a
+        // cudaArray and can only be mapped as a texture
+        // Create a buffer so that cuda can write into it
+        // pixel fmt is DXGI_FORMAT_R32G32B32A32_FLOAT
+        cudaMallocPitch(&g_texture_2d.cudaLinearMemory,
+                        &g_texture_2d.pitch,
+                        g_texture_2d.width * sizeof(float) * 4,
+                        g_texture_2d.height);
+        getLastCudaError("cudaMallocPitch (g_texture_2d) failed");
+        cudaMemset(g_texture_2d.cudaLinearMemory, 1, g_texture_2d.pitch * g_texture_2d.height);

-    // CUBE
-    cudaGraphicsD3D11RegisterResource(&g_texture_cube.cudaResource,
-                                      g_texture_cube.pTexture,
-                                      cudaGraphicsRegisterFlagsNone);
-    getLastCudaError(
-        "cudaGraphicsD3D11RegisterResource (g_texture_cube) failed");
-    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
-    cudaMallocPitch(&g_texture_cube.cudaLinearMemory, &g_texture_cube.pitch,
-                    g_texture_cube.size * 4, g_texture_cube.size);
-    getLastCudaError("cudaMallocPitch (g_texture_cube) failed");
-    cudaMemset(g_texture_cube.cudaLinearMemory, 1,
-               g_texture_cube.pitch * g_texture_cube.size);
-    getLastCudaError("cudaMemset (g_texture_cube) failed");
+        // CUBE
+        cudaGraphicsD3D11RegisterResource(
+            &g_texture_cube.cudaResource, g_texture_cube.pTexture, cudaGraphicsRegisterFlagsNone);
+        getLastCudaError("cudaGraphicsD3D11RegisterResource (g_texture_cube) failed");
+        // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
+        cudaMallocPitch(
+            &g_texture_cube.cudaLinearMemory, &g_texture_cube.pitch, g_texture_cube.size * 4, g_texture_cube.size);
+        getLastCudaError("cudaMallocPitch (g_texture_cube) failed");
+        cudaMemset(g_texture_cube.cudaLinearMemory, 1, g_texture_cube.pitch * g_texture_cube.size);
+        getLastCudaError("cudaMemset (g_texture_cube) failed");

-    // 3D
-    cudaGraphicsD3D11RegisterResource(&g_texture_3d.cudaResource,
-                                      g_texture_3d.pTexture,
-                                      cudaGraphicsRegisterFlagsNone);
-    getLastCudaError("cudaGraphicsD3D11RegisterResource (g_texture_3d) failed");
-    // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
-    // cudaMallocPitch(&g_texture_3d.cudaLinearMemory, &g_texture_3d.pitch,
-    // g_texture_3d.width * 4, g_texture_3d.height * g_texture_3d.depth);
-    cudaMalloc(
-        &g_texture_3d.cudaLinearMemory,
-        g_texture_3d.width * 4 * g_texture_3d.height * g_texture_3d.depth);
-    g_texture_3d.pitch = g_texture_3d.width * 4;
-    getLastCudaError("cudaMallocPitch (g_texture_3d) failed");
-    cudaMemset(g_texture_3d.cudaLinearMemory, 1,
-               g_texture_3d.pitch * g_texture_3d.height * g_texture_3d.depth);
-    getLastCudaError("cudaMemset (g_texture_3d) failed");
-  }
-
-  //
-  // the main loop
-  //
-  while (false == g_bDone) {
-    Render();
+        // 3D
+        cudaGraphicsD3D11RegisterResource(
+            &g_texture_3d.cudaResource, g_texture_3d.pTexture, cudaGraphicsRegisterFlagsNone);
+        getLastCudaError("cudaGraphicsD3D11RegisterResource (g_texture_3d) failed");
+        // create the buffer. pixel fmt is DXGI_FORMAT_R8G8B8A8_SNORM
+        // cudaMallocPitch(&g_texture_3d.cudaLinearMemory, &g_texture_3d.pitch,
+        // g_texture_3d.width * 4, g_texture_3d.height * g_texture_3d.depth);
+        cudaMalloc(&g_texture_3d.cudaLinearMemory, g_texture_3d.width * 4 * g_texture_3d.height * g_texture_3d.depth);
+        g_texture_3d.pitch = g_texture_3d.width * 4;
+        getLastCudaError("cudaMallocPitch (g_texture_3d) failed");
+        cudaMemset(g_texture_3d.cudaLinearMemory, 1, g_texture_3d.pitch * g_texture_3d.height * g_texture_3d.depth);
+        getLastCudaError("cudaMemset (g_texture_3d) failed");
+    }

     //
-    // handle I/O
+    // the main loop
     //
-  MSG msg;
-  ZeroMemory(&msg, sizeof(msg));
-
-  while (msg.message != WM_QUIT) {
-    if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) {
-      TranslateMessage(&msg);
-      DispatchMessage(&msg);
-    } else {
+    while (false == g_bDone) {
         Render();

-      if (ref_file) {
-        for (int count = 0; count < g_iFrameToCompare; count++) {
-          Render();
-        }
+        //
+        // handle I/O
+        //
+        MSG msg;
+        ZeroMemory(&msg, sizeof(msg));

-        const char *cur_image_path = "simpleD3D11Texture.ppm";
+        while (msg.message != WM_QUIT) {
+            if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) {
+                TranslateMessage(&msg);
+                DispatchMessage(&msg);
+            }
+            else {
+                Render();

-        // Save a reference of our current test run image
-        CheckRenderD3D11::ActiveRenderTargetToPPM(g_pd3dDevice,
-                                                  cur_image_path);
+                if (ref_file) {
+                    for (int count = 0; count < g_iFrameToCompare; count++) {
+                        Render();
+                    }

-        // compare to offical reference image, printing PASS or FAIL.
-        g_bPassed = CheckRenderD3D11::PPMvsPPM(cur_image_path, ref_file,
-                                               argv[0], MAX_EPSILON, 0.15f);
+                    const char *cur_image_path = "simpleD3D11Texture.ppm";

-        g_bDone = true;
+                    // Save a reference of our current test run image
+                    CheckRenderD3D11::ActiveRenderTargetToPPM(g_pd3dDevice, cur_image_path);

-        Cleanup();
+                    // compare to official reference image, printing PASS or FAIL.
+                    g_bPassed = CheckRenderD3D11::PPMvsPPM(cur_image_path, ref_file, argv[0], MAX_EPSILON, 0.15f);

-        PostQuitMessage(0);
-      } else {
-        g_bPassed = true;
+                    g_bDone = true;
+
+                    Cleanup();
+
+                    PostQuitMessage(0);
+                }
+                else {
+                    g_bPassed = true;
+                }
+            }
         }
-      }
-    }
-  };
+    };

-  // Release D3D Library (after message loop)
-  dynlinkUnloadD3D11API();
+    // Release D3D Library (after message loop)
+    dynlinkUnloadD3D11API();

-  // Unregister windows class
-  UnregisterClass(wc.lpszClassName, wc.hInstance);
+    // Unregister windows class
+    UnregisterClass(wc.lpszClassName, wc.hInstance);

-  //
-  // and exit
-  //
-  printf("> %s running on %s exiting...\n", SDK_name, device_name);
+    //
+    // and exit
+    //
+    printf("> %s running on %s exiting...\n", SDK_name, device_name);

-  exit(g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit(g_bPassed ? 
EXIT_SUCCESS : EXIT_FAILURE); } //----------------------------------------------------------------------------- // Name: InitD3D() // Desc: Initializes Direct3D //----------------------------------------------------------------------------- -HRESULT InitD3D(HWND hWnd) { - HRESULT hr = S_OK; +HRESULT InitD3D(HWND hWnd) +{ + HRESULT hr = S_OK; - // Set up the structure used to create the device and swapchain - DXGI_SWAP_CHAIN_DESC sd; - ZeroMemory(&sd, sizeof(sd)); - sd.BufferCount = 1; - sd.BufferDesc.Width = g_WindowWidth; - sd.BufferDesc.Height = g_WindowHeight; - sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; - sd.BufferDesc.RefreshRate.Numerator = 60; - sd.BufferDesc.RefreshRate.Denominator = 1; - sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; - sd.OutputWindow = hWnd; - sd.SampleDesc.Count = 1; - sd.SampleDesc.Quality = 0; - sd.Windowed = TRUE; + // Set up the structure used to create the device and swapchain + DXGI_SWAP_CHAIN_DESC sd; + ZeroMemory(&sd, sizeof(sd)); + sd.BufferCount = 1; + sd.BufferDesc.Width = g_WindowWidth; + sd.BufferDesc.Height = g_WindowHeight; + sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; + sd.BufferDesc.RefreshRate.Numerator = 60; + sd.BufferDesc.RefreshRate.Denominator = 1; + sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; + sd.OutputWindow = hWnd; + sd.SampleDesc.Count = 1; + sd.SampleDesc.Quality = 0; + sd.Windowed = TRUE; - D3D_FEATURE_LEVEL tour_fl[] = {D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1, - D3D_FEATURE_LEVEL_10_0}; - D3D_FEATURE_LEVEL flRes; - // Create device and swapchain - hr = sFnPtr_D3D11CreateDeviceAndSwapChain( - g_pCudaCapableAdapter, - D3D_DRIVER_TYPE_UNKNOWN, // D3D_DRIVER_TYPE_HARDWARE, - NULL, // HMODULE Software - 0, // UINT Flags - tour_fl, // D3D_FEATURE_LEVEL* pFeatureLevels - 3, // FeatureLevels - D3D11_SDK_VERSION, // UINT SDKVersion - &sd, // DXGI_SWAP_CHAIN_DESC* pSwapChainDesc - &g_pSwapChain, // IDXGISwapChain** ppSwapChain - &g_pd3dDevice, // ID3D11Device** ppDevice - &flRes, // D3D_FEATURE_LEVEL* pFeatureLevel - &g_pd3dDeviceContext // ID3D11DeviceContext** ppImmediateContext - ); - AssertOrQuit(SUCCEEDED(hr)); + D3D_FEATURE_LEVEL tour_fl[] = {D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1, D3D_FEATURE_LEVEL_10_0}; + D3D_FEATURE_LEVEL flRes; + // Create device and swapchain + hr = sFnPtr_D3D11CreateDeviceAndSwapChain(g_pCudaCapableAdapter, + D3D_DRIVER_TYPE_UNKNOWN, // D3D_DRIVER_TYPE_HARDWARE, + NULL, // HMODULE Software + 0, // UINT Flags + tour_fl, // D3D_FEATURE_LEVEL* pFeatureLevels + 3, // FeatureLevels + D3D11_SDK_VERSION, // UINT SDKVersion + &sd, // DXGI_SWAP_CHAIN_DESC* pSwapChainDesc + &g_pSwapChain, // IDXGISwapChain** ppSwapChain + &g_pd3dDevice, // ID3D11Device** ppDevice + &flRes, // D3D_FEATURE_LEVEL* pFeatureLevel + &g_pd3dDeviceContext // ID3D11DeviceContext** ppImmediateContext + ); + AssertOrQuit(SUCCEEDED(hr)); - g_pCudaCapableAdapter->Release(); + g_pCudaCapableAdapter->Release(); - // Get the immediate DeviceContext - g_pd3dDevice->GetImmediateContext(&g_pd3dDeviceContext); + // Get the immediate DeviceContext + g_pd3dDevice->GetImmediateContext(&g_pd3dDeviceContext); - // Create a render target view of the swapchain - ID3D11Texture2D *pBuffer; - hr = - g_pSwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), (LPVOID *)&pBuffer); - AssertOrQuit(SUCCEEDED(hr)); + // Create a render target view of the swapchain + ID3D11Texture2D *pBuffer; + hr = g_pSwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), (LPVOID *)&pBuffer); + AssertOrQuit(SUCCEEDED(hr)); - hr = 
g_pd3dDevice->CreateRenderTargetView(pBuffer, NULL, &g_pSwapChainRTV); - AssertOrQuit(SUCCEEDED(hr)); - pBuffer->Release(); + hr = g_pd3dDevice->CreateRenderTargetView(pBuffer, NULL, &g_pSwapChainRTV); + AssertOrQuit(SUCCEEDED(hr)); + pBuffer->Release(); - g_pd3dDeviceContext->OMSetRenderTargets(1, &g_pSwapChainRTV, NULL); + g_pd3dDeviceContext->OMSetRenderTargets(1, &g_pSwapChainRTV, NULL); - // Setup the viewport - D3D11_VIEWPORT vp; - vp.Width = g_WindowWidth; - vp.Height = g_WindowHeight; - vp.MinDepth = 0.0f; - vp.MaxDepth = 1.0f; - vp.TopLeftX = 0; - vp.TopLeftY = 0; - g_pd3dDeviceContext->RSSetViewports(1, &vp); + // Setup the viewport + D3D11_VIEWPORT vp; + vp.Width = g_WindowWidth; + vp.Height = g_WindowHeight; + vp.MinDepth = 0.0f; + vp.MaxDepth = 1.0f; + vp.TopLeftX = 0; + vp.TopLeftY = 0; + g_pd3dDeviceContext->RSSetViewports(1, &vp); #ifdef USEEFFECT - // Setup the effect - { - ID3D10Blob *effectCode, *effectErrors; - hr = D3DX11CompileFromMemory( - g_simpleEffectSrc, sizeof(g_simpleEffectSrc), "NoFile", NULL, NULL, "", - "fx_5_0", - D3D10_SHADER_OPTIMIZATION_LEVEL0 | - D3D10_SHADER_ENABLE_BACKWARDS_COMPATIBILITY | D3D10_SHADER_DEBUG, - 0, 0, &effectCode, &effectErrors, 0); + // Setup the effect + { + ID3D10Blob *effectCode, *effectErrors; + hr = D3DX11CompileFromMemory(g_simpleEffectSrc, + sizeof(g_simpleEffectSrc), + "NoFile", + NULL, + NULL, + "", + "fx_5_0", + D3D10_SHADER_OPTIMIZATION_LEVEL0 | D3D10_SHADER_ENABLE_BACKWARDS_COMPATIBILITY + | D3D10_SHADER_DEBUG, + 0, + 0, + &effectCode, + &effectErrors, + 0); - if (FAILED(hr)) { - const char *pStr = (const char *)effectErrors->GetBufferPointer(); - printf(pStr); - assert(1); + if (FAILED(hr)) { + const char *pStr = (const char *)effectErrors->GetBufferPointer(); + printf(pStr); + assert(1); + } + + hr = D3DX11CreateEffectFromMemory( + effectCode->GetBufferPointer(), effectCode->GetBufferSize(), 0 /*FXFlags*/, g_pd3dDevice, &g_pSimpleEffect); + AssertOrQuit(SUCCEEDED(hr)); + g_pSimpleTechnique = g_pSimpleEffect->GetTechniqueByName("Render"); + + g_pvQuadRect = g_pSimpleEffect->GetVariableByName("g_vQuadRect")->AsVector(); + g_pUseCase = g_pSimpleEffect->GetVariableByName("g_UseCase")->AsScalar(); + + g_pTexture2D = g_pSimpleEffect->GetVariableByName("g_Texture2D")->AsShaderResource(); + g_pTexture3D = g_pSimpleEffect->GetVariableByName("g_Texture3D")->AsShaderResource(); + g_pTextureCube = g_pSimpleEffect->GetVariableByName("g_TextureCube")->AsShaderResource(); } - - hr = D3DX11CreateEffectFromMemory( - effectCode->GetBufferPointer(), effectCode->GetBufferSize(), - 0 /*FXFlags*/, g_pd3dDevice, &g_pSimpleEffect); - AssertOrQuit(SUCCEEDED(hr)); - g_pSimpleTechnique = g_pSimpleEffect->GetTechniqueByName("Render"); - - g_pvQuadRect = - g_pSimpleEffect->GetVariableByName("g_vQuadRect")->AsVector(); - g_pUseCase = g_pSimpleEffect->GetVariableByName("g_UseCase")->AsScalar(); - - g_pTexture2D = - g_pSimpleEffect->GetVariableByName("g_Texture2D")->AsShaderResource(); - g_pTexture3D = - g_pSimpleEffect->GetVariableByName("g_Texture3D")->AsShaderResource(); - g_pTextureCube = - g_pSimpleEffect->GetVariableByName("g_TextureCube")->AsShaderResource(); - } #else - ID3DBlob *pShader; - ID3DBlob *pErrorMsgs; - // Vertex shader - { - hr = D3DCompile(g_simpleShaders, strlen(g_simpleShaders), "Memory", NULL, - NULL, "VS", "vs_4_0", 0 /*Flags1*/, 0 /*Flags2*/, &pShader, - &pErrorMsgs); + ID3DBlob *pShader; + ID3DBlob *pErrorMsgs; + // Vertex shader + { + hr = D3DCompile(g_simpleShaders, + strlen(g_simpleShaders), + "Memory", + NULL, + NULL, + 
"VS", + "vs_4_0", + 0 /*Flags1*/, + 0 /*Flags2*/, + &pShader, + &pErrorMsgs); - if (FAILED(hr)) { - const char *pStr = (const char *)pErrorMsgs->GetBufferPointer(); - printf(pStr); + if (FAILED(hr)) { + const char *pStr = (const char *)pErrorMsgs->GetBufferPointer(); + printf(pStr); + } + + AssertOrQuit(SUCCEEDED(hr)); + hr = g_pd3dDevice->CreateVertexShader( + pShader->GetBufferPointer(), pShader->GetBufferSize(), NULL, &g_pVertexShader); + AssertOrQuit(SUCCEEDED(hr)); + // Let's bind it now : no other vtx shader will replace it... + g_pd3dDeviceContext->VSSetShader(g_pVertexShader, NULL, 0); + // hr = g_pd3dDevice->CreateInputLayout(...pShader used for signature...) No + // need } + // Pixel shader + { + hr = D3DCompile(g_simpleShaders, + strlen(g_simpleShaders), + "Memory", + NULL, + NULL, + "PS", + "ps_4_0", + 0 /*Flags1*/, + 0 /*Flags2*/, + &pShader, + &pErrorMsgs); - AssertOrQuit(SUCCEEDED(hr)); - hr = g_pd3dDevice->CreateVertexShader(pShader->GetBufferPointer(), - pShader->GetBufferSize(), NULL, - &g_pVertexShader); - AssertOrQuit(SUCCEEDED(hr)); - // Let's bind it now : no other vtx shader will replace it... - g_pd3dDeviceContext->VSSetShader(g_pVertexShader, NULL, 0); - // hr = g_pd3dDevice->CreateInputLayout(...pShader used for signature...) No - // need - } - // Pixel shader - { - hr = D3DCompile(g_simpleShaders, strlen(g_simpleShaders), "Memory", NULL, - NULL, "PS", "ps_4_0", 0 /*Flags1*/, 0 /*Flags2*/, &pShader, - &pErrorMsgs); - - AssertOrQuit(SUCCEEDED(hr)); - hr = g_pd3dDevice->CreatePixelShader(pShader->GetBufferPointer(), - pShader->GetBufferSize(), NULL, - &g_pPixelShader); - AssertOrQuit(SUCCEEDED(hr)); - // Let's bind it now : no other pix shader will replace it... - g_pd3dDeviceContext->PSSetShader(g_pPixelShader, NULL, 0); - } - // Create the constant buffer - { - D3D11_BUFFER_DESC cbDesc; - cbDesc.Usage = D3D11_USAGE_DYNAMIC; - cbDesc.BindFlags = - D3D11_BIND_CONSTANT_BUFFER; // D3D11_BIND_SHADER_RESOURCE; - cbDesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - cbDesc.MiscFlags = 0; - cbDesc.ByteWidth = 16 * ((sizeof(ConstantBuffer) + 16) / 16); - // cbDesc.StructureByteStride = 0; - hr = g_pd3dDevice->CreateBuffer(&cbDesc, NULL, &g_pConstantBuffer); - AssertOrQuit(SUCCEEDED(hr)); - // Assign the buffer now : nothing in the code will interfere with this - // (very simple sample) - g_pd3dDeviceContext->VSSetConstantBuffers(0, 1, &g_pConstantBuffer); - g_pd3dDeviceContext->PSSetConstantBuffers(0, 1, &g_pConstantBuffer); - } - // SamplerState - { - D3D11_SAMPLER_DESC sDesc; - sDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR; - sDesc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; - sDesc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; - sDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; - sDesc.MinLOD = 0; - sDesc.MaxLOD = 8; - sDesc.MipLODBias = 0; - sDesc.MaxAnisotropy = 1; - hr = g_pd3dDevice->CreateSamplerState(&sDesc, &g_pSamplerState); - AssertOrQuit(SUCCEEDED(hr)); - g_pd3dDeviceContext->PSSetSamplers(0, 1, &g_pSamplerState); - } + AssertOrQuit(SUCCEEDED(hr)); + hr = g_pd3dDevice->CreatePixelShader( + pShader->GetBufferPointer(), pShader->GetBufferSize(), NULL, &g_pPixelShader); + AssertOrQuit(SUCCEEDED(hr)); + // Let's bind it now : no other pix shader will replace it... 
+ g_pd3dDeviceContext->PSSetShader(g_pPixelShader, NULL, 0); + } + // Create the constant buffer + { + D3D11_BUFFER_DESC cbDesc; + cbDesc.Usage = D3D11_USAGE_DYNAMIC; + cbDesc.BindFlags = D3D11_BIND_CONSTANT_BUFFER; // D3D11_BIND_SHADER_RESOURCE; + cbDesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + cbDesc.MiscFlags = 0; + cbDesc.ByteWidth = 16 * ((sizeof(ConstantBuffer) + 16) / 16); + // cbDesc.StructureByteStride = 0; + hr = g_pd3dDevice->CreateBuffer(&cbDesc, NULL, &g_pConstantBuffer); + AssertOrQuit(SUCCEEDED(hr)); + // Assign the buffer now : nothing in the code will interfere with this + // (very simple sample) + g_pd3dDeviceContext->VSSetConstantBuffers(0, 1, &g_pConstantBuffer); + g_pd3dDeviceContext->PSSetConstantBuffers(0, 1, &g_pConstantBuffer); + } + // SamplerState + { + D3D11_SAMPLER_DESC sDesc; + sDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR; + sDesc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; + sDesc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; + sDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; + sDesc.MinLOD = 0; + sDesc.MaxLOD = 8; + sDesc.MipLODBias = 0; + sDesc.MaxAnisotropy = 1; + hr = g_pd3dDevice->CreateSamplerState(&sDesc, &g_pSamplerState); + AssertOrQuit(SUCCEEDED(hr)); + g_pd3dDeviceContext->PSSetSamplers(0, 1, &g_pSamplerState); + } #endif - // Setup no Input Layout - g_pd3dDeviceContext->IASetInputLayout(0); - g_pd3dDeviceContext->IASetPrimitiveTopology( - D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); + // Setup no Input Layout + g_pd3dDeviceContext->IASetInputLayout(0); + g_pd3dDeviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); - D3D11_RASTERIZER_DESC rasterizerState; - rasterizerState.FillMode = D3D11_FILL_SOLID; - rasterizerState.CullMode = D3D11_CULL_FRONT; - rasterizerState.FrontCounterClockwise = false; - rasterizerState.DepthBias = false; - rasterizerState.DepthBiasClamp = 0; - rasterizerState.SlopeScaledDepthBias = 0; - rasterizerState.DepthClipEnable = false; - rasterizerState.ScissorEnable = false; - rasterizerState.MultisampleEnable = false; - rasterizerState.AntialiasedLineEnable = false; - g_pd3dDevice->CreateRasterizerState(&rasterizerState, &g_pRasterState); - g_pd3dDeviceContext->RSSetState(g_pRasterState); + D3D11_RASTERIZER_DESC rasterizerState; + rasterizerState.FillMode = D3D11_FILL_SOLID; + rasterizerState.CullMode = D3D11_CULL_FRONT; + rasterizerState.FrontCounterClockwise = false; + rasterizerState.DepthBias = false; + rasterizerState.DepthBiasClamp = 0; + rasterizerState.SlopeScaledDepthBias = 0; + rasterizerState.DepthClipEnable = false; + rasterizerState.ScissorEnable = false; + rasterizerState.MultisampleEnable = false; + rasterizerState.AntialiasedLineEnable = false; + g_pd3dDevice->CreateRasterizerState(&rasterizerState, &g_pRasterState); + g_pd3dDeviceContext->RSSetState(g_pRasterState); - return S_OK; + return S_OK; } //----------------------------------------------------------------------------- // Name: InitTextures() // Desc: Initializes Direct3D Textures (allocation and initialization) //----------------------------------------------------------------------------- -HRESULT InitTextures() { - // - // create the D3D resources we'll be using - // - // 2D texture - { - g_texture_2d.width = 256; - g_texture_2d.height = 256; +HRESULT InitTextures() +{ + // + // create the D3D resources we'll be using + // + // 2D texture + { + g_texture_2d.width = 256; + g_texture_2d.height = 256; - D3D11_TEXTURE2D_DESC desc; - ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC)); - desc.Width = g_texture_2d.width; - desc.Height = 
g_texture_2d.height; - desc.MipLevels = 1; - desc.ArraySize = 1; - desc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; - desc.SampleDesc.Count = 1; - desc.Usage = D3D11_USAGE_DEFAULT; - desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + D3D11_TEXTURE2D_DESC desc; + ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC)); + desc.Width = g_texture_2d.width; + desc.Height = g_texture_2d.height; + desc.MipLevels = 1; + desc.ArraySize = 1; + desc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; + desc.SampleDesc.Count = 1; + desc.Usage = D3D11_USAGE_DEFAULT; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - if (FAILED(g_pd3dDevice->CreateTexture2D(&desc, NULL, - &g_texture_2d.pTexture))) { - return E_FAIL; - } + if (FAILED(g_pd3dDevice->CreateTexture2D(&desc, NULL, &g_texture_2d.pTexture))) { + return E_FAIL; + } - if (FAILED(g_pd3dDevice->CreateShaderResourceView( - g_texture_2d.pTexture, NULL, &g_texture_2d.pSRView))) { - return E_FAIL; - } + if (FAILED(g_pd3dDevice->CreateShaderResourceView(g_texture_2d.pTexture, NULL, &g_texture_2d.pSRView))) { + return E_FAIL; + } #ifdef USEEFFECT - g_pTexture2D->SetResource(g_texture_2d.pSRView); + g_pTexture2D->SetResource(g_texture_2d.pSRView); #else - g_texture_2d.offsetInShader = - 0; // to be clean we should look for the offset from the shader code - g_pd3dDeviceContext->PSSetShaderResources(g_texture_2d.offsetInShader, 1, - &g_texture_2d.pSRView); + g_texture_2d.offsetInShader = 0; // to be clean we should look for the offset from the shader code + g_pd3dDeviceContext->PSSetShaderResources(g_texture_2d.offsetInShader, 1, &g_texture_2d.pSRView); #endif - } - - // 3D texture - { - g_texture_3d.width = 64; - g_texture_3d.height = 64; - g_texture_3d.depth = 64; - - D3D11_TEXTURE3D_DESC desc; - ZeroMemory(&desc, sizeof(D3D11_TEXTURE3D_DESC)); - desc.Width = g_texture_3d.width; - desc.Height = g_texture_3d.height; - desc.Depth = g_texture_3d.depth; - desc.MipLevels = 1; - desc.Format = DXGI_FORMAT_R8G8B8A8_SNORM; - desc.Usage = D3D11_USAGE_DEFAULT; - desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - - if (FAILED(g_pd3dDevice->CreateTexture3D(&desc, NULL, - &g_texture_3d.pTexture))) { - return E_FAIL; } - if (FAILED(g_pd3dDevice->CreateShaderResourceView( - g_texture_3d.pTexture, NULL, &g_texture_3d.pSRView))) { - return E_FAIL; - } + // 3D texture + { + g_texture_3d.width = 64; + g_texture_3d.height = 64; + g_texture_3d.depth = 64; + + D3D11_TEXTURE3D_DESC desc; + ZeroMemory(&desc, sizeof(D3D11_TEXTURE3D_DESC)); + desc.Width = g_texture_3d.width; + desc.Height = g_texture_3d.height; + desc.Depth = g_texture_3d.depth; + desc.MipLevels = 1; + desc.Format = DXGI_FORMAT_R8G8B8A8_SNORM; + desc.Usage = D3D11_USAGE_DEFAULT; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + if (FAILED(g_pd3dDevice->CreateTexture3D(&desc, NULL, &g_texture_3d.pTexture))) { + return E_FAIL; + } + + if (FAILED(g_pd3dDevice->CreateShaderResourceView(g_texture_3d.pTexture, NULL, &g_texture_3d.pSRView))) { + return E_FAIL; + } #ifdef USEEFFECT - g_pTexture3D->SetResource(g_texture_3d.pSRView); + g_pTexture3D->SetResource(g_texture_3d.pSRView); #else - g_texture_3d.offsetInShader = - 1; // to be clean we should look for the offset from the shader code - g_pd3dDeviceContext->PSSetShaderResources(g_texture_3d.offsetInShader, 1, - &g_texture_3d.pSRView); + g_texture_3d.offsetInShader = 1; // to be clean we should look for the offset from the shader code + g_pd3dDeviceContext->PSSetShaderResources(g_texture_3d.offsetInShader, 1, &g_texture_3d.pSRView); #endif - } - - // cube texture - { - g_texture_cube.size = 64; - - 
D3D11_TEXTURE2D_DESC desc; - ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC)); - desc.Width = g_texture_cube.size; - desc.Height = g_texture_cube.size; - desc.MipLevels = 1; - desc.ArraySize = 6; - desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; - desc.SampleDesc.Count = 1; - desc.Usage = D3D11_USAGE_DEFAULT; - desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - desc.MiscFlags = D3D11_RESOURCE_MISC_TEXTURECUBE; - - if (FAILED(g_pd3dDevice->CreateTexture2D(&desc, NULL, - &g_texture_cube.pTexture))) { - return E_FAIL; } - D3D11_SHADER_RESOURCE_VIEW_DESC SRVDesc; - ZeroMemory(&SRVDesc, sizeof(SRVDesc)); - SRVDesc.Format = desc.Format; - SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; - SRVDesc.TextureCube.MipLevels = desc.MipLevels; - SRVDesc.TextureCube.MostDetailedMip = 0; + // cube texture + { + g_texture_cube.size = 64; - if (FAILED(g_pd3dDevice->CreateShaderResourceView( - g_texture_cube.pTexture, &SRVDesc, &g_texture_cube.pSRView))) { - return E_FAIL; - } + D3D11_TEXTURE2D_DESC desc; + ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC)); + desc.Width = g_texture_cube.size; + desc.Height = g_texture_cube.size; + desc.MipLevels = 1; + desc.ArraySize = 6; + desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; + desc.SampleDesc.Count = 1; + desc.Usage = D3D11_USAGE_DEFAULT; + desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + desc.MiscFlags = D3D11_RESOURCE_MISC_TEXTURECUBE; + + if (FAILED(g_pd3dDevice->CreateTexture2D(&desc, NULL, &g_texture_cube.pTexture))) { + return E_FAIL; + } + + D3D11_SHADER_RESOURCE_VIEW_DESC SRVDesc; + ZeroMemory(&SRVDesc, sizeof(SRVDesc)); + SRVDesc.Format = desc.Format; + SRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; + SRVDesc.TextureCube.MipLevels = desc.MipLevels; + SRVDesc.TextureCube.MostDetailedMip = 0; + + if (FAILED( + g_pd3dDevice->CreateShaderResourceView(g_texture_cube.pTexture, &SRVDesc, &g_texture_cube.pSRView))) { + return E_FAIL; + } #ifdef USEEFFECT - g_pTextureCube->SetResource(g_texture_cube.pSRView); + g_pTextureCube->SetResource(g_texture_cube.pSRView); #else - g_texture_cube.offsetInShader = - 2; // to be clean we should look for the offset from the shader code - g_pd3dDeviceContext->PSSetShaderResources(g_texture_cube.offsetInShader, 1, - &g_texture_cube.pSRView); + g_texture_cube.offsetInShader = 2; // to be clean we should look for the offset from the shader code + g_pd3dDeviceContext->PSSetShaderResources(g_texture_cube.offsetInShader, 1, &g_texture_cube.pSRView); #endif - } + } - return S_OK; + return S_OK; } //////////////////////////////////////////////////////////////////////////////// //! 
Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void RunKernels() { - static float t = 0.0f; +void RunKernels() +{ + static float t = 0.0f; - // populate the 2d texture - { - cudaArray *cuArray; - cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_2d.cudaResource, - 0, 0); - getLastCudaError( - "cudaGraphicsSubResourceGetMappedArray (cuda_texture_2d) failed"); + // populate the 2d texture + { + cudaArray *cuArray; + cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_2d.cudaResource, 0, 0); + getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_2d) failed"); - // kick off the kernel and send the staging buffer cudaLinearMemory as an - // argument to allow the kernel to write to it - cuda_texture_2d(g_texture_2d.cudaLinearMemory, g_texture_2d.width, - g_texture_2d.height, g_texture_2d.pitch, t); - getLastCudaError("cuda_texture_2d failed"); + // kick off the kernel and send the staging buffer cudaLinearMemory as an + // argument to allow the kernel to write to it + cuda_texture_2d(g_texture_2d.cudaLinearMemory, g_texture_2d.width, g_texture_2d.height, g_texture_2d.pitch, t); + getLastCudaError("cuda_texture_2d failed"); - // then we want to copy cudaLinearMemory to the D3D texture, via its mapped - // form : cudaArray - cudaMemcpy2DToArray( - cuArray, // dst array - 0, 0, // offset - g_texture_2d.cudaLinearMemory, g_texture_2d.pitch, // src - g_texture_2d.width * 4 * sizeof(float), g_texture_2d.height, // extent - cudaMemcpyDeviceToDevice); // kind - getLastCudaError("cudaMemcpy2DToArray failed"); - } - // populate the volume texture - { - size_t pitchSlice = g_texture_3d.pitch * g_texture_3d.height; - cudaArray *cuArray; - cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_3d.cudaResource, - 0, 0); - getLastCudaError( - "cudaGraphicsSubResourceGetMappedArray (cuda_texture_3d) failed"); + // then we want to copy cudaLinearMemory to the D3D texture, via its mapped + // form : cudaArray + cudaMemcpy2DToArray(cuArray, // dst array + 0, + 0, // offset + g_texture_2d.cudaLinearMemory, + g_texture_2d.pitch, // src + g_texture_2d.width * 4 * sizeof(float), + g_texture_2d.height, // extent + cudaMemcpyDeviceToDevice); // kind + getLastCudaError("cudaMemcpy2DToArray failed"); + } + // populate the volume texture + { + size_t pitchSlice = g_texture_3d.pitch * g_texture_3d.height; + cudaArray *cuArray; + cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_3d.cudaResource, 0, 0); + getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_3d) failed"); - // kick off the kernel and send the staging buffer cudaLinearMemory as an - // argument to allow the kernel to write to it - cuda_texture_3d(g_texture_3d.cudaLinearMemory, g_texture_3d.width, - g_texture_3d.height, g_texture_3d.depth, g_texture_3d.pitch, - pitchSlice, t); - getLastCudaError("cuda_texture_3d failed"); + // kick off the kernel and send the staging buffer cudaLinearMemory as an + // argument to allow the kernel to write to it + cuda_texture_3d(g_texture_3d.cudaLinearMemory, + g_texture_3d.width, + g_texture_3d.height, + g_texture_3d.depth, + g_texture_3d.pitch, + pitchSlice, + t); + getLastCudaError("cuda_texture_3d failed"); - // then we want to copy cudaLinearMemory to the D3D texture, via its mapped - // form : cudaArray - struct cudaMemcpy3DParms memcpyParams = {0}; - memcpyParams.dstArray = cuArray; - memcpyParams.srcPtr.ptr = g_texture_3d.cudaLinearMemory; - memcpyParams.srcPtr.pitch = g_texture_3d.pitch; - 
memcpyParams.srcPtr.xsize = g_texture_3d.width; - memcpyParams.srcPtr.ysize = g_texture_3d.height; - memcpyParams.extent.width = g_texture_3d.width; - memcpyParams.extent.height = g_texture_3d.height; - memcpyParams.extent.depth = g_texture_3d.depth; - memcpyParams.kind = cudaMemcpyDeviceToDevice; - cudaMemcpy3D(&memcpyParams); - getLastCudaError("cudaMemcpy3D failed"); - } + // then we want to copy cudaLinearMemory to the D3D texture, via its mapped + // form : cudaArray + struct cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.dstArray = cuArray; + memcpyParams.srcPtr.ptr = g_texture_3d.cudaLinearMemory; + memcpyParams.srcPtr.pitch = g_texture_3d.pitch; + memcpyParams.srcPtr.xsize = g_texture_3d.width; + memcpyParams.srcPtr.ysize = g_texture_3d.height; + memcpyParams.extent.width = g_texture_3d.width; + memcpyParams.extent.height = g_texture_3d.height; + memcpyParams.extent.depth = g_texture_3d.depth; + memcpyParams.kind = cudaMemcpyDeviceToDevice; + cudaMemcpy3D(&memcpyParams); + getLastCudaError("cudaMemcpy3D failed"); + } - // populate the faces of the cube map - for (int face = 0; face < 6; ++face) { - cudaArray *cuArray; - cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_cube.cudaResource, - face, 0); - getLastCudaError( - "cudaGraphicsSubResourceGetMappedArray (cuda_texture_cube) failed"); + // populate the faces of the cube map + for (int face = 0; face < 6; ++face) { + cudaArray *cuArray; + cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_cube.cudaResource, face, 0); + getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_cube) failed"); - // kick off the kernel and send the staging buffer cudaLinearMemory as an - // argument to allow the kernel to write to it - cuda_texture_cube(g_texture_cube.cudaLinearMemory, g_texture_cube.size, - g_texture_cube.size, g_texture_cube.pitch, face, t); - getLastCudaError("cuda_texture_cube failed"); + // kick off the kernel and send the staging buffer cudaLinearMemory as an + // argument to allow the kernel to write to it + cuda_texture_cube( + g_texture_cube.cudaLinearMemory, g_texture_cube.size, g_texture_cube.size, g_texture_cube.pitch, face, t); + getLastCudaError("cuda_texture_cube failed"); - // then we want to copy cudaLinearMemory to the D3D texture, via its mapped - // form : cudaArray - cudaMemcpy2DToArray(cuArray, // dst array - 0, 0, // offset - g_texture_cube.cudaLinearMemory, - g_texture_cube.pitch, // src - g_texture_cube.size * 4, g_texture_cube.size, // extent - cudaMemcpyDeviceToDevice); // kind - getLastCudaError("cudaMemcpy2DToArray failed"); - } + // then we want to copy cudaLinearMemory to the D3D texture, via its mapped + // form : cudaArray + cudaMemcpy2DToArray(cuArray, // dst array + 0, + 0, // offset + g_texture_cube.cudaLinearMemory, + g_texture_cube.pitch, // src + g_texture_cube.size * 4, + g_texture_cube.size, // extent + cudaMemcpyDeviceToDevice); // kind + getLastCudaError("cudaMemcpy2DToArray failed"); + } - t += 0.1f; + t += 0.1f; } //////////////////////////////////////////////////////////////////////////////// //! 
Draw the final result on the screen
////////////////////////////////////////////////////////////////////////////////
-bool DrawScene() {
-  // Clear the backbuffer to a black color
-  float ClearColor[4] = {0.5f, 0.5f, 0.6f, 1.0f};
-  g_pd3dDeviceContext->ClearRenderTargetView(g_pSwapChainRTV, ClearColor);
+bool DrawScene()
+{
+    // Clear the backbuffer to a black color
+    float ClearColor[4] = {0.5f, 0.5f, 0.6f, 1.0f};
+    g_pd3dDeviceContext->ClearRenderTargetView(g_pSwapChainRTV, ClearColor);

-  float quadRect[4] = {-0.9f, -0.9f, 0.7f, 0.7f};
+    float quadRect[4] = {-0.9f, -0.9f, 0.7f, 0.7f};

 //
 // draw the 2d texture
 //
 #ifdef USEEFFECT
-  g_pUseCase->SetInt(0);
-  g_pvQuadRect->SetFloatVector((float *)&quadRect);
-  g_pSimpleTechnique->GetPassByIndex(0)->Apply(0, g_pd3dDeviceContext);
-#else
-  HRESULT hr;
-  D3D11_MAPPED_SUBRESOURCE mappedResource;
-  ConstantBuffer *pcb;
-  hr = g_pd3dDeviceContext->Map(g_pConstantBuffer, 0, D3D11_MAP_WRITE_DISCARD,
-                                0, &mappedResource);
-  AssertOrQuit(SUCCEEDED(hr));
-  pcb = (ConstantBuffer *)mappedResource.pData;
-  {
-    memcpy(pcb->vQuadRect, quadRect, sizeof(float) * 4);
-    pcb->UseCase = 0;
-  }
-  g_pd3dDeviceContext->Unmap(g_pConstantBuffer, 0);
-#endif
-  g_pd3dDeviceContext->Draw(4, 0);
-
-  //
-  // draw a slice the 3d texture
-  //
-  quadRect[1] = 0.1f;
-#ifdef USEEFFECT
-  g_pUseCase->SetInt(1);
-  g_pvQuadRect->SetFloatVector((float *)&quadRect);
-  g_pSimpleTechnique->GetPassByIndex(0)->Apply(0, g_pd3dDeviceContext);
-#else
-  hr = g_pd3dDeviceContext->Map(g_pConstantBuffer, 0, D3D11_MAP_WRITE_DISCARD,
-                                0, &mappedResource);
-  AssertOrQuit(SUCCEEDED(hr));
-  pcb = (ConstantBuffer *)mappedResource.pData;
-  {
-    memcpy(pcb->vQuadRect, quadRect, sizeof(float) * 4);
-    pcb->UseCase = 1;
-  }
-  g_pd3dDeviceContext->Unmap(g_pConstantBuffer, 0);
-#endif
-  g_pd3dDeviceContext->Draw(4, 0);
-
-  //
-  // draw the 6 faces of the cube texture
-  //
-  float faceRect[4] = {-0.1f, -0.9f, 0.5f, 0.5f};
-
-  for (int f = 0; f < 6; f++) {
-    if (f == 3) {
-      faceRect[0] += 0.55f;
-      faceRect[1] = -0.9f;
-    }
-
-#ifdef USEEFFECT
-    g_pUseCase->SetInt(2 + f);
-    g_pvQuadRect->SetFloatVector((float *)&faceRect);
+    g_pUseCase->SetInt(0);
+    g_pvQuadRect->SetFloatVector((float *)&quadRect);
     g_pSimpleTechnique->GetPassByIndex(0)->Apply(0, g_pd3dDeviceContext);
 #else
-    hr = g_pd3dDeviceContext->Map(g_pConstantBuffer, 0, D3D11_MAP_WRITE_DISCARD,
-                                  0, &mappedResource);
+    HRESULT hr;
+    D3D11_MAPPED_SUBRESOURCE mappedResource;
+    ConstantBuffer *pcb;
+    hr = g_pd3dDeviceContext->Map(g_pConstantBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mappedResource);
     AssertOrQuit(SUCCEEDED(hr));
     pcb = (ConstantBuffer *)mappedResource.pData;
     {
-      memcpy(pcb->vQuadRect, faceRect, sizeof(float) * 4);
-      pcb->UseCase = 2 + f;
+        memcpy(pcb->vQuadRect, quadRect, sizeof(float) * 4);
+        pcb->UseCase = 0;
     }
     g_pd3dDeviceContext->Unmap(g_pConstantBuffer, 0);
 #endif
     g_pd3dDeviceContext->Draw(4, 0);
-    faceRect[1] += 0.6f;
-  }

-  // Present the backbuffer contents to the display
-  g_pSwapChain->Present(0, 0);
-  return true;
+    //
+    // draw a slice of the 3d texture
+    //
+    quadRect[1] = 0.1f;
+#ifdef USEEFFECT
+    g_pUseCase->SetInt(1);
+    g_pvQuadRect->SetFloatVector((float *)&quadRect);
+    g_pSimpleTechnique->GetPassByIndex(0)->Apply(0, g_pd3dDeviceContext);
+#else
+    hr = g_pd3dDeviceContext->Map(g_pConstantBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mappedResource);
+    AssertOrQuit(SUCCEEDED(hr));
+    pcb = (ConstantBuffer *)mappedResource.pData;
+    {
+        memcpy(pcb->vQuadRect, quadRect, sizeof(float) * 4);
+        pcb->UseCase = 1;
+    }
+    
g_pd3dDeviceContext->Unmap(g_pConstantBuffer, 0); +#endif + g_pd3dDeviceContext->Draw(4, 0); + + // + // draw the 6 faces of the cube texture + // + float faceRect[4] = {-0.1f, -0.9f, 0.5f, 0.5f}; + + for (int f = 0; f < 6; f++) { + if (f == 3) { + faceRect[0] += 0.55f; + faceRect[1] = -0.9f; + } + +#ifdef USEEFFECT + g_pUseCase->SetInt(2 + f); + g_pvQuadRect->SetFloatVector((float *)&faceRect); + g_pSimpleTechnique->GetPassByIndex(0)->Apply(0, g_pd3dDeviceContext); +#else + hr = g_pd3dDeviceContext->Map(g_pConstantBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mappedResource); + AssertOrQuit(SUCCEEDED(hr)); + pcb = (ConstantBuffer *)mappedResource.pData; + { + memcpy(pcb->vQuadRect, faceRect, sizeof(float) * 4); + pcb->UseCase = 2 + f; + } + g_pd3dDeviceContext->Unmap(g_pConstantBuffer, 0); +#endif + g_pd3dDeviceContext->Draw(4, 0); + faceRect[1] += 0.6f; + } + + // Present the backbuffer contents to the display + g_pSwapChain->Present(0, 0); + return true; } //----------------------------------------------------------------------------- // Name: Cleanup() // Desc: Releases all previously initialized objects //----------------------------------------------------------------------------- -void Cleanup() { - // unregister the Cuda resources - cudaGraphicsUnregisterResource(g_texture_2d.cudaResource); - getLastCudaError("cudaGraphicsUnregisterResource (g_texture_2d) failed"); - cudaFree(g_texture_2d.cudaLinearMemory); - getLastCudaError("cudaFree (g_texture_2d) failed"); +void Cleanup() +{ + // unregister the Cuda resources + cudaGraphicsUnregisterResource(g_texture_2d.cudaResource); + getLastCudaError("cudaGraphicsUnregisterResource (g_texture_2d) failed"); + cudaFree(g_texture_2d.cudaLinearMemory); + getLastCudaError("cudaFree (g_texture_2d) failed"); - cudaGraphicsUnregisterResource(g_texture_cube.cudaResource); - getLastCudaError("cudaGraphicsUnregisterResource (g_texture_cube) failed"); - cudaFree(g_texture_cube.cudaLinearMemory); - getLastCudaError("cudaFree (g_texture_2d) failed"); + cudaGraphicsUnregisterResource(g_texture_cube.cudaResource); + getLastCudaError("cudaGraphicsUnregisterResource (g_texture_cube) failed"); + cudaFree(g_texture_cube.cudaLinearMemory); + getLastCudaError("cudaFree (g_texture_2d) failed"); - cudaGraphicsUnregisterResource(g_texture_3d.cudaResource); - getLastCudaError("cudaGraphicsUnregisterResource (g_texture_3d) failed"); - cudaFree(g_texture_3d.cudaLinearMemory); - getLastCudaError("cudaFree (g_texture_2d) failed"); + cudaGraphicsUnregisterResource(g_texture_3d.cudaResource); + getLastCudaError("cudaGraphicsUnregisterResource (g_texture_3d) failed"); + cudaFree(g_texture_3d.cudaLinearMemory); + getLastCudaError("cudaFree (g_texture_2d) failed"); - // - // clean up Direct3D - // - { - // release the resources we created - g_texture_2d.pSRView->Release(); - g_texture_2d.pTexture->Release(); - g_texture_cube.pSRView->Release(); - g_texture_cube.pTexture->Release(); - g_texture_3d.pSRView->Release(); - g_texture_3d.pTexture->Release(); + // + // clean up Direct3D + // + { + // release the resources we created + g_texture_2d.pSRView->Release(); + g_texture_2d.pTexture->Release(); + g_texture_cube.pSRView->Release(); + g_texture_cube.pTexture->Release(); + g_texture_3d.pSRView->Release(); + g_texture_3d.pTexture->Release(); - if (g_pInputLayout != NULL) { - g_pInputLayout->Release(); - } + if (g_pInputLayout != NULL) { + g_pInputLayout->Release(); + } #ifdef USEEFFECT - if (g_pSimpleEffect != NULL) { - g_pSimpleEffect->Release(); - } + if (g_pSimpleEffect != NULL) { 
+ g_pSimpleEffect->Release(); + } #else - if (g_pVertexShader) { - g_pVertexShader->Release(); - } + if (g_pVertexShader) { + g_pVertexShader->Release(); + } - if (g_pPixelShader) { - g_pPixelShader->Release(); - } + if (g_pPixelShader) { + g_pPixelShader->Release(); + } - if (g_pConstantBuffer) { - g_pConstantBuffer->Release(); - } + if (g_pConstantBuffer) { + g_pConstantBuffer->Release(); + } - if (g_pSamplerState) { - g_pSamplerState->Release(); - } + if (g_pSamplerState) { + g_pSamplerState->Release(); + } #endif - if (g_pSwapChainRTV != NULL) { - g_pSwapChainRTV->Release(); - } + if (g_pSwapChainRTV != NULL) { + g_pSwapChainRTV->Release(); + } - if (g_pSwapChain != NULL) { - g_pSwapChain->Release(); - } + if (g_pSwapChain != NULL) { + g_pSwapChain->Release(); + } - if (g_pd3dDevice != NULL) { - g_pd3dDevice->Release(); + if (g_pd3dDevice != NULL) { + g_pd3dDevice->Release(); + } } - } } //----------------------------------------------------------------------------- // Name: Render() // Desc: Launches the CUDA kernels to fill in the texture data //----------------------------------------------------------------------------- -void Render() { - // - // map the resources we've registered so we can access them in Cuda - // - it is most efficient to map and unmap all resources in a single call, - // and to have the map/unmap calls be the boundary between using the GPU - // for Direct3D and Cuda - // - static bool doit = true; +void Render() +{ + // + // map the resources we've registered so we can access them in Cuda + // - it is most efficient to map and unmap all resources in a single call, + // and to have the map/unmap calls be the boundary between using the GPU + // for Direct3D and Cuda + // + static bool doit = true; - if (doit) { - doit = true; - cudaStream_t stream = 0; - const int nbResources = 3; - cudaGraphicsResource *ppResources[nbResources] = { - g_texture_2d.cudaResource, g_texture_3d.cudaResource, - g_texture_cube.cudaResource, - }; - cudaGraphicsMapResources(nbResources, ppResources, stream); - getLastCudaError("cudaGraphicsMapResources(3) failed"); + if (doit) { + doit = true; + cudaStream_t stream = 0; + const int nbResources = 3; + cudaGraphicsResource *ppResources[nbResources] = { + g_texture_2d.cudaResource, + g_texture_3d.cudaResource, + g_texture_cube.cudaResource, + }; + cudaGraphicsMapResources(nbResources, ppResources, stream); + getLastCudaError("cudaGraphicsMapResources(3) failed"); + + // + // run kernels which will populate the contents of those textures + // + RunKernels(); + + // + // unmap the resources + // + cudaGraphicsUnmapResources(nbResources, ppResources, stream); + getLastCudaError("cudaGraphicsUnmapResources(3) failed"); + } // - // run kernels which will populate the contents of those textures + // draw the scene using them // - RunKernels(); - - // - // unmap the resources - // - cudaGraphicsUnmapResources(nbResources, ppResources, stream); - getLastCudaError("cudaGraphicsUnmapResources(3) failed"); - } - - // - // draw the scene using them - // - DrawScene(); + DrawScene(); } //----------------------------------------------------------------------------- // Name: MsgProc() // Desc: The window's message handler //----------------------------------------------------------------------------- -static LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, - LPARAM lParam) { - switch (msg) { +static LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam) +{ + switch (msg) { case WM_KEYDOWN: - if (wParam == VK_ESCAPE) { + if 
(wParam == VK_ESCAPE) {
+            g_bDone = true;
+            Cleanup();
+            PostQuitMessage(0);
+            return 0;
+        }
+
+        break;
+
+    case WM_DESTROY:
         g_bDone = true;
         Cleanup();
         PostQuitMessage(0);
         return 0;
-      }
-
-      break;
-
-    case WM_DESTROY:
-      g_bDone = true;
-      Cleanup();
-      PostQuitMessage(0);
-      return 0;

     case WM_PAINT:
-      ValidateRect(hWnd, NULL);
-      return 0;
-  }
+        ValidateRect(hWnd, NULL);
+        return 0;
+    }

-  return DefWindowProc(hWnd, msg, wParam, lParam);
+    return DefWindowProc(hWnd, msg, wParam, lParam);
 }
diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/texture_2d.cu b/Samples/5_Domain_Specific/simpleD3D11Texture/texture_2d.cu
index 0b00c644..57419b44 100644
--- a/Samples/5_Domain_Specific/simpleD3D11Texture/texture_2d.cu
+++ b/Samples/5_Domain_Specific/simpleD3D11Texture/texture_2d.cu
@@ -37,42 +37,42 @@
  * writes from the texture, hence why this texture was not mapped
  * as WriteDiscard.
  */
-__global__ void cuda_kernel_texture_2d(unsigned char *surface, int width,
-                                       int height, size_t pitch, float t) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int y = blockIdx.y * blockDim.y + threadIdx.y;
-  float *pixel;
+__global__ void cuda_kernel_texture_2d(unsigned char *surface, int width, int height, size_t pitch, float t)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    float *pixel;

-  // in the case where, due to quantization into grids, we have
-  // more threads than pixels, skip the threads which don't
-  // correspond to valid pixels
-  if (x >= width || y >= height) return;
+    // in the case where, due to quantization into grids, we have
+    // more threads than pixels, skip the threads which don't
+    // correspond to valid pixels
+    if (x >= width || y >= height)
+        return;

-  // get a pointer to the pixel at (x,y)
-  pixel = (float *)(surface + y * pitch) + 4 * x;
+    // get a pointer to the pixel at (x,y)
+    pixel = (float *)(surface + y * pitch) + 4 * x;

-  // populate it
-  float value_x = 0.5f + 0.5f * cos(t + 10.0f * ((2.0f * x) / width - 1.0f));
-  float value_y = 0.5f + 0.5f * cos(t + 10.0f * ((2.0f * y) / height - 1.0f));
-  pixel[0] = 0.5 * pixel[0] + 0.5 * pow(value_x, 3.0f);  // red
-  pixel[1] = 0.5 * pixel[1] + 0.5 * pow(value_y, 3.0f);  // green
-  pixel[2] = 0.5f + 0.5f * cos(t);                       // blue
-  pixel[3] = 1;                                          // alpha
+    // populate it
+    float value_x = 0.5f + 0.5f * cos(t + 10.0f * ((2.0f * x) / width - 1.0f));
+    float value_y = 0.5f + 0.5f * cos(t + 10.0f * ((2.0f * y) / height - 1.0f));
+    pixel[0] = 0.5 * pixel[0] + 0.5 * pow(value_x, 3.0f); // red
+    pixel[1] = 0.5 * pixel[1] + 0.5 * pow(value_y, 3.0f); // green
+    pixel[2] = 0.5f + 0.5f * cos(t);                      // blue
+    pixel[3] = 1;                                         // alpha
 }

-extern "C" void cuda_texture_2d(void *surface, int width, int height,
-                                size_t pitch, float t) {
-  cudaError_t error = cudaSuccess;
+extern "C" void cuda_texture_2d(void *surface, int width, int height, size_t pitch, float t)
+{
+    cudaError_t error = cudaSuccess;

-  dim3 Db = dim3(16, 16);  // block dimensions are fixed to be 256 threads
-  dim3 Dg = dim3((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y);
+    dim3 Db = dim3(16, 16); // block dimensions are fixed to be 256 threads
+    dim3 Dg = dim3((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y);

-  cuda_kernel_texture_2d<<<Dg, Db>>>((unsigned char *)surface, width, height,
-                                     pitch, t);
+    cuda_kernel_texture_2d<<<Dg, Db>>>((unsigned char *)surface, width, height, pitch, t);

-  error = cudaGetLastError();
+    error = cudaGetLastError();

-  if (error != cudaSuccess) {
-    printf("cuda_kernel_texture_2d() failed to launch error = %d\n", error);
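The launcher above fixes the block size at 16x16 threads and rounds the grid up with ceil division, so images whose dimensions are not multiples of 16 are still fully covered; the kernel's bounds check discards the overshooting threads. The same pattern as a standalone sketch (illustrative names, not part of this patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void fill(unsigned char *buf, int width, int height, size_t pitch)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= width || y >= height) // grid may overshoot the image edges
            return;
        buf[y * pitch + x] = 255;
    }

    int main()
    {
        int width = 250, height = 250; // deliberately not multiples of 16
        size_t pitch = 0;
        unsigned char *buf = NULL;
        cudaMallocPitch((void **)&buf, &pitch, width, height);

        dim3 Db(16, 16);                   // 256 threads per block
        dim3 Dg((width + Db.x - 1) / Db.x, // ceil(250 / 16) = 16 blocks
                (height + Db.y - 1) / Db.y);
        fill<<<Dg, Db>>>(buf, width, height, pitch);
        printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));

        cudaDeviceSynchronize();
        cudaFree(buf);
        return 0;
    }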
- } + if (error != cudaSuccess) { + printf("cuda_kernel_texture_2d() failed to launch error = %d\n", error); + } } diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/texture_3d.cu b/Samples/5_Domain_Specific/simpleD3D11Texture/texture_3d.cu index 2ce840a3..331eeda4 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/texture_3d.cu +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/texture_3d.cu @@ -33,46 +33,48 @@ * Paint a 3D texture with a gradient in X (blue) and Z (green), and have every * other Z slice have full red. */ -__global__ void cuda_kernel_texture_3d(unsigned char *surface, int width, - int height, int depth, size_t pitch, - size_t pitchSlice, float t) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void cuda_kernel_texture_3d(unsigned char *surface, + int width, + int height, + int depth, + size_t pitch, + size_t pitchSlice, + float t) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - // in the case where, due to quantization into grids, we have - // more threads than pixels, skip the threads which don't - // correspond to valid pixels - if (x >= width || y >= height) return; + // in the case where, due to quantization into grids, we have + // more threads than pixels, skip the threads which don't + // correspond to valid pixels + if (x >= width || y >= height) + return; - // walk across the Z slices of this texture. it should be noted that - // this is far from optimal data access. - for (int z = 0; z < depth; ++z) { - // get a pointer to this pixel - unsigned char *pixel = surface + z * pitchSlice + y * pitch + 4 * x; - pixel[0] = - (unsigned char)(255.f * (0.5f + 0.5f * - cos(t + (x * x + y * y + z * z) * 0.0001f * 3.14f))); // red - pixel[1] = - (unsigned char)(255.f * (0.5f + 0.5f * - sin(t + (x * x + y * y + z * z) * 0.0001f * 3.14f))); // green - pixel[2] = (unsigned char)0; // blue - pixel[3] = 255; // alpha - } + // walk across the Z slices of this texture. it should be noted that + // this is far from optimal data access. 
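// Pitched 3D addressing, as used in the loop below: with 4 bytes per texel
// (RGBA8), a row stride of `pitch` bytes and a slice stride of `pitchSlice`
// bytes, the byte offset of texel (x, y, z) is
//
//     offset(x, y, z) = z * pitchSlice + y * pitch + 4 * x
//
// Each iteration of the z loop therefore jumps a whole slice ahead in memory,
// which is why the comment above calls this access pattern far from optimal.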
+ for (int z = 0; z < depth; ++z) { + // get a pointer to this pixel + unsigned char *pixel = surface + z * pitchSlice + y * pitch + 4 * x; + pixel[0] = (unsigned char)(255.f * (0.5f + 0.5f * cos(t + (x * x + y * y + z * z) * 0.0001f * 3.14f))); // red + pixel[1] = (unsigned char)(255.f * (0.5f + 0.5f * sin(t + (x * x + y * y + z * z) * 0.0001f * 3.14f))); // green + pixel[2] = (unsigned char)0; // blue + pixel[3] = 255; // alpha + } } -extern "C" void cuda_texture_3d(void *surface, int width, int height, int depth, - size_t pitch, size_t pitchSlice, float t) { - cudaError_t error = cudaSuccess; +extern "C" void +cuda_texture_3d(void *surface, int width, int height, int depth, size_t pitch, size_t pitchSlice, float t) +{ + cudaError_t error = cudaSuccess; - dim3 Db = dim3(16, 16); // block dimensions are fixed to be 256 threads - dim3 Dg = dim3((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y); + dim3 Db = dim3(16, 16); // block dimensions are fixed to be 256 threads + dim3 Dg = dim3((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y); - cuda_kernel_texture_3d<<<Dg, Db>>>((unsigned char *)surface, width, height, - depth, pitch, pitchSlice, t); + cuda_kernel_texture_3d<<<Dg, Db>>>((unsigned char *)surface, width, height, depth, pitch, pitchSlice, t); - error = cudaGetLastError(); + error = cudaGetLastError(); - if (error != cudaSuccess) { - printf("cuda_kernel_texture_3d() failed to launch error = %d\n", error); - } + if (error != cudaSuccess) { + printf("cuda_kernel_texture_3d() failed to launch error = %d\n", error); + } } diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/texture_cube.cu b/Samples/5_Domain_Specific/simpleD3D11Texture/texture_cube.cu index d767333f..e442c600 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/texture_cube.cu +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/texture_cube.cu @@ -38,54 +38,55 @@ * face of a * cube map.
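 * (Face convention assumed by the kernel below, following the usual
 * +X, -X, +Y, -Y, +Z, -Z cube-map ordering: face / 2 selects the color
 * channel belonging to the axis pair, and face % 2 distinguishes the
 * positive face from the negative one.)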
*/ -__global__ void cuda_kernel_texture_cube(char *surface, int width, int height, - size_t pitch, int face, float t) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - unsigned char *pixel; +__global__ void cuda_kernel_texture_cube(char *surface, int width, int height, size_t pitch, int face, float t) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + unsigned char *pixel; - // in the case where, due to quantization into grids, we have - // more threads than pixels, skip the threads which don't - // correspond to valid pixels - if (x >= width || y >= height) return; + // in the case where, due to quantization into grids, we have + // more threads than pixels, skip the threads which don't + // correspond to valid pixels + if (x >= width || y >= height) + return; - // get a pointer to this pixel - pixel = (unsigned char *)(surface + y * pitch) + 4 * x; + // get a pointer to this pixel + pixel = (unsigned char *)(surface + y * pitch) + 4 * x; - // populate it - float theta_x = (2.0f * x) / width - 1.0f; - float theta_y = (2.0f * y) / height - 1.0f; - float theta = 2.0f * PI * sqrt(theta_x * theta_x + theta_y * theta_y); - unsigned char value = 255 * (0.6f + 0.4f * cos(theta + t)); + // populate it + float theta_x = (2.0f * x) / width - 1.0f; + float theta_y = (2.0f * y) / height - 1.0f; + float theta = 2.0f * PI * sqrt(theta_x * theta_x + theta_y * theta_y); + unsigned char value = 255 * (0.6f + 0.4f * cos(theta + t)); - pixel[3] = 255; // alpha + pixel[3] = 255; // alpha - if (face % 2) { - pixel[0] = // blue - pixel[1] = // green - pixel[2] = 0.5; // red - pixel[face / 2] = value; - } else { - pixel[0] = // blue - pixel[1] = // green - pixel[2] = value; // red - pixel[face / 2] = 0.5; - } + if (face % 2) { + pixel[0] = // blue + pixel[1] = // green + pixel[2] = 0.5; // red + pixel[face / 2] = value; + } + else { + pixel[0] = // blue + pixel[1] = // green + pixel[2] = value; // red + pixel[face / 2] = 0.5; + } } -extern "C" void cuda_texture_cube(void *surface, int width, int height, - size_t pitch, int face, float t) { - cudaError_t error = cudaSuccess; +extern "C" void cuda_texture_cube(void *surface, int width, int height, size_t pitch, int face, float t) +{ + cudaError_t error = cudaSuccess; - dim3 Db = dim3(16, 16); // block dimensions are fixed to be 256 threads - dim3 Dg = dim3((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y); + dim3 Db = dim3(16, 16); // block dimensions are fixed to be 256 threads + dim3 Dg = dim3((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y); - cuda_kernel_texture_cube<<<Dg, Db>>>((char *)surface, width, height, pitch, - face, t); + cuda_kernel_texture_cube<<<Dg, Db>>>((char *)surface, width, height, pitch, face, t); - error = cudaGetLastError(); + error = cudaGetLastError(); - if (error != cudaSuccess) { - printf("cuda_kernel_texture_cube() failed to launch error = %d\n", error); - } + if (error != cudaSuccess) { + printf("cuda_kernel_texture_cube() failed to launch error = %d\n", error); + } } diff --git a/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.cpp b/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.cpp old mode 100755 new mode 100644 index 223d8871..2c69f39e --- a/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.cpp +++ b/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.cpp @@ -50,95 +50,101 @@ The MIT License (MIT) SOFTWARE.
*/ -#include "stdafx.h" #include "DX12CudaSample.h" + #include +#include "stdafx.h" + using namespace Microsoft::WRL; DX12CudaSample::DX12CudaSample(UINT width, UINT height, std::string name) - : m_width(width), m_height(height), m_title(name), m_useWarpDevice(false) { - m_aspectRatio = static_cast(width) / static_cast(height); + : m_width(width) + , m_height(height) + , m_title(name) + , m_useWarpDevice(false) +{ + m_aspectRatio = static_cast(width) / static_cast(height); } DX12CudaSample::~DX12CudaSample() {} -std::wstring DX12CudaSample::string2wstring(const std::string& s) { - int len; - int slength = (int)s.length() + 1; - len = MultiByteToWideChar(CP_ACP, 0, s.c_str(), slength, 0, 0); - wchar_t* buf = new wchar_t[len]; - MultiByteToWideChar(CP_ACP, 0, s.c_str(), slength, buf, len); - std::wstring r(buf); - delete[] buf; - return r; +std::wstring DX12CudaSample::string2wstring(const std::string &s) +{ + int len; + int slength = (int)s.length() + 1; + len = MultiByteToWideChar(CP_ACP, 0, s.c_str(), slength, 0, 0); + wchar_t *buf = new wchar_t[len]; + MultiByteToWideChar(CP_ACP, 0, s.c_str(), slength, buf, len); + std::wstring r(buf); + delete[] buf; + return r; } // Helper function for resolving the full path of assets. -std::wstring DX12CudaSample::GetAssetFullPath(const char* assetName) { - LPTSTR lpBuffer = new char[4096]; - GetCurrentDirectory(FILENAME_MAX, lpBuffer); - char* tmp = sdkFindFilePath((const char*)assetName, "simpleD3D12"); - if (tmp == NULL) { - throw std::exception("File not found"); - } - for (int i = 0; i < strlen(tmp); i++) { - if (tmp[i] == '/') { - tmp[i] = '\\'; +std::wstring DX12CudaSample::GetAssetFullPath(const char *assetName) +{ + LPTSTR lpBuffer = new char[4096]; + GetCurrentDirectory(FILENAME_MAX, lpBuffer); + char *tmp = sdkFindFilePath((const char *)assetName, "simpleD3D12"); + if (tmp == NULL) { + throw std::exception("File not found"); } - } - m_assetsPath = lpBuffer; - m_assetsPath = m_assetsPath + "\\" + tmp; + for (int i = 0; i < strlen(tmp); i++) { + if (tmp[i] == '/') { + tmp[i] = '\\'; + } + } + m_assetsPath = lpBuffer; + m_assetsPath = m_assetsPath + "\\" + tmp; - std::wstring stemp = string2wstring(m_assetsPath); + std::wstring stemp = string2wstring(m_assetsPath); - return stemp; + return stemp; } // Helper function for acquiring the first available hardware adapter that // supports Direct3D 12. If no such adapter can be found, *ppAdapter will be set // to nullptr. -_Use_decl_annotations_ void DX12CudaSample::GetHardwareAdapter( - IDXGIFactory2* pFactory, IDXGIAdapter1** ppAdapter) { - ComPtr adapter; - *ppAdapter = nullptr; +_Use_decl_annotations_ void DX12CudaSample::GetHardwareAdapter(IDXGIFactory2 *pFactory, IDXGIAdapter1 **ppAdapter) +{ + ComPtr adapter; + *ppAdapter = nullptr; - for (UINT adapterIndex = 0; - DXGI_ERROR_NOT_FOUND != pFactory->EnumAdapters1(adapterIndex, &adapter); - ++adapterIndex) { - DXGI_ADAPTER_DESC1 desc; - adapter->GetDesc1(&desc); + for (UINT adapterIndex = 0; DXGI_ERROR_NOT_FOUND != pFactory->EnumAdapters1(adapterIndex, &adapter); + ++adapterIndex) { + DXGI_ADAPTER_DESC1 desc; + adapter->GetDesc1(&desc); - if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) { - // Don't select the Basic Render Driver adapter. - // If you want a software adapter, pass in "/warp" on the command line. - continue; + if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) { + // Don't select the Basic Render Driver adapter. + // If you want a software adapter, pass in "/warp" on the command line. 
+ continue; + } + + // Check to see if the adapter supports Direct3D 12, but don't create the + // actual device yet. + if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, _uuidof(ID3D12Device), nullptr))) { + break; + } } - // Check to see if the adapter supports Direct3D 12, but don't create the - // actual device yet. - if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, - _uuidof(ID3D12Device), nullptr))) { - break; - } - } - - *ppAdapter = adapter.Detach(); + *ppAdapter = adapter.Detach(); } // Helper function for setting the window's title text. -void DX12CudaSample::SetCustomWindowText(const char* text) { - std::string windowText = m_title + text; - SetWindowText(Win32Application::GetHwnd(), windowText.c_str()); +void DX12CudaSample::SetCustomWindowText(const char *text) +{ + std::string windowText = m_title + text; + SetWindowText(Win32Application::GetHwnd(), windowText.c_str()); } // Helper function for parsing any supplied command line args. -_Use_decl_annotations_ void DX12CudaSample::ParseCommandLineArgs(WCHAR* argv[], - int argc) { - for (int i = 1; i < argc; ++i) { - if (_wcsnicmp(argv[i], L"-warp", wcslen(argv[i])) == 0 || - _wcsnicmp(argv[i], L"/warp", wcslen(argv[i])) == 0) { - m_useWarpDevice = true; - m_title = m_title + " (WARP)"; +_Use_decl_annotations_ void DX12CudaSample::ParseCommandLineArgs(WCHAR *argv[], int argc) +{ + for (int i = 1; i < argc; ++i) { + if (_wcsnicmp(argv[i], L"-warp", wcslen(argv[i])) == 0 || _wcsnicmp(argv[i], L"/warp", wcslen(argv[i])) == 0) { + m_useWarpDevice = true; + m_title = m_title + " (WARP)"; + } } - } } diff --git a/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.h b/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.h old mode 100755 new mode 100644 index c7e4a64f..903ed717 --- a/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.h +++ b/Samples/5_Domain_Specific/simpleD3D12/DX12CudaSample.h @@ -55,45 +55,45 @@ The MIT License (MIT) #include "DXSampleHelper.h" #include "Win32Application.h" -class DX12CudaSample { - public: - DX12CudaSample(UINT width, UINT height, std::string name); - virtual ~DX12CudaSample(); +class DX12CudaSample +{ +public: + DX12CudaSample(UINT width, UINT height, std::string name); + virtual ~DX12CudaSample(); - virtual void OnInit() = 0; - virtual void OnRender() = 0; - virtual void OnDestroy() = 0; + virtual void OnInit() = 0; + virtual void OnRender() = 0; + virtual void OnDestroy() = 0; - // Samples override the event handlers to handle specific messages. - virtual void OnKeyDown(UINT8 /*key*/) {} - virtual void OnKeyUp(UINT8 /*key*/) {} + // Samples override the event handlers to handle specific messages. + virtual void OnKeyDown(UINT8 /*key*/) {} + virtual void OnKeyUp(UINT8 /*key*/) {} - // Accessors. - UINT GetWidth() const { return m_width; } - UINT GetHeight() const { return m_height; } - const CHAR* GetTitle() const { return m_title.c_str(); } + // Accessors. 
+ UINT GetWidth() const { return m_width; } + UINT GetHeight() const { return m_height; } + const CHAR *GetTitle() const { return m_title.c_str(); } - void ParseCommandLineArgs(_In_reads_(argc) WCHAR* argv[], int argc); + void ParseCommandLineArgs(_In_reads_(argc) WCHAR *argv[], int argc); - protected: - std::wstring GetAssetFullPath(const char* assetName); - void GetHardwareAdapter(_In_ IDXGIFactory2* pFactory, - _Outptr_result_maybenull_ IDXGIAdapter1** ppAdapter); - void SetCustomWindowText(const char* text); - std::wstring string2wstring(const std::string& s); +protected: + std::wstring GetAssetFullPath(const char *assetName); + void GetHardwareAdapter(_In_ IDXGIFactory2 *pFactory, _Outptr_result_maybenull_ IDXGIAdapter1 **ppAdapter); + void SetCustomWindowText(const char *text); + std::wstring string2wstring(const std::string &s); - // Viewport dimensions. - UINT m_width; - UINT m_height; - float m_aspectRatio; + // Viewport dimensions. + UINT m_width; + UINT m_height; + float m_aspectRatio; - // Adapter info. - bool m_useWarpDevice; + // Adapter info. + bool m_useWarpDevice; - private: - // Root assets path. - std::string m_assetsPath; +private: + // Root assets path. + std::string m_assetsPath; - // Window title. - std::string m_title; + // Window title. + std::string m_title; }; diff --git a/Samples/5_Domain_Specific/simpleD3D12/DXSampleHelper.h b/Samples/5_Domain_Specific/simpleD3D12/DXSampleHelper.h old mode 100755 new mode 100644 index ce92dc82..be069074 --- a/Samples/5_Domain_Specific/simpleD3D12/DXSampleHelper.h +++ b/Samples/5_Domain_Specific/simpleD3D12/DXSampleHelper.h @@ -59,134 +59,148 @@ The MIT License (MIT) // that may still be referenced by the GPU. using Microsoft::WRL::ComPtr; -inline std::string HrToString(HRESULT hr) { - char s_str[64] = {}; - sprintf_s(s_str, "HRESULT of 0x%08X", static_cast<UINT>(hr)); - return std::string(s_str); +inline std::string HrToString(HRESULT hr) +{ + char s_str[64] = {}; + sprintf_s(s_str, "HRESULT of 0x%08X", static_cast<UINT>(hr)); + return std::string(s_str); } -class HrException : public std::runtime_error { - public: - HrException(HRESULT hr) : std::runtime_error(HrToString(hr)), m_hr(hr) {} - HRESULT Error() const { return m_hr; } +class HrException : public std::runtime_error +{ +public: + HrException(HRESULT hr) + : std::runtime_error(HrToString(hr)) + , m_hr(hr) + { + } + HRESULT Error() const { return m_hr; } - private: - const HRESULT m_hr; +private: + const HRESULT m_hr; }; #define SAFE_RELEASE(p) \ - if (p) (p)->Release() + if (p) \ + (p)->Release() -inline void ThrowIfFailed(HRESULT hr) { - if (FAILED(hr)) { - throw HrException(hr); - } +inline void ThrowIfFailed(HRESULT hr) +{ + if (FAILED(hr)) { + throw HrException(hr); + } } -inline HRESULT ReadDataFromFile(LPCWSTR filename, byte** data, UINT* size) { - using namespace Microsoft::WRL; +inline HRESULT ReadDataFromFile(LPCWSTR filename, byte **data, UINT *size) +{ + using namespace Microsoft::WRL; - CREATEFILE2_EXTENDED_PARAMETERS extendedParams = {}; - extendedParams.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS); - extendedParams.dwFileAttributes = FILE_ATTRIBUTE_NORMAL; - extendedParams.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN; - extendedParams.dwSecurityQosFlags = SECURITY_ANONYMOUS; - extendedParams.lpSecurityAttributes = nullptr; - extendedParams.hTemplateFile = nullptr; + CREATEFILE2_EXTENDED_PARAMETERS extendedParams = {}; + extendedParams.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS); + extendedParams.dwFileAttributes = FILE_ATTRIBUTE_NORMAL; +
extendedParams.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN; + extendedParams.dwSecurityQosFlags = SECURITY_ANONYMOUS; + extendedParams.lpSecurityAttributes = nullptr; + extendedParams.hTemplateFile = nullptr; - Wrappers::FileHandle file(CreateFile2(filename, GENERIC_READ, FILE_SHARE_READ, - OPEN_EXISTING, &extendedParams)); - if (file.Get() == INVALID_HANDLE_VALUE) { - throw std::exception(); - } + Wrappers::FileHandle file(CreateFile2(filename, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, &extendedParams)); + if (file.Get() == INVALID_HANDLE_VALUE) { + throw std::exception(); + } - FILE_STANDARD_INFO fileInfo = {}; - if (!GetFileInformationByHandleEx(file.Get(), FileStandardInfo, &fileInfo, - sizeof(fileInfo))) { - throw std::exception(); - } + FILE_STANDARD_INFO fileInfo = {}; + if (!GetFileInformationByHandleEx(file.Get(), FileStandardInfo, &fileInfo, sizeof(fileInfo))) { + throw std::exception(); + } - if (fileInfo.EndOfFile.HighPart != 0) { - throw std::exception(); - } + if (fileInfo.EndOfFile.HighPart != 0) { + throw std::exception(); + } - *data = reinterpret_cast<byte*>(malloc(fileInfo.EndOfFile.LowPart)); - *size = fileInfo.EndOfFile.LowPart; + *data = reinterpret_cast<byte *>(malloc(fileInfo.EndOfFile.LowPart)); + *size = fileInfo.EndOfFile.LowPart; - if (!ReadFile(file.Get(), *data, fileInfo.EndOfFile.LowPart, nullptr, - nullptr)) { - throw std::exception(); - } + if (!ReadFile(file.Get(), *data, fileInfo.EndOfFile.LowPart, nullptr, nullptr)) { + throw std::exception(); + } - return S_OK; + return S_OK; } // Assign a name to the object to aid with debugging. #if defined(_DEBUG) || defined(DBG) -inline void SetName(ID3D12Object* pObject, LPCWSTR name) { - pObject->SetName(name); -} -inline void SetNameIndexed(ID3D12Object* pObject, LPCWSTR name, UINT index) { - WCHAR fullName[50]; - if (swprintf_s(fullName, L"%s[%u]", name, index) > 0) { - pObject->SetName(fullName); - } +inline void SetName(ID3D12Object *pObject, LPCWSTR name) { pObject->SetName(name); } +inline void SetNameIndexed(ID3D12Object *pObject, LPCWSTR name, UINT index) +{ + WCHAR fullName[50]; + if (swprintf_s(fullName, L"%s[%u]", name, index) > 0) { + pObject->SetName(fullName); + } } #else -inline void SetName(ID3D12Object*, LPCWSTR) {} -inline void SetNameIndexed(ID3D12Object*, LPCWSTR, UINT) {} +inline void SetName(ID3D12Object *, LPCWSTR) {} +inline void SetNameIndexed(ID3D12Object *, LPCWSTR, UINT) {} #endif // Naming helper for ComPtr. // Assigns the name of the variable as the name of the object. // The indexed variant will include the index in the name of the object. -#define NAME_D3D12_OBJECT(x) SetName((x).Get(), L#x) +#define NAME_D3D12_OBJECT(x) SetName((x).Get(), L#x) #define NAME_D3D12_OBJECT_INDEXED(x, n) SetNameIndexed((x)[n].Get(), L#x, n) -inline UINT CalculateConstantBufferByteSize(UINT byteSize) { - // Constant buffer size is required to be aligned. - return (byteSize + (D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1)) & - ~(D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1); +inline UINT CalculateConstantBufferByteSize(UINT byteSize) +{ + // Constant buffer size is required to be aligned.
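// The round-up below is the standard power-of-two alignment trick
// (D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT is 256):
//
//     aligned = (n + (A - 1)) & ~(A - 1)
//
// e.g. n = 300, A = 256: (300 + 255) & ~255 = 555 & 0xFFFFFF00 = 512.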
+ return (byteSize + (D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1)) + & ~(D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1); } #ifdef D3D_COMPILE_STANDARD_FILE_INCLUDE -inline Microsoft::WRL::ComPtr<ID3DBlob> CompileShader( - const std::wstring& filename, const D3D_SHADER_MACRO* defines, - const std::string& entrypoint, const std::string& target) { - UINT compileFlags = 0; +inline Microsoft::WRL::ComPtr<ID3DBlob> CompileShader(const std::wstring &filename, + const D3D_SHADER_MACRO *defines, + const std::string &entrypoint, + const std::string &target) +{ + UINT compileFlags = 0; #if defined(_DEBUG) || defined(DBG) - compileFlags = D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; + compileFlags = D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; #endif - HRESULT hr; + HRESULT hr; - Microsoft::WRL::ComPtr<ID3DBlob> byteCode = nullptr; - Microsoft::WRL::ComPtr<ID3DBlob> errors; - hr = D3DCompileFromFile(filename.c_str(), defines, - D3D_COMPILE_STANDARD_FILE_INCLUDE, entrypoint.c_str(), - target.c_str(), compileFlags, 0, &byteCode, &errors); + Microsoft::WRL::ComPtr<ID3DBlob> byteCode = nullptr; + Microsoft::WRL::ComPtr<ID3DBlob> errors; + hr = D3DCompileFromFile(filename.c_str(), + defines, + D3D_COMPILE_STANDARD_FILE_INCLUDE, + entrypoint.c_str(), + target.c_str(), + compileFlags, + 0, + &byteCode, + &errors); - if (errors != nullptr) { - OutputDebugStringA((char*)errors->GetBufferPointer()); - } - ThrowIfFailed(hr); + if (errors != nullptr) { + OutputDebugStringA((char *)errors->GetBufferPointer()); + } + ThrowIfFailed(hr); - return byteCode; + return byteCode; } #endif // Resets all elements in a ComPtr array. -template <class T> -void ResetComPtrArray(T* comPtrArray) { - for (auto& i : *comPtrArray) { - i.Reset(); - } +template <class T> void ResetComPtrArray(T *comPtrArray) +{ + for (auto &i : *comPtrArray) { + i.Reset(); + } } // Resets all elements in a unique_ptr array. -template <class T> -void ResetUniquePtrArray(T* uniquePtrArray) { - for (auto& i : *uniquePtrArray) { - i.reset(); - } +template <class T> void ResetUniquePtrArray(T *uniquePtrArray) +{ + for (auto &i : *uniquePtrArray) { + i.reset(); + } } diff --git a/Samples/5_Domain_Specific/simpleD3D12/Main.cpp b/Samples/5_Domain_Specific/simpleD3D12/Main.cpp old mode 100755 new mode 100644 index 64eda448..f1478373 --- a/Samples/5_Domain_Specific/simpleD3D12/Main.cpp +++ b/Samples/5_Domain_Specific/simpleD3D12/Main.cpp @@ -25,11 +25,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "stdafx.h" #include "simpleD3D12.h" +#include "stdafx.h" -_Use_decl_annotations_ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE, LPSTR, - int nCmdShow) { - DX12CudaInterop sample(1280, 720, "D3D12 CUDA Interop"); - return Win32Application::Run(&sample, hInstance, nCmdShow); +_Use_decl_annotations_ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE, LPSTR, int nCmdShow) +{ + DX12CudaInterop sample(1280, 720, "D3D12 CUDA Interop"); + return Win32Application::Run(&sample, hInstance, nCmdShow); } diff --git a/Samples/5_Domain_Specific/simpleD3D12/README.md b/Samples/5_Domain_Specific/simpleD3D12/README.md index 4b3e6413..fbc27ef2 100644 --- a/Samples/5_Domain_Specific/simpleD3D12/README.md +++ b/Samples/5_Domain_Specific/simpleD3D12/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) - diff --git a/Samples/5_Domain_Specific/simpleD3D12/ShaderStructs.h b/Samples/5_Domain_Specific/simpleD3D12/ShaderStructs.h old mode 100755 new mode 100644 index d5b1ef62..91e2be52 --- a/Samples/5_Domain_Specific/simpleD3D12/ShaderStructs.h +++ b/Samples/5_Domain_Specific/simpleD3D12/ShaderStructs.h @@ -27,21 +27,25 @@ #pragma once -#include -#include #include #include #include +#include +#include + +#include "helper_cuda.h" using namespace DirectX; -struct Vertex { - XMFLOAT3 position; - XMFLOAT4 color; +struct Vertex +{ + XMFLOAT3 position; + XMFLOAT4 color; }; -void RunSineWaveKernel(size_t mesh_width, size_t mesh_height, - Vertex *cudaDevVertptr, cudaStream_t streamToRun, - float AnimTime); \ No newline at end of file +void RunSineWaveKernel(size_t mesh_width, + size_t mesh_height, + Vertex *cudaDevVertptr, + cudaStream_t streamToRun, + float AnimTime); diff --git a/Samples/5_Domain_Specific/simpleD3D12/Win32Application.cpp b/Samples/5_Domain_Specific/simpleD3D12/Win32Application.cpp old mode 100755 new mode 100644 index e27a6ebd..7f8ccec8 --- a/Samples/5_Domain_Specific/simpleD3D12/Win32Application.cpp +++ b/Samples/5_Domain_Specific/simpleD3D12/Win32Application.cpp @@ -50,103 +50,104 @@ The MIT License (MIT) SOFTWARE. */ -#include "stdafx.h" #include "Win32Application.h" +#include "stdafx.h" + HWND Win32Application::m_hwnd = nullptr; -int Win32Application::Run(DX12CudaSample* pSample, HINSTANCE hInstance, - int nCmdShow) { - // Parse the command line parameters - int argc; - LPWSTR* argv = CommandLineToArgvW(GetCommandLineW(), &argc); - pSample->ParseCommandLineArgs(argv, argc); - LocalFree(argv); +int Win32Application::Run(DX12CudaSample *pSample, HINSTANCE hInstance, int nCmdShow) +{ + // Parse the command line parameters + int argc; + LPWSTR *argv = CommandLineToArgvW(GetCommandLineW(), &argc); + pSample->ParseCommandLineArgs(argv, argc); + LocalFree(argv); - // Initialize the window class. - WNDCLASSEX windowClass = {0}; - windowClass.cbSize = sizeof(WNDCLASSEX); - windowClass.style = CS_HREDRAW | CS_VREDRAW; - windowClass.lpfnWndProc = WindowProc; - windowClass.hInstance = hInstance; - windowClass.hCursor = LoadCursor(NULL, IDC_ARROW); - windowClass.lpszClassName = "DX12CudaSampleClass"; - RegisterClassEx(&windowClass); + // Initialize the window class. + WNDCLASSEX windowClass = {0}; + windowClass.cbSize = sizeof(WNDCLASSEX); + windowClass.style = CS_HREDRAW | CS_VREDRAW; + windowClass.lpfnWndProc = WindowProc; + windowClass.hInstance = hInstance; + windowClass.hCursor = LoadCursor(NULL, IDC_ARROW); + windowClass.lpszClassName = "DX12CudaSampleClass"; + RegisterClassEx(&windowClass); - RECT windowRect = {0, 0, static_cast<LONG>(pSample->GetWidth()), - static_cast<LONG>(pSample->GetHeight())}; - AdjustWindowRect(&windowRect, WS_OVERLAPPEDWINDOW, FALSE); + RECT windowRect = {0, 0, static_cast<LONG>(pSample->GetWidth()), static_cast<LONG>(pSample->GetHeight())}; + AdjustWindowRect(&windowRect, WS_OVERLAPPEDWINDOW, FALSE); - // Create the window and store a handle to it.
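// The trailing CreateWindow argument below (pSample) arrives in WindowProc as
// CREATESTRUCT::lpCreateParams on WM_CREATE, where the handler further down
// stashes it for later messages:
//
//     LPCREATESTRUCT pCreateStruct = reinterpret_cast<LPCREATESTRUCT>(lParam);
//     SetWindowLongPtr(hWnd, GWLP_USERDATA,
//                      reinterpret_cast<LONG_PTR>(pCreateStruct->lpCreateParams));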
+ m_hwnd = CreateWindow(windowClass.lpszClassName, + pSample->GetTitle(), + WS_OVERLAPPEDWINDOW, + CW_USEDEFAULT, + CW_USEDEFAULT, + windowRect.right - windowRect.left, + windowRect.bottom - windowRect.top, + nullptr, // We have no parent window. + nullptr, // We aren't using menus. + hInstance, + pSample); - // Initialize the sample. OnInit is defined in each child-implementation of - // DXSample. - pSample->OnInit(); + // Initialize the sample. OnInit is defined in each child-implementation of + // DXSample. + pSample->OnInit(); - ShowWindow(m_hwnd, nCmdShow); + ShowWindow(m_hwnd, nCmdShow); - // Main sample loop. - MSG msg = {}; - while (msg.message != WM_QUIT) { - // Process any messages in the queue. - if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE)) { - TranslateMessage(&msg); - DispatchMessage(&msg); + // Main sample loop. + MSG msg = {}; + while (msg.message != WM_QUIT) { + // Process any messages in the queue. + if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE)) { + TranslateMessage(&msg); + DispatchMessage(&msg); + } } - } - pSample->OnDestroy(); + pSample->OnDestroy(); - // Return this part of the WM_QUIT message to Windows. - return static_cast<char>(msg.wParam); + // Return this part of the WM_QUIT message to Windows. + return static_cast<char>(msg.wParam); } // Main message handler for the sample. -LRESULT CALLBACK Win32Application::WindowProc(HWND hWnd, UINT message, - WPARAM wParam, LPARAM lParam) { - DX12CudaSample* pSample = - reinterpret_cast<DX12CudaSample*>(GetWindowLongPtr(hWnd, GWLP_USERDATA)); +LRESULT CALLBACK Win32Application::WindowProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) +{ + DX12CudaSample *pSample = reinterpret_cast<DX12CudaSample *>(GetWindowLongPtr(hWnd, GWLP_USERDATA)); - switch (message) { + switch (message) { case WM_CREATE: { - // Save the DXSample* passed in to CreateWindow. - LPCREATESTRUCT pCreateStruct = reinterpret_cast<LPCREATESTRUCT>(lParam); - SetWindowLongPtr( - hWnd, GWLP_USERDATA, - reinterpret_cast<LONG_PTR>(pCreateStruct->lpCreateParams)); + // Save the DXSample* passed in to CreateWindow. + LPCREATESTRUCT pCreateStruct = reinterpret_cast<LPCREATESTRUCT>(lParam); + SetWindowLongPtr(hWnd, GWLP_USERDATA, reinterpret_cast<LONG_PTR>(pCreateStruct->lpCreateParams)); } - return 0; + return 0; case WM_KEYDOWN: - if (pSample) { - pSample->OnKeyDown(static_cast<UINT8>(wParam)); - } - return 0; + if (pSample) { + pSample->OnKeyDown(static_cast<UINT8>(wParam)); + } + return 0; case WM_KEYUP: - if (pSample) { - pSample->OnKeyUp(static_cast<UINT8>(wParam)); - } - return 0; + if (pSample) { + pSample->OnKeyUp(static_cast<UINT8>(wParam)); + } + return 0; case WM_PAINT: - if (pSample) { - pSample->OnRender(); - } - return 0; + if (pSample) { + pSample->OnRender(); + } + return 0; case WM_DESTROY: - PostQuitMessage(0); - return 0; - } + PostQuitMessage(0); + return 0; + } - // Handle any messages the switch statement didn't.
+ return DefWindowProc(hWnd, message, wParam, lParam); } diff --git a/Samples/5_Domain_Specific/simpleD3D12/Win32Application.h b/Samples/5_Domain_Specific/simpleD3D12/Win32Application.h old mode 100755 new mode 100644 index 4cec1489..1427a1a2 --- a/Samples/5_Domain_Specific/simpleD3D12/Win32Application.h +++ b/Samples/5_Domain_Specific/simpleD3D12/Win32Application.h @@ -56,15 +56,15 @@ The MIT License (MIT) class DX12CudaSample; -class Win32Application { - public: - static int Run(DX12CudaSample* pSample, HINSTANCE hInstance, int nCmdShow); - static HWND GetHwnd() { return m_hwnd; } +class Win32Application +{ +public: + static int Run(DX12CudaSample *pSample, HINSTANCE hInstance, int nCmdShow); + static HWND GetHwnd() { return m_hwnd; } - protected: - static LRESULT CALLBACK WindowProc(HWND hWnd, UINT message, WPARAM wParam, - LPARAM lParam); +protected: + static LRESULT CALLBACK WindowProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam); - private: - static HWND m_hwnd; +private: + static HWND m_hwnd; }; diff --git a/Samples/5_Domain_Specific/simpleD3D12/d3dx12.h b/Samples/5_Domain_Specific/simpleD3D12/d3dx12.h old mode 100755 new mode 100644 index dc4608ae..0bb2b03a --- a/Samples/5_Domain_Specific/simpleD3D12/d3dx12.h +++ b/Samples/5_Domain_Specific/simpleD3D12/d3dx12.h @@ -12,720 +12,655 @@ #include "d3d12.h" -#if defined( __cplusplus ) +#if defined(__cplusplus) -struct CD3DX12_DEFAULT {}; +struct CD3DX12_DEFAULT +{ +}; extern const DECLSPEC_SELECTANY CD3DX12_DEFAULT D3D12_DEFAULT; //------------------------------------------------------------------------------------------------ -inline bool operator==( const D3D12_VIEWPORT& l, const D3D12_VIEWPORT& r ) +inline bool operator==(const D3D12_VIEWPORT &l, const D3D12_VIEWPORT &r) { - return l.TopLeftX == r.TopLeftX && l.TopLeftY == r.TopLeftY && l.Width == r.Width && - l.Height == r.Height && l.MinDepth == r.MinDepth && l.MaxDepth == r.MaxDepth; + return l.TopLeftX == r.TopLeftX && l.TopLeftY == r.TopLeftY && l.Width == r.Width && l.Height == r.Height + && l.MinDepth == r.MinDepth && l.MaxDepth == r.MaxDepth; } //------------------------------------------------------------------------------------------------ -inline bool operator!=( const D3D12_VIEWPORT& l, const D3D12_VIEWPORT& r ) -{ return !( l == r ); } +inline bool operator!=(const D3D12_VIEWPORT &l, const D3D12_VIEWPORT &r) { return !(l == r); } //------------------------------------------------------------------------------------------------ struct CD3DX12_RECT : public D3D12_RECT { - CD3DX12_RECT() - {} - explicit CD3DX12_RECT( const D3D12_RECT& o ) : - D3D12_RECT( o ) - {} - explicit CD3DX12_RECT( - LONG Left, - LONG Top, - LONG Right, - LONG Bottom ) + CD3DX12_RECT() {} + explicit CD3DX12_RECT(const D3D12_RECT &o) + : D3D12_RECT(o) { - left = Left; - top = Top; - right = Right; + } + explicit CD3DX12_RECT(LONG Left, LONG Top, LONG Right, LONG Bottom) + { + left = Left; + top = Top; + right = Right; bottom = Bottom; } ~CD3DX12_RECT() {} - operator const D3D12_RECT&() const { return *this; } + operator const D3D12_RECT &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_BOX : public D3D12_BOX { - CD3DX12_BOX() - {} - explicit CD3DX12_BOX( const D3D12_BOX& o ) : - D3D12_BOX( o ) - {} - explicit CD3DX12_BOX( - LONG Left, - LONG Right ) + CD3DX12_BOX() {} + explicit CD3DX12_BOX(const D3D12_BOX &o) + : D3D12_BOX(o) { - left = Left; - top = 0; - front = 0; - right = Right; + } + explicit 
CD3DX12_BOX(LONG Left, LONG Right) + { + left = Left; + top = 0; + front = 0; + right = Right; bottom = 1; - back = 1; + back = 1; } - explicit CD3DX12_BOX( - LONG Left, - LONG Top, - LONG Right, - LONG Bottom ) + explicit CD3DX12_BOX(LONG Left, LONG Top, LONG Right, LONG Bottom) { - left = Left; - top = Top; - front = 0; - right = Right; + left = Left; + top = Top; + front = 0; + right = Right; bottom = Bottom; - back = 1; + back = 1; } - explicit CD3DX12_BOX( - LONG Left, - LONG Top, - LONG Front, - LONG Right, - LONG Bottom, - LONG Back ) + explicit CD3DX12_BOX(LONG Left, LONG Top, LONG Front, LONG Right, LONG Bottom, LONG Back) { - left = Left; - top = Top; - front = Front; - right = Right; + left = Left; + top = Top; + front = Front; + right = Right; bottom = Bottom; - back = Back; + back = Back; } ~CD3DX12_BOX() {} - operator const D3D12_BOX&() const { return *this; } + operator const D3D12_BOX &() const { return *this; } }; -inline bool operator==( const D3D12_BOX& l, const D3D12_BOX& r ) +inline bool operator==(const D3D12_BOX &l, const D3D12_BOX &r) { - return l.left == r.left && l.top == r.top && l.front == r.front && - l.right == r.right && l.bottom == r.bottom && l.back == r.back; + return l.left == r.left && l.top == r.top && l.front == r.front && l.right == r.right && l.bottom == r.bottom + && l.back == r.back; } -inline bool operator!=( const D3D12_BOX& l, const D3D12_BOX& r ) -{ return !( l == r ); } +inline bool operator!=(const D3D12_BOX &l, const D3D12_BOX &r) { return !(l == r); } //------------------------------------------------------------------------------------------------ struct CD3DX12_DEPTH_STENCIL_DESC : public D3D12_DEPTH_STENCIL_DESC { - CD3DX12_DEPTH_STENCIL_DESC() - {} - explicit CD3DX12_DEPTH_STENCIL_DESC( const D3D12_DEPTH_STENCIL_DESC& o ) : - D3D12_DEPTH_STENCIL_DESC( o ) - {} - explicit CD3DX12_DEPTH_STENCIL_DESC( CD3DX12_DEFAULT ) + CD3DX12_DEPTH_STENCIL_DESC() {} + explicit CD3DX12_DEPTH_STENCIL_DESC(const D3D12_DEPTH_STENCIL_DESC &o) + : D3D12_DEPTH_STENCIL_DESC(o) { - DepthEnable = TRUE; - DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL; - DepthFunc = D3D12_COMPARISON_FUNC_LESS; - StencilEnable = FALSE; - StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK; - StencilWriteMask = D3D12_DEFAULT_STENCIL_WRITE_MASK; - const D3D12_DEPTH_STENCILOP_DESC defaultStencilOp = - { D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS }; - FrontFace = defaultStencilOp; - BackFace = defaultStencilOp; } - explicit CD3DX12_DEPTH_STENCIL_DESC( - BOOL depthEnable, - D3D12_DEPTH_WRITE_MASK depthWriteMask, - D3D12_COMPARISON_FUNC depthFunc, - BOOL stencilEnable, - UINT8 stencilReadMask, - UINT8 stencilWriteMask, - D3D12_STENCIL_OP frontStencilFailOp, - D3D12_STENCIL_OP frontStencilDepthFailOp, - D3D12_STENCIL_OP frontStencilPassOp, - D3D12_COMPARISON_FUNC frontStencilFunc, - D3D12_STENCIL_OP backStencilFailOp, - D3D12_STENCIL_OP backStencilDepthFailOp, - D3D12_STENCIL_OP backStencilPassOp, - D3D12_COMPARISON_FUNC backStencilFunc ) + explicit CD3DX12_DEPTH_STENCIL_DESC(CD3DX12_DEFAULT) { - DepthEnable = depthEnable; - DepthWriteMask = depthWriteMask; - DepthFunc = depthFunc; - StencilEnable = stencilEnable; - StencilReadMask = stencilReadMask; - StencilWriteMask = stencilWriteMask; - FrontFace.StencilFailOp = frontStencilFailOp; + DepthEnable = TRUE; + DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL; + DepthFunc = D3D12_COMPARISON_FUNC_LESS; + StencilEnable = FALSE; + StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK; + StencilWriteMask 
= D3D12_DEFAULT_STENCIL_WRITE_MASK; + const D3D12_DEPTH_STENCILOP_DESC defaultStencilOp = { + D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS}; + FrontFace = defaultStencilOp; + BackFace = defaultStencilOp; + } + explicit CD3DX12_DEPTH_STENCIL_DESC(BOOL depthEnable, + D3D12_DEPTH_WRITE_MASK depthWriteMask, + D3D12_COMPARISON_FUNC depthFunc, + BOOL stencilEnable, + UINT8 stencilReadMask, + UINT8 stencilWriteMask, + D3D12_STENCIL_OP frontStencilFailOp, + D3D12_STENCIL_OP frontStencilDepthFailOp, + D3D12_STENCIL_OP frontStencilPassOp, + D3D12_COMPARISON_FUNC frontStencilFunc, + D3D12_STENCIL_OP backStencilFailOp, + D3D12_STENCIL_OP backStencilDepthFailOp, + D3D12_STENCIL_OP backStencilPassOp, + D3D12_COMPARISON_FUNC backStencilFunc) + { + DepthEnable = depthEnable; + DepthWriteMask = depthWriteMask; + DepthFunc = depthFunc; + StencilEnable = stencilEnable; + StencilReadMask = stencilReadMask; + StencilWriteMask = stencilWriteMask; + FrontFace.StencilFailOp = frontStencilFailOp; FrontFace.StencilDepthFailOp = frontStencilDepthFailOp; - FrontFace.StencilPassOp = frontStencilPassOp; - FrontFace.StencilFunc = frontStencilFunc; - BackFace.StencilFailOp = backStencilFailOp; - BackFace.StencilDepthFailOp = backStencilDepthFailOp; - BackFace.StencilPassOp = backStencilPassOp; - BackFace.StencilFunc = backStencilFunc; + FrontFace.StencilPassOp = frontStencilPassOp; + FrontFace.StencilFunc = frontStencilFunc; + BackFace.StencilFailOp = backStencilFailOp; + BackFace.StencilDepthFailOp = backStencilDepthFailOp; + BackFace.StencilPassOp = backStencilPassOp; + BackFace.StencilFunc = backStencilFunc; } ~CD3DX12_DEPTH_STENCIL_DESC() {} - operator const D3D12_DEPTH_STENCIL_DESC&() const { return *this; } + operator const D3D12_DEPTH_STENCIL_DESC &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_BLEND_DESC : public D3D12_BLEND_DESC { - CD3DX12_BLEND_DESC() - {} - explicit CD3DX12_BLEND_DESC( const D3D12_BLEND_DESC& o ) : - D3D12_BLEND_DESC( o ) - {} - explicit CD3DX12_BLEND_DESC( CD3DX12_DEFAULT ) + CD3DX12_BLEND_DESC() {} + explicit CD3DX12_BLEND_DESC(const D3D12_BLEND_DESC &o) + : D3D12_BLEND_DESC(o) { - AlphaToCoverageEnable = FALSE; - IndependentBlendEnable = FALSE; - const D3D12_RENDER_TARGET_BLEND_DESC defaultRenderTargetBlendDesc = - { - FALSE,FALSE, - D3D12_BLEND_ONE, D3D12_BLEND_ZERO, D3D12_BLEND_OP_ADD, - D3D12_BLEND_ONE, D3D12_BLEND_ZERO, D3D12_BLEND_OP_ADD, + } + explicit CD3DX12_BLEND_DESC(CD3DX12_DEFAULT) + { + AlphaToCoverageEnable = FALSE; + IndependentBlendEnable = FALSE; + const D3D12_RENDER_TARGET_BLEND_DESC defaultRenderTargetBlendDesc = { + FALSE, + FALSE, + D3D12_BLEND_ONE, + D3D12_BLEND_ZERO, + D3D12_BLEND_OP_ADD, + D3D12_BLEND_ONE, + D3D12_BLEND_ZERO, + D3D12_BLEND_OP_ADD, D3D12_LOGIC_OP_NOOP, D3D12_COLOR_WRITE_ENABLE_ALL, }; for (UINT i = 0; i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT; ++i) - RenderTarget[ i ] = defaultRenderTargetBlendDesc; + RenderTarget[i] = defaultRenderTargetBlendDesc; } ~CD3DX12_BLEND_DESC() {} - operator const D3D12_BLEND_DESC&() const { return *this; } + operator const D3D12_BLEND_DESC &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_RASTERIZER_DESC : public D3D12_RASTERIZER_DESC { - CD3DX12_RASTERIZER_DESC() - {} - explicit CD3DX12_RASTERIZER_DESC( const D3D12_RASTERIZER_DESC& o ) : - D3D12_RASTERIZER_DESC( o ) - {} - 
explicit CD3DX12_RASTERIZER_DESC( CD3DX12_DEFAULT ) + CD3DX12_RASTERIZER_DESC() {} + explicit CD3DX12_RASTERIZER_DESC(const D3D12_RASTERIZER_DESC &o) + : D3D12_RASTERIZER_DESC(o) { - FillMode = D3D12_FILL_MODE_SOLID; - CullMode = D3D12_CULL_MODE_BACK; - FrontCounterClockwise = FALSE; - DepthBias = D3D12_DEFAULT_DEPTH_BIAS; - DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP; - SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS; - DepthClipEnable = TRUE; - MultisampleEnable = FALSE; - AntialiasedLineEnable = FALSE; - ForcedSampleCount = 0; - ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF; } - explicit CD3DX12_RASTERIZER_DESC( - D3D12_FILL_MODE fillMode, - D3D12_CULL_MODE cullMode, - BOOL frontCounterClockwise, - INT depthBias, - FLOAT depthBiasClamp, - FLOAT slopeScaledDepthBias, - BOOL depthClipEnable, - BOOL multisampleEnable, - BOOL antialiasedLineEnable, - UINT forcedSampleCount, - D3D12_CONSERVATIVE_RASTERIZATION_MODE conservativeRaster) + explicit CD3DX12_RASTERIZER_DESC(CD3DX12_DEFAULT) { - FillMode = fillMode; - CullMode = cullMode; + FillMode = D3D12_FILL_MODE_SOLID; + CullMode = D3D12_CULL_MODE_BACK; + FrontCounterClockwise = FALSE; + DepthBias = D3D12_DEFAULT_DEPTH_BIAS; + DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP; + SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS; + DepthClipEnable = TRUE; + MultisampleEnable = FALSE; + AntialiasedLineEnable = FALSE; + ForcedSampleCount = 0; + ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF; + } + explicit CD3DX12_RASTERIZER_DESC(D3D12_FILL_MODE fillMode, + D3D12_CULL_MODE cullMode, + BOOL frontCounterClockwise, + INT depthBias, + FLOAT depthBiasClamp, + FLOAT slopeScaledDepthBias, + BOOL depthClipEnable, + BOOL multisampleEnable, + BOOL antialiasedLineEnable, + UINT forcedSampleCount, + D3D12_CONSERVATIVE_RASTERIZATION_MODE conservativeRaster) + { + FillMode = fillMode; + CullMode = cullMode; FrontCounterClockwise = frontCounterClockwise; - DepthBias = depthBias; - DepthBiasClamp = depthBiasClamp; - SlopeScaledDepthBias = slopeScaledDepthBias; - DepthClipEnable = depthClipEnable; - MultisampleEnable = multisampleEnable; + DepthBias = depthBias; + DepthBiasClamp = depthBiasClamp; + SlopeScaledDepthBias = slopeScaledDepthBias; + DepthClipEnable = depthClipEnable; + MultisampleEnable = multisampleEnable; AntialiasedLineEnable = antialiasedLineEnable; - ForcedSampleCount = forcedSampleCount; - ConservativeRaster = conservativeRaster; + ForcedSampleCount = forcedSampleCount; + ConservativeRaster = conservativeRaster; } ~CD3DX12_RASTERIZER_DESC() {} - operator const D3D12_RASTERIZER_DESC&() const { return *this; } + operator const D3D12_RASTERIZER_DESC &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_RESOURCE_ALLOCATION_INFO : public D3D12_RESOURCE_ALLOCATION_INFO { - CD3DX12_RESOURCE_ALLOCATION_INFO() - {} - explicit CD3DX12_RESOURCE_ALLOCATION_INFO( const D3D12_RESOURCE_ALLOCATION_INFO& o ) : - D3D12_RESOURCE_ALLOCATION_INFO( o ) - {} - CD3DX12_RESOURCE_ALLOCATION_INFO( - UINT64 size, - UINT64 alignment ) + CD3DX12_RESOURCE_ALLOCATION_INFO() {} + explicit CD3DX12_RESOURCE_ALLOCATION_INFO(const D3D12_RESOURCE_ALLOCATION_INFO &o) + : D3D12_RESOURCE_ALLOCATION_INFO(o) + { + } + CD3DX12_RESOURCE_ALLOCATION_INFO(UINT64 size, UINT64 alignment) { SizeInBytes = size; - Alignment = alignment; + Alignment = alignment; } - operator const D3D12_RESOURCE_ALLOCATION_INFO&() const { return *this; } + operator 
const D3D12_RESOURCE_ALLOCATION_INFO &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_HEAP_PROPERTIES : public D3D12_HEAP_PROPERTIES { - CD3DX12_HEAP_PROPERTIES() - {} - explicit CD3DX12_HEAP_PROPERTIES(const D3D12_HEAP_PROPERTIES &o) : - D3D12_HEAP_PROPERTIES(o) - {} - CD3DX12_HEAP_PROPERTIES( - D3D12_CPU_PAGE_PROPERTY cpuPageProperty, - D3D12_MEMORY_POOL memoryPoolPreference, - UINT creationNodeMask = 1, - UINT nodeMask = 1 ) + CD3DX12_HEAP_PROPERTIES() {} + explicit CD3DX12_HEAP_PROPERTIES(const D3D12_HEAP_PROPERTIES &o) + : D3D12_HEAP_PROPERTIES(o) { - Type = D3D12_HEAP_TYPE_CUSTOM; - CPUPageProperty = cpuPageProperty; + } + CD3DX12_HEAP_PROPERTIES(D3D12_CPU_PAGE_PROPERTY cpuPageProperty, + D3D12_MEMORY_POOL memoryPoolPreference, + UINT creationNodeMask = 1, + UINT nodeMask = 1) + { + Type = D3D12_HEAP_TYPE_CUSTOM; + CPUPageProperty = cpuPageProperty; MemoryPoolPreference = memoryPoolPreference; - CreationNodeMask = creationNodeMask; - VisibleNodeMask = nodeMask; + CreationNodeMask = creationNodeMask; + VisibleNodeMask = nodeMask; } - explicit CD3DX12_HEAP_PROPERTIES( - D3D12_HEAP_TYPE type, - UINT creationNodeMask = 1, - UINT nodeMask = 1 ) + explicit CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE type, UINT creationNodeMask = 1, UINT nodeMask = 1) { - Type = type; - CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; + Type = type; + CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; - CreationNodeMask = creationNodeMask; - VisibleNodeMask = nodeMask; + CreationNodeMask = creationNodeMask; + VisibleNodeMask = nodeMask; } - operator const D3D12_HEAP_PROPERTIES&() const { return *this; } + operator const D3D12_HEAP_PROPERTIES &() const { return *this; } bool IsCPUAccessible() const { - return Type == D3D12_HEAP_TYPE_UPLOAD || Type == D3D12_HEAP_TYPE_READBACK || (Type == D3D12_HEAP_TYPE_CUSTOM && - (CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE || CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_BACK)); + return Type == D3D12_HEAP_TYPE_UPLOAD || Type == D3D12_HEAP_TYPE_READBACK + || (Type == D3D12_HEAP_TYPE_CUSTOM + && (CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE + || CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_BACK)); } }; -inline bool operator==( const D3D12_HEAP_PROPERTIES& l, const D3D12_HEAP_PROPERTIES& r ) +inline bool operator==(const D3D12_HEAP_PROPERTIES &l, const D3D12_HEAP_PROPERTIES &r) { - return l.Type == r.Type && l.CPUPageProperty == r.CPUPageProperty && - l.MemoryPoolPreference == r.MemoryPoolPreference && - l.CreationNodeMask == r.CreationNodeMask && - l.VisibleNodeMask == r.VisibleNodeMask; + return l.Type == r.Type && l.CPUPageProperty == r.CPUPageProperty + && l.MemoryPoolPreference == r.MemoryPoolPreference && l.CreationNodeMask == r.CreationNodeMask + && l.VisibleNodeMask == r.VisibleNodeMask; } -inline bool operator!=( const D3D12_HEAP_PROPERTIES& l, const D3D12_HEAP_PROPERTIES& r ) -{ return !( l == r ); } +inline bool operator!=(const D3D12_HEAP_PROPERTIES &l, const D3D12_HEAP_PROPERTIES &r) { return !(l == r); } //------------------------------------------------------------------------------------------------ struct CD3DX12_HEAP_DESC : public D3D12_HEAP_DESC { - CD3DX12_HEAP_DESC() - {} - explicit CD3DX12_HEAP_DESC(const D3D12_HEAP_DESC &o) : - D3D12_HEAP_DESC(o) - {} - CD3DX12_HEAP_DESC( - UINT64 size, - D3D12_HEAP_PROPERTIES properties, - UINT64 alignment = 0, - D3D12_HEAP_FLAGS flags = 
D3D12_HEAP_FLAG_NONE ) + CD3DX12_HEAP_DESC() {} + explicit CD3DX12_HEAP_DESC(const D3D12_HEAP_DESC &o) + : D3D12_HEAP_DESC(o) + { + } + CD3DX12_HEAP_DESC(UINT64 size, + D3D12_HEAP_PROPERTIES properties, + UINT64 alignment = 0, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) { SizeInBytes = size; - Properties = properties; - Alignment = alignment; - Flags = flags; + Properties = properties; + Alignment = alignment; + Flags = flags; } - CD3DX12_HEAP_DESC( - UINT64 size, - D3D12_HEAP_TYPE type, - UINT64 alignment = 0, - D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE ) + CD3DX12_HEAP_DESC(UINT64 size, + D3D12_HEAP_TYPE type, + UINT64 alignment = 0, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) { SizeInBytes = size; - Properties = CD3DX12_HEAP_PROPERTIES( type ); - Alignment = alignment; - Flags = flags; + Properties = CD3DX12_HEAP_PROPERTIES(type); + Alignment = alignment; + Flags = flags; } - CD3DX12_HEAP_DESC( - UINT64 size, - D3D12_CPU_PAGE_PROPERTY cpuPageProperty, - D3D12_MEMORY_POOL memoryPoolPreference, - UINT64 alignment = 0, - D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE ) + CD3DX12_HEAP_DESC(UINT64 size, + D3D12_CPU_PAGE_PROPERTY cpuPageProperty, + D3D12_MEMORY_POOL memoryPoolPreference, + UINT64 alignment = 0, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) { SizeInBytes = size; - Properties = CD3DX12_HEAP_PROPERTIES( cpuPageProperty, memoryPoolPreference ); - Alignment = alignment; - Flags = flags; + Properties = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference); + Alignment = alignment; + Flags = flags; } - CD3DX12_HEAP_DESC( - const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo, - D3D12_HEAP_PROPERTIES properties, - D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE ) + CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, + D3D12_HEAP_PROPERTIES properties, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) { SizeInBytes = resAllocInfo.SizeInBytes; - Properties = properties; - Alignment = resAllocInfo.Alignment; - Flags = flags; + Properties = properties; + Alignment = resAllocInfo.Alignment; + Flags = flags; } - CD3DX12_HEAP_DESC( - const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo, - D3D12_HEAP_TYPE type, - D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE ) + CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, + D3D12_HEAP_TYPE type, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) { SizeInBytes = resAllocInfo.SizeInBytes; - Properties = CD3DX12_HEAP_PROPERTIES( type ); - Alignment = resAllocInfo.Alignment; - Flags = flags; + Properties = CD3DX12_HEAP_PROPERTIES(type); + Alignment = resAllocInfo.Alignment; + Flags = flags; } - CD3DX12_HEAP_DESC( - const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo, - D3D12_CPU_PAGE_PROPERTY cpuPageProperty, - D3D12_MEMORY_POOL memoryPoolPreference, - D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE ) + CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, + D3D12_CPU_PAGE_PROPERTY cpuPageProperty, + D3D12_MEMORY_POOL memoryPoolPreference, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) { SizeInBytes = resAllocInfo.SizeInBytes; - Properties = CD3DX12_HEAP_PROPERTIES( cpuPageProperty, memoryPoolPreference ); - Alignment = resAllocInfo.Alignment; - Flags = flags; + Properties = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference); + Alignment = resAllocInfo.Alignment; + Flags = flags; } - operator const D3D12_HEAP_DESC&() const { return *this; } + operator const D3D12_HEAP_DESC &() const { return *this; } bool IsCPUAccessible() const - { return static_cast< const 
CD3DX12_HEAP_PROPERTIES* >( &Properties )->IsCPUAccessible(); } + { + return static_cast<const CD3DX12_HEAP_PROPERTIES *>(&Properties)->IsCPUAccessible(); + } }; -inline bool operator==( const D3D12_HEAP_DESC& l, const D3D12_HEAP_DESC& r ) +inline bool operator==(const D3D12_HEAP_DESC &l, const D3D12_HEAP_DESC &r) { - return l.SizeInBytes == r.SizeInBytes && - l.Properties == r.Properties && - l.Alignment == r.Alignment && - l.Flags == r.Flags; + return l.SizeInBytes == r.SizeInBytes && l.Properties == r.Properties && l.Alignment == r.Alignment + && l.Flags == r.Flags; } -inline bool operator!=( const D3D12_HEAP_DESC& l, const D3D12_HEAP_DESC& r ) -{ return !( l == r ); } +inline bool operator!=(const D3D12_HEAP_DESC &l, const D3D12_HEAP_DESC &r) { return !(l == r); } //------------------------------------------------------------------------------------------------ struct CD3DX12_CLEAR_VALUE : public D3D12_CLEAR_VALUE { - CD3DX12_CLEAR_VALUE() - {} - explicit CD3DX12_CLEAR_VALUE(const D3D12_CLEAR_VALUE &o) : - D3D12_CLEAR_VALUE(o) - {} - CD3DX12_CLEAR_VALUE( - DXGI_FORMAT format, - const FLOAT color[4] ) + CD3DX12_CLEAR_VALUE() {} + explicit CD3DX12_CLEAR_VALUE(const D3D12_CLEAR_VALUE &o) + : D3D12_CLEAR_VALUE(o) + { + } + CD3DX12_CLEAR_VALUE(DXGI_FORMAT format, const FLOAT color[4]) { Format = format; - memcpy( Color, color, sizeof( Color ) ); + memcpy(Color, color, sizeof(Color)); } - CD3DX12_CLEAR_VALUE( - DXGI_FORMAT format, - FLOAT depth, - UINT8 stencil ) + CD3DX12_CLEAR_VALUE(DXGI_FORMAT format, FLOAT depth, UINT8 stencil) { Format = format; /* Use memcpy to preserve NAN values */ - memcpy( &DepthStencil.Depth, &depth, sizeof( depth ) ); + memcpy(&DepthStencil.Depth, &depth, sizeof(depth)); DepthStencil.Stencil = stencil; } - operator const D3D12_CLEAR_VALUE&() const { return *this; } + operator const D3D12_CLEAR_VALUE &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_RANGE : public D3D12_RANGE { - CD3DX12_RANGE() - {} - explicit CD3DX12_RANGE(const D3D12_RANGE &o) : - D3D12_RANGE(o) - {} - CD3DX12_RANGE( - SIZE_T begin, - SIZE_T end ) + CD3DX12_RANGE() {} + explicit CD3DX12_RANGE(const D3D12_RANGE &o) + : D3D12_RANGE(o) + { + } + CD3DX12_RANGE(SIZE_T begin, SIZE_T end) { Begin = begin; - End = end; + End = end; } - operator const D3D12_RANGE&() const { return *this; } + operator const D3D12_RANGE &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_SHADER_BYTECODE : public D3D12_SHADER_BYTECODE { - CD3DX12_SHADER_BYTECODE() - {} - explicit CD3DX12_SHADER_BYTECODE(const D3D12_SHADER_BYTECODE &o) : - D3D12_SHADER_BYTECODE(o) - {} - CD3DX12_SHADER_BYTECODE( - ID3DBlob* pShaderBlob ) + CD3DX12_SHADER_BYTECODE() {} + explicit CD3DX12_SHADER_BYTECODE(const D3D12_SHADER_BYTECODE &o) + : D3D12_SHADER_BYTECODE(o) + { + } + CD3DX12_SHADER_BYTECODE(ID3DBlob *pShaderBlob) { pShaderBytecode = pShaderBlob->GetBufferPointer(); - BytecodeLength = pShaderBlob->GetBufferSize(); + BytecodeLength = pShaderBlob->GetBufferSize(); } - CD3DX12_SHADER_BYTECODE( - void* _pShaderBytecode, - SIZE_T bytecodeLength ) + CD3DX12_SHADER_BYTECODE(void *_pShaderBytecode, SIZE_T bytecodeLength) { pShaderBytecode = _pShaderBytecode; - BytecodeLength = bytecodeLength; + BytecodeLength = bytecodeLength; } - operator const D3D12_SHADER_BYTECODE&() const { return *this; } + operator const D3D12_SHADER_BYTECODE &() const { return *this; } };
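// Typical use of the CD3DX12_SHADER_BYTECODE wrapper above, sketched assuming
// a ComPtr<ID3DBlob> `vs` produced by CompileShader() (not defined here): the
// conversion operator lets the helper drop into the plain D3D12 struct.
//
//     D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};
//     psoDesc.VS = CD3DX12_SHADER_BYTECODE(vs.Get());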
//------------------------------------------------------------------------------------------------ struct CD3DX12_TILED_RESOURCE_COORDINATE : public D3D12_TILED_RESOURCE_COORDINATE { - CD3DX12_TILED_RESOURCE_COORDINATE() - {} - explicit CD3DX12_TILED_RESOURCE_COORDINATE(const D3D12_TILED_RESOURCE_COORDINATE &o) : - D3D12_TILED_RESOURCE_COORDINATE(o) - {} - CD3DX12_TILED_RESOURCE_COORDINATE( - UINT x, - UINT y, - UINT z, - UINT subresource ) + CD3DX12_TILED_RESOURCE_COORDINATE() {} + explicit CD3DX12_TILED_RESOURCE_COORDINATE(const D3D12_TILED_RESOURCE_COORDINATE &o) + : D3D12_TILED_RESOURCE_COORDINATE(o) { - X = x; - Y = y; - Z = z; + } + CD3DX12_TILED_RESOURCE_COORDINATE(UINT x, UINT y, UINT z, UINT subresource) + { + X = x; + Y = y; + Z = z; Subresource = subresource; } - operator const D3D12_TILED_RESOURCE_COORDINATE&() const { return *this; } + operator const D3D12_TILED_RESOURCE_COORDINATE &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_TILE_REGION_SIZE : public D3D12_TILE_REGION_SIZE { - CD3DX12_TILE_REGION_SIZE() - {} - explicit CD3DX12_TILE_REGION_SIZE(const D3D12_TILE_REGION_SIZE &o) : - D3D12_TILE_REGION_SIZE(o) - {} - CD3DX12_TILE_REGION_SIZE( - UINT numTiles, - BOOL useBox, - UINT width, - UINT16 height, - UINT16 depth ) + CD3DX12_TILE_REGION_SIZE() {} + explicit CD3DX12_TILE_REGION_SIZE(const D3D12_TILE_REGION_SIZE &o) + : D3D12_TILE_REGION_SIZE(o) + { + } + CD3DX12_TILE_REGION_SIZE(UINT numTiles, BOOL useBox, UINT width, UINT16 height, UINT16 depth) { NumTiles = numTiles; - UseBox = useBox; - Width = width; - Height = height; - Depth = depth; + UseBox = useBox; + Width = width; + Height = height; + Depth = depth; } - operator const D3D12_TILE_REGION_SIZE&() const { return *this; } + operator const D3D12_TILE_REGION_SIZE &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_SUBRESOURCE_TILING : public D3D12_SUBRESOURCE_TILING { - CD3DX12_SUBRESOURCE_TILING() - {} - explicit CD3DX12_SUBRESOURCE_TILING(const D3D12_SUBRESOURCE_TILING &o) : - D3D12_SUBRESOURCE_TILING(o) - {} - CD3DX12_SUBRESOURCE_TILING( - UINT widthInTiles, - UINT16 heightInTiles, - UINT16 depthInTiles, - UINT startTileIndexInOverallResource ) + CD3DX12_SUBRESOURCE_TILING() {} + explicit CD3DX12_SUBRESOURCE_TILING(const D3D12_SUBRESOURCE_TILING &o) + : D3D12_SUBRESOURCE_TILING(o) { - WidthInTiles = widthInTiles; - HeightInTiles = heightInTiles; - DepthInTiles = depthInTiles; + } + CD3DX12_SUBRESOURCE_TILING(UINT widthInTiles, + UINT16 heightInTiles, + UINT16 depthInTiles, + UINT startTileIndexInOverallResource) + { + WidthInTiles = widthInTiles; + HeightInTiles = heightInTiles; + DepthInTiles = depthInTiles; StartTileIndexInOverallResource = startTileIndexInOverallResource; } - operator const D3D12_SUBRESOURCE_TILING&() const { return *this; } + operator const D3D12_SUBRESOURCE_TILING &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_TILE_SHAPE : public D3D12_TILE_SHAPE { - CD3DX12_TILE_SHAPE() - {} - explicit CD3DX12_TILE_SHAPE(const D3D12_TILE_SHAPE &o) : - D3D12_TILE_SHAPE(o) - {} - CD3DX12_TILE_SHAPE( - UINT widthInTexels, - UINT heightInTexels, - UINT depthInTexels ) + CD3DX12_TILE_SHAPE() {} + explicit CD3DX12_TILE_SHAPE(const D3D12_TILE_SHAPE &o) + : D3D12_TILE_SHAPE(o) { - WidthInTexels = widthInTexels; - HeightInTexels = 
heightInTexels; - DepthInTexels = depthInTexels; } - operator const D3D12_TILE_SHAPE&() const { return *this; } + CD3DX12_TILE_SHAPE(UINT widthInTexels, UINT heightInTexels, UINT depthInTexels) + { + WidthInTexels = widthInTexels; + HeightInTexels = heightInTexels; + DepthInTexels = depthInTexels; + } + operator const D3D12_TILE_SHAPE &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_RESOURCE_BARRIER : public D3D12_RESOURCE_BARRIER { - CD3DX12_RESOURCE_BARRIER() - {} - explicit CD3DX12_RESOURCE_BARRIER(const D3D12_RESOURCE_BARRIER &o) : - D3D12_RESOURCE_BARRIER(o) - {} - static inline CD3DX12_RESOURCE_BARRIER Transition( - _In_ ID3D12Resource* pResource, - D3D12_RESOURCE_STATES stateBefore, - D3D12_RESOURCE_STATES stateAfter, - UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES, - D3D12_RESOURCE_BARRIER_FLAGS flags = D3D12_RESOURCE_BARRIER_FLAG_NONE) + CD3DX12_RESOURCE_BARRIER() {} + explicit CD3DX12_RESOURCE_BARRIER(const D3D12_RESOURCE_BARRIER &o) + : D3D12_RESOURCE_BARRIER(o) + { + } + static inline CD3DX12_RESOURCE_BARRIER + Transition(_In_ ID3D12Resource *pResource, + D3D12_RESOURCE_STATES stateBefore, + D3D12_RESOURCE_STATES stateAfter, + UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES, + D3D12_RESOURCE_BARRIER_FLAGS flags = D3D12_RESOURCE_BARRIER_FLAG_NONE) { CD3DX12_RESOURCE_BARRIER result; ZeroMemory(&result, sizeof(result)); D3D12_RESOURCE_BARRIER &barrier = result; - result.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; - result.Flags = flags; - barrier.Transition.pResource = pResource; - barrier.Transition.StateBefore = stateBefore; - barrier.Transition.StateAfter = stateAfter; - barrier.Transition.Subresource = subresource; + result.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + result.Flags = flags; + barrier.Transition.pResource = pResource; + barrier.Transition.StateBefore = stateBefore; + barrier.Transition.StateAfter = stateAfter; + barrier.Transition.Subresource = subresource; return result; } - static inline CD3DX12_RESOURCE_BARRIER Aliasing( - _In_ ID3D12Resource* pResourceBefore, - _In_ ID3D12Resource* pResourceAfter) + static inline CD3DX12_RESOURCE_BARRIER Aliasing(_In_ ID3D12Resource *pResourceBefore, + _In_ ID3D12Resource *pResourceAfter) { CD3DX12_RESOURCE_BARRIER result; ZeroMemory(&result, sizeof(result)); - D3D12_RESOURCE_BARRIER &barrier = result; - result.Type = D3D12_RESOURCE_BARRIER_TYPE_ALIASING; + D3D12_RESOURCE_BARRIER &barrier = result; + result.Type = D3D12_RESOURCE_BARRIER_TYPE_ALIASING; barrier.Aliasing.pResourceBefore = pResourceBefore; - barrier.Aliasing.pResourceAfter = pResourceAfter; + barrier.Aliasing.pResourceAfter = pResourceAfter; return result; } - static inline CD3DX12_RESOURCE_BARRIER UAV( - _In_ ID3D12Resource* pResource) + static inline CD3DX12_RESOURCE_BARRIER UAV(_In_ ID3D12Resource *pResource) { CD3DX12_RESOURCE_BARRIER result; ZeroMemory(&result, sizeof(result)); D3D12_RESOURCE_BARRIER &barrier = result; - result.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; - barrier.UAV.pResource = pResource; + result.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; + barrier.UAV.pResource = pResource; return result; } - operator const D3D12_RESOURCE_BARRIER&() const { return *this; } + operator const D3D12_RESOURCE_BARRIER &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_PACKED_MIP_INFO : public D3D12_PACKED_MIP_INFO { - CD3DX12_PACKED_MIP_INFO() - {} 
- explicit CD3DX12_PACKED_MIP_INFO(const D3D12_PACKED_MIP_INFO &o) : - D3D12_PACKED_MIP_INFO(o) - {} - CD3DX12_PACKED_MIP_INFO( - UINT8 numStandardMips, - UINT8 numPackedMips, - UINT numTilesForPackedMips, - UINT startTileIndexInOverallResource ) + CD3DX12_PACKED_MIP_INFO() {} + explicit CD3DX12_PACKED_MIP_INFO(const D3D12_PACKED_MIP_INFO &o) + : D3D12_PACKED_MIP_INFO(o) { - NumStandardMips = numStandardMips; - NumPackedMips = numPackedMips; - NumTilesForPackedMips = numTilesForPackedMips; + } + CD3DX12_PACKED_MIP_INFO(UINT8 numStandardMips, + UINT8 numPackedMips, + UINT numTilesForPackedMips, + UINT startTileIndexInOverallResource) + { + NumStandardMips = numStandardMips; + NumPackedMips = numPackedMips; + NumTilesForPackedMips = numTilesForPackedMips; StartTileIndexInOverallResource = startTileIndexInOverallResource; } - operator const D3D12_PACKED_MIP_INFO&() const { return *this; } + operator const D3D12_PACKED_MIP_INFO &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_SUBRESOURCE_FOOTPRINT : public D3D12_SUBRESOURCE_FOOTPRINT { - CD3DX12_SUBRESOURCE_FOOTPRINT() - {} - explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_SUBRESOURCE_FOOTPRINT &o) : - D3D12_SUBRESOURCE_FOOTPRINT(o) - {} - CD3DX12_SUBRESOURCE_FOOTPRINT( - DXGI_FORMAT format, - UINT width, - UINT height, - UINT depth, - UINT rowPitch ) + CD3DX12_SUBRESOURCE_FOOTPRINT() {} + explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_SUBRESOURCE_FOOTPRINT &o) + : D3D12_SUBRESOURCE_FOOTPRINT(o) { - Format = format; - Width = width; - Height = height; - Depth = depth; + } + CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT format, UINT width, UINT height, UINT depth, UINT rowPitch) + { + Format = format; + Width = width; + Height = height; + Depth = depth; RowPitch = rowPitch; } - explicit CD3DX12_SUBRESOURCE_FOOTPRINT( - const D3D12_RESOURCE_DESC& resDesc, - UINT rowPitch ) + explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_RESOURCE_DESC &resDesc, UINT rowPitch) { - Format = resDesc.Format; - Width = UINT( resDesc.Width ); - Height = resDesc.Height; - Depth = (resDesc.Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? resDesc.DepthOrArraySize : 1); + Format = resDesc.Format; + Width = UINT(resDesc.Width); + Height = resDesc.Height; + Depth = (resDesc.Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? 
resDesc.DepthOrArraySize : 1); RowPitch = rowPitch; } - operator const D3D12_SUBRESOURCE_FOOTPRINT&() const { return *this; } + operator const D3D12_SUBRESOURCE_FOOTPRINT &() const { return *this; } }; //------------------------------------------------------------------------------------------------ struct CD3DX12_TEXTURE_COPY_LOCATION : public D3D12_TEXTURE_COPY_LOCATION -{ - CD3DX12_TEXTURE_COPY_LOCATION() - {} - explicit CD3DX12_TEXTURE_COPY_LOCATION(const D3D12_TEXTURE_COPY_LOCATION &o) : - D3D12_TEXTURE_COPY_LOCATION(o) - {} - CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource* pRes) { pResource = pRes; } - CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource* pRes, D3D12_PLACED_SUBRESOURCE_FOOTPRINT const& Footprint) +{ + CD3DX12_TEXTURE_COPY_LOCATION() {} + explicit CD3DX12_TEXTURE_COPY_LOCATION(const D3D12_TEXTURE_COPY_LOCATION &o) + : D3D12_TEXTURE_COPY_LOCATION(o) { - pResource = pRes; - Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; + } + CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource *pRes) { pResource = pRes; } + CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource *pRes, D3D12_PLACED_SUBRESOURCE_FOOTPRINT const &Footprint) + { + pResource = pRes; + Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; PlacedFootprint = Footprint; } - CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource* pRes, UINT Sub) + CD3DX12_TEXTURE_COPY_LOCATION(ID3D12Resource *pRes, UINT Sub) { - pResource = pRes; - Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; + pResource = pRes; + Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; SubresourceIndex = Sub; } -}; +}; //------------------------------------------------------------------------------------------------ struct CD3DX12_DESCRIPTOR_RANGE : public D3D12_DESCRIPTOR_RANGE { - CD3DX12_DESCRIPTOR_RANGE() { } - explicit CD3DX12_DESCRIPTOR_RANGE(const D3D12_DESCRIPTOR_RANGE &o) : - D3D12_DESCRIPTOR_RANGE(o) - {} - CD3DX12_DESCRIPTOR_RANGE( - D3D12_DESCRIPTOR_RANGE_TYPE rangeType, - UINT numDescriptors, - UINT baseShaderRegister, - UINT registerSpace = 0, - UINT offsetInDescriptorsFromTableStart = - D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) + CD3DX12_DESCRIPTOR_RANGE() {} + explicit CD3DX12_DESCRIPTOR_RANGE(const D3D12_DESCRIPTOR_RANGE &o) + : D3D12_DESCRIPTOR_RANGE(o) + { + } + CD3DX12_DESCRIPTOR_RANGE(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, + UINT numDescriptors, + UINT baseShaderRegister, + UINT registerSpace = 0, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) { Init(rangeType, numDescriptors, baseShaderRegister, registerSpace, offsetInDescriptorsFromTableStart); } - - inline void Init( - D3D12_DESCRIPTOR_RANGE_TYPE rangeType, - UINT numDescriptors, - UINT baseShaderRegister, - UINT registerSpace = 0, - UINT offsetInDescriptorsFromTableStart = - D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) + + inline void Init(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, + UINT numDescriptors, + UINT baseShaderRegister, + UINT registerSpace = 0, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) { Init(*this, rangeType, numDescriptors, baseShaderRegister, registerSpace, offsetInDescriptorsFromTableStart); } - - static inline void Init( - _Out_ D3D12_DESCRIPTOR_RANGE &range, - D3D12_DESCRIPTOR_RANGE_TYPE rangeType, - UINT numDescriptors, - UINT baseShaderRegister, - UINT registerSpace = 0, - UINT offsetInDescriptorsFromTableStart = - D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) + + static inline void Init(_Out_ D3D12_DESCRIPTOR_RANGE &range, + D3D12_DESCRIPTOR_RANGE_TYPE rangeType, + UINT numDescriptors, + UINT baseShaderRegister, + UINT registerSpace = 0, + 
UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) { - range.RangeType = rangeType; - range.NumDescriptors = numDescriptors; - range.BaseShaderRegister = baseShaderRegister; - range.RegisterSpace = registerSpace; + range.RangeType = rangeType; + range.NumDescriptors = numDescriptors; + range.BaseShaderRegister = baseShaderRegister; + range.RegisterSpace = registerSpace; range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart; } }; @@ -734,30 +669,28 @@ struct CD3DX12_DESCRIPTOR_RANGE : public D3D12_DESCRIPTOR_RANGE struct CD3DX12_ROOT_DESCRIPTOR_TABLE : public D3D12_ROOT_DESCRIPTOR_TABLE { CD3DX12_ROOT_DESCRIPTOR_TABLE() {} - explicit CD3DX12_ROOT_DESCRIPTOR_TABLE(const D3D12_ROOT_DESCRIPTOR_TABLE &o) : - D3D12_ROOT_DESCRIPTOR_TABLE(o) - {} - CD3DX12_ROOT_DESCRIPTOR_TABLE( - UINT numDescriptorRanges, - _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* _pDescriptorRanges) + explicit CD3DX12_ROOT_DESCRIPTOR_TABLE(const D3D12_ROOT_DESCRIPTOR_TABLE &o) + : D3D12_ROOT_DESCRIPTOR_TABLE(o) + { + } + CD3DX12_ROOT_DESCRIPTOR_TABLE(UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) { Init(numDescriptorRanges, _pDescriptorRanges); } - - inline void Init( - UINT numDescriptorRanges, - _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* _pDescriptorRanges) + + inline void Init(UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) { Init(*this, numDescriptorRanges, _pDescriptorRanges); } - - static inline void Init( - _Out_ D3D12_ROOT_DESCRIPTOR_TABLE &rootDescriptorTable, - UINT numDescriptorRanges, - _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* _pDescriptorRanges) + + static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR_TABLE &rootDescriptorTable, + UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) { rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges; - rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges; + rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges; } }; @@ -765,34 +698,26 @@ struct CD3DX12_ROOT_DESCRIPTOR_TABLE : public D3D12_ROOT_DESCRIPTOR_TABLE struct CD3DX12_ROOT_CONSTANTS : public D3D12_ROOT_CONSTANTS { CD3DX12_ROOT_CONSTANTS() {} - explicit CD3DX12_ROOT_CONSTANTS(const D3D12_ROOT_CONSTANTS &o) : - D3D12_ROOT_CONSTANTS(o) - {} - CD3DX12_ROOT_CONSTANTS( - UINT num32BitValues, - UINT shaderRegister, - UINT registerSpace = 0) + explicit CD3DX12_ROOT_CONSTANTS(const D3D12_ROOT_CONSTANTS &o) + : D3D12_ROOT_CONSTANTS(o) + { + } + CD3DX12_ROOT_CONSTANTS(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) { Init(num32BitValues, shaderRegister, registerSpace); } - - inline void Init( - UINT num32BitValues, - UINT shaderRegister, - UINT registerSpace = 0) + + inline void Init(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) { Init(*this, num32BitValues, shaderRegister, registerSpace); } - - static inline void Init( - _Out_ D3D12_ROOT_CONSTANTS &rootConstants, - UINT num32BitValues, - UINT shaderRegister, - UINT registerSpace = 0) + + static inline void + Init(_Out_ D3D12_ROOT_CONSTANTS &rootConstants, UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) { rootConstants.Num32BitValues = num32BitValues; rootConstants.ShaderRegister = shaderRegister; - rootConstants.RegisterSpace = registerSpace; + rootConstants.RegisterSpace = registerSpace; } }; @@ 
-800,27 +725,18 @@ struct CD3DX12_ROOT_CONSTANTS : public D3D12_ROOT_CONSTANTS struct CD3DX12_ROOT_DESCRIPTOR : public D3D12_ROOT_DESCRIPTOR { CD3DX12_ROOT_DESCRIPTOR() {} - explicit CD3DX12_ROOT_DESCRIPTOR(const D3D12_ROOT_DESCRIPTOR &o) : - D3D12_ROOT_DESCRIPTOR(o) - {} - CD3DX12_ROOT_DESCRIPTOR( - UINT shaderRegister, - UINT registerSpace = 0) + explicit CD3DX12_ROOT_DESCRIPTOR(const D3D12_ROOT_DESCRIPTOR &o) + : D3D12_ROOT_DESCRIPTOR(o) { - Init(shaderRegister, registerSpace); } - - inline void Init( - UINT shaderRegister, - UINT registerSpace = 0) - { - Init(*this, shaderRegister, registerSpace); - } - + CD3DX12_ROOT_DESCRIPTOR(UINT shaderRegister, UINT registerSpace = 0) { Init(shaderRegister, registerSpace); } + + inline void Init(UINT shaderRegister, UINT registerSpace = 0) { Init(*this, shaderRegister, registerSpace); } + static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR &table, UINT shaderRegister, UINT registerSpace = 0) { table.ShaderRegister = shaderRegister; - table.RegisterSpace = registerSpace; + table.RegisterSpace = registerSpace; } }; @@ -828,103 +744,95 @@ struct CD3DX12_ROOT_DESCRIPTOR : public D3D12_ROOT_DESCRIPTOR struct CD3DX12_ROOT_PARAMETER : public D3D12_ROOT_PARAMETER { CD3DX12_ROOT_PARAMETER() {} - explicit CD3DX12_ROOT_PARAMETER(const D3D12_ROOT_PARAMETER &o) : - D3D12_ROOT_PARAMETER(o) - {} - - static inline void InitAsDescriptorTable( - _Out_ D3D12_ROOT_PARAMETER &rootParam, - UINT numDescriptorRanges, - _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* pDescriptorRanges, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + explicit CD3DX12_ROOT_PARAMETER(const D3D12_ROOT_PARAMETER &o) + : D3D12_ROOT_PARAMETER(o) { - rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + } + + static inline void InitAsDescriptorTable(_Out_ D3D12_ROOT_PARAMETER &rootParam, + UINT numDescriptorRanges, + _In_reads_(numDescriptorRanges) + const D3D12_DESCRIPTOR_RANGE *pDescriptorRanges, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParam.ShaderVisibility = visibility; CD3DX12_ROOT_DESCRIPTOR_TABLE::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges); } - static inline void InitAsConstants( - _Out_ D3D12_ROOT_PARAMETER &rootParam, - UINT num32BitValues, - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + static inline void InitAsConstants(_Out_ D3D12_ROOT_PARAMETER &rootParam, + UINT num32BitValues, + UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { - rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; rootParam.ShaderVisibility = visibility; CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace); } - static inline void InitAsConstantBufferView( - _Out_ D3D12_ROOT_PARAMETER &rootParam, - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + static inline void InitAsConstantBufferView(_Out_ D3D12_ROOT_PARAMETER &rootParam, + UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { - rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParam.ShaderVisibility = 
visibility; CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace); } - static inline void InitAsShaderResourceView( - _Out_ D3D12_ROOT_PARAMETER &rootParam, - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + static inline void InitAsShaderResourceView(_Out_ D3D12_ROOT_PARAMETER &rootParam, + UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { - rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV; + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV; rootParam.ShaderVisibility = visibility; CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace); } - static inline void InitAsUnorderedAccessView( - _Out_ D3D12_ROOT_PARAMETER &rootParam, - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + static inline void InitAsUnorderedAccessView(_Out_ D3D12_ROOT_PARAMETER &rootParam, + UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { - rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV; + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV; rootParam.ShaderVisibility = visibility; CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace); } - - inline void InitAsDescriptorTable( - UINT numDescriptorRanges, - _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE* pDescriptorRanges, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + + inline void InitAsDescriptorTable(UINT numDescriptorRanges, + _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *pDescriptorRanges, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { InitAsDescriptorTable(*this, numDescriptorRanges, pDescriptorRanges, visibility); } - - inline void InitAsConstants( - UINT num32BitValues, - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + + inline void InitAsConstants(UINT num32BitValues, + UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { InitAsConstants(*this, num32BitValues, shaderRegister, registerSpace, visibility); } - inline void InitAsConstantBufferView( - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + inline void InitAsConstantBufferView(UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { InitAsConstantBufferView(*this, shaderRegister, registerSpace, visibility); } - inline void InitAsShaderResourceView( - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + inline void InitAsShaderResourceView(UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { InitAsShaderResourceView(*this, shaderRegister, registerSpace, visibility); } - inline void InitAsUnorderedAccessView( - UINT shaderRegister, - UINT registerSpace = 0, - D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) + inline void InitAsUnorderedAccessView(UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) { InitAsUnorderedAccessView(*this, shaderRegister, registerSpace, visibility); } @@ -934,148 +842,139 
@@ struct CD3DX12_ROOT_PARAMETER : public D3D12_ROOT_PARAMETER struct CD3DX12_STATIC_SAMPLER_DESC : public D3D12_STATIC_SAMPLER_DESC { CD3DX12_STATIC_SAMPLER_DESC() {} - explicit CD3DX12_STATIC_SAMPLER_DESC(const D3D12_STATIC_SAMPLER_DESC &o) : - D3D12_STATIC_SAMPLER_DESC(o) - {} - CD3DX12_STATIC_SAMPLER_DESC( - UINT shaderRegister, - D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, - D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - FLOAT mipLODBias = 0, - UINT maxAnisotropy = 16, - D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, - D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, - FLOAT minLOD = 0.f, - FLOAT maxLOD = D3D12_FLOAT32_MAX, - D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, - UINT registerSpace = 0) + explicit CD3DX12_STATIC_SAMPLER_DESC(const D3D12_STATIC_SAMPLER_DESC &o) + : D3D12_STATIC_SAMPLER_DESC(o) { - Init( - shaderRegister, - filter, - addressU, - addressV, - addressW, - mipLODBias, - maxAnisotropy, - comparisonFunc, - borderColor, - minLOD, - maxLOD, - shaderVisibility, - registerSpace); } - - static inline void Init( - _Out_ D3D12_STATIC_SAMPLER_DESC &samplerDesc, - UINT shaderRegister, - D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, - D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - FLOAT mipLODBias = 0, - UINT maxAnisotropy = 16, - D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, - D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, - FLOAT minLOD = 0.f, - FLOAT maxLOD = D3D12_FLOAT32_MAX, - D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, - UINT registerSpace = 0) + CD3DX12_STATIC_SAMPLER_DESC(UINT shaderRegister, + D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, + D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + FLOAT mipLODBias = 0, + UINT maxAnisotropy = 16, + D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, + D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, + FLOAT minLOD = 0.f, + FLOAT maxLOD = D3D12_FLOAT32_MAX, + D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, + UINT registerSpace = 0) { - samplerDesc.ShaderRegister = shaderRegister; - samplerDesc.Filter = filter; - samplerDesc.AddressU = addressU; - samplerDesc.AddressV = addressV; - samplerDesc.AddressW = addressW; - samplerDesc.MipLODBias = mipLODBias; - samplerDesc.MaxAnisotropy = maxAnisotropy; - samplerDesc.ComparisonFunc = comparisonFunc; - samplerDesc.BorderColor = borderColor; - samplerDesc.MinLOD = minLOD; - samplerDesc.MaxLOD = maxLOD; + Init(shaderRegister, + filter, + addressU, + addressV, + addressW, + mipLODBias, + maxAnisotropy, + comparisonFunc, + borderColor, + minLOD, + maxLOD, + shaderVisibility, + registerSpace); + } + + static inline void Init(_Out_ D3D12_STATIC_SAMPLER_DESC &samplerDesc, + UINT shaderRegister, + D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, + D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + 
D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + FLOAT mipLODBias = 0, + UINT maxAnisotropy = 16, + D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, + D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, + FLOAT minLOD = 0.f, + FLOAT maxLOD = D3D12_FLOAT32_MAX, + D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, + UINT registerSpace = 0) + { + samplerDesc.ShaderRegister = shaderRegister; + samplerDesc.Filter = filter; + samplerDesc.AddressU = addressU; + samplerDesc.AddressV = addressV; + samplerDesc.AddressW = addressW; + samplerDesc.MipLODBias = mipLODBias; + samplerDesc.MaxAnisotropy = maxAnisotropy; + samplerDesc.ComparisonFunc = comparisonFunc; + samplerDesc.BorderColor = borderColor; + samplerDesc.MinLOD = minLOD; + samplerDesc.MaxLOD = maxLOD; samplerDesc.ShaderVisibility = shaderVisibility; - samplerDesc.RegisterSpace = registerSpace; + samplerDesc.RegisterSpace = registerSpace; } - inline void Init( - UINT shaderRegister, - D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, - D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, - FLOAT mipLODBias = 0, - UINT maxAnisotropy = 16, - D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, - D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, - FLOAT minLOD = 0.f, - FLOAT maxLOD = D3D12_FLOAT32_MAX, - D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, - UINT registerSpace = 0) + inline void Init(UINT shaderRegister, + D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, + D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + FLOAT mipLODBias = 0, + UINT maxAnisotropy = 16, + D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, + D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, + FLOAT minLOD = 0.f, + FLOAT maxLOD = D3D12_FLOAT32_MAX, + D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, + UINT registerSpace = 0) { - Init( - *this, - shaderRegister, - filter, - addressU, - addressV, - addressW, - mipLODBias, - maxAnisotropy, - comparisonFunc, - borderColor, - minLOD, - maxLOD, - shaderVisibility, - registerSpace); + Init(*this, + shaderRegister, + filter, + addressU, + addressV, + addressW, + mipLODBias, + maxAnisotropy, + comparisonFunc, + borderColor, + minLOD, + maxLOD, + shaderVisibility, + registerSpace); } - }; //------------------------------------------------------------------------------------------------ struct CD3DX12_ROOT_SIGNATURE_DESC : public D3D12_ROOT_SIGNATURE_DESC { CD3DX12_ROOT_SIGNATURE_DESC() {} - explicit CD3DX12_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o) : - D3D12_ROOT_SIGNATURE_DESC(o) - {} - CD3DX12_ROOT_SIGNATURE_DESC( - UINT numParameters, - _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters, - UINT numStaticSamplers = 0, - _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL, - D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) + explicit CD3DX12_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o) + : 
D3D12_ROOT_SIGNATURE_DESC(o) + { + } + CD3DX12_ROOT_SIGNATURE_DESC(UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) + const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = NULL, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) { Init(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); } - CD3DX12_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT) - { - Init(0, NULL, 0, NULL, D3D12_ROOT_SIGNATURE_FLAG_NONE); - } - - inline void Init( - UINT numParameters, - _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters, - UINT numStaticSamplers = 0, - _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL, - D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) + CD3DX12_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT) { Init(0, NULL, 0, NULL, D3D12_ROOT_SIGNATURE_FLAG_NONE); } + + inline void Init(UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = NULL, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) { Init(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); } - static inline void Init( - _Out_ D3D12_ROOT_SIGNATURE_DESC &desc, - UINT numParameters, - _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER* _pParameters, - UINT numStaticSamplers = 0, - _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC* _pStaticSamplers = NULL, - D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) + static inline void Init(_Out_ D3D12_ROOT_SIGNATURE_DESC &desc, + UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = NULL, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) { - desc.NumParameters = numParameters; - desc.pParameters = _pParameters; + desc.NumParameters = numParameters; + desc.pParameters = _pParameters; desc.NumStaticSamplers = numStaticSamplers; - desc.pStaticSamplers = _pStaticSamplers; - desc.Flags = flags; + desc.pStaticSamplers = _pStaticSamplers; + desc.Flags = flags; } }; @@ -1083,58 +982,61 @@ struct CD3DX12_ROOT_SIGNATURE_DESC : public D3D12_ROOT_SIGNATURE_DESC struct CD3DX12_CPU_DESCRIPTOR_HANDLE : public D3D12_CPU_DESCRIPTOR_HANDLE { CD3DX12_CPU_DESCRIPTOR_HANDLE() {} - explicit CD3DX12_CPU_DESCRIPTOR_HANDLE(const D3D12_CPU_DESCRIPTOR_HANDLE &o) : - D3D12_CPU_DESCRIPTOR_HANDLE(o) - {} + explicit CD3DX12_CPU_DESCRIPTOR_HANDLE(const D3D12_CPU_DESCRIPTOR_HANDLE &o) + : D3D12_CPU_DESCRIPTOR_HANDLE(o) + { + } CD3DX12_CPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) { ptr = 0; } CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize) { InitOffsetted(other, offsetScaledByIncrementSize); } - CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors, UINT descriptorIncrementSize) + CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, + INT offsetInDescriptors, + UINT descriptorIncrementSize) { InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize); } - CD3DX12_CPU_DESCRIPTOR_HANDLE& Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) - { + CD3DX12_CPU_DESCRIPTOR_HANDLE 
&Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) + { ptr += offsetInDescriptors * descriptorIncrementSize; return *this; } - CD3DX12_CPU_DESCRIPTOR_HANDLE& Offset(INT offsetScaledByIncrementSize) - { + CD3DX12_CPU_DESCRIPTOR_HANDLE &Offset(INT offsetScaledByIncrementSize) + { ptr += offsetScaledByIncrementSize; return *this; } - bool operator==(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE& other) - { - return (ptr == other.ptr); - } - bool operator!=(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE& other) - { - return (ptr != other.ptr); - } + bool operator==(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other) { return (ptr == other.ptr); } + bool operator!=(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other) { return (ptr != other.ptr); } CD3DX12_CPU_DESCRIPTOR_HANDLE &operator=(const D3D12_CPU_DESCRIPTOR_HANDLE &other) { ptr = other.ptr; return *this; } - + inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) { InitOffsetted(*this, base, offsetScaledByIncrementSize); } - - inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize) + + inline void + InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize) { InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize); } - - static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) + + static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, + INT offsetScaledByIncrementSize) { handle.ptr = base.ptr + offsetScaledByIncrementSize; } - - static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize) + + static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, + INT offsetInDescriptors, + UINT descriptorIncrementSize) { handle.ptr = base.ptr + offsetInDescriptors * descriptorIncrementSize; } @@ -1144,87 +1046,91 @@ struct CD3DX12_CPU_DESCRIPTOR_HANDLE : public D3D12_CPU_DESCRIPTOR_HANDLE struct CD3DX12_GPU_DESCRIPTOR_HANDLE : public D3D12_GPU_DESCRIPTOR_HANDLE { CD3DX12_GPU_DESCRIPTOR_HANDLE() {} - explicit CD3DX12_GPU_DESCRIPTOR_HANDLE(const D3D12_GPU_DESCRIPTOR_HANDLE &o) : - D3D12_GPU_DESCRIPTOR_HANDLE(o) - {} + explicit CD3DX12_GPU_DESCRIPTOR_HANDLE(const D3D12_GPU_DESCRIPTOR_HANDLE &o) + : D3D12_GPU_DESCRIPTOR_HANDLE(o) + { + } CD3DX12_GPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) { ptr = 0; } CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize) { InitOffsetted(other, offsetScaledByIncrementSize); } - CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors, UINT descriptorIncrementSize) + CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, + INT offsetInDescriptors, + UINT descriptorIncrementSize) { InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize); } - CD3DX12_GPU_DESCRIPTOR_HANDLE& Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) - { + CD3DX12_GPU_DESCRIPTOR_HANDLE &Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) + { ptr += offsetInDescriptors * descriptorIncrementSize; return *this; } - CD3DX12_GPU_DESCRIPTOR_HANDLE& Offset(INT offsetScaledByIncrementSize) - { 
+ CD3DX12_GPU_DESCRIPTOR_HANDLE &Offset(INT offsetScaledByIncrementSize) + { ptr += offsetScaledByIncrementSize; return *this; } - inline bool operator==(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE& other) - { - return (ptr == other.ptr); - } - inline bool operator!=(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE& other) - { - return (ptr != other.ptr); - } + inline bool operator==(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other) { return (ptr == other.ptr); } + inline bool operator!=(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other) { return (ptr != other.ptr); } CD3DX12_GPU_DESCRIPTOR_HANDLE &operator=(const D3D12_GPU_DESCRIPTOR_HANDLE &other) { ptr = other.ptr; return *this; } - + inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) { InitOffsetted(*this, base, offsetScaledByIncrementSize); } - - inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize) + + inline void + InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize) { InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize); } - - static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) + + static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, + INT offsetScaledByIncrementSize) { handle.ptr = base.ptr + offsetScaledByIncrementSize; } - - static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, UINT descriptorIncrementSize) + + static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, + INT offsetInDescriptors, + UINT descriptorIncrementSize) { handle.ptr = base.ptr + offsetInDescriptors * descriptorIncrementSize; } }; //------------------------------------------------------------------------------------------------ -inline UINT D3D12CalcSubresource( UINT MipSlice, UINT ArraySlice, UINT PlaneSlice, UINT MipLevels, UINT ArraySize ) -{ - return MipSlice + ArraySlice * MipLevels + PlaneSlice * MipLevels * ArraySize; +inline UINT D3D12CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice, UINT MipLevels, UINT ArraySize) +{ + return MipSlice + ArraySlice * MipLevels + PlaneSlice * MipLevels * ArraySize; } //------------------------------------------------------------------------------------------------ template <typename T, typename U, typename V> -inline void D3D12DecomposeSubresource( UINT Subresource, UINT MipLevels, UINT ArraySize, _Out_ T& MipSlice, _Out_ U& ArraySlice, _Out_ V& PlaneSlice ) +inline void D3D12DecomposeSubresource(UINT Subresource, + UINT MipLevels, + UINT ArraySize, + _Out_ T &MipSlice, + _Out_ U &ArraySlice, + _Out_ V &PlaneSlice) { - MipSlice = static_cast<T>(Subresource % MipLevels); + MipSlice = static_cast<T>(Subresource % MipLevels); ArraySlice = static_cast<U>((Subresource / MipLevels) % ArraySize); PlaneSlice = static_cast<V>(Subresource / (MipLevels * ArraySize)); } //------------------------------------------------------------------------------------------------ -inline UINT8 D3D12GetFormatPlaneCount( - _In_ ID3D12Device* pDevice, - DXGI_FORMAT Format - ) +inline UINT8 D3D12GetFormatPlaneCount(_In_ ID3D12Device *pDevice, DXGI_FORMAT Format) { D3D12_FEATURE_DATA_FORMAT_INFO formatInfo = {Format}; - if 
(FAILED(pDevice->CheckFeatureSupport(D3D12_FEATURE_FORMAT_INFO, &formatInfo, sizeof(formatInfo)))) - { + if (FAILED(pDevice->CheckFeatureSupport(D3D12_FEATURE_FORMAT_INFO, &formatInfo, sizeof(formatInfo)))) { return 0; } return formatInfo.PlaneCount; @@ -1233,211 +1139,220 @@ inline UINT8 D3D12GetFormatPlaneCount( //------------------------------------------------------------------------------------------------ struct CD3DX12_RESOURCE_DESC : public D3D12_RESOURCE_DESC { - CD3DX12_RESOURCE_DESC() - {} - explicit CD3DX12_RESOURCE_DESC( const D3D12_RESOURCE_DESC& o ) : - D3D12_RESOURCE_DESC( o ) - {} - CD3DX12_RESOURCE_DESC( - D3D12_RESOURCE_DIMENSION dimension, - UINT64 alignment, - UINT64 width, - UINT height, - UINT16 depthOrArraySize, - UINT16 mipLevels, - DXGI_FORMAT format, - UINT sampleCount, - UINT sampleQuality, - D3D12_TEXTURE_LAYOUT layout, - D3D12_RESOURCE_FLAGS flags ) + CD3DX12_RESOURCE_DESC() {} + explicit CD3DX12_RESOURCE_DESC(const D3D12_RESOURCE_DESC &o) + : D3D12_RESOURCE_DESC(o) { - Dimension = dimension; - Alignment = alignment; - Width = width; - Height = height; - DepthOrArraySize = depthOrArraySize; - MipLevels = mipLevels; - Format = format; - SampleDesc.Count = sampleCount; + } + CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION dimension, + UINT64 alignment, + UINT64 width, + UINT height, + UINT16 depthOrArraySize, + UINT16 mipLevels, + DXGI_FORMAT format, + UINT sampleCount, + UINT sampleQuality, + D3D12_TEXTURE_LAYOUT layout, + D3D12_RESOURCE_FLAGS flags) + { + Dimension = dimension; + Alignment = alignment; + Width = width; + Height = height; + DepthOrArraySize = depthOrArraySize; + MipLevels = mipLevels; + Format = format; + SampleDesc.Count = sampleCount; SampleDesc.Quality = sampleQuality; - Layout = layout; - Flags = flags; + Layout = layout; + Flags = flags; } - static inline CD3DX12_RESOURCE_DESC Buffer( - const D3D12_RESOURCE_ALLOCATION_INFO& resAllocInfo, - D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE ) + static inline CD3DX12_RESOURCE_DESC Buffer(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE) { - return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_BUFFER, resAllocInfo.Alignment, resAllocInfo.SizeInBytes, - 1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags ); + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_BUFFER, + resAllocInfo.Alignment, + resAllocInfo.SizeInBytes, + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + 1, + 0, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + flags); } - static inline CD3DX12_RESOURCE_DESC Buffer( - UINT64 width, - D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, - UINT64 alignment = 0 ) + static inline CD3DX12_RESOURCE_DESC + Buffer(UINT64 width, D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, UINT64 alignment = 0) { - return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_BUFFER, alignment, width, 1, 1, 1, - DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags ); + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_BUFFER, + alignment, + width, + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + 1, + 0, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + flags); } - static inline CD3DX12_RESOURCE_DESC Tex1D( - DXGI_FORMAT format, - UINT64 width, - UINT16 arraySize = 1, - UINT16 mipLevels = 0, - D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, - D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, - UINT64 alignment = 0 ) + static inline CD3DX12_RESOURCE_DESC Tex1D(DXGI_FORMAT format, + UINT64 width, + UINT16 arraySize = 1, + UINT16 
mipLevels = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) { - return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, - mipLevels, format, 1, 0, layout, flags ); + return CD3DX12_RESOURCE_DESC( + D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, mipLevels, format, 1, 0, layout, flags); } - static inline CD3DX12_RESOURCE_DESC Tex2D( - DXGI_FORMAT format, - UINT64 width, - UINT height, - UINT16 arraySize = 1, - UINT16 mipLevels = 0, - UINT sampleCount = 1, - UINT sampleQuality = 0, - D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, - D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, - UINT64 alignment = 0 ) + static inline CD3DX12_RESOURCE_DESC Tex2D(DXGI_FORMAT format, + UINT64 width, + UINT height, + UINT16 arraySize = 1, + UINT16 mipLevels = 0, + UINT sampleCount = 1, + UINT sampleQuality = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) { - return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_TEXTURE2D, alignment, width, height, arraySize, - mipLevels, format, sampleCount, sampleQuality, layout, flags ); + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE2D, + alignment, + width, + height, + arraySize, + mipLevels, + format, + sampleCount, + sampleQuality, + layout, + flags); } - static inline CD3DX12_RESOURCE_DESC Tex3D( - DXGI_FORMAT format, - UINT64 width, - UINT height, - UINT16 depth, - UINT16 mipLevels = 0, - D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, - D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, - UINT64 alignment = 0 ) + static inline CD3DX12_RESOURCE_DESC Tex3D(DXGI_FORMAT format, + UINT64 width, + UINT height, + UINT16 depth, + UINT16 mipLevels = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) { - return CD3DX12_RESOURCE_DESC( D3D12_RESOURCE_DIMENSION_TEXTURE3D, alignment, width, height, depth, - mipLevels, format, 1, 0, layout, flags ); + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE3D, + alignment, + width, + height, + depth, + mipLevels, + format, + 1, + 0, + layout, + flags); } - inline UINT16 Depth() const - { return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); } - inline UINT16 ArraySize() const - { return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); } - inline UINT8 PlaneCount(_In_ ID3D12Device* pDevice) const - { return D3D12GetFormatPlaneCount(pDevice, Format); } - inline UINT Subresources(_In_ ID3D12Device* pDevice) const - { return MipLevels * ArraySize() * PlaneCount(pDevice); } + inline UINT16 Depth() const { return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); } + inline UINT16 ArraySize() const { return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? 
DepthOrArraySize : 1); } + inline UINT8 PlaneCount(_In_ ID3D12Device *pDevice) const { return D3D12GetFormatPlaneCount(pDevice, Format); } + inline UINT Subresources(_In_ ID3D12Device *pDevice) const { return MipLevels * ArraySize() * PlaneCount(pDevice); } inline UINT CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice) - { return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize()); } - operator const D3D12_RESOURCE_DESC&() const { return *this; } + { + return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize()); + } + operator const D3D12_RESOURCE_DESC &() const { return *this; } }; -inline bool operator==( const D3D12_RESOURCE_DESC& l, const D3D12_RESOURCE_DESC& r ) +inline bool operator==(const D3D12_RESOURCE_DESC &l, const D3D12_RESOURCE_DESC &r) { - return l.Dimension == r.Dimension && - l.Alignment == r.Alignment && - l.Width == r.Width && - l.Height == r.Height && - l.DepthOrArraySize == r.DepthOrArraySize && - l.MipLevels == r.MipLevels && - l.Format == r.Format && - l.SampleDesc.Count == r.SampleDesc.Count && - l.SampleDesc.Quality == r.SampleDesc.Quality && - l.Layout == r.Layout && - l.Flags == r.Flags; + return l.Dimension == r.Dimension && l.Alignment == r.Alignment && l.Width == r.Width && l.Height == r.Height + && l.DepthOrArraySize == r.DepthOrArraySize && l.MipLevels == r.MipLevels && l.Format == r.Format + && l.SampleDesc.Count == r.SampleDesc.Count && l.SampleDesc.Quality == r.SampleDesc.Quality + && l.Layout == r.Layout && l.Flags == r.Flags; } -inline bool operator!=( const D3D12_RESOURCE_DESC& l, const D3D12_RESOURCE_DESC& r ) -{ return !( l == r ); } +inline bool operator!=(const D3D12_RESOURCE_DESC &l, const D3D12_RESOURCE_DESC &r) { return !(l == r); } //------------------------------------------------------------------------------------------------ // Row-by-row memcpy -inline void MemcpySubresource( - _In_ const D3D12_MEMCPY_DEST* pDest, - _In_ const D3D12_SUBRESOURCE_DATA* pSrc, - SIZE_T RowSizeInBytes, - UINT NumRows, - UINT NumSlices) +inline void MemcpySubresource(_In_ const D3D12_MEMCPY_DEST *pDest, + _In_ const D3D12_SUBRESOURCE_DATA *pSrc, + SIZE_T RowSizeInBytes, + UINT NumRows, + UINT NumSlices) { - for (UINT z = 0; z < NumSlices; ++z) - { - BYTE* pDestSlice = reinterpret_cast(pDest->pData) + pDest->SlicePitch * z; - const BYTE* pSrcSlice = reinterpret_cast(pSrc->pData) + pSrc->SlicePitch * z; - for (UINT y = 0; y < NumRows; ++y) - { - memcpy(pDestSlice + pDest->RowPitch * y, - pSrcSlice + pSrc->RowPitch * y, - RowSizeInBytes); + for (UINT z = 0; z < NumSlices; ++z) { + BYTE *pDestSlice = reinterpret_cast(pDest->pData) + pDest->SlicePitch * z; + const BYTE *pSrcSlice = reinterpret_cast(pSrc->pData) + pSrc->SlicePitch * z; + for (UINT y = 0; y < NumRows; ++y) { + memcpy(pDestSlice + pDest->RowPitch * y, pSrcSlice + pSrc->RowPitch * y, RowSizeInBytes); } } } //------------------------------------------------------------------------------------------------ // Returns required size of a buffer to be used for data upload -inline UINT64 GetRequiredIntermediateSize( - _In_ ID3D12Resource* pDestinationResource, - _In_range_(0,D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, - _In_range_(0,D3D12_REQ_SUBRESOURCES-FirstSubresource) UINT NumSubresources) +inline UINT64 GetRequiredIntermediateSize(_In_ ID3D12Resource *pDestinationResource, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources) { - D3D12_RESOURCE_DESC 
Desc = pDestinationResource->GetDesc(); - UINT64 RequiredSize = 0; - - ID3D12Device* pDevice; - pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void**>(&pDevice)); - pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, 0, nullptr, nullptr, nullptr, &RequiredSize); + D3D12_RESOURCE_DESC Desc = pDestinationResource->GetDesc(); + UINT64 RequiredSize = 0; + + ID3D12Device *pDevice; + pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void **>(&pDevice)); + pDevice->GetCopyableFootprints( + &Desc, FirstSubresource, NumSubresources, 0, nullptr, nullptr, nullptr, &RequiredSize); pDevice->Release(); - + return RequiredSize; } //------------------------------------------------------------------------------------------------ // All arrays must be populated (e.g. by calling GetCopyableFootprints) -inline UINT64 UpdateSubresources( - _In_ ID3D12GraphicsCommandList* pCmdList, - _In_ ID3D12Resource* pDestinationResource, - _In_ ID3D12Resource* pIntermediate, - _In_range_(0,D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, - _In_range_(0,D3D12_REQ_SUBRESOURCES-FirstSubresource) UINT NumSubresources, - UINT64 RequiredSize, - _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT* pLayouts, - _In_reads_(NumSubresources) const UINT* pNumRows, - _In_reads_(NumSubresources) const UINT64* pRowSizesInBytes, - _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA* pSrcData) +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, + _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, + UINT64 RequiredSize, + _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT *pLayouts, + _In_reads_(NumSubresources) const UINT *pNumRows, + _In_reads_(NumSubresources) const UINT64 *pRowSizesInBytes, + _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) { // Minor validation D3D12_RESOURCE_DESC IntermediateDesc = pIntermediate->GetDesc(); - D3D12_RESOURCE_DESC DestinationDesc = pDestinationResource->GetDesc(); - if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || - IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || - RequiredSize > (SIZE_T)-1 || - (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER && - (FirstSubresource != 0 || NumSubresources != 1))) - { + D3D12_RESOURCE_DESC DestinationDesc = pDestinationResource->GetDesc(); + if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER + || IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || RequiredSize > (SIZE_T)-1 + || (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER + && (FirstSubresource != 0 || NumSubresources != 1))) { return 0; } - - BYTE* pData; - HRESULT hr = pIntermediate->Map(0, NULL, reinterpret_cast<void**>(&pData)); - if (FAILED(hr)) - { + + BYTE *pData; + HRESULT hr = pIntermediate->Map(0, NULL, reinterpret_cast<void **>(&pData)); + if (FAILED(hr)) { return 0; } - - for (UINT i = 0; i < NumSubresources; ++i) - { - if (pRowSizesInBytes[i] > (SIZE_T)-1) return 0; - D3D12_MEMCPY_DEST DestData = { pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch, pLayouts[i].Footprint.RowPitch * pNumRows[i] }; - MemcpySubresource(&DestData, &pSrcData[i], (SIZE_T)pRowSizesInBytes[i], pNumRows[i], pLayouts[i].Footprint.Depth); + + for (UINT i = 0; i < NumSubresources; ++i) { + if (pRowSizesInBytes[i] > (SIZE_T)-1) + return 0; + D3D12_MEMCPY_DEST 
DestData = { + pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch, pLayouts[i].Footprint.RowPitch * pNumRows[i]}; + MemcpySubresource( + &DestData, &pSrcData[i], (SIZE_T)pRowSizesInBytes[i], pNumRows[i], pLayouts[i].Footprint.Depth); } pIntermediate->Unmap(0, NULL); - - if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER) - { - CD3DX12_BOX SrcBox( UINT( pLayouts[0].Offset ), UINT( pLayouts[0].Offset + pLayouts[0].Footprint.Width ) ); + + if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER) { + CD3DX12_BOX SrcBox(UINT(pLayouts[0].Offset), UINT(pLayouts[0].Offset + pLayouts[0].Footprint.Width)); pCmdList->CopyBufferRegion( pDestinationResource, 0, pIntermediate, pLayouts[0].Offset, pLayouts[0].Footprint.Width); } - else - { - for (UINT i = 0; i < NumSubresources; ++i) - { + else { + for (UINT i = 0; i < NumSubresources; ++i) { CD3DX12_TEXTURE_COPY_LOCATION Dst(pDestinationResource, i + FirstSubresource); CD3DX12_TEXTURE_COPY_LOCATION Src(pIntermediate, pLayouts[i]); pCmdList->CopyTextureRegion(&Dst, 0, 0, 0, &Src, nullptr); @@ -1448,37 +1363,51 @@ inline UINT64 UpdateSubresources( //------------------------------------------------------------------------------------------------ // Heap-allocating UpdateSubresources implementation -inline UINT64 UpdateSubresources( - _In_ ID3D12GraphicsCommandList* pCmdList, - _In_ ID3D12Resource* pDestinationResource, - _In_ ID3D12Resource* pIntermediate, - UINT64 IntermediateOffset, - _In_range_(0,D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, - _In_range_(0,D3D12_REQ_SUBRESOURCES-FirstSubresource) UINT NumSubresources, - _In_reads_(NumSubresources) D3D12_SUBRESOURCE_DATA* pSrcData) +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, + _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, + UINT64 IntermediateOffset, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, + _In_reads_(NumSubresources) D3D12_SUBRESOURCE_DATA *pSrcData) { UINT64 RequiredSize = 0; - UINT64 MemToAlloc = static_cast<UINT64>(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) * NumSubresources; - if (MemToAlloc > SIZE_MAX) - { - return 0; + UINT64 MemToAlloc = static_cast<UINT64>(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) + * NumSubresources; + if (MemToAlloc > SIZE_MAX) { + return 0; } - void* pMem = HeapAlloc(GetProcessHeap(), 0, static_cast<SIZE_T>(MemToAlloc)); - if (pMem == NULL) - { - return 0; + void *pMem = HeapAlloc(GetProcessHeap(), 0, static_cast<SIZE_T>(MemToAlloc)); + if (pMem == NULL) { + return 0; } - D3D12_PLACED_SUBRESOURCE_FOOTPRINT* pLayouts = reinterpret_cast<D3D12_PLACED_SUBRESOURCE_FOOTPRINT*>(pMem); - UINT64* pRowSizesInBytes = reinterpret_cast<UINT64*>(pLayouts + NumSubresources); - UINT* pNumRows = reinterpret_cast<UINT*>(pRowSizesInBytes + NumSubresources); - + D3D12_PLACED_SUBRESOURCE_FOOTPRINT *pLayouts = reinterpret_cast<D3D12_PLACED_SUBRESOURCE_FOOTPRINT *>(pMem); + UINT64 *pRowSizesInBytes = reinterpret_cast<UINT64 *>(pLayouts + NumSubresources); + UINT *pNumRows = reinterpret_cast<UINT *>(pRowSizesInBytes + NumSubresources); + D3D12_RESOURCE_DESC Desc = pDestinationResource->GetDesc(); - ID3D12Device* pDevice; - pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void**>(&pDevice)); - pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, pLayouts, pNumRows, pRowSizesInBytes, &RequiredSize); + ID3D12Device *pDevice; + pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void **>(&pDevice)); + 
pDevice->GetCopyableFootprints(&Desc, + FirstSubresource, + NumSubresources, + IntermediateOffset, + pLayouts, + pNumRows, + pRowSizesInBytes, + &RequiredSize); pDevice->Release(); - - UINT64 Result = UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, RequiredSize, pLayouts, pNumRows, pRowSizesInBytes, pSrcData); + + UINT64 Result = UpdateSubresources(pCmdList, + pDestinationResource, + pIntermediate, + FirstSubresource, + NumSubresources, + RequiredSize, + pLayouts, + pNumRows, + pRowSizesInBytes, + pSrcData); HeapFree(GetProcessHeap(), 0, pMem); return Result; } @@ -1486,47 +1415,55 @@ inline UINT64 UpdateSubresources( //------------------------------------------------------------------------------------------------ // Stack-allocating UpdateSubresources implementation template <UINT MaxSubresources> -inline UINT64 UpdateSubresources( - _In_ ID3D12GraphicsCommandList* pCmdList, - _In_ ID3D12Resource* pDestinationResource, - _In_ ID3D12Resource* pIntermediate, - UINT64 IntermediateOffset, - _In_range_(0, MaxSubresources) UINT FirstSubresource, - _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources, - _In_reads_(NumSubresources) D3D12_SUBRESOURCE_DATA* pSrcData) +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, + _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, + UINT64 IntermediateOffset, + _In_range_(0, MaxSubresources) UINT FirstSubresource, + _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources, + _In_reads_(NumSubresources) D3D12_SUBRESOURCE_DATA *pSrcData) { - UINT64 RequiredSize = 0; + UINT64 RequiredSize = 0; D3D12_PLACED_SUBRESOURCE_FOOTPRINT Layouts[MaxSubresources]; - UINT NumRows[MaxSubresources]; - UINT64 RowSizesInBytes[MaxSubresources]; - + UINT NumRows[MaxSubresources]; + UINT64 RowSizesInBytes[MaxSubresources]; + D3D12_RESOURCE_DESC Desc = pDestinationResource->GetDesc(); - ID3D12Device* pDevice; - pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void**>(&pDevice)); - pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, Layouts, NumRows, RowSizesInBytes, &RequiredSize); + ID3D12Device *pDevice; + pDestinationResource->GetDevice(__uuidof(*pDevice), reinterpret_cast<void **>(&pDevice)); + pDevice->GetCopyableFootprints( + &Desc, FirstSubresource, NumSubresources, IntermediateOffset, Layouts, NumRows, RowSizesInBytes, &RequiredSize); pDevice->Release(); - - return UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, RequiredSize, Layouts, NumRows, RowSizesInBytes, pSrcData); + + return UpdateSubresources(pCmdList, + pDestinationResource, + pIntermediate, + FirstSubresource, + NumSubresources, + RequiredSize, + Layouts, + NumRows, + RowSizesInBytes, + pSrcData); } //------------------------------------------------------------------------------------------------ -inline bool D3D12IsLayoutOpaque( D3D12_TEXTURE_LAYOUT Layout ) -{ return Layout == D3D12_TEXTURE_LAYOUT_UNKNOWN || Layout == D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE; } +inline bool D3D12IsLayoutOpaque(D3D12_TEXTURE_LAYOUT Layout) +{ + return Layout == D3D12_TEXTURE_LAYOUT_UNKNOWN || Layout == D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE; +} //------------------------------------------------------------------------------------------------ -inline ID3D12CommandList * const * CommandListCast(ID3D12GraphicsCommandList * const * pp) +inline ID3D12CommandList *const *CommandListCast(ID3D12GraphicsCommandList 
*const *pp) { // This cast is useful for passing strongly typed command list pointers into // ExecuteCommandLists. // This cast is valid as long as the const-ness is respected. D3D12 APIs do // respect the const-ness of their arguments. - return reinterpret_cast<ID3D12CommandList * const *>(pp); + return reinterpret_cast<ID3D12CommandList *const *>(pp); } #endif // defined( __cplusplus ) #endif //__D3DX12_H__ - - - diff --git a/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.cpp b/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.cpp old mode 100755 new mode 100644 index be1aa474..4c9c04db --- a/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.cpp +++ b/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.cpp @@ -25,543 +25,514 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include - -#include "d3dx12.h" - -#include -#include -#include - -#include -#include "ShaderStructs.h" #include "simpleD3D12.h" + #include +#include +#include +#include +#include +#include + +#include "ShaderStructs.h" +#include "d3dx12.h" ////////////////////////////////////////////// // WindowsSecurityAttributes implementation // ////////////////////////////////////////////// -class WindowsSecurityAttributes { - protected: - SECURITY_ATTRIBUTES m_winSecurityAttributes; - PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; +class WindowsSecurityAttributes +{ +protected: +    SECURITY_ATTRIBUTES  m_winSecurityAttributes; +    PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; - public: - WindowsSecurityAttributes(); - ~WindowsSecurityAttributes(); - SECURITY_ATTRIBUTES *operator&(); +public: +    WindowsSecurityAttributes(); +    ~WindowsSecurityAttributes(); +    SECURITY_ATTRIBUTES *operator&(); }; -WindowsSecurityAttributes::WindowsSecurityAttributes() { - m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( - 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); - assert(m_winPSecurityDescriptor != (PSECURITY_DESCRIPTOR)NULL); +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ +    m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); +    assert(m_winPSecurityDescriptor != (PSECURITY_DESCRIPTOR)NULL); - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + - SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); +    PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); +    PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - InitializeSecurityDescriptor(m_winPSecurityDescriptor, - SECURITY_DESCRIPTOR_REVISION); +    InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = - SECURITY_WORLD_SID_AUTHORITY; - AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, - 0, 0, 0, 0, 0, ppSID); +    SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; +    AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = - STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; +    EXPLICIT_ACCESS explicitAccess; +    ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + 
explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; } -WindowsSecurityAttributes::~WindowsSecurityAttributes() { - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + - SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); } -SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { - return &m_winSecurityAttributes; -} +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { return &m_winSecurityAttributes; } DX12CudaInterop::DX12CudaInterop(UINT width, UINT height, std::string name) - : DX12CudaSample(width, height, name), - m_frameIndex(0), - m_scissorRect(0, 0, static_cast(width), static_cast(height)), - m_fenceValues{}, - m_rtvDescriptorSize(0) { - m_viewport = {0.0f, 0.0f, static_cast(width), - static_cast(height)}; - m_AnimTime = 1.0f; + : DX12CudaSample(width, height, name) + , m_frameIndex(0) + , m_scissorRect(0, 0, static_cast(width), static_cast(height)) + , m_fenceValues{} + , m_rtvDescriptorSize(0) +{ + m_viewport = {0.0f, 0.0f, static_cast(width), static_cast(height)}; + m_AnimTime = 1.0f; } -void DX12CudaInterop::OnInit() { - LoadPipeline(); - InitCuda(); - LoadAssets(); +void DX12CudaInterop::OnInit() +{ + LoadPipeline(); + InitCuda(); + LoadAssets(); } // Load the rendering pipeline dependencies. -void DX12CudaInterop::LoadPipeline() { - UINT dxgiFactoryFlags = 0; +void DX12CudaInterop::LoadPipeline() +{ + UINT dxgiFactoryFlags = 0; #if defined(_DEBUG) - // Enable the debug layer (requires the Graphics Tools "optional feature"). - // NOTE: Enabling the debug layer after device creation will invalidate the - // active device. - { - ComPtr debugController; - if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) { - debugController->EnableDebugLayer(); + // Enable the debug layer (requires the Graphics Tools "optional feature"). + // NOTE: Enabling the debug layer after device creation will invalidate the + // active device. + { + ComPtr debugController; + if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) { + debugController->EnableDebugLayer(); - // Enable additional debug layers. 
- dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG; + // Enable additional debug layers. + dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG; + } } - } #endif - ComPtr factory; - ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory))); + ComPtr factory; + ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory))); - if (m_useWarpDevice) { - ComPtr warpAdapter; - ThrowIfFailed(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter))); + if (m_useWarpDevice) { + ComPtr warpAdapter; + ThrowIfFailed(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter))); - ThrowIfFailed(D3D12CreateDevice(warpAdapter.Get(), D3D_FEATURE_LEVEL_11_0, - IID_PPV_ARGS(&m_device))); - } else { - ComPtr hardwareAdapter; - GetHardwareAdapter(factory.Get(), &hardwareAdapter); - - ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), - D3D_FEATURE_LEVEL_11_0, - IID_PPV_ARGS(&m_device))); - DXGI_ADAPTER_DESC1 desc; - hardwareAdapter->GetDesc1(&desc); - m_dx12deviceluid = desc.AdapterLuid; - } - - // Describe and create the command queue. - D3D12_COMMAND_QUEUE_DESC queueDesc = {}; - queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; - queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; - - ThrowIfFailed( - m_device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&m_commandQueue))); - - // Describe and create the swap chain. - DXGI_SWAP_CHAIN_DESC1 swapChainDesc = {}; - swapChainDesc.BufferCount = FrameCount; - swapChainDesc.Width = m_width; - swapChainDesc.Height = m_height; - swapChainDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; - swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; - swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; - swapChainDesc.SampleDesc.Count = 1; - - ComPtr swapChain; - ThrowIfFailed(factory->CreateSwapChainForHwnd( - m_commandQueue.Get(), // Swap chain needs the queue so that it can force - // a flush on it. - Win32Application::GetHwnd(), &swapChainDesc, nullptr, nullptr, - &swapChain)); - - // This sample does not support fullscreen transitions. - ThrowIfFailed(factory->MakeWindowAssociation(Win32Application::GetHwnd(), - DXGI_MWA_NO_ALT_ENTER)); - - ThrowIfFailed(swapChain.As(&m_swapChain)); - m_frameIndex = m_swapChain->GetCurrentBackBufferIndex(); - - // Create descriptor heaps. - { - // Describe and create a render target view (RTV) descriptor heap. - D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {}; - rtvHeapDesc.NumDescriptors = FrameCount; - rtvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV; - rtvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; - ThrowIfFailed( - m_device->CreateDescriptorHeap(&rtvHeapDesc, IID_PPV_ARGS(&m_rtvHeap))); - - m_rtvDescriptorSize = m_device->GetDescriptorHandleIncrementSize( - D3D12_DESCRIPTOR_HEAP_TYPE_RTV); - } - - // Create frame resources. - { - CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle( - m_rtvHeap->GetCPUDescriptorHandleForHeapStart()); - - // Create a RTV and a command allocator for each frame. 
- for (UINT n = 0; n < FrameCount; n++) { - ThrowIfFailed( - m_swapChain->GetBuffer(n, IID_PPV_ARGS(&m_renderTargets[n]))); - m_device->CreateRenderTargetView(m_renderTargets[n].Get(), nullptr, - rtvHandle); - rtvHandle.Offset(1, m_rtvDescriptorSize); - - ThrowIfFailed(m_device->CreateCommandAllocator( - D3D12_COMMAND_LIST_TYPE_DIRECT, - IID_PPV_ARGS(&m_commandAllocators[n]))); + ThrowIfFailed(D3D12CreateDevice(warpAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device))); + } + else { + ComPtr hardwareAdapter; + GetHardwareAdapter(factory.Get(), &hardwareAdapter); + + ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device))); + DXGI_ADAPTER_DESC1 desc; + hardwareAdapter->GetDesc1(&desc); + m_dx12deviceluid = desc.AdapterLuid; + } + + // Describe and create the command queue. + D3D12_COMMAND_QUEUE_DESC queueDesc = {}; + queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; + queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + + ThrowIfFailed(m_device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&m_commandQueue))); + + // Describe and create the swap chain. + DXGI_SWAP_CHAIN_DESC1 swapChainDesc = {}; + swapChainDesc.BufferCount = FrameCount; + swapChainDesc.Width = m_width; + swapChainDesc.Height = m_height; + swapChainDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; + swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; + swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; + swapChainDesc.SampleDesc.Count = 1; + + ComPtr swapChain; + ThrowIfFailed(factory->CreateSwapChainForHwnd(m_commandQueue.Get(), // Swap chain needs the queue so that it can + // force a flush on it. + Win32Application::GetHwnd(), + &swapChainDesc, + nullptr, + nullptr, + &swapChain)); + + // This sample does not support fullscreen transitions. + ThrowIfFailed(factory->MakeWindowAssociation(Win32Application::GetHwnd(), DXGI_MWA_NO_ALT_ENTER)); + + ThrowIfFailed(swapChain.As(&m_swapChain)); + m_frameIndex = m_swapChain->GetCurrentBackBufferIndex(); + + // Create descriptor heaps. + { + // Describe and create a render target view (RTV) descriptor heap. + D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {}; + rtvHeapDesc.NumDescriptors = FrameCount; + rtvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV; + rtvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; + ThrowIfFailed(m_device->CreateDescriptorHeap(&rtvHeapDesc, IID_PPV_ARGS(&m_rtvHeap))); + + m_rtvDescriptorSize = m_device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_RTV); + } + + // Create frame resources. + { + CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle(m_rtvHeap->GetCPUDescriptorHandleForHeapStart()); + + // Create a RTV and a command allocator for each frame. 
+ for (UINT n = 0; n < FrameCount; n++) { + ThrowIfFailed(m_swapChain->GetBuffer(n, IID_PPV_ARGS(&m_renderTargets[n]))); + m_device->CreateRenderTargetView(m_renderTargets[n].Get(), nullptr, rtvHandle); + rtvHandle.Offset(1, m_rtvDescriptorSize); + + ThrowIfFailed(m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, + IID_PPV_ARGS(&m_commandAllocators[n]))); + } } - } } -void DX12CudaInterop::InitCuda() { - int num_cuda_devices = 0; - checkCudaErrors(cudaGetDeviceCount(&num_cuda_devices)); +void DX12CudaInterop::InitCuda() +{ + int num_cuda_devices = 0; + checkCudaErrors(cudaGetDeviceCount(&num_cuda_devices)); - if (!num_cuda_devices) { - throw std::exception("No CUDA Devices found"); - } - for (UINT devId = 0; devId < num_cuda_devices; devId++) { - cudaDeviceProp devProp; - checkCudaErrors(cudaGetDeviceProperties(&devProp, devId)); - - if ((memcmp(&m_dx12deviceluid.LowPart, devProp.luid, - sizeof(m_dx12deviceluid.LowPart)) == 0) && - (memcmp(&m_dx12deviceluid.HighPart, - devProp.luid + sizeof(m_dx12deviceluid.LowPart), - sizeof(m_dx12deviceluid.HighPart)) == 0)) { - checkCudaErrors(cudaSetDevice(devId)); - m_cudaDeviceID = devId; - m_nodeMask = devProp.luidDeviceNodeMask; - checkCudaErrors(cudaStreamCreate(&m_streamToRun)); - printf("CUDA Device Used [%d] %s\n", devId, devProp.name); - break; + if (!num_cuda_devices) { + throw std::exception("No CUDA Devices found"); + } + for (UINT devId = 0; devId < num_cuda_devices; devId++) { + cudaDeviceProp devProp; + checkCudaErrors(cudaGetDeviceProperties(&devProp, devId)); + + if ((memcmp(&m_dx12deviceluid.LowPart, devProp.luid, sizeof(m_dx12deviceluid.LowPart)) == 0) + && (memcmp(&m_dx12deviceluid.HighPart, + devProp.luid + sizeof(m_dx12deviceluid.LowPart), + sizeof(m_dx12deviceluid.HighPart)) + == 0)) { + checkCudaErrors(cudaSetDevice(devId)); + m_cudaDeviceID = devId; + m_nodeMask = devProp.luidDeviceNodeMask; + checkCudaErrors(cudaStreamCreate(&m_streamToRun)); + printf("CUDA Device Used [%d] %s\n", devId, devProp.name); + break; + } } - } } // Load the sample assets. -void DX12CudaInterop::LoadAssets() { - // Create a root signature. - { - CD3DX12_DESCRIPTOR_RANGE range; - CD3DX12_ROOT_PARAMETER parameter; +void DX12CudaInterop::LoadAssets() +{ + // Create a root signature. + { + CD3DX12_DESCRIPTOR_RANGE range; + CD3DX12_ROOT_PARAMETER parameter; - range.Init(D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0); - parameter.InitAsDescriptorTable(1, &range, D3D12_SHADER_VISIBILITY_VERTEX); + range.Init(D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0); + parameter.InitAsDescriptorTable(1, &range, D3D12_SHADER_VISIBILITY_VERTEX); - D3D12_ROOT_SIGNATURE_FLAGS rootSignatureFlags = - // Only the input assembler stage needs access to the constant buffer. - D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT | - D3D12_ROOT_SIGNATURE_FLAG_DENY_DOMAIN_SHADER_ROOT_ACCESS | - D3D12_ROOT_SIGNATURE_FLAG_DENY_GEOMETRY_SHADER_ROOT_ACCESS | - D3D12_ROOT_SIGNATURE_FLAG_DENY_HULL_SHADER_ROOT_ACCESS | - D3D12_ROOT_SIGNATURE_FLAG_DENY_PIXEL_SHADER_ROOT_ACCESS; + D3D12_ROOT_SIGNATURE_FLAGS rootSignatureFlags = + // Only the input assembler stage needs access to the constant buffer. 
+ D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT + | D3D12_ROOT_SIGNATURE_FLAG_DENY_DOMAIN_SHADER_ROOT_ACCESS + | D3D12_ROOT_SIGNATURE_FLAG_DENY_GEOMETRY_SHADER_ROOT_ACCESS + | D3D12_ROOT_SIGNATURE_FLAG_DENY_HULL_SHADER_ROOT_ACCESS + | D3D12_ROOT_SIGNATURE_FLAG_DENY_PIXEL_SHADER_ROOT_ACCESS; - CD3DX12_ROOT_SIGNATURE_DESC descRootSignature; - descRootSignature.Init(1, &parameter, 0, nullptr, rootSignatureFlags); - ComPtr<ID3DBlob> pSignature; - ComPtr<ID3DBlob> pError; - ThrowIfFailed(D3D12SerializeRootSignature( - &descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, - pSignature.GetAddressOf(), pError.GetAddressOf())); - ThrowIfFailed(m_device->CreateRootSignature( - 0, pSignature->GetBufferPointer(), pSignature->GetBufferSize(), - IID_PPV_ARGS(&m_rootSignature))); - } - // Create the pipeline state, which includes compiling and loading shaders. - { - ComPtr<ID3DBlob> vertexShader; - ComPtr<ID3DBlob> pixelShader; +        CD3DX12_ROOT_SIGNATURE_DESC descRootSignature; +        descRootSignature.Init(1, &parameter, 0, nullptr, rootSignatureFlags); +        ComPtr<ID3DBlob> pSignature; +        ComPtr<ID3DBlob> pError; +        ThrowIfFailed(D3D12SerializeRootSignature( +            &descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, pSignature.GetAddressOf(), pError.GetAddressOf())); +        ThrowIfFailed(m_device->CreateRootSignature( +            0, pSignature->GetBufferPointer(), pSignature->GetBufferSize(), IID_PPV_ARGS(&m_rootSignature))); +    } +    // Create the pipeline state, which includes compiling and loading shaders. +    { +        ComPtr<ID3DBlob> vertexShader; +        ComPtr<ID3DBlob> pixelShader; #if defined(_DEBUG) - // Enable better shader debugging with the graphics debugging tools. - UINT compileFlags = D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; +        // Enable better shader debugging with the graphics debugging tools. +        UINT compileFlags = D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; #else - UINT compileFlags = 0; +        UINT compileFlags = 0; #endif - std::wstring filePath = GetAssetFullPath("shaders.hlsl"); - LPCWSTR result = filePath.c_str(); - ThrowIfFailed(D3DCompileFromFile(result, nullptr, nullptr, "VSMain", - "vs_5_0", compileFlags, 0, &vertexShader, - nullptr)); - ThrowIfFailed(D3DCompileFromFile(result, nullptr, nullptr, "PSMain", - "ps_5_0", compileFlags, 0, &pixelShader, - nullptr)); +        std::wstring filePath = GetAssetFullPath("shaders.hlsl"); +        LPCWSTR      result   = filePath.c_str(); +        ThrowIfFailed( +            D3DCompileFromFile(result, nullptr, nullptr, "VSMain", "vs_5_0", compileFlags, 0, &vertexShader, nullptr)); +        ThrowIfFailed( +            D3DCompileFromFile(result, nullptr, nullptr, "PSMain", "ps_5_0", compileFlags, 0, &pixelShader, nullptr)); - // Define the vertex input layout. - D3D12_INPUT_ELEMENT_DESC inputElementDescs[] = { - {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, - D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, - {"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12, - D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}}; +        // Define the vertex input layout. +        D3D12_INPUT_ELEMENT_DESC inputElementDescs[] = { +            {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, +            {"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}}; - // Describe and create the graphics pipeline state object (PSO). 
- D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {}; - psoDesc.InputLayout = {inputElementDescs, _countof(inputElementDescs)}; - psoDesc.pRootSignature = m_rootSignature.Get(); - psoDesc.VS = CD3DX12_SHADER_BYTECODE(vertexShader.Get()); - psoDesc.PS = CD3DX12_SHADER_BYTECODE(pixelShader.Get()); - psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); - psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); - psoDesc.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); - psoDesc.SampleMask = UINT_MAX; - psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT; - psoDesc.NumRenderTargets = 1; - psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; - psoDesc.SampleDesc.Count = 1; - ThrowIfFailed(m_device->CreateGraphicsPipelineState( - &psoDesc, IID_PPV_ARGS(&m_pipelineState))); - } - - // Create the command list. - ThrowIfFailed(m_device->CreateCommandList( - 0, D3D12_COMMAND_LIST_TYPE_DIRECT, - m_commandAllocators[m_frameIndex].Get(), m_pipelineState.Get(), - IID_PPV_ARGS(&m_commandList))); - - // Command lists are created in the recording state, but there is nothing - // to record yet. The main loop expects it to be closed, so close it now. - ThrowIfFailed(m_commandList->Close()); - - // Create the vertex buffer. - { - // Define the geometry for a triangle. - vertBufWidth = m_width / 2; - vertBufHeight = m_height / 2; - const UINT vertexBufferSize = sizeof(Vertex) * vertBufWidth * vertBufHeight; - - ThrowIfFailed(m_device->CreateCommittedResource( - &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), - D3D12_HEAP_FLAG_SHARED, - &CD3DX12_RESOURCE_DESC::Buffer(vertexBufferSize), - D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, nullptr, - IID_PPV_ARGS(&m_vertexBuffer))); - - // Initialize the vertex buffer view. - m_vertexBufferView.BufferLocation = m_vertexBuffer->GetGPUVirtualAddress(); - m_vertexBufferView.StrideInBytes = sizeof(Vertex); - m_vertexBufferView.SizeInBytes = vertexBufferSize; - - HANDLE sharedHandle; - WindowsSecurityAttributes windowsSecurityAttributes; - LPCWSTR name = NULL; - ThrowIfFailed(m_device->CreateSharedHandle( - m_vertexBuffer.Get(), &windowsSecurityAttributes, GENERIC_ALL, name, - &sharedHandle)); - - D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo; - d3d12ResourceAllocationInfo = m_device->GetResourceAllocationInfo( - m_nodeMask, 1, &CD3DX12_RESOURCE_DESC::Buffer(vertexBufferSize)); - size_t actualSize = d3d12ResourceAllocationInfo.SizeInBytes; - size_t alignment = d3d12ResourceAllocationInfo.Alignment; - - cudaExternalMemoryHandleDesc externalMemoryHandleDesc; - memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); - - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; - externalMemoryHandleDesc.handle.win32.handle = sharedHandle; - externalMemoryHandleDesc.size = actualSize; - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; - - checkCudaErrors( - cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc)); - CloseHandle(sharedHandle); - - cudaExternalMemoryBufferDesc externalMemoryBufferDesc; - memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc)); - externalMemoryBufferDesc.offset = 0; - externalMemoryBufferDesc.size = vertexBufferSize; - externalMemoryBufferDesc.flags = 0; - - checkCudaErrors(cudaExternalMemoryGetMappedBuffer( - &m_cudaDevVertptr, m_externalMemory, &externalMemoryBufferDesc)); - RunSineWaveKernel(vertBufWidth, vertBufHeight, (Vertex *)m_cudaDevVertptr, - m_streamToRun, 1.0f); - 
checkCudaErrors(cudaStreamSynchronize(m_streamToRun)); - - } - - // Create synchronization objects and wait until assets have been uploaded to - // the GPU. - { - ThrowIfFailed(m_device->CreateFence(m_fenceValues[m_frameIndex], - D3D12_FENCE_FLAG_SHARED, - IID_PPV_ARGS(&m_fence))); - - cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; - - memset(&externalSemaphoreHandleDesc, 0, - sizeof(externalSemaphoreHandleDesc)); - WindowsSecurityAttributes windowsSecurityAttributes; - LPCWSTR name = NULL; - HANDLE sharedHandle; - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeD3D12Fence; - m_device->CreateSharedHandle(m_fence.Get(), &windowsSecurityAttributes, - GENERIC_ALL, name, &sharedHandle); - externalSemaphoreHandleDesc.handle.win32.handle = (void *)sharedHandle; - externalSemaphoreHandleDesc.flags = 0; - - checkCudaErrors(cudaImportExternalSemaphore(&m_externalSemaphore, - &externalSemaphoreHandleDesc)); - - m_fenceValues[m_frameIndex]++; - - // Create an event handle to use for frame synchronization. - m_fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr); - if (m_fenceEvent == nullptr) { - ThrowIfFailed(HRESULT_FROM_WIN32(GetLastError())); + // Describe and create the graphics pipeline state object (PSO). + D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {}; + psoDesc.InputLayout = {inputElementDescs, _countof(inputElementDescs)}; + psoDesc.pRootSignature = m_rootSignature.Get(); + psoDesc.VS = CD3DX12_SHADER_BYTECODE(vertexShader.Get()); + psoDesc.PS = CD3DX12_SHADER_BYTECODE(pixelShader.Get()); + psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); + psoDesc.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); + psoDesc.SampleMask = UINT_MAX; + psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT; + psoDesc.NumRenderTargets = 1; + psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; + psoDesc.SampleDesc.Count = 1; + ThrowIfFailed(m_device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_pipelineState))); } - // Wait for the command list to execute; we are reusing the same command - // list in our main loop but for now, we just want to wait for setup to - // complete before continuing. - WaitForGpu(); - } + // Create the command list. + ThrowIfFailed(m_device->CreateCommandList(0, + D3D12_COMMAND_LIST_TYPE_DIRECT, + m_commandAllocators[m_frameIndex].Get(), + m_pipelineState.Get(), + IID_PPV_ARGS(&m_commandList))); + + // Command lists are created in the recording state, but there is nothing + // to record yet. The main loop expects it to be closed, so close it now. + ThrowIfFailed(m_commandList->Close()); + + // Create the vertex buffer. + { + // Define the geometry for a triangle. + vertBufWidth = m_width / 2; + vertBufHeight = m_height / 2; + const UINT vertexBufferSize = sizeof(Vertex) * vertBufWidth * vertBufHeight; + + ThrowIfFailed(m_device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_SHARED, + &CD3DX12_RESOURCE_DESC::Buffer(vertexBufferSize), + D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, + nullptr, + IID_PPV_ARGS(&m_vertexBuffer))); + + // Initialize the vertex buffer view. 
+ m_vertexBufferView.BufferLocation = m_vertexBuffer->GetGPUVirtualAddress(); + m_vertexBufferView.StrideInBytes = sizeof(Vertex); + m_vertexBufferView.SizeInBytes = vertexBufferSize; + + HANDLE sharedHandle; + WindowsSecurityAttributes windowsSecurityAttributes; + LPCWSTR name = NULL; + ThrowIfFailed(m_device->CreateSharedHandle( + m_vertexBuffer.Get(), &windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle)); + + D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo; + d3d12ResourceAllocationInfo = + m_device->GetResourceAllocationInfo(m_nodeMask, 1, &CD3DX12_RESOURCE_DESC::Buffer(vertexBufferSize)); + size_t actualSize = d3d12ResourceAllocationInfo.SizeInBytes; + size_t alignment = d3d12ResourceAllocationInfo.Alignment; + + cudaExternalMemoryHandleDesc externalMemoryHandleDesc; + memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc)); + + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; + externalMemoryHandleDesc.handle.win32.handle = sharedHandle; + externalMemoryHandleDesc.size = actualSize; + externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + + checkCudaErrors(cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc)); + CloseHandle(sharedHandle); + + cudaExternalMemoryBufferDesc externalMemoryBufferDesc; + memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc)); + externalMemoryBufferDesc.offset = 0; + externalMemoryBufferDesc.size = vertexBufferSize; + externalMemoryBufferDesc.flags = 0; + + checkCudaErrors( + cudaExternalMemoryGetMappedBuffer(&m_cudaDevVertptr, m_externalMemory, &externalMemoryBufferDesc)); + RunSineWaveKernel(vertBufWidth, vertBufHeight, (Vertex *)m_cudaDevVertptr, m_streamToRun, 1.0f); + checkCudaErrors(cudaStreamSynchronize(m_streamToRun)); + } + + // Create synchronization objects and wait until assets have been uploaded to + // the GPU. + { + ThrowIfFailed( + m_device->CreateFence(m_fenceValues[m_frameIndex], D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(&m_fence))); + + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; + + memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc)); + WindowsSecurityAttributes windowsSecurityAttributes; + LPCWSTR name = NULL; + HANDLE sharedHandle; + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence; + m_device->CreateSharedHandle(m_fence.Get(), &windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle); + externalSemaphoreHandleDesc.handle.win32.handle = (void *)sharedHandle; + externalSemaphoreHandleDesc.flags = 0; + + checkCudaErrors(cudaImportExternalSemaphore(&m_externalSemaphore, &externalSemaphoreHandleDesc)); + + m_fenceValues[m_frameIndex]++; + + // Create an event handle to use for frame synchronization. + m_fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr); + if (m_fenceEvent == nullptr) { + ThrowIfFailed(HRESULT_FROM_WIN32(GetLastError())); + } + + // Wait for the command list to execute; we are reusing the same command + // list in our main loop but for now, we just want to wait for setup to + // complete before continuing. + WaitForGpu(); + } } // Render the scene. -void DX12CudaInterop::OnRender() { - // Record all the commands we need to render the scene into the command list. - PopulateCommandList(); +void DX12CudaInterop::OnRender() +{ + // Record all the commands we need to render the scene into the command list. + PopulateCommandList(); - // Execute the command list. 
- ID3D12CommandList *ppCommandLists[] = {m_commandList.Get()}; - m_commandQueue->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists); + // Execute the command list. + ID3D12CommandList *ppCommandLists[] = {m_commandList.Get()}; + m_commandQueue->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists); - // Present the frame. - ThrowIfFailed(m_swapChain->Present(1, 0)); + // Present the frame. + ThrowIfFailed(m_swapChain->Present(1, 0)); - // Schedule a Signal command in the queue. - const UINT64 currentFenceValue = m_fenceValues[m_frameIndex]; - ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), currentFenceValue)); + // Schedule a Signal command in the queue. + const UINT64 currentFenceValue = m_fenceValues[m_frameIndex]; + ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), currentFenceValue)); - MoveToNextFrame(); + MoveToNextFrame(); } -void DX12CudaInterop::OnDestroy() { - // Ensure that the GPU is no longer referencing resources that are about to be - // cleaned up by the destructor. - WaitForGpu(); - checkCudaErrors(cudaDestroyExternalSemaphore(m_externalSemaphore)); - checkCudaErrors(cudaDestroyExternalMemory(m_externalMemory)); - checkCudaErrors(cudaFree(m_cudaDevVertptr)); - CloseHandle(m_fenceEvent); +void DX12CudaInterop::OnDestroy() +{ + // Ensure that the GPU is no longer referencing resources that are about to be + // cleaned up by the destructor. + WaitForGpu(); + checkCudaErrors(cudaDestroyExternalSemaphore(m_externalSemaphore)); + checkCudaErrors(cudaDestroyExternalMemory(m_externalMemory)); + checkCudaErrors(cudaFree(m_cudaDevVertptr)); + CloseHandle(m_fenceEvent); } -void DX12CudaInterop::PopulateCommandList() { - // Command list allocators can only be reset when the associated - // command lists have finished execution on the GPU; apps should use - // fences to determine GPU execution progress. - ThrowIfFailed(m_commandAllocators[m_frameIndex]->Reset()); +void DX12CudaInterop::PopulateCommandList() +{ + // Command list allocators can only be reset when the associated + // command lists have finished execution on the GPU; apps should use + // fences to determine GPU execution progress. + ThrowIfFailed(m_commandAllocators[m_frameIndex]->Reset()); - // However, when ExecuteCommandList() is called on a particular command - // list, that command list can then be reset at any time and must be before - // re-recording. - ThrowIfFailed(m_commandList->Reset(m_commandAllocators[m_frameIndex].Get(), - m_pipelineState.Get())); + // However, when ExecuteCommandList() is called on a particular command + // list, that command list can then be reset at any time and must be before + // re-recording. + ThrowIfFailed(m_commandList->Reset(m_commandAllocators[m_frameIndex].Get(), m_pipelineState.Get())); - m_commandList->SetGraphicsRootSignature(m_rootSignature.Get()); + m_commandList->SetGraphicsRootSignature(m_rootSignature.Get()); - // Set necessary state. - m_commandList->RSSetViewports(1, &m_viewport); - m_commandList->RSSetScissorRects(1, &m_scissorRect); + // Set necessary state. + m_commandList->RSSetViewports(1, &m_viewport); + m_commandList->RSSetScissorRects(1, &m_scissorRect); - // Indicate that the back buffer will be used as a render target. - m_commandList->ResourceBarrier( - 1, &CD3DX12_RESOURCE_BARRIER::Transition( - m_renderTargets[m_frameIndex].Get(), D3D12_RESOURCE_STATE_PRESENT, - D3D12_RESOURCE_STATE_RENDER_TARGET)); + // Indicate that the back buffer will be used as a render target. 
+ m_commandList->ResourceBarrier(1, + &CD3DX12_RESOURCE_BARRIER::Transition(m_renderTargets[m_frameIndex].Get(), + D3D12_RESOURCE_STATE_PRESENT, + D3D12_RESOURCE_STATE_RENDER_TARGET)); - CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle( - m_rtvHeap->GetCPUDescriptorHandleForHeapStart(), m_frameIndex, - m_rtvDescriptorSize); - m_commandList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr); + CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle( + m_rtvHeap->GetCPUDescriptorHandleForHeapStart(), m_frameIndex, m_rtvDescriptorSize); + m_commandList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr); - // Record commands. - const float clearColor[] = {0.0f, 0.2f, 0.4f, 1.0f}; - m_commandList->ClearRenderTargetView(rtvHandle, clearColor, 0, nullptr); - m_commandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_POINTLIST); - m_commandList->IASetVertexBuffers(0, 1, &m_vertexBufferView); - m_commandList->DrawInstanced(vertBufHeight * vertBufWidth, 1, 0, 0); + // Record commands. + const float clearColor[] = {0.0f, 0.2f, 0.4f, 1.0f}; + m_commandList->ClearRenderTargetView(rtvHandle, clearColor, 0, nullptr); + m_commandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_POINTLIST); + m_commandList->IASetVertexBuffers(0, 1, &m_vertexBufferView); + m_commandList->DrawInstanced(vertBufHeight * vertBufWidth, 1, 0, 0); - // Indicate that the back buffer will now be used to present. - m_commandList->ResourceBarrier( - 1, &CD3DX12_RESOURCE_BARRIER::Transition( - m_renderTargets[m_frameIndex].Get(), - D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_PRESENT)); + // Indicate that the back buffer will now be used to present. + m_commandList->ResourceBarrier(1, + &CD3DX12_RESOURCE_BARRIER::Transition(m_renderTargets[m_frameIndex].Get(), + D3D12_RESOURCE_STATE_RENDER_TARGET, + D3D12_RESOURCE_STATE_PRESENT)); - ThrowIfFailed(m_commandList->Close()); + ThrowIfFailed(m_commandList->Close()); } // Wait for pending GPU work to complete. -void DX12CudaInterop::WaitForGpu() { - // Schedule a Signal command in the queue. - ThrowIfFailed( - m_commandQueue->Signal(m_fence.Get(), m_fenceValues[m_frameIndex])); +void DX12CudaInterop::WaitForGpu() +{ + // Schedule a Signal command in the queue. + ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), m_fenceValues[m_frameIndex])); - // Wait until the fence has been processed. - ThrowIfFailed( - m_fence->SetEventOnCompletion(m_fenceValues[m_frameIndex], m_fenceEvent)); - WaitForSingleObjectEx(m_fenceEvent, INFINITE, FALSE); + // Wait until the fence has been processed. + ThrowIfFailed(m_fence->SetEventOnCompletion(m_fenceValues[m_frameIndex], m_fenceEvent)); + WaitForSingleObjectEx(m_fenceEvent, INFINITE, FALSE); - // Increment the fence value for the current frame. - m_fenceValues[m_frameIndex]++; + // Increment the fence value for the current frame. + m_fenceValues[m_frameIndex]++; } // Prepare to render the next frame. 
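A minimal sketch of the fence hand-off that MoveToNextFrame (reformatted below) performs each frame; extSem, stream, N, and animTime are placeholder names for the imported external semaphore, the CUDA stream, the fence value the D3D12 queue last signaled, and the animation time:

    cudaExternalSemaphoreWaitParams waitParams = {};
    waitParams.params.fence.value = N; // block the stream until the D3D12 queue reaches N
    checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSem, &waitParams, 1, stream));

    // Safe to overwrite the shared vertex buffer now; rendering of frame N is finished.
    RunSineWaveKernel(vertBufWidth, vertBufHeight, (Vertex *)m_cudaDevVertptr, stream, animTime);

    cudaExternalSemaphoreSignalParams signalParams = {};
    signalParams.params.fence.value = N + 1; // D3D12 waits for N+1 before touching the buffer again
    checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSem, &signalParams, 1, stream));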
-void DX12CudaInterop::MoveToNextFrame() { - const UINT64 currentFenceValue = m_fenceValues[m_frameIndex]; - cudaExternalSemaphoreWaitParams externalSemaphoreWaitParams; - memset(&externalSemaphoreWaitParams, 0, sizeof(externalSemaphoreWaitParams)); +void DX12CudaInterop::MoveToNextFrame() +{ + const UINT64 currentFenceValue = m_fenceValues[m_frameIndex]; + cudaExternalSemaphoreWaitParams externalSemaphoreWaitParams; + memset(&externalSemaphoreWaitParams, 0, sizeof(externalSemaphoreWaitParams)); - externalSemaphoreWaitParams.params.fence.value = currentFenceValue; - externalSemaphoreWaitParams.flags = 0; + externalSemaphoreWaitParams.params.fence.value = currentFenceValue; + externalSemaphoreWaitParams.flags = 0; - checkCudaErrors(cudaWaitExternalSemaphoresAsync( - &m_externalSemaphore, &externalSemaphoreWaitParams, 1, m_streamToRun)); + checkCudaErrors( + cudaWaitExternalSemaphoresAsync(&m_externalSemaphore, &externalSemaphoreWaitParams, 1, m_streamToRun)); - m_AnimTime += 0.01f; - RunSineWaveKernel(vertBufWidth, vertBufHeight, (Vertex *)m_cudaDevVertptr, - m_streamToRun, m_AnimTime); + m_AnimTime += 0.01f; + RunSineWaveKernel(vertBufWidth, vertBufHeight, (Vertex *)m_cudaDevVertptr, m_streamToRun, m_AnimTime); - cudaExternalSemaphoreSignalParams externalSemaphoreSignalParams; - memset(&externalSemaphoreSignalParams, 0, - sizeof(externalSemaphoreSignalParams)); - m_fenceValues[m_frameIndex] = currentFenceValue + 1; - externalSemaphoreSignalParams.params.fence.value = - m_fenceValues[m_frameIndex]; - externalSemaphoreSignalParams.flags = 0; + cudaExternalSemaphoreSignalParams externalSemaphoreSignalParams; + memset(&externalSemaphoreSignalParams, 0, sizeof(externalSemaphoreSignalParams)); + m_fenceValues[m_frameIndex] = currentFenceValue + 1; + externalSemaphoreSignalParams.params.fence.value = m_fenceValues[m_frameIndex]; + externalSemaphoreSignalParams.flags = 0; - checkCudaErrors(cudaSignalExternalSemaphoresAsync( - &m_externalSemaphore, &externalSemaphoreSignalParams, 1, m_streamToRun)); + checkCudaErrors( + cudaSignalExternalSemaphoresAsync(&m_externalSemaphore, &externalSemaphoreSignalParams, 1, m_streamToRun)); - // Update the frame index. - m_frameIndex = m_swapChain->GetCurrentBackBufferIndex(); + // Update the frame index. + m_frameIndex = m_swapChain->GetCurrentBackBufferIndex(); - // If the next frame is not ready to be rendered yet, wait until it is ready. - if (m_fence->GetCompletedValue() < m_fenceValues[m_frameIndex]) { - ThrowIfFailed(m_fence->SetEventOnCompletion(m_fenceValues[m_frameIndex], - m_fenceEvent)); - WaitForSingleObjectEx(m_fenceEvent, INFINITE, FALSE); - } + // If the next frame is not ready to be rendered yet, wait until it is ready. + if (m_fence->GetCompletedValue() < m_fenceValues[m_frameIndex]) { + ThrowIfFailed(m_fence->SetEventOnCompletion(m_fenceValues[m_frameIndex], m_fenceEvent)); + WaitForSingleObjectEx(m_fenceEvent, INFINITE, FALSE); + } - // Set the fence value for the next frame. - m_fenceValues[m_frameIndex] = currentFenceValue + 2; + // Set the fence value for the next frame. + m_fenceValues[m_frameIndex] = currentFenceValue + 2; } diff --git a/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.h b/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.h old mode 100755 new mode 100644 index f3c23bf1..411dae86 --- a/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.h +++ b/Samples/5_Domain_Specific/simpleD3D12/simpleD3D12.h @@ -39,43 +39,43 @@ using namespace DirectX; // the class method: OnDestroy(). 
using Microsoft::WRL::ComPtr; -static const char *shaderstr = - " struct PSInput \n" - " { \n" - " float4 position : SV_POSITION; \n" - " float4 color : COLOR; \n" - " } \n" - " PSInput VSMain(float3 position : POSITION, float4 color : COLOR) \n" - " { \n" - " PSInput result;\n" - " result.position = float4(position, 1.0f);\n" - " result.color = color;\n" - " return result; \n" - " } \n" - " float4 PSMain(PSInput input) : SV_TARGET \n" - " { \n" - " return input.color;\n" - " } \n"; +static const char *shaderstr = " struct PSInput \n" + " { \n" + " float4 position : SV_POSITION; \n" + " float4 color : COLOR; \n" + " } \n" + " PSInput VSMain(float3 position : POSITION, float4 color : COLOR) \n" + " { \n" + " PSInput result;\n" + " result.position = float4(position, 1.0f);\n" + " result.color = color;\n" + " return result; \n" + " } \n" + " float4 PSMain(PSInput input) : SV_TARGET \n" + " { \n" + " return input.color;\n" + " } \n"; -class DX12CudaInterop : public DX12CudaSample { - public: - DX12CudaInterop(UINT width, UINT height, std::string name); +class DX12CudaInterop : public DX12CudaSample +{ +public: + DX12CudaInterop(UINT width, UINT height, std::string name); - virtual void OnInit(); - virtual void OnRender(); - virtual void OnDestroy(); + virtual void OnInit(); + virtual void OnRender(); + virtual void OnDestroy(); - private: - // In this sample we overload the meaning of FrameCount to mean both the - // maximum number of frames that will be queued to the GPU at a time, as well - // as the number of back buffers in the DXGI swap chain. For the majority of - // applications, this is convenient and works well. However, there will be - // certain cases where an application may want to queue up more frames than - // there are back buffers available. It should be noted that excessive - // buffering of frames dependent on user input may result in noticeable - // latency in your app. - static const UINT FrameCount = 2; - std::string shadersSrc = shaderstr; +private: + // In this sample we overload the meaning of FrameCount to mean both the + // maximum number of frames that will be queued to the GPU at a time, as well + // as the number of back buffers in the DXGI swap chain. For the majority of + // applications, this is convenient and works well. However, there will be + // certain cases where an application may want to queue up more frames than + // there are back buffers available. It should be noted that excessive + // buffering of frames dependent on user input may result in noticeable + // latency in your app. + static const UINT FrameCount = 2; + std::string shadersSrc = shaderstr; #if 0 " struct PSInput \n" \ " { \n" \ @@ -95,48 +95,48 @@ class DX12CudaInterop : public DX12CudaSample { " } \n"; #endif - // Vertex Buffer dimension - size_t vertBufHeight, vertBufWidth; + // Vertex Buffer dimension + size_t vertBufHeight, vertBufWidth; - // Pipeline objects. - D3D12_VIEWPORT m_viewport; - CD3DX12_RECT m_scissorRect; - ComPtr m_swapChain; - ComPtr m_device; - ComPtr m_renderTargets[FrameCount]; - ComPtr m_commandAllocators[FrameCount]; - ComPtr m_commandQueue; - ComPtr m_rootSignature; - ComPtr m_rtvHeap; - ComPtr m_pipelineState; - ComPtr m_commandList; - UINT m_rtvDescriptorSize; + // Pipeline objects. 
+ D3D12_VIEWPORT m_viewport; + CD3DX12_RECT m_scissorRect; + ComPtr m_swapChain; + ComPtr m_device; + ComPtr m_renderTargets[FrameCount]; + ComPtr m_commandAllocators[FrameCount]; + ComPtr m_commandQueue; + ComPtr m_rootSignature; + ComPtr m_rtvHeap; + ComPtr m_pipelineState; + ComPtr m_commandList; + UINT m_rtvDescriptorSize; - // App resources. - ComPtr m_vertexBuffer; - D3D12_VERTEX_BUFFER_VIEW m_vertexBufferView; + // App resources. + ComPtr m_vertexBuffer; + D3D12_VERTEX_BUFFER_VIEW m_vertexBufferView; - // Synchronization objects. - UINT m_frameIndex; - HANDLE m_fenceEvent; - ComPtr m_fence; - UINT64 m_fenceValues[FrameCount]; + // Synchronization objects. + UINT m_frameIndex; + HANDLE m_fenceEvent; + ComPtr m_fence; + UINT64 m_fenceValues[FrameCount]; - // CUDA objects - cudaExternalMemoryHandleType m_externalMemoryHandleType; - cudaExternalMemory_t m_externalMemory; - cudaExternalSemaphore_t m_externalSemaphore; - cudaStream_t m_streamToRun; - LUID m_dx12deviceluid; - UINT m_cudaDeviceID; - UINT m_nodeMask; - float m_AnimTime; - void *m_cudaDevVertptr = NULL; + // CUDA objects + cudaExternalMemoryHandleType m_externalMemoryHandleType; + cudaExternalMemory_t m_externalMemory; + cudaExternalSemaphore_t m_externalSemaphore; + cudaStream_t m_streamToRun; + LUID m_dx12deviceluid; + UINT m_cudaDeviceID; + UINT m_nodeMask; + float m_AnimTime; + void *m_cudaDevVertptr = NULL; - void LoadPipeline(); - void InitCuda(); - void LoadAssets(); - void PopulateCommandList(); - void MoveToNextFrame(); - void WaitForGpu(); + void LoadPipeline(); + void InitCuda(); + void LoadAssets(); + void PopulateCommandList(); + void MoveToNextFrame(); + void WaitForGpu(); }; diff --git a/Samples/5_Domain_Specific/simpleD3D12/sinewave_cuda.cu b/Samples/5_Domain_Specific/simpleD3D12/sinewave_cuda.cu old mode 100755 new mode 100644 index 55a6ea7d..65151c0f --- a/Samples/5_Domain_Specific/simpleD3D12/sinewave_cuda.cu +++ b/Samples/5_Domain_Specific/simpleD3D12/sinewave_cuda.cu @@ -27,43 +27,45 @@ #include "ShaderStructs.h" -__global__ void sinewave_gen_kernel(Vertex *vertices, unsigned int width, - unsigned int height, float time) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void sinewave_gen_kernel(Vertex *vertices, unsigned int width, unsigned int height, float time) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - // calculate uv coordinates - float u = x / (float)width; - float v = y / (float)height; - u = u * 2.0f - 1.0f; - v = v * 2.0f - 1.0f; + // calculate uv coordinates + float u = x / (float)width; + float v = y / (float)height; + u = u * 2.0f - 1.0f; + v = v * 2.0f - 1.0f; - // calculate simple sine wave pattern - float freq = 4.0f; - float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; - if (y < height && x < width) { - // write output vertex - vertices[y * width + x].position.x = u; - vertices[y * width + x].position.y = w; - vertices[y * width + x].position.z = v; - // vertices[y*width+x].position[3] = 1.0f; - vertices[y * width + x].color.x = 1.0f; - vertices[y * width + x].color.y = 0.0f; - vertices[y * width + x].color.z = 0.0f; - vertices[y * width + x].color.w = 0.0f; - } + if (y < height && x < width) { + // write output vertex + vertices[y * width + x].position.x = u; + vertices[y * width + x].position.y 
= w; +        vertices[y * width + x].position.z = v; +        // vertices[y*width+x].position[3] = 1.0f; +        vertices[y * width + x].color.x = 1.0f; +        vertices[y * width + x].color.y = 0.0f; +        vertices[y * width + x].color.z = 0.0f; +        vertices[y * width + x].color.w = 0.0f; +    } } // The host CPU Sinewave thread spawner -void RunSineWaveKernel(size_t mesh_width, size_t mesh_height, - Vertex *cudaDevVertptr, cudaStream_t streamToRun, - float AnimTime) { - dim3 block(16, 16, 1); - dim3 grid(mesh_width / 16, mesh_height / 16, 1); - Vertex *vertices = (Vertex *)cudaDevVertptr; - sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(vertices, mesh_width, - mesh_height, AnimTime); +void RunSineWaveKernel(size_t       mesh_width, +                       size_t       mesh_height, +                       Vertex      *cudaDevVertptr, +                       cudaStream_t streamToRun, +                       float        AnimTime) +{ +    dim3    block(16, 16, 1); +    dim3    grid(mesh_width / 16, mesh_height / 16, 1); +    Vertex *vertices = (Vertex *)cudaDevVertptr; +    sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(vertices, mesh_width, mesh_height, AnimTime); - getLastCudaError("sinewave_gen_kernel execution failed.\n"); +    getLastCudaError("sinewave_gen_kernel execution failed.\n"); } diff --git a/Samples/5_Domain_Specific/simpleD3D12/stdafx.h b/Samples/5_Domain_Specific/simpleD3D12/stdafx.h old mode 100755 new mode 100644 index 210397af..32b194cb --- a/Samples/5_Domain_Specific/simpleD3D12/stdafx.h +++ b/Samples/5_Domain_Specific/simpleD3D12/stdafx.h @@ -32,17 +32,16 @@ #pragma once #ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers. +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers. #endif -#include - #include #include #include #include -#include "d3dx12.h" - #include -#include #include +#include +#include + +#include "d3dx12.h" diff --git a/Samples/5_Domain_Specific/simpleGL/README.md b/Samples/5_Domain_Specific/simpleGL/README.md index 0019a36b..c60e9e12 100644 --- a/Samples/5_Domain_Specific/simpleGL/README.md +++ b/Samples/5_Domain_Specific/simpleGL/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
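The simpleGL.cu reformat below leaves the sample's CUDA/OpenGL interop cycle unchanged; as a minimal sketch of that cycle (vbo is an existing GL buffer object, grid and block as computed in launch_kernel):

    struct cudaGraphicsResource *res = NULL;
    // register once, after the VBO is created with glBufferData
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&res, vbo, cudaGraphicsMapFlagsWriteDiscard));

    // per frame: map, fetch the device pointer, launch, unmap
    checkCudaErrors(cudaGraphicsMapResources(1, &res, 0));
    float4 *dptr      = NULL;
    size_t  num_bytes = 0;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, res));
    simple_vbo_kernel<<<grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); // CUDA writes the vertices
    checkCudaErrors(cudaGraphicsUnmapResources(1, &res, 0)); // hand the VBO back to OpenGL for drawing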
## References (for more details) - diff --git a/Samples/5_Domain_Specific/simpleGL/simpleGL.cu b/Samples/5_Domain_Specific/simpleGL/simpleGL.cu index 7ba2c9b1..b86e8763 100644 --- a/Samples/5_Domain_Specific/simpleGL/simpleGL.cu +++ b/Samples/5_Domain_Specific/simpleGL/simpleGL.cu @@ -41,81 +41,80 @@ */ // includes, system -#include -#include -#include #include +#include +#include +#include #ifdef _WIN32 -# define WINDOWS_LEAN_AND_MEAN -# define NOMINMAX -# include +#define WINDOWS_LEAN_AND_MEAN +#define NOMINMAX +#include #endif // OpenGL Graphics includes #include -#if defined (__APPLE__) || defined(MACOSX) - #pragma clang diagnostic ignored "-Wdeprecated-declarations" - #include - #ifndef glutCloseFunc - #define glutCloseFunc glutWMCloseFunc - #endif +#if defined(__APPLE__) || defined(MACOSX) +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#include +#ifndef glutCloseFunc +#define glutCloseFunc glutWMCloseFunc +#endif #else #include #endif // includes, cuda -#include #include +#include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check - +#include // helper functions for CUDA error check #include #define MAX_EPSILON_ERROR 10.0f -#define THRESHOLD 0.30f -#define REFRESH_DELAY 10 //ms +#define THRESHOLD 0.30f +#define REFRESH_DELAY 10 // ms //////////////////////////////////////////////////////////////////////////////// // constants const unsigned int window_width = 512; const unsigned int window_height = 512; -const unsigned int mesh_width = 256; -const unsigned int mesh_height = 256; +const unsigned int mesh_width = 256; +const unsigned int mesh_height = 256; // vbo variables -GLuint vbo; +GLuint vbo; struct cudaGraphicsResource *cuda_vbo_resource; -void *d_vbo_buffer = NULL; +void *d_vbo_buffer = NULL; float g_fAnim = 0.0; // mouse controls -int mouse_old_x, mouse_old_y; -int mouse_buttons = 0; +int mouse_old_x, mouse_old_y; +int mouse_buttons = 0; float rotate_x = 0.0, rotate_y = 0.0; float translate_z = -3.0; StopWatchInterface *timer = NULL; // Auto-Verification Code -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -float avgFPS = 0.0f; -unsigned int frameCount = 0; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +float avgFPS = 0.0f; +unsigned int frameCount = 0; unsigned int g_TotalErrors = 0; -bool g_bQAReadback = false; +bool g_bQAReadback = false; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; -#define MAX(a,b) ((a > b) ? a : b) +#define MAX(a, b) ((a > b) ? 
a : b) //////////////////////////////////////////////////////////////////////////////// // declaration, forward @@ -124,8 +123,7 @@ void cleanup(); // GL functionality bool initGL(int *argc, char **argv); -void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res, - unsigned int vbo_res_flags); +void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res, unsigned int vbo_res_flags); void deleteVBO(GLuint *vbo, struct cudaGraphicsResource *vbo_res); // rendering callbacks @@ -148,31 +146,30 @@ const char *sSDKsample = "simpleGL (VBO)"; /////////////////////////////////////////////////////////////////////////////// __global__ void simple_vbo_kernel(float4 *pos, unsigned int width, unsigned int height, float time) { - unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; +    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; +    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; // calculate uv coordinates - float u = x / (float) width; - float v = y / (float) height; - u = u*2.0f - 1.0f; - v = v*2.0f - 1.0f; +    float u = x / (float)width; +    float v = y / (float)height; +    u       = u * 2.0f - 1.0f; +    v       = v * 2.0f - 1.0f; // calculate simple sine wave pattern float freq = 4.0f; - float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f; +    float w    = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; // write output vertex - pos[y*width+x] = make_float4(u, w, v, 1.0f); +    pos[y * width + x] = make_float4(u, w, v, 1.0f); } -void launch_kernel(float4 *pos, unsigned int mesh_width, - unsigned int mesh_height, float time) +void launch_kernel(float4 *pos, unsigned int mesh_width, unsigned int mesh_height, float time) { // execute the kernel dim3 block(8, 8, 1); dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); - simple_vbo_kernel<<< grid, block>>>(pos, mesh_width, mesh_height, time); +    simple_vbo_kernel<<<grid, block>>>(pos, mesh_width, mesh_height, time); } @@ -187,15 +184,13 @@ int main(int argc, char **argv) pArgv = argv; #if defined(__linux__) - setenv ("DISPLAY", ":0", 0); +    setenv("DISPLAY", ":0", 0); #endif printf("%s starting...\n", sSDKsample); - if (argc > 1) - { - if (checkCmdLineFlag(argc, (const char **)argv, "file")) - { +    if (argc > 1) { +        if (checkCmdLineFlag(argc, (const char **)argv, "file")) { // In this mode, we are running non-OpenGL and doing a compare of the VBO was generated correctly getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); } @@ -214,9 +209,8 @@ void computeFPS() frameCount++; fpsCount++; - if (fpsCount == fpsLimit) - { - avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); +    if (fpsCount == fpsLimit) { +        avgFPS   = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); fpsCount = 0; fpsLimit = (int)MAX(avgFPS, 1.f); @@ -240,11 +234,10 @@ bool initGL(int *argc, char **argv) glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMotionFunc(motion); - glutTimerFunc(REFRESH_DELAY, timerEvent,0); +    glutTimerFunc(REFRESH_DELAY, timerEvent, 0); // initialize necessary OpenGL extensions - if (!
isGLVersionSupported(2,0)) - { + if (!isGLVersionSupported(2, 0)) { fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); fflush(stderr); return false; @@ -260,7 +253,7 @@ bool initGL(int *argc, char **argv) // projection glMatrixMode(GL_PROJECTION); glLoadIdentity(); - gluPerspective(60.0, (GLfloat)window_width / (GLfloat) window_height, 0.1, 10.0); + gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1, 10.0); SDK_CHECK_ERROR_GL(); @@ -280,10 +273,9 @@ bool runTest(int argc, char **argv, char *ref_file) int devID = findCudaDevice(argc, (const char **)argv); // command line mode only - if (ref_file != NULL) - { + if (ref_file != NULL) { // create VBO - checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width*mesh_height*4*sizeof(float))); + checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width * mesh_height * 4 * sizeof(float))); // run the cuda part runAutoTest(devID, argv, ref_file); @@ -294,12 +286,10 @@ bool runTest(int argc, char **argv, char *ref_file) cudaFree(d_vbo_buffer); d_vbo_buffer = NULL; } - else - { + else { // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. - if (false == initGL(&argc, argv)) - { + if (false == initGL(&argc, argv)) { return false; } @@ -308,7 +298,7 @@ bool runTest(int argc, char **argv, char *ref_file) glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); -#if defined (__APPLE__) || defined(MACOSX) +#if defined(__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); @@ -336,9 +326,8 @@ void runCuda(struct cudaGraphicsResource **vbo_resource) float4 *dptr; checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0)); size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, - *vbo_resource)); - //printf("CUDA mapped VBO: May access %ld bytes\n", num_bytes); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, *vbo_resource)); + // printf("CUDA mapped VBO: May access %ld bytes\n", num_bytes); // execute the kernel // dim3 block(8, 8, 1); @@ -353,11 +342,11 @@ void runCuda(struct cudaGraphicsResource **vbo_resource) #ifdef _WIN32 #ifndef FOPEN -#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) #endif #else #ifndef FOPEN -#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) #endif #endif @@ -377,7 +366,7 @@ void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) void runAutoTest(int devID, char **argv, char *ref_file) { char *reference_file = NULL; - void *imageData = malloc(mesh_width*mesh_height*sizeof(float)); + void *imageData = malloc(mesh_width * mesh_height * sizeof(float)); // execute the kernel launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim); @@ -385,16 +374,19 @@ void runAutoTest(int devID, char **argv, char *ref_file) cudaDeviceSynchronize(); getLastCudaError("launch_kernel failed"); - checkCudaErrors(cudaMemcpy(imageData, d_vbo_buffer, mesh_width*mesh_height*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float), cudaMemcpyDeviceToHost)); - sdkDumpBin2(imageData, mesh_width*mesh_height*sizeof(float), "simpleGL.bin"); + sdkDumpBin2(imageData, mesh_width * mesh_height 
* sizeof(float), "simpleGL.bin"); reference_file = sdkFindFilePath(ref_file, argv[0]); - if (reference_file && - !sdkCompareBin2BinFloat("simpleGL.bin", reference_file, - mesh_width*mesh_height*sizeof(float), - MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) - { + if (reference_file + && !sdkCompareBin2BinFloat("simpleGL.bin", + reference_file, + mesh_width * mesh_height * sizeof(float), + MAX_EPSILON_ERROR, + THRESHOLD, + pArgv[0])) { g_TotalErrors++; } } @@ -402,8 +394,7 @@ void runAutoTest(int devID, char **argv, char *ref_file) //////////////////////////////////////////////////////////////////////////////// //! Create VBO //////////////////////////////////////////////////////////////////////////////// -void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res, - unsigned int vbo_res_flags) +void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res, unsigned int vbo_res_flags) { assert(vbo); @@ -476,10 +467,9 @@ void display() void timerEvent(int value) { - if (glutGetWindow()) - { + if (glutGetWindow()) { glutPostRedisplay(); - glutTimerFunc(REFRESH_DELAY, timerEvent,0); + glutTimerFunc(REFRESH_DELAY, timerEvent, 0); } } @@ -487,8 +477,7 @@ void cleanup() { sdkDeleteTimer(&timer); - if (vbo) - { + if (vbo) { deleteVBO(&vbo, cuda_vbo_resource); } } @@ -499,15 +488,14 @@ void cleanup() //////////////////////////////////////////////////////////////////////////////// void keyboard(unsigned char key, int /*x*/, int /*y*/) { - switch (key) - { - case (27) : - #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); - #else - glutDestroyWindow(glutGetWindow()); - return; - #endif + switch (key) { + case (27): +#if defined(__APPLE__) || defined(MACOSX) + exit(EXIT_SUCCESS); +#else + glutDestroyWindow(glutGetWindow()); + return; +#endif } } @@ -516,12 +504,10 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) //////////////////////////////////////////////////////////////////////////////// void mouse(int button, int state, int x, int y) { - if (state == GLUT_DOWN) - { - mouse_buttons |= 1<("./data/regression.dat", - data, mesh_width * mesh_height * 3, 0.0, false); + sdkWriteFile("./data/regression.dat", data, mesh_width * mesh_height * 3, 0.0, false); } // unmap GL buffer object - if (!glUnmapBuffer(GL_ARRAY_BUFFER)) - { + if (!glUnmapBuffer(GL_ARRAY_BUFFER)) { fprintf(stderr, "Unmap buffer failed.\n"); fflush(stderr); } - checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, - cudaGraphicsMapFlagsWriteDiscard)); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); SDK_CHECK_ERROR_GL(); } diff --git a/Samples/5_Domain_Specific/simpleVulkan/Build_instructions.txt b/Samples/5_Domain_Specific/simpleVulkan/Build_instructions.txt index dc71e3d4..c0d50454 100644 --- a/Samples/5_Domain_Specific/simpleVulkan/Build_instructions.txt +++ b/Samples/5_Domain_Specific/simpleVulkan/Build_instructions.txt @@ -21,7 +21,7 @@ For Linux: For Linux aarch64(L4T): --- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 +-- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 -- install above will also provide libvulkan-dev as dependencies -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=', VULKAN_SDK_PATH in this scenario is typically "/usr" diff --git a/Samples/5_Domain_Specific/simpleVulkan/README.md b/Samples/5_Domain_Specific/simpleVulkan/README.md index d664cdd1..9f488311 
100644
--- a/Samples/5_Domain_Specific/simpleVulkan/README.md
+++ b/Samples/5_Domain_Specific/simpleVulkan/README.md
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.
 
 ## References (for more details)
-
diff --git a/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu b/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu
index 26a8f03c..7c6a7737 100644
--- a/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu
+++ b/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu
@@ -25,110 +25,114 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "SineWaveSimulation.h"
 #include <algorithm>
 #include <helper_cuda.h>
 
-__global__ void sinewave(float *heightMap, unsigned int width,
-                         unsigned int height, float time) {
-  const float freq = 4.0f;
-  const size_t stride = gridDim.x * blockDim.x;
+#include "SineWaveSimulation.h"
 
-  // Iterate through the entire array in a way that is
-  // independent of the grid configuration
-  for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height;
-       tid += stride) {
-    // Calculate the x, y coordinates
-    const size_t y = tid / width;
-    const size_t x = tid - y * width;
-    // Normalize x, y to [0,1]
-    const float u = ((2.0f * x) / width) - 1.0f;
-    const float v = ((2.0f * y) / height) - 1.0f;
-    // Calculate the new height value
-    const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time);
-    // Store this new height value
-    heightMap[tid] = w;
-  }
+__global__ void sinewave(float *heightMap, unsigned int width, unsigned int height, float time)
+{
+    const float  freq   = 4.0f;
+    const size_t stride = gridDim.x * blockDim.x;
+
+    // Iterate through the entire array in a way that is
+    // independent of the grid configuration
+    for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < width * height; tid += stride) {
+        // Calculate the x, y coordinates
+        const size_t y = tid / width;
+        const size_t x = tid - y * width;
+        // Normalize x, y to [0,1]
+        const float u = ((2.0f * x) / width) - 1.0f;
+        const float v = ((2.0f * y) / height) - 1.0f;
+        // Calculate the new height value
+        const float w = 0.5f * sinf(u * freq + time) * cosf(v * freq + time);
+        // Store this new height value
+        heightMap[tid] = w;
+    }
 }
 
 SineWaveSimulation::SineWaveSimulation(size_t width, size_t height)
-    : m_heightMap(nullptr), m_width(width), m_height(height) {}
-
-void SineWaveSimulation::initCudaLaunchConfig(int device) {
-  cudaDeviceProp prop = {};
-  checkCudaErrors(cudaSetDevice(device));
-  checkCudaErrors(cudaGetDeviceProperties(&prop, device));
-
-  // We don't need large block sizes, since there's not much inter-thread
-  // communication
-  m_threads = prop.warpSize;
-
-  // Use the occupancy calculator and fill the gpu as best as we can
-  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &m_blocks, sinewave, prop.warpSize, 0));
-  m_blocks *= prop.multiProcessorCount;
-
-  // Go ahead and the clamp the blocks to the minimum needed for this
-  // height/width
-  m_blocks = std::min(m_blocks,
-                      (int)((m_width * m_height + m_threads - 1) / m_threads));
+    : m_heightMap(nullptr)
+    , m_width(width)
+    , m_height(height)
+{
 }
 
-int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) {
-  int current_device = 0;
-  int device_count = 0;
-  int devices_prohibited = 0;
+void SineWaveSimulation::initCudaLaunchConfig(int device)
+{
+    cudaDeviceProp prop = {};
+    checkCudaErrors(cudaSetDevice(device));
+    checkCudaErrors(cudaGetDeviceProperties(&prop, device));
-  cudaDeviceProp deviceProp;
-  checkCudaErrors(cudaGetDeviceCount(&device_count));
+    // We don't need large block sizes, since there's not much inter-thread
+    // communication
+    m_threads = prop.warpSize;
-  if (device_count == 0) {
-    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
-    exit(EXIT_FAILURE);
-  }
+    // Use the occupancy calculator and fill the gpu as best as we can
+    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, sinewave, prop.warpSize, 0));
+    m_blocks *= prop.multiProcessorCount;
-  // Find the GPU which is selected by Vulkan
-  while (current_device < device_count) {
-    cudaGetDeviceProperties(&deviceProp, current_device);
+    // Go ahead and clamp the blocks to the minimum needed for this
+    // height/width
+    m_blocks = std::min(m_blocks, (int)((m_width * m_height + m_threads - 1) / m_threads));
+}
-    if ((deviceProp.computeMode != cudaComputeModeProhibited)) {
-      // Compare the cuda device UUID with vulkan UUID
-      int ret = memcmp((void *)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE);
-      if (ret == 0) {
-        checkCudaErrors(cudaSetDevice(current_device));
-        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
-        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-               current_device, deviceProp.name, deviceProp.major,
-               deviceProp.minor);
+int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE)
+{
+    int current_device     = 0;
+    int device_count       = 0;
+    int devices_prohibited = 0;
-        return current_device;
-      }
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
-    } else {
-      devices_prohibited++;
+    if (device_count == 0) {
+        fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
     }
-    current_device++;
-  }
+    // Find the GPU which is selected by Vulkan
+    while (current_device < device_count) {
+        cudaGetDeviceProperties(&deviceProp, current_device);
-  if (devices_prohibited == device_count) {
-    fprintf(stderr,
-            "CUDA error:"
-            " No Vulkan-CUDA Interop capable GPU found.\n");
-    exit(EXIT_FAILURE);
-  }
+        if ((deviceProp.computeMode != cudaComputeModeProhibited)) {
+            // Compare the cuda device UUID with vulkan UUID
+            int ret = memcmp((void *)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE);
+            if (ret == 0) {
+                checkCudaErrors(cudaSetDevice(current_device));
+                checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
+                printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+                       current_device,
+                       deviceProp.name,
+                       deviceProp.major,
+                       deviceProp.minor);
-  return -1;
+                return current_device;
+            }
+        }
+        else {
+            devices_prohibited++;
+        }
+
+        current_device++;
+    }
+
+    if (devices_prohibited == device_count) {
+        fprintf(stderr,
+                "CUDA error:"
+                " No Vulkan-CUDA Interop capable GPU found.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    return -1;
 }
 
 SineWaveSimulation::~SineWaveSimulation() { m_heightMap = NULL; }
 
-void SineWaveSimulation::initSimulation(float *heights) {
-  m_heightMap = heights;
-}
+void SineWaveSimulation::initSimulation(float *heights) { m_heightMap = heights; }
 
-void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream) {
-  sinewave<<<m_blocks, m_threads, 0, stream>>>(m_heightMap, m_width, m_height,
-                                               time);
-  getLastCudaError("Failed to launch CUDA simulation");
+void SineWaveSimulation::stepSimulation(float time, cudaStream_t stream)
+{
+    sinewave<<<m_blocks, m_threads, 0, stream>>>(m_heightMap, m_width, m_height, time);
+    getLastCudaError("Failed to launch CUDA simulation");
 }
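[Editor's note: the SineWaveSimulation.cu hunk above is formatting-only, but the code it reflows pairs two techniques that are easy to miss in diff form: a grid-stride loop, which makes the kernel correct for any grid size, and an occupancy-derived launch configuration, which sizes that grid from the device rather than from the data. Below is a minimal self-contained sketch of the same pattern; it is illustrative and not part of the patch. The kernel name fill_wave, the 256x256 element count, and device 0 are hypothetical choices.]

    // Sketch only: the same grid-stride + occupancy recipe as the sample above.
    #include <algorithm>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Grid-stride loop: each thread advances by the total number of launched
    // threads, so any launch configuration covers all n elements.
    __global__ void fill_wave(float *out, size_t n, float time)
    {
        const size_t stride = (size_t)gridDim.x * blockDim.x;
        for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
            out[i] = 0.5f * sinf(0.01f * (float)i + time);
        }
    }

    int main()
    {
        const size_t n     = 256 * 256; // hypothetical problem size
        float       *d_out = nullptr;
        if (cudaMalloc(&d_out, n * sizeof(float)) != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed\n");
            return 1;
        }

        cudaDeviceProp prop = {};
        cudaGetDeviceProperties(&prop, 0);

        // Same recipe as initCudaLaunchConfig() above: warp-sized blocks, the
        // occupancy calculator scaled by the SM count, then a clamp to the
        // minimum number of blocks the problem actually needs.
        int threads     = prop.warpSize;
        int blocksPerSm = 0;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, fill_wave, threads, 0);
        int blocks = blocksPerSm * prop.multiProcessorCount;
        blocks     = std::min(blocks, (int)((n + threads - 1) / threads));

        fill_wave<<<blocks, threads>>>(d_out, n, 0.0f);
        cudaDeviceSynchronize();
        cudaFree(d_out);
        return 0;
    }

[Without the final clamp the launch would still be correct, just wasteful: blocks past ceil(n / threads) fail the loop condition immediately and retire without doing any work.]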
diff --git a/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.h b/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.h
index ba962bff..0d5039f5 100644
--- a/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.h
+++ b/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.h
@@ -29,26 +29,28 @@
 #ifndef __SINESIM_H__
 #define __SINESIM_H__
 
-#include
 #include <cuda_runtime_api.h>
 #include <stdint.h>
+#include <vector>
+
+#include "linmath.h"
 
-class SineWaveSimulation {
-  float *m_heightMap;
-  size_t m_width, m_height;
-  int m_blocks, m_threads;
+class SineWaveSimulation
+{
+    float *m_heightMap;
+    size_t m_width, m_height;
+    int    m_blocks, m_threads;
 
- public:
-  SineWaveSimulation(size_t width, size_t height);
-  ~SineWaveSimulation();
-  void initSimulation(float *heightMap);
-  void stepSimulation(float time, cudaStream_t stream = 0);
-  void initCudaLaunchConfig(int device);
-  int initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE);
+public:
+    SineWaveSimulation(size_t width, size_t height);
+    ~SineWaveSimulation();
+    void   initSimulation(float *heightMap);
+    void   stepSimulation(float time, cudaStream_t stream = 0);
+    void   initCudaLaunchConfig(int device);
+    int    initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE);
 
-  size_t getWidth() const { return m_width; }
-  size_t getHeight() const { return m_height; }
+    size_t getWidth() const { return m_width; }
+    size_t getHeight() const { return m_height; }
 };
 
-#endif  // __SINESIM_H__
+#endif // __SINESIM_H__
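[Editor's note: the header above is the whole public surface of SineWaveSimulation, so a one-glance usage sketch may help reviewers. This is an assumption-laden illustration, not code from the patch: vkUUID, uuidSize, mappedHeights, and stream are stand-ins for what the Vulkan side of the sample supplies, namely the physical-device UUID (VK_UUID_SIZE bytes) and a CUDA-visible mapping of the shared height buffer.]

    // Sketch only: intended call order for the class declared above.
    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    #include "SineWaveSimulation.h"

    void runSineWave(uint8_t *vkUUID, size_t uuidSize, float *mappedHeights, cudaStream_t stream)
    {
        SineWaveSimulation sim(256, 256); // hypothetical grid dimensions

        // Select the CUDA device whose UUID matches the Vulkan physical
        // device, so both APIs drive the same GPU.
        int device = sim.initCuda(vkUUID, uuidSize);
        if (device < 0) {
            fprintf(stderr, "No CUDA device matches the Vulkan device UUID\n");
            return;
        }

        // Derive the block/grid sizes from the occupancy calculator.
        sim.initCudaLaunchConfig(device);

        // Point the simulation at the externally allocated height map.
        sim.initSimulation(mappedHeights);

        // One kernel launch per animation step, on the caller's stream.
        for (int step = 0; step < 100; ++step) {
            sim.stepSimulation(0.01f * (float)step, stream);
        }
    }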
diff --git a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp
index d6ed98b1..a5436c75 100644
--- a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp
+++ b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.cpp
@@ -29,7 +29,9 @@
  * This file contains basic cross-platform setup paths in working with Vulkan
  * and rendering window. It is largely based off of tutorials provided here:
  * https://vulkan-tutorial.com/
-*/
+ */
+
+#include "VulkanBaseApp.h"
 
 #include
 #include
@@ -40,1886 +42,1839 @@
 #include
 #include
-
-#include "VulkanBaseApp.h"
-
 #define GLFW_INCLUDE_VULKAN
 #define GLM_FORCE_DEPTH_ZERO_TO_ONE
 #include <GLFW/glfw3.h>
 
 #ifdef _WIN64
 #include
-#include
 #include
+#include
 #endif /* _WIN64 */
 
 #ifndef countof
 #define countof(x) (sizeof(x) / sizeof(*(x)))
 #endif
 
-static const char *validationLayers[] = {"VK_LAYER_KHRONOS_validation"};
+static const char *validationLayers[]     = {"VK_LAYER_KHRONOS_validation"};
 static const size_t MAX_FRAMES_IN_FLIGHT = 5;
 
-void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) {
-  VulkanBaseApp *app =
-      reinterpret_cast<VulkanBaseApp *>(glfwGetWindowUserPointer(window));
-  app->m_framebufferResized = true;
+void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height)
+{
+    VulkanBaseApp *app        = reinterpret_cast<VulkanBaseApp *>(glfwGetWindowUserPointer(window));
+    app->m_framebufferResized = true;
 }
 
-static VKAPI_ATTR VkBool32 VKAPI_CALL
-debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
-              VkDebugUtilsMessageTypeFlagsEXT messageType,
-              const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData,
-              void *pUserData) {
-  std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl;
+static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
+                                                    VkDebugUtilsMessageTypeFlagsEXT messageType,
+                                                    const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData,
+                                                    void *pUserData)
+{
+    std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl;
 
-  return VK_FALSE;
+    return VK_FALSE;
 }
 
 VulkanBaseApp::VulkanBaseApp(const std::string &appName, bool enableValidation)
-    : m_appName(appName),
-      m_enableValidation(enableValidation),
-      m_instance(VK_NULL_HANDLE),
-      m_window(nullptr),
-      m_debugMessenger(VK_NULL_HANDLE),
-      m_surface(VK_NULL_HANDLE),
-      m_physicalDevice(VK_NULL_HANDLE),
-      m_device(VK_NULL_HANDLE),
-      m_graphicsQueue(VK_NULL_HANDLE),
-      m_presentQueue(VK_NULL_HANDLE),
-      m_swapChain(VK_NULL_HANDLE),
-      m_vkDeviceUUID(),
-      m_swapChainImages(),
-      m_swapChainFormat(),
-      m_swapChainExtent(),
-      m_swapChainImageViews(),
-      m_shaderFiles(),
-      m_renderPass(),
-      m_pipelineLayout(VK_NULL_HANDLE),
-      m_graphicsPipeline(VK_NULL_HANDLE),
-      m_swapChainFramebuffers(),
-      m_commandPool(VK_NULL_HANDLE),
-      m_commandBuffers(),
-      m_imageAvailableSemaphores(),
-      m_renderFinishedSemaphores(),
-      m_inFlightFences(),
-      m_uniformBuffers(),
-      m_uniformMemory(),
-      m_descriptorSetLayout(VK_NULL_HANDLE),
-      m_descriptorPool(VK_NULL_HANDLE),
-      m_descriptorSets(),
-      m_depthImage(VK_NULL_HANDLE),
-      m_depthImageMemory(VK_NULL_HANDLE),
-      m_depthImageView(VK_NULL_HANDLE),
-      m_currentFrame(0),
-      m_framebufferResized(false) {}
+    : m_appName(appName)
+    , m_enableValidation(enableValidation)
+    , m_instance(VK_NULL_HANDLE)
+    , m_window(nullptr)
+    , m_debugMessenger(VK_NULL_HANDLE)
+    , m_surface(VK_NULL_HANDLE)
+    , m_physicalDevice(VK_NULL_HANDLE)
+    , m_device(VK_NULL_HANDLE)
+    , m_graphicsQueue(VK_NULL_HANDLE)
+    , m_presentQueue(VK_NULL_HANDLE)
+    , m_swapChain(VK_NULL_HANDLE)
+    , m_vkDeviceUUID()
+    , m_swapChainImages()
+    , m_swapChainFormat()
+    , m_swapChainExtent()
+    , m_swapChainImageViews()
+    , m_shaderFiles()
+    , m_renderPass()
+    , m_pipelineLayout(VK_NULL_HANDLE)
+    , m_graphicsPipeline(VK_NULL_HANDLE)
+    , m_swapChainFramebuffers()
+    , m_commandPool(VK_NULL_HANDLE)
+    , m_commandBuffers()
+    , m_imageAvailableSemaphores()
+    , m_renderFinishedSemaphores()
+    ,
m_inFlightFences() + , m_uniformBuffers() + , m_uniformMemory() + , m_descriptorSetLayout(VK_NULL_HANDLE) + , m_descriptorPool(VK_NULL_HANDLE) + , m_descriptorSets() + , m_depthImage(VK_NULL_HANDLE) + , m_depthImageMemory(VK_NULL_HANDLE) + , m_depthImageView(VK_NULL_HANDLE) + , m_currentFrame(0) + , m_framebufferResized(false) +{ +} -VkExternalSemaphoreHandleTypeFlagBits -VulkanBaseApp::getDefaultSemaphoreHandleType() { +VkExternalSemaphoreHandleTypeFlagBits VulkanBaseApp::getDefaultSemaphoreHandleType() +{ #ifdef _WIN64 - return IsWindows8OrGreater() - ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; #endif /* _WIN64 */ } -VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() { +VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() +{ #ifdef _WIN64 - return IsWindows8Point1OrGreater() - ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8Point1OrGreater() ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; #endif /* _WIN64 */ } -VulkanBaseApp::~VulkanBaseApp() { - cleanupSwapChain(); +VulkanBaseApp::~VulkanBaseApp() +{ + cleanupSwapChain(); - if (m_descriptorSetLayout != VK_NULL_HANDLE) { - vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); - } + if (m_descriptorSetLayout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); + } #ifdef _VK_TIMELINE_SEMAPHORE - if (m_vkPresentationSemaphore != VK_NULL_HANDLE) { - vkDestroySemaphore(m_device, m_vkPresentationSemaphore, nullptr); - } + if (m_vkPresentationSemaphore != VK_NULL_HANDLE) { + vkDestroySemaphore(m_device, m_vkPresentationSemaphore, nullptr); + } #endif /* _VK_TIMELINE_SEMAPHORE */ - for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { - vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); - vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); - vkDestroyFence(m_device, m_inFlightFences[i], nullptr); - } - if (m_commandPool != VK_NULL_HANDLE) { - vkDestroyCommandPool(m_device, m_commandPool, nullptr); - } - - if (m_device != VK_NULL_HANDLE) { - vkDestroyDevice(m_device, nullptr); - } - - if (m_enableValidation) { - PFN_vkDestroyDebugUtilsMessengerEXT func = - (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr( - m_instance, "vkDestroyDebugUtilsMessengerEXT"); - if (func != nullptr) { - func(m_instance, m_debugMessenger, nullptr); + for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { + vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); + vkDestroyFence(m_device, m_inFlightFences[i], nullptr); + } + if (m_commandPool != VK_NULL_HANDLE) { + vkDestroyCommandPool(m_device, m_commandPool, nullptr); } - } - if (m_surface != VK_NULL_HANDLE) { - vkDestroySurfaceKHR(m_instance, m_surface, nullptr); - } + if (m_device != VK_NULL_HANDLE) { + vkDestroyDevice(m_device, nullptr); + } - if (m_instance != VK_NULL_HANDLE) { - 
vkDestroyInstance(m_instance, nullptr); - } + if (m_enableValidation) { + PFN_vkDestroyDebugUtilsMessengerEXT func = + (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func != nullptr) { + func(m_instance, m_debugMessenger, nullptr); + } + } - if (m_window) { - glfwDestroyWindow(m_window); - } + if (m_surface != VK_NULL_HANDLE) { + vkDestroySurfaceKHR(m_instance, m_surface, nullptr); + } - glfwTerminate(); + if (m_instance != VK_NULL_HANDLE) { + vkDestroyInstance(m_instance, nullptr); + } + + if (m_window) { + glfwDestroyWindow(m_window); + } + + glfwTerminate(); } -void VulkanBaseApp::init() { - initWindow(); - initVulkan(); +void VulkanBaseApp::init() +{ + initWindow(); + initVulkan(); } -VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() { - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandPool = m_commandPool; - allocInfo.commandBufferCount = 1; +VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() +{ + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandPool = m_commandPool; + allocInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer; - vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer); + VkCommandBuffer commandBuffer; + vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer); - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - vkBeginCommandBuffer(commandBuffer, &beginInfo); + vkBeginCommandBuffer(commandBuffer, &beginInfo); - return commandBuffer; + return commandBuffer; } -void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) { - vkEndCommandBuffer(commandBuffer); +void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) +{ + vkEndCommandBuffer(commandBuffer); - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffer; - vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); - vkQueueWaitIdle(m_graphicsQueue); + vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); + vkQueueWaitIdle(m_graphicsQueue); - vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); + vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); } -void VulkanBaseApp::initWindow() { - glfwInit(); +void VulkanBaseApp::initWindow() +{ + glfwInit(); - glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); - glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); - m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); - glfwSetWindowUserPointer(m_window, this); - glfwSetFramebufferSizeCallback(m_window, resizeCallback); + m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr); + 
glfwSetWindowUserPointer(m_window, this); + glfwSetFramebufferSizeCallback(m_window, resizeCallback); } -std::vector VulkanBaseApp::getRequiredExtensions() const { - return std::vector(); -} +std::vector VulkanBaseApp::getRequiredExtensions() const { return std::vector(); } -std::vector VulkanBaseApp::getRequiredDeviceExtensions() const { - return std::vector(); -} +std::vector VulkanBaseApp::getRequiredDeviceExtensions() const { return std::vector(); } -void VulkanBaseApp::initVulkan() { - createInstance(); - createSurface(); - createDevice(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createDescriptorSetLayout(); - createGraphicsPipeline(); - createCommandPool(); - createDepthResources(); - createFramebuffers(); - initVulkanApp(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); - createSyncObjects(); +void VulkanBaseApp::initVulkan() +{ + createInstance(); + createSurface(); + createDevice(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createCommandPool(); + createDepthResources(); + createFramebuffers(); + initVulkanApp(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); } #ifdef _WIN64 -class WindowsSecurityAttributes { - protected: - SECURITY_ATTRIBUTES m_winSecurityAttributes; - PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; +class WindowsSecurityAttributes +{ +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; - public: - WindowsSecurityAttributes(); - SECURITY_ATTRIBUTES *operator&(); - ~WindowsSecurityAttributes(); +public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); }; -WindowsSecurityAttributes::WindowsSecurityAttributes() { - m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( - 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); - if (!m_winPSecurityDescriptor) { - throw std::runtime_error( - "Failed to allocate memory for security descriptor"); - } +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + if (!m_winPSecurityDescriptor) { + throw std::runtime_error("Failed to allocate memory for security descriptor"); + } - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + - SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - InitializeSecurityDescriptor(m_winPSecurityDescriptor, - SECURITY_DESCRIPTOR_REVISION); + InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = - SECURITY_WORLD_SID_AUTHORITY; - AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, - 0, 0, 0, 0, 0, ppSID); + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = - STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - 
explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; } -SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { - return &m_winSecurityAttributes; -} +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { return &m_winSecurityAttributes; } -WindowsSecurityAttributes::~WindowsSecurityAttributes() { - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + - SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); } #endif /* _WIN64 */ -static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, +static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, const std::vector &candidates, - VkImageTiling tiling, - VkFormatFeatureFlags features) { - for (VkFormat format : candidates) { - VkFormatProperties props; - vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); - if (tiling == VK_IMAGE_TILING_LINEAR && - (props.linearTilingFeatures & features) == features) { - return format; - } else if (tiling == VK_IMAGE_TILING_OPTIMAL && - (props.optimalTilingFeatures & features) == features) { - return format; + VkImageTiling tiling, + VkFormatFeatureFlags features) +{ + for (VkFormat format : candidates) { + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); + if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) { + return format; + } + else if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) { + return format; + } } - } - throw std::runtime_error("Failed to find supported format!"); + throw std::runtime_error("Failed to find supported format!"); } -static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, - uint32_t typeFilter, - VkMemoryPropertyFlags properties) { - VkPhysicalDeviceMemoryProperties memProperties; - 
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); - for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { - if (typeFilter & (1 << i) && - (memProperties.memoryTypes[i].propertyFlags & properties) == - properties) { - return i; +static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties) +{ + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } } - } - return ~0; + return ~0; } -static bool supportsValidationLayers() { - std::vector availableLayers; - uint32_t layerCount; +static bool supportsValidationLayers() +{ + std::vector availableLayers; + uint32_t layerCount; - vkEnumerateInstanceLayerProperties(&layerCount, nullptr); - availableLayers.resize(layerCount); - vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + availableLayers.resize(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); - for (const char *layerName : validationLayers) { - bool layerFound = false; + for (const char *layerName : validationLayers) { + bool layerFound = false; - for (const auto &layerProperties : availableLayers) { - if (strcmp(layerName, layerProperties.layerName) == 0) { - layerFound = true; - break; - } + for (const auto &layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } } - if (!layerFound) { - return false; - } - } - - return true; + return true; } -void VulkanBaseApp::createInstance() { - if (m_enableValidation && !supportsValidationLayers()) { - throw std::runtime_error("Validation requested, but not supported!"); - } +void VulkanBaseApp::createInstance() +{ + if (m_enableValidation && !supportsValidationLayers()) { + throw std::runtime_error("Validation requested, but not supported!"); + } - VkApplicationInfo appInfo = {}; - appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - appInfo.pApplicationName = m_appName.c_str(); - appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.pEngineName = "No Engine"; - appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.apiVersion = VK_API_VERSION_1_2; + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = m_appName.c_str(); + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_2; - VkInstanceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.pApplicationInfo = &appInfo; + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; - std::vector exts = getRequiredExtensions(); + std::vector exts = getRequiredExtensions(); - { - uint32_t glfwExtensionCount = 0; - const char **glfwExtensions; + { + uint32_t glfwExtensionCount = 0; + const char **glfwExtensions; - glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); - exts.insert(exts.begin(), glfwExtensions, 
- glfwExtensions + glfwExtensionCount); + exts.insert(exts.begin(), glfwExtensions, glfwExtensions + glfwExtensionCount); + + if (m_enableValidation) { + exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + } + + createInfo.enabledExtensionCount = static_cast(exts.size()); + createInfo.ppEnabledExtensionNames = exts.data(); + VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; + if (m_enableValidation) { + createInfo.enabledLayerCount = static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; + + debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + debugCreateInfo.pfnUserCallback = debugCallback; + + createInfo.pNext = &debugCreateInfo; + } + else { + createInfo.enabledLayerCount = 0; + createInfo.pNext = nullptr; + } + + if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { + throw std::runtime_error("Failed to create Vulkan instance!"); + } if (m_enableValidation) { - exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + PFN_vkCreateDebugUtilsMessengerEXT func = + (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkCreateDebugUtilsMessengerEXT"); + if (func == nullptr || func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != VK_SUCCESS) { + throw std::runtime_error("Failed to set up debug messenger!"); + } } - } - - createInfo.enabledExtensionCount = static_cast(exts.size()); - createInfo.ppEnabledExtensionNames = exts.data(); - VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {}; - if (m_enableValidation) { - createInfo.enabledLayerCount = - static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - - debugCreateInfo.sType = - VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; - debugCreateInfo.messageSeverity = - VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; - debugCreateInfo.messageType = - VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; - debugCreateInfo.pfnUserCallback = debugCallback; - - createInfo.pNext = &debugCreateInfo; - } else { - createInfo.enabledLayerCount = 0; - createInfo.pNext = nullptr; - } - - if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) { - throw std::runtime_error("Failed to create Vulkan instance!"); - } - - if (m_enableValidation) { - PFN_vkCreateDebugUtilsMessengerEXT func = - (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr( - m_instance, "vkCreateDebugUtilsMessengerEXT"); - if (func == nullptr || - func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != - VK_SUCCESS) { - throw std::runtime_error("Failed to set up debug messenger!"); - } - } } -void VulkanBaseApp::createSurface() { - if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != - VK_SUCCESS) { - throw std::runtime_error("failed to create window surface!"); - } +void VulkanBaseApp::createSurface() +{ + if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != VK_SUCCESS) { + throw 
std::runtime_error("failed to create window surface!"); + } } static bool findGraphicsQueueIndicies(VkPhysicalDevice device, - VkSurfaceKHR surface, - uint32_t &graphicsFamily, - uint32_t &presentFamily) { - uint32_t queueFamilyCount = 0; + VkSurfaceKHR surface, + uint32_t &graphicsFamily, + uint32_t &presentFamily) +{ + uint32_t queueFamilyCount = 0; - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); - std::vector queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, - queueFamilies.data()); + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); - graphicsFamily = presentFamily = ~0; + graphicsFamily = presentFamily = ~0; - for (uint32_t i = 0; i < queueFamilyCount; i++) { - if (queueFamilies[i].queueCount > 0) { - if (graphicsFamily == ~0 && - queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { - graphicsFamily = i; - } - uint32_t presentSupport = 0; - vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); - if (presentFamily == ~0 && presentSupport) { - presentFamily = i; - } - if (presentFamily != ~0 && graphicsFamily != ~0) { - break; - } + for (uint32_t i = 0; i < queueFamilyCount; i++) { + if (queueFamilies[i].queueCount > 0) { + if (graphicsFamily == ~0 && queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsFamily = i; + } + uint32_t presentSupport = 0; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + if (presentFamily == ~0 && presentSupport) { + presentFamily = i; + } + if (presentFamily != ~0 && graphicsFamily != ~0) { + break; + } + } } - } - return graphicsFamily != ~0 && presentFamily != ~0; + return graphicsFamily != ~0 && presentFamily != ~0; } -static bool hasAllExtensions( - VkPhysicalDevice device, - const std::vector &deviceExtensions) { - uint32_t extensionCount; - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, - nullptr); - std::vector availableExtensions(extensionCount); - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, - availableExtensions.data()); +static bool hasAllExtensions(VkPhysicalDevice device, const std::vector &deviceExtensions) +{ + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); - std::set requiredExtensions(deviceExtensions.begin(), - deviceExtensions.end()); + std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); - for (const auto &extension : availableExtensions) { - requiredExtensions.erase(extension.extensionName); - } - - return requiredExtensions.empty(); -} - -static void getSwapChainProperties( - VkPhysicalDevice device, VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR &capabilities, - std::vector &formats, - std::vector &presentModes) { - vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); - uint32_t formatCount; - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); - if (formatCount != 0) { - formats.resize(formatCount); - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, - formats.data()); - } - uint32_t presentModeCount; - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, 
&presentModeCount, - nullptr); - if (presentModeCount != 0) { - presentModes.resize(presentModeCount); - vkGetPhysicalDeviceSurfacePresentModesKHR( - device, surface, &presentModeCount, presentModes.data()); - } -} - -bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const { - uint32_t graphicsQueueIndex, presentQueueIndex; - std::vector deviceExtensions = getRequiredDeviceExtensions(); - VkSurfaceCapabilitiesKHR caps; - std::vector formats; - std::vector presentModes; - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - getSwapChainProperties(dev, m_surface, caps, formats, presentModes); - return hasAllExtensions(dev, deviceExtensions) && !formats.empty() && - !presentModes.empty() && - findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, - presentQueueIndex); -} - -void VulkanBaseApp::createDevice() { - { - uint32_t deviceCount = 0; - vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); - if (deviceCount == 0) { - throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + for (const auto &extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); } - std::vector phyDevs(deviceCount); - vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); - std::vector::iterator it = - std::find_if(phyDevs.begin(), phyDevs.end(), - std::bind(&VulkanBaseApp::isSuitableDevice, this, - std::placeholders::_1)); - if (it == phyDevs.end()) { - throw std::runtime_error("No suitable device found!"); + + return requiredExtensions.empty(); +} + +static void getSwapChainProperties(VkPhysicalDevice device, + VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR &capabilities, + std::vector &formats, + std::vector &presentModes) +{ + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + if (formatCount != 0) { + formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, formats.data()); } - m_physicalDevice = *it; - } + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); + if (presentModeCount != 0) { + presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, presentModes.data()); + } +} - uint32_t graphicsQueueIndex, presentQueueIndex; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, - presentQueueIndex); +bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const +{ + uint32_t graphicsQueueIndex, presentQueueIndex; + std::vector deviceExtensions = getRequiredDeviceExtensions(); + VkSurfaceCapabilitiesKHR caps; + std::vector formats; + std::vector presentModes; + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + getSwapChainProperties(dev, m_surface, caps, formats, presentModes); + return hasAllExtensions(dev, deviceExtensions) && !formats.empty() && !presentModes.empty() + && findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, presentQueueIndex); +} - std::vector queueCreateInfos; - std::set uniqueFamilyIndices = {graphicsQueueIndex, - presentQueueIndex}; +void VulkanBaseApp::createDevice() +{ + { + uint32_t deviceCount = 0; + vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); + if (deviceCount == 0) { + throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + } + std::vector phyDevs(deviceCount); + vkEnumeratePhysicalDevices(m_instance, &deviceCount, 
phyDevs.data()); + std::vector::iterator it = std::find_if( + phyDevs.begin(), phyDevs.end(), std::bind(&VulkanBaseApp::isSuitableDevice, this, std::placeholders::_1)); + if (it == phyDevs.end()) { + throw std::runtime_error("No suitable device found!"); + } + m_physicalDevice = *it; + } - float queuePriority = 1.0f; + uint32_t graphicsQueueIndex, presentQueueIndex; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, presentQueueIndex); - for (uint32_t queueFamily : uniqueFamilyIndices) { - VkDeviceQueueCreateInfo queueCreateInfo = {}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueCreateInfo.queueFamilyIndex = queueFamily; - queueCreateInfo.queueCount = 1; - queueCreateInfo.pQueuePriorities = &queuePriority; - queueCreateInfos.push_back(queueCreateInfo); - } + std::vector queueCreateInfos; + std::set uniqueFamilyIndices = {graphicsQueueIndex, presentQueueIndex}; - VkPhysicalDeviceFeatures deviceFeatures = {}; - deviceFeatures.fillModeNonSolid = true; + float queuePriority = 1.0f; - VkDeviceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + for (uint32_t queueFamily : uniqueFamilyIndices) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + deviceFeatures.fillModeNonSolid = true; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; #ifdef _VK_TIMELINE_SEMAPHORE - VkPhysicalDeviceVulkan12Features vk12features = {}; - vk12features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; - vk12features.timelineSemaphore = true; - createInfo.pNext = &vk12features; + VkPhysicalDeviceVulkan12Features vk12features = {}; + vk12features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + vk12features.timelineSemaphore = true; + createInfo.pNext = &vk12features; #endif - createInfo.pQueueCreateInfos = queueCreateInfos.data(); - createInfo.queueCreateInfoCount = - static_cast(queueCreateInfos.size()); + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = static_cast(queueCreateInfos.size()); - createInfo.pEnabledFeatures = &deviceFeatures; + createInfo.pEnabledFeatures = &deviceFeatures; - std::vector deviceExtensions = getRequiredDeviceExtensions(); - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + std::vector deviceExtensions = getRequiredDeviceExtensions(); + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - createInfo.enabledExtensionCount = - static_cast(deviceExtensions.size()); - createInfo.ppEnabledExtensionNames = deviceExtensions.data(); + createInfo.enabledExtensionCount = static_cast(deviceExtensions.size()); + createInfo.ppEnabledExtensionNames = deviceExtensions.data(); - if (m_enableValidation) { - createInfo.enabledLayerCount = - static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - } else { - createInfo.enabledLayerCount = 0; - } - - if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != - VK_SUCCESS) { - throw std::runtime_error("failed to create logical device!"); - } - - vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); - vkGetDeviceQueue(m_device, presentQueueIndex, 0, 
&m_presentQueue); - - VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; - vkPhysicalDeviceIDProperties.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; - vkPhysicalDeviceIDProperties.pNext = NULL; - - VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; - vkPhysicalDeviceProperties2.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; - - PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; - fpGetPhysicalDeviceProperties2 = - (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr( - m_instance, "vkGetPhysicalDeviceProperties2"); - if (fpGetPhysicalDeviceProperties2 == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " - "found.\n"); - } - - fpGetPhysicalDeviceProperties2(m_physicalDevice, - &vkPhysicalDeviceProperties2); - - memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); -} - -static VkSurfaceFormatKHR chooseSwapSurfaceFormat( - const std::vector &availableFormats) { - if (availableFormats.size() == 1 && - availableFormats[0].format == VK_FORMAT_UNDEFINED) { - return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; - } - - for (const auto &availableFormat : availableFormats) { - if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && - availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { - return availableFormat; + if (m_enableValidation) { + createInfo.enabledLayerCount = static_cast(countof(validationLayers)); + createInfo.ppEnabledLayerNames = validationLayers; } - } - - return availableFormats[0]; -} - -static VkPresentModeKHR chooseSwapPresentMode( - const std::vector &availablePresentModes) { - VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; - - for (const auto &availablePresentMode : availablePresentModes) { - if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { - return availablePresentMode; - } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { - bestMode = availablePresentMode; - } - } - - return bestMode; -} - -static VkExtent2D chooseSwapExtent( - GLFWwindow *window, const VkSurfaceCapabilitiesKHR &capabilities) { - if (capabilities.currentExtent.width != - std::numeric_limits::max()) { - return capabilities.currentExtent; - } else { - int width, height; - glfwGetFramebufferSize(window, &width, &height); - VkExtent2D actualExtent = {static_cast(width), - static_cast(height)}; - - actualExtent.width = std::max( - capabilities.minImageExtent.width, - std::min(capabilities.maxImageExtent.width, actualExtent.width)); - actualExtent.height = std::max( - capabilities.minImageExtent.height, - std::min(capabilities.maxImageExtent.height, actualExtent.height)); - - return actualExtent; - } -} - -void VulkanBaseApp::createSwapChain() { - VkSurfaceCapabilitiesKHR capabilities; - VkSurfaceFormatKHR format; - VkPresentModeKHR presentMode; - VkExtent2D extent; - uint32_t imageCount; - - { - std::vector formats; - std::vector presentModes; - - getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, - presentModes); - format = chooseSwapSurfaceFormat(formats); - presentMode = chooseSwapPresentMode(presentModes); - extent = chooseSwapExtent(m_window, capabilities); - imageCount = capabilities.minImageCount + 1; - if (capabilities.maxImageCount > 0 && - imageCount > capabilities.maxImageCount) { - imageCount = capabilities.maxImageCount; - } - } - - VkSwapchainCreateInfoKHR createInfo = {}; - createInfo.sType = 
VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - createInfo.surface = m_surface; - - createInfo.minImageCount = imageCount; - createInfo.imageFormat = format.format; - createInfo.imageColorSpace = format.colorSpace; - createInfo.imageExtent = extent; - createInfo.imageArrayLayers = 1; - createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - - uint32_t queueFamilyIndices[2]; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], - queueFamilyIndices[1]); - - if (queueFamilyIndices[0] != queueFamilyIndices[1]) { - createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; - createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); - createInfo.pQueueFamilyIndices = queueFamilyIndices; - } else { - createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - } - - createInfo.preTransform = capabilities.currentTransform; - createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - createInfo.presentMode = presentMode; - createInfo.clipped = VK_TRUE; - - createInfo.oldSwapchain = VK_NULL_HANDLE; - - if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != - VK_SUCCESS) { - throw std::runtime_error("failed to create swap chain!"); - } - - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); - m_swapChainImages.resize(imageCount); - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, - m_swapChainImages.data()); - - m_swapChainFormat = format.format; - m_swapChainExtent = extent; -} - -static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, - VkImageAspectFlags aspectFlags) { - VkImageView imageView; - VkImageViewCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - createInfo.image = image; - createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - createInfo.format = format; - createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.subresourceRange.aspectMask = aspectFlags; - createInfo.subresourceRange.baseMipLevel = 0; - createInfo.subresourceRange.levelCount = 1; - createInfo.subresourceRange.baseArrayLayer = 0; - createInfo.subresourceRange.layerCount = 1; - if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image views!"); - } - - return imageView; -} - -static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, - uint32_t width, uint32_t height, VkFormat format, - VkImageTiling tiling, VkImageUsageFlags usage, - VkMemoryPropertyFlags properties, VkImage &image, - VkDeviceMemory &imageMemory) { - VkImageCreateInfo imageInfo = {}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = VK_IMAGE_TYPE_2D; - imageInfo.extent.width = width; - imageInfo.extent.height = height; - imageInfo.extent.depth = 1; - imageInfo.mipLevels = 1; - imageInfo.arrayLayers = 1; - imageInfo.format = format; - imageInfo.tiling = tiling; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = usage; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { - throw std::runtime_error("failed to create image!"); - } - - VkMemoryRequirements memRequirements; - vkGetImageMemoryRequirements(device, image, &memRequirements); - - VkMemoryAllocateInfo 
allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType( - physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate image memory!"); - } - - vkBindImageMemory(device, image, imageMemory, 0); -} - -void VulkanBaseApp::createImageViews() { - m_swapChainImageViews.resize(m_swapChainImages.size()); - - for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { - m_swapChainImageViews[i] = - createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, - VK_IMAGE_ASPECT_COLOR_BIT); - } -} - -void VulkanBaseApp::createRenderPass() { - VkAttachmentDescription colorAttachment = {}; - colorAttachment.format = m_swapChainFormat; - colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - - VkAttachmentReference colorAttachmentRef = {}; - colorAttachmentRef.attachment = 0; - colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - - VkAttachmentDescription depthAttachment = {}; - depthAttachment.format = findSupportedFormat( - m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT}, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - depthAttachment.finalLayout = - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - - VkAttachmentReference depthAttachmentRef = {}; - depthAttachmentRef.attachment = 1; - depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass = {}; - subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass.colorAttachmentCount = 1; - subpass.pColorAttachments = &colorAttachmentRef; - subpass.pDepthStencilAttachment = &depthAttachmentRef; - - VkSubpassDependency dependency = {}; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - - VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; - VkRenderPassCreateInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderPassInfo.attachmentCount = countof(attachments); - renderPassInfo.pAttachments = attachments; - renderPassInfo.subpassCount = 1; - renderPassInfo.pSubpasses = &subpass; - renderPassInfo.dependencyCount = 1; - renderPassInfo.pDependencies = &dependency; - - if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) 
!= - VK_SUCCESS) { - throw std::runtime_error("failed to create render pass!"); - } -} - -void VulkanBaseApp::createDescriptorSetLayout() { - VkDescriptorSetLayoutBinding uboLayoutBinding = {}; - uboLayoutBinding.binding = 0; - uboLayoutBinding.descriptorCount = 1; - uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - uboLayoutBinding.pImmutableSamplers = nullptr; - uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - - VkDescriptorSetLayoutCreateInfo layoutInfo = {}; - layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - layoutInfo.bindingCount = 1; - layoutInfo.pBindings = &uboLayoutBinding; - - if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, - &m_descriptorSetLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor set layout!"); - } -} - -VkShaderModule createShaderModule(VkDevice device, const char *filename) { - std::vector shaderContents; - std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); - VkShaderModuleCreateInfo createInfo = {}; - VkShaderModule shaderModule; - - if (!shaderFile.good()) { - throw std::runtime_error("Failed to load shader contents"); - } - readFile(shaderFile, shaderContents); - - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.codeSize = shaderContents.size(); - createInfo.pCode = reinterpret_cast(shaderContents.data()); - - if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != - VK_SUCCESS) { - throw std::runtime_error("Failed to create shader module!"); - } - - return shaderModule; -} - -void VulkanBaseApp::getVertexDescriptions( - std::vector &bindingDesc, - std::vector &attribDesc) {} - -void VulkanBaseApp::getAssemblyStateInfo( - VkPipelineInputAssemblyStateCreateInfo &info) {} - -void VulkanBaseApp::createGraphicsPipeline() { - std::vector shaderStageInfos( - m_shaderFiles.size()); - for (size_t i = 0; i < m_shaderFiles.size(); i++) { - shaderStageInfos[i] = {}; - shaderStageInfos[i].sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shaderStageInfos[i].stage = m_shaderFiles[i].first; - shaderStageInfos[i].module = - createShaderModule(m_device, m_shaderFiles[i].second.c_str()); - shaderStageInfos[i].pName = "main"; - } - - VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - - std::vector vertexBindingDescriptions; - std::vector vertexAttributeDescriptions; - - getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); - - vertexInputInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertexInputInfo.vertexBindingDescriptionCount = - static_cast(vertexBindingDescriptions.size()); - vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); - vertexInputInfo.vertexAttributeDescriptionCount = - static_cast(vertexAttributeDescriptions.size()); - vertexInputInfo.pVertexAttributeDescriptions = - vertexAttributeDescriptions.data(); - - VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; - getAssemblyStateInfo(inputAssembly); - - VkViewport viewport = {}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = (float)m_swapChainExtent.width; - viewport.height = (float)m_swapChainExtent.height; - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor = {}; - scissor.offset = {0, 0}; - scissor.extent = m_swapChainExtent; - - VkPipelineViewportStateCreateInfo viewportState = {}; - viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewportState.viewportCount = 
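// Hedged sketch (not part of this patch): createShaderModule above relies on a
// readFile helper declared elsewhere in the sample; the signature below is
// inferred from the call site readFile(shaderFile, shaderContents).
#include <fstream>
#include <vector>

static void readFile(std::ifstream &stream, std::vector<char> &contents)
{
    // Learn the file size, then slurp the whole SPIR-V blob at once.
    stream.seekg(0, std::ios_base::end);
    std::streampos size = stream.tellg();
    stream.seekg(0, std::ios_base::beg);
    contents.resize(static_cast<size_t>(size));
    stream.read(contents.data(), size);
    // Vulkan requires codeSize to be a multiple of 4 and pCode to be
    // 4-byte aligned; a well-formed .spv file and the vector's allocation
    // satisfy both in practice.
}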
1; - viewportState.pViewports = &viewport; - viewportState.scissorCount = 1; - viewportState.pScissors = &scissor; - - VkPipelineRasterizationStateCreateInfo rasterizer = {}; - rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterizer.depthClampEnable = VK_FALSE; - rasterizer.rasterizerDiscardEnable = VK_FALSE; - rasterizer.polygonMode = VK_POLYGON_MODE_LINE; - rasterizer.lineWidth = 1.0f; - rasterizer.cullMode = VK_CULL_MODE_NONE; - rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; - rasterizer.depthBiasEnable = VK_FALSE; - - VkPipelineMultisampleStateCreateInfo multisampling = {}; - multisampling.sType = - VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling.sampleShadingEnable = VK_FALSE; - multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling.minSampleShading = 1.0f; // Optional - multisampling.pSampleMask = nullptr; // Optional - multisampling.alphaToCoverageEnable = VK_FALSE; // Optional - multisampling.alphaToOneEnable = VK_FALSE; // Optional - - VkPipelineDepthStencilStateCreateInfo depthStencil = {}; - depthStencil.sType = - VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depthStencil.depthTestEnable = VK_TRUE; - depthStencil.depthWriteEnable = VK_TRUE; - depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; - depthStencil.depthBoundsTestEnable = VK_FALSE; - depthStencil.stencilTestEnable = VK_FALSE; - - VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; - colorBlendAttachment.colorWriteMask = - VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - colorBlendAttachment.blendEnable = VK_FALSE; - - VkPipelineColorBlendStateCreateInfo colorBlending = {}; - colorBlending.sType = - VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlending.logicOpEnable = VK_FALSE; - colorBlending.logicOp = VK_LOGIC_OP_COPY; - colorBlending.attachmentCount = 1; - colorBlending.pAttachments = &colorBlendAttachment; - colorBlending.blendConstants[0] = 0.0f; - colorBlending.blendConstants[1] = 0.0f; - colorBlending.blendConstants[2] = 0.0f; - colorBlending.blendConstants[3] = 0.0f; - - VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; - pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutInfo.setLayoutCount = 1; // Optional - pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional - pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional - pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional - - if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, - &m_pipelineLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create pipeline layout!"); - } - - VkGraphicsPipelineCreateInfo pipelineInfo = {}; - pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); - pipelineInfo.pStages = shaderStageInfos.data(); - - pipelineInfo.pVertexInputState = &vertexInputInfo; - pipelineInfo.pInputAssemblyState = &inputAssembly; - pipelineInfo.pViewportState = &viewportState; - pipelineInfo.pRasterizationState = &rasterizer; - pipelineInfo.pMultisampleState = &multisampling; - pipelineInfo.pDepthStencilState = &depthStencil; // Optional - pipelineInfo.pColorBlendState = &colorBlending; - pipelineInfo.pDynamicState = nullptr; // Optional - - pipelineInfo.layout = m_pipelineLayout; - - pipelineInfo.renderPass = m_renderPass; - pipelineInfo.subpass = 0; - - pipelineInfo.basePipelineHandle = 
VK_NULL_HANDLE; // Optional - pipelineInfo.basePipelineIndex = -1; // Optional - - if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, - nullptr, &m_graphicsPipeline) != VK_SUCCESS) { - throw std::runtime_error("failed to create graphics pipeline!"); - } - - for (size_t i = 0; i < shaderStageInfos.size(); i++) { - vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); - } -} - -void VulkanBaseApp::createFramebuffers() { - m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - VkImageView attachments[] = {m_swapChainImageViews[i], m_depthImageView}; - - VkFramebufferCreateInfo framebufferInfo = {}; - framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebufferInfo.renderPass = m_renderPass; - framebufferInfo.attachmentCount = countof(attachments); - framebufferInfo.pAttachments = attachments; - framebufferInfo.width = m_swapChainExtent.width; - framebufferInfo.height = m_swapChainExtent.height; - framebufferInfo.layers = 1; - - if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, - &m_swapChainFramebuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create framebuffer!"); - } - } -} - -void VulkanBaseApp::createCommandPool() { - VkCommandPoolCreateInfo poolInfo = {}; - uint32_t graphicsIndex, presentIndex; - - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, - presentIndex); - - poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - poolInfo.queueFamilyIndex = graphicsIndex; - poolInfo.flags = 0; // Optional - - if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != - VK_SUCCESS) { - throw std::runtime_error("Failed to create command pool!"); - } -} - -static void transitionImageLayout(VulkanBaseApp *app, VkImage image, - VkFormat format, VkImageLayout oldLayout, - VkImageLayout newLayout) { - VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); - - VkImageMemoryBarrier barrier = {}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - - if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || - format == VK_FORMAT_D24_UNORM_S8_UINT) { - barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - } else { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - } - - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - - VkPipelineStageFlags sourceStage; - VkPipelineStageFlags destinationStage; - - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - } else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - destinationStage = 
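// Hedged sketch (not part of this patch): transitionImageLayout records into a
// throwaway command buffer via beginSingleTimeCommands()/endSingleTimeCommands(),
// which live outside this hunk. This is the usual pattern, assuming the
// m_device, m_commandPool and m_graphicsQueue members used throughout the class.
VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands()
{
    VkCommandBufferAllocateInfo allocInfo = {};
    allocInfo.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    allocInfo.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    allocInfo.commandPool        = m_commandPool;
    allocInfo.commandBufferCount = 1;

    VkCommandBuffer commandBuffer;
    vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer);

    VkCommandBufferBeginInfo beginInfo = {};
    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    vkBeginCommandBuffer(commandBuffer, &beginInfo);
    return commandBuffer;
}

void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer)
{
    vkEndCommandBuffer(commandBuffer);

    VkSubmitInfo submitInfo = {};
    submitInfo.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.commandBufferCount = 1;
    submitInfo.pCommandBuffers    = &commandBuffer;

    // A blocking submit is fine for one-off setup work such as layout transitions.
    vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE);
    vkQueueWaitIdle(m_graphicsQueue);

    vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer);
}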
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; - } else { - throw std::invalid_argument("unsupported layout transition!"); - } - - vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, - nullptr, 0, nullptr, 1, &barrier); - - app->endSingleTimeCommands(commandBuffer); -} - -void VulkanBaseApp::createDepthResources() { - VkFormat depthFormat = findSupportedFormat( - m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT}, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - createImage(m_physicalDevice, m_device, m_swapChainExtent.width, - m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, - VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, - m_depthImageMemory); - m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, - VK_IMAGE_ASPECT_DEPTH_BIT); - transitionImageLayout(this, m_depthImage, depthFormat, - VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); -} - -void VulkanBaseApp::createUniformBuffers() { - VkDeviceSize size = getUniformSize(); - if (size > 0) { - m_uniformBuffers.resize(m_swapChainImages.size()); - m_uniformMemory.resize(m_swapChainImages.size()); - for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - createBuffer(getUniformSize(), VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - m_uniformBuffers[i], m_uniformMemory[i]); - } - } -} - -void VulkanBaseApp::createDescriptorPool() { - VkDescriptorPoolSize poolSize = {}; - poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - poolSize.descriptorCount = static_cast(m_swapChainImages.size()); - VkDescriptorPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - poolInfo.poolSizeCount = 1; - poolInfo.pPoolSizes = &poolSize; - poolInfo.maxSets = static_cast(m_swapChainImages.size()); - if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != - VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor pool!"); - } -} - -void VulkanBaseApp::createDescriptorSets() { - std::vector layouts(m_swapChainImages.size(), - m_descriptorSetLayout); - VkDescriptorSetAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocInfo.descriptorPool = m_descriptorPool; - allocInfo.descriptorSetCount = - static_cast(m_swapChainImages.size()); - allocInfo.pSetLayouts = layouts.data(); - m_descriptorSets.resize(m_swapChainImages.size()); - - if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate descriptor sets!"); - } - - VkDescriptorBufferInfo bufferInfo = {}; - bufferInfo.offset = 0; - bufferInfo.range = VK_WHOLE_SIZE; - VkWriteDescriptorSet descriptorWrite = {}; - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.dstBinding = 0; - descriptorWrite.dstArrayElement = 0; - descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorWrite.descriptorCount = 1; - 
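// Hedged sketch (not part of this patch): updateUniformBuffer() is a no-op hook
// in this base class, and derived samples typically overwrite the HOST_VISIBLE |
// HOST_COHERENT allocation made in createUniformBuffers each frame. UniformData
// is a hypothetical payload standing in for whatever the derived class defines.
#include <cstring>

struct UniformData {
    float mvp[16]; // hypothetical model-view-projection matrix
};

static void writeUniform(VkDevice device, VkDeviceMemory memory, const UniformData &src)
{
    void *mapped = nullptr;
    // VK_WHOLE_SIZE mirrors the bufferInfo.range used by the descriptor write.
    vkMapMemory(device, memory, 0, VK_WHOLE_SIZE, 0, &mapped);
    memcpy(mapped, &src, sizeof(src));
    // HOST_COHERENT memory needs no explicit vkFlushMappedMemoryRanges.
    vkUnmapMemory(device, memory);
}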
descriptorWrite.pBufferInfo = &bufferInfo; - descriptorWrite.pImageInfo = nullptr; // Optional - descriptorWrite.pTexelBufferView = nullptr; // Optional - - for (size_t i = 0; i < m_swapChainImages.size(); i++) { - bufferInfo.buffer = m_uniformBuffers[i]; - descriptorWrite.dstSet = m_descriptorSets[i]; - vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); - } -} - -void VulkanBaseApp::createCommandBuffers() { - m_commandBuffers.resize(m_swapChainFramebuffers.size()); - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.commandPool = m_commandPool; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); - - if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate command buffers!"); - } - - for (size_t i = 0; i < m_commandBuffers.size(); i++) { - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - beginInfo.pInheritanceInfo = nullptr; // Optional - - if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { - throw std::runtime_error("failed to begin recording command buffer!"); + else { + createInfo.enabledLayerCount = 0; } - VkRenderPassBeginInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderPassInfo.renderPass = m_renderPass; - renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; - - renderPassInfo.renderArea.offset = {0, 0}; - renderPassInfo.renderArea.extent = m_swapChainExtent; - - VkClearValue clearColors[2]; - clearColors[0].color = {0.0f, 0.0f, 0.0f, 1.0f}; - clearColors[1].depthStencil = {1.0f, 0}; - renderPassInfo.clearValueCount = countof(clearColors); - renderPassInfo.pClearValues = clearColors; - - vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, - VK_SUBPASS_CONTENTS_INLINE); - - vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, - m_graphicsPipeline); - - vkCmdBindDescriptorSets(m_commandBuffers[i], - VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, - 0, 1, &m_descriptorSets[i], 0, nullptr); - - fillRenderingCommandBuffer(m_commandBuffers[i]); - - vkCmdEndRenderPass(m_commandBuffers[i]); - - if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to record command buffer!"); + if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); } - } + + vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); + vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + fpGetPhysicalDeviceProperties2 = + (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(m_instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error("Vulkan: Proc 
address for \"vkGetPhysicalDeviceProperties2KHR\" not " + "found.\n"); + } + + fpGetPhysicalDeviceProperties2(m_physicalDevice, &vkPhysicalDeviceProperties2); + + memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, VK_UUID_SIZE); } -void VulkanBaseApp::createSyncObjects() { - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkFenceCreateInfo fenceInfo = {}; - fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; +static VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector &availableFormats) +{ + if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; + } - m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); - m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + for (const auto &availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM + && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } - for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, - &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); + return availableFormats[0]; +} + +static VkPresentModeKHR chooseSwapPresentMode(const std::vector &availablePresentModes) +{ + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto &availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } + else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } } - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, - &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); + + return bestMode; +} + +static VkExtent2D chooseSwapExtent(GLFWwindow *window, const VkSurfaceCapabilitiesKHR &capabilities) +{ + if (capabilities.currentExtent.width != std::numeric_limits::max()) { + return capabilities.currentExtent; } - if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != - VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); + else { + int width, height; + glfwGetFramebufferSize(window, &width, &height); + VkExtent2D actualExtent = {static_cast(width), static_cast(height)}; + + actualExtent.width = std::max(capabilities.minImageExtent.width, + std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max(capabilities.minImageExtent.height, + std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } +} + +void VulkanBaseApp::createSwapChain() +{ + VkSurfaceCapabilitiesKHR capabilities; + VkSurfaceFormatKHR format; + VkPresentModeKHR presentMode; + VkExtent2D extent; + uint32_t imageCount; + + { + std::vector formats; + std::vector presentModes; + + getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, presentModes); + format = chooseSwapSurfaceFormat(formats); + presentMode = chooseSwapPresentMode(presentModes); + extent = chooseSwapExtent(m_window, capabilities); + imageCount = capabilities.minImageCount + 1; + if (capabilities.maxImageCount > 0 && imageCount > capabilities.maxImageCount) 
{ + imageCount = capabilities.maxImageCount; + } + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = m_surface; + + createInfo.minImageCount = imageCount; + createInfo.imageFormat = format.format; + createInfo.imageColorSpace = format.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + uint32_t queueFamilyIndices[2]; + findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], queueFamilyIndices[1]); + + if (queueFamilyIndices[0] != queueFamilyIndices[1]) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } + else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + } + + createInfo.preTransform = capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); + m_swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, m_swapChainImages.data()); + + m_swapChainFormat = format.format; + m_swapChainExtent = extent; +} + +static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, VkImageAspectFlags aspectFlags) +{ + VkImageView imageView; + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = image; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = format; + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.subresourceRange.aspectMask = aspectFlags; + createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image views!"); + } + + return imageView; +} + +static void createImage(VkPhysicalDevice physicalDevice, + VkDevice device, + uint32_t width, + uint32_t height, + VkFormat format, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkMemoryPropertyFlags properties, + VkImage &image, + VkDeviceMemory &imageMemory) +{ + VkImageCreateInfo imageInfo = {}; + imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } + + 
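// Hedged sketch (not part of this patch): createImage above and createBuffer
// later both call a findMemoryType helper declared elsewhere in the sample. The
// canonical implementation scans the device's memory types for one that the
// resource allows (typeFilter bit set) and that carries every requested flag:
#include <stdexcept>

static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties)
{
    VkPhysicalDeviceMemoryProperties memProperties;
    vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);

    for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
        if ((typeFilter & (1u << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
            return i;
        }
    }
    throw std::runtime_error("failed to find suitable memory type!");
}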
VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, imageMemory, 0); +} + +void VulkanBaseApp::createImageViews() +{ + m_swapChainImageViews.resize(m_swapChainImages.size()); + + for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { + m_swapChainImageViews[i] = + createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, VK_IMAGE_ASPECT_COLOR_BIT); + } +} + +void VulkanBaseApp::createRenderPass() +{ + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = m_swapChainFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkAttachmentDescription depthAttachment = {}; + depthAttachment.format = + findSupportedFormat(m_physicalDevice, + {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + depthAttachment.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkAttachmentReference depthAttachmentRef = {}; + depthAttachmentRef.attachment = 1; + depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + subpass.pDepthStencilAttachment = &depthAttachmentRef; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = countof(attachments); + renderPassInfo.pAttachments = attachments; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + 
renderPassInfo.pDependencies = &dependency;
+
+    if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create render pass!");
+    }
+}
+
+void VulkanBaseApp::createDescriptorSetLayout()
+{
+    VkDescriptorSetLayoutBinding uboLayoutBinding = {};
+    uboLayoutBinding.binding = 0;
+    uboLayoutBinding.descriptorCount = 1;
+    uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+    uboLayoutBinding.pImmutableSamplers = nullptr;
+    uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
+
+    VkDescriptorSetLayoutCreateInfo layoutInfo = {};
+    layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    layoutInfo.bindingCount = 1;
+    layoutInfo.pBindings = &uboLayoutBinding;
+
+    if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, &m_descriptorSetLayout) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create descriptor set layout!");
+    }
+}
+
+VkShaderModule createShaderModule(VkDevice device, const char *filename)
+{
+    std::vector<char> shaderContents;
+    std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary);
+    VkShaderModuleCreateInfo createInfo = {};
+    VkShaderModule shaderModule;
+
+    if (!shaderFile.good()) {
+        throw std::runtime_error("Failed to load shader contents");
+    }
+    readFile(shaderFile, shaderContents);
+
+    createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    createInfo.codeSize = shaderContents.size();
+    createInfo.pCode = reinterpret_cast<const uint32_t *>(shaderContents.data());
+
+    if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create shader module!");
+    }
+
+    return shaderModule;
+}
+
+void VulkanBaseApp::getVertexDescriptions(std::vector<VkVertexInputBindingDescription> &bindingDesc,
+                                          std::vector<VkVertexInputAttributeDescription> &attribDesc)
+{
+}
+
+void VulkanBaseApp::getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) {}
+
+void VulkanBaseApp::createGraphicsPipeline()
+{
+    std::vector<VkPipelineShaderStageCreateInfo> shaderStageInfos(m_shaderFiles.size());
+    for (size_t i = 0; i < m_shaderFiles.size(); i++) {
+        shaderStageInfos[i] = {};
+        shaderStageInfos[i].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+        shaderStageInfos[i].stage = m_shaderFiles[i].first;
+        shaderStageInfos[i].module = createShaderModule(m_device, m_shaderFiles[i].second.c_str());
+        shaderStageInfos[i].pName = "main";
+    }
+
+    VkPipelineVertexInputStateCreateInfo vertexInputInfo = {};
+
+    std::vector<VkVertexInputBindingDescription> vertexBindingDescriptions;
+    std::vector<VkVertexInputAttributeDescription> vertexAttributeDescriptions;
+
+    getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions);
+
+    vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
+    vertexInputInfo.vertexBindingDescriptionCount = static_cast<uint32_t>(vertexBindingDescriptions.size());
+    vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data();
+    vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(vertexAttributeDescriptions.size());
+    vertexInputInfo.pVertexAttributeDescriptions = vertexAttributeDescriptions.data();
+
+    VkPipelineInputAssemblyStateCreateInfo inputAssembly = {};
+    getAssemblyStateInfo(inputAssembly);
+
+    VkViewport viewport = {};
+    viewport.x = 0.0f;
+    viewport.y = 0.0f;
+    viewport.width = (float)m_swapChainExtent.width;
+    viewport.height = (float)m_swapChainExtent.height;
+    viewport.minDepth = 0.0f;
+    viewport.maxDepth = 1.0f;
+
+    VkRect2D scissor = {};
+    scissor.offset = {0, 0};
+    scissor.extent = m_swapChainExtent;
+
+    VkPipelineViewportStateCreateInfo viewportState = {};
+
viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_LINE; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineDepthStencilStateCreateInfo depthStencil = {}; + depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depthStencil.depthTestEnable = VK_TRUE; + depthStencil.depthWriteEnable = VK_TRUE; + depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; + depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, &m_pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); + pipelineInfo.pStages = shaderStageInfos.data(); + + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = &depthStencil; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + + pipelineInfo.layout = m_pipelineLayout; + + 
pipelineInfo.renderPass = m_renderPass; + pipelineInfo.subpass = 0; + + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &m_graphicsPipeline) + != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + for (size_t i = 0; i < shaderStageInfos.size(); i++) { + vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); + } +} + +void VulkanBaseApp::createFramebuffers() +{ + m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + VkImageView attachments[] = {m_swapChainImageViews[i], m_depthImageView}; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = m_renderPass; + framebufferInfo.attachmentCount = countof(attachments); + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = m_swapChainExtent.width; + framebufferInfo.height = m_swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, &m_swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } +} + +void VulkanBaseApp::createCommandPool() +{ + VkCommandPoolCreateInfo poolInfo = {}; + uint32_t graphicsIndex, presentIndex; + + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, presentIndex); + + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = graphicsIndex; + poolInfo.flags = 0; // Optional + + if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != VK_SUCCESS) { + throw std::runtime_error("Failed to create command pool!"); + } +} + +static void transitionImageLayout(VulkanBaseApp *app, + VkImage image, + VkFormat format, + VkImageLayout oldLayout, + VkImageLayout newLayout) +{ + VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); + + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + + if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_D24_UNORM_S8_UINT) { + barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } + else { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + } + + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 
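// Hedged sketch (not part of this patch): findSupportedFormat, used by
// createRenderPass and createDepthResources, is declared outside this hunk. The
// standard form walks the candidate list and checks the tiling-specific
// feature bits reported by the physical device:
#include <stdexcept>
#include <vector>

static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice,
                                    const std::vector<VkFormat> &candidates,
                                    VkImageTiling tiling,
                                    VkFormatFeatureFlags features)
{
    for (VkFormat format : candidates) {
        VkFormatProperties props;
        vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props);
        if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) {
            return format;
        }
        if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) {
            return format;
        }
    }
    throw std::runtime_error("failed to find supported format!");
}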
VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + } + else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, nullptr, 0, nullptr, 1, &barrier); + + app->endSingleTimeCommands(commandBuffer); +} + +void VulkanBaseApp::createDepthResources() +{ + VkFormat depthFormat = + findSupportedFormat(m_physicalDevice, + {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + createImage(m_physicalDevice, + m_device, + m_swapChainExtent.width, + m_swapChainExtent.height, + depthFormat, + VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + m_depthImage, + m_depthImageMemory); + m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, VK_IMAGE_ASPECT_DEPTH_BIT); + transitionImageLayout( + this, m_depthImage, depthFormat, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); +} + +void VulkanBaseApp::createUniformBuffers() +{ + VkDeviceSize size = getUniformSize(); + if (size > 0) { + m_uniformBuffers.resize(m_swapChainImages.size()); + m_uniformMemory.resize(m_swapChainImages.size()); + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + createBuffer(getUniformSize(), + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + m_uniformBuffers[i], + m_uniformMemory[i]); + } + } +} + +void VulkanBaseApp::createDescriptorPool() +{ + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = static_cast(m_swapChainImages.size()); + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = static_cast(m_swapChainImages.size()); + if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } +} + +void VulkanBaseApp::createDescriptorSets() +{ + std::vector layouts(m_swapChainImages.size(), m_descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = m_descriptorPool; + allocInfo.descriptorSetCount = static_cast(m_swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + m_descriptorSets.resize(m_swapChainImages.size()); + + if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + 
descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + for (size_t i = 0; i < m_swapChainImages.size(); i++) { + bufferInfo.buffer = m_uniformBuffers[i]; + descriptorWrite.dstSet = m_descriptorSets[i]; + vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); + } +} + +void VulkanBaseApp::createCommandBuffers() +{ + m_commandBuffers.resize(m_swapChainFramebuffers.size()); + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = m_commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); + + if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < m_commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + beginInfo.pInheritanceInfo = nullptr; // Optional + + if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = m_renderPass; + renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; + + renderPassInfo.renderArea.offset = {0, 0}; + renderPassInfo.renderArea.extent = m_swapChainExtent; + + VkClearValue clearColors[2]; + clearColors[0].color = {0.0f, 0.0f, 0.0f, 1.0f}; + clearColors[1].depthStencil = {1.0f, 0}; + renderPassInfo.clearValueCount = countof(clearColors); + renderPassInfo.pClearValues = clearColors; + + vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); + + vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_graphicsPipeline); + + vkCmdBindDescriptorSets(m_commandBuffers[i], + VK_PIPELINE_BIND_POINT_GRAPHICS, + m_pipelineLayout, + 0, + 1, + &m_descriptorSets[i], + 0, + nullptr); + + fillRenderingCommandBuffer(m_commandBuffers[i]); + + vkCmdEndRenderPass(m_commandBuffers[i]); + + if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } +} + +void VulkanBaseApp::createSyncObjects() +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkFenceCreateInfo fenceInfo = {}; + fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; + + m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); + m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image available semaphore!"); + } + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image available semaphore!"); + } + if (vkCreateFence(m_device, &fenceInfo, 
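// Hedged sketch (not part of this patch): the sample's real drawFrame() is
// outside this hunk. This outline shows how the per-frame fence and the
// imageAvailable / renderFinished semaphore pair created in createSyncObjects()
// are normally threaded through acquire, submit and present.
#include <cstdint>

static void drawFrameOutline(VkDevice device, VkQueue queue, VkSwapchainKHR swapChain,
                             const VkCommandBuffer *commandBuffers, VkFence inFlightFence,
                             VkSemaphore imageAvailable, VkSemaphore renderFinished)
{
    // Reuse this frame slot only after its previous submission has finished.
    vkWaitForFences(device, 1, &inFlightFence, VK_TRUE, UINT64_MAX);
    vkResetFences(device, 1, &inFlightFence);

    uint32_t imageIndex = 0;
    vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailable, VK_NULL_HANDLE, &imageIndex);

    VkPipelineStageFlags waitStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

    VkSubmitInfo submitInfo = {};
    submitInfo.sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.waitSemaphoreCount   = 1;
    submitInfo.pWaitSemaphores      = &imageAvailable;
    submitInfo.pWaitDstStageMask    = &waitStage;
    submitInfo.commandBufferCount   = 1;
    submitInfo.pCommandBuffers      = &commandBuffers[imageIndex];
    submitInfo.signalSemaphoreCount = 1;
    submitInfo.pSignalSemaphores    = &renderFinished;
    vkQueueSubmit(queue, 1, &submitInfo, inFlightFence); // fence signals when the GPU work completes

    VkPresentInfoKHR presentInfo = {};
    presentInfo.sType              = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
    presentInfo.waitSemaphoreCount = 1;
    presentInfo.pWaitSemaphores    = &renderFinished;
    presentInfo.swapchainCount     = 1;
    presentInfo.pSwapchains        = &swapChain;
    presentInfo.pImageIndices      = &imageIndex;
    vkQueuePresentKHR(queue, &presentInfo);
}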
nullptr, &m_inFlightFences[i]) != VK_SUCCESS) {
+            throw std::runtime_error("Failed to create in-flight fence!");
+        }
     }
-  }
 
 #ifdef _VK_TIMELINE_SEMAPHORE
-  if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr,
-                        &m_vkPresentationSemaphore) != VK_SUCCESS) {
-    throw std::runtime_error("Failed to create binary semaphore!");
-  }
+    if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_vkPresentationSemaphore) != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create binary semaphore!");
+    }
 #endif /* _VK_TIMELINE_SEMAPHORE */
 }
 
-void VulkanBaseApp::getWaitFrameSemaphores(
-    std::vector<VkSemaphore> &wait,
-    std::vector<VkPipelineStageFlags> &waitStages) const {}
+void VulkanBaseApp::getWaitFrameSemaphores(std::vector<VkSemaphore> &wait,
+                                           std::vector<VkPipelineStageFlags> &waitStages) const
+{
+}
 
-void VulkanBaseApp::getSignalFrameSemaphores(
-    std::vector<VkSemaphore> &signal) const {}
+void VulkanBaseApp::getSignalFrameSemaphores(std::vector<VkSemaphore> &signal) const {}
 
 VkDeviceSize VulkanBaseApp::getUniformSize() const { return VkDeviceSize(0); }
 
 void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex) {}
 
-void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
+void VulkanBaseApp::createBuffer(VkDeviceSize size,
+                                 VkBufferUsageFlags usage,
                                  VkMemoryPropertyFlags properties,
-                                 VkBuffer &buffer,
-                                 VkDeviceMemory &bufferMemory) {
-  VkBufferCreateInfo bufferInfo = {};
-  bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
-  bufferInfo.size = size;
-  bufferInfo.usage = usage;
-  bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+                                 VkBuffer &buffer,
+                                 VkDeviceMemory &bufferMemory)
+{
+    VkBufferCreateInfo bufferInfo = {};
+    bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+    bufferInfo.size = size;
+    bufferInfo.usage = usage;
+    bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
 
-  if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
-    throw std::runtime_error("failed to create buffer!");
-  }
+    if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create buffer!");
+    }
 
-  VkMemoryRequirements memRequirements;
-  vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements);
+    VkMemoryRequirements memRequirements;
+    vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements);
 
-  VkMemoryAllocateInfo allocInfo = {};
-  allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
-  allocInfo.allocationSize = memRequirements.size;
-  allocInfo.memoryTypeIndex = findMemoryType(
-      m_physicalDevice, memRequirements.memoryTypeBits, properties);
+    VkMemoryAllocateInfo allocInfo = {};
+    allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    allocInfo.allocationSize = memRequirements.size;
+    allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties);
 
-  if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) !=
-      VK_SUCCESS) {
-    throw std::runtime_error("failed to allocate buffer memory!");
-  }
+    if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) {
+        throw std::runtime_error("failed to allocate buffer memory!");
+    }
 
-  vkBindBufferMemory(m_device, buffer, bufferMemory, 0);
+    vkBindBufferMemory(m_device, buffer, bufferMemory, 0);
 }
 
-void VulkanBaseApp::createExternalBuffer(
-    VkDeviceSize size, VkBufferUsageFlags usage,
-    VkMemoryPropertyFlags properties,
-    VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer &buffer,
-    VkDeviceMemory &bufferMemory) {
-  VkBufferCreateInfo bufferInfo = {};
-
bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; +void VulkanBaseApp::createExternalBuffer(VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer &buffer, + VkDeviceMemory &bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; - externalMemoryBufferInfo.sType = - VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; - externalMemoryBufferInfo.handleTypes = extMemHandleType; - bufferInfo.pNext = &externalMemoryBufferInfo; + VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; + externalMemoryBufferInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; + externalMemoryBufferInfo.handleTypes = extMemHandleType; + bufferInfo.pNext = &externalMemoryBufferInfo; - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); #ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; + WindowsSecurityAttributes winSecurityAttributes; - VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; - vulkanExportMemoryWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; - vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - vulkanExportMemoryWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; #endif /* _WIN64 */ - VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; - vulkanExportMemoryAllocateInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; #ifdef _WIN64 - vulkanExportMemoryAllocateInfoKHR.pNext = - extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR - ? &vulkanExportMemoryWin32HandleInfoKHR - : NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; + vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR + ? 
&vulkanExportMemoryWin32HandleInfoKHR + : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; #else - vulkanExportMemoryAllocateInfoKHR.pNext = NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; #endif /* _WIN64 */ - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType( - m_physicalDevice, memRequirements.memoryTypeBits, properties); + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate external buffer memory!"); - } + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); } -void *VulkanBaseApp::getMemHandle( - VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) { +void *VulkanBaseApp::getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) +{ #ifdef _WIN64 - HANDLE handle = 0; + HANDLE handle = 0; - VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; - vkMemoryGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; - vkMemoryGetWin32HandleInfoKHR.pNext = NULL; - vkMemoryGetWin32HandleInfoKHR.memory = memory; - vkMemoryGetWin32HandleInfoKHR.handleType = handleType; + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = memory; + vkMemoryGetWin32HandleInfoKHR.handleType = handleType; - PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; - fpGetMemoryWin32HandleKHR = - (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr( - m_device, "vkGetMemoryWin32HandleKHR"); - if (!fpGetMemoryWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, - &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)handle; + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + fpGetMemoryWin32HandleKHR = + (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryWin32HandleKHR"); + if (!fpGetMemoryWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)handle; #else - int fd = -1; + int fd = -1; - VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; - vkMemoryGetFdInfoKHR.sType = 
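// Hedged sketch (not part of this patch): getMemHandle exports the Vulkan
// allocation as an OS handle so CUDA can map the same physical memory. This is
// the Linux (opaque fd) side of that handshake, using documented CUDA runtime
// interop calls; error handling is omitted for brevity.
#include <cuda_runtime.h>

static void *importVulkanBufferIntoCuda(int fd, size_t size, cudaExternalMemory_t *extMemOut)
{
    cudaExternalMemoryHandleDesc memDesc = {};
    memDesc.type      = cudaExternalMemoryHandleTypeOpaqueFd;
    memDesc.handle.fd = fd; // CUDA takes ownership of the fd on success
    memDesc.size      = size;
    cudaImportExternalMemory(extMemOut, &memDesc);

    cudaExternalMemoryBufferDesc bufDesc = {};
    bufDesc.offset = 0;
    bufDesc.size   = size;

    void *devPtr = nullptr;
    cudaExternalMemoryGetMappedBuffer(&devPtr, *extMemOut, &bufDesc);
    return devPtr; // usable by kernels like any other device pointer
}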
VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; - vkMemoryGetFdInfoKHR.pNext = NULL; - vkMemoryGetFdInfoKHR.memory = memory; - vkMemoryGetFdInfoKHR.handleType = handleType; + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = memory; + vkMemoryGetFdInfoKHR.handleType = handleType; - PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; - fpGetMemoryFdKHR = - (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); - if (!fpGetMemoryFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)(uintptr_t)fd; + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); + if (!fpGetMemoryFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)(uintptr_t)fd; #endif /* _WIN64 */ } -void *VulkanBaseApp::getSemaphoreHandle( - VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) { +void *VulkanBaseApp::getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ #ifdef _WIN64 - HANDLE handle; + HANDLE handle; - VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; - semaphoreGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; - semaphoreGetWin32HandleInfoKHR.pNext = NULL; - semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; - semaphoreGetWin32HandleInfoKHR.handleType = handleType; + VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; + semaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + semaphoreGetWin32HandleInfoKHR.pNext = NULL; + semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; + semaphoreGetWin32HandleInfoKHR.handleType = handleType; - PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; - fpGetSemaphoreWin32HandleKHR = - (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( - m_device, "vkGetSemaphoreWin32HandleKHR"); - if (!fpGetSemaphoreWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, - &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; + fpGetSemaphoreWin32HandleKHR = + (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreWin32HandleKHR"); + if (!fpGetSemaphoreWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } - return (void *)handle; + return (void *)handle; #else - int fd; + int fd; - VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {}; - semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; - semaphoreGetFdInfoKHR.pNext = NULL; - semaphoreGetFdInfoKHR.semaphore = semaphore; - semaphoreGetFdInfoKHR.handleType 
= handleType;
+    VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {};
+    semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
+    semaphoreGetFdInfoKHR.pNext = NULL;
+    semaphoreGetFdInfoKHR.semaphore = semaphore;
+    semaphoreGetFdInfoKHR.handleType = handleType;
-  PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR;
-  fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(
-      m_device, "vkGetSemaphoreFdKHR");
-  if (!fpGetSemaphoreFdKHR) {
-    throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!");
-  }
-  if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) !=
-      VK_SUCCESS) {
-    throw std::runtime_error("Failed to retrieve handle for buffer!");
-  }
+    PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR;
+    fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreFdKHR");
+    if (!fpGetSemaphoreFdKHR) {
+        throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!");
+    }
+    if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != VK_SUCCESS) {
+        throw std::runtime_error("Failed to retrieve handle for buffer!");
+    }
-  return (void *)(uintptr_t)fd;
+    return (void *)(uintptr_t)fd;
#endif /* _WIN64 */
}
-void VulkanBaseApp::createExternalSemaphore(
-    VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) {
-  VkSemaphoreCreateInfo semaphoreInfo = {};
-  semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
-  VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {};
-  exportSemaphoreCreateInfo.sType =
-      VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR;
+void VulkanBaseApp::createExternalSemaphore(VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType)
+{
+    VkSemaphoreCreateInfo semaphoreInfo = {};
+    semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {};
+    exportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR;
#ifdef _VK_TIMELINE_SEMAPHORE
-  VkSemaphoreTypeCreateInfo timelineCreateInfo;
-  timelineCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
-  timelineCreateInfo.pNext = NULL;
-  timelineCreateInfo.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
-  timelineCreateInfo.initialValue = 0;
-  exportSemaphoreCreateInfo.pNext = &timelineCreateInfo;
+    VkSemaphoreTypeCreateInfo timelineCreateInfo;
+    timelineCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO;
+    timelineCreateInfo.pNext = NULL;
+    timelineCreateInfo.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE;
+    timelineCreateInfo.initialValue = 0;
+    exportSemaphoreCreateInfo.pNext = &timelineCreateInfo;
#else
-  exportSemaphoreCreateInfo.pNext = NULL;
+    exportSemaphoreCreateInfo.pNext = NULL;
#endif /* _VK_TIMELINE_SEMAPHORE */
-  exportSemaphoreCreateInfo.handleTypes = handleType;
-  semaphoreInfo.pNext = &exportSemaphoreCreateInfo;
+    exportSemaphoreCreateInfo.handleTypes = handleType;
+    semaphoreInfo.pNext = &exportSemaphoreCreateInfo;
-  if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) !=
-      VK_SUCCESS) {
-    throw std::runtime_error(
-        "failed to create synchronization objects for a CUDA-Vulkan!");
-  }
+    if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create synchronization objects for a CUDA-Vulkan!");
+    }
}
-void VulkanBaseApp::importExternalBuffer(
-    void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size,
-    VkBufferUsageFlags usage, VkMemoryPropertyFlags properties,
-    VkBuffer &buffer, VkDeviceMemory &memory) {
-  VkBufferCreateInfo bufferInfo = {};
-  bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
-  bufferInfo.size = size;
-  bufferInfo.usage = usage;
-  bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+void VulkanBaseApp::importExternalBuffer(void *handle,
+                                         VkExternalMemoryHandleTypeFlagBits handleType,
+                                         size_t size,
+                                         VkBufferUsageFlags usage,
+                                         VkMemoryPropertyFlags properties,
+                                         VkBuffer &buffer,
+                                         VkDeviceMemory &memory)
+{
+    VkBufferCreateInfo bufferInfo = {};
+    bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+    bufferInfo.size = size;
+    bufferInfo.usage = usage;
+    bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
-  if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
-    throw std::runtime_error("failed to create buffer!");
-  }
+    if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create buffer!");
+    }
-  VkMemoryRequirements memRequirements;
-  vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements);
+    VkMemoryRequirements memRequirements;
+    vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements);
#ifdef _WIN64
-  VkImportMemoryWin32HandleInfoKHR handleInfo = {};
-  handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR;
-  handleInfo.pNext = NULL;
-  handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
-  handleInfo.handle = handle;
-  handleInfo.name = NULL;
+    VkImportMemoryWin32HandleInfoKHR handleInfo = {};
+    handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR;
+    handleInfo.pNext = NULL;
+    handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+    handleInfo.handle = handle;
+    handleInfo.name = NULL;
#else
-  VkImportMemoryFdInfoKHR handleInfo = {};
-  handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR;
-  handleInfo.pNext = NULL;
-  handleInfo.fd = (int)(uintptr_t)handle;
-  handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+    VkImportMemoryFdInfoKHR handleInfo = {};
+    handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR;
+    handleInfo.pNext = NULL;
+    handleInfo.fd = (int)(uintptr_t)handle;
+    handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
#endif /* _WIN64 */
-  VkMemoryAllocateInfo memAllocation = {};
-  memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
-  memAllocation.pNext = (void *)&handleInfo;
-  memAllocation.allocationSize = size;
-  memAllocation.memoryTypeIndex = findMemoryType(
-      m_physicalDevice, memRequirements.memoryTypeBits, properties);
+    VkMemoryAllocateInfo memAllocation = {};
+    memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    memAllocation.pNext = (void *)&handleInfo;
+    memAllocation.allocationSize = size;
+    memAllocation.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties);
-  if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) !=
-      VK_SUCCESS) {
-    throw std::runtime_error("Failed to import allocation!");
-  }
+    if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != VK_SUCCESS) {
+        throw std::runtime_error("Failed to import allocation!");
+    }
-  vkBindBufferMemory(m_device, buffer, memory, 0);
+    vkBindBufferMemory(m_device, buffer, memory, 0);
}
-void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) {
-  VkCommandBuffer commandBuffer = beginSingleTimeCommands();
+void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size)
+{
+    VkCommandBuffer commandBuffer = beginSingleTimeCommands();
-  VkBufferCopy copyRegion = {};
-  copyRegion.size = size;
-  vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion);
+    VkBufferCopy copyRegion = {};
+    copyRegion.size = size;
+    vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion);
-  endSingleTimeCommands(commandBuffer);
+    endSingleTimeCommands(commandBuffer);
}
#ifdef _VK_TIMELINE_SEMAPHORE
-void VulkanBaseApp::drawFrame() {
-  static uint64_t waitValue = 0;
-  static uint64_t signalValue = 1;
+void VulkanBaseApp::drawFrame()
+{
+    static uint64_t waitValue = 0;
+    static uint64_t signalValue = 1;
-  VkSemaphoreWaitInfo semaphoreWaitInfo = {};
-  semaphoreWaitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
-  semaphoreWaitInfo.pSemaphores = &m_vkTimelineSemaphore;
-  semaphoreWaitInfo.semaphoreCount = 1;
-  semaphoreWaitInfo.pValues = &waitValue;
-  vkWaitSemaphores(m_device, &semaphoreWaitInfo,
-                   std::numeric_limits<uint64_t>::max());
+    VkSemaphoreWaitInfo semaphoreWaitInfo = {};
+    semaphoreWaitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
+    semaphoreWaitInfo.pSemaphores = &m_vkTimelineSemaphore;
+    semaphoreWaitInfo.semaphoreCount = 1;
+    semaphoreWaitInfo.pValues = &waitValue;
+    vkWaitSemaphores(m_device, &semaphoreWaitInfo, std::numeric_limits<uint64_t>::max());
-  uint32_t imageIndex;
-  VkResult result = vkAcquireNextImageKHR(
-      m_device, m_swapChain, std::numeric_limits<uint64_t>::max(),
-      m_vkPresentationSemaphore, VK_NULL_HANDLE, &imageIndex);
-  if (result == VK_ERROR_OUT_OF_DATE_KHR) {
-    recreateSwapChain();
-  } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
-    throw std::runtime_error("Failed to acquire swap chain image!");
-  }
+    uint32_t imageIndex;
+    VkResult result = vkAcquireNextImageKHR(m_device,
+                                            m_swapChain,
+                                            std::numeric_limits<uint64_t>::max(),
+                                            m_vkPresentationSemaphore,
+                                            VK_NULL_HANDLE,
+                                            &imageIndex);
+    if (result == VK_ERROR_OUT_OF_DATE_KHR) {
+        recreateSwapChain();
+    }
+    else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
+        throw std::runtime_error("Failed to acquire swap chain image!");
+    }
-  updateUniformBuffer(imageIndex);
+    updateUniformBuffer(imageIndex);
-  VkSubmitInfo submitInfo = {};
-  submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    VkSubmitInfo submitInfo = {};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-  std::vector<VkSemaphore> waitSemaphores;
-  std::vector<VkPipelineStageFlags> waitStages;
+    std::vector<VkSemaphore> waitSemaphores;
+    std::vector<VkPipelineStageFlags> waitStages;
-  waitSemaphores.push_back(m_vkTimelineSemaphore);
-  waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
+    waitSemaphores.push_back(m_vkTimelineSemaphore);
+    waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
-  submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size();
-  submitInfo.pWaitSemaphores = waitSemaphores.data();
-  submitInfo.pWaitDstStageMask = waitStages.data();
+    submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size();
+    submitInfo.pWaitSemaphores = waitSemaphores.data();
+    submitInfo.pWaitDstStageMask = waitStages.data();
-  submitInfo.commandBufferCount = 1;
-  submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex];
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex];
-  std::vector<VkSemaphore> signalSemaphores;
-  signalSemaphores.push_back(m_vkTimelineSemaphore);
-  submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size();
-  submitInfo.pSignalSemaphores = signalSemaphores.data();
+    std::vector<VkSemaphore> signalSemaphores;
+    signalSemaphores.push_back(m_vkTimelineSemaphore);
+    submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size();
+    submitInfo.pSignalSemaphores = signalSemaphores.data();
-  VkTimelineSemaphoreSubmitInfo timelineInfo = {};
-  timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
-  timelineInfo.waitSemaphoreValueCount = 1;
-  timelineInfo.pWaitSemaphoreValues = &waitValue;
-  timelineInfo.signalSemaphoreValueCount = 1;
-  timelineInfo.pSignalSemaphoreValues = &signalValue;
+    VkTimelineSemaphoreSubmitInfo timelineInfo = {};
+    timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
+    timelineInfo.waitSemaphoreValueCount = 1;
+    timelineInfo.pWaitSemaphoreValues = &waitValue;
+    timelineInfo.signalSemaphoreValueCount = 1;
+    timelineInfo.pSignalSemaphoreValues = &signalValue;
-  submitInfo.pNext = &timelineInfo;
+    submitInfo.pNext = &timelineInfo;
-  if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) !=
-      VK_SUCCESS) {
-    throw std::runtime_error("failed to submit draw command buffer!");
-  }
+    if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != VK_SUCCESS) {
+        throw std::runtime_error("failed to submit draw command buffer!");
+    }
-  VkPresentInfoKHR presentInfo = {};
-  presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
-  presentInfo.waitSemaphoreCount = 1;
-  presentInfo.pWaitSemaphores = &m_vkPresentationSemaphore;
+    VkPresentInfoKHR presentInfo = {};
+    presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
+    presentInfo.waitSemaphoreCount = 1;
+    presentInfo.pWaitSemaphores = &m_vkPresentationSemaphore;
-  VkSwapchainKHR swapChains[] = {m_swapChain};
-  presentInfo.swapchainCount = 1;
-  presentInfo.pSwapchains = swapChains;
-  presentInfo.pImageIndices = &imageIndex;
+    VkSwapchainKHR swapChains[] = {m_swapChain};
+    presentInfo.swapchainCount = 1;
+    presentInfo.pSwapchains = swapChains;
+    presentInfo.pImageIndices = &imageIndex;
-  result = vkQueuePresentKHR(m_presentQueue, &presentInfo);
-  if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR ||
-      m_framebufferResized) {
-    recreateSwapChain();
-    m_framebufferResized = false;
-  } else if (result != VK_SUCCESS) {
-    throw std::runtime_error("Failed to acquire swap chain image!");
-  }
+    result = vkQueuePresentKHR(m_presentQueue, &presentInfo);
+    if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) {
+        recreateSwapChain();
+        m_framebufferResized = false;
+    }
+    else if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to acquire swap chain image!");
+    }
-  m_currentFrame++;
+    m_currentFrame++;
-  waitValue += 2;
-  signalValue += 2;
+    waitValue += 2;
+    signalValue += 2;
}
#else
-void VulkanBaseApp::drawFrame() {
-  size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT;
-  vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE,
-                  std::numeric_limits<uint64_t>::max());
+void VulkanBaseApp::drawFrame()
+{
+    size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT;
+    vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits<uint64_t>::max());
-  uint32_t imageIndex;
-  VkResult result = vkAcquireNextImageKHR(
-      m_device, m_swapChain, std::numeric_limits<uint64_t>::max(),
-      m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex);
-  if (result == VK_ERROR_OUT_OF_DATE_KHR) {
-    recreateSwapChain();
-  } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
-    throw std::runtime_error("Failed to acquire swap chain image!");
-  }
+    uint32_t imageIndex;
+    VkResult result = vkAcquireNextImageKHR(m_device,
+                                            m_swapChain,
+                                            std::numeric_limits<uint64_t>::max(),
+                                            m_imageAvailableSemaphores[currentFrameIdx],
+                                            VK_NULL_HANDLE,
+                                            &imageIndex);
+    if (result == VK_ERROR_OUT_OF_DATE_KHR) {
+        recreateSwapChain();
+    }
+    else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
+        throw std::runtime_error("Failed to acquire swap chain image!");
+    }
-  updateUniformBuffer(imageIndex);
+    updateUniformBuffer(imageIndex);
-  VkSubmitInfo submitInfo = {};
-  submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    VkSubmitInfo submitInfo = {};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-  std::vector<VkSemaphore> waitSemaphores;
-  std::vector<VkPipelineStageFlags> waitStages;
+    std::vector<VkSemaphore> waitSemaphores;
+    std::vector<VkPipelineStageFlags> waitStages;
-  waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]);
-  waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
-  getWaitFrameSemaphores(waitSemaphores, waitStages);
+    waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]);
+    waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
+    getWaitFrameSemaphores(waitSemaphores, waitStages);
-  submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size();
-  submitInfo.pWaitSemaphores = waitSemaphores.data();
-  submitInfo.pWaitDstStageMask = waitStages.data();
+    submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size();
+    submitInfo.pWaitSemaphores = waitSemaphores.data();
+    submitInfo.pWaitDstStageMask = waitStages.data();
-  submitInfo.commandBufferCount = 1;
-  submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex];
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex];
-  std::vector<VkSemaphore> signalSemaphores;
-  getSignalFrameSemaphores(signalSemaphores);
-  signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]);
-  submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size();
-  submitInfo.pSignalSemaphores = signalSemaphores.data();
+    std::vector<VkSemaphore> signalSemaphores;
+    getSignalFrameSemaphores(signalSemaphores);
+    signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]);
+    submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size();
+    submitInfo.pSignalSemaphores = signalSemaphores.data();
-  vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]);
+    vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]);
-  if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo,
-                    m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) {
-    throw std::runtime_error("failed to submit draw command buffer!");
-  }
+    if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) {
+        throw std::runtime_error("failed to submit draw command buffer!");
+    }
-  VkPresentInfoKHR presentInfo = {};
-  presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
-  presentInfo.waitSemaphoreCount = 1;
-  presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx];
+    VkPresentInfoKHR presentInfo = {};
+    presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
+    presentInfo.waitSemaphoreCount = 1;
+    presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx];
-  VkSwapchainKHR swapChains[] = {m_swapChain};
-  presentInfo.swapchainCount = 1;
-  presentInfo.pSwapchains = swapChains;
-  presentInfo.pImageIndices = &imageIndex;
+    VkSwapchainKHR swapChains[] = {m_swapChain};
+    presentInfo.swapchainCount = 1;
+    presentInfo.pSwapchains = swapChains;
+    presentInfo.pImageIndices = &imageIndex;
-  result = vkQueuePresentKHR(m_presentQueue, &presentInfo);
-  if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR ||
-      m_framebufferResized) {
-    recreateSwapChain();
-    m_framebufferResized = false;
-  } else if (result != VK_SUCCESS) {
-    throw std::runtime_error("Failed to acquire swap chain image!");
-  }
+    result = vkQueuePresentKHR(m_presentQueue, &presentInfo);
+    if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) {
+        recreateSwapChain();
+        m_framebufferResized = false;
+    }
+    else if (result != VK_SUCCESS) {
+        throw std::runtime_error("Failed to acquire swap chain image!");
+    }
-  m_currentFrame++;
+    m_currentFrame++;
}
#endif /* _VK_TIMELINE_SEMAPHORE */
-void VulkanBaseApp::cleanupSwapChain() {
-  if (m_depthImageView != VK_NULL_HANDLE) {
-    vkDestroyImageView(m_device, m_depthImageView, nullptr);
-  }
-  if (m_depthImage != VK_NULL_HANDLE) {
-    vkDestroyImage(m_device, m_depthImage, nullptr);
-  }
-  if (m_depthImageMemory != VK_NULL_HANDLE) {
-    vkFreeMemory(m_device, m_depthImageMemory, nullptr);
-  }
+void VulkanBaseApp::cleanupSwapChain()
+{
+    if (m_depthImageView != VK_NULL_HANDLE) {
+        vkDestroyImageView(m_device, m_depthImageView, nullptr);
+    }
+    if (m_depthImage != VK_NULL_HANDLE) {
+        vkDestroyImage(m_device, m_depthImage, nullptr);
+    }
+    if (m_depthImageMemory != VK_NULL_HANDLE) {
+        vkFreeMemory(m_device, m_depthImageMemory, nullptr);
+    }
-  for (size_t i = 0; i < m_uniformBuffers.size(); i++) {
-    vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr);
-    vkFreeMemory(m_device, m_uniformMemory[i], nullptr);
-  }
+    for (size_t i = 0; i < m_uniformBuffers.size(); i++) {
+        vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr);
+        vkFreeMemory(m_device, m_uniformMemory[i], nullptr);
+    }
-  if (m_descriptorPool != VK_NULL_HANDLE) {
-    vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr);
-  }
+    if (m_descriptorPool != VK_NULL_HANDLE) {
+        vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr);
+    }
-  for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) {
-    vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr);
-  }
+    for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) {
+        vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr);
+    }
-  if (m_graphicsPipeline != VK_NULL_HANDLE) {
-    vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr);
-  }
+    if (m_graphicsPipeline != VK_NULL_HANDLE) {
+        vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr);
+    }
-  if (m_pipelineLayout != VK_NULL_HANDLE) {
-    vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr);
-  }
+    if (m_pipelineLayout != VK_NULL_HANDLE) {
+        vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr);
+    }
-  if (m_renderPass != VK_NULL_HANDLE) {
-    vkDestroyRenderPass(m_device, m_renderPass, nullptr);
-  }
+    if (m_renderPass != VK_NULL_HANDLE) {
+        vkDestroyRenderPass(m_device, m_renderPass, nullptr);
+    }
-  for (size_t i = 0; i < m_swapChainImageViews.size(); i++) {
-    vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr);
-  }
+    for (size_t i = 0; i < m_swapChainImageViews.size(); i++) {
+        vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr);
+    }
-  if (m_swapChain != VK_NULL_HANDLE) {
-    vkDestroySwapchainKHR(m_device, m_swapChain, nullptr);
-  }
+    if (m_swapChain != VK_NULL_HANDLE) {
+        vkDestroySwapchainKHR(m_device, m_swapChain, nullptr);
+    }
}
-void VulkanBaseApp::recreateSwapChain() {
-  int width, height;
+void VulkanBaseApp::recreateSwapChain()
+{
+    int width, height;
-  glfwGetFramebufferSize(m_window, &width, &height);
-  while (width == 0 || height == 0) {
-    glfwWaitEvents();
-    glfwGetFramebufferSize(m_window, &width, &height);
-  }
+    glfwGetFramebufferSize(m_window, &width, &height);
+    while (width == 0 || height == 0) {
+        glfwWaitEvents();
+        glfwGetFramebufferSize(m_window, &width, &height);
+    }
-  vkDeviceWaitIdle(m_device);
+    vkDeviceWaitIdle(m_device);
-  cleanupSwapChain();
+    cleanupSwapChain();
-  createSwapChain();
-  createImageViews();
-  createRenderPass();
-  createGraphicsPipeline();
-  createDepthResources();
-  createFramebuffers();
-  createUniformBuffers();
-  createDescriptorPool();
-  createDescriptorSets();
-  createCommandBuffers();
+    createSwapChain();
+    createImageViews();
+    createRenderPass();
+    createGraphicsPipeline();
+    createDepthResources();
+    createFramebuffers();
+    createUniformBuffers();
+    createDescriptorPool();
+    createDescriptorSets();
+    createCommandBuffers();
}
-void VulkanBaseApp::mainLoop() {
-  while (!glfwWindowShouldClose(m_window)) {
-    glfwPollEvents();
-    drawFrame();
-  }
-  vkDeviceWaitIdle(m_device);
+void VulkanBaseApp::mainLoop()
+{
+    while (!glfwWindowShouldClose(m_window)) {
+        glfwPollEvents();
+        drawFrame();
+    }
+    vkDeviceWaitIdle(m_device);
}
-void readFile(std::istream &s, std::vector<char> &data) {
-  s.seekg(0, std::ios_base::end);
-  data.resize(s.tellg());
-  s.clear();
-  s.seekg(0, std::ios_base::beg);
-  s.read(data.data(), data.size());
+void readFile(std::istream &s, std::vector<char> &data)
+{
+    s.seekg(0, std::ios_base::end);
+    data.resize(s.tellg());
+    s.clear();
+    s.seekg(0, std::ios_base::beg);
+    s.read(data.data(), data.size());
}
diff --git a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h
index fbe99f4d..efb5fac4 100644
--- a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h
+++ b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h
@@ -34,8 +34,8 @@
 #include
 #ifdef _WIN64
 #define NOMINMAX
-#include
 #include
+#include
 #endif /* _WIN64 */
 /* remove _VK_TIMELINE_SEMAPHORE to use binary semaphores */
@@ -44,121 +44,122 @@
 struct GLFWwindow;
-class VulkanBaseApp {
- public:
-  VulkanBaseApp(const std::string& appName, bool enableValidation = false);
-  static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
-  static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
-  virtual ~VulkanBaseApp();
-  void init();
-  void* getMemHandle(VkDeviceMemory memory,
-                     VkExternalMemoryHandleTypeFlagBits handleType);
-  void* getSemaphoreHandle(VkSemaphore semaphore,
-                           VkExternalSemaphoreHandleTypeFlagBits handleType);
-  void createExternalSemaphore(
-      VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
-  void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
-                    VkMemoryPropertyFlags properties, VkBuffer& buffer,
-                    VkDeviceMemory& bufferMemory);
-  void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
-                            VkMemoryPropertyFlags properties,
-                            VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
-                            VkBuffer& buffer, VkDeviceMemory& bufferMemory);
-  void importExternalBuffer(void* handle,
-                            VkExternalMemoryHandleTypeFlagBits handleType,
-                            size_t size, VkBufferUsageFlags usage,
-                            VkMemoryPropertyFlags properties, VkBuffer& buffer,
-                            VkDeviceMemory& memory);
-  void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
-  VkCommandBuffer beginSingleTimeCommands();
-  void endSingleTimeCommands(VkCommandBuffer commandBuffer);
-  void mainLoop();
+class VulkanBaseApp
+{
+public:
+    VulkanBaseApp(const std::string &appName, bool enableValidation = false);
+    static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
+    static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
+    virtual ~VulkanBaseApp();
+    void init();
+    void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType);
+    void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
+    void createExternalSemaphore(VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
+    void createBuffer(VkDeviceSize size,
+                      VkBufferUsageFlags usage,
+                      VkMemoryPropertyFlags properties,
+                      VkBuffer &buffer,
+                      VkDeviceMemory &bufferMemory);
+    void createExternalBuffer(VkDeviceSize size,
+                              VkBufferUsageFlags usage,
+                              VkMemoryPropertyFlags properties,
+                              VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
+                              VkBuffer &buffer,
+                              VkDeviceMemory &bufferMemory);
+    void importExternalBuffer(void *handle,
+                              VkExternalMemoryHandleTypeFlagBits handleType,
+                              size_t size,
+                              VkBufferUsageFlags usage,
+                              VkMemoryPropertyFlags properties,
+                              VkBuffer &buffer,
+                              VkDeviceMemory &memory);
+    void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
+    VkCommandBuffer beginSingleTimeCommands();
+    void endSingleTimeCommands(VkCommandBuffer commandBuffer);
+    void mainLoop();
- protected:
-  const std::string m_appName;
-  const bool m_enableValidation;
-  VkInstance m_instance;
-  VkDebugUtilsMessengerEXT m_debugMessenger;
-  VkSurfaceKHR m_surface;
-  VkPhysicalDevice m_physicalDevice;
-  VkDevice m_device;
-  VkQueue m_graphicsQueue;
-  VkQueue m_presentQueue;
-  VkSwapchainKHR m_swapChain;
-  std::vector<VkImage> m_swapChainImages;
-  VkFormat m_swapChainFormat;
-  VkExtent2D m_swapChainExtent;
-  std::vector<VkImageView> m_swapChainImageViews;
-  std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles;
-  VkRenderPass m_renderPass;
-  VkPipelineLayout m_pipelineLayout;
-  VkPipeline m_graphicsPipeline;
-  std::vector<VkFramebuffer> m_swapChainFramebuffers;
-  VkCommandPool m_commandPool;
-  std::vector<VkCommandBuffer> m_commandBuffers;
-  std::vector<VkSemaphore> m_imageAvailableSemaphores;
-  std::vector<VkSemaphore> m_renderFinishedSemaphores;
-  std::vector<VkFence> m_inFlightFences;
-  std::vector<VkBuffer> m_uniformBuffers;
-  std::vector<VkDeviceMemory> m_uniformMemory;
-  VkSemaphore m_vkPresentationSemaphore;
-  VkSemaphore m_vkTimelineSemaphore;
-  VkDescriptorSetLayout m_descriptorSetLayout;
-  VkDescriptorPool m_descriptorPool;
-  std::vector<VkDescriptorSet> m_descriptorSets;
-  VkImage m_depthImage;
-  VkDeviceMemory m_depthImageMemory;
-  VkImageView m_depthImageView;
-  size_t m_currentFrame;
-  bool m_framebufferResized;
-  uint8_t m_vkDeviceUUID[VK_UUID_SIZE];
+protected:
+    const std::string m_appName;
+    const bool m_enableValidation;
+    VkInstance m_instance;
+    VkDebugUtilsMessengerEXT m_debugMessenger;
+    VkSurfaceKHR m_surface;
+    VkPhysicalDevice m_physicalDevice;
+    VkDevice m_device;
+    VkQueue m_graphicsQueue;
+    VkQueue m_presentQueue;
+    VkSwapchainKHR m_swapChain;
+    std::vector<VkImage> m_swapChainImages;
+    VkFormat m_swapChainFormat;
+    VkExtent2D m_swapChainExtent;
+    std::vector<VkImageView> m_swapChainImageViews;
+    std::vector<std::pair<VkShaderStageFlagBits, std::string>> m_shaderFiles;
+    VkRenderPass m_renderPass;
+    VkPipelineLayout m_pipelineLayout;
+    VkPipeline m_graphicsPipeline;
+    std::vector<VkFramebuffer> m_swapChainFramebuffers;
+    VkCommandPool m_commandPool;
+    std::vector<VkCommandBuffer> m_commandBuffers;
+    std::vector<VkSemaphore> m_imageAvailableSemaphores;
+    std::vector<VkSemaphore> m_renderFinishedSemaphores;
+    std::vector<VkFence> m_inFlightFences;
+    std::vector<VkBuffer> m_uniformBuffers;
+    std::vector<VkDeviceMemory> m_uniformMemory;
+    VkSemaphore m_vkPresentationSemaphore;
+    VkSemaphore m_vkTimelineSemaphore;
+    VkDescriptorSetLayout m_descriptorSetLayout;
+    VkDescriptorPool m_descriptorPool;
+    std::vector<VkDescriptorSet> m_descriptorSets;
+    VkImage m_depthImage;
+    VkDeviceMemory m_depthImageMemory;
+    VkImageView m_depthImageView;
+    size_t m_currentFrame;
+    bool m_framebufferResized;
+    uint8_t m_vkDeviceUUID[VK_UUID_SIZE];
-  virtual void initVulkanApp() {}
-  virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
-  virtual std::vector<const char *> getRequiredExtensions() const;
-  virtual std::vector<const char *> getRequiredDeviceExtensions() const;
-  virtual void getVertexDescriptions(
-      std::vector<VkVertexInputBindingDescription>& bindingDesc,
-      std::vector<VkVertexInputAttributeDescription>& attribDesc);
-  virtual void getAssemblyStateInfo(
-      VkPipelineInputAssemblyStateCreateInfo& info);
-  virtual void getWaitFrameSemaphores(
-      std::vector<VkSemaphore>& wait,
-      std::vector<VkPipelineStageFlags>& waitStages) const;
-  virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
-  virtual VkDeviceSize getUniformSize() const;
-  virtual void updateUniformBuffer(uint32_t imageIndex);
-  virtual void drawFrame();
+    virtual void initVulkanApp() {}
+    virtual void fillRenderingCommandBuffer(VkCommandBuffer &buffer) {}
+    virtual std::vector<const char *> getRequiredExtensions() const;
+    virtual std::vector<const char *> getRequiredDeviceExtensions() const;
+    virtual void getVertexDescriptions(std::vector<VkVertexInputBindingDescription> &bindingDesc,
+                                       std::vector<VkVertexInputAttributeDescription> &attribDesc);
+    virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info);
+    virtual void getWaitFrameSemaphores(std::vector<VkSemaphore> &wait,
+                                        std::vector<VkPipelineStageFlags> &waitStages) const;
+    virtual void getSignalFrameSemaphores(std::vector<VkSemaphore> &signal) const;
+    virtual VkDeviceSize getUniformSize() const;
+    virtual void updateUniformBuffer(uint32_t imageIndex);
+    virtual void drawFrame();
- private:
-  GLFWwindow* m_window;
+private:
+    GLFWwindow *m_window;
-  void initWindow();
-  void initVulkan();
-  void createInstance();
-  void createSurface();
-  void createDevice();
-  void createSwapChain();
-  void createImageViews();
-  void createRenderPass();
-  void createDescriptorSetLayout();
-  void createGraphicsPipeline();
-  void createFramebuffers();
-  void createCommandPool();
-  void createDepthResources();
-  void createUniformBuffers();
-  void createDescriptorPool();
-  void createDescriptorSets();
-  void createCommandBuffers();
-  void createSyncObjects();
+    void initWindow();
+    void initVulkan();
+    void createInstance();
+    void createSurface();
+    void createDevice();
+    void createSwapChain();
+    void createImageViews();
+    void createRenderPass();
+    void createDescriptorSetLayout();
+    void createGraphicsPipeline();
+    void createFramebuffers();
+    void createCommandPool();
+    void createDepthResources();
+    void createUniformBuffers();
+    void createDescriptorPool();
+    void createDescriptorSets();
+    void createCommandBuffers();
+    void createSyncObjects();
-  void cleanupSwapChain();
-  void recreateSwapChain();
+    void cleanupSwapChain();
+    void recreateSwapChain();
-  bool isSuitableDevice(VkPhysicalDevice dev) const;
-  static void resizeCallback(GLFWwindow* window, int width, int height);
+    bool isSuitableDevice(VkPhysicalDevice dev) const;
+    static void resizeCallback(GLFWwindow *window, int width, int height);
};
-void readFile(std::istream& s, std::vector<char>& data);
+void readFile(std::istream &s, std::vector<char> &data);
#endif /* __VULKANBASEAPP_H__ */
diff --git a/Samples/5_Domain_Specific/simpleVulkan/linmath.h b/Samples/5_Domain_Specific/simpleVulkan/linmath.h
index dbedbc16..72ae75dd 100644
--- a/Samples/5_Domain_Specific/simpleVulkan/linmath.h
+++ b/Samples/5_Domain_Specific/simpleVulkan/linmath.h
@@ -30,114 +30,155 @@
 // Converts radians to degrees.
#define radiansToDegrees(angleRadians) (angleRadians * 180.0 / M_PI) -typedef float vec3[3]; -static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) { +typedef float vec3[3]; +static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) +{ int i; - for (i = 0; i < 3; ++i) r[i] = a[i] + b[i]; + for (i = 0; i < 3; ++i) + r[i] = a[i] + b[i]; } -static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) { +static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) +{ int i; - for (i = 0; i < 3; ++i) r[i] = a[i] - b[i]; + for (i = 0; i < 3; ++i) + r[i] = a[i] - b[i]; } -static inline void vec3_scale(vec3 r, vec3 const v, float const s) { +static inline void vec3_scale(vec3 r, vec3 const v, float const s) +{ int i; - for (i = 0; i < 3; ++i) r[i] = v[i] * s; + for (i = 0; i < 3; ++i) + r[i] = v[i] * s; } -static inline float vec3_mul_inner(vec3 const a, vec3 const b) { +static inline float vec3_mul_inner(vec3 const a, vec3 const b) +{ float p = 0.f; - int i; - for (i = 0; i < 3; ++i) p += b[i] * a[i]; + int i; + for (i = 0; i < 3; ++i) + p += b[i] * a[i]; return p; } -static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) { +static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) +{ r[0] = a[1] * b[2] - a[2] * b[1]; r[1] = a[2] * b[0] - a[0] * b[2]; r[2] = a[0] * b[1] - a[1] * b[0]; } static inline float vec3_len(vec3 const v) { return sqrtf(vec3_mul_inner(v, v)); } -static inline void vec3_norm(vec3 r, vec3 const v) { +static inline void vec3_norm(vec3 r, vec3 const v) +{ float k = 1.f / vec3_len(v); vec3_scale(r, v, k); } -static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) { +static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) +{ float p = 2.f * vec3_mul_inner(v, n); - int i; - for (i = 0; i < 3; ++i) r[i] = v[i] - p * n[i]; + int i; + for (i = 0; i < 3; ++i) + r[i] = v[i] - p * n[i]; } -typedef float vec4[4]; -static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) { +typedef float vec4[4]; +static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] + b[i]; } -static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) { +static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] - b[i]; } -static inline void vec4_scale(vec4 r, vec4 v, float s) { +static inline void vec4_scale(vec4 r, vec4 v, float s) +{ int i; - for (i = 0; i < 4; ++i) r[i] = v[i] * s; + for (i = 0; i < 4; ++i) + r[i] = v[i] * s; } -static inline float vec4_mul_inner(vec4 a, vec4 b) { +static inline float vec4_mul_inner(vec4 a, vec4 b) +{ float p = 0.f; - int i; - for (i = 0; i < 4; ++i) p += b[i] * a[i]; + int i; + for (i = 0; i < 4; ++i) + p += b[i] * a[i]; return p; } -static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) { +static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) +{ r[0] = a[1] * b[2] - a[2] * b[1]; r[1] = a[2] * b[0] - a[0] * b[2]; r[2] = a[0] * b[1] - a[1] * b[0]; r[3] = 1.f; } static inline float vec4_len(vec4 v) { return sqrtf(vec4_mul_inner(v, v)); } -static inline void vec4_norm(vec4 r, vec4 v) { +static inline void vec4_norm(vec4 r, vec4 v) +{ float k = 1.f / vec4_len(v); vec4_scale(r, v, k); } -static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) { +static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) +{ float p = 2.f * vec4_mul_inner(v, n); - int i; - for (i = 0; i < 4; ++i) r[i] = v[i] - p * 
n[i]; + int i; + for (i = 0; i < 4; ++i) + r[i] = v[i] - p * n[i]; } -typedef vec4 mat4x4[4]; -static inline void mat4x4_identity(mat4x4 M) { +typedef vec4 mat4x4[4]; +static inline void mat4x4_identity(mat4x4 M) +{ int i, j; for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) M[i][j] = i == j ? 1.f : 0.f; + for (j = 0; j < 4; ++j) + M[i][j] = i == j ? 1.f : 0.f; } -static inline void mat4x4_dup(mat4x4 M, mat4x4 N) { +static inline void mat4x4_dup(mat4x4 M, mat4x4 N) +{ int i, j; for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) M[i][j] = N[i][j]; + for (j = 0; j < 4; ++j) + M[i][j] = N[i][j]; } -static inline void mat4x4_row(vec4 r, mat4x4 M, int i) { +static inline void mat4x4_row(vec4 r, mat4x4 M, int i) +{ int k; - for (k = 0; k < 4; ++k) r[k] = M[k][i]; + for (k = 0; k < 4; ++k) + r[k] = M[k][i]; } -static inline void mat4x4_col(vec4 r, mat4x4 M, int i) { +static inline void mat4x4_col(vec4 r, mat4x4 M, int i) +{ int k; - for (k = 0; k < 4; ++k) r[k] = M[i][k]; + for (k = 0; k < 4; ++k) + r[k] = M[i][k]; } -static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) { +static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) +{ int i, j; for (j = 0; j < 4; ++j) - for (i = 0; i < 4; ++i) M[i][j] = N[j][i]; + for (i = 0; i < 4; ++i) + M[i][j] = N[j][i]; } -static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) { +static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) +{ int i; - for (i = 0; i < 4; ++i) vec4_add(M[i], a[i], b[i]); + for (i = 0; i < 4; ++i) + vec4_add(M[i], a[i], b[i]); } -static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) { +static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) +{ int i; - for (i = 0; i < 4; ++i) vec4_sub(M[i], a[i], b[i]); + for (i = 0; i < 4; ++i) + vec4_sub(M[i], a[i], b[i]); } -static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) { +static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) +{ int i; - for (i = 0; i < 4; ++i) vec4_scale(M[i], a[i], k); + for (i = 0; i < 4; ++i) + vec4_scale(M[i], a[i], k); } -static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) { +static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) +{ int i; vec4_scale(M[0], a[0], x); vec4_scale(M[1], a[1], y); @@ -146,45 +187,54 @@ static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, floa M[3][i] = a[3][i]; } } -static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) { +static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) +{ int k, r, c; for (c = 0; c < 4; ++c) for (r = 0; r < 4; ++r) { M[c][r] = 0.f; - for (k = 0; k < 4; ++k) M[c][r] += a[k][r] * b[c][k]; + for (k = 0; k < 4; ++k) + M[c][r] += a[k][r] * b[c][k]; } } -static inline void mat4x4_mul_vec4(vec4 r, mat4x4 M, vec4 v) { +static inline void mat4x4_mul_vec4(vec4 r, mat4x4 M, vec4 v) +{ int i, j; for (j = 0; j < 4; ++j) { r[j] = 0.f; - for (i = 0; i < 4; ++i) r[j] += M[i][j] * v[i]; + for (i = 0; i < 4; ++i) + r[j] += M[i][j] * v[i]; } } -static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) { +static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) +{ mat4x4_identity(T); T[3][0] = x; T[3][1] = y; T[3][2] = z; } -static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) { +static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) +{ vec4 t = {x, y, z, 0}; vec4 r; - int i; + int i; for (i = 0; i < 4; ++i) { mat4x4_row(r, M, i); M[3][i] += vec4_mul_inner(r, t); } } -static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, 
vec3 a, vec3 b) { +static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, vec3 a, vec3 b) +{ int i, j; for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f; + for (j = 0; j < 4; ++j) + M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f; } -static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) { +static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) +{ float s = sinf(angle); float c = cosf(angle); - vec3 u = {x, y, z}; + vec3 u = {x, y, z}; if (vec3_len(u) > 1e-4) { vec3_norm(u, u); @@ -205,29 +255,34 @@ static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, T[3][3] = 1.; mat4x4_mul(R, M, T); - } else { + } + else { mat4x4_dup(R, M); } } -static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) { - float s = sinf(angle); - float c = cosf(angle); +static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) +{ + float s = sinf(angle); + float c = cosf(angle); mat4x4 R = {{1.f, 0.f, 0.f, 0.f}, {0.f, c, s, 0.f}, {0.f, -s, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; mat4x4_mul(Q, M, R); } -static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) { - float s = sinf(angle); - float c = cosf(angle); +static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) +{ + float s = sinf(angle); + float c = cosf(angle); mat4x4 R = {{c, 0.f, s, 0.f}, {0.f, 1.f, 0.f, 0.f}, {-s, 0.f, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; mat4x4_mul(Q, M, R); } -static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) { - float s = sinf(angle); - float c = cosf(angle); +static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) +{ + float s = sinf(angle); + float c = cosf(angle); mat4x4 R = {{c, s, 0.f, 0.f}, {-s, c, 0.f, 0.f}, {0.f, 0.f, 1.f, 0.f}, {0.f, 0.f, 0.f, 1.f}}; mat4x4_mul(Q, M, R); } -static inline void mat4x4_invert(mat4x4 T, mat4x4 M) { +static inline void mat4x4_invert(mat4x4 T, mat4x4 M) +{ float s[6]; float c[6]; s[0] = M[0][0] * M[1][1] - M[1][0] * M[0][1]; @@ -267,10 +322,11 @@ static inline void mat4x4_invert(mat4x4 T, mat4x4 M) { T[3][2] = (-M[3][0] * s[3] + M[3][1] * s[1] - M[3][2] * s[0]) * idet; T[3][3] = (M[2][0] * s[3] - M[2][1] * s[1] + M[2][2] * s[0]) * idet; } -static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) { +static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) +{ mat4x4_dup(R, M); float s = 1.; - vec3 h; + vec3 h; vec3_norm(R[2], R[2]); @@ -290,7 +346,8 @@ static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) { vec3_norm(R[0], R[0]); } -static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) { +static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) +{ M[0][0] = 2.f * n / (r - l); M[0][1] = M[0][2] = M[0][3] = 0.f; @@ -305,7 +362,8 @@ static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, M[3][2] = -2.f * (f * n) / (f - n); M[3][0] = M[3][1] = M[3][3] = 0.f; } -static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) { +static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) +{ M[0][0] = 2.f / (r - l); M[0][1] = M[0][2] = M[0][3] = 0.f; @@ -320,7 +378,8 @@ static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, fl M[3][2] = -(f + n) / (f - n); M[3][3] = 1.f; } -static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) { +static inline void 
mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) +{ /* NOTE: Degrees are an unhandy unit to work with. * linmath.h uses radians for everything! */ float const a = (float)(1.f / tan(y_fov / 2.f)); @@ -345,7 +404,8 @@ static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float m[3][2] = -((2.f * f * n) / (f - n)); m[3][3] = 0.f; } -static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) { +static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) +{ /* Adapted from Android's OpenGL Matrix.java. */ /* See the OpenGL GLUT documentation for gluLookAt for a description */ /* of the algorithm. We implement it in a straightforward way: */ @@ -386,20 +446,26 @@ static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) { mat4x4_translate_in_place(m, -eye[0], -eye[1], -eye[2]); } -typedef float quat[4]; -static inline void quat_identity(quat q) { +typedef float quat[4]; +static inline void quat_identity(quat q) +{ q[0] = q[1] = q[2] = 0.f; - q[3] = 1.f; + q[3] = 1.f; } -static inline void quat_add(quat r, quat a, quat b) { +static inline void quat_add(quat r, quat a, quat b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] + b[i]; } -static inline void quat_sub(quat r, quat a, quat b) { +static inline void quat_sub(quat r, quat a, quat b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] - b[i]; } -static inline void quat_mul(quat r, quat p, quat q) { +static inline void quat_mul(quat r, quat p, quat q) +{ vec3 w; vec3_mul_cross(r, p, q); vec3_scale(w, p, q[3]); @@ -408,23 +474,30 @@ static inline void quat_mul(quat r, quat p, quat q) { vec3_add(r, r, w); r[3] = p[3] * q[3] - vec3_mul_inner(p, q); } -static inline void quat_scale(quat r, quat v, float s) { +static inline void quat_scale(quat r, quat v, float s) +{ int i; - for (i = 0; i < 4; ++i) r[i] = v[i] * s; + for (i = 0; i < 4; ++i) + r[i] = v[i] * s; } -static inline float quat_inner_product(quat a, quat b) { +static inline float quat_inner_product(quat a, quat b) +{ float p = 0.f; - int i; - for (i = 0; i < 4; ++i) p += b[i] * a[i]; + int i; + for (i = 0; i < 4; ++i) + p += b[i] * a[i]; return p; } -static inline void quat_conj(quat r, quat q) { +static inline void quat_conj(quat r, quat q) +{ int i; - for (i = 0; i < 3; ++i) r[i] = -q[i]; + for (i = 0; i < 3; ++i) + r[i] = -q[i]; r[3] = q[3]; } #define quat_norm vec4_norm -static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) { +static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) +{ quat v_ = {v[0], v[1], v[2], 0.f}; quat_conj(r, q); @@ -432,11 +505,12 @@ static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) { quat_mul(r, v_, r); quat_mul(r, q, r); } -static inline void mat4x4_from_quat(mat4x4 M, quat q) { - float a = q[3]; - float b = q[0]; - float c = q[1]; - float d = q[2]; +static inline void mat4x4_from_quat(mat4x4 M, quat q) +{ + float a = q[3]; + float b = q[0]; + float c = q[1]; + float d = q[2]; float a2 = a * a; float b2 = b * b; float c2 = c * c; @@ -458,10 +532,11 @@ static inline void mat4x4_from_quat(mat4x4 M, quat q) { M[2][3] = 0.f; M[3][0] = M[3][1] = M[3][2] = 0.f; - M[3][3] = 1.f; + M[3][3] = 1.f; } -static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) { +static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) +{ /* XXX: The way this is written only works for othogonal matrices. */ /* TODO: Take care of non-orthogonal case. 
*/ quat_mul_vec3(R[0], q, M[0]); @@ -469,18 +544,20 @@ static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) { quat_mul_vec3(R[2], q, M[2]); R[3][0] = R[3][1] = R[3][2] = 0.f; - R[3][3] = 1.f; + R[3][3] = 1.f; } -static inline void quat_from_mat4x4(quat q, mat4x4 M) { +static inline void quat_from_mat4x4(quat q, mat4x4 M) +{ float r = 0.f; - int i; + int i; - int perm[] = {0, 1, 2, 0, 1}; - int *p = perm; + int perm[] = {0, 1, 2, 0, 1}; + int *p = perm; for (i = 0; i < 3; i++) { float m = M[i][i]; - if (m < r) continue; + if (m < r) + continue; m = r; p = &perm[i]; } diff --git a/Samples/5_Domain_Specific/simpleVulkan/main.cpp b/Samples/5_Domain_Specific/simpleVulkan/main.cpp index f676c6bc..2419220e 100644 --- a/Samples/5_Domain_Specific/simpleVulkan/main.cpp +++ b/Samples/5_Domain_Specific/simpleVulkan/main.cpp @@ -25,20 +25,18 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "VulkanBaseApp.h" - -#include -#include -#include #include -#include "linmath.h" +#include +#include +#include #include "SineWaveSimulation.h" - -#include +#include "VulkanBaseApp.h" +#include "helper_cuda.h" +#include "linmath.h" typedef float vec2[2]; -std::string execution_path; +std::string execution_path; #ifndef NDEBUG #define ENABLE_VALIDATION (false) @@ -46,498 +44,483 @@ std::string execution_path; #define ENABLE_VALIDATION (true) #endif -class VulkanCudaSineWave : public VulkanBaseApp { - typedef struct UniformBufferObject_st { - mat4x4 modelViewProj; - } UniformBufferObject; - - VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; - VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; - UniformBufferObject m_ubo; - VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; - SineWaveSimulation m_sim; - cudaStream_t m_stream; - cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore, - m_cudaTimelineSemaphore; - cudaExternalMemory_t m_cudaVertMem; - float *m_cudaHeightMap; - using chrono_tp = std::chrono::time_point; - chrono_tp m_lastTime; - size_t m_lastFrame; - - public: - VulkanCudaSineWave(size_t width, size_t height) - : VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION), - m_heightBuffer(VK_NULL_HANDLE), - m_xyBuffer(VK_NULL_HANDLE), - m_indexBuffer(VK_NULL_HANDLE), - m_heightMemory(VK_NULL_HANDLE), - m_xyMemory(VK_NULL_HANDLE), - m_indexMemory(VK_NULL_HANDLE), - m_ubo(), - m_sim(width, height), - m_stream(0), - m_vkWaitSemaphore(VK_NULL_HANDLE), - m_vkSignalSemaphore(VK_NULL_HANDLE), - m_cudaWaitSemaphore(), - m_cudaSignalSemaphore(), - m_cudaTimelineSemaphore(), - m_cudaVertMem(), - m_cudaHeightMap(nullptr), - m_lastFrame(0) { - // Our index buffer can only index 32-bits of the vertex buffer - if ((width * height) > (1ULL << 32ULL)) { - throw std::runtime_error( - "Requested height and width is too large for this sample!"); - } - // Add our compiled vulkan shader files - char *vertex_shader_path = - sdkFindFilePath("vert.spv", execution_path.c_str()); - char *fragment_shader_path = - sdkFindFilePath("frag.spv", execution_path.c_str()); - m_shaderFiles.push_back( - std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); - m_shaderFiles.push_back( - std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); - } - ~VulkanCudaSineWave() { - // Make sure there's no pending work before we start tearing down - checkCudaErrors(cudaStreamSynchronize(m_stream)); - -#ifdef _VK_TIMELINE_SEMAPHORE - if (m_vkTimelineSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaTimelineSemaphore)); - 
vkDestroySemaphore(m_device, m_vkTimelineSemaphore, nullptr); - } -#endif /* _VK_TIMELINE_SEMAPHORE */ - - if (m_vkSignalSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); - vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); - } - if (m_vkWaitSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); - vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); - } - - if (m_xyBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_xyBuffer, nullptr); - } - if (m_xyMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_xyMemory, nullptr); - } - - if (m_heightBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_heightBuffer, nullptr); - } - if (m_heightMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_heightMemory, nullptr); - } - if (m_cudaHeightMap) { - checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem)); - } - - if (m_indexBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_indexBuffer, nullptr); - } - if (m_indexMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_indexMemory, nullptr); - } - } - - void fillRenderingCommandBuffer(VkCommandBuffer &commandBuffer) { - VkBuffer vertexBuffers[] = {m_heightBuffer, m_xyBuffer}; - VkDeviceSize offsets[] = {0, 0}; - vkCmdBindVertexBuffers(commandBuffer, 0, - sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), - vertexBuffers, offsets); - vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32); - vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * - (m_sim.getHeight() - 1) * 6), - 1, 0, 0, 0); - } - - void getVertexDescriptions( - std::vector &bindingDesc, - std::vector &attribDesc) { - bindingDesc.resize(2); - attribDesc.resize(2); - - bindingDesc[0].binding = 0; - bindingDesc[0].stride = sizeof(float); - bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - bindingDesc[1].binding = 1; - bindingDesc[1].stride = sizeof(vec2); - bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - attribDesc[0].binding = 0; - attribDesc[0].location = 0; - attribDesc[0].format = VK_FORMAT_R32_SFLOAT; - attribDesc[0].offset = 0; - - attribDesc[1].binding = 1; - attribDesc[1].location = 1; - attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; - attribDesc[1].offset = 0; - } - - void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) { - info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - info.primitiveRestartEnable = VK_FALSE; - } - - void getWaitFrameSemaphores( - std::vector &wait, - std::vector &waitStages) const { - if (m_currentFrame != 0) { - // Have vulkan wait until cuda is done with the vertex buffer before - // rendering, We don't do this on the first frame, as the wait semaphore - // hasn't been initialized yet - wait.push_back(m_vkWaitSemaphore); - // We want to wait until all the pipeline commands are complete before - // letting cuda work - waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); - } - } - - void getSignalFrameSemaphores(std::vector &signal) const { - // Add this semaphore for vulkan to signal once the vertex buffer is ready - // for cuda to modify - signal.push_back(m_vkSignalSemaphore); - } - - void initVulkanApp() { - int cuda_device = -1; - - // Select cuda device where vulkan is running. 
- cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE); - if (cuda_device == -1) { - printf("Error: No CUDA-Vulkan interop capable device found\n"); - exit(EXIT_FAILURE); - } - - m_sim.initCudaLaunchConfig(cuda_device); - - // Create the cuda stream we'll be using - checkCudaErrors( - cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); - - const size_t nVerts = m_sim.getWidth() * m_sim.getHeight(); - const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6; - - // Create the height map cuda will write to - createExternalBuffer( - nVerts * sizeof(float), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, getDefaultMemHandleType(), - m_heightBuffer, m_heightMemory); - - // Create the vertex buffer that will hold the xy coordinates for the grid - createBuffer(nVerts * sizeof(vec2), VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyBuffer, m_xyMemory); - - // Create the index buffer that references from both buffers above - createBuffer( - nInds * sizeof(uint32_t), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_indexBuffer, m_indexMemory); - - // Import the height map into cuda and retrieve a device pointer to use - importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, - m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), - getDefaultMemHandleType()); - // Set the height map to use in the simulation - m_sim.initSimulation(m_cudaHeightMap); - +class VulkanCudaSineWave : public VulkanBaseApp +{ + typedef struct UniformBufferObject_st { - // Set up the initial values for the vertex buffers with Vulkan - void *stagingBase; - VkBuffer stagingBuffer; - VkDeviceMemory stagingMemory; - VkDeviceSize stagingSz = - std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t)); - createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - stagingBuffer, stagingMemory); + mat4x4 modelViewProj; + } UniformBufferObject; - vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase); + VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; + VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; + UniformBufferObject m_ubo; + VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; + SineWaveSimulation m_sim; + cudaStream_t m_stream; + cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore, m_cudaTimelineSemaphore; + cudaExternalMemory_t m_cudaVertMem; + float *m_cudaHeightMap; + using chrono_tp = std::chrono::time_point; + chrono_tp m_lastTime; + size_t m_lastFrame; - memset(stagingBase, 0, nVerts * sizeof(float)); - copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float)); - - for (size_t y = 0; y < m_sim.getHeight(); y++) { - for (size_t x = 0; x < m_sim.getWidth(); x++) { - vec2 *stagedVert = (vec2 *)stagingBase; - stagedVert[y * m_sim.getWidth() + x][0] = - (2.0f * x) / (m_sim.getWidth() - 1) - 1; - stagedVert[y * m_sim.getWidth() + x][1] = - (2.0f * y) / (m_sim.getHeight() - 1) - 1; +public: + VulkanCudaSineWave(size_t width, size_t height) + : VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION) + , m_heightBuffer(VK_NULL_HANDLE) + , m_xyBuffer(VK_NULL_HANDLE) + , m_indexBuffer(VK_NULL_HANDLE) + , m_heightMemory(VK_NULL_HANDLE) + , m_xyMemory(VK_NULL_HANDLE) + , m_indexMemory(VK_NULL_HANDLE) + , m_ubo() + , m_sim(width, height) + , m_stream(0) + , 
m_vkWaitSemaphore(VK_NULL_HANDLE) + , m_vkSignalSemaphore(VK_NULL_HANDLE) + , m_cudaWaitSemaphore() + , m_cudaSignalSemaphore() + , m_cudaTimelineSemaphore() + , m_cudaVertMem() + , m_cudaHeightMap(nullptr) + , m_lastFrame(0) + { + // Our index buffer can only index 32-bits of the vertex buffer + if ((width * height) > (1ULL << 32ULL)) { + throw std::runtime_error("Requested height and width is too large for this sample!"); } - } - copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2)); - - { - uint32_t *indices = (uint32_t *)stagingBase; - for (size_t y = 0; y < m_sim.getHeight() - 1; y++) { - for (size_t x = 0; x < m_sim.getWidth() - 1; x++) { - indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0)); - indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); - indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); - indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); - indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1)); - indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); - indices += 6; - } - } - } - copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t)); - - vkUnmapMemory(m_device, stagingMemory); - vkDestroyBuffer(m_device, stagingBuffer, nullptr); - vkFreeMemory(m_device, stagingMemory, nullptr); + // Add our compiled vulkan shader files + char *vertex_shader_path = sdkFindFilePath("vert.spv", execution_path.c_str()); + char *fragment_shader_path = sdkFindFilePath("frag.spv", execution_path.c_str()); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); } + ~VulkanCudaSineWave() + { + // Make sure there's no pending work before we start tearing down + checkCudaErrors(cudaStreamSynchronize(m_stream)); #ifdef _VK_TIMELINE_SEMAPHORE - // Create the timeline semaphore to sync cuda and vulkan access to vertex - // buffer - createExternalSemaphore(m_vkTimelineSemaphore, - getDefaultSemaphoreHandleType()); - // Import the timeline semaphore cuda will use to sync cuda and vulkan - // access to vertex buffer - importCudaExternalSemaphore(m_cudaTimelineSemaphore, m_vkTimelineSemaphore, - getDefaultSemaphoreHandleType()); -#else - // Create the semaphore vulkan will signal when it's done with the vertex - // buffer - createExternalSemaphore(m_vkSignalSemaphore, - getDefaultSemaphoreHandleType()); - // Create the semaphore vulkan will wait for before using the vertex buffer - createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait - importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, - getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait - importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, - getDefaultSemaphoreHandleType()); + if (m_vkTimelineSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaTimelineSemaphore)); + vkDestroySemaphore(m_device, m_vkTimelineSemaphore, nullptr); + } #endif /* _VK_TIMELINE_SEMAPHORE */ - } - void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t &cudaMem, - VkDeviceMemory &vkMem, VkDeviceSize size, - VkExternalMemoryHandleTypeFlagBits handleType) { - cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; + if (m_vkSignalSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); + 
vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr);
+ }
+ if (m_vkWaitSemaphore != VK_NULL_HANDLE) {
+ checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore));
+ vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr);
+ }
- if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
- } else if (handleType &
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
- externalMemoryHandleDesc.type =
- cudaExternalMemoryHandleTypeOpaqueWin32Kmt;
- } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
- externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
- } else {
- throw std::runtime_error("Unknown handle type requested!");
+ if (m_xyBuffer != VK_NULL_HANDLE) {
+ vkDestroyBuffer(m_device, m_xyBuffer, nullptr);
+ }
+ if (m_xyMemory != VK_NULL_HANDLE) {
+ vkFreeMemory(m_device, m_xyMemory, nullptr);
+ }
+
+ if (m_heightBuffer != VK_NULL_HANDLE) {
+ vkDestroyBuffer(m_device, m_heightBuffer, nullptr);
+ }
+ if (m_heightMemory != VK_NULL_HANDLE) {
+ vkFreeMemory(m_device, m_heightMemory, nullptr);
+ }
+ if (m_cudaHeightMap) {
+ checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem));
+ }
+
+ if (m_indexBuffer != VK_NULL_HANDLE) {
+ vkDestroyBuffer(m_device, m_indexBuffer, nullptr);
+ }
+ if (m_indexMemory != VK_NULL_HANDLE) {
+ vkFreeMemory(m_device, m_indexMemory, nullptr);
+ } }
- externalMemoryHandleDesc.size = size;
+ void fillRenderingCommandBuffer(VkCommandBuffer &commandBuffer)
+ {
+ VkBuffer vertexBuffers[] = {m_heightBuffer, m_xyBuffer};
+ VkDeviceSize offsets[] = {0, 0};
+ vkCmdBindVertexBuffers(
+ commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets);
+ vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32);
+ vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6), 1, 0, 0, 0);
+ }
+
+ void getVertexDescriptions(std::vector<VkVertexInputBindingDescription> &bindingDesc,
+ std::vector<VkVertexInputAttributeDescription> &attribDesc)
+ {
+ bindingDesc.resize(2);
+ attribDesc.resize(2);
+
+ bindingDesc[0].binding = 0;
+ bindingDesc[0].stride = sizeof(float);
+ bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+ bindingDesc[1].binding = 1;
+ bindingDesc[1].stride = sizeof(vec2);
+ bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+ attribDesc[0].binding = 0;
+ attribDesc[0].location = 0;
+ attribDesc[0].format = VK_FORMAT_R32_SFLOAT;
+ attribDesc[0].offset = 0;
+
+ attribDesc[1].binding = 1;
+ attribDesc[1].location = 1;
+ attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT;
+ attribDesc[1].offset = 0;
+ }
+
+ void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info)
+ {
+ info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+ info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
+ info.primitiveRestartEnable = VK_FALSE;
+ }
+
+ void getWaitFrameSemaphores(std::vector<VkSemaphore> &wait, std::vector<VkPipelineStageFlags> &waitStages) const
+ {
+ if (m_currentFrame != 0) {
+ // Have vulkan wait until cuda is done with the vertex buffer before
+ // rendering. We don't do this on the first frame, as the wait semaphore
+ // hasn't been initialized yet
+ wait.push_back(m_vkWaitSemaphore);
+ // We want to wait until all the pipeline commands are complete before
+ // letting cuda work
+ waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
+ }
+ }
+
+ void getSignalFrameSemaphores(std::vector<VkSemaphore> &signal) const
+ {
+ // Add this semaphore for vulkan to signal once the vertex buffer is ready
+ // for cuda to
modify
+ signal.push_back(m_vkSignalSemaphore);
+ }
+
+ void initVulkanApp()
+ {
+ int cuda_device = -1;
+
+ // Select cuda device where vulkan is running.
+ cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE);
+ if (cuda_device == -1) {
+ printf("Error: No CUDA-Vulkan interop capable device found\n");
+ exit(EXIT_FAILURE);
+ }
+
+ m_sim.initCudaLaunchConfig(cuda_device);
+
+ // Create the cuda stream we'll be using
+ checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
+
+ const size_t nVerts = m_sim.getWidth() * m_sim.getHeight();
+ const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6;
+
+ // Create the height map cuda will write to
+ createExternalBuffer(nVerts * sizeof(float),
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ getDefaultMemHandleType(),
+ m_heightBuffer,
+ m_heightMemory);
+
+ // Create the vertex buffer that will hold the xy coordinates for the grid
+ createBuffer(nVerts * sizeof(vec2),
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ m_xyBuffer,
+ m_xyMemory);
+
+ // Create the index buffer that references both buffers above
+ createBuffer(nInds * sizeof(uint32_t),
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ m_indexBuffer,
+ m_indexMemory);
+
+ // Import the height map into cuda and retrieve a device pointer to use
+ importCudaExternalMemory((void **)&m_cudaHeightMap,
+ m_cudaVertMem,
+ m_heightMemory,
+ nVerts * sizeof(*m_cudaHeightMap),
+ getDefaultMemHandleType());
+ // Set the height map to use in the simulation
+ m_sim.initSimulation(m_cudaHeightMap);
+
+ {
+ // Set up the initial values for the vertex buffers with Vulkan
+ void *stagingBase;
+ VkBuffer stagingBuffer;
+ VkDeviceMemory stagingMemory;
+ VkDeviceSize stagingSz = std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t));
+ createBuffer(stagingSz,
+ VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ stagingBuffer,
+ stagingMemory);
+
+ vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase);
+
+ memset(stagingBase, 0, nVerts * sizeof(float));
+ copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float));
+
+ for (size_t y = 0; y < m_sim.getHeight(); y++) {
+ for (size_t x = 0; x < m_sim.getWidth(); x++) {
+ vec2 *stagedVert = (vec2 *)stagingBase;
+ stagedVert[y * m_sim.getWidth() + x][0] = (2.0f * x) / (m_sim.getWidth() - 1) - 1;
+ stagedVert[y * m_sim.getWidth() + x][1] = (2.0f * y) / (m_sim.getHeight() - 1) - 1;
+ }
+ }
+ copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2));
+
+ {
+ uint32_t *indices = (uint32_t *)stagingBase;
+ for (size_t y = 0; y < m_sim.getHeight() - 1; y++) {
+ for (size_t x = 0; x < m_sim.getWidth() - 1; x++) {
+ indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0));
+ indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0));
+ indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1));
+ indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0));
+ indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1));
+ indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1));
+ indices += 6;
+ }
+ }
+ }
+ copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t));
+
+ vkUnmapMemory(m_device, stagingMemory);
+ vkDestroyBuffer(m_device, stagingBuffer, nullptr);
+ vkFreeMemory(m_device, stagingMemory, nullptr);
+ }
+
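+ // Illustration (not from the sample): each grid cell emits two triangles,
+ // six indices. For the smallest grid, width == height == 2, the loop above
+ // produces exactly one cell:
+ //   uint32_t quad[6] = {0, 2, 1,  // triangle one: vertices (0,0), (0,1), (1,0)
+ //                       2, 3, 1}; // triangle two: vertices (0,1), (1,1), (1,0)
+ // where vertex (x, y) lives at index (y * width + x); the two triangles
+ // share indices 2 and 1 along the cell diagonal.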
+#ifdef _VK_TIMELINE_SEMAPHORE
+ // Create the timeline semaphore to sync cuda and vulkan access to vertex
+ // buffer
+ createExternalSemaphore(m_vkTimelineSemaphore, getDefaultSemaphoreHandleType());
+ // Import the timeline semaphore cuda will use to sync cuda and vulkan
+ // access to vertex buffer
+ importCudaExternalSemaphore(m_cudaTimelineSemaphore, m_vkTimelineSemaphore, getDefaultSemaphoreHandleType());
+#else
+ // Create the semaphore vulkan will signal when it's done with the vertex
+ // buffer
+ createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
+ // Create the semaphore vulkan will wait for before using the vertex buffer
+ createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
+ // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
+ importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
+ // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
+ importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
+#endif /* _VK_TIMELINE_SEMAPHORE */
+ }
+
+ void importCudaExternalMemory(void **cudaPtr,
+ cudaExternalMemory_t &cudaMem,
+ VkDeviceMemory &vkMem,
+ VkDeviceSize size,
+ VkExternalMemoryHandleTypeFlagBits handleType)
+ {
+ cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
+
+ if (handleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32;
+ }
+ else if (handleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32Kmt;
+ }
+ else if (handleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT) {
+ externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
+ }
+ else {
+ throw std::runtime_error("Unknown handle type requested!");
+ }
+
+ externalMemoryHandleDesc.size = size;
#ifdef _WIN64
- externalMemoryHandleDesc.handle.win32.handle =
- (HANDLE)getMemHandle(vkMem, handleType);
+ externalMemoryHandleDesc.handle.win32.handle = (HANDLE)getMemHandle(vkMem, handleType);
#else
- externalMemoryHandleDesc.handle.fd =
- (int)(uintptr_t)getMemHandle(vkMem, handleType);
+ externalMemoryHandleDesc.handle.fd = (int)(uintptr_t)getMemHandle(vkMem, handleType);
#endif
- checkCudaErrors(
- cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc));
+ checkCudaErrors(cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc));
- cudaExternalMemoryBufferDesc externalMemBufferDesc = {};
- externalMemBufferDesc.offset = 0;
- externalMemBufferDesc.size = size;
- externalMemBufferDesc.flags = 0;
+ cudaExternalMemoryBufferDesc externalMemBufferDesc = {};
+ externalMemBufferDesc.offset = 0;
+ externalMemBufferDesc.size = size;
+ externalMemBufferDesc.flags = 0;
- checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem,
- &externalMemBufferDesc));
- }
+ checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, &externalMemBufferDesc));
+ }
- void importCudaExternalSemaphore(
- cudaExternalSemaphore_t &cudaSem, VkSemaphore &vkSem,
- VkExternalSemaphoreHandleTypeFlagBits handleType) {
- cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
+ void importCudaExternalSemaphore(cudaExternalSemaphore_t &cudaSem,
+ VkSemaphore &vkSem,
+ VkExternalSemaphoreHandleTypeFlagBits handleType)
+ {
+ cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
#ifdef _VK_TIMELINE_SEMAPHORE
- if
(handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; - } else if (handleType & - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; - } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd; - } + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd; + } #else - if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeOpaqueWin32; - } else if (handleType & - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeOpaqueFd; - } + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + } #endif /* _VK_TIMELINE_SEMAPHORE */ - else { - throw std::runtime_error("Unknown handle type requested!"); - } + else { + throw std::runtime_error("Unknown handle type requested!"); + } #ifdef _WIN64 - externalSemaphoreHandleDesc.handle.win32.handle = - (HANDLE)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); #else - externalSemaphoreHandleDesc.handle.fd = - (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); #endif - externalSemaphoreHandleDesc.flags = 0; + externalSemaphoreHandleDesc.flags = 0; - checkCudaErrors( - cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); - } - - VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); } - - void updateUniformBuffer(uint32_t imageIndex) { - { - mat4x4 view, proj; - vec3 eye = {1.75f, 1.75f, 1.25f}; - vec3 center = {0.0f, 0.0f, -0.25f}; - vec3 up = {0.0f, 0.0f, 1.0f}; - - mat4x4_perspective( - proj, (float)degreesToRadians(45.0f), - m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, - 10.0f); - proj[1][1] *= -1.0f; // Flip y axis - - mat4x4_look_at(view, eye, center, up); - mat4x4_mul(m_ubo.modelViewProj, proj, view); + checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); } - void *data; - vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, - &data); - memcpy(data, &m_ubo, 
sizeof(m_ubo));
- vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
- }
+ VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); }
- std::vector<const char *> getRequiredExtensions() const {
- std::vector<const char *> extensions;
- extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
- extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
- return extensions;
- }
+ void updateUniformBuffer(uint32_t imageIndex)
+ {
+ {
+ mat4x4 view, proj;
+ vec3 eye = {1.75f, 1.75f, 1.25f};
+ vec3 center = {0.0f, 0.0f, -0.25f};
+ vec3 up = {0.0f, 0.0f, 1.0f};
- std::vector<const char *> getRequiredDeviceExtensions() const {
- std::vector<const char *> extensions;
- extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
- extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
- extensions.push_back(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
+ mat4x4_perspective(proj,
+ (float)degreesToRadians(45.0f),
+ m_swapChainExtent.width / (float)m_swapChainExtent.height,
+ 0.1f,
+ 10.0f);
+ proj[1][1] *= -1.0f; // Flip y axis
+
+ mat4x4_look_at(view, eye, center, up);
+ mat4x4_mul(m_ubo.modelViewProj, proj, view);
+ }
+
+ void *data;
+ vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data);
+ memcpy(data, &m_ubo, sizeof(m_ubo));
+ vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
+ }
+
+ std::vector<const char *> getRequiredExtensions() const
+ {
+ std::vector<const char *> extensions;
+ extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
+ return extensions;
+ }
+
+ std::vector<const char *> getRequiredDeviceExtensions() const
+ {
+ std::vector<const char *> extensions;
+ extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
#ifdef _WIN64
- extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
- extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
#else
- extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
- extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
+ extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
#endif /* _WIN64 */
- return extensions;
- }
-
- void drawFrame() {
- static chrono_tp startTime = std::chrono::high_resolution_clock::now();
-
- chrono_tp currentTime = std::chrono::high_resolution_clock::now();
- float time = std::chrono::duration<float, std::chrono::seconds::period>(
- currentTime - startTime)
- .count();
-
- if (m_currentFrame == 0) {
- m_lastTime = startTime;
+ return extensions; }
- float frame_time =
- std::chrono::duration<float, std::chrono::seconds::period>(currentTime -
- m_lastTime)
- .count();
+ void drawFrame()
+ {
+ static chrono_tp startTime = std::chrono::high_resolution_clock::now();
- // Have vulkan draw the current frame...
- VulkanBaseApp::drawFrame();
+ chrono_tp currentTime = std::chrono::high_resolution_clock::now();
+ float time = std::chrono::duration<float, std::chrono::seconds::period>(currentTime - startTime).count();
+
+ if (m_currentFrame == 0) {
+ m_lastTime = startTime;
+ }
+
+ float frame_time = std::chrono::duration<float, std::chrono::seconds::period>(currentTime - m_lastTime).count();
+
+ // Have vulkan draw the current frame...
+ VulkanBaseApp::drawFrame();
#ifdef _VK_TIMELINE_SEMAPHORE
- static uint64_t waitValue = 1;
- static uint64_t signalValue = 2;
+ static uint64_t waitValue = 1;
+ static uint64_t signalValue = 2;
- cudaExternalSemaphoreWaitParams waitParams = {};
- waitParams.flags = 0;
- waitParams.params.fence.value = waitValue;
+ cudaExternalSemaphoreWaitParams waitParams = {};
+ waitParams.flags = 0;
+ waitParams.params.fence.value = waitValue;
- cudaExternalSemaphoreSignalParams signalParams = {};
- signalParams.flags = 0;
- signalParams.params.fence.value = signalValue;
- // Wait for vulkan to complete it's work
- checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaTimelineSemaphore,
- &waitParams, 1, m_stream));
- // Now step the simulation
- m_sim.stepSimulation(time, m_stream);
- // Signal vulkan to continue with the updated buffers
- checkCudaErrors(cudaSignalExternalSemaphoresAsync(
- &m_cudaTimelineSemaphore, &signalParams, 1, m_stream));
+ cudaExternalSemaphoreSignalParams signalParams = {};
+ signalParams.flags = 0;
+ signalParams.params.fence.value = signalValue;
+ // Wait for vulkan to complete its work
+ checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaTimelineSemaphore, &waitParams, 1, m_stream));
+ // Now step the simulation
+ m_sim.stepSimulation(time, m_stream);
+ // Signal vulkan to continue with the updated buffers
+ checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaTimelineSemaphore, &signalParams, 1, m_stream));
- waitValue += 2;
- signalValue += 2;
+ waitValue += 2;
+ signalValue += 2;
#else
- cudaExternalSemaphoreWaitParams waitParams = {};
- waitParams.flags = 0;
- waitParams.params.fence.value = 0;
+ cudaExternalSemaphoreWaitParams waitParams = {};
+ waitParams.flags = 0;
+ waitParams.params.fence.value = 0;
- cudaExternalSemaphoreSignalParams signalParams = {};
- signalParams.flags = 0;
- signalParams.params.fence.value = 0;
+ cudaExternalSemaphoreSignalParams signalParams = {};
+ signalParams.flags = 0;
+ signalParams.params.fence.value = 0;
- // Wait for vulkan to complete it's work
- checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore,
- &waitParams, 1, m_stream));
- // Now step the simulation
- m_sim.stepSimulation(time, m_stream);
- // Signal vulkan to continue with the updated buffers
- checkCudaErrors(cudaSignalExternalSemaphoresAsync(
- &m_cudaSignalSemaphore, &signalParams, 1, m_stream));
+ // Wait for vulkan to complete its work
+ checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream));
+ // Now step the simulation
+ m_sim.stepSimulation(time, m_stream);
+ // Signal vulkan to continue with the updated buffers
+ checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream));
#endif /* _VK_TIMELINE_SEMAPHORE */
- // Output a naive measurement of the frames per second every five seconds
- if (frame_time > 5) {
- std::cout << "Average FPS (over " << std::fixed << std::setprecision(2)
- << frame_time << " seconds): " << std::fixed
- << std::setprecision(2)
- << ((m_currentFrame - m_lastFrame) / frame_time) << std::endl;
- m_lastFrame = m_currentFrame;
- m_lastTime = currentTime;
+ // Output a naive measurement of the frames per second every five seconds
+ if (frame_time > 5) {
+ std::cout << "Average FPS (over " << std::fixed << std::setprecision(2) << frame_time
+ << " seconds): " << std::fixed << std::setprecision(2)
+ << ((m_currentFrame - m_lastFrame) / frame_time) << std::endl;
+ m_lastFrame = m_currentFrame;
+ m_lastTime = currentTime;
+ } } - } };
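For reference, the CUDA side of the per-frame handshake in drawFrame() reduces to the sketch below. This is an illustrative fragment, not code from the patch: the function name cudaFrameStep, the timelineSem handle, and the stepSim callback are hypothetical. It assumes the semaphore was imported with cudaImportExternalSemaphore and that checkCudaErrors comes from helper_cuda.h, as in the sample.

    #include <cstdint>
    #include <cuda_runtime.h>

    // One frame of the CUDA half of the timeline-semaphore handshake.
    void cudaFrameStep(cudaExternalSemaphore_t timelineSem,
                       cudaStream_t            stream,
                       uint64_t                waitValue,   // value Vulkan signals when rendering completes
                       uint64_t                signalValue, // value Vulkan waits on before reusing the buffer
                       void (*stepSim)(cudaStream_t))
    {
        cudaExternalSemaphoreWaitParams waitParams = {};
        waitParams.params.fence.value = waitValue;
        // Work queued after this call runs only once Vulkan signals waitValue.
        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&timelineSem, &waitParams, 1, stream));

        stepSim(stream); // launch the simulation kernel on the same stream

        cudaExternalSemaphoreSignalParams signalParams = {};
        signalParams.params.fence.value = signalValue;
        // Advance the timeline so Vulkan's wait on signalValue is released.
        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&timelineSem, &signalParams, 1, stream));
    }

Bumping both values by two each frame, as drawFrame() does, keeps the shared timeline strictly increasing across frames.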
-int main(int argc, char **argv) { - execution_path = argv[0]; - VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); - app.init(); - app.mainLoop(); - return 0; +int main(int argc, char **argv) +{ + execution_path = argv[0]; + VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); + app.init(); + app.mainLoop(); + return 0; } diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/Build_instructions.txt b/Samples/5_Domain_Specific/simpleVulkanMMAP/Build_instructions.txt index a6c2df23..13d4328b 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/Build_instructions.txt +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/Build_instructions.txt @@ -21,7 +21,7 @@ For Linux: For Linux aarch64(L4T): --- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 +-- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 -- install above will also provide libvulkan-dev as dependencies -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=', VULKAN_SDK_PATH in this scenario is typically "/usr" diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.cu b/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.cu index 7e176229..149be22e 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.cu +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.cu @@ -29,8 +29,9 @@ * See: https://www.piday.org/million/ */ -#include "MonteCarloPi.h" #include + +#include "MonteCarloPi.h" #define CUDA_DRIVER_API #include #include @@ -45,257 +46,245 @@ // ipcHandleTypeFlag variable is a convenience variable and is passed by value // to individual requests. #if defined(__linux__) -CUmemAllocationHandleType ipcHandleTypeFlag = - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #else CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32; #endif // Windows-specific LPSECURITYATTRIBUTES -void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) { +void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) +{ #if defined(__linux__) - return; + return; #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)"; - static OBJECT_ATTRIBUTES objAttributes; - static bool objAttributesConfigured = false; + static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)"; + static OBJECT_ATTRIBUTES objAttributes; + static bool objAttributesConfigured = false; - if (!objAttributesConfigured) { - PSECURITY_DESCRIPTOR secDesc; - BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA( - sddl, SDDL_REVISION_1, &secDesc, NULL); - if (result == 0) { - printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n", - GetLastError()); + if (!objAttributesConfigured) { + PSECURITY_DESCRIPTOR secDesc; + BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(sddl, SDDL_REVISION_1, &secDesc, NULL); + if (result == 0) { + printf("IPC failure: getDefaultSecurityDescriptor Failed! 
(%d)\n", GetLastError());
+ }
+
+ InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);
+
+ objAttributesConfigured = true; }
- InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);
-
- objAttributesConfigured = true;
- }
-
- prop->win32HandleMetaData = &objAttributes;
- return;
+ prop->win32HandleMetaData = &objAttributes;
+ return;
#endif }
-__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle,
- float *numPointsInCircle,
- unsigned int numPoints, float time) {
- const size_t stride = gridDim.x * blockDim.x;
- size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
- float count = 0.0f;
+__global__ void monte_carlo_kernel(vec2 *xyVector,
+ float *pointsInsideCircle,
+ float *numPointsInCircle,
+ unsigned int numPoints,
+ float time)
+{
+ const size_t stride = gridDim.x * blockDim.x;
+ size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+ float count = 0.0f;
- curandState rgnState;
- curand_init((unsigned long long)time, tid, 0, &rgnState);
+ curandState rgnState;
+ curand_init((unsigned long long)time, tid, 0, &rgnState);
- for (; tid < numPoints; tid += stride) {
- float x = curand_uniform(&rgnState);
- float y = curand_uniform(&rgnState);
- x = (2.0f * x) - 1.0f;
- y = (2.0f * y) - 1.0f;
- xyVector[tid][0] = x;
- xyVector[tid][1] = y;
+ for (; tid < numPoints; tid += stride) {
+ float x = curand_uniform(&rgnState);
+ float y = curand_uniform(&rgnState);
+ x = (2.0f * x) - 1.0f;
+ y = (2.0f * y) - 1.0f;
+ xyVector[tid][0] = x;
+ xyVector[tid][1] = y;
- // Compute the distance of this point form the center(0, 0)
- float dist = sqrtf((x * x) + (y * y));
+ // Compute the distance of this point from the center (0, 0)
+ float dist = sqrtf((x * x) + (y * y));
- // If distance is less than the radius of the unit circle, the point lies in
- // the circle.
- pointsInsideCircle[tid] = (dist <= 1.0f);
- count += (dist <= 1.0f);
- }
- atomicAdd(numPointsInCircle, count);
+ // If distance is less than the radius of the unit circle, the point lies in
+ // the circle.
+ pointsInsideCircle[tid] = (dist <= 1.0f);
+ count += (dist <= 1.0f);
+ }
+ atomicAdd(numPointsInCircle, count); }
MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points)
- : m_xyVector(nullptr),
- m_pointsInsideCircle(nullptr),
- m_totalPointsInsideCircle(0),
- m_totalPointsSimulated(0),
- m_numPoints(num_points) {}
-MonteCarloPiSimulation::~MonteCarloPiSimulation() {
- if (m_numPointsInCircle) {
- checkCudaErrors(cudaFree(m_numPointsInCircle));
- m_numPointsInCircle = nullptr;
- }
- if (m_hostNumPointsInCircle) {
- checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle));
- m_hostNumPointsInCircle = nullptr;
- }
-
- cleanupSimulationAllocations();
+ : m_xyVector(nullptr)
+ , m_pointsInsideCircle(nullptr)
+ , m_totalPointsInsideCircle(0)
+ , m_totalPointsSimulated(0)
+ , m_numPoints(num_points)
+{ }
-void MonteCarloPiSimulation::initSimulation(int cudaDevice,
- cudaStream_t stream) {
- m_cudaDevice = cudaDevice;
- getIdealExecutionConfiguration();
+MonteCarloPiSimulation::~MonteCarloPiSimulation()
+{
+ if (m_numPointsInCircle) {
+ checkCudaErrors(cudaFree(m_numPointsInCircle));
+ m_numPointsInCircle = nullptr;
+ }
+ if (m_hostNumPointsInCircle) {
+ checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle));
+ m_hostNumPointsInCircle = nullptr;
+ }
- // Allocate a position buffer that contains random location of the points in
- // XY cartesian plane.
- // Allocate a bitmap buffer which holds information of whether a point in the
- // position buffer is inside the unit circle or not.
- setupSimulationAllocations();
-
- checkCudaErrors(
- cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
- checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle,
- sizeof(*m_hostNumPointsInCircle)));
+ cleanupSimulationAllocations(); }
-void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) {
- checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0,
- sizeof(*m_numPointsInCircle), stream));
+void MonteCarloPiSimulation::initSimulation(int cudaDevice, cudaStream_t stream)
+{
+ m_cudaDevice = cudaDevice;
+ getIdealExecutionConfiguration();
- monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(
- m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
- getLastCudaError("Failed to launch CUDA simulation");
+ // Allocate a position buffer that contains random locations of the points in
+ // the XY cartesian plane.
+ // Allocate a bitmap buffer which holds information of whether a point in the
+ // position buffer is inside the unit circle or not.
+ setupSimulationAllocations();
- checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle,
- sizeof(*m_numPointsInCircle),
- cudaMemcpyDeviceToHost, stream));
-
- // Queue up a stream callback to compute and print the PI value.
- checkCudaErrors(
- cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
+ checkCudaErrors(cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
+ checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle, sizeof(*m_hostNumPointsInCircle))); }
-void MonteCarloPiSimulation::computePiCallback(void *args) {
- MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
- cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
- cbData->m_totalPointsSimulated += cbData->m_numPoints;
- double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle /
- (double)cbData->m_totalPointsSimulated);
- printf("Approximate Pi value for %zd data points: %lf \n",
- cbData->m_totalPointsSimulated, piValue);
+void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream)
+{
+ checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0, sizeof(*m_numPointsInCircle), stream));
+
+ monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(
+ m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
+ getLastCudaError("Failed to launch CUDA simulation");
+
+ checkCudaErrors(cudaMemcpyAsync(
+ m_hostNumPointsInCircle, m_numPointsInCircle, sizeof(*m_numPointsInCircle), cudaMemcpyDeviceToHost, stream));
+
+ // Queue up a stream callback to compute and print the PI value.
+ checkCudaErrors(cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this)); } -void MonteCarloPiSimulation::getIdealExecutionConfiguration() { - int warpSize = 0; - int multiProcessorCount = 0; - - checkCudaErrors(cudaSetDevice(m_cudaDevice)); - checkCudaErrors( - cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice)); - - // We don't need large block sizes, since there's not much inter-thread - // communication - m_threads = warpSize; - - // Use the occupancy calculator and fill the gpu as best as we can - checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &m_blocks, monte_carlo_kernel, warpSize, 0)); - - checkCudaErrors(cudaDeviceGetAttribute( - &multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice)); - m_blocks *= multiProcessorCount; - - // Go ahead and the clamp the blocks to the minimum needed for this - // height/width - m_blocks = - std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads)); +void MonteCarloPiSimulation::computePiCallback(void *args) +{ + MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args; + cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle); + cbData->m_totalPointsSimulated += cbData->m_numPoints; + double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle / (double)cbData->m_totalPointsSimulated); + printf("Approximate Pi value for %zd data points: %lf \n", cbData->m_totalPointsSimulated, piValue); } -void MonteCarloPiSimulation::setupSimulationAllocations() { - CUdeviceptr d_ptr = 0U; - size_t granularity = 0; - CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle; +void MonteCarloPiSimulation::getIdealExecutionConfiguration() +{ + int warpSize = 0; + int multiProcessorCount = 0; - CUmemAllocationProp allocProp = {}; - allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; - allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - allocProp.location.id = m_cudaDevice; - allocProp.win32HandleMetaData = NULL; - allocProp.requestedHandleTypes = ipcHandleTypeFlag; + checkCudaErrors(cudaSetDevice(m_cudaDevice)); + checkCudaErrors(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice)); - // Windows-specific LPSECURITYATTRIBUTES is required when - // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope - // of which exported allocations may be tranferred to other processes. For all - // other handle types, pass NULL. - getDefaultSecurityDescriptor(&allocProp); + // We don't need large block sizes, since there's not much inter-thread + // communication + m_threads = warpSize; - // Get the recommended granularity for m_cudaDevice. 
- checkCudaErrors(cuMemGetAllocationGranularity( - &granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + // Use the occupancy calculator and fill the gpu as best as we can + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, monte_carlo_kernel, warpSize, 0)); - size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector); - size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle); + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice)); + m_blocks *= multiProcessorCount; - size_t xyPositionSize = - ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity); - size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity); - m_totalAllocationSize = (xyPositionSize + inCircleSize); - - // Reserve the required contiguous VA space for the allocations - checkCudaErrors( - cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0)); - - // Create the allocations as a pinned allocation on this device. - // Create an allocation to store all the positions of points on the xy plane - // and a second allocation which stores information if the corresponding - // position is inside the unit circle or not. - checkCudaErrors( - cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0)); - checkCudaErrors( - cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0)); - - // Export the allocation to a platform-specific handle. The type of handle - // requested here must match the requestedHandleTypes field in the prop - // structure passed to cuMemCreate. The handle obtained here will be passed to - // vulkan to import the allocation. - checkCudaErrors(cuMemExportToShareableHandle( - (void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0)); - checkCudaErrors( - cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, - cudaInCircleHandle, ipcHandleTypeFlag, 0)); - - CUdeviceptr va_position = d_ptr; - CUdeviceptr va_InCircle = va_position + xyPositionSize; - m_pointsInsideCircle = (float *)va_InCircle; - m_xyVector = (vec2 *)va_position; - - // Assign the chunk to the appropriate VA range - checkCudaErrors( - cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0)); - checkCudaErrors( - cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0)); - - // Release the handles for the allocation. Since the allocation is currently - // mapped to a VA range with a previous call to cuMemMap the actual freeing of - // memory allocation will happen on an eventual call to cuMemUnmap. Thus the - // allocation will be kept live until it is unmapped. - checkCudaErrors(cuMemRelease(cudaPositionHandle)); - checkCudaErrors(cuMemRelease(cudaInCircleHandle)); - - CUmemAccessDesc accessDescriptor = {}; - accessDescriptor.location.id = m_cudaDevice; - accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - - // Apply the access descriptor to the whole VA range. Essentially enables - // Read-Write access to the range. 
- checkCudaErrors(
- cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
+ // Go ahead and clamp the blocks to the minimum needed for this
+ // height/width
+ m_blocks = std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads)); }
-void MonteCarloPiSimulation::cleanupSimulationAllocations() {
- if (m_xyVector && m_pointsInsideCircle) {
- // Unmap the mapped virtual memory region
- // Since the handles to the mapped backing stores have already been released
- // by cuMemRelease, and these are the only/last mappings referencing them,
- // The backing stores will be freed.
- checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));
- checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
- checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
-
- // Free the virtual address region.
+void MonteCarloPiSimulation::setupSimulationAllocations()
+{
+ CUdeviceptr d_ptr = 0U;
+ size_t granularity = 0;
+ CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;
+
+ CUmemAllocationProp allocProp = {};
+ allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+ allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ allocProp.location.id = m_cudaDevice;
+ allocProp.win32HandleMetaData = NULL;
+ allocProp.requestedHandleTypes = ipcHandleTypeFlag;
+
+ // Windows-specific LPSECURITYATTRIBUTES is required when
+ // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope
+ // of which exported allocations may be transferred to other processes. For all
+ // other handle types, pass NULL.
+ getDefaultSecurityDescriptor(&allocProp);
+
+ // Get the recommended granularity for m_cudaDevice.
+ checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+
+ size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
+ size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);
+
+ size_t xyPositionSize = ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
+ size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
+ m_totalAllocationSize = (xyPositionSize + inCircleSize);
+
+ // Reserve the required contiguous VA space for the allocations
+ checkCudaErrors(cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
+
+ // Create the allocations as a pinned allocation on this device.
+ // Create an allocation to store all the positions of points on the xy plane
+ // and a second allocation which stores information if the corresponding
+ // position is inside the unit circle or not.
+ checkCudaErrors(cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
+ checkCudaErrors(cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
+
+ // Export the allocation to a platform-specific handle. The type of handle
+ // requested here must match the requestedHandleTypes field in the prop
+ // structure passed to cuMemCreate. The handle obtained here will be passed to
+ // vulkan to import the allocation.
checkCudaErrors(
- cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
+ cuMemExportToShareableHandle((void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
+ checkCudaErrors(
+ cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, cudaInCircleHandle, ipcHandleTypeFlag, 0));
- m_xyVector = nullptr;
- m_pointsInsideCircle = nullptr;
- }
+ CUdeviceptr va_position = d_ptr;
+ CUdeviceptr va_InCircle = va_position + xyPositionSize;
+ m_pointsInsideCircle = (float *)va_InCircle;
+ m_xyVector = (vec2 *)va_position;
+
+ // Assign the chunk to the appropriate VA range
+ checkCudaErrors(cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
+ checkCudaErrors(cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
+
+ // Release the handles for the allocation. Since the allocation is currently
+ // mapped to a VA range with a previous call to cuMemMap, the actual freeing of
+ // the memory allocation will happen on an eventual call to cuMemUnmap. Thus the
+ // allocation will be kept live until it is unmapped.
+ checkCudaErrors(cuMemRelease(cudaPositionHandle));
+ checkCudaErrors(cuMemRelease(cudaInCircleHandle));
+
+ CUmemAccessDesc accessDescriptor = {};
+ accessDescriptor.location.id = m_cudaDevice;
+ accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+ // Apply the access descriptor to the whole VA range. Essentially enables
+ // Read-Write access to the range.
+ checkCudaErrors(cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
+}
+
+void MonteCarloPiSimulation::cleanupSimulationAllocations()
+{
+ if (m_xyVector && m_pointsInsideCircle) {
+ // Unmap the mapped virtual memory region
+ // Since the handles to the mapped backing stores have already been released
+ // by cuMemRelease, and these are the only/last mappings referencing them,
+ // the backing stores will be freed.
+ checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));
+
+ checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
+ checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
+
+ // Free the virtual address region.
+ checkCudaErrors(cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
+
+ m_xyVector = nullptr;
+ m_pointsInsideCircle = nullptr;
+ } }
diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.h b/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.h
index 0f1c6322..476a651c 100644
--- a/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.h
+++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/MonteCarloPi.h
@@ -29,67 +29,66 @@ #ifndef __PISIM_H__ #define __PISIM_H__
-#include
+#include
#include #include #include
-#include
+#include
#include "helper_multiprocess.h"
typedef float vec2[2];
-class MonteCarloPiSimulation {
- size_t m_numPoints;
+class MonteCarloPiSimulation
+{
+ size_t m_numPoints;
- // Pointers to Cuda allocated buffers which are imported and used by vulkan as
- // vertex buffer
- vec2 *m_xyVector;
- float *m_pointsInsideCircle;
+ // Pointers to cuda-allocated buffers which are imported and used by vulkan as
+ // vertex buffers
+ vec2 *m_xyVector;
+ float *m_pointsInsideCircle;
- // Pointers to device and host allocated memories storing number of points
- // that are inside the unit circle
- float *m_numPointsInCircle;
- float *m_hostNumPointsInCircle;
+ // Pointers to device and host allocated memories storing number of points
+ // that are inside the unit circle
+ float *m_numPointsInCircle;
+ float *m_hostNumPointsInCircle;
- int m_blocks, m_threads;
+ int m_blocks, m_threads;
- // Total size of allocations created by cuMemMap Apis. This size is the sum of
- // sizes of m_xyVector and m_pointsInsideCircle buffers.
- size_t m_totalAllocationSize;
+ // Total size of allocations created by the cuMemMap APIs. This size is the sum of
+ // sizes of the m_xyVector and m_pointsInsideCircle buffers.
+ size_t m_totalAllocationSize;
- // Shareable Handles(a file descriptor on Linux and NT Handle on Windows),
- // used for sharing cuda
- // allocated memory with Vulkan
- ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
+ // Shareable handles (a file descriptor on Linux and an NT handle on Windows),
+ // used for sharing cuda-allocated memory with Vulkan
+ ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
- // Cuda Device corresponding to the Vulkan Physical device
- int m_cudaDevice;
+ // Cuda Device corresponding to the Vulkan Physical device
+ int m_cudaDevice;
- // Track and accumulate total points that have been simulated since start of
- // the sample. The idea is to get a closer approximation to PI with time.
+ size_t m_totalPointsInsideCircle;
+ size_t m_totalPointsSimulated;
- void setupSimulationAllocations();
- void cleanupSimulationAllocations();
- void getIdealExecutionConfiguration();
+ void setupSimulationAllocations();
+ void cleanupSimulationAllocations();
+ void getIdealExecutionConfiguration();
- public:
- MonteCarloPiSimulation(size_t num_points);
- ~MonteCarloPiSimulation();
- void initSimulation(int cudaDevice, cudaStream_t stream = 0);
- void stepSimulation(float time, cudaStream_t stream = 0);
- static void computePiCallback(void *args);
+public:
+ MonteCarloPiSimulation(size_t num_points);
+ ~MonteCarloPiSimulation();
+ void initSimulation(int cudaDevice, cudaStream_t stream = 0);
+ void stepSimulation(float time, cudaStream_t stream = 0);
+ static void computePiCallback(void *args);
- size_t getNumPoints() const { return m_numPoints; }
+ size_t getNumPoints() const { return m_numPoints; }
- float getNumPointsInCircle() const { return *m_hostNumPointsInCircle; }
+ float getNumPointsInCircle() const { return *m_hostNumPointsInCircle; }
- ShareableHandle &getPositionShareableHandle() { return m_posShareableHandle; }
- ShareableHandle &getInCircleShareableHandle() {
- return m_inCircleShareableHandle;
- }
+ ShareableHandle &getPositionShareableHandle() { return m_posShareableHandle; }
+ ShareableHandle &getInCircleShareableHandle() { return m_inCircleShareableHandle; } };
-#endif // __PISIM_H__
+#endif // __PISIM_H__
diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md b/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md
index 9603b06e..3ac927f9 100644
--- a/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md
+++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md
@@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) -
diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp
index 23151947..295943c7 100644
--- a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp
+++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.cpp
@@ -29,7 +29,9 @@ * This file contains basic cross-platform setup for working with Vulkan * and a rendering window.
It is largely based on the tutorials provided here: * https://vulkan-tutorial.com/
-*/
+ */
+
+#include "VulkanBaseApp.h"
#include #include
@@ -40,8 +42,6 @@
#include #include
-
-#include "VulkanBaseApp.h"
#include "VulkanCudaInterop.h"
#define GLFW_INCLUDE_VULKAN
@@ -50,1794 +50,1745 @@
#ifdef _WIN64
#include
-#include
#include
+#include
#endif /* _WIN64 */
#ifndef countof #define countof(x) (sizeof(x) / sizeof(*(x))) #endif
-static const char *validationLayers[] = {"VK_LAYER_KHRONOS_validation"};
+static const char *validationLayers[] = {"VK_LAYER_KHRONOS_validation"};
static const size_t MAX_FRAMES_IN_FLIGHT = 5;
-void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height) {
- VulkanBaseApp *app =
- reinterpret_cast<VulkanBaseApp *>(glfwGetWindowUserPointer(window));
- app->m_framebufferResized = true;
+void VulkanBaseApp::resizeCallback(GLFWwindow *window, int width, int height)
+{
+ VulkanBaseApp *app = reinterpret_cast<VulkanBaseApp *>(glfwGetWindowUserPointer(window));
+ app->m_framebufferResized = true; }
-static VKAPI_ATTR VkBool32 VKAPI_CALL
-debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
- VkDebugUtilsMessageTypeFlagsEXT messageType,
- const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData,
- void *pUserData) {
- std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl;
+static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
+ VkDebugUtilsMessageTypeFlagsEXT messageType,
+ const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData,
+ void *pUserData)
+{
+ std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl;
- return VK_FALSE;
+ return VK_FALSE; }
VulkanBaseApp::VulkanBaseApp(const std::string &appName, bool enableValidation)
- : m_appName(appName),
- m_enableValidation(enableValidation),
- m_instance(VK_NULL_HANDLE),
- m_window(nullptr),
- m_debugMessenger(VK_NULL_HANDLE),
- m_surface(VK_NULL_HANDLE),
- m_physicalDevice(VK_NULL_HANDLE),
- m_device(VK_NULL_HANDLE),
- m_graphicsQueue(VK_NULL_HANDLE),
- m_presentQueue(VK_NULL_HANDLE),
- m_swapChain(VK_NULL_HANDLE),
- m_swapChainImages(),
- m_swapChainFormat(),
- m_swapChainExtent(),
- m_swapChainImageViews(),
- m_shaderFiles(),
- m_renderPass(),
- m_pipelineLayout(VK_NULL_HANDLE),
- m_graphicsPipeline(VK_NULL_HANDLE),
- m_swapChainFramebuffers(),
- m_commandPool(VK_NULL_HANDLE),
- m_commandBuffers(),
- m_imageAvailableSemaphores(),
- m_renderFinishedSemaphores(),
- m_inFlightFences(),
- m_uniformBuffers(),
- m_uniformMemory(),
- m_descriptorSetLayout(VK_NULL_HANDLE),
- m_descriptorPool(VK_NULL_HANDLE),
- m_descriptorSets(),
- m_depthImage(VK_NULL_HANDLE),
- m_depthImageMemory(VK_NULL_HANDLE),
- m_depthImageView(VK_NULL_HANDLE),
- m_currentFrame(0),
- m_framebufferResized(false) {}
+ : m_appName(appName)
+ , m_enableValidation(enableValidation)
+ , m_instance(VK_NULL_HANDLE)
+ , m_window(nullptr)
+ , m_debugMessenger(VK_NULL_HANDLE)
+ , m_surface(VK_NULL_HANDLE)
+ , m_physicalDevice(VK_NULL_HANDLE)
+ , m_device(VK_NULL_HANDLE)
+ , m_graphicsQueue(VK_NULL_HANDLE)
+ , m_presentQueue(VK_NULL_HANDLE)
+ , m_swapChain(VK_NULL_HANDLE)
+ , m_swapChainImages()
+ , m_swapChainFormat()
+ , m_swapChainExtent()
+ , m_swapChainImageViews()
+ , m_shaderFiles()
+ , m_renderPass()
+ , m_pipelineLayout(VK_NULL_HANDLE)
+ , m_graphicsPipeline(VK_NULL_HANDLE)
+ , m_swapChainFramebuffers()
+ , m_commandPool(VK_NULL_HANDLE)
+ , m_commandBuffers()
+ , m_imageAvailableSemaphores()
+ , m_renderFinishedSemaphores()
+ , m_inFlightFences()
+ , m_uniformBuffers()
+ , m_uniformMemory() + , m_descriptorSetLayout(VK_NULL_HANDLE) + , m_descriptorPool(VK_NULL_HANDLE) + , m_descriptorSets() + , m_depthImage(VK_NULL_HANDLE) + , m_depthImageMemory(VK_NULL_HANDLE) + , m_depthImageView(VK_NULL_HANDLE) + , m_currentFrame(0) + , m_framebufferResized(false) +{ +} -VkExternalSemaphoreHandleTypeFlagBits -VulkanBaseApp::getDefaultSemaphoreHandleType() { +VkExternalSemaphoreHandleTypeFlagBits VulkanBaseApp::getDefaultSemaphoreHandleType() +{ #ifdef _WIN64 - return IsWindows8OrGreater() - ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; #endif } -VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() { +VkExternalMemoryHandleTypeFlagBits VulkanBaseApp::getDefaultMemHandleType() +{ #ifdef _WIN64 - return IsWindows8Point1OrGreater() - ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; + return IsWindows8Point1OrGreater() ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; #else - return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; #endif } -VulkanBaseApp::~VulkanBaseApp() { - cleanupSwapChain(); +VulkanBaseApp::~VulkanBaseApp() +{ + cleanupSwapChain(); - if (m_descriptorSetLayout != VK_NULL_HANDLE) { - vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); - } - - for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { - vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); - vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); - vkDestroyFence(m_device, m_inFlightFences[i], nullptr); - } - if (m_commandPool != VK_NULL_HANDLE) { - vkDestroyCommandPool(m_device, m_commandPool, nullptr); - } - - if (m_device != VK_NULL_HANDLE) { - vkDestroyDevice(m_device, nullptr); - } - - if (m_enableValidation) { - PFN_vkDestroyDebugUtilsMessengerEXT func = - (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr( - m_instance, "vkDestroyDebugUtilsMessengerEXT"); - if (func != nullptr) { - func(m_instance, m_debugMessenger, nullptr); + if (m_descriptorSetLayout != VK_NULL_HANDLE) { + vkDestroyDescriptorSetLayout(m_device, m_descriptorSetLayout, nullptr); } - } - if (m_surface != VK_NULL_HANDLE) { - vkDestroySurfaceKHR(m_instance, m_surface, nullptr); - } + for (size_t i = 0; i < m_renderFinishedSemaphores.size(); i++) { + vkDestroySemaphore(m_device, m_renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(m_device, m_imageAvailableSemaphores[i], nullptr); + vkDestroyFence(m_device, m_inFlightFences[i], nullptr); + } + if (m_commandPool != VK_NULL_HANDLE) { + vkDestroyCommandPool(m_device, m_commandPool, nullptr); + } - if (m_instance != VK_NULL_HANDLE) { - vkDestroyInstance(m_instance, nullptr); - } + if (m_device != VK_NULL_HANDLE) { + vkDestroyDevice(m_device, nullptr); + } - if (m_window) { - glfwDestroyWindow(m_window); - } + if (m_enableValidation) { + PFN_vkDestroyDebugUtilsMessengerEXT func = + (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func != nullptr) { + func(m_instance, m_debugMessenger, nullptr); + } + } - glfwTerminate(); + if 
(m_surface != VK_NULL_HANDLE) {
+ vkDestroySurfaceKHR(m_instance, m_surface, nullptr);
+ }
+
+ if (m_instance != VK_NULL_HANDLE) {
+ vkDestroyInstance(m_instance, nullptr);
+ }
+
+ if (m_window) {
+ glfwDestroyWindow(m_window);
+ }
+
+ glfwTerminate(); }
-void VulkanBaseApp::init() {
- initWindow();
- initVulkan();
+void VulkanBaseApp::init()
+{
+ initWindow();
+ initVulkan(); }
-VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands() {
- VkCommandBufferAllocateInfo allocInfo = {};
- allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
- allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
- allocInfo.commandPool = m_commandPool;
- allocInfo.commandBufferCount = 1;
+VkCommandBuffer VulkanBaseApp::beginSingleTimeCommands()
+{
+ VkCommandBufferAllocateInfo allocInfo = {};
+ allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+ allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+ allocInfo.commandPool = m_commandPool;
+ allocInfo.commandBufferCount = 1;
- VkCommandBuffer commandBuffer;
- vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer);
+ VkCommandBuffer commandBuffer;
+ vkAllocateCommandBuffers(m_device, &allocInfo, &commandBuffer);
- VkCommandBufferBeginInfo beginInfo = {};
- beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
- beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+ VkCommandBufferBeginInfo beginInfo = {};
+ beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+ beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
- vkBeginCommandBuffer(commandBuffer, &beginInfo);
+ vkBeginCommandBuffer(commandBuffer, &beginInfo);
- return commandBuffer;
+ return commandBuffer; }
-void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer) {
- vkEndCommandBuffer(commandBuffer);
+void VulkanBaseApp::endSingleTimeCommands(VkCommandBuffer commandBuffer)
+{
+ vkEndCommandBuffer(commandBuffer);
- VkSubmitInfo submitInfo = {};
- submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
- submitInfo.commandBufferCount = 1;
- submitInfo.pCommandBuffers = &commandBuffer;
+ VkSubmitInfo submitInfo = {};
+ submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+ submitInfo.commandBufferCount = 1;
+ submitInfo.pCommandBuffers = &commandBuffer;
- vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE);
- vkQueueWaitIdle(m_graphicsQueue);
+ vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE);
+ vkQueueWaitIdle(m_graphicsQueue);
- vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer);
+ vkFreeCommandBuffers(m_device, m_commandPool, 1, &commandBuffer); }
-void VulkanBaseApp::initWindow() {
- glfwInit();
+void VulkanBaseApp::initWindow()
+{
+ glfwInit();
- glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
- glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE);
+ glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
+ glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE);
- m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr);
- glfwSetWindowUserPointer(m_window, this);
- glfwSetFramebufferSizeCallback(m_window, resizeCallback);
+ m_window = glfwCreateWindow(1280, 800, m_appName.c_str(), nullptr, nullptr);
+ glfwSetWindowUserPointer(m_window, this);
+ glfwSetFramebufferSizeCallback(m_window, resizeCallback); }
-std::vector<const char *> VulkanBaseApp::getRequiredExtensions() const {
- return std::vector<const char *>();
-}
+std::vector<const char *> VulkanBaseApp::getRequiredExtensions() const { return std::vector<const char *>(); }
-std::vector<const char *>
VulkanBaseApp::getRequiredDeviceExtensions() const { return std::vector(); } -void VulkanBaseApp::initVulkan() { - createInstance(); - createSurface(); - createDevice(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createDescriptorSetLayout(); - createGraphicsPipeline(); - createCommandPool(); - createDepthResources(); - createFramebuffers(); - initVulkanApp(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); - createSyncObjects(); +void VulkanBaseApp::initVulkan() +{ + createInstance(); + createSurface(); + createDevice(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createCommandPool(); + createDepthResources(); + createFramebuffers(); + initVulkanApp(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); } #ifdef _WIN64 -class WindowsSecurityAttributes { - protected: - SECURITY_ATTRIBUTES m_winSecurityAttributes; - PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; +class WindowsSecurityAttributes +{ +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; - public: - WindowsSecurityAttributes(); - SECURITY_ATTRIBUTES *operator&(); - ~WindowsSecurityAttributes(); +public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); }; -WindowsSecurityAttributes::WindowsSecurityAttributes() { - m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( - 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); - if (!m_winPSecurityDescriptor) { - throw std::runtime_error( - "Failed to allocate memory for security descriptor"); - } +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + if (!m_winPSecurityDescriptor) { + throw std::runtime_error("Failed to allocate memory for security descriptor"); + } - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + - SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - InitializeSecurityDescriptor(m_winPSecurityDescriptor, - SECURITY_DESCRIPTOR_REVISION); + InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = - SECURITY_WORLD_SID_AUTHORITY; - AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, - 0, 0, 0, 0, 0, ppSID); + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = - STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + 
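+    // The access entry built here names the world SID (Everyone) as trustee with
+    // full rights, so a handle exported under these security attributes can be
+    // opened by another process; bInheritHandle is likewise set TRUE below.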
explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; } -SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { - return &m_winSecurityAttributes; -} +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { return &m_winSecurityAttributes; } -WindowsSecurityAttributes::~WindowsSecurityAttributes() { - PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + - SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); } #endif /* _WIN64 */ -static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, +static VkFormat findSupportedFormat(VkPhysicalDevice physicalDevice, const std::vector &candidates, - VkImageTiling tiling, - VkFormatFeatureFlags features) { - for (VkFormat format : candidates) { - VkFormatProperties props; - vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); - if (tiling == VK_IMAGE_TILING_LINEAR && - (props.linearTilingFeatures & features) == features) { - return format; - } else if (tiling == VK_IMAGE_TILING_OPTIMAL && - (props.optimalTilingFeatures & features) == features) { - return format; + VkImageTiling tiling, + VkFormatFeatureFlags features) +{ + for (VkFormat format : candidates) { + VkFormatProperties props; + vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &props); + if (tiling == VK_IMAGE_TILING_LINEAR && (props.linearTilingFeatures & features) == features) { + return format; + } + else if (tiling == VK_IMAGE_TILING_OPTIMAL && (props.optimalTilingFeatures & features) == features) { + return format; + } } - } - throw std::runtime_error("Failed to find supported format!"); + throw std::runtime_error("Failed to find supported format!"); } -static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, - uint32_t typeFilter, - VkMemoryPropertyFlags properties) { - VkPhysicalDeviceMemoryProperties memProperties; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); - for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { - if (typeFilter & (1 << i) && - (memProperties.memoryTypes[i].propertyFlags & properties) == - properties) { - return i; +static uint32_t findMemoryType(VkPhysicalDevice physicalDevice, uint32_t typeFilter, VkMemoryPropertyFlags properties) +{ + 
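+    // typeFilter is the memoryTypeBits mask from VkMemoryRequirements: bit i set
+    // means memory type i is usable. Return the first usable type that also has
+    // all requested property flags, or ~0 when none matches.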
+    VkPhysicalDeviceMemoryProperties memProperties;
+    vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);
+    for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
+        if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
+            return i;
+        }
     }
-  }
-  return ~0;
+    return ~0;
 }

-static bool supportsValidationLayers() {
-  std::vector<VkLayerProperties> availableLayers;
-  uint32_t layerCount;
+static bool supportsValidationLayers()
+{
+    std::vector<VkLayerProperties> availableLayers;
+    uint32_t layerCount;

-  vkEnumerateInstanceLayerProperties(&layerCount, nullptr);
-  availableLayers.resize(layerCount);
-  vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data());
+    vkEnumerateInstanceLayerProperties(&layerCount, nullptr);
+    availableLayers.resize(layerCount);
+    vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data());

-  for (const char *layerName : validationLayers) {
-    bool layerFound = false;
+    for (const char *layerName : validationLayers) {
+        bool layerFound = false;

-    for (const auto &layerProperties : availableLayers) {
-      if (strcmp(layerName, layerProperties.layerName) == 0) {
-        layerFound = true;
-        break;
-      }
+        for (const auto &layerProperties : availableLayers) {
+            if (strcmp(layerName, layerProperties.layerName) == 0) {
+                layerFound = true;
+                break;
+            }
+        }
+
+        if (!layerFound) {
+            return false;
+        }
     }

-    if (!layerFound) {
-      return false;
-    }
-  }
-
-  return true;
+    return true;
 }

-void VulkanBaseApp::createInstance() {
-  if (m_enableValidation && !supportsValidationLayers()) {
-    throw std::runtime_error("Validation requested, but not supported!");
-  }
+void VulkanBaseApp::createInstance()
+{
+    if (m_enableValidation && !supportsValidationLayers()) {
+        throw std::runtime_error("Validation requested, but not supported!");
+    }

-  VkApplicationInfo appInfo = {};
-  appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
-  appInfo.pApplicationName = m_appName.c_str();
-  appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
-  appInfo.pEngineName = "No Engine";
-  appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
-  appInfo.apiVersion = VK_API_VERSION_1_0;
+    VkApplicationInfo appInfo = {};
+    appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+    appInfo.pApplicationName = m_appName.c_str();
+    appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
+    appInfo.pEngineName = "No Engine";
+    appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
+    appInfo.apiVersion = VK_API_VERSION_1_0;

-  VkInstanceCreateInfo createInfo = {};
-  createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
-  createInfo.pApplicationInfo = &appInfo;
+    VkInstanceCreateInfo createInfo = {};
+    createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+    createInfo.pApplicationInfo = &appInfo;

-  std::vector<const char *> exts = getRequiredExtensions();
+    std::vector<const char *> exts = getRequiredExtensions();

-  {
-    uint32_t glfwExtensionCount = 0;
-    const char **glfwExtensions;
+    {
+        uint32_t glfwExtensionCount = 0;
+        const char **glfwExtensions;

-    glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);
+        glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);

-    exts.insert(exts.begin(), glfwExtensions,
-                glfwExtensions + glfwExtensionCount);
+        exts.insert(exts.begin(), glfwExtensions, glfwExtensions + glfwExtensionCount);
+
+        if (m_enableValidation) {
+            exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
+        }
+    }
+
+    createInfo.enabledExtensionCount = static_cast<uint32_t>(exts.size());
+    createInfo.ppEnabledExtensionNames = exts.data();
+    VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {};
+    if (m_enableValidation) {
+        createInfo.enabledLayerCount = static_cast<uint32_t>(countof(validationLayers));
+        createInfo.ppEnabledLayerNames = validationLayers;
+
+        debugCreateInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
+        debugCreateInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT
+                                        | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT
+                                        | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
+        debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT
+                                    | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT
+                                    | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
+        debugCreateInfo.pfnUserCallback = debugCallback;
+
+        createInfo.pNext = &debugCreateInfo;
+    }
+    else {
+        createInfo.enabledLayerCount = 0;
+        createInfo.pNext = nullptr;
+    }
+
+    if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) {
+        throw std::runtime_error("Failed to create Vulkan instance!");
+    }

     if (m_enableValidation) {
-    exts.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
+        PFN_vkCreateDebugUtilsMessengerEXT func =
+            (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(m_instance, "vkCreateDebugUtilsMessengerEXT");
+        if (func == nullptr || func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) != VK_SUCCESS) {
+            throw std::runtime_error("Failed to set up debug messenger!");
+        }
     }
-  }
-
-  createInfo.enabledExtensionCount = static_cast<uint32_t>(exts.size());
-  createInfo.ppEnabledExtensionNames = exts.data();
-  VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo = {};
-  if (m_enableValidation) {
-    createInfo.enabledLayerCount =
-        static_cast<uint32_t>(countof(validationLayers));
-    createInfo.ppEnabledLayerNames = validationLayers;
-
-    debugCreateInfo.sType =
-        VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
-    debugCreateInfo.messageSeverity =
-        VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
-        VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
-        VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
-    debugCreateInfo.messageType =
-        VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
-        VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
-        VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
-    debugCreateInfo.pfnUserCallback = debugCallback;
-
-    createInfo.pNext = &debugCreateInfo;
-  } else {
-    createInfo.enabledLayerCount = 0;
-    createInfo.pNext = nullptr;
-  }
-
-  if (vkCreateInstance(&createInfo, nullptr, &m_instance) != VK_SUCCESS) {
-    throw std::runtime_error("Failed to create Vulkan instance!");
-  }
-
-  if (m_enableValidation) {
-    PFN_vkCreateDebugUtilsMessengerEXT func =
-        (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
-            m_instance, "vkCreateDebugUtilsMessengerEXT");
-    if (func == nullptr ||
-        func(m_instance, &debugCreateInfo, nullptr, &m_debugMessenger) !=
-            VK_SUCCESS) {
-      throw std::runtime_error("Failed to set up debug messenger!");
-    }
-  }
 }

-void VulkanBaseApp::createSurface() {
-  if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) !=
-      VK_SUCCESS) {
-    throw std::runtime_error("failed to create window surface!");
-  }
+void VulkanBaseApp::createSurface()
+{
+    if (glfwCreateWindowSurface(m_instance, m_window, nullptr, &m_surface) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create window surface!");
+    }
 }

 static bool findGraphicsQueueIndicies(VkPhysicalDevice device,
-                                      VkSurfaceKHR surface,
-                                      uint32_t &graphicsFamily,
-                                      uint32_t &presentFamily) {
-  uint32_t queueFamilyCount = 0;
+                                      VkSurfaceKHR surface,
+                                      uint32_t &graphicsFamily,
+                                      uint32_t &presentFamily)
+{
+    uint32_t queueFamilyCount = 0;
-  vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr);
+    vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr);

-  std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
-  vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
-                                           queueFamilies.data());
+    std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
+    vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data());

-  graphicsFamily = presentFamily = ~0;
+    graphicsFamily = presentFamily = ~0;

-  for (uint32_t i = 0; i < queueFamilyCount; i++) {
-    if (queueFamilies[i].queueCount > 0) {
-      if (graphicsFamily == ~0 &&
-          queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
-        graphicsFamily = i;
-      }
-      uint32_t presentSupport = 0;
-      vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport);
-      if (presentFamily == ~0 && presentSupport) {
-        presentFamily = i;
-      }
-      if (presentFamily != ~0 && graphicsFamily != ~0) {
-        break;
-      }
+    for (uint32_t i = 0; i < queueFamilyCount; i++) {
+        if (queueFamilies[i].queueCount > 0) {
+            if (graphicsFamily == ~0 && queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
+                graphicsFamily = i;
+            }
+            uint32_t presentSupport = 0;
+            vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport);
+            if (presentFamily == ~0 && presentSupport) {
+                presentFamily = i;
+            }
+            if (presentFamily != ~0 && graphicsFamily != ~0) {
+                break;
+            }
+        }
     }
-  }
-  return graphicsFamily != ~0 && presentFamily != ~0;
+    return graphicsFamily != ~0 && presentFamily != ~0;
 }

-static bool hasAllExtensions(
-    VkPhysicalDevice device,
-    const std::vector<const char *> &deviceExtensions) {
-  uint32_t extensionCount;
-  vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
-                                       nullptr);
-  std::vector<VkExtensionProperties> availableExtensions(extensionCount);
-  vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
-                                       availableExtensions.data());
+static bool hasAllExtensions(VkPhysicalDevice device, const std::vector<const char *> &deviceExtensions)
+{
+    uint32_t extensionCount;
+    vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr);
+    std::vector<VkExtensionProperties> availableExtensions(extensionCount);
+    vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data());

-  std::set<std::string> requiredExtensions(deviceExtensions.begin(),
-                                           deviceExtensions.end());
+    std::set<std::string> requiredExtensions(deviceExtensions.begin(), deviceExtensions.end());

-  for (const auto &extension : availableExtensions) {
-    requiredExtensions.erase(extension.extensionName);
-  }
-
-  return requiredExtensions.empty();
-}
-
-static void getSwapChainProperties(
-    VkPhysicalDevice device, VkSurfaceKHR surface,
-    VkSurfaceCapabilitiesKHR &capabilities,
-    std::vector<VkSurfaceFormatKHR> &formats,
-    std::vector<VkPresentModeKHR> &presentModes) {
-  vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities);
-  uint32_t formatCount;
-  vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr);
-  if (formatCount != 0) {
-    formats.resize(formatCount);
-    vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount,
-                                         formats.data());
-  }
-  uint32_t presentModeCount;
-  vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount,
-                                            nullptr);
-  if (presentModeCount != 0) {
-    presentModes.resize(presentModeCount);
-    vkGetPhysicalDeviceSurfacePresentModesKHR(
-        device, surface, &presentModeCount, presentModes.data());
-  }
-}
-
-bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const {
-  bool isSuitable = false;
-  uint32_t graphicsQueueIndex, presentQueueIndex;
-
std::vector deviceExtensions = getRequiredDeviceExtensions(); - VkSurfaceCapabilitiesKHR caps; - std::vector formats; - std::vector presentModes; - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - getSwapChainProperties(dev, m_surface, caps, formats, presentModes); - - VkPhysicalDeviceIDPropertiesKHR vkPhysicalDeviceIDPropertiesKHR = {}; - vkPhysicalDeviceIDPropertiesKHR.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR; - vkPhysicalDeviceIDPropertiesKHR.pNext = NULL; - - VkPhysicalDeviceProperties2KHR vkPhysicalDeviceProperties2KHR = {}; - vkPhysicalDeviceProperties2KHR.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; - vkPhysicalDeviceProperties2KHR.pNext = &vkPhysicalDeviceIDPropertiesKHR; - - vkGetPhysicalDeviceProperties2(dev, &vkPhysicalDeviceProperties2KHR); - - isSuitable = hasAllExtensions(dev, deviceExtensions) && - isDeviceCompatible(vkPhysicalDeviceIDPropertiesKHR.deviceUUID, - (size_t)VK_UUID_SIZE) && - !formats.empty() && !presentModes.empty() && - findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, - presentQueueIndex); - - if (isSuitable) { - memcpy((void *)m_deviceUUID, vkPhysicalDeviceIDPropertiesKHR.deviceUUID, - sizeof(m_deviceUUID)); - } - - return isSuitable; -} - -bool VulkanBaseApp::isVkPhysicalDeviceUuid(void *Uuid) { - return !memcmp((void *)m_deviceUUID, Uuid, (size_t)VK_UUID_SIZE); -} - -void VulkanBaseApp::createDevice() { - { - uint32_t deviceCount = 0; - vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr); - if (deviceCount == 0) { - throw std::runtime_error("Failed to find Vulkan capable GPUs!"); + for (const auto &extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); } - std::vector phyDevs(deviceCount); - vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data()); - std::vector::iterator it = - std::find_if(phyDevs.begin(), phyDevs.end(), - std::bind(&VulkanBaseApp::isSuitableDevice, this, - std::placeholders::_1)); - if (it == phyDevs.end()) { - printf("\nNo suitable device found! 
Waiving Execution\n"); - exit(EXIT_WAIVED); + + return requiredExtensions.empty(); +} + +static void getSwapChainProperties(VkPhysicalDevice device, + VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR &capabilities, + std::vector &formats, + std::vector &presentModes) +{ + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &capabilities); + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + if (formatCount != 0) { + formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, formats.data()); } - m_physicalDevice = *it; - } - - uint32_t graphicsQueueIndex, presentQueueIndex; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, - presentQueueIndex); - - std::vector queueCreateInfos; - std::set uniqueFamilyIndices = {graphicsQueueIndex, - presentQueueIndex}; - - float queuePriority = 1.0f; - - for (uint32_t queueFamily : uniqueFamilyIndices) { - VkDeviceQueueCreateInfo queueCreateInfo = {}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueCreateInfo.queueFamilyIndex = graphicsQueueIndex; - queueCreateInfo.queueCount = 1; - queueCreateInfo.pQueuePriorities = &queuePriority; - queueCreateInfos.push_back(queueCreateInfo); - } - - VkPhysicalDeviceFeatures deviceFeatures = {}; - deviceFeatures.fillModeNonSolid = true; - - VkDeviceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - - createInfo.pQueueCreateInfos = queueCreateInfos.data(); - createInfo.queueCreateInfoCount = - static_cast(queueCreateInfos.size()); - - createInfo.pEnabledFeatures = &deviceFeatures; - - std::vector deviceExtensions = getRequiredDeviceExtensions(); - deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); - - createInfo.enabledExtensionCount = - static_cast(deviceExtensions.size()); - createInfo.ppEnabledExtensionNames = deviceExtensions.data(); - - if (m_enableValidation) { - createInfo.enabledLayerCount = - static_cast(countof(validationLayers)); - createInfo.ppEnabledLayerNames = validationLayers; - } else { - createInfo.enabledLayerCount = 0; - } - - if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != - VK_SUCCESS) { - throw std::runtime_error("failed to create logical device!"); - } - - vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue); - vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue); -} - -static VkSurfaceFormatKHR chooseSwapSurfaceFormat( - const std::vector &availableFormats) { - if (availableFormats.size() == 1 && - availableFormats[0].format == VK_FORMAT_UNDEFINED) { - return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; - } - - for (const auto &availableFormat : availableFormats) { - if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && - availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { - return availableFormat; + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); + if (presentModeCount != 0) { + presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, presentModes.data()); } - } - - return availableFormats[0]; } -static VkPresentModeKHR chooseSwapPresentMode( - const std::vector &availablePresentModes) { - VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; - - for (const auto &availablePresentMode : availablePresentModes) { - if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { - return 
availablePresentMode; - } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { - bestMode = availablePresentMode; - } - } - - return bestMode; -} - -static VkExtent2D chooseSwapExtent( - GLFWwindow *window, const VkSurfaceCapabilitiesKHR &capabilities) { - if (capabilities.currentExtent.width != - std::numeric_limits::max()) { - return capabilities.currentExtent; - } else { - int width, height; - glfwGetFramebufferSize(window, &width, &height); - VkExtent2D actualExtent = {static_cast(width), - static_cast(height)}; - - actualExtent.width = std::max( - capabilities.minImageExtent.width, - std::min(capabilities.maxImageExtent.width, actualExtent.width)); - actualExtent.height = std::max( - capabilities.minImageExtent.height, - std::min(capabilities.maxImageExtent.height, actualExtent.height)); - - return actualExtent; - } -} - -void VulkanBaseApp::createSwapChain() { - VkSurfaceCapabilitiesKHR capabilities; - VkSurfaceFormatKHR format; - VkPresentModeKHR presentMode; - VkExtent2D extent; - uint32_t imageCount; - - { +bool VulkanBaseApp::isSuitableDevice(VkPhysicalDevice dev) const +{ + bool isSuitable = false; + uint32_t graphicsQueueIndex, presentQueueIndex; + std::vector deviceExtensions = getRequiredDeviceExtensions(); + VkSurfaceCapabilitiesKHR caps; std::vector formats; - std::vector presentModes; + std::vector presentModes; + deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + getSwapChainProperties(dev, m_surface, caps, formats, presentModes); - getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, - presentModes); - format = chooseSwapSurfaceFormat(formats); - presentMode = chooseSwapPresentMode(presentModes); - extent = chooseSwapExtent(m_window, capabilities); - imageCount = capabilities.minImageCount + 1; - if (capabilities.maxImageCount > 0 && - imageCount > capabilities.maxImageCount) { - imageCount = capabilities.maxImageCount; - } - } + VkPhysicalDeviceIDPropertiesKHR vkPhysicalDeviceIDPropertiesKHR = {}; + vkPhysicalDeviceIDPropertiesKHR.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR; + vkPhysicalDeviceIDPropertiesKHR.pNext = NULL; - VkSwapchainCreateInfoKHR createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - createInfo.surface = m_surface; + VkPhysicalDeviceProperties2KHR vkPhysicalDeviceProperties2KHR = {}; + vkPhysicalDeviceProperties2KHR.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; + vkPhysicalDeviceProperties2KHR.pNext = &vkPhysicalDeviceIDPropertiesKHR; - createInfo.minImageCount = imageCount; - createInfo.imageFormat = format.format; - createInfo.imageColorSpace = format.colorSpace; - createInfo.imageExtent = extent; - createInfo.imageArrayLayers = 1; - createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + vkGetPhysicalDeviceProperties2(dev, &vkPhysicalDeviceProperties2KHR); - uint32_t queueFamilyIndices[2]; - findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], - queueFamilyIndices[1]); + isSuitable = hasAllExtensions(dev, deviceExtensions) + && isDeviceCompatible(vkPhysicalDeviceIDPropertiesKHR.deviceUUID, (size_t)VK_UUID_SIZE) + && !formats.empty() && !presentModes.empty() + && findGraphicsQueueIndicies(dev, m_surface, graphicsQueueIndex, presentQueueIndex); - if (queueFamilyIndices[0] != queueFamilyIndices[1]) { - createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; - createInfo.queueFamilyIndexCount = countof(queueFamilyIndices); - createInfo.pQueueFamilyIndices = queueFamilyIndices; - } else { - 
createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - } - - createInfo.preTransform = capabilities.currentTransform; - createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - createInfo.presentMode = presentMode; - createInfo.clipped = VK_TRUE; - - createInfo.oldSwapchain = VK_NULL_HANDLE; - - if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != - VK_SUCCESS) { - throw std::runtime_error("failed to create swap chain!"); - } - - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr); - m_swapChainImages.resize(imageCount); - vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, - m_swapChainImages.data()); - - m_swapChainFormat = format.format; - m_swapChainExtent = extent; -} - -static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, - VkImageAspectFlags aspectFlags) { - VkImageView imageView; - VkImageViewCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - createInfo.image = image; - createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - createInfo.format = format; - createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; - createInfo.subresourceRange.aspectMask = aspectFlags; - createInfo.subresourceRange.baseMipLevel = 0; - createInfo.subresourceRange.levelCount = 1; - createInfo.subresourceRange.baseArrayLayer = 0; - createInfo.subresourceRange.layerCount = 1; - if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image views!"); - } - - return imageView; -} - -static void createImage(VkPhysicalDevice physicalDevice, VkDevice device, - uint32_t width, uint32_t height, VkFormat format, - VkImageTiling tiling, VkImageUsageFlags usage, - VkMemoryPropertyFlags properties, VkImage &image, - VkDeviceMemory &imageMemory) { - VkImageCreateInfo imageInfo = {}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = VK_IMAGE_TYPE_2D; - imageInfo.extent.width = width; - imageInfo.extent.height = height; - imageInfo.extent.depth = 1; - imageInfo.mipLevels = 1; - imageInfo.arrayLayers = 1; - imageInfo.format = format; - imageInfo.tiling = tiling; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = usage; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { - throw std::runtime_error("failed to create image!"); - } - - VkMemoryRequirements memRequirements; - vkGetImageMemoryRequirements(device, image, &memRequirements); - - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType( - physicalDevice, memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate image memory!"); - } - - vkBindImageMemory(device, image, imageMemory, 0); -} - -void VulkanBaseApp::createImageViews() { - m_swapChainImageViews.resize(m_swapChainImages.size()); - - for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { - m_swapChainImageViews[i] = - createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, - VK_IMAGE_ASPECT_COLOR_BIT); - } -} - 
-void VulkanBaseApp::createRenderPass() { - VkAttachmentDescription colorAttachment = {}; - colorAttachment.format = m_swapChainFormat; - colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - // Set up the render pass to preserve the contents of the attachment while - // rendering. By doing this the points already rendered are not cleared and - // thus displays growing number of points with time eventhough the number of - // points rendered per frame is constant - colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - - VkAttachmentReference colorAttachmentRef = {}; - colorAttachmentRef.attachment = 0; - colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - - VkAttachmentDescription depthAttachment = {}; - depthAttachment.format = findSupportedFormat( - m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT}, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - depthAttachment.finalLayout = - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - - VkAttachmentReference depthAttachmentRef = {}; - depthAttachmentRef.attachment = 1; - depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass = {}; - subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass.colorAttachmentCount = 1; - subpass.pColorAttachments = &colorAttachmentRef; - subpass.pDepthStencilAttachment = &depthAttachmentRef; - - VkSubpassDependency dependency = {}; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - - VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; - VkRenderPassCreateInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderPassInfo.attachmentCount = countof(attachments); - renderPassInfo.pAttachments = attachments; - renderPassInfo.subpassCount = 1; - renderPassInfo.pSubpasses = &subpass; - renderPassInfo.dependencyCount = 1; - renderPassInfo.pDependencies = &dependency; - - if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != - VK_SUCCESS) { - throw std::runtime_error("failed to create render pass!"); - } -} - -void VulkanBaseApp::createDescriptorSetLayout() { - VkDescriptorSetLayoutBinding uboLayoutBinding = {}; - uboLayoutBinding.binding = 0; - uboLayoutBinding.descriptorCount = 1; - uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - uboLayoutBinding.pImmutableSamplers = nullptr; - uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - - 
VkDescriptorSetLayoutCreateInfo layoutInfo = {}; - layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - layoutInfo.bindingCount = 1; - layoutInfo.pBindings = &uboLayoutBinding; - - if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, - &m_descriptorSetLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor set layout!"); - } -} - -VkShaderModule createShaderModule(VkDevice device, const char *filename) { - std::vector shaderContents; - std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); - VkShaderModuleCreateInfo createInfo = {}; - VkShaderModule shaderModule; - - if (!shaderFile.good()) { - throw std::runtime_error("Failed to load shader contents"); - } - readFile(shaderFile, shaderContents); - - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.codeSize = shaderContents.size(); - createInfo.pCode = reinterpret_cast(shaderContents.data()); - - if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != - VK_SUCCESS) { - throw std::runtime_error("Failed to create shader module!"); - } - - return shaderModule; -} - -void VulkanBaseApp::getVertexDescriptions( - std::vector &bindingDesc, - std::vector &attribDesc) {} - -void VulkanBaseApp::getAssemblyStateInfo( - VkPipelineInputAssemblyStateCreateInfo &info) {} - -void VulkanBaseApp::createGraphicsPipeline() { - std::vector shaderStageInfos( - m_shaderFiles.size()); - for (size_t i = 0; i < m_shaderFiles.size(); i++) { - shaderStageInfos[i] = {}; - shaderStageInfos[i].sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shaderStageInfos[i].stage = m_shaderFiles[i].first; - shaderStageInfos[i].module = - createShaderModule(m_device, m_shaderFiles[i].second.c_str()); - shaderStageInfos[i].pName = "main"; - } - - VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - - std::vector vertexBindingDescriptions; - std::vector vertexAttributeDescriptions; - - getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); - - vertexInputInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertexInputInfo.vertexBindingDescriptionCount = - static_cast(vertexBindingDescriptions.size()); - vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); - vertexInputInfo.vertexAttributeDescriptionCount = - static_cast(vertexAttributeDescriptions.size()); - vertexInputInfo.pVertexAttributeDescriptions = - vertexAttributeDescriptions.data(); - - VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; - getAssemblyStateInfo(inputAssembly); - - VkViewport viewport = {}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = (float)m_swapChainExtent.width; - viewport.height = (float)m_swapChainExtent.height; - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor = {}; - scissor.offset = {0, 0}; - scissor.extent = m_swapChainExtent; - - VkPipelineViewportStateCreateInfo viewportState = {}; - viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewportState.viewportCount = 1; - viewportState.pViewports = &viewport; - viewportState.scissorCount = 1; - viewportState.pScissors = &scissor; - - VkPipelineRasterizationStateCreateInfo rasterizer = {}; - rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterizer.depthClampEnable = VK_FALSE; - rasterizer.rasterizerDiscardEnable = VK_FALSE; - rasterizer.polygonMode = VK_POLYGON_MODE_POINT; - rasterizer.lineWidth = 1.0f; - rasterizer.cullMode 
= VK_CULL_MODE_NONE; - rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; - rasterizer.depthBiasEnable = VK_FALSE; - - VkPipelineMultisampleStateCreateInfo multisampling = {}; - multisampling.sType = - VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling.sampleShadingEnable = VK_FALSE; - multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling.minSampleShading = 1.0f; // Optional - multisampling.pSampleMask = nullptr; // Optional - multisampling.alphaToCoverageEnable = VK_FALSE; // Optional - multisampling.alphaToOneEnable = VK_FALSE; // Optional - - VkPipelineDepthStencilStateCreateInfo depthStencil = {}; - depthStencil.sType = - VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depthStencil.depthTestEnable = VK_TRUE; - depthStencil.depthWriteEnable = VK_TRUE; - depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; - depthStencil.depthBoundsTestEnable = VK_FALSE; - depthStencil.stencilTestEnable = VK_FALSE; - - VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; - colorBlendAttachment.colorWriteMask = - VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - colorBlendAttachment.blendEnable = VK_FALSE; - - VkPipelineColorBlendStateCreateInfo colorBlending = {}; - colorBlending.sType = - VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlending.logicOpEnable = VK_FALSE; - colorBlending.logicOp = VK_LOGIC_OP_COPY; - colorBlending.attachmentCount = 1; - colorBlending.pAttachments = &colorBlendAttachment; - colorBlending.blendConstants[0] = 0.0f; - colorBlending.blendConstants[1] = 0.0f; - colorBlending.blendConstants[2] = 0.0f; - colorBlending.blendConstants[3] = 0.0f; - - VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; - pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutInfo.setLayoutCount = 1; // Optional - pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional - pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional - pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional - - if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, - &m_pipelineLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create pipeline layout!"); - } - - VkGraphicsPipelineCreateInfo pipelineInfo = {}; - pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipelineInfo.stageCount = static_cast(shaderStageInfos.size()); - pipelineInfo.pStages = shaderStageInfos.data(); - - pipelineInfo.pVertexInputState = &vertexInputInfo; - pipelineInfo.pInputAssemblyState = &inputAssembly; - pipelineInfo.pViewportState = &viewportState; - pipelineInfo.pRasterizationState = &rasterizer; - pipelineInfo.pMultisampleState = &multisampling; - pipelineInfo.pDepthStencilState = &depthStencil; // Optional - pipelineInfo.pColorBlendState = &colorBlending; - pipelineInfo.pDynamicState = nullptr; // Optional - - pipelineInfo.layout = m_pipelineLayout; - - pipelineInfo.renderPass = m_renderPass; - pipelineInfo.subpass = 0; - - pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional - pipelineInfo.basePipelineIndex = -1; // Optional - - if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, - nullptr, &m_graphicsPipeline) != VK_SUCCESS) { - throw std::runtime_error("failed to create graphics pipeline!"); - } - - for (size_t i = 0; i < shaderStageInfos.size(); i++) { - vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); - } -} - -void VulkanBaseApp::createFramebuffers() 
{ - m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - VkImageView attachments[] = {m_swapChainImageViews[i], m_depthImageView}; - - VkFramebufferCreateInfo framebufferInfo = {}; - framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebufferInfo.renderPass = m_renderPass; - framebufferInfo.attachmentCount = countof(attachments); - framebufferInfo.pAttachments = attachments; - framebufferInfo.width = m_swapChainExtent.width; - framebufferInfo.height = m_swapChainExtent.height; - framebufferInfo.layers = 1; - - if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, - &m_swapChainFramebuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create framebuffer!"); - } - } -} - -void VulkanBaseApp::createCommandPool() { - VkCommandPoolCreateInfo poolInfo = {}; - uint32_t graphicsIndex, presentIndex; - - findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, - presentIndex); - - poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - poolInfo.queueFamilyIndex = graphicsIndex; - poolInfo.flags = 0; // Optional - - if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != - VK_SUCCESS) { - throw std::runtime_error("Failed to create command pool!"); - } -} - -static void transitionImageLayout(VulkanBaseApp *app, VkImage image, - VkFormat format, VkImageLayout oldLayout, - VkImageLayout newLayout) { - VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); - - VkImageMemoryBarrier barrier = {}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - - if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || - format == VK_FORMAT_D24_UNORM_S8_UINT) { - barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - } else { - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - } - - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - - VkPipelineStageFlags sourceStage; - VkPipelineStageFlags destinationStage; - - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - } else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; - } else { - throw 
std::invalid_argument("unsupported layout transition!"); - } - - vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, - nullptr, 0, nullptr, 1, &barrier); - - app->endSingleTimeCommands(commandBuffer); -} - -void VulkanBaseApp::createDepthResources() { - VkFormat depthFormat = findSupportedFormat( - m_physicalDevice, {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT}, - VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - createImage(m_physicalDevice, m_device, m_swapChainExtent.width, - m_swapChainExtent.height, depthFormat, VK_IMAGE_TILING_OPTIMAL, - VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_depthImage, - m_depthImageMemory); - m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, - VK_IMAGE_ASPECT_DEPTH_BIT); - transitionImageLayout(this, m_depthImage, depthFormat, - VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); -} - -void VulkanBaseApp::createUniformBuffers() { - VkDeviceSize size = getUniformSize(); - if (size > 0) { - m_uniformBuffers.resize(m_swapChainImages.size()); - m_uniformMemory.resize(m_swapChainImages.size()); - for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - createBuffer(getUniformSize(), VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - m_uniformBuffers[i], m_uniformMemory[i]); - } - } -} - -void VulkanBaseApp::createDescriptorPool() { - VkDescriptorPoolSize poolSize = {}; - poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - poolSize.descriptorCount = static_cast(m_swapChainImages.size()); - VkDescriptorPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - poolInfo.poolSizeCount = 1; - poolInfo.pPoolSizes = &poolSize; - poolInfo.maxSets = static_cast(m_swapChainImages.size()); - ; - if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != - VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor pool!"); - } -} - -void VulkanBaseApp::createDescriptorSets() { - std::vector layouts(m_swapChainImages.size(), - m_descriptorSetLayout); - VkDescriptorSetAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocInfo.descriptorPool = m_descriptorPool; - allocInfo.descriptorSetCount = - static_cast(m_swapChainImages.size()); - allocInfo.pSetLayouts = layouts.data(); - m_descriptorSets.resize(m_swapChainImages.size()); - - if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate descriptor sets!"); - } - - VkDescriptorBufferInfo bufferInfo = {}; - bufferInfo.offset = 0; - bufferInfo.range = VK_WHOLE_SIZE; - VkWriteDescriptorSet descriptorWrite = {}; - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.dstBinding = 0; - descriptorWrite.dstArrayElement = 0; - descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorWrite.descriptorCount = 1; - descriptorWrite.pBufferInfo = &bufferInfo; - descriptorWrite.pImageInfo = nullptr; // Optional - descriptorWrite.pTexelBufferView = nullptr; // Optional - - for (size_t i = 0; i < m_swapChainImages.size(); i++) { - bufferInfo.buffer = m_uniformBuffers[i]; - descriptorWrite.dstSet = m_descriptorSets[i]; - vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); - } -} - -void VulkanBaseApp::createCommandBuffers() { - 
m_commandBuffers.resize(m_swapChainFramebuffers.size()); - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.commandPool = m_commandPool; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandBufferCount = (uint32_t)m_commandBuffers.size(); - - if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate command buffers!"); - } - - for (size_t i = 0; i < m_commandBuffers.size(); i++) { - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - beginInfo.pInheritanceInfo = nullptr; // Optional - - if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { - throw std::runtime_error("failed to begin recording command buffer!"); + if (isSuitable) { + memcpy((void *)m_deviceUUID, vkPhysicalDeviceIDPropertiesKHR.deviceUUID, sizeof(m_deviceUUID)); } - VkRenderPassBeginInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderPassInfo.renderPass = m_renderPass; - renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; - - renderPassInfo.renderArea.offset = {0, 0}; - renderPassInfo.renderArea.extent = m_swapChainExtent; - - VkClearValue clearColors[2]; - clearColors[0].color = {0.0f, 0.0f, 0.0f, 1.0f}; - clearColors[1].depthStencil = {1.0f, 0}; - renderPassInfo.clearValueCount = countof(clearColors); - renderPassInfo.pClearValues = clearColors; - - vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, - VK_SUBPASS_CONTENTS_INLINE); - - vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, - m_graphicsPipeline); - - vkCmdBindDescriptorSets(m_commandBuffers[i], - VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, - 0, 1, &m_descriptorSets[i], 0, nullptr); - - fillRenderingCommandBuffer(m_commandBuffers[i]); - - vkCmdEndRenderPass(m_commandBuffers[i]); - - if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to record command buffer!"); - } - } + return isSuitable; } -void VulkanBaseApp::createSyncObjects() { - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkFenceCreateInfo fenceInfo = {}; - fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; - - m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); - m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); - - for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, - &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, - &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != - VK_SUCCESS) { - throw std::runtime_error("Failed to create image available semaphore!"); - } - } +bool VulkanBaseApp::isVkPhysicalDeviceUuid(void *Uuid) +{ + return !memcmp((void *)m_deviceUUID, Uuid, (size_t)VK_UUID_SIZE); } -void VulkanBaseApp::getWaitFrameSemaphores( - std::vector &wait, - std::vector &waitStages) const {} +void VulkanBaseApp::createDevice() +{ + { + 
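+        // Pick the first enumerated physical device that isSuitableDevice() accepts:
+        // it must expose the required extensions, a graphics and a present queue, and
+        // a device UUID the app recognizes via isDeviceCompatible().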
+        uint32_t deviceCount = 0;
+        vkEnumeratePhysicalDevices(m_instance, &deviceCount, nullptr);
+        if (deviceCount == 0) {
+            throw std::runtime_error("Failed to find Vulkan capable GPUs!");
+        }
+        std::vector<VkPhysicalDevice> phyDevs(deviceCount);
+        vkEnumeratePhysicalDevices(m_instance, &deviceCount, phyDevs.data());
+        std::vector<VkPhysicalDevice>::iterator it = std::find_if(
+            phyDevs.begin(), phyDevs.end(), std::bind(&VulkanBaseApp::isSuitableDevice, this, std::placeholders::_1));
+        if (it == phyDevs.end()) {
+            printf("\nNo suitable device found! Waiving Execution\n");
+            exit(EXIT_WAIVED);
+        }
+        m_physicalDevice = *it;
+    }

-void VulkanBaseApp::getSignalFrameSemaphores(
-    std::vector<VkSemaphore> &signal) const {}
+    uint32_t graphicsQueueIndex, presentQueueIndex;
+    findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsQueueIndex, presentQueueIndex);
+
+    std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
+    std::set<uint32_t> uniqueFamilyIndices = {graphicsQueueIndex, presentQueueIndex};
+
+    float queuePriority = 1.0f;
+
+    for (uint32_t queueFamily : uniqueFamilyIndices) {
+        VkDeviceQueueCreateInfo queueCreateInfo = {};
+        queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+        queueCreateInfo.queueFamilyIndex = graphicsQueueIndex;
+        queueCreateInfo.queueCount = 1;
+        queueCreateInfo.pQueuePriorities = &queuePriority;
+        queueCreateInfos.push_back(queueCreateInfo);
+    }
+
+    VkPhysicalDeviceFeatures deviceFeatures = {};
+    deviceFeatures.fillModeNonSolid = true;
+
+    VkDeviceCreateInfo createInfo = {};
+    createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+
+    createInfo.pQueueCreateInfos = queueCreateInfos.data();
+    createInfo.queueCreateInfoCount = static_cast<uint32_t>(queueCreateInfos.size());
+
+    createInfo.pEnabledFeatures = &deviceFeatures;
+
+    std::vector<const char *> deviceExtensions = getRequiredDeviceExtensions();
+    deviceExtensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+    createInfo.enabledExtensionCount = static_cast<uint32_t>(deviceExtensions.size());
+    createInfo.ppEnabledExtensionNames = deviceExtensions.data();
+
+    if (m_enableValidation) {
+        createInfo.enabledLayerCount = static_cast<uint32_t>(countof(validationLayers));
+        createInfo.ppEnabledLayerNames = validationLayers;
+    }
+    else {
+        createInfo.enabledLayerCount = 0;
+    }
+
+    if (vkCreateDevice(m_physicalDevice, &createInfo, nullptr, &m_device) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create logical device!");
+    }
+
+    vkGetDeviceQueue(m_device, graphicsQueueIndex, 0, &m_graphicsQueue);
+    vkGetDeviceQueue(m_device, presentQueueIndex, 0, &m_presentQueue);
+}
+
+static VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector<VkSurfaceFormatKHR> &availableFormats)
+{
+    if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) {
+        return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR};
+    }
+
+    for (const auto &availableFormat : availableFormats) {
+        if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM
+            && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) {
+            return availableFormat;
+        }
+    }
+
+    return availableFormats[0];
+}
+
+static VkPresentModeKHR chooseSwapPresentMode(const std::vector<VkPresentModeKHR> &availablePresentModes)
+{
+    VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR;
+
+    for (const auto &availablePresentMode : availablePresentModes) {
+        if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) {
+            return availablePresentMode;
+        }
+        else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) {
+            bestMode = availablePresentMode;
+        }
+    }
+
+    return bestMode;
+}
+
+static VkExtent2D chooseSwapExtent(GLFWwindow *window, const VkSurfaceCapabilitiesKHR &capabilities)
+{
+    if (capabilities.currentExtent.width != std::numeric_limits<uint32_t>::max()) {
+        return capabilities.currentExtent;
+    }
+    else {
+        int width, height;
+        glfwGetFramebufferSize(window, &width, &height);
+        VkExtent2D actualExtent = {static_cast<uint32_t>(width), static_cast<uint32_t>(height)};
+
+        actualExtent.width = std::max(capabilities.minImageExtent.width,
+                                      std::min(capabilities.maxImageExtent.width, actualExtent.width));
+        actualExtent.height = std::max(capabilities.minImageExtent.height,
+                                       std::min(capabilities.maxImageExtent.height, actualExtent.height));
+
+        return actualExtent;
+    }
+}
+
+void VulkanBaseApp::createSwapChain()
+{
+    VkSurfaceCapabilitiesKHR capabilities;
+    VkSurfaceFormatKHR format;
+    VkPresentModeKHR presentMode;
+    VkExtent2D extent;
+    uint32_t imageCount;
+
+    {
+        std::vector<VkSurfaceFormatKHR> formats;
+        std::vector<VkPresentModeKHR> presentModes;
+
+        getSwapChainProperties(m_physicalDevice, m_surface, capabilities, formats, presentModes);
+        format = chooseSwapSurfaceFormat(formats);
+        presentMode = chooseSwapPresentMode(presentModes);
+        extent = chooseSwapExtent(m_window, capabilities);
+        imageCount = capabilities.minImageCount + 1;
+        if (capabilities.maxImageCount > 0 && imageCount > capabilities.maxImageCount) {
+            imageCount = capabilities.maxImageCount;
+        }
+    }
+
+    VkSwapchainCreateInfoKHR createInfo = {};
+    createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR;
+    createInfo.surface = m_surface;
+
+    createInfo.minImageCount = imageCount;
+    createInfo.imageFormat = format.format;
+    createInfo.imageColorSpace = format.colorSpace;
+    createInfo.imageExtent = extent;
+    createInfo.imageArrayLayers = 1;
+    createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+
+    uint32_t queueFamilyIndices[2];
+    findGraphicsQueueIndicies(m_physicalDevice, m_surface, queueFamilyIndices[0], queueFamilyIndices[1]);
+
+    if (queueFamilyIndices[0] != queueFamilyIndices[1]) {
+        createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT;
+        createInfo.queueFamilyIndexCount = countof(queueFamilyIndices);
+        createInfo.pQueueFamilyIndices = queueFamilyIndices;
+    }
+    else {
+        createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE;
+    }
+
+    createInfo.preTransform = capabilities.currentTransform;
+    createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR;
+    createInfo.presentMode = presentMode;
+    createInfo.clipped = VK_TRUE;
+
+    createInfo.oldSwapchain = VK_NULL_HANDLE;
+
+    if (vkCreateSwapchainKHR(m_device, &createInfo, nullptr, &m_swapChain) != VK_SUCCESS) {
+        throw std::runtime_error("failed to create swap chain!");
+    }
+
+    vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, nullptr);
+    m_swapChainImages.resize(imageCount);
+    vkGetSwapchainImagesKHR(m_device, m_swapChain, &imageCount, m_swapChainImages.data());
+
+    m_swapChainFormat = format.format;
+    m_swapChainExtent = extent;
+}
+
+static VkImageView createImageView(VkDevice dev, VkImage image, VkFormat format, VkImageAspectFlags aspectFlags)
+{
+    VkImageView imageView;
+    VkImageViewCreateInfo createInfo = {};
+    createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+    createInfo.image = image;
+    createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
+    createInfo.format = format;
+    createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
+    createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
+    createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
+    createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
+    createInfo.subresourceRange.aspectMask = aspectFlags;
+    createInfo.subresourceRange.baseMipLevel = 0;
+    createInfo.subresourceRange.levelCount = 1;
+
createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + if (vkCreateImageView(dev, &createInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image views!"); + } + + return imageView; +} + +static void createImage(VkPhysicalDevice physicalDevice, + VkDevice device, + uint32_t width, + uint32_t height, + VkFormat format, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkMemoryPropertyFlags properties, + VkImage &image, + VkDeviceMemory &imageMemory) +{ + VkImageCreateInfo imageInfo = {}; + imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } + + VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(physicalDevice, memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, imageMemory, 0); +} + +void VulkanBaseApp::createImageViews() +{ + m_swapChainImageViews.resize(m_swapChainImages.size()); + + for (uint32_t i = 0; i < m_swapChainImages.size(); i++) { + m_swapChainImageViews[i] = + createImageView(m_device, m_swapChainImages[i], m_swapChainFormat, VK_IMAGE_ASPECT_COLOR_BIT); + } +} + +void VulkanBaseApp::createRenderPass() +{ + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = m_swapChainFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + // Set up the render pass to preserve the contents of the attachment while + // rendering. 
By doing this, points rendered in earlier frames are not cleared, so + // the number of visible points grows over time even though the number of + // points rendered per frame is constant. + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkAttachmentDescription depthAttachment = {}; + depthAttachment.format = + findSupportedFormat(m_physicalDevice, + {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + depthAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + depthAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + depthAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + depthAttachment.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkAttachmentReference depthAttachmentRef = {}; + depthAttachmentRef.attachment = 1; + depthAttachmentRef.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + subpass.pDepthStencilAttachment = &depthAttachmentRef; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkAttachmentDescription attachments[] = {colorAttachment, depthAttachment}; + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = countof(attachments); + renderPassInfo.pAttachments = attachments; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(m_device, &renderPassInfo, nullptr, &m_renderPass) != VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } +} + +void VulkanBaseApp::createDescriptorSetLayout() +{ + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.pImmutableSamplers = nullptr; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; + + if (vkCreateDescriptorSetLayout(m_device, &layoutInfo, nullptr, &m_descriptorSetLayout) != VK_SUCCESS) {
+ throw std::runtime_error("failed to create descriptor set layout!"); + } +} + +VkShaderModule createShaderModule(VkDevice device, const char *filename) +{ + std::vector<char> shaderContents; + std::ifstream shaderFile(filename, std::ios_base::in | std::ios_base::binary); + VkShaderModuleCreateInfo createInfo = {}; + VkShaderModule shaderModule; + + if (!shaderFile.good()) { + throw std::runtime_error("Failed to load shader contents"); + } + readFile(shaderFile, shaderContents); + + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = shaderContents.size(); + createInfo.pCode = reinterpret_cast<const uint32_t *>(shaderContents.data()); + + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { + throw std::runtime_error("Failed to create shader module!"); + } + + return shaderModule; +} + +void VulkanBaseApp::getVertexDescriptions(std::vector<VkVertexInputBindingDescription> &bindingDesc, + std::vector<VkVertexInputAttributeDescription> &attribDesc) +{ +} + +void VulkanBaseApp::getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) {} + +void VulkanBaseApp::createGraphicsPipeline() +{ + std::vector<VkPipelineShaderStageCreateInfo> shaderStageInfos(m_shaderFiles.size()); + for (size_t i = 0; i < m_shaderFiles.size(); i++) { + shaderStageInfos[i] = {}; + shaderStageInfos[i].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shaderStageInfos[i].stage = m_shaderFiles[i].first; + shaderStageInfos[i].module = createShaderModule(m_device, m_shaderFiles[i].second.c_str()); + shaderStageInfos[i].pName = "main"; + } + + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; + + std::vector<VkVertexInputBindingDescription> vertexBindingDescriptions; + std::vector<VkVertexInputAttributeDescription> vertexAttributeDescriptions; + + getVertexDescriptions(vertexBindingDescriptions, vertexAttributeDescriptions); + + vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertexInputInfo.vertexBindingDescriptionCount = static_cast<uint32_t>(vertexBindingDescriptions.size()); + vertexInputInfo.pVertexBindingDescriptions = vertexBindingDescriptions.data(); + vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(vertexAttributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = vertexAttributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + getAssemblyStateInfo(inputAssembly); + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)m_swapChainExtent.width; + viewport.height = (float)m_swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = {0, 0}; + scissor.extent = m_swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_POINT; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_NONE; + rasterizer.frontFace = VK_FRONT_FACE_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; +
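// Multisampling is disabled; the pipeline rasterizes one sample per pixel. +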
multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineDepthStencilStateCreateInfo depthStencil = {}; + depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + depthStencil.depthTestEnable = VK_TRUE; + depthStencil.depthWriteEnable = VK_TRUE; + depthStencil.depthCompareOp = VK_COMPARE_OP_LESS; + depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &m_descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(m_device, &pipelineLayoutInfo, nullptr, &m_pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = static_cast<uint32_t>(shaderStageInfos.size()); + pipelineInfo.pStages = shaderStageInfos.data(); + + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = &depthStencil; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + + pipelineInfo.layout = m_pipelineLayout; + + pipelineInfo.renderPass = m_renderPass; + pipelineInfo.subpass = 0; + + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(m_device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &m_graphicsPipeline) + != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + for (size_t i = 0; i < shaderStageInfos.size(); i++) { + vkDestroyShaderModule(m_device, shaderStageInfos[i].module, nullptr); + } +} + +void VulkanBaseApp::createFramebuffers() +{ + m_swapChainFramebuffers.resize(m_swapChainImageViews.size()); + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + VkImageView attachments[] = {m_swapChainImageViews[i], m_depthImageView}; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType =
VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = m_renderPass; + framebufferInfo.attachmentCount = countof(attachments); + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = m_swapChainExtent.width; + framebufferInfo.height = m_swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(m_device, &framebufferInfo, nullptr, &m_swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } +} + +void VulkanBaseApp::createCommandPool() +{ + VkCommandPoolCreateInfo poolInfo = {}; + uint32_t graphicsIndex, presentIndex; + + findGraphicsQueueIndicies(m_physicalDevice, m_surface, graphicsIndex, presentIndex); + + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = graphicsIndex; + poolInfo.flags = 0; // Optional + + if (vkCreateCommandPool(m_device, &poolInfo, nullptr, &m_commandPool) != VK_SUCCESS) { + throw std::runtime_error("Failed to create command pool!"); + } +} + +static void transitionImageLayout(VulkanBaseApp *app, + VkImage image, + VkFormat format, + VkImageLayout oldLayout, + VkImageLayout newLayout) +{ + VkCommandBuffer commandBuffer = app->beginSingleTimeCommands(); + + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + + if (newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_D24_UNORM_S8_UINT) { + barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } + else { + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + } + + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; + } + else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, nullptr, 0, nullptr, 1, &barrier); + + app->endSingleTimeCommands(commandBuffer); +} + +void VulkanBaseApp::createDepthResources() +{ + VkFormat depthFormat = + 
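// Pick the first depth format this device supports, from most to least preferred. +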
findSupportedFormat(m_physicalDevice, + {VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT}, + VK_IMAGE_TILING_OPTIMAL, + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); + createImage(m_physicalDevice, + m_device, + m_swapChainExtent.width, + m_swapChainExtent.height, + depthFormat, + VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + m_depthImage, + m_depthImageMemory); + m_depthImageView = createImageView(m_device, m_depthImage, depthFormat, VK_IMAGE_ASPECT_DEPTH_BIT); + transitionImageLayout( + this, m_depthImage, depthFormat, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); +} + +void VulkanBaseApp::createUniformBuffers() +{ + VkDeviceSize size = getUniformSize(); + if (size > 0) { + m_uniformBuffers.resize(m_swapChainImages.size()); + m_uniformMemory.resize(m_swapChainImages.size()); + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + createBuffer(getUniformSize(), + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + m_uniformBuffers[i], + m_uniformMemory[i]); + } + } +} + +void VulkanBaseApp::createDescriptorPool() +{ + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = static_cast<uint32_t>(m_swapChainImages.size()); + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = static_cast<uint32_t>(m_swapChainImages.size()); + if (vkCreateDescriptorPool(m_device, &poolInfo, nullptr, &m_descriptorPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } +} + +void VulkanBaseApp::createDescriptorSets() +{ + std::vector<VkDescriptorSetLayout> layouts(m_swapChainImages.size(), m_descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = m_descriptorPool; + allocInfo.descriptorSetCount = static_cast<uint32_t>(m_swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + m_descriptorSets.resize(m_swapChainImages.size()); + + if (vkAllocateDescriptorSets(m_device, &allocInfo, m_descriptorSets.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + for (size_t i = 0; i < m_swapChainImages.size(); i++) { + bufferInfo.buffer = m_uniformBuffers[i]; + descriptorWrite.dstSet = m_descriptorSets[i]; + vkUpdateDescriptorSets(m_device, 1, &descriptorWrite, 0, nullptr); + } +} + +void VulkanBaseApp::createCommandBuffers() +{ + m_commandBuffers.resize(m_swapChainFramebuffers.size()); + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = m_commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount =
(uint32_t)m_commandBuffers.size(); + + if (vkAllocateCommandBuffers(m_device, &allocInfo, m_commandBuffers.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < m_commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + beginInfo.pInheritanceInfo = nullptr; // Optional + + if (vkBeginCommandBuffer(m_commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = m_renderPass; + renderPassInfo.framebuffer = m_swapChainFramebuffers[i]; + + renderPassInfo.renderArea.offset = {0, 0}; + renderPassInfo.renderArea.extent = m_swapChainExtent; + + VkClearValue clearColors[2]; + clearColors[0].color = {0.0f, 0.0f, 0.0f, 1.0f}; + clearColors[1].depthStencil = {1.0f, 0}; + renderPassInfo.clearValueCount = countof(clearColors); + renderPassInfo.pClearValues = clearColors; + + vkCmdBeginRenderPass(m_commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); + + vkCmdBindPipeline(m_commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, m_graphicsPipeline); + + vkCmdBindDescriptorSets(m_commandBuffers[i], + VK_PIPELINE_BIND_POINT_GRAPHICS, + m_pipelineLayout, + 0, + 1, + &m_descriptorSets[i], + 0, + nullptr); + + fillRenderingCommandBuffer(m_commandBuffers[i]); + + vkCmdEndRenderPass(m_commandBuffers[i]); + + if (vkEndCommandBuffer(m_commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } +} + +void VulkanBaseApp::createSyncObjects() +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkFenceCreateInfo fenceInfo = {}; + fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; + + m_inFlightFences.resize(MAX_FRAMES_IN_FLIGHT); + m_imageAvailableSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + m_renderFinishedSemaphores.resize(MAX_FRAMES_IN_FLIGHT); + + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_imageAvailableSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create image available semaphore!"); + } + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_renderFinishedSemaphores[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create render finished semaphore!"); + } + if (vkCreateFence(m_device, &fenceInfo, nullptr, &m_inFlightFences[i]) != VK_SUCCESS) { + throw std::runtime_error("Failed to create in-flight fence!"); + } + } +} + +void VulkanBaseApp::getWaitFrameSemaphores(std::vector<VkSemaphore> &wait, + std::vector<VkPipelineStageFlags> &waitStages) const +{ +} + +void VulkanBaseApp::getSignalFrameSemaphores(std::vector<VkSemaphore> &signal) const {} VkDeviceSize VulkanBaseApp::getUniformSize() const { return VkDeviceSize(0); } void VulkanBaseApp::updateUniformBuffer(uint32_t imageIndex, size_t frame) {} -void VulkanBaseApp::createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, +void VulkanBaseApp::createBuffer(VkDeviceSize size, + VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, - VkBuffer &buffer, - VkDeviceMemory &bufferMemory) { - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size =
size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + VkBuffer &buffer, + VkDeviceMemory &bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType( - m_physicalDevice, memRequirements.memoryTypeBits, properties); + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate buffer memory!"); - } + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); } -void VulkanBaseApp::createExternalBuffer( - VkDeviceSize size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, - VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer &buffer, - VkDeviceMemory &bufferMemory) { - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; +void VulkanBaseApp::createExternalBuffer(VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer &buffer, + VkDeviceMemory &bufferMemory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; - externalMemoryBufferInfo.sType = - VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; - externalMemoryBufferInfo.handleTypes = extMemHandleType; - bufferInfo.pNext = &externalMemoryBufferInfo; + VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; + externalMemoryBufferInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; + externalMemoryBufferInfo.handleTypes = extMemHandleType; + bufferInfo.pNext = &externalMemoryBufferInfo; - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, 
buffer, &memRequirements); + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); #ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; + WindowsSecurityAttributes winSecurityAttributes; - VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; - vulkanExportMemoryWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; - vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - vulkanExportMemoryWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; #endif - VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; - vulkanExportMemoryAllocateInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; #ifdef _WIN64 - vulkanExportMemoryAllocateInfoKHR.pNext = - extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR - ? &vulkanExportMemoryWin32HandleInfoKHR - : NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; + vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR + ? 
&vulkanExportMemoryWin32HandleInfoKHR + : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; #else - vulkanExportMemoryAllocateInfoKHR.pNext = NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; #endif - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; - allocInfo.allocationSize = memRequirements.size; - allocInfo.memoryTypeIndex = findMemoryType( - m_physicalDevice, memRequirements.memoryTypeBits, properties); + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate external buffer memory!"); - } + if (vkAllocateMemory(m_device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } - vkBindBufferMemory(m_device, buffer, bufferMemory, 0); + vkBindBufferMemory(m_device, buffer, bufferMemory, 0); } -void *VulkanBaseApp::getMemHandle( - VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) { +void *VulkanBaseApp::getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType) +{ #ifdef _WIN64 - HANDLE handle = 0; + HANDLE handle = 0; - VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; - vkMemoryGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; - vkMemoryGetWin32HandleInfoKHR.pNext = NULL; - vkMemoryGetWin32HandleInfoKHR.memory = memory; - vkMemoryGetWin32HandleInfoKHR.handleType = handleType; + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = memory; + vkMemoryGetWin32HandleInfoKHR.handleType = handleType; - PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; - fpGetMemoryWin32HandleKHR = - (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr( - m_device, "vkGetMemoryWin32HandleKHR"); - if (!fpGetMemoryWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, - &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)handle; + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + fpGetMemoryWin32HandleKHR = + (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryWin32HandleKHR"); + if (!fpGetMemoryWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); + } + if (fpGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)handle; #else - int fd = -1; + int fd = -1; - VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; - vkMemoryGetFdInfoKHR.sType = 
VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; - vkMemoryGetFdInfoKHR.pNext = NULL; - vkMemoryGetFdInfoKHR.memory = memory; - vkMemoryGetFdInfoKHR.handleType = handleType; + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = memory; + vkMemoryGetFdInfoKHR.handleType = handleType; - PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; - fpGetMemoryFdKHR = - (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); - if (!fpGetMemoryFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } - return (void *)(uintptr_t)fd; + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(m_device, "vkGetMemoryFdKHR"); + if (!fpGetMemoryFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetMemoryFdKHR!"); + } + if (fpGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for buffer!"); + } + return (void *)(uintptr_t)fd; #endif /* _WIN64 */ } -void *VulkanBaseApp::getSemaphoreHandle( - VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) { +void *VulkanBaseApp::getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ #ifdef _WIN64 - HANDLE handle; + HANDLE handle; - VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; - semaphoreGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; - semaphoreGetWin32HandleInfoKHR.pNext = NULL; - semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; - semaphoreGetWin32HandleInfoKHR.handleType = handleType; + VkSemaphoreGetWin32HandleInfoKHR semaphoreGetWin32HandleInfoKHR = {}; + semaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + semaphoreGetWin32HandleInfoKHR.pNext = NULL; + semaphoreGetWin32HandleInfoKHR.semaphore = semaphore; + semaphoreGetWin32HandleInfoKHR.handleType = handleType; - PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; - fpGetSemaphoreWin32HandleKHR = - (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( - m_device, "vkGetSemaphoreWin32HandleKHR"); - if (!fpGetSemaphoreWin32HandleKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, - &handle) != VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; + fpGetSemaphoreWin32HandleKHR = + (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreWin32HandleKHR"); + if (!fpGetSemaphoreWin32HandleKHR) { + throw std::runtime_error("Failed to retrieve vkGetSemaphoreWin32HandleKHR!"); + } + if (fpGetSemaphoreWin32HandleKHR(m_device, &semaphoreGetWin32HandleInfoKHR, &handle) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for semaphore!"); + } - return (void *)handle; + return (void *)handle; #else - int fd; + int fd; - VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {}; - semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; - semaphoreGetFdInfoKHR.pNext = NULL; - semaphoreGetFdInfoKHR.semaphore = semaphore; - semaphoreGetFdInfoKHR.handleType
= handleType; + VkSemaphoreGetFdInfoKHR semaphoreGetFdInfoKHR = {}; + semaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + semaphoreGetFdInfoKHR.pNext = NULL; + semaphoreGetFdInfoKHR.semaphore = semaphore; + semaphoreGetFdInfoKHR.handleType = handleType; - PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; - fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr( - m_device, "vkGetSemaphoreFdKHR"); - if (!fpGetSemaphoreFdKHR) { - throw std::runtime_error("Failed to retrieve vkGetMemoryWin32HandleKHR!"); - } - if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != - VK_SUCCESS) { - throw std::runtime_error("Failed to retrieve handle for buffer!"); - } + PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; + fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(m_device, "vkGetSemaphoreFdKHR"); + if (!fpGetSemaphoreFdKHR) { + throw std::runtime_error("Failed to retrieve vkGetSemaphoreFdKHR!"); + } + if (fpGetSemaphoreFdKHR(m_device, &semaphoreGetFdInfoKHR, &fd) != VK_SUCCESS) { + throw std::runtime_error("Failed to retrieve handle for semaphore!"); + } - return (void *)(uintptr_t)fd; + return (void *)(uintptr_t)fd; #endif } -void VulkanBaseApp::createExternalSemaphore( - VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) { - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {}; - exportSemaphoreCreateInfo.sType = - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; +void VulkanBaseApp::createExternalSemaphore(VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType) +{ + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + VkExportSemaphoreCreateInfoKHR exportSemaphoreCreateInfo = {}; + exportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; #ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; + WindowsSecurityAttributes winSecurityAttributes; - VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {}; - exportSemaphoreWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; - exportSemaphoreWin32HandleInfoKHR.pNext = NULL; - exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - exportSemaphoreWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; - exportSemaphoreCreateInfo.pNext = - (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) - ? &exportSemaphoreWin32HandleInfoKHR - : NULL; + VkExportSemaphoreWin32HandleInfoKHR exportSemaphoreWin32HandleInfoKHR = {}; + exportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + exportSemaphoreWin32HandleInfoKHR.pNext = NULL; + exportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + exportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + exportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; + exportSemaphoreCreateInfo.pNext = + (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) ?
&exportSemaphoreWin32HandleInfoKHR : NULL; #else - exportSemaphoreCreateInfo.pNext = NULL; + exportSemaphoreCreateInfo.pNext = NULL; #endif - exportSemaphoreCreateInfo.handleTypes = handleType; - semaphoreInfo.pNext = &exportSemaphoreCreateInfo; + exportSemaphoreCreateInfo.handleTypes = handleType; + semaphoreInfo.pNext = &exportSemaphoreCreateInfo; - if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != - VK_SUCCESS) { - throw std::runtime_error( - "failed to create synchronization objects for a CUDA-Vulkan!"); - } + if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &semaphore) != VK_SUCCESS) { + throw std::runtime_error("failed to create synchronization objects for a CUDA-Vulkan!"); + } } -void VulkanBaseApp::importExternalBuffer( - void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, - VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, - VkBuffer &buffer, VkDeviceMemory &memory) { - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; +void VulkanBaseApp::importExternalBuffer(void *handle, + VkExternalMemoryHandleTypeFlagBits handleType, + size_t size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkBuffer &buffer, + VkDeviceMemory &memory) +{ + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; - externalMemoryBufferInfo.sType = - VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; - externalMemoryBufferInfo.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; - bufferInfo.pNext = &externalMemoryBufferInfo; + VkExternalMemoryBufferCreateInfo externalMemoryBufferInfo = {}; + externalMemoryBufferInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO; + externalMemoryBufferInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + bufferInfo.pNext = &externalMemoryBufferInfo; - if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } + if (vkCreateBuffer(m_device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(m_device, buffer, &memRequirements); #ifdef _WIN64 - VkImportMemoryWin32HandleInfoKHR handleInfo = {}; - handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - handleInfo.pNext = NULL; - handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; - handleInfo.handle = handle; - handleInfo.name = NULL; + VkImportMemoryWin32HandleInfoKHR handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + handleInfo.pNext = NULL; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; + handleInfo.handle = handle; + handleInfo.name = NULL; #else - VkImportMemoryFdInfoKHR handleInfo = {}; - handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR; - handleInfo.pNext = NULL; - handleInfo.fd = (int)(uintptr_t)handle; - handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + VkImportMemoryFdInfoKHR 
handleInfo = {}; + handleInfo.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR; + handleInfo.pNext = NULL; + handleInfo.fd = (int)(uintptr_t)handle; + handleInfo.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; #endif /* _WIN64 */ - VkMemoryAllocateInfo memAllocation = {}; - memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memAllocation.pNext = (void *)&handleInfo; - memAllocation.allocationSize = memRequirements.size; - memAllocation.memoryTypeIndex = findMemoryType( - m_physicalDevice, memRequirements.memoryTypeBits, properties); + VkMemoryAllocateInfo memAllocation = {}; + memAllocation.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memAllocation.pNext = (void *)&handleInfo; + memAllocation.allocationSize = memRequirements.size; + memAllocation.memoryTypeIndex = findMemoryType(m_physicalDevice, memRequirements.memoryTypeBits, properties); - if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != - VK_SUCCESS) { - throw std::runtime_error("Failed to import allocation!"); - } + if (vkAllocateMemory(m_device, &memAllocation, nullptr, &memory) != VK_SUCCESS) { + throw std::runtime_error("Failed to import allocation!"); + } - vkBindBufferMemory(m_device, buffer, memory, 0); + vkBindBufferMemory(m_device, buffer, memory, 0); } -void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) { - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); +void VulkanBaseApp::copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size) +{ + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - VkBufferCopy copyRegion = {}; - copyRegion.size = size; - vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion); + VkBufferCopy copyRegion = {}; + copyRegion.size = size; + vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion); - endSingleTimeCommands(commandBuffer); + endSingleTimeCommands(commandBuffer); } -void VulkanBaseApp::drawFrame() { - size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; - vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, - std::numeric_limits<uint64_t>::max()); +void VulkanBaseApp::drawFrame() +{ + size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT; + vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits<uint64_t>::max()); - uint32_t imageIndex; - VkResult result = vkAcquireNextImageKHR( - m_device, m_swapChain, std::numeric_limits<uint64_t>::max(), - m_imageAvailableSemaphores[currentFrameIdx], VK_NULL_HANDLE, &imageIndex); - if (result == VK_ERROR_OUT_OF_DATE_KHR) { - recreateSwapChain(); - } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { - throw std::runtime_error("Failed to acquire swap chain image!"); - } + uint32_t imageIndex; + VkResult result = vkAcquireNextImageKHR(m_device, + m_swapChain, + std::numeric_limits<uint64_t>::max(), + m_imageAvailableSemaphores[currentFrameIdx], + VK_NULL_HANDLE, + &imageIndex); + if (result == VK_ERROR_OUT_OF_DATE_KHR) { + recreateSwapChain(); + } + else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { + throw std::runtime_error("Failed to acquire swap chain image!"); + } - updateUniformBuffer(imageIndex, m_currentFrame); + updateUniformBuffer(imageIndex, m_currentFrame); - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - std::vector<VkSemaphore> waitSemaphores; - std::vector<VkPipelineStageFlags> waitStages; + std::vector<VkSemaphore> waitSemaphores; + std::vector<VkPipelineStageFlags> waitStages; -
waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); - waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); - getWaitFrameSemaphores(waitSemaphores, waitStages); + waitSemaphores.push_back(m_imageAvailableSemaphores[currentFrameIdx]); + waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); + getWaitFrameSemaphores(waitSemaphores, waitStages); - submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); - submitInfo.pWaitSemaphores = waitSemaphores.data(); - submitInfo.pWaitDstStageMask = waitStages.data(); + submitInfo.waitSemaphoreCount = (uint32_t)waitSemaphores.size(); + submitInfo.pWaitSemaphores = waitSemaphores.data(); + submitInfo.pWaitDstStageMask = waitStages.data(); - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &m_commandBuffers[imageIndex]; - std::vector<VkSemaphore> signalSemaphores; - getSignalFrameSemaphores(signalSemaphores); - signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); - submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); - submitInfo.pSignalSemaphores = signalSemaphores.data(); + std::vector<VkSemaphore> signalSemaphores; + getSignalFrameSemaphores(signalSemaphores); + signalSemaphores.push_back(m_renderFinishedSemaphores[currentFrameIdx]); + submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); + submitInfo.pSignalSemaphores = signalSemaphores.data(); - vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]); + vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]); - if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, - m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } + if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } - VkPresentInfoKHR presentInfo = {}; - presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - presentInfo.waitSemaphoreCount = 1; - presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = &m_renderFinishedSemaphores[currentFrameIdx]; - VkSwapchainKHR swapChains[] = {m_swapChain}; - presentInfo.swapchainCount = 1; - presentInfo.pSwapchains = swapChains; - presentInfo.pImageIndices = &imageIndex; + VkSwapchainKHR swapChains[] = {m_swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; - result = vkQueuePresentKHR(m_presentQueue, &presentInfo); - if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || - m_framebufferResized) { - recreateSwapChain(); - m_framebufferResized = false; - } else if (result != VK_SUCCESS) { - throw std::runtime_error("Failed to acquire swap chain image!"); - } + result = vkQueuePresentKHR(m_presentQueue, &presentInfo); + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || m_framebufferResized) { + recreateSwapChain(); + m_framebufferResized = false; + } + else if (result != VK_SUCCESS) { + throw std::runtime_error("Failed to present swap chain image!"); + } - m_currentFrame++; + m_currentFrame++; } -void VulkanBaseApp::cleanupSwapChain() { - if (m_depthImageView != VK_NULL_HANDLE) { - vkDestroyImageView(m_device,
m_depthImageView, nullptr); - } - if (m_depthImage != VK_NULL_HANDLE) { - vkDestroyImage(m_device, m_depthImage, nullptr); - } - if (m_depthImageMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_depthImageMemory, nullptr); - } +void VulkanBaseApp::cleanupSwapChain() +{ + if (m_depthImageView != VK_NULL_HANDLE) { + vkDestroyImageView(m_device, m_depthImageView, nullptr); + } + if (m_depthImage != VK_NULL_HANDLE) { + vkDestroyImage(m_device, m_depthImage, nullptr); + } + if (m_depthImageMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_depthImageMemory, nullptr); + } - for (size_t i = 0; i < m_uniformBuffers.size(); i++) { - vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); - vkFreeMemory(m_device, m_uniformMemory[i], nullptr); - } + for (size_t i = 0; i < m_uniformBuffers.size(); i++) { + vkDestroyBuffer(m_device, m_uniformBuffers[i], nullptr); + vkFreeMemory(m_device, m_uniformMemory[i], nullptr); + } - if (m_descriptorPool != VK_NULL_HANDLE) { - vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); - } + if (m_descriptorPool != VK_NULL_HANDLE) { + vkDestroyDescriptorPool(m_device, m_descriptorPool, nullptr); + } - for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { - vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); - } + for (size_t i = 0; i < m_swapChainFramebuffers.size(); i++) { + vkDestroyFramebuffer(m_device, m_swapChainFramebuffers[i], nullptr); + } - if (m_graphicsPipeline != VK_NULL_HANDLE) { - vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); - } + if (m_graphicsPipeline != VK_NULL_HANDLE) { + vkDestroyPipeline(m_device, m_graphicsPipeline, nullptr); + } - if (m_pipelineLayout != VK_NULL_HANDLE) { - vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); - } + if (m_pipelineLayout != VK_NULL_HANDLE) { + vkDestroyPipelineLayout(m_device, m_pipelineLayout, nullptr); + } - if (m_renderPass != VK_NULL_HANDLE) { - vkDestroyRenderPass(m_device, m_renderPass, nullptr); - } + if (m_renderPass != VK_NULL_HANDLE) { + vkDestroyRenderPass(m_device, m_renderPass, nullptr); + } - for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { - vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); - } + for (size_t i = 0; i < m_swapChainImageViews.size(); i++) { + vkDestroyImageView(m_device, m_swapChainImageViews[i], nullptr); + } - if (m_swapChain != VK_NULL_HANDLE) { - vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); - } + if (m_swapChain != VK_NULL_HANDLE) { + vkDestroySwapchainKHR(m_device, m_swapChain, nullptr); + } } -void VulkanBaseApp::recreateSwapChain() { - int width, height; +void VulkanBaseApp::recreateSwapChain() +{ + int width, height; - glfwGetFramebufferSize(m_window, &width, &height); - while (width == 0 || height == 0) { - glfwWaitEvents(); glfwGetFramebufferSize(m_window, &width, &height); - } + while (width == 0 || height == 0) { + glfwWaitEvents(); + glfwGetFramebufferSize(m_window, &width, &height); + } - vkDeviceWaitIdle(m_device); + vkDeviceWaitIdle(m_device); - cleanupSwapChain(); + cleanupSwapChain(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createGraphicsPipeline(); - createDepthResources(); - createFramebuffers(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createGraphicsPipeline(); + createDepthResources(); + createFramebuffers(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); 
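+ // Everything that depends on the swap chain is rebuilt above in dependency order.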
} -void VulkanBaseApp::mainLoop() { - while (!glfwWindowShouldClose(m_window)) { - glfwPollEvents(); - drawFrame(); - } - vkDeviceWaitIdle(m_device); +void VulkanBaseApp::mainLoop() +{ + while (!glfwWindowShouldClose(m_window)) { + glfwPollEvents(); + drawFrame(); + } + vkDeviceWaitIdle(m_device); } -void readFile(std::istream &s, std::vector<char> &data) { - s.seekg(0, std::ios_base::end); - data.resize(s.tellg()); - s.clear(); - s.seekg(0, std::ios_base::beg); - s.read(data.data(), data.size()); +void readFile(std::istream &s, std::vector<char> &data) +{ + s.seekg(0, std::ios_base::end); + data.resize(s.tellg()); + s.clear(); + s.seekg(0, std::ios_base::beg); + s.read(data.data(), data.size()); } diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h index 53d2ef65..814e321e 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h @@ -34,127 +34,128 @@ #include <vulkan/vulkan.h> #ifdef _WIN64 #define NOMINMAX -#include <vulkan/vulkan_win32.h> #include <windows.h> +#include <vulkan/vulkan_win32.h> #endif /* _WIN64 */ struct GLFWwindow; -class VulkanBaseApp { - public: - VulkanBaseApp(const std::string& appName, bool enableValidation = false); - static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); - static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); - virtual ~VulkanBaseApp(); - void init(); - void* getMemHandle(VkDeviceMemory memory, - VkExternalMemoryHandleTypeFlagBits handleType); - void* getSemaphoreHandle(VkSemaphore semaphore, - VkExternalSemaphoreHandleTypeFlagBits handleType); - bool isVkPhysicalDeviceUuid(void* Uuid); - void createExternalSemaphore( - VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); - void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, VkBuffer& buffer, - VkDeviceMemory& bufferMemory); - void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, - VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, - VkBuffer& buffer, VkDeviceMemory& bufferMemory); - void importExternalBuffer(void* handle, - VkExternalMemoryHandleTypeFlagBits handleType, - size_t size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, VkBuffer& buffer, - VkDeviceMemory& memory); - void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); - VkCommandBuffer beginSingleTimeCommands(); - void endSingleTimeCommands(VkCommandBuffer commandBuffer); - void mainLoop(); +class VulkanBaseApp +{ +public: + VulkanBaseApp(const std::string &appName, bool enableValidation = false); + static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); + static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); + virtual ~VulkanBaseApp(); + void init(); + void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); + void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + bool isVkPhysicalDeviceUuid(void *Uuid); + void createExternalSemaphore(VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + void createBuffer(VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkBuffer &buffer, + VkDeviceMemory &bufferMemory); + void createExternalBuffer(VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer &buffer, + VkDeviceMemory
&bufferMemory); + void importExternalBuffer(void *handle, + VkExternalMemoryHandleTypeFlagBits handleType, + size_t size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkBuffer &buffer, + VkDeviceMemory &memory); + void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); + VkCommandBuffer beginSingleTimeCommands(); + void endSingleTimeCommands(VkCommandBuffer commandBuffer); + void mainLoop(); - protected: - const std::string m_appName; - const bool m_enableValidation; - VkInstance m_instance; - VkDebugUtilsMessengerEXT m_debugMessenger; - VkSurfaceKHR m_surface; - VkPhysicalDevice m_physicalDevice; - uint8_t m_deviceUUID[VK_UUID_SIZE]; - VkDevice m_device; - VkQueue m_graphicsQueue; - VkQueue m_presentQueue; - VkSwapchainKHR m_swapChain; - std::vector m_swapChainImages; - VkFormat m_swapChainFormat; - VkExtent2D m_swapChainExtent; - std::vector m_swapChainImageViews; - std::vector > m_shaderFiles; - VkRenderPass m_renderPass; - VkPipelineLayout m_pipelineLayout; - VkPipeline m_graphicsPipeline; - std::vector m_swapChainFramebuffers; - VkCommandPool m_commandPool; - std::vector m_commandBuffers; - std::vector m_imageAvailableSemaphores; - std::vector m_renderFinishedSemaphores; - std::vector m_inFlightFences; - std::vector m_uniformBuffers; - std::vector m_uniformMemory; - VkDescriptorSetLayout m_descriptorSetLayout; - VkDescriptorPool m_descriptorPool; - std::vector m_descriptorSets; +protected: + const std::string m_appName; + const bool m_enableValidation; + VkInstance m_instance; + VkDebugUtilsMessengerEXT m_debugMessenger; + VkSurfaceKHR m_surface; + VkPhysicalDevice m_physicalDevice; + uint8_t m_deviceUUID[VK_UUID_SIZE]; + VkDevice m_device; + VkQueue m_graphicsQueue; + VkQueue m_presentQueue; + VkSwapchainKHR m_swapChain; + std::vector m_swapChainImages; + VkFormat m_swapChainFormat; + VkExtent2D m_swapChainExtent; + std::vector m_swapChainImageViews; + std::vector> m_shaderFiles; + VkRenderPass m_renderPass; + VkPipelineLayout m_pipelineLayout; + VkPipeline m_graphicsPipeline; + std::vector m_swapChainFramebuffers; + VkCommandPool m_commandPool; + std::vector m_commandBuffers; + std::vector m_imageAvailableSemaphores; + std::vector m_renderFinishedSemaphores; + std::vector m_inFlightFences; + std::vector m_uniformBuffers; + std::vector m_uniformMemory; + VkDescriptorSetLayout m_descriptorSetLayout; + VkDescriptorPool m_descriptorPool; + std::vector m_descriptorSets; - VkImage m_depthImage; - VkDeviceMemory m_depthImageMemory; - VkImageView m_depthImageView; - size_t m_currentFrame; - bool m_framebufferResized; + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; - virtual void initVulkanApp() {} - virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} - virtual std::vector getRequiredExtensions() const; - virtual std::vector getRequiredDeviceExtensions() const; - virtual void getVertexDescriptions( - std::vector& bindingDesc, - std::vector& attribDesc); - virtual void getAssemblyStateInfo( - VkPipelineInputAssemblyStateCreateInfo& info); - virtual void getWaitFrameSemaphores( - std::vector& wait, - std::vector& waitStages) const; - virtual void getSignalFrameSemaphores(std::vector& signal) const; - virtual VkDeviceSize getUniformSize() const; - virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame); - virtual void drawFrame(); + virtual void initVulkanApp() {} + virtual void fillRenderingCommandBuffer(VkCommandBuffer &buffer) {} + 
virtual std::vector getRequiredExtensions() const; + virtual std::vector getRequiredDeviceExtensions() const; + virtual void getVertexDescriptions(std::vector &bindingDesc, + std::vector &attribDesc); + virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info); + virtual void getWaitFrameSemaphores(std::vector &wait, + std::vector &waitStages) const; + virtual void getSignalFrameSemaphores(std::vector &signal) const; + virtual VkDeviceSize getUniformSize() const; + virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame); + virtual void drawFrame(); - private: - GLFWwindow* m_window; +private: + GLFWwindow *m_window; - void initWindow(); - void initVulkan(); - void createInstance(); - void createSurface(); - void createDevice(); - void createSwapChain(); - void createImageViews(); - void createRenderPass(); - void createDescriptorSetLayout(); - void createGraphicsPipeline(); - void createFramebuffers(); - void createCommandPool(); - void createDepthResources(); - void createUniformBuffers(); - void createDescriptorPool(); - void createDescriptorSets(); - void createCommandBuffers(); - void createSyncObjects(); + void initWindow(); + void initVulkan(); + void createInstance(); + void createSurface(); + void createDevice(); + void createSwapChain(); + void createImageViews(); + void createRenderPass(); + void createDescriptorSetLayout(); + void createGraphicsPipeline(); + void createFramebuffers(); + void createCommandPool(); + void createDepthResources(); + void createUniformBuffers(); + void createDescriptorPool(); + void createDescriptorSets(); + void createCommandBuffers(); + void createSyncObjects(); - void cleanupSwapChain(); - void recreateSwapChain(); + void cleanupSwapChain(); + void recreateSwapChain(); - bool isSuitableDevice(VkPhysicalDevice dev) const; - static void resizeCallback(GLFWwindow* window, int width, int height); + bool isSuitableDevice(VkPhysicalDevice dev) const; + static void resizeCallback(GLFWwindow *window, int width, int height); }; -void readFile(std::istream& s, std::vector& data); +void readFile(std::istream &s, std::vector &data); #endif /* __VULKANBASEAPP_H__ */ diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanCudaInterop.h b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanCudaInterop.h index 4ad5cb75..9d9da7d3 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanCudaInterop.h +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanCudaInterop.h @@ -30,53 +30,49 @@ #define __VKCUDA_H__ #include + #include "cuda.h" #define CUDA_DRIVER_API #include -bool isDeviceCompatible(void *Uuid, size_t size) { - int cudaDevice = cudaInvalidDeviceId; - int deviceCount; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); +bool isDeviceCompatible(void *Uuid, size_t size) +{ + int cudaDevice = cudaInvalidDeviceId; + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - for (int i = 0; i < deviceCount; ++i) { - cudaDeviceProp devProp = {}; - checkCudaErrors(cudaGetDeviceProperties(&devProp, i)); - if (!memcmp(&devProp.uuid, Uuid, size)) { - cudaDevice = i; - break; + for (int i = 0; i < deviceCount; ++i) { + cudaDeviceProp devProp = {}; + checkCudaErrors(cudaGetDeviceProperties(&devProp, i)); + if (!memcmp(&devProp.uuid, Uuid, size)) { + cudaDevice = i; + break; + } + } + if (cudaDevice == cudaInvalidDeviceId) { + return false; } - } - if (cudaDevice == cudaInvalidDeviceId) { - return false; - } - int deviceSupportsHandle = 0; - int attributeVal = 0; - int deviceComputeMode = 0; + int 
deviceSupportsHandle = 0; + int attributeVal = 0; + int deviceComputeMode = 0; - checkCudaErrors(cuDeviceGetAttribute( - &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice)); - checkCudaErrors(cuDeviceGetAttribute( - &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cudaDevice)); + checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice)); + checkCudaErrors( + cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cudaDevice)); #if defined(__linux__) - checkCudaErrors(cuDeviceGetAttribute( - &deviceSupportsHandle, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, - cudaDevice)); + checkCudaErrors(cuDeviceGetAttribute( + &deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, cudaDevice)); #else - checkCudaErrors(cuDeviceGetAttribute( - &deviceSupportsHandle, - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice)); + checkCudaErrors(cuDeviceGetAttribute( + &deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice)); #endif - if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || - !deviceSupportsHandle) { - return false; - } - return true; + if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || !deviceSupportsHandle) { + return false; + } + return true; } -#endif // __VKCUDA_H__ +#endif // __VKCUDA_H__ diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/main.cpp b/Samples/5_Domain_Specific/simpleVulkanMMAP/main.cpp index dd8b5c02..3f84d5ea 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/main.cpp +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/main.cpp @@ -32,20 +32,18 @@ * Vulkan. */ -#include "VulkanBaseApp.h" - -#include -#include -#include #include +#include +#include +#include +#include #include "MonteCarloPi.h" -#include -#include - +#include "VulkanBaseApp.h" +#include "helper_cuda.h" #include "helper_multiprocess.h" -//#define DEBUG +// #define DEBUG #ifndef DEBUG #define ENABLE_VALIDATION (false) #else @@ -56,297 +54,295 @@ std::string execution_path; -class VulkanCudaPi : public VulkanBaseApp { - typedef struct UniformBufferObject_st { float frame; } UniformBufferObject; +class VulkanCudaPi : public VulkanBaseApp +{ + typedef struct UniformBufferObject_st + { + float frame; + } UniformBufferObject; - VkBuffer m_inCircleBuffer, m_xyPositionBuffer; - VkDeviceMemory m_inCircleMemory, m_xyPositionMemory; - VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; - MonteCarloPiSimulation m_sim; - UniformBufferObject m_ubo; - cudaStream_t m_stream; - cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore; - using chrono_tp = std::chrono::time_point; - chrono_tp m_lastTime; - size_t m_lastFrame; + VkBuffer m_inCircleBuffer, m_xyPositionBuffer; + VkDeviceMemory m_inCircleMemory, m_xyPositionMemory; + VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; + MonteCarloPiSimulation m_sim; + UniformBufferObject m_ubo; + cudaStream_t m_stream; + cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore; + using chrono_tp = std::chrono::time_point; + chrono_tp m_lastTime; + size_t m_lastFrame; - public: - VulkanCudaPi(size_t num_points) - : VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION), - m_inCircleBuffer(VK_NULL_HANDLE), - m_xyPositionBuffer(VK_NULL_HANDLE), - m_inCircleMemory(VK_NULL_HANDLE), - m_xyPositionMemory(VK_NULL_HANDLE), - m_sim(num_points), - m_ubo(), - m_stream(0), - m_vkWaitSemaphore(VK_NULL_HANDLE), - 
m_vkSignalSemaphore(VK_NULL_HANDLE), - m_cudaWaitSemaphore(), - m_cudaSignalSemaphore(), - m_lastFrame(0) { - // Add our compiled vulkan shader files - char* vertex_shader_path = - sdkFindFilePath("vert.spv", execution_path.c_str()); - char* fragment_shader_path = - sdkFindFilePath("frag.spv", execution_path.c_str()); - m_shaderFiles.push_back( - std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); - m_shaderFiles.push_back( - std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); - } - - ~VulkanCudaPi() { - if (m_stream) { - // Make sure there's no pending work before we start tearing down - checkCudaErrors(cudaStreamSynchronize(m_stream)); - checkCudaErrors(cudaStreamDestroy(m_stream)); +public: + VulkanCudaPi(size_t num_points) + : VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION) + , m_inCircleBuffer(VK_NULL_HANDLE) + , m_xyPositionBuffer(VK_NULL_HANDLE) + , m_inCircleMemory(VK_NULL_HANDLE) + , m_xyPositionMemory(VK_NULL_HANDLE) + , m_sim(num_points) + , m_ubo() + , m_stream(0) + , m_vkWaitSemaphore(VK_NULL_HANDLE) + , m_vkSignalSemaphore(VK_NULL_HANDLE) + , m_cudaWaitSemaphore() + , m_cudaSignalSemaphore() + , m_lastFrame(0) + { + // Add our compiled vulkan shader files + char *vertex_shader_path = sdkFindFilePath("vert.spv", execution_path.c_str()); + char *fragment_shader_path = sdkFindFilePath("frag.spv", execution_path.c_str()); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); } - if (m_vkSignalSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); - vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); - } - if (m_vkWaitSemaphore != VK_NULL_HANDLE) { - checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); - vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); - } - if (m_xyPositionBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr); - } - if (m_xyPositionMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_xyPositionMemory, nullptr); - } - if (m_inCircleBuffer != VK_NULL_HANDLE) { - vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr); - } - if (m_inCircleMemory != VK_NULL_HANDLE) { - vkFreeMemory(m_device, m_inCircleMemory, nullptr); - } - } + ~VulkanCudaPi() + { + if (m_stream) { + // Make sure there's no pending work before we start tearing down + checkCudaErrors(cudaStreamSynchronize(m_stream)); + checkCudaErrors(cudaStreamDestroy(m_stream)); + } - void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) { - VkBuffer vertexBuffers[] = {m_inCircleBuffer, m_xyPositionBuffer}; - VkDeviceSize offsets[] = {0, 0}; - vkCmdBindVertexBuffers(commandBuffer, 0, - sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), - vertexBuffers, offsets); - vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0); - } - - void getVertexDescriptions( - std::vector& bindingDesc, - std::vector& attribDesc) { - bindingDesc.resize(2); - attribDesc.resize(2); - - bindingDesc[0].binding = 0; - bindingDesc[0].stride = sizeof(float); - bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - bindingDesc[1].binding = 1; - bindingDesc[1].stride = sizeof(vec2); - bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - - attribDesc[0].binding = 0; - attribDesc[0].location = 0; - attribDesc[0].format = VK_FORMAT_R32_SFLOAT; - attribDesc[0].offset = 0; - - attribDesc[1].binding = 1; - attribDesc[1].location = 1; - 
attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; - attribDesc[1].offset = 0; - } - - void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) { - info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - info.primitiveRestartEnable = VK_FALSE; - } - - void getWaitFrameSemaphores( - std::vector& wait, - std::vector& waitStages) const { - if (m_currentFrame != 0) { - // Have vulkan wait until cuda is done with the vertex buffer before - // rendering - // We don't do this on the first frame, as the wait semaphore hasn't been - // initialized yet - wait.push_back(m_vkWaitSemaphore); - // We want to wait until all the pipeline commands are complete before - // letting cuda work - waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); - } - } - - void getSignalFrameSemaphores(std::vector& signal) const { - // Add this semaphore for vulkan to signal once the vertex buffer is ready - // for cuda to modify - signal.push_back(m_vkSignalSemaphore); - } - - void initVulkanApp() { - const size_t nVerts = m_sim.getNumPoints(); - - // Obtain cuda device id for the device corresponding to the Vulkan physical - // device - int deviceCount; - int cudaDevice = cudaInvalidDeviceId; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - for (int dev = 0; dev < deviceCount; ++dev) { - cudaDeviceProp devProp = {}; - checkCudaErrors(cudaGetDeviceProperties(&devProp, dev)); - if (isVkPhysicalDeviceUuid(&devProp.uuid)) { - cudaDevice = dev; - break; - } - } - if (cudaDevice == cudaInvalidDeviceId) { - throw std::runtime_error("No Suitable device found!"); + if (m_vkSignalSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); + vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); + } + if (m_vkWaitSemaphore != VK_NULL_HANDLE) { + checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); + vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); + } + if (m_xyPositionBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr); + } + if (m_xyPositionMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_xyPositionMemory, nullptr); + } + if (m_inCircleBuffer != VK_NULL_HANDLE) { + vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr); + } + if (m_inCircleMemory != VK_NULL_HANDLE) { + vkFreeMemory(m_device, m_inCircleMemory, nullptr); + } } - // On the corresponding cuda device, create the cuda stream we'll using - checkCudaErrors(cudaSetDevice(cudaDevice)); - checkCudaErrors( - cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); - m_sim.initSimulation(cudaDevice, m_stream); - - importExternalBuffer( - (void*)(uintptr_t)m_sim.getPositionShareableHandle(), - getDefaultMemHandleType(), nVerts * sizeof(vec2), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer, - m_xyPositionMemory); - - importExternalBuffer( - (void*)(uintptr_t)m_sim.getInCircleShareableHandle(), - getDefaultMemHandleType(), nVerts * sizeof(float), - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer, - m_inCircleMemory); - - // Create the semaphore vulkan will signal when it's done with the vertex - // buffer - createExternalSemaphore(m_vkSignalSemaphore, - getDefaultSemaphoreHandleType()); - // Create the semaphore vulkan will wait for before using the vertex buffer - createExternalSemaphore(m_vkWaitSemaphore, 
getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait - importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, - getDefaultSemaphoreHandleType()); - // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait - importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, - getDefaultSemaphoreHandleType()); - } - - void importCudaExternalSemaphore( - cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, - VkExternalSemaphoreHandleTypeFlagBits handleType) { - cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; - - if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeOpaqueWin32; - } else if (handleType & - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - externalSemaphoreHandleDesc.type = - cudaExternalSemaphoreHandleTypeOpaqueFd; - } else { - throw std::runtime_error("Unknown handle type requested!"); + void fillRenderingCommandBuffer(VkCommandBuffer &commandBuffer) + { + VkBuffer vertexBuffers[] = {m_inCircleBuffer, m_xyPositionBuffer}; + VkDeviceSize offsets[] = {0, 0}; + vkCmdBindVertexBuffers( + commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets); + vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0); } + void getVertexDescriptions(std::vector &bindingDesc, + std::vector &attribDesc) + { + bindingDesc.resize(2); + attribDesc.resize(2); + + bindingDesc[0].binding = 0; + bindingDesc[0].stride = sizeof(float); + bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + bindingDesc[1].binding = 1; + bindingDesc[1].stride = sizeof(vec2); + bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + attribDesc[0].binding = 0; + attribDesc[0].location = 0; + attribDesc[0].format = VK_FORMAT_R32_SFLOAT; + attribDesc[0].offset = 0; + + attribDesc[1].binding = 1; + attribDesc[1].location = 1; + attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; + attribDesc[1].offset = 0; + } + + void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) + { + info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + info.primitiveRestartEnable = VK_FALSE; + } + + void getWaitFrameSemaphores(std::vector &wait, std::vector &waitStages) const + { + if (m_currentFrame != 0) { + // Have vulkan wait until cuda is done with the vertex buffer before + // rendering + // We don't do this on the first frame, as the wait semaphore hasn't been + // initialized yet + wait.push_back(m_vkWaitSemaphore); + // We want to wait until all the pipeline commands are complete before + // letting cuda work + waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + } + } + + void getSignalFrameSemaphores(std::vector &signal) const + { + // Add this semaphore for vulkan to signal once the vertex buffer is ready + // for cuda to modify + signal.push_back(m_vkSignalSemaphore); + } + + void initVulkanApp() + { + const size_t nVerts = m_sim.getNumPoints(); + + // Obtain cuda device id for the device corresponding to the Vulkan physical + // device + int deviceCount; + int cudaDevice = cudaInvalidDeviceId; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + for (int dev = 0; dev < deviceCount; ++dev) { + cudaDeviceProp devProp = 
{}; + checkCudaErrors(cudaGetDeviceProperties(&devProp, dev)); + if (isVkPhysicalDeviceUuid(&devProp.uuid)) { + cudaDevice = dev; + break; + } + } + if (cudaDevice == cudaInvalidDeviceId) { + throw std::runtime_error("No Suitable device found!"); + } + + // On the corresponding cuda device, create the cuda stream we'll using + checkCudaErrors(cudaSetDevice(cudaDevice)); + checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); + m_sim.initSimulation(cudaDevice, m_stream); + + importExternalBuffer((void *)(uintptr_t)m_sim.getPositionShareableHandle(), + getDefaultMemHandleType(), + nVerts * sizeof(vec2), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + m_xyPositionBuffer, + m_xyPositionMemory); + + importExternalBuffer((void *)(uintptr_t)m_sim.getInCircleShareableHandle(), + getDefaultMemHandleType(), + nVerts * sizeof(float), + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + m_inCircleBuffer, + m_inCircleMemory); + + // Create the semaphore vulkan will signal when it's done with the vertex + // buffer + createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); + // Create the semaphore vulkan will wait for before using the vertex buffer + createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait + importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); + // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait + importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); + } + + void importCudaExternalSemaphore(cudaExternalSemaphore_t &cudaSem, + VkSemaphore &vkSem, + VkExternalSemaphoreHandleTypeFlagBits handleType) + { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; + + if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + } + else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + } + else { + throw std::runtime_error("Unknown handle type requested!"); + } + #ifdef _WIN64 - externalSemaphoreHandleDesc.handle.win32.handle = - (HANDLE)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); #else - externalSemaphoreHandleDesc.handle.fd = - (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); + externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); #endif - externalSemaphoreHandleDesc.flags = 0; + externalSemaphoreHandleDesc.flags = 0; - checkCudaErrors( - cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); - } - - VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); } - - void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) { - m_ubo.frame = (float)globalFrame; - void* data; - vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, - &data); - memcpy(data, &m_ubo, sizeof(m_ubo)); - vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); - } - - 
std::vector getRequiredExtensions() const { - std::vector extensions; - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME); - extensions.push_back( - VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - return extensions; - } - - std::vector getRequiredDeviceExtensions() const { - std::vector extensions; - - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); -#ifdef _WIN64 - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); -#else - extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); - extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); -#endif /* _WIN64 */ - return extensions; - } - - void drawFrame() { - static chrono_tp startTime = std::chrono::high_resolution_clock::now(); - - chrono_tp currentTime = std::chrono::high_resolution_clock::now(); - float time = std::chrono::duration( - currentTime - startTime) - .count(); - - if (m_currentFrame == 0) { - m_lastTime = startTime; + checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); } - cudaExternalSemaphoreWaitParams waitParams = {}; - waitParams.flags = 0; - waitParams.params.fence.value = 0; + VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); } - cudaExternalSemaphoreSignalParams signalParams = {}; - signalParams.flags = 0; - signalParams.params.fence.value = 0; + void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) + { + m_ubo.frame = (float)globalFrame; + void *data; + vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data); + memcpy(data, &m_ubo, sizeof(m_ubo)); + vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); + } - // Have vulkan draw the current frame... 
- VulkanBaseApp::drawFrame(); - // Wait for vulkan to complete it's work - checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, - &waitParams, 1, m_stream)); - // Now step the simulation - m_sim.stepSimulation(time, m_stream); + std::vector getRequiredExtensions() const + { + std::vector extensions; + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME); + extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + return extensions; + } - // Signal vulkan to continue with the updated buffers - checkCudaErrors(cudaSignalExternalSemaphoresAsync( - &m_cudaSignalSemaphore, &signalParams, 1, m_stream)); - } + std::vector getRequiredDeviceExtensions() const + { + std::vector extensions; + + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); +#ifdef _WIN64 + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); +#else + extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); + extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); +#endif /* _WIN64 */ + return extensions; + } + + void drawFrame() + { + static chrono_tp startTime = std::chrono::high_resolution_clock::now(); + + chrono_tp currentTime = std::chrono::high_resolution_clock::now(); + float time = std::chrono::duration(currentTime - startTime).count(); + + if (m_currentFrame == 0) { + m_lastTime = startTime; + } + + cudaExternalSemaphoreWaitParams waitParams = {}; + waitParams.flags = 0; + waitParams.params.fence.value = 0; + + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.flags = 0; + signalParams.params.fence.value = 0; + + // Have vulkan draw the current frame... 
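+        // The per-frame protocol below is: draw with Vulkan, have CUDA wait on
+        // Vulkan's signal, step the simulation on m_stream, then have CUDA
+        // signal so the next frame can safely consume the updated vertex buffers.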
+ VulkanBaseApp::drawFrame(); + // Wait for vulkan to complete it's work + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream)); + // Now step the simulation + m_sim.stepSimulation(time, m_stream); + + // Signal vulkan to continue with the updated buffers + checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream)); + } }; -int main(int argc, char** argv) { - execution_path = argv[0]; - VulkanCudaPi app(NUM_SIMULATION_POINTS); - app.init(); - app.mainLoop(); - return 0; +int main(int argc, char **argv) +{ + execution_path = argv[0]; + VulkanCudaPi app(NUM_SIMULATION_POINTS); + app.init(); + app.mainLoop(); + return 0; } diff --git a/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.cpp b/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.cpp index 984f1cfe..d2d6f0fa 100644 --- a/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.cpp @@ -28,216 +28,229 @@ #include #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION #include + #include "GLSLProgram.h" -GLSLProgram::GLSLProgram(const char *vsource, const char *fsource) { - mProg = compileProgram(vsource, 0, fsource); +GLSLProgram::GLSLProgram(const char *vsource, const char *fsource) { mProg = compileProgram(vsource, 0, fsource); } + +GLSLProgram::GLSLProgram(const char *vsource, const char *gsource, const char *fsource, GLenum gsInput, GLenum gsOutput) +{ + mProg = compileProgram(vsource, gsource, fsource, gsInput, gsOutput); } -GLSLProgram::GLSLProgram(const char *vsource, const char *gsource, - const char *fsource, GLenum gsInput, GLenum gsOutput) { - mProg = compileProgram(vsource, gsource, fsource, gsInput, gsOutput); -} - -GLSLProgram::~GLSLProgram() { - if (mProg) { - glDeleteProgram(mProg); - } +GLSLProgram::~GLSLProgram() +{ + if (mProg) { + glDeleteProgram(mProg); + } } void GLSLProgram::enable() { glUseProgram(mProg); } void GLSLProgram::disable() { glUseProgram(0); } -void GLSLProgram::setUniform1f(const char *name, float value) { - GLint loc = glGetUniformLocation(mProg, name); +void GLSLProgram::setUniform1f(const char *name, float value) +{ + GLint loc = glGetUniformLocation(mProg, name); - if (loc >= 0) { - glUniform1f(loc, value); - } else { + if (loc >= 0) { + glUniform1f(loc, value); + } + else { #if _DEBUG - fprintf(stderr, "Error setting parameter '%s'\n", name); + fprintf(stderr, "Error setting parameter '%s'\n", name); #endif - } + } } -void GLSLProgram::setUniform2f(const char *name, float x, float y) { - GLint loc = glGetUniformLocation(mProg, name); +void GLSLProgram::setUniform2f(const char *name, float x, float y) +{ + GLint loc = glGetUniformLocation(mProg, name); - if (loc >= 0) { - glUniform2f(loc, x, y); - } else { + if (loc >= 0) { + glUniform2f(loc, x, y); + } + else { #if _DEBUG - fprintf(stderr, "Error setting parameter '%s'\n", name); + fprintf(stderr, "Error setting parameter '%s'\n", name); #endif - } + } } -void GLSLProgram::setUniform3f(const char *name, float x, float y, float z) { - GLint loc = glGetUniformLocation(mProg, name); +void GLSLProgram::setUniform3f(const char *name, float x, float y, float z) +{ + GLint loc = glGetUniformLocation(mProg, name); - if (loc >= 0) { - glUniform3f(loc, x, y, z); - } else { + if (loc >= 0) { + glUniform3f(loc, x, y, z); + } + else { #if _DEBUG - fprintf(stderr, "Error setting parameter '%s'\n", name); + fprintf(stderr, "Error setting parameter '%s'\n", name); #endif - } + } } -void 
GLSLProgram::setUniform4f(const char *name, float x, float y, float z, - float w) { - GLint loc = glGetUniformLocation(mProg, name); +void GLSLProgram::setUniform4f(const char *name, float x, float y, float z, float w) +{ + GLint loc = glGetUniformLocation(mProg, name); - if (loc >= 0) { - glUniform4f(loc, x, y, z, w); - } else { + if (loc >= 0) { + glUniform4f(loc, x, y, z, w); + } + else { #if _DEBUG - fprintf(stderr, "Error setting parameter '%s'\n", name); + fprintf(stderr, "Error setting parameter '%s'\n", name); #endif - } + } } -void GLSLProgram::setUniformMatrix4fv(const GLchar *name, GLfloat *m, - bool transpose) { - GLint loc = glGetUniformLocation(mProg, name); +void GLSLProgram::setUniformMatrix4fv(const GLchar *name, GLfloat *m, bool transpose) +{ + GLint loc = glGetUniformLocation(mProg, name); - if (loc >= 0) { - glUniformMatrix4fv(loc, 1, transpose, m); - } else { + if (loc >= 0) { + glUniformMatrix4fv(loc, 1, transpose, m); + } + else { #if _DEBUG - fprintf(stderr, "Error setting parameter '%s'\n", name); + fprintf(stderr, "Error setting parameter '%s'\n", name); #endif - } + } } -void GLSLProgram::setUniformfv(const GLchar *name, GLfloat *v, int elementSize, - int count) { - GLint loc = glGetUniformLocation(mProg, name); +void GLSLProgram::setUniformfv(const GLchar *name, GLfloat *v, int elementSize, int count) +{ + GLint loc = glGetUniformLocation(mProg, name); - if (loc == -1) { + if (loc == -1) { #ifdef _DEBUG - fprintf(stderr, "Error setting parameter '%s'\n", name); + fprintf(stderr, "Error setting parameter '%s'\n", name); #endif - return; - } - - switch (elementSize) { - case 1: - glUniform1fv(loc, count, v); - break; - - case 2: - glUniform2fv(loc, count, v); - break; - - case 3: - glUniform3fv(loc, count, v); - break; - - case 4: - glUniform4fv(loc, count, v); - break; - } -} - -void GLSLProgram::bindTexture(const char *name, GLuint tex, GLenum target, - GLint unit) { - GLint loc = glGetUniformLocation(mProg, name); - - if (loc >= 0) { - glActiveTexture(GL_TEXTURE0 + unit); - glBindTexture(target, tex); - glUseProgram(mProg); - glUniform1i(loc, unit); - glActiveTexture(GL_TEXTURE0); - } else { -#if _DEBUG - fprintf(stderr, "Error binding texture '%s'\n", name); -#endif - } -} - -GLuint GLSLProgram::checkCompileStatus(GLuint shader, GLint *status) { - glGetShaderiv(shader, GL_COMPILE_STATUS, status); - - if (!(*status)) { - char log[2048]; - int len; - glGetShaderInfoLog(shader, 2048, (GLsizei *)&len, log); - printf("Error: shader(%04d), Info log: %s\n", (int)shader, log); - glDeleteShader(shader); - return 0; - } - - return 1; -} - -GLuint GLSLProgram::compileProgram(const char *vsource, const char *gsource, - const char *fsource, GLenum gsInput, - GLenum gsOutput) { - GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER); - GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); - - GLint compiled = 0; - - glShaderSource(vertexShader, 1, &vsource, 0); - glShaderSource(fragmentShader, 1, &fsource, 0); - - glCompileShader(vertexShader); - - if (checkCompileStatus(vertexShader, &compiled) == 0) { - printf(":\n"); - printf("%s\n", vsource); - return 0; - } - - glCompileShader(fragmentShader); - - if (checkCompileStatus(fragmentShader, &compiled) == 0) { - printf(":\n"); - printf("%s\n", fsource); - return 0; - } - - GLuint program = glCreateProgram(); - - glAttachShader(program, vertexShader); - glAttachShader(program, fragmentShader); - - if (gsource) { - GLuint geomShader = glCreateShader(GL_GEOMETRY_SHADER_EXT); - glShaderSource(geomShader, 1, &gsource, 0); 
- glCompileShader(geomShader); - glGetShaderiv(geomShader, GL_COMPILE_STATUS, (GLint *)&compiled); - - if (checkCompileStatus(geomShader, &compiled) == 0) { - printf(":\n"); - printf("%s\n", gsource); - return 0; + return; } - glAttachShader(program, geomShader); + switch (elementSize) { + case 1: + glUniform1fv(loc, count, v); + break; - glProgramParameteriEXT(program, GL_GEOMETRY_INPUT_TYPE_EXT, gsInput); - glProgramParameteriEXT(program, GL_GEOMETRY_OUTPUT_TYPE_EXT, gsOutput); - glProgramParameteriEXT(program, GL_GEOMETRY_VERTICES_OUT_EXT, 4); - } + case 2: + glUniform2fv(loc, count, v); + break; - glLinkProgram(program); + case 3: + glUniform3fv(loc, count, v); + break; - // check if program linked - GLint success = 0; - glGetProgramiv(program, GL_LINK_STATUS, &success); - - if (!success) { - char temp[1024]; - glGetProgramInfoLog(program, 1024, 0, temp); - fprintf(stderr, "Failed to link program:\n%s\n", temp); - glDeleteProgram(program); - program = 0; - exit(EXIT_FAILURE); - } - - return program; + case 4: + glUniform4fv(loc, count, v); + break; + } +} + +void GLSLProgram::bindTexture(const char *name, GLuint tex, GLenum target, GLint unit) +{ + GLint loc = glGetUniformLocation(mProg, name); + + if (loc >= 0) { + glActiveTexture(GL_TEXTURE0 + unit); + glBindTexture(target, tex); + glUseProgram(mProg); + glUniform1i(loc, unit); + glActiveTexture(GL_TEXTURE0); + } + else { +#if _DEBUG + fprintf(stderr, "Error binding texture '%s'\n", name); +#endif + } +} + +GLuint GLSLProgram::checkCompileStatus(GLuint shader, GLint *status) +{ + glGetShaderiv(shader, GL_COMPILE_STATUS, status); + + if (!(*status)) { + char log[2048]; + int len; + glGetShaderInfoLog(shader, 2048, (GLsizei *)&len, log); + printf("Error: shader(%04d), Info log: %s\n", (int)shader, log); + glDeleteShader(shader); + return 0; + } + + return 1; +} + +GLuint GLSLProgram::compileProgram(const char *vsource, + const char *gsource, + const char *fsource, + GLenum gsInput, + GLenum gsOutput) +{ + GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER); + GLuint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); + + GLint compiled = 0; + + glShaderSource(vertexShader, 1, &vsource, 0); + glShaderSource(fragmentShader, 1, &fsource, 0); + + glCompileShader(vertexShader); + + if (checkCompileStatus(vertexShader, &compiled) == 0) { + printf(":\n"); + printf("%s\n", vsource); + return 0; + } + + glCompileShader(fragmentShader); + + if (checkCompileStatus(fragmentShader, &compiled) == 0) { + printf(":\n"); + printf("%s\n", fsource); + return 0; + } + + GLuint program = glCreateProgram(); + + glAttachShader(program, vertexShader); + glAttachShader(program, fragmentShader); + + if (gsource) { + GLuint geomShader = glCreateShader(GL_GEOMETRY_SHADER_EXT); + glShaderSource(geomShader, 1, &gsource, 0); + glCompileShader(geomShader); + glGetShaderiv(geomShader, GL_COMPILE_STATUS, (GLint *)&compiled); + + if (checkCompileStatus(geomShader, &compiled) == 0) { + printf(":\n"); + printf("%s\n", gsource); + return 0; + } + + glAttachShader(program, geomShader); + + glProgramParameteriEXT(program, GL_GEOMETRY_INPUT_TYPE_EXT, gsInput); + glProgramParameteriEXT(program, GL_GEOMETRY_OUTPUT_TYPE_EXT, gsOutput); + glProgramParameteriEXT(program, GL_GEOMETRY_VERTICES_OUT_EXT, 4); + } + + glLinkProgram(program); + + // check if program linked + GLint success = 0; + glGetProgramiv(program, GL_LINK_STATUS, &success); + + if (!success) { + char temp[1024]; + glGetProgramInfoLog(program, 1024, 0, temp); + fprintf(stderr, "Failed to link program:\n%s\n", 
temp); + glDeleteProgram(program); + program = 0; + exit(EXIT_FAILURE); + } + + return program; } diff --git a/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.h b/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.h index ef4d5962..e4f43a4f 100644 --- a/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.h +++ b/Samples/5_Domain_Specific/smokeParticles/GLSLProgram.h @@ -32,35 +32,40 @@ #include -class GLSLProgram { - public: - // construct program from strings - GLSLProgram(const char *vsource, const char *fsource); - GLSLProgram(const char *vsource, const char *gsource, const char *fsource, - GLenum gsInput = GL_POINTS, GLenum gsOutput = GL_TRIANGLE_STRIP); - ~GLSLProgram(); +class GLSLProgram +{ +public: + // construct program from strings + GLSLProgram(const char *vsource, const char *fsource); + GLSLProgram(const char *vsource, + const char *gsource, + const char *fsource, + GLenum gsInput = GL_POINTS, + GLenum gsOutput = GL_TRIANGLE_STRIP); + ~GLSLProgram(); - void enable(); - void disable(); + void enable(); + void disable(); - void setUniform1f(const GLchar *name, GLfloat x); - void setUniform2f(const GLchar *name, GLfloat x, GLfloat y); - void setUniform3f(const char *name, float x, float y, float z); - void setUniform4f(const char *name, float x, float y, float z, float w); - void setUniformfv(const GLchar *name, GLfloat *v, int elementSize, - int count = 1); - void setUniformMatrix4fv(const GLchar *name, GLfloat *m, bool transpose); + void setUniform1f(const GLchar *name, GLfloat x); + void setUniform2f(const GLchar *name, GLfloat x, GLfloat y); + void setUniform3f(const char *name, float x, float y, float z); + void setUniform4f(const char *name, float x, float y, float z, float w); + void setUniformfv(const GLchar *name, GLfloat *v, int elementSize, int count = 1); + void setUniformMatrix4fv(const GLchar *name, GLfloat *m, bool transpose); - void bindTexture(const char *name, GLuint tex, GLenum target, GLint unit); + void bindTexture(const char *name, GLuint tex, GLenum target, GLint unit); - inline GLuint getProgId() { return mProg; } + inline GLuint getProgId() { return mProg; } - private: - GLuint checkCompileStatus(GLuint shader, GLint *status); - GLuint compileProgram(const char *vsource, const char *gsource, - const char *fsource, GLenum gsInput = GL_POINTS, - GLenum gsOutput = GL_TRIANGLE_STRIP); - GLuint mProg; +private: + GLuint checkCompileStatus(GLuint shader, GLint *status); + GLuint compileProgram(const char *vsource, + const char *gsource, + const char *fsource, + GLenum gsInput = GL_POINTS, + GLenum gsOutput = GL_TRIANGLE_STRIP); + GLuint mProg; }; #endif diff --git a/Samples/5_Domain_Specific/smokeParticles/GpuArray.h b/Samples/5_Domain_Specific/smokeParticles/GpuArray.h index e5ff10d7..496196e5 100644 --- a/Samples/5_Domain_Specific/smokeParticles/GpuArray.h +++ b/Samples/5_Domain_Specific/smokeParticles/GpuArray.h @@ -29,266 +29,258 @@ Class to represent an array in GPU and CPU memory */ -#include #include +#include #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION -#include -#include #include +#include #include +#include -template -class GpuArray { - public: - GpuArray(); - ~GpuArray(); +template class GpuArray +{ +public: + GpuArray(); + ~GpuArray(); - enum Direction { - HOST_TO_DEVICE, - DEVICE_TO_HOST, - }; + enum Direction { + HOST_TO_DEVICE, + DEVICE_TO_HOST, + }; - // allocate and free - void alloc(size_t size, bool vbo = false, bool doubleBuffer = false, - bool useElementArray = false); - void free(); + // allocate and free + void alloc(size_t 
size, bool vbo = false, bool doubleBuffer = false, bool useElementArray = false); + void free(); - // swap buffers for double buffering - void swap(); + // swap buffers for double buffering + void swap(); - // when using vbo, must map before getting device ptr - void map(); - void unmap(); + // when using vbo, must map before getting device ptr + void map(); + void unmap(); - void copy(Direction dir, uint start = 0, uint count = 0); - void memset(T value, uint start = 0, uint count = 0); + void copy(Direction dir, uint start = 0, uint count = 0); + void memset(T value, uint start = 0, uint count = 0); - T *getDevicePtr() { return m_dptr[m_currentRead]; } - GLuint getVbo() { return m_vbo[m_currentRead]; } + T *getDevicePtr() { return m_dptr[m_currentRead]; } + GLuint getVbo() { return m_vbo[m_currentRead]; } - T *getDeviceWritePtr() { return m_dptr[m_currentWrite]; } - GLuint getWriteVbo() { return m_vbo[m_currentWrite]; } + T *getDeviceWritePtr() { return m_dptr[m_currentWrite]; } + GLuint getWriteVbo() { return m_vbo[m_currentWrite]; } - T *getHostPtr() { return m_hptr; } + T *getHostPtr() { return m_hptr; } - size_t getSize() const { return m_size; } + size_t getSize() const { return m_size; } - private: - GLuint createVbo(size_t size, bool useElementArray); +private: + GLuint createVbo(size_t size, bool useElementArray); - void allocDevice(); - void allocVbo(bool useElementArray); - void allocHost(); + void allocDevice(); + void allocVbo(bool useElementArray); + void allocHost(); - void freeDevice(); - void freeVbo(); - void freeHost(); + void freeDevice(); + void freeVbo(); + void freeHost(); - size_t m_size; - T *m_dptr[2]; - GLuint m_vbo[2]; - struct cudaGraphicsResource - *m_cuda_vbo_resource[2]; // handles OpenGL-CUDA exchange + size_t m_size; + T *m_dptr[2]; + GLuint m_vbo[2]; + struct cudaGraphicsResource *m_cuda_vbo_resource[2]; // handles OpenGL-CUDA exchange - T *m_hptr; + T *m_hptr; - bool m_useVBO; - bool m_doubleBuffer; - uint m_currentRead, m_currentWrite; + bool m_useVBO; + bool m_doubleBuffer; + uint m_currentRead, m_currentWrite; }; template GpuArray::GpuArray() - : m_size(0), m_hptr(0), m_currentRead(0), m_currentWrite(0) { - m_dptr[0] = 0; - m_dptr[1] = 0; - - m_vbo[0] = 0; - m_vbo[1] = 0; - - m_cuda_vbo_resource[0] = NULL; - m_cuda_vbo_resource[1] = NULL; -} - -template -GpuArray::~GpuArray() { - free(); -} - -template -void GpuArray::alloc(size_t size, bool vbo, bool doubleBuffer, - bool useElementArray) { - m_size = size; - - m_useVBO = vbo; - m_doubleBuffer = doubleBuffer; - - if (m_doubleBuffer) { - m_currentWrite = 1; - } - - allocHost(); - - if (vbo) { - allocVbo(useElementArray); - } else { - allocDevice(); - } -} - -template -void GpuArray::free() { - freeHost(); - - if (m_vbo) { - freeVbo(); - } - - if (m_dptr) { - freeDevice(); - } -} - -template -void GpuArray::allocHost() { - m_hptr = (T *)new T[m_size]; -} - -template -void GpuArray::freeHost() { - if (m_hptr) { - delete[] m_hptr; - m_hptr = 0; - } -} - -template -void GpuArray::allocDevice() { - checkCudaErrors(cudaMalloc((void **)&m_dptr[0], m_size * sizeof(T))); - - if (m_doubleBuffer) { - checkCudaErrors(cudaMalloc((void **)&m_dptr[1], m_size * sizeof(T))); - } -} - -template -void GpuArray::freeDevice() { - if (m_dptr[0]) { - checkCudaErrors(cudaFree(m_dptr[0])); + : m_size(0) + , m_hptr(0) + , m_currentRead(0) + , m_currentWrite(0) +{ m_dptr[0] = 0; - } - - if (m_dptr[1]) { - checkCudaErrors(cudaFree(m_dptr[1])); m_dptr[1] = 0; - } -} -template -GLuint GpuArray::createVbo(size_t size, bool 
useElementArray) { - GLuint vbo; - glGenBuffers(1, &vbo); - - if (useElementArray) { - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vbo); - glBufferData(GL_ELEMENT_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); - } else { - glBindBuffer(GL_ARRAY_BUFFER, vbo); - glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - return vbo; -} - -template -void GpuArray::allocVbo(bool useElementArray) { - m_vbo[0] = createVbo(m_size * sizeof(T), useElementArray); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &m_cuda_vbo_resource[0], m_vbo[0], cudaGraphicsMapFlagsWriteDiscard)); - - if (m_doubleBuffer) { - m_vbo[1] = createVbo(m_size * sizeof(T), useElementArray); - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &m_cuda_vbo_resource[1], m_vbo[1], cudaGraphicsMapFlagsWriteDiscard)); - } -} - -template -void GpuArray::freeVbo() { - if (m_vbo[0]) { - checkCudaErrors(cudaGraphicsUnregisterResource(m_cuda_vbo_resource[0])); - glDeleteBuffers(1, &m_vbo[0]); m_vbo[0] = 0; - } - - if (m_vbo[1]) { - checkCudaErrors(cudaGraphicsUnregisterResource(m_cuda_vbo_resource[1])); - glDeleteBuffers(1, &m_vbo[1]); m_vbo[1] = 0; - } + + m_cuda_vbo_resource[0] = NULL; + m_cuda_vbo_resource[1] = NULL; } -template -void GpuArray::swap() { - std::swap(m_currentRead, m_currentWrite); +template GpuArray::~GpuArray() { free(); } + +template void GpuArray::alloc(size_t size, bool vbo, bool doubleBuffer, bool useElementArray) +{ + m_size = size; + + m_useVBO = vbo; + m_doubleBuffer = doubleBuffer; + + if (m_doubleBuffer) { + m_currentWrite = 1; + } + + allocHost(); + + if (vbo) { + allocVbo(useElementArray); + } + else { + allocDevice(); + } } -template -void GpuArray::map() { - if (m_vbo[0]) { - checkCudaErrors(cudaGraphicsMapResources(1, &m_cuda_vbo_resource[0], 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&m_dptr[0], &num_bytes, m_cuda_vbo_resource[0])); - } +template void GpuArray::free() +{ + freeHost(); - if (m_doubleBuffer && m_vbo[1]) { - checkCudaErrors(cudaGraphicsMapResources(1, &m_cuda_vbo_resource[1], 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&m_dptr[1], &num_bytes, m_cuda_vbo_resource[1])); - } + if (m_vbo) { + freeVbo(); + } + + if (m_dptr) { + freeDevice(); + } } -template -void GpuArray::unmap() { - if (m_vbo[0]) { - checkCudaErrors(cudaGraphicsUnmapResources(1, &m_cuda_vbo_resource[0], 0)); - m_dptr[0] = 0; - } +template void GpuArray::allocHost() { m_hptr = (T *)new T[m_size]; } - if (m_doubleBuffer && m_vbo[1]) { - checkCudaErrors(cudaGraphicsUnmapResources(1, &m_cuda_vbo_resource[1], 0)); - m_dptr[1] = 0; - } +template void GpuArray::freeHost() +{ + if (m_hptr) { + delete[] m_hptr; + m_hptr = 0; + } } -template -void GpuArray::copy(Direction dir, uint start, uint count) { - if (count == 0) { - count = (uint)m_size; - } +template void GpuArray::allocDevice() +{ + checkCudaErrors(cudaMalloc((void **)&m_dptr[0], m_size * sizeof(T))); - map(); + if (m_doubleBuffer) { + checkCudaErrors(cudaMalloc((void **)&m_dptr[1], m_size * sizeof(T))); + } +} - switch (dir) { +template void GpuArray::freeDevice() +{ + if (m_dptr[0]) { + checkCudaErrors(cudaFree(m_dptr[0])); + m_dptr[0] = 0; + } + + if (m_dptr[1]) { + checkCudaErrors(cudaFree(m_dptr[1])); + m_dptr[1] = 0; + } +} + +template GLuint GpuArray::createVbo(size_t size, bool useElementArray) +{ + GLuint vbo; + glGenBuffers(1, &vbo); + + if (useElementArray) { + 
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vbo); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + } + else { + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); + } + + return vbo; +} + +template void GpuArray::allocVbo(bool useElementArray) +{ + m_vbo[0] = createVbo(m_size * sizeof(T), useElementArray); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_cuda_vbo_resource[0], m_vbo[0], cudaGraphicsMapFlagsWriteDiscard)); + + if (m_doubleBuffer) { + m_vbo[1] = createVbo(m_size * sizeof(T), useElementArray); + checkCudaErrors( + cudaGraphicsGLRegisterBuffer(&m_cuda_vbo_resource[1], m_vbo[1], cudaGraphicsMapFlagsWriteDiscard)); + } +} + +template void GpuArray::freeVbo() +{ + if (m_vbo[0]) { + checkCudaErrors(cudaGraphicsUnregisterResource(m_cuda_vbo_resource[0])); + glDeleteBuffers(1, &m_vbo[0]); + m_vbo[0] = 0; + } + + if (m_vbo[1]) { + checkCudaErrors(cudaGraphicsUnregisterResource(m_cuda_vbo_resource[1])); + glDeleteBuffers(1, &m_vbo[1]); + m_vbo[1] = 0; + } +} + +template void GpuArray::swap() { std::swap(m_currentRead, m_currentWrite); } + +template void GpuArray::map() +{ + if (m_vbo[0]) { + checkCudaErrors(cudaGraphicsMapResources(1, &m_cuda_vbo_resource[0], 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&m_dptr[0], &num_bytes, m_cuda_vbo_resource[0])); + } + + if (m_doubleBuffer && m_vbo[1]) { + checkCudaErrors(cudaGraphicsMapResources(1, &m_cuda_vbo_resource[1], 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&m_dptr[1], &num_bytes, m_cuda_vbo_resource[1])); + } +} + +template void GpuArray::unmap() +{ + if (m_vbo[0]) { + checkCudaErrors(cudaGraphicsUnmapResources(1, &m_cuda_vbo_resource[0], 0)); + m_dptr[0] = 0; + } + + if (m_doubleBuffer && m_vbo[1]) { + checkCudaErrors(cudaGraphicsUnmapResources(1, &m_cuda_vbo_resource[1], 0)); + m_dptr[1] = 0; + } +} + +template void GpuArray::copy(Direction dir, uint start, uint count) +{ + if (count == 0) { + count = (uint)m_size; + } + + map(); + + switch (dir) { case HOST_TO_DEVICE: - checkCudaErrors(cudaMemcpy((void *)(m_dptr[m_currentRead] + start), - (void *)(m_hptr + start), count * sizeof(T), - cudaMemcpyHostToDevice)); - break; + checkCudaErrors(cudaMemcpy((void *)(m_dptr[m_currentRead] + start), + (void *)(m_hptr + start), + count * sizeof(T), + cudaMemcpyHostToDevice)); + break; case DEVICE_TO_HOST: - checkCudaErrors(cudaMemcpy((void *)(m_hptr + start), - (void *)(m_dptr[m_currentRead] + start), - count * sizeof(T), cudaMemcpyDeviceToHost)); - break; - } + checkCudaErrors(cudaMemcpy((void *)(m_hptr + start), + (void *)(m_dptr[m_currentRead] + start), + count * sizeof(T), + cudaMemcpyDeviceToHost)); + break; + } - unmap(); + unmap(); } -template -void GpuArray::memset(T value, uint start, uint count) {} +template void GpuArray::memset(T value, uint start, uint count) {} diff --git a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp index 1caade2e..3952b7b6 100644 --- a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cpp @@ -25,23 +25,22 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include #include -#include -#include #include #include -#include +#include +#include #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION -#include -#include #include - -#include +#include #include +#include +#include -#include "ParticleSystem.h" #include "ParticleSystem.cuh" +#include "ParticleSystem.h" #include "particles_kernel.cuh" #ifndef CUDART_PI_F @@ -53,97 +52,104 @@ */ ParticleSystem::ParticleSystem(uint numParticles, bool bUseVBO, bool bUseGL) - : m_bInitialized(false), - m_bUseVBO(bUseVBO), - m_numParticles(numParticles), - m_particleRadius(0.1f), - m_doDepthSort(false), - m_timer(NULL), - m_time(0.0f) { - m_params.gravity = make_float3(0.0f, 0.0f, 0.0f); - m_params.globalDamping = 1.0f; - m_params.noiseSpeed = make_float3(0.0f, 0.0f, 0.0f); + : m_bInitialized(false) + , m_bUseVBO(bUseVBO) + , m_numParticles(numParticles) + , m_particleRadius(0.1f) + , m_doDepthSort(false) + , m_timer(NULL) + , m_time(0.0f) +{ + m_params.gravity = make_float3(0.0f, 0.0f, 0.0f); + m_params.globalDamping = 1.0f; + m_params.noiseSpeed = make_float3(0.0f, 0.0f, 0.0f); - _initialize(numParticles, bUseGL); + _initialize(numParticles, bUseGL); } -ParticleSystem::~ParticleSystem() { - _free(); - m_numParticles = 0; +ParticleSystem::~ParticleSystem() +{ + _free(); + m_numParticles = 0; } -void ParticleSystem::_initialize(int numParticles, bool bUseGL) { - assert(!m_bInitialized); +void ParticleSystem::_initialize(int numParticles, bool bUseGL) +{ + assert(!m_bInitialized); - createNoiseTexture(64, 64, 64); + createNoiseTexture(64, 64, 64); - m_numParticles = numParticles; + m_numParticles = numParticles; - // allocate GPU arrays - m_pos.alloc(m_numParticles, m_bUseVBO, true); // create as VBO - m_vel.alloc(m_numParticles, m_bUseVBO, true); + // allocate GPU arrays + m_pos.alloc(m_numParticles, m_bUseVBO, true); // create as VBO + m_vel.alloc(m_numParticles, m_bUseVBO, true); - m_sortKeys.alloc(m_numParticles); - m_indices.alloc(m_numParticles, m_bUseVBO, false, - true); // create as index buffer + m_sortKeys.alloc(m_numParticles); + m_indices.alloc(m_numParticles, m_bUseVBO, false, + true); // create as index buffer - sdkCreateTimer(&m_timer); - setParameters(&m_params); + sdkCreateTimer(&m_timer); + setParameters(&m_params); - m_bInitialized = true; + m_bInitialized = true; } void ParticleSystem::_free() { assert(m_bInitialized); } // step the simulation -void ParticleSystem::step(float deltaTime) { - assert(m_bInitialized); +void ParticleSystem::step(float deltaTime) +{ + assert(m_bInitialized); - m_params.time = m_time; - setParameters(&m_params); + m_params.time = m_time; + setParameters(&m_params); - m_pos.map(); - m_vel.map(); + m_pos.map(); + m_vel.map(); - // integrate particles - integrateSystem(m_pos.getDevicePtr(), m_pos.getDeviceWritePtr(), - m_vel.getDevicePtr(), m_vel.getDeviceWritePtr(), deltaTime, - m_numParticles); + // integrate particles + integrateSystem(m_pos.getDevicePtr(), + m_pos.getDeviceWritePtr(), + m_vel.getDevicePtr(), + m_vel.getDeviceWritePtr(), + deltaTime, + m_numParticles); - m_pos.unmap(); - m_vel.unmap(); + m_pos.unmap(); + m_vel.unmap(); - m_pos.swap(); - m_vel.swap(); + m_pos.swap(); + m_vel.swap(); - m_time += deltaTime; + m_time += deltaTime; } // depth sort the particles -void ParticleSystem::depthSort() { - if (!m_doDepthSort) { - return; - } +void ParticleSystem::depthSort() +{ + if (!m_doDepthSort) { + return; + } - m_pos.map(); - m_indices.map(); + m_pos.map(); + m_indices.map(); - // calculate depth - calcDepth(m_pos.getDevicePtr(), m_sortKeys.getDevicePtr(), 
- m_indices.getDevicePtr(), m_sortVector, m_numParticles); + // calculate depth + calcDepth(m_pos.getDevicePtr(), m_sortKeys.getDevicePtr(), m_indices.getDevicePtr(), m_sortVector, m_numParticles); - // radix sort - sortParticles(m_sortKeys.getDevicePtr(), m_indices.getDevicePtr(), - m_numParticles); + // radix sort + sortParticles(m_sortKeys.getDevicePtr(), m_indices.getDevicePtr(), m_numParticles); - m_pos.unmap(); - m_indices.unmap(); + m_pos.unmap(); + m_indices.unmap(); } -uint *ParticleSystem::getSortedIndices() { - // copy sorted indices back to CPU - m_indices.copy(GpuArray::DEVICE_TO_HOST); - return m_indices.getHostPtr(); +uint *ParticleSystem::getSortedIndices() +{ + // copy sorted indices back to CPU + m_indices.copy(GpuArray::DEVICE_TO_HOST); + return m_indices.getHostPtr(); } // random float [0, 1] @@ -156,213 +162,237 @@ inline float sfrand() { return frand() * 2.0f - 1.0f; } inline vec3f svrand() { return vec3f(sfrand(), sfrand(), sfrand()); } // random point in circle -inline vec2f randCircle() { - vec2f r; +inline vec2f randCircle() +{ + vec2f r; - do { - r = vec2f(sfrand(), sfrand()); - } while (length(r) > 1.0f); + do { + r = vec2f(sfrand(), sfrand()); + } while (length(r) > 1.0f); - return r; + return r; } // random point in sphere -inline vec3f randSphere() { - vec3f r; +inline vec3f randSphere() +{ + vec3f r; - do { - r = svrand(); - } while (length(r) > 1.0f); + do { + r = svrand(); + } while (length(r) > 1.0f); - return r; + return r; } // initialize in regular grid -void ParticleSystem::initGrid(vec3f start, uint3 size, vec3f spacing, - float jitter, vec3f vel, uint numParticles, - float lifetime) { - srand(1973); +void ParticleSystem::initGrid(vec3f start, + uint3 size, + vec3f spacing, + float jitter, + vec3f vel, + uint numParticles, + float lifetime) +{ + srand(1973); - float4 *posPtr = m_pos.getHostPtr(); - float4 *velPtr = m_vel.getHostPtr(); + float4 *posPtr = m_pos.getHostPtr(); + float4 *velPtr = m_vel.getHostPtr(); - for (uint z = 0; z < size.z; z++) { - for (uint y = 0; y < size.y; y++) { - for (uint x = 0; x < size.x; x++) { - uint i = (z * size.y * size.x) + (y * size.x) + x; + for (uint z = 0; z < size.z; z++) { + for (uint y = 0; y < size.y; y++) { + for (uint x = 0; x < size.x; x++) { + uint i = (z * size.y * size.x) + (y * size.x) + x; - if (i < numParticles) { - vec3f pos = start + spacing * vec3f((float)x, (float)y, (float)z) + - svrand() * jitter; + if (i < numParticles) { + vec3f pos = start + spacing * vec3f((float)x, (float)y, (float)z) + svrand() * jitter; - posPtr[i] = make_float4(pos.x, pos.y, pos.z, 0.0f); - velPtr[i] = make_float4(vel.x, vel.y, vel.z, lifetime); + posPtr[i] = make_float4(pos.x, pos.y, pos.z, 0.0f); + velPtr[i] = make_float4(vel.x, vel.y, vel.z, lifetime); + } + } } - } } - } } // initialize in random positions within cube -void ParticleSystem::initCubeRandom(vec3f origin, vec3f size, vec3f vel, - float lifetime) { - float4 *posPtr = m_pos.getHostPtr(); - float4 *velPtr = m_vel.getHostPtr(); +void ParticleSystem::initCubeRandom(vec3f origin, vec3f size, vec3f vel, float lifetime) +{ + float4 *posPtr = m_pos.getHostPtr(); + float4 *velPtr = m_vel.getHostPtr(); - for (uint i = 0; i < m_numParticles; i++) { - vec3f pos = origin + svrand() * size; - posPtr[i] = make_float4(pos.x, pos.y, pos.z, 0.0f); - velPtr[i] = make_float4(vel.x, vel.y, vel.z, lifetime); - } + for (uint i = 0; i < m_numParticles; i++) { + vec3f pos = origin + svrand() * size; + posPtr[i] = make_float4(pos.x, pos.y, pos.z, 0.0f); + velPtr[i] = 
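randCircle() and randSphere() above are rejection samplers: draw uniformly from the bounding square/cube and retry until the point lands inside the unit disc/ball. The acceptance rate is the volume ratio, pi/4 (about 0.785) for the disc and pi/6 (about 0.524) for the ball, so the loops finish after roughly 1.3 and 1.9 draws on average. The same idea as a standalone snippet:

    // Standalone rejection sampler matching the helpers above.
    #include <cstdlib>

    struct Vec3 { float x, y, z; };

    static float sfrand() { return rand() / (float)RAND_MAX * 2.0f - 1.0f; } // [-1, 1]

    // Uniform point in the unit ball: sample the cube, keep points inside.
    Vec3 randSphere()
    {
        Vec3 r;
        do {
            r = {sfrand(), sfrand(), sfrand()};
        } while (r.x * r.x + r.y * r.y + r.z * r.z > 1.0f);
        return r;
    }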
make_float4(vel.x, vel.y, vel.z, lifetime); + } } // add sphere on regular grid -void ParticleSystem::addSphere(uint &index, vec3f pos, vec3f vel, int r, - float spacing, float jitter, float lifetime) { - float4 *posPtr = m_pos.getHostPtr(); - float4 *velPtr = m_vel.getHostPtr(); +void ParticleSystem::addSphere(uint &index, vec3f pos, vec3f vel, int r, float spacing, float jitter, float lifetime) +{ + float4 *posPtr = m_pos.getHostPtr(); + float4 *velPtr = m_vel.getHostPtr(); - uint start = index; - uint count = 0; + uint start = index; + uint count = 0; - for (int z = -r; z <= r; z++) { - for (int y = -r; y <= r; y++) { - for (int x = -r; x <= r; x++) { - vec3f delta = vec3f((float)x, (float)y, (float)z) * spacing; - float dist = length(delta); + for (int z = -r; z <= r; z++) { + for (int y = -r; y <= r; y++) { + for (int x = -r; x <= r; x++) { + vec3f delta = vec3f((float)x, (float)y, (float)z) * spacing; + float dist = length(delta); - if ((dist <= spacing * r) && (index < m_numParticles)) { - // vec3f p = pos + delta + svrand()*jitter; + if ((dist <= spacing * r) && (index < m_numParticles)) { + // vec3f p = pos + delta + svrand()*jitter; - posPtr[index] = make_float4(pos.x, pos.y, pos.z, 0.0f); - velPtr[index] = make_float4(vel.x, vel.y, vel.z, lifetime); + posPtr[index] = make_float4(pos.x, pos.y, pos.z, 0.0f); + velPtr[index] = make_float4(vel.x, vel.y, vel.z, lifetime); - index++; - count++; + index++; + count++; + } + } } - } } - } - m_pos.copy(GpuArray::HOST_TO_DEVICE, start, count); - m_vel.copy(GpuArray::HOST_TO_DEVICE, start, count); + m_pos.copy(GpuArray::HOST_TO_DEVICE, start, count); + m_vel.copy(GpuArray::HOST_TO_DEVICE, start, count); } -void ParticleSystem::reset(ParticleConfig config) { - switch (config) { +void ParticleSystem::reset(ParticleConfig config) +{ + switch (config) { default: case CONFIG_RANDOM: - initCubeRandom(vec3f(0.0, 1.0, 0.0), vec3f(1.0, 1.0, 1.0), vec3f(0.0f), - 100.0); - break; + initCubeRandom(vec3f(0.0, 1.0, 0.0), vec3f(1.0, 1.0, 1.0), vec3f(0.0f), 100.0); + break; case CONFIG_GRID: { - float jitter = m_particleRadius * 0.01f; - uint s = (int)ceilf(powf((float)m_numParticles, 1.0f / 3.0f)); - uint gridSize[3]; - gridSize[0] = gridSize[1] = gridSize[2] = s; - initGrid(vec3f(-1.0, 0.0, -1.0), make_uint3(s, s, s), - vec3f(m_particleRadius * 2.0f), jitter, vec3f(0.0), - m_numParticles, 100.0); + float jitter = m_particleRadius * 0.01f; + uint s = (int)ceilf(powf((float)m_numParticles, 1.0f / 3.0f)); + uint gridSize[3]; + gridSize[0] = gridSize[1] = gridSize[2] = s; + initGrid(vec3f(-1.0, 0.0, -1.0), + make_uint3(s, s, s), + vec3f(m_particleRadius * 2.0f), + jitter, + vec3f(0.0), + m_numParticles, + 100.0); } break; - } + } - m_pos.copy(GpuArray::HOST_TO_DEVICE); - m_vel.copy(GpuArray::HOST_TO_DEVICE); + m_pos.copy(GpuArray::HOST_TO_DEVICE); + m_vel.copy(GpuArray::HOST_TO_DEVICE); } // particle emitters -void ParticleSystem::discEmitter(uint &index, vec3f pos, vec3f vel, vec3f vx, - vec3f vy, float r, int n, float lifetime, - float lifetimeVariance) { - float4 *posPtr = m_pos.getHostPtr(); - float4 *velPtr = m_vel.getHostPtr(); +void ParticleSystem::discEmitter(uint &index, + vec3f pos, + vec3f vel, + vec3f vx, + vec3f vy, + float r, + int n, + float lifetime, + float lifetimeVariance) +{ + float4 *posPtr = m_pos.getHostPtr(); + float4 *velPtr = m_vel.getHostPtr(); - uint start = index; - uint count = 0; + uint start = index; + uint count = 0; - for (int i = 0; i < n; i++) { - vec2f delta = randCircle() * r; + for (int i = 0; i < n; i++) { + vec2f 
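In the CONFIG_GRID branch above, the grid edge is the smallest integer cube root that holds every particle: s = ceil(N^(1/3)), so s^3 >= N, and the i < numParticles guard inside initGrid() simply skips the unused tail of the cube. For example, N = 65536 gives s = 41 (40^3 = 64000 < 65536 <= 41^3 = 68921). In isolation:

    // Edge length of the smallest cubic grid with at least numParticles slots;
    // mirrors the sample's ceilf(powf(N, 1/3)), computed in double for safety.
    #include <cmath>

    unsigned gridEdge(unsigned numParticles)
    {
        return (unsigned)std::ceil(std::pow((double)numParticles, 1.0 / 3.0));
    }

    // gridEdge(65536) == 41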
delta = randCircle() * r; - if (index < m_numParticles) { - vec3f p = pos + delta.x * vx + delta.y * vy; - float lt = lifetime + frand() * lifetimeVariance; + if (index < m_numParticles) { + vec3f p = pos + delta.x * vx + delta.y * vy; + float lt = lifetime + frand() * lifetimeVariance; - posPtr[index] = make_float4(p.x, p.y, p.z, 0.0f); - velPtr[index] = make_float4(vel.x, vel.y, vel.z, lt); + posPtr[index] = make_float4(p.x, p.y, p.z, 0.0f); + velPtr[index] = make_float4(vel.x, vel.y, vel.z, lt); - index++; - count++; + index++; + count++; + } } - } - m_pos.copy(GpuArray::HOST_TO_DEVICE, start, count); - m_vel.copy(GpuArray::HOST_TO_DEVICE, start, count); + m_pos.copy(GpuArray::HOST_TO_DEVICE, start, count); + m_vel.copy(GpuArray::HOST_TO_DEVICE, start, count); } -void ParticleSystem::sphereEmitter(uint &index, vec3f pos, vec3f vel, - vec3f spread, float r, int n, float lifetime, - float lifetimeVariance) { - float4 *posPtr = m_pos.getHostPtr(); - float4 *velPtr = m_vel.getHostPtr(); +void ParticleSystem::sphereEmitter(uint &index, + vec3f pos, + vec3f vel, + vec3f spread, + float r, + int n, + float lifetime, + float lifetimeVariance) +{ + float4 *posPtr = m_pos.getHostPtr(); + float4 *velPtr = m_vel.getHostPtr(); - uint start = index; - uint count = 0; + uint start = index; + uint count = 0; - for (int i = 0; i < n; i++) { - vec3f x = randSphere(); + for (int i = 0; i < n; i++) { + vec3f x = randSphere(); - // float dist = length(x); - if (index < m_numParticles) { - vec3f p = pos + x * r; - float age = 0.0; + // float dist = length(x); + if (index < m_numParticles) { + vec3f p = pos + x * r; + float age = 0.0; - float lt = lifetime + frand() * lifetimeVariance; + float lt = lifetime + frand() * lifetimeVariance; - vec3f dir = randSphere(); - dir.y = fabs(dir.y); - vec3f v = vel + dir * spread; + vec3f dir = randSphere(); + dir.y = fabs(dir.y); + vec3f v = vel + dir * spread; - posPtr[index] = make_float4(p.x, p.y, p.z, age); - velPtr[index] = make_float4(v.x, v.y, v.z, lt); + posPtr[index] = make_float4(p.x, p.y, p.z, age); + velPtr[index] = make_float4(v.x, v.y, v.z, lt); - index++; - count++; + index++; + count++; + } } - } - m_pos.copy(GpuArray::HOST_TO_DEVICE, start, count); - m_vel.copy(GpuArray::HOST_TO_DEVICE, start, count); + m_pos.copy(GpuArray::HOST_TO_DEVICE, start, count); + m_vel.copy(GpuArray::HOST_TO_DEVICE, start, count); } -void ParticleSystem::setModelView(float *m) { - for (int i = 0; i < 16; i++) { - m_modelView.m[i] = m[i]; - } +void ParticleSystem::setModelView(float *m) +{ + for (int i = 0; i < 16; i++) { + m_modelView.m[i] = m[i]; + } } // dump particles to stdout for debugging -void ParticleSystem::dumpParticles(uint start, uint count) { - m_pos.copy(GpuArray::DEVICE_TO_HOST); - float4 *pos = m_pos.getHostPtr(); +void ParticleSystem::dumpParticles(uint start, uint count) +{ + m_pos.copy(GpuArray::DEVICE_TO_HOST); + float4 *pos = m_pos.getHostPtr(); - m_vel.copy(GpuArray::DEVICE_TO_HOST); - float4 *vel = m_vel.getHostPtr(); + m_vel.copy(GpuArray::DEVICE_TO_HOST); + float4 *vel = m_vel.getHostPtr(); - for (uint i = start; i < start + count; i++) { - printf("%d: ", i); - printf("pos: (%.4f, %.4f, %.4f, %.4f)\n", pos[i].x, pos[i].y, pos[i].z, - pos[i].w); - printf("vel: (%.4f, %.4f, %.4f, %.4f)\n", vel[i].x, vel[i].y, vel[i].z, - vel[i].w); - } + for (uint i = start; i < start + count; i++) { + printf("%d: ", i); + printf("pos: (%.4f, %.4f, %.4f, %.4f)\n", pos[i].x, pos[i].y, pos[i].z, pos[i].w); + printf("vel: (%.4f, %.4f, %.4f, %.4f)\n", vel[i].x, vel[i].y, 
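Both emitters above write new particles into the host arrays starting at `index`, then upload only that range with copy(HOST_TO_DEVICE, start, count) rather than re-sending the whole pool. Because `index` is passed by reference, successive emitter calls append; the caller is expected to manage the cursor. A hedged sketch of one driving pattern (the wrap-around policy below is an assumption for illustration, not something this hunk shows):

    // Hypothetical emitter driver: emit a burst per frame into a fixed pool,
    // wrapping the cursor at the end so the oldest slots are recycled (assumed).
    #include "ParticleSystem.h"

    void emitBurst(ParticleSystem &psystem, uint &cursor, int perFrame)
    {
        if (cursor + (uint)perFrame > psystem.getNumParticles())
            cursor = 0; // assumed recycling policy

        psystem.sphereEmitter(cursor,
                              vec3f(0.0f, 0.0f, 0.0f), // position
                              vec3f(0.0f, 1.0f, 0.0f), // base velocity
                              vec3f(0.1f),             // spread
                              0.1f,                    // emitter radius
                              perFrame,
                              10.0f,                   // lifetime
                              2.0f);                   // lifetime variance
    }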
vel[i].z, vel[i].w); + } } // dump particles to a system memory host -void ParticleSystem::dumpBin(float4 **posData, float4 **velData) { - m_pos.copy(GpuArray::DEVICE_TO_HOST); - *posData = m_pos.getHostPtr(); +void ParticleSystem::dumpBin(float4 **posData, float4 **velData) +{ + m_pos.copy(GpuArray::DEVICE_TO_HOST); + *posData = m_pos.getHostPtr(); - m_vel.copy(GpuArray::DEVICE_TO_HOST); - *velData = m_vel.getHostPtr(); + m_vel.copy(GpuArray::DEVICE_TO_HOST); + *velData = m_vel.getHostPtr(); } diff --git a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cuh b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cuh index 67ee2cc7..03499635 100644 --- a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cuh +++ b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.cuh @@ -27,18 +27,20 @@ #include "particles_kernel.cuh" -extern "C" { -void initCuda(bool bUseGL); -void setParameters(SimParams *hostParams); -void createNoiseTexture(int w, int h, int d); +extern "C" +{ + void initCuda(bool bUseGL); + void setParameters(SimParams *hostParams); + void createNoiseTexture(int w, int h, int d); -void integrateSystem(float4 *oldPos, float4 *newPos, float4 *oldVel, - float4 *newVel, float deltaTime, int numParticles); + void + integrateSystem(float4 *oldPos, float4 *newPos, float4 *oldVel, float4 *newVel, float deltaTime, int numParticles); -void calcDepth(float4 *pos, - float *keys, // output - uint *indices, // output - float3 sortVector, int numParticles); + void calcDepth(float4 *pos, + float *keys, // output + uint *indices, // output + float3 sortVector, + int numParticles); -void sortParticles(float *sortKeys, uint *indices, uint numParticles); + void sortParticles(float *sortKeys, uint *indices, uint numParticles); } diff --git a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.h b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.h index 04d1db93..dc7137dd 100644 --- a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.h +++ b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem.h @@ -29,84 +29,102 @@ #define __PARTICLESYSTEM_H__ #include -#include "particles_kernel.cuh" -#include "vector_functions.h" + #include "GpuArray.h" #include "nvMath.h" +#include "particles_kernel.cuh" +#include "vector_functions.h" using namespace nv; // CUDA BodySystem: runs on the GPU -class ParticleSystem { - public: - ParticleSystem(uint numParticles, bool bUseVBO = true, bool bUseGL = true); - ~ParticleSystem(); +class ParticleSystem +{ +public: + ParticleSystem(uint numParticles, bool bUseVBO = true, bool bUseGL = true); + ~ParticleSystem(); - enum ParticleConfig { CONFIG_RANDOM, CONFIG_GRID, _NUM_CONFIGS }; + enum ParticleConfig { CONFIG_RANDOM, CONFIG_GRID, _NUM_CONFIGS }; - void step(float deltaTime); - void depthSort(); - void reset(ParticleConfig config); + void step(float deltaTime); + void depthSort(); + void reset(ParticleConfig config); - uint getNumParticles() { return m_numParticles; } + uint getNumParticles() { return m_numParticles; } - uint getPosBuffer() { return m_pos.getVbo(); } - uint getVelBuffer() { return m_vel.getVbo(); } - uint getColorBuffer() { return 0; } - uint getSortedIndexBuffer() { return m_indices.getVbo(); } - uint *getSortedIndices(); + uint getPosBuffer() { return m_pos.getVbo(); } + uint getVelBuffer() { return m_vel.getVbo(); } + uint getColorBuffer() { return 0; } + uint getSortedIndexBuffer() { return m_indices.getVbo(); } + uint *getSortedIndices(); - float getParticleRadius() { return m_particleRadius; } + float 
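ParticleSystem.cuh above wraps every CUDA entry point in extern "C". That gives the functions C linkage, so host-side C++ translation units and the nvcc-compiled .cu file agree on symbol names without sharing C++ name-mangling rules. The pattern in miniature (file and function names are illustrative):

    // kernels.cu -- compiled by nvcc
    __global__ void scaleKernel(float *data, int n, float s)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            data[i] *= s;
    }

    extern "C" void scaleArray(float *d_data, int n, float s)
    {
        scaleKernel<<<(n + 255) / 256, 256>>>(d_data, n, s);
    }

    // host.cpp -- plain C++ TU, no CUDA headers needed; it just declares
    // the wrapper with matching C linkage:
    //   extern "C" void scaleArray(float *d_data, int n, float s);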
getParticleRadius() { return m_particleRadius; } - SimParams &getParams() { return m_params; } + SimParams &getParams() { return m_params; } - void setSorting(bool x) { m_doDepthSort = x; } - void setModelView(float *m); - void setSortVector(float3 v) { m_sortVector = v; } + void setSorting(bool x) { m_doDepthSort = x; } + void setModelView(float *m); + void setSortVector(float3 v) { m_sortVector = v; } - void addSphere(uint &index, vec3f pos, vec3f vel, int r, float spacing, - float jitter, float lifetime); - void discEmitter(uint &index, vec3f pos, vec3f vel, vec3f vx, vec3f vy, - float r, int n, float lifetime, float lifetimeVariance); - void sphereEmitter(uint &index, vec3f pos, vec3f vel, vec3f spread, float r, - int n, float lifetime, float lifetimeVariance); + void addSphere(uint &index, vec3f pos, vec3f vel, int r, float spacing, float jitter, float lifetime); + void discEmitter(uint &index, + vec3f pos, + vec3f vel, + vec3f vx, + vec3f vy, + float r, + int n, + float lifetime, + float lifetimeVariance); + void sphereEmitter(uint &index, + vec3f pos, + vec3f vel, + vec3f spread, + float r, + int n, + float lifetime, + float lifetimeVariance); - void dumpParticles(uint start, uint count); - void dumpBin(float4 **posData, float4 **velData); + void dumpParticles(uint start, uint count); + void dumpBin(float4 **posData, float4 **velData); - protected: // methods - ParticleSystem() {} +protected: // methods + ParticleSystem() {} - void _initialize(int numParticlesm, bool bUseGL = true); - void _free(); + void _initialize(int numParticlesm, bool bUseGL = true); + void _free(); - void initGrid(vec3f start, uint3 size, vec3f spacing, float jitter, vec3f vel, - uint numParticles, float lifetime = 100.0f); - void initCubeRandom(vec3f origin, vec3f size, vec3f vel, - float lifetime = 100.0f); + void initGrid(vec3f start, + uint3 size, + vec3f spacing, + float jitter, + vec3f vel, + uint numParticles, + float lifetime = 100.0f); + void initCubeRandom(vec3f origin, vec3f size, vec3f vel, float lifetime = 100.0f); - protected: // data - bool m_bInitialized; - bool m_bUseVBO; - uint m_numParticles; +protected: // data + bool m_bInitialized; + bool m_bUseVBO; + uint m_numParticles; - float m_particleRadius; + float m_particleRadius; - GpuArray m_pos; - GpuArray m_vel; + GpuArray m_pos; + GpuArray m_vel; - // params - SimParams m_params; + // params + SimParams m_params; - float4x4 m_modelView; - float3 m_sortVector; - bool m_doDepthSort; + float4x4 m_modelView; + float3 m_sortVector; + bool m_doDepthSort; - GpuArray m_sortKeys; - GpuArray m_indices; // sorted indices for rendering + GpuArray m_sortKeys; + GpuArray m_indices; // sorted indices for rendering - StopWatchInterface *m_timer; - float m_time; + StopWatchInterface *m_timer; + float m_time; }; -#endif // __PARTICLESYSTEM_H__ +#endif // __PARTICLESYSTEM_H__ diff --git a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu index fc895653..09838164 100644 --- a/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu +++ b/Samples/5_Domain_Specific/smokeParticles/ParticleSystem_cuda.cu @@ -29,131 +29,128 @@ This file contains simple wrapper functions that call the CUDA kernels */ #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION -#include -#include -#include #include -#include +#include #include +#include +#include +#include +#include "ParticleSystem.cuh" +#include "particles_kernel_device.cuh" #include "thrust/device_ptr.h" #include "thrust/for_each.h" #include 
"thrust/iterator/zip_iterator.h" #include "thrust/sort.h" -#include "particles_kernel_device.cuh" -#include "ParticleSystem.cuh" - extern "C" { - cudaArray *noiseArray; + cudaArray *noiseArray; - void setParameters(SimParams *hostParams) - { - // copy parameters to constant memory - checkCudaErrors(cudaMemcpyToSymbol(cudaParams, hostParams, sizeof(SimParams))); - } - - // Round a / b to nearest higher integer value - int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } - - // compute grid and thread block size for a given number of elements - void computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads) - { - numThreads = min(blockSize, n); - numBlocks = iDivUp(n, numThreads); - } - - inline float frand() { return rand() / (float)RAND_MAX; } - - // create 3D texture containing random values - void createNoiseTexture(int w, int h, int d) - { - cudaExtent size = make_cudaExtent(w, h, d); - size_t elements = size.width * size.height * size.depth; - - float *volumeData = (float *)malloc(elements * 4 * sizeof(float)); - float *ptr = volumeData; - - for (size_t i = 0; i < elements; i++) + void setParameters(SimParams *hostParams) { - *ptr++ = frand() * 2.0f - 1.0f; - *ptr++ = frand() * 2.0f - 1.0f; - *ptr++ = frand() * 2.0f - 1.0f; - *ptr++ = frand() * 2.0f - 1.0f; + // copy parameters to constant memory + checkCudaErrors(cudaMemcpyToSymbol(cudaParams, hostParams, sizeof(SimParams))); } - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - checkCudaErrors(cudaMalloc3DArray(&noiseArray, &channelDesc, size)); + // Round a / b to nearest higher integer value + int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } - cudaMemcpy3DParms copyParams = {0}; - copyParams.srcPtr = make_cudaPitchedPtr( - (void *)volumeData, size.width * sizeof(float4), size.width, size.height); - copyParams.dstArray = noiseArray; - copyParams.extent = size; - copyParams.kind = cudaMemcpyHostToDevice; - checkCudaErrors(cudaMemcpy3D(©Params)); + // compute grid and thread block size for a given number of elements + void computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads) + { + numThreads = min(blockSize, n); + numBlocks = iDivUp(n, numThreads); + } - free(volumeData); + inline float frand() { return rand() / (float)RAND_MAX; } - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + // create 3D texture containing random values + void createNoiseTexture(int w, int h, int d) + { + cudaExtent size = make_cudaExtent(w, h, d); + size_t elements = size.width * size.height * size.depth; - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = noiseArray; + float *volumeData = (float *)malloc(elements * 4 * sizeof(float)); + float *ptr = volumeData; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + for (size_t i = 0; i < elements; i++) { + *ptr++ = frand() * 2.0f - 1.0f; + *ptr++ = frand() * 2.0f - 1.0f; + *ptr++ = frand() * 2.0f - 1.0f; + *ptr++ = frand() * 2.0f - 1.0f; + } - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.addressMode[2] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + checkCudaErrors(cudaMalloc3DArray(&noiseArray, &channelDesc, size)); - checkCudaErrors(cudaCreateTextureObject(&noiseTex, &texRes, &texDescr, NULL)); - } + cudaMemcpy3DParms copyParams 
= {0}; + copyParams.srcPtr = + make_cudaPitchedPtr((void *)volumeData, size.width * sizeof(float4), size.width, size.height); + copyParams.dstArray = noiseArray; + copyParams.extent = size; + copyParams.kind = cudaMemcpyHostToDevice; + checkCudaErrors(cudaMemcpy3D(©Params)); - void integrateSystem(float4 *oldPos, float4 *newPos, float4 *oldVel, - float4 *newVel, float deltaTime, int numParticles) - { - thrust::device_ptr d_newPos(newPos); - thrust::device_ptr d_newVel(newVel); - thrust::device_ptr d_oldPos(oldPos); - thrust::device_ptr d_oldVel(oldVel); + free(volumeData); - thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple( - d_newPos, d_newVel, d_oldPos, d_oldVel)), - thrust::make_zip_iterator(thrust::make_tuple( - d_newPos + numParticles, d_newVel + numParticles, - d_oldPos + numParticles, d_oldVel + numParticles)), - integrate_functor(deltaTime, noiseTex)); - } + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - void calcDepth(float4 *pos, - float *keys, // output - uint *indices, // output - float3 sortVector, int numParticles) - { - thrust::device_ptr d_pos(pos); - thrust::device_ptr d_keys(keys); - thrust::device_ptr d_indices(indices); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = noiseArray; - thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(d_pos, d_keys)), - thrust::make_zip_iterator(thrust::make_tuple( - d_pos + numParticles, d_keys + numParticles)), - calcDepth_functor(sortVector)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - thrust::sequence(d_indices, d_indices + numParticles); - } + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.addressMode[2] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - void sortParticles(float *sortKeys, uint *indices, uint numParticles) - { - thrust::sort_by_key(thrust::device_ptr(sortKeys), - thrust::device_ptr(sortKeys + numParticles), - thrust::device_ptr(indices)); - } + checkCudaErrors(cudaCreateTextureObject(&noiseTex, &texRes, &texDescr, NULL)); + } + + void + integrateSystem(float4 *oldPos, float4 *newPos, float4 *oldVel, float4 *newVel, float deltaTime, int numParticles) + { + thrust::device_ptr d_newPos(newPos); + thrust::device_ptr d_newVel(newVel); + thrust::device_ptr d_oldPos(oldPos); + thrust::device_ptr d_oldVel(oldVel); + + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(d_newPos, d_newVel, d_oldPos, d_oldVel)), + thrust::make_zip_iterator(thrust::make_tuple( + d_newPos + numParticles, d_newVel + numParticles, d_oldPos + numParticles, d_oldVel + numParticles)), + integrate_functor(deltaTime, noiseTex)); + } + + void calcDepth(float4 *pos, + float *keys, // output + uint *indices, // output + float3 sortVector, + int numParticles) + { + thrust::device_ptr d_pos(pos); + thrust::device_ptr d_keys(keys); + thrust::device_ptr d_indices(indices); + + thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(d_pos, d_keys)), + thrust::make_zip_iterator(thrust::make_tuple(d_pos + numParticles, d_keys + numParticles)), + calcDepth_functor(sortVector)); + + thrust::sequence(d_indices, d_indices + numParticles); + } + + void sortParticles(float *sortKeys, uint *indices, uint numParticles) + { + thrust::sort_by_key(thrust::device_ptr(sortKeys), + thrust::device_ptr(sortKeys + numParticles), + thrust::device_ptr(indices)); + } } // extern "C" diff --git 
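The wrappers above do all three GPU passes with Thrust instead of hand-written kernels: a zip_iterator fuses the position/velocity streams for for_each, sequence() writes the identity index, and sort_by_key reorders the indices by depth. The key-sort half, self-contained:

    // Depth-sort sketch: generate identity indices, then sort them by float keys.
    #include <thrust/device_vector.h>
    #include <thrust/sequence.h>
    #include <thrust/sort.h>

    void depthSortIndices(thrust::device_vector<float> &keys,
                          thrust::device_vector<unsigned int> &indices)
    {
        thrust::sequence(indices.begin(), indices.end());  // 0, 1, 2, ...
        thrust::sort_by_key(keys.begin(), keys.end(),      // sort keys on the device
                            indices.begin());              // indices permuted alongside
    }

    // The sorted indices then serve as the element array for back-to-front rendering.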
a/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.cpp b/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.cpp index 6d43bfb5..c025e043 100644 --- a/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.cpp @@ -33,6 +33,7 @@ #include #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION #include + #include "SmokeRenderer.h" #include "SmokeShaders.h" @@ -43,362 +44,222 @@ #include #endif -#define USE_MBLUR 1 +#define USE_MBLUR 1 #define COLOR_ATTENUATION 1 SmokeRenderer::SmokeRenderer(int maxParticles) - : mMaxParticles(maxParticles), - mNumParticles(0), - mPosVbo(0), - mVelVbo(0), - mColorVbo(0), - mIndexBuffer(0), - mParticleRadius(0.005f), - mDisplayMode(VOLUMETRIC), - mWindowW(800), - mWindowH(600), - mFov(40.0f), - m_downSample(2), - m_numSlices(32), - m_numDisplayedSlices(32), - m_sliceNo(0), - m_shadowAlpha(0.005f), - m_spriteAlpha(0.1f), - m_doBlur(false), - m_blurRadius(1.0f), - m_displayLightBuffer(false), - m_lightPos(5.0f, 5.0f, -5.0f), - m_lightTarget(0.0f, 0.0f, 0.0f), - m_lightColor(1.0f, 1.0f, 0.5f), - m_colorAttenuation(0.1f, 0.2f, 0.3f), - m_lightBufferSize(256), - m_srcLightTexture(0), - m_lightDepthTexture(0), - m_lightFbo(0), - m_imageTex(0), - m_depthTex(0), - m_imageFbo(0) { - // load shader programs - m_simpleProg = new GLSLProgram(particleVS, simplePS); - m_particleProg = new GLSLProgram(mblurVS, mblurGS, particlePS); - m_particleShadowProg = new GLSLProgram(mblurVS, mblurGS, particleShadowPS); + : mMaxParticles(maxParticles) + , mNumParticles(0) + , mPosVbo(0) + , mVelVbo(0) + , mColorVbo(0) + , mIndexBuffer(0) + , mParticleRadius(0.005f) + , mDisplayMode(VOLUMETRIC) + , mWindowW(800) + , mWindowH(600) + , mFov(40.0f) + , m_downSample(2) + , m_numSlices(32) + , m_numDisplayedSlices(32) + , m_sliceNo(0) + , m_shadowAlpha(0.005f) + , m_spriteAlpha(0.1f) + , m_doBlur(false) + , m_blurRadius(1.0f) + , m_displayLightBuffer(false) + , m_lightPos(5.0f, 5.0f, -5.0f) + , m_lightTarget(0.0f, 0.0f, 0.0f) + , m_lightColor(1.0f, 1.0f, 0.5f) + , m_colorAttenuation(0.1f, 0.2f, 0.3f) + , m_lightBufferSize(256) + , m_srcLightTexture(0) + , m_lightDepthTexture(0) + , m_lightFbo(0) + , m_imageTex(0) + , m_depthTex(0) + , m_imageFbo(0) +{ + // load shader programs + m_simpleProg = new GLSLProgram(particleVS, simplePS); + m_particleProg = new GLSLProgram(mblurVS, mblurGS, particlePS); + m_particleShadowProg = new GLSLProgram(mblurVS, mblurGS, particleShadowPS); - m_blurProg = new GLSLProgram(passThruVS, blurPS); - m_displayTexProg = new GLSLProgram(passThruVS, texture2DPS); + m_blurProg = new GLSLProgram(passThruVS, blurPS); + m_displayTexProg = new GLSLProgram(passThruVS, texture2DPS); - // create buffer for light shadows - createLightBuffer(); + // create buffer for light shadows + createLightBuffer(); - glutReportErrors(); + glutReportErrors(); } -SmokeRenderer::~SmokeRenderer() { - delete m_particleProg; - delete m_particleShadowProg; - delete m_blurProg; - delete m_displayTexProg; - delete m_simpleProg; +SmokeRenderer::~SmokeRenderer() +{ + delete m_particleProg; + delete m_particleShadowProg; + delete m_blurProg; + delete m_displayTexProg; + delete m_simpleProg; - delete m_lightFbo; - glDeleteTextures(2, m_lightTexture); - glDeleteTextures(1, &m_lightDepthTexture); + delete m_lightFbo; + glDeleteTextures(2, m_lightTexture); + glDeleteTextures(1, &m_lightDepthTexture); - delete m_imageFbo; - glDeleteTextures(1, &m_imageTex); - glDeleteTextures(1, &m_depthTex); + delete m_imageFbo; + glDeleteTextures(1, 
&m_imageTex); + glDeleteTextures(1, &m_depthTex); } // draw points from vertex buffer objects -void SmokeRenderer::drawPoints(int start, int count, bool sort) { - glBindBuffer(GL_ARRAY_BUFFER, mPosVbo); - glVertexPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_VERTEX_ARRAY); +void SmokeRenderer::drawPoints(int start, int count, bool sort) +{ + glBindBuffer(GL_ARRAY_BUFFER, mPosVbo); + glVertexPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_VERTEX_ARRAY); - if (mColorVbo) { - glBindBuffer(GL_ARRAY_BUFFER, mColorVbo); - glColorPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_COLOR_ARRAY); - } + if (mColorVbo) { + glBindBuffer(GL_ARRAY_BUFFER, mColorVbo); + glColorPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_COLOR_ARRAY); + } + + if (mVelVbo) { + glBindBuffer(GL_ARRAY_BUFFER, mVelVbo); + glClientActiveTexture(GL_TEXTURE0); + glTexCoordPointer(4, GL_FLOAT, 0, 0); + glEnableClientState(GL_TEXTURE_COORD_ARRAY); + } + + if (sort) { + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, mIndexBuffer); + glDrawElements(GL_POINTS, count, GL_UNSIGNED_INT, (void *)(start * sizeof(unsigned int))); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, 0); + } + else { + glDrawArrays(GL_POINTS, start, count); + } + + glDisableClientState(GL_VERTEX_ARRAY); + glDisableClientState(GL_COLOR_ARRAY); - if (mVelVbo) { - glBindBuffer(GL_ARRAY_BUFFER, mVelVbo); glClientActiveTexture(GL_TEXTURE0); - glTexCoordPointer(4, GL_FLOAT, 0, 0); - glEnableClientState(GL_TEXTURE_COORD_ARRAY); - } - - if (sort) { - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, mIndexBuffer); - glDrawElements(GL_POINTS, count, GL_UNSIGNED_INT, - (void *)(start * sizeof(unsigned int))); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, 0); - } else { - glDrawArrays(GL_POINTS, start, count); - } - - glDisableClientState(GL_VERTEX_ARRAY); - glDisableClientState(GL_COLOR_ARRAY); - - glClientActiveTexture(GL_TEXTURE0); - glDisableClientState(GL_TEXTURE_COORD_ARRAY); + glDisableClientState(GL_TEXTURE_COORD_ARRAY); } // draw points using given shader program -void SmokeRenderer::drawPointSprites(GLSLProgram *prog, int start, int count, - bool shadowed) { - glEnable(GL_DEPTH_TEST); - glDepthMask(GL_FALSE); // don't write depth - glEnable(GL_BLEND); +void SmokeRenderer::drawPointSprites(GLSLProgram *prog, int start, int count, bool shadowed) +{ + glEnable(GL_DEPTH_TEST); + glDepthMask(GL_FALSE); // don't write depth + glEnable(GL_BLEND); - prog->enable(); - prog->setUniform1f("pointRadius", mParticleRadius); + prog->enable(); + prog->setUniform1f("pointRadius", mParticleRadius); - if (shadowed) { - prog->bindTexture("shadowTex", m_lightTexture[m_srcLightTexture], - GL_TEXTURE_2D, 0); - } + if (shadowed) { + prog->bindTexture("shadowTex", m_lightTexture[m_srcLightTexture], GL_TEXTURE_2D, 0); + } - // draw points - drawPoints(start, count, true); + // draw points + drawPoints(start, count, true); - prog->disable(); + prog->disable(); - glDepthMask(GL_TRUE); - glDisable(GL_BLEND); + glDepthMask(GL_TRUE); + glDisable(GL_BLEND); } // calculate vectors for half-angle slice rendering -void SmokeRenderer::calcVectors() { - // get model view matrix - glGetFloatv(GL_MODELVIEW_MATRIX, (float *)m_modelView.get_value()); +void SmokeRenderer::calcVectors() +{ + // get model view matrix + glGetFloatv(GL_MODELVIEW_MATRIX, (float *)m_modelView.get_value()); - // calculate eye space light vector - m_lightVector = normalize(m_lightPos); - m_lightPosEye = m_modelView * vec4f(m_lightPos, 1.0); + // calculate eye space light vector + m_lightVector = normalize(m_lightPos); + 
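drawPoints() above never reorders the particle VBOs themselves: when sorting is enabled, it binds the depth-sorted index buffer and issues an indexed draw, so the heavy position/velocity data stays in place and only the 4-byte indices move each frame. The core of that path, assuming a current GL context and the legacy fixed-function setup this sample uses (the GLEW include stands in for the sample's GL helper header):

    // Indexed point draw over a depth-sorted element buffer (legacy GL path).
    #include <GL/glew.h>

    void drawSorted(GLuint posVbo, GLuint indexBuffer, int start, int count)
    {
        glBindBuffer(GL_ARRAY_BUFFER, posVbo);
        glVertexPointer(4, GL_FLOAT, 0, 0);
        glEnableClientState(GL_VERTEX_ARRAY);

        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, indexBuffer);
        glDrawElements(GL_POINTS, count, GL_UNSIGNED_INT,
                       (void *)(start * sizeof(unsigned int))); // byte offset into indices
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);

        glDisableClientState(GL_VERTEX_ARRAY);
        glBindBuffer(GL_ARRAY_BUFFER, 0);
    }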
m_lightPosEye = m_modelView * vec4f(m_lightPos, 1.0); - // calculate half-angle vector between view and light - m_viewVector = -vec3f(m_modelView.get_row(2)); + // calculate half-angle vector between view and light + m_viewVector = -vec3f(m_modelView.get_row(2)); - if (dot(m_viewVector, m_lightVector) > 0) { - m_halfVector = normalize(m_viewVector + m_lightVector); - m_invertedView = false; - } else { - m_halfVector = normalize(-m_viewVector + m_lightVector); - m_invertedView = true; - } + if (dot(m_viewVector, m_lightVector) > 0) { + m_halfVector = normalize(m_viewVector + m_lightVector); + m_invertedView = false; + } + else { + m_halfVector = normalize(-m_viewVector + m_lightVector); + m_invertedView = true; + } - // calculate light view matrix - glMatrixMode(GL_MODELVIEW); - glPushMatrix(); - glLoadIdentity(); - gluLookAt(m_lightPos[0], m_lightPos[1], m_lightPos[2], m_lightTarget[0], - m_lightTarget[1], m_lightTarget[2], 0.0, 1.0, 0.0); + // calculate light view matrix + glMatrixMode(GL_MODELVIEW); + glPushMatrix(); + glLoadIdentity(); + gluLookAt(m_lightPos[0], + m_lightPos[1], + m_lightPos[2], + m_lightTarget[0], + m_lightTarget[1], + m_lightTarget[2], + 0.0, + 1.0, + 0.0); - // calculate light projection matrix - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadIdentity(); - gluPerspective(45.0, 1.0, 1.0, 200.0); + // calculate light projection matrix + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + gluPerspective(45.0, 1.0, 1.0, 200.0); - glGetFloatv(GL_MODELVIEW_MATRIX, (float *)m_lightView.get_value()); - glGetFloatv(GL_PROJECTION_MATRIX, (float *)m_lightProj.get_value()); + glGetFloatv(GL_MODELVIEW_MATRIX, (float *)m_lightView.get_value()); + glGetFloatv(GL_PROJECTION_MATRIX, (float *)m_lightProj.get_value()); - glMatrixMode(GL_PROJECTION); - glPopMatrix(); - glMatrixMode(GL_MODELVIEW); - glPopMatrix(); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); + glMatrixMode(GL_MODELVIEW); + glPopMatrix(); - // construct shadow matrix - matrix4f scale; - scale.set_scale(vec3f(0.5, 0.5, 0.5)); - matrix4f translate; - translate.set_translate(vec3f(0.5, 0.5, 0.5)); + // construct shadow matrix + matrix4f scale; + scale.set_scale(vec3f(0.5, 0.5, 0.5)); + matrix4f translate; + translate.set_translate(vec3f(0.5, 0.5, 0.5)); - m_shadowMatrix = - translate * scale * m_lightProj * m_lightView * inverse(m_modelView); + m_shadowMatrix = translate * scale * m_lightProj * m_lightView * inverse(m_modelView); - // calc object space eye position - m_eyePos = inverse(m_modelView) * vec4f(0.0, 0.0, 0.0, 1.0); + // calc object space eye position + m_eyePos = inverse(m_modelView) * vec4f(0.0, 0.0, 0.0, 1.0); - // calc half vector in eye space - m_halfVectorEye = m_modelView * vec4f(m_halfVector, 0.0); + // calc half vector in eye space + m_halfVectorEye = m_modelView * vec4f(m_halfVector, 0.0); } // draw slice of particles from camera view -void SmokeRenderer::drawSlice(int i) { - m_imageFbo->Bind(); - glViewport(0, 0, m_imageW, m_imageH); +void SmokeRenderer::drawSlice(int i) +{ + m_imageFbo->Bind(); + glViewport(0, 0, m_imageW, m_imageH); - glColor4f(1.0, 1.0, 1.0, m_spriteAlpha); + glColor4f(1.0, 1.0, 1.0, m_spriteAlpha); - if (m_invertedView) { - // front-to-back - glBlendFunc(GL_ONE_MINUS_DST_ALPHA, GL_ONE); - } else { - // back-to-front - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } + if (m_invertedView) { + // front-to-back + glBlendFunc(GL_ONE_MINUS_DST_ALPHA, GL_ONE); + } + else { + // back-to-front + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } - 
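calcVectors() above picks the slicing axis for the volume renderer: slices are ordered along the half-angle between the view and light directions, so a single traversal order is valid for both the camera composite and the light-buffer accumulation. When the camera faces away from the light, the view vector is negated and m_invertedView flips the composite to front-to-back blending. The selection logic in isolation:

    // Half-angle slicing axis, as in calcVectors() above.
    #include <cmath>

    struct V3 { float x, y, z; };
    static float dot(V3 a, V3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
    static V3 normalize(V3 a)
    {
        float l = std::sqrt(dot(a, a));
        return {a.x / l, a.y / l, a.z / l};
    }

    V3 halfVector(V3 view, V3 light, bool &invertedView)
    {
        invertedView = dot(view, light) <= 0.0f; // camera faces away from the light
        float s = invertedView ? -1.0f : 1.0f;
        return normalize({s * view.x + light.x, s * view.y + light.y, s * view.z + light.z});
    }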
drawPointSprites(m_particleShadowProg, i * m_batchSize, m_batchSize, true); + drawPointSprites(m_particleShadowProg, i * m_batchSize, m_batchSize, true); - m_imageFbo->Disable(); + m_imageFbo->Disable(); } // draw slice of particles from light's point of view -void SmokeRenderer::drawSliceLightView(int i) { - glMatrixMode(GL_MODELVIEW); - glPushMatrix(); - glLoadMatrixf((GLfloat *)m_lightView.get_value()); - - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadMatrixf((GLfloat *)m_lightProj.get_value()); - - m_lightFbo->Bind(); - glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); - - glColor4f(m_colorAttenuation[0] * m_shadowAlpha, - m_colorAttenuation[1] * m_shadowAlpha, - m_colorAttenuation[2] * m_shadowAlpha, 1.0); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_COLOR); - - drawPointSprites(m_particleProg, i * m_batchSize, m_batchSize, false); - - m_lightFbo->Disable(); - - glMatrixMode(GL_PROJECTION); - glPopMatrix(); - glMatrixMode(GL_MODELVIEW); - glPopMatrix(); -} - -// draw particles as slices with shadowing -void SmokeRenderer::drawSlices() { - m_batchSize = mNumParticles / m_numSlices; - - // clear light buffer - m_srcLightTexture = 0; - m_lightFbo->Bind(); - m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightTexture[m_srcLightTexture], - GL_COLOR_ATTACHMENT0_EXT); - glClearColor(1.0f - m_lightColor[0], 1.0f - m_lightColor[1], - 1.0f - m_lightColor[2], 0.0f); - glClear(GL_COLOR_BUFFER_BIT); - m_lightFbo->Disable(); - - // clear volume image - m_imageFbo->Bind(); - glClearColor(0.0, 0.0, 0.0, 0.0); - glClear(GL_COLOR_BUFFER_BIT); - m_imageFbo->Disable(); - - glActiveTexture(GL_TEXTURE0); - glMatrixMode(GL_TEXTURE); - glLoadMatrixf((GLfloat *)m_shadowMatrix.get_value()); - - // render slices - if (m_numDisplayedSlices > m_numSlices) m_numDisplayedSlices = m_numSlices; - - for (int i = 0; i < m_numDisplayedSlices; i++) { - // draw slice from camera view, sampling light buffer - drawSlice(i); - // draw slice from light view to light buffer, accumulating shadows - drawSliceLightView(i); - - if (m_doBlur) { - blurLightBuffer(); - } - } - - glActiveTexture(GL_TEXTURE0); - glMatrixMode(GL_TEXTURE); - glLoadIdentity(); -} - -// blur light buffer to simulate scattering effects -void SmokeRenderer::blurLightBuffer() { - m_lightFbo->Bind(); - m_lightFbo->AttachTexture(GL_TEXTURE_2D, - m_lightTexture[1 - m_srcLightTexture], - GL_COLOR_ATTACHMENT0_EXT); - glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); - - m_blurProg->enable(); - m_blurProg->bindTexture("tex", m_lightTexture[m_srcLightTexture], - GL_TEXTURE_2D, 0); - m_blurProg->setUniform2f("texelSize", 1.0f / (float)m_lightBufferSize, - 1.0f / (float)m_lightBufferSize); - m_blurProg->setUniform1f("blurRadius", m_blurRadius); - glDisable(GL_DEPTH_TEST); - drawQuad(); - m_blurProg->disable(); - - m_srcLightTexture = 1 - m_srcLightTexture; - - m_lightFbo->Disable(); -} - -// display texture to screen -void SmokeRenderer::displayTexture(GLuint tex) { - m_displayTexProg->enable(); - m_displayTexProg->bindTexture("tex", tex, GL_TEXTURE_2D, 0); - drawQuad(); - m_displayTexProg->disable(); -} - -// composite final volume image on top of scene -void SmokeRenderer::compositeResult() { - glViewport(0, 0, mWindowW, mWindowH); - glDisable(GL_DEPTH_TEST); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - glEnable(GL_BLEND); - displayTexture(m_imageTex); - glDisable(GL_BLEND); -} - -void SmokeRenderer::render() { - switch (mDisplayMode) { - case POINTS: - glColor3f(1.0, 1.0, 1.0); - m_simpleProg->enable(); - drawPoints(0, mNumParticles, false); - 
m_simpleProg->disable(); - break; - - case SPRITES: - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_COLOR); - glColor4f(1.0, 1.0, 1.0, m_spriteAlpha); - drawPointSprites(m_particleProg, 0, mNumParticles, false); - break; - - case VOLUMETRIC: - drawSlices(); - compositeResult(); - break; - - case NUM_MODES: - break; - } - - if (m_displayLightBuffer) { - // display light buffer to screen - glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); - glDisable(GL_DEPTH_TEST); - displayTexture(m_lightTexture[m_srcLightTexture]); - glViewport(0, 0, mWindowW, mWindowH); - } - - glutReportErrors(); -} - -// render scene depth to texture -// (this is to ensure that particle are correctly occluded in the low-resolution -// render buffer) -void SmokeRenderer::beginSceneRender(Target target) { - if (target == LIGHT_BUFFER) { - m_lightFbo->Bind(); - glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); - +void SmokeRenderer::drawSliceLightView(int i) +{ glMatrixMode(GL_MODELVIEW); glPushMatrix(); glLoadMatrixf((GLfloat *)m_lightView.get_value()); @@ -406,162 +267,318 @@ void SmokeRenderer::beginSceneRender(Target target) { glMatrixMode(GL_PROJECTION); glPushMatrix(); glLoadMatrixf((GLfloat *)m_lightProj.get_value()); - } else { - m_imageFbo->Bind(); - glViewport(0, 0, m_imageW, m_imageH); - } - glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); - glDepthMask(GL_TRUE); - glClear(GL_DEPTH_BUFFER_BIT); -} + m_lightFbo->Bind(); + glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); + + glColor4f(m_colorAttenuation[0] * m_shadowAlpha, + m_colorAttenuation[1] * m_shadowAlpha, + m_colorAttenuation[2] * m_shadowAlpha, + 1.0); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_COLOR); + + drawPointSprites(m_particleProg, i * m_batchSize, m_batchSize, false); -void SmokeRenderer::endSceneRender(Target target) { - if (target == LIGHT_BUFFER) { m_lightFbo->Disable(); + glMatrixMode(GL_PROJECTION); glPopMatrix(); glMatrixMode(GL_MODELVIEW); glPopMatrix(); - } else { - m_imageFbo->Disable(); - } +} - glViewport(0, 0, mWindowW, mWindowH); - glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); +// draw particles as slices with shadowing +void SmokeRenderer::drawSlices() +{ + m_batchSize = mNumParticles / m_numSlices; + + // clear light buffer + m_srcLightTexture = 0; + m_lightFbo->Bind(); + m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightTexture[m_srcLightTexture], GL_COLOR_ATTACHMENT0_EXT); + glClearColor(1.0f - m_lightColor[0], 1.0f - m_lightColor[1], 1.0f - m_lightColor[2], 0.0f); + glClear(GL_COLOR_BUFFER_BIT); + m_lightFbo->Disable(); + + // clear volume image + m_imageFbo->Bind(); + glClearColor(0.0, 0.0, 0.0, 0.0); + glClear(GL_COLOR_BUFFER_BIT); + m_imageFbo->Disable(); + + glActiveTexture(GL_TEXTURE0); + glMatrixMode(GL_TEXTURE); + glLoadMatrixf((GLfloat *)m_shadowMatrix.get_value()); + + // render slices + if (m_numDisplayedSlices > m_numSlices) + m_numDisplayedSlices = m_numSlices; + + for (int i = 0; i < m_numDisplayedSlices; i++) { + // draw slice from camera view, sampling light buffer + drawSlice(i); + // draw slice from light view to light buffer, accumulating shadows + drawSliceLightView(i); + + if (m_doBlur) { + blurLightBuffer(); + } + } + + glActiveTexture(GL_TEXTURE0); + glMatrixMode(GL_TEXTURE); + glLoadIdentity(); +} + +// blur light buffer to simulate scattering effects +void SmokeRenderer::blurLightBuffer() +{ + m_lightFbo->Bind(); + m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightTexture[1 - m_srcLightTexture], GL_COLOR_ATTACHMENT0_EXT); + glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); + + 
m_blurProg->enable(); + m_blurProg->bindTexture("tex", m_lightTexture[m_srcLightTexture], GL_TEXTURE_2D, 0); + m_blurProg->setUniform2f("texelSize", 1.0f / (float)m_lightBufferSize, 1.0f / (float)m_lightBufferSize); + m_blurProg->setUniform1f("blurRadius", m_blurRadius); + glDisable(GL_DEPTH_TEST); + drawQuad(); + m_blurProg->disable(); + + m_srcLightTexture = 1 - m_srcLightTexture; + + m_lightFbo->Disable(); +} + +// display texture to screen +void SmokeRenderer::displayTexture(GLuint tex) +{ + m_displayTexProg->enable(); + m_displayTexProg->bindTexture("tex", tex, GL_TEXTURE_2D, 0); + drawQuad(); + m_displayTexProg->disable(); +} + +// composite final volume image on top of scene +void SmokeRenderer::compositeResult() +{ + glViewport(0, 0, mWindowW, mWindowH); + glDisable(GL_DEPTH_TEST); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + glEnable(GL_BLEND); + displayTexture(m_imageTex); + glDisable(GL_BLEND); +} + +void SmokeRenderer::render() +{ + switch (mDisplayMode) { + case POINTS: + glColor3f(1.0, 1.0, 1.0); + m_simpleProg->enable(); + drawPoints(0, mNumParticles, false); + m_simpleProg->disable(); + break; + + case SPRITES: + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_COLOR); + glColor4f(1.0, 1.0, 1.0, m_spriteAlpha); + drawPointSprites(m_particleProg, 0, mNumParticles, false); + break; + + case VOLUMETRIC: + drawSlices(); + compositeResult(); + break; + + case NUM_MODES: + break; + } + + if (m_displayLightBuffer) { + // display light buffer to screen + glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); + glDisable(GL_DEPTH_TEST); + displayTexture(m_lightTexture[m_srcLightTexture]); + glViewport(0, 0, mWindowW, mWindowH); + } + + glutReportErrors(); +} + +// render scene depth to texture +// (this is to ensure that particle are correctly occluded in the low-resolution +// render buffer) +void SmokeRenderer::beginSceneRender(Target target) +{ + if (target == LIGHT_BUFFER) { + m_lightFbo->Bind(); + glViewport(0, 0, m_lightBufferSize, m_lightBufferSize); + + glMatrixMode(GL_MODELVIEW); + glPushMatrix(); + glLoadMatrixf((GLfloat *)m_lightView.get_value()); + + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadMatrixf((GLfloat *)m_lightProj.get_value()); + } + else { + m_imageFbo->Bind(); + glViewport(0, 0, m_imageW, m_imageH); + } + + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + glDepthMask(GL_TRUE); + glClear(GL_DEPTH_BUFFER_BIT); +} + +void SmokeRenderer::endSceneRender(Target target) +{ + if (target == LIGHT_BUFFER) { + m_lightFbo->Disable(); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); + glMatrixMode(GL_MODELVIEW); + glPopMatrix(); + } + else { + m_imageFbo->Disable(); + } + + glViewport(0, 0, mWindowW, mWindowH); + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); } // create an OpenGL texture -GLuint SmokeRenderer::createTexture(GLenum target, int w, int h, - GLint internalformat, GLenum format) { - GLuint texid; - glGenTextures(1, &texid); - glBindTexture(target, texid); +GLuint SmokeRenderer::createTexture(GLenum target, int w, int h, GLint internalformat, GLenum format) +{ + GLuint texid; + glGenTextures(1, &texid); + glBindTexture(target, texid); - glTexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(target, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(target, GL_TEXTURE_WRAP_S, 
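blurLightBuffer() above is a classic ping-pong pass: the FBO renders into light texture 1 - src while the shader samples texture src, then the roles swap, so repeated blur passes alternate between the two attachments without any copy. Reduced to its control flow:

    // Ping-pong between two textures: read src, write 1 - src, then swap.
    struct PingPong {
        unsigned tex[2];  // the two light textures
        int      src = 0; // which one currently holds the result
    };

    template <typename PassFn> // PassFn: void(unsigned srcTex, unsigned dstTex)
    void blurPasses(PingPong &pp, int n, PassFn pass)
    {
        for (int i = 0; i < n; i++) {
            pass(pp.tex[pp.src], pp.tex[1 - pp.src]); // one blur pass
            pp.src = 1 - pp.src;                      // result is now in the other texture
        }
    }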
GL_CLAMP_TO_EDGE); + glTexParameteri(target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(target, 0, internalformat, w, h, 0, format, GL_FLOAT, 0); - return texid; + glTexImage2D(target, 0, internalformat, w, h, 0, format, GL_FLOAT, 0); + return texid; } // create buffers for off-screen rendering -void SmokeRenderer::createBuffers(int w, int h) { - if (m_imageFbo) { - glDeleteTextures(1, &m_imageTex); - glDeleteTextures(1, &m_depthTex); - delete m_imageFbo; - } +void SmokeRenderer::createBuffers(int w, int h) +{ + if (m_imageFbo) { + glDeleteTextures(1, &m_imageTex); + glDeleteTextures(1, &m_depthTex); + delete m_imageFbo; + } - mWindowW = w; - mWindowH = h; + mWindowW = w; + mWindowH = h; - m_imageW = w / m_downSample; - m_imageH = h / m_downSample; + m_imageW = w / m_downSample; + m_imageH = h / m_downSample; - // create fbo for image buffer - GLint format = GL_RGBA16F_ARB; - // GLint format = GL_LUMINANCE16F_ARB; - // GLint format = GL_RGBA8; - m_imageTex = - createTexture(GL_TEXTURE_2D, m_imageW, m_imageH, format, GL_RGBA); - m_depthTex = createTexture(GL_TEXTURE_2D, m_imageW, m_imageH, - GL_DEPTH_COMPONENT24_ARB, GL_DEPTH_COMPONENT); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + // create fbo for image buffer + GLint format = GL_RGBA16F_ARB; + // GLint format = GL_LUMINANCE16F_ARB; + // GLint format = GL_RGBA8; + m_imageTex = createTexture(GL_TEXTURE_2D, m_imageW, m_imageH, format, GL_RGBA); + m_depthTex = createTexture(GL_TEXTURE_2D, m_imageW, m_imageH, GL_DEPTH_COMPONENT24_ARB, GL_DEPTH_COMPONENT); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - m_imageFbo = new FramebufferObject(); - m_imageFbo->AttachTexture(GL_TEXTURE_2D, m_imageTex, - GL_COLOR_ATTACHMENT0_EXT); - m_imageFbo->AttachTexture(GL_TEXTURE_2D, m_depthTex, GL_DEPTH_ATTACHMENT_EXT); - m_imageFbo->IsValid(); + m_imageFbo = new FramebufferObject(); + m_imageFbo->AttachTexture(GL_TEXTURE_2D, m_imageTex, GL_COLOR_ATTACHMENT0_EXT); + m_imageFbo->AttachTexture(GL_TEXTURE_2D, m_depthTex, GL_DEPTH_ATTACHMENT_EXT); + m_imageFbo->IsValid(); } -void SmokeRenderer::setLightColor(vec3f c) { - m_lightColor = c; +void SmokeRenderer::setLightColor(vec3f c) +{ + m_lightColor = c; - // set light texture border color - GLfloat borderColor[4] = {1.0f - m_lightColor[0], 1.0f - m_lightColor[1], - 1.0f - m_lightColor[2], 0.0f}; + // set light texture border color + GLfloat borderColor[4] = {1.0f - m_lightColor[0], 1.0f - m_lightColor[1], 1.0f - m_lightColor[2], 0.0f}; - glBindTexture(GL_TEXTURE_2D, m_lightTexture[0]); - glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, borderColor); + glBindTexture(GL_TEXTURE_2D, m_lightTexture[0]); + glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, borderColor); - glBindTexture(GL_TEXTURE_2D, m_lightTexture[1]); - glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, borderColor); + glBindTexture(GL_TEXTURE_2D, m_lightTexture[1]); + glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, borderColor); - glBindTexture(GL_TEXTURE_2D, 0); + glBindTexture(GL_TEXTURE_2D, 0); } // create FBOs for light buffer -void SmokeRenderer::createLightBuffer() { - GLint format = GL_RGBA16F_ARB; - // GLint format = GL_RGBA8; - // GLint format = GL_LUMINANCE16F_ARB; +void SmokeRenderer::createLightBuffer() +{ + GLint format = GL_RGBA16F_ARB; + // GLint format = GL_RGBA8; + // GLint format = GL_LUMINANCE16F_ARB; - 
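setLightColor() above works together with the GL_CLAMP_TO_BORDER wrap mode set in createLightBuffer(): the light buffer is cleared to 1 - m_lightColor (see drawSlices()), and the border color is set to the same value, so shadow-map lookups that fall outside the light's frustum read exactly like an unshadowed texel. In isolation:

    // Border color matches the light buffer's clear color, so out-of-frustum
    // shadow lookups read as "unshadowed".
    #include <GL/glew.h>

    void setShadowBorder(GLuint tex, float lr, float lg, float lb)
    {
        GLfloat border[4] = {1.0f - lr, 1.0f - lg, 1.0f - lb, 0.0f};
        glBindTexture(GL_TEXTURE_2D, tex);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
        glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, border);
        glBindTexture(GL_TEXTURE_2D, 0);
    }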
m_lightTexture[0] = createTexture(GL_TEXTURE_2D, m_lightBufferSize, - m_lightBufferSize, format, GL_RGBA); - // make shadows clamp to light color at edges - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER); + m_lightTexture[0] = createTexture(GL_TEXTURE_2D, m_lightBufferSize, m_lightBufferSize, format, GL_RGBA); + // make shadows clamp to light color at edges + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER); - m_lightTexture[1] = createTexture(GL_TEXTURE_2D, m_lightBufferSize, - m_lightBufferSize, format, GL_RGBA); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER); + m_lightTexture[1] = createTexture(GL_TEXTURE_2D, m_lightBufferSize, m_lightBufferSize, format, GL_RGBA); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER); - m_lightDepthTexture = - createTexture(GL_TEXTURE_2D, m_lightBufferSize, m_lightBufferSize, - GL_DEPTH_COMPONENT24_ARB, GL_DEPTH_COMPONENT); + m_lightDepthTexture = createTexture( + GL_TEXTURE_2D, m_lightBufferSize, m_lightBufferSize, GL_DEPTH_COMPONENT24_ARB, GL_DEPTH_COMPONENT); - m_lightFbo = new FramebufferObject(); - m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightTexture[m_srcLightTexture], - GL_COLOR_ATTACHMENT0_EXT); - m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightDepthTexture, - GL_DEPTH_ATTACHMENT_EXT); - m_lightFbo->IsValid(); + m_lightFbo = new FramebufferObject(); + m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightTexture[m_srcLightTexture], GL_COLOR_ATTACHMENT0_EXT); + m_lightFbo->AttachTexture(GL_TEXTURE_2D, m_lightDepthTexture, GL_DEPTH_ATTACHMENT_EXT); + m_lightFbo->IsValid(); } -void SmokeRenderer::setWindowSize(int w, int h) { - mAspect = (float)mWindowW / (float)mWindowH; - mInvFocalLen = tan(mFov * 0.5f * NV_PI / 180.0f); +void SmokeRenderer::setWindowSize(int w, int h) +{ + mAspect = (float)mWindowW / (float)mWindowH; + mInvFocalLen = tan(mFov * 0.5f * NV_PI / 180.0f); - createBuffers(w, h); + createBuffers(w, h); } -void SmokeRenderer::drawQuad() { - glBegin(GL_QUADS); - glTexCoord2f(0.0f, 0.0f); - glVertex2f(-1.0f, -1.0f); - glTexCoord2f(1.0f, 0.0f); - glVertex2f(1.0f, -1.0f); - glTexCoord2f(1.0f, 1.0f); - glVertex2f(1.0f, 1.0f); - glTexCoord2f(0.0f, 1.0f); - glVertex2f(-1.0f, 1.0f); - glEnd(); +void SmokeRenderer::drawQuad() +{ + glBegin(GL_QUADS); + glTexCoord2f(0.0f, 0.0f); + glVertex2f(-1.0f, -1.0f); + glTexCoord2f(1.0f, 0.0f); + glVertex2f(1.0f, -1.0f); + glTexCoord2f(1.0f, 1.0f); + glVertex2f(1.0f, 1.0f); + glTexCoord2f(0.0f, 1.0f); + glVertex2f(-1.0f, 1.0f); + glEnd(); } -void SmokeRenderer::drawVector(vec3f v) { - glBegin(GL_LINES); - glVertex3f(0.0f, 0.0f, 0.0f); - glVertex3fv((float *)&v[0]); - glEnd(); +void SmokeRenderer::drawVector(vec3f v) +{ + glBegin(GL_LINES); + glVertex3f(0.0f, 0.0f, 0.0f); + glVertex3fv((float *)&v[0]); + glEnd(); } // render vectors to screen for debugging -void SmokeRenderer::debugVectors() { - glColor3f(1.0f, 1.0f, 0.0f); - drawVector(m_lightVector); +void SmokeRenderer::debugVectors() +{ + glColor3f(1.0f, 1.0f, 0.0f); + drawVector(m_lightVector); - glColor3f(0.0f, 1.0f, 0.0f); - drawVector(m_viewVector); + glColor3f(0.0f, 1.0f, 0.0f); + drawVector(m_viewVector); - glColor3f(0.0f, 0.0f, 1.0f); - drawVector(-m_viewVector); + 
glColor3f(0.0f, 0.0f, 1.0f); + drawVector(-m_viewVector); - glColor3f(1.0f, 0.0f, 0.0f); - drawVector(m_halfVector); + glColor3f(1.0f, 0.0f, 0.0f); + drawVector(m_halfVector); } diff --git a/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.h b/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.h index 8a6b4bfb..44df5cdd 100644 --- a/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.h +++ b/Samples/5_Domain_Specific/smokeParticles/SmokeRenderer.h @@ -30,139 +30,139 @@ #ifndef SMOKE_RENDERER_H #define SMOKE_RENDERER_H -#include "framebufferObject.h" #include "GLSLProgram.h" +#include "framebufferObject.h" #include "nvMath.h" using namespace nv; -class SmokeRenderer { - public: - SmokeRenderer(int maxParticles); - ~SmokeRenderer(); +class SmokeRenderer +{ +public: + SmokeRenderer(int maxParticles); + ~SmokeRenderer(); - enum DisplayMode { POINTS, SPRITES, VOLUMETRIC, NUM_MODES }; + enum DisplayMode { POINTS, SPRITES, VOLUMETRIC, NUM_MODES }; - enum Target { LIGHT_BUFFER, SCENE_BUFFER }; + enum Target { LIGHT_BUFFER, SCENE_BUFFER }; - void setDisplayMode(DisplayMode mode) { mDisplayMode = mode; } + void setDisplayMode(DisplayMode mode) { mDisplayMode = mode; } - void setNumParticles(unsigned int x) { mNumParticles = x; } - void setPositionBuffer(GLuint vbo) { mPosVbo = vbo; } - void setVelocityBuffer(GLuint vbo) { mVelVbo = vbo; } - void setColorBuffer(GLuint vbo) { mColorVbo = vbo; } - void setIndexBuffer(GLuint ib) { mIndexBuffer = ib; } + void setNumParticles(unsigned int x) { mNumParticles = x; } + void setPositionBuffer(GLuint vbo) { mPosVbo = vbo; } + void setVelocityBuffer(GLuint vbo) { mVelVbo = vbo; } + void setColorBuffer(GLuint vbo) { mColorVbo = vbo; } + void setIndexBuffer(GLuint ib) { mIndexBuffer = ib; } - void setParticleRadius(float x) { mParticleRadius = x; } - void setWindowSize(int w, int h); - void setFOV(float fov) { mFov = fov; } + void setParticleRadius(float x) { mParticleRadius = x; } + void setWindowSize(int w, int h); + void setFOV(float fov) { mFov = fov; } - void setNumSlices(int x) { m_numSlices = x; } - void setNumDisplayedSlices(int x) { m_numDisplayedSlices = x; } + void setNumSlices(int x) { m_numSlices = x; } + void setNumDisplayedSlices(int x) { m_numDisplayedSlices = x; } - void setAlpha(float x) { m_spriteAlpha = x; } - void setShadowAlpha(float x) { m_shadowAlpha = x; } - void setColorAttenuation(vec3f c) { m_colorAttenuation = c; } - void setLightColor(vec3f c); + void setAlpha(float x) { m_spriteAlpha = x; } + void setShadowAlpha(float x) { m_shadowAlpha = x; } + void setColorAttenuation(vec3f c) { m_colorAttenuation = c; } + void setLightColor(vec3f c); - void setDoBlur(bool b) { m_doBlur = b; } - void setBlurRadius(float x) { m_blurRadius = x; } - void setDisplayLightBuffer(bool b) { m_displayLightBuffer = b; } + void setDoBlur(bool b) { m_doBlur = b; } + void setBlurRadius(float x) { m_blurRadius = x; } + void setDisplayLightBuffer(bool b) { m_displayLightBuffer = b; } - void beginSceneRender(Target target); - void endSceneRender(Target target); + void beginSceneRender(Target target); + void endSceneRender(Target target); - void setLightPosition(vec3f v) { m_lightPos = v; } - void setLightTarget(vec3f v) { m_lightTarget = v; } + void setLightPosition(vec3f v) { m_lightPos = v; } + void setLightTarget(vec3f v) { m_lightTarget = v; } - vec4f getLightPositionEyeSpace() { return m_lightPosEye; } - matrix4f getShadowMatrix() { return m_shadowMatrix; } + vec4f getLightPositionEyeSpace() { return m_lightPosEye; } + matrix4f getShadowMatrix() 
{ return m_shadowMatrix; } - GLuint getShadowTexture() { return m_lightTexture[m_srcLightTexture]; } + GLuint getShadowTexture() { return m_lightTexture[m_srcLightTexture]; } - void calcVectors(); - vec3f getSortVector() { return m_halfVector; } + void calcVectors(); + vec3f getSortVector() { return m_halfVector; } - void render(); - void debugVectors(); + void render(); + void debugVectors(); - private: - void drawPoints(int start, int count, bool sort); - void drawPointSprites(GLSLProgram *prog, int start, int count, bool shadowed); +private: + void drawPoints(int start, int count, bool sort); + void drawPointSprites(GLSLProgram *prog, int start, int count, bool shadowed); - void drawSlice(int i); - void drawSliceLightView(int i); - void drawSlices(); - void displayTexture(GLuint tex); - void compositeResult(); - void blurLightBuffer(); - void depthSort(); + void drawSlice(int i); + void drawSliceLightView(int i); + void drawSlices(); + void displayTexture(GLuint tex); + void compositeResult(); + void blurLightBuffer(); + void depthSort(); - GLuint createTexture(GLenum target, int w, int h, GLint internalformat, - GLenum format); - void createBuffers(int w, int h); - void createLightBuffer(); + GLuint createTexture(GLenum target, int w, int h, GLint internalformat, GLenum format); + void createBuffers(int w, int h); + void createLightBuffer(); - void drawQuad(); - void drawVector(vec3f v); + void drawQuad(); + void drawVector(vec3f v); - // particle data - unsigned int mMaxParticles; - unsigned int mNumParticles; + // particle data + unsigned int mMaxParticles; + unsigned int mNumParticles; - GLuint mPosVbo; - GLuint mVelVbo; - GLuint mColorVbo; - GLuint mIndexBuffer; + GLuint mPosVbo; + GLuint mVelVbo; + GLuint mColorVbo; + GLuint mIndexBuffer; - float mParticleRadius; - DisplayMode mDisplayMode; + float mParticleRadius; + DisplayMode mDisplayMode; - // window - unsigned int mWindowW, mWindowH; - float mAspect, mInvFocalLen; - float mFov; + // window + unsigned int mWindowW, mWindowH; + float mAspect, mInvFocalLen; + float mFov; - int m_downSample; - int m_imageW, m_imageH; + int m_downSample; + int m_imageW, m_imageH; - int m_numSlices; - int m_numDisplayedSlices; - int m_batchSize; - int m_sliceNo; + int m_numSlices; + int m_numDisplayedSlices; + int m_batchSize; + int m_sliceNo; - float m_shadowAlpha; - float m_spriteAlpha; - bool m_doBlur; - float m_blurRadius; - bool m_displayLightBuffer; + float m_shadowAlpha; + float m_spriteAlpha; + bool m_doBlur; + float m_blurRadius; + bool m_displayLightBuffer; - vec3f m_lightVector, m_lightPos, m_lightTarget; - vec3f m_lightColor; - vec3f m_colorAttenuation; - float m_lightDistance; + vec3f m_lightVector, m_lightPos, m_lightTarget; + vec3f m_lightColor; + vec3f m_colorAttenuation; + float m_lightDistance; - matrix4f m_modelView, m_lightView, m_lightProj, m_shadowMatrix; - vec3f m_viewVector, m_halfVector; - bool m_invertedView; - vec4f m_eyePos; - vec4f m_halfVectorEye; - vec4f m_lightPosEye; + matrix4f m_modelView, m_lightView, m_lightProj, m_shadowMatrix; + vec3f m_viewVector, m_halfVector; + bool m_invertedView; + vec4f m_eyePos; + vec4f m_halfVectorEye; + vec4f m_lightPosEye; - // programs - GLSLProgram *m_simpleProg; - GLSLProgram *m_particleProg, *m_particleShadowProg; - GLSLProgram *m_displayTexProg, *m_blurProg; + // programs + GLSLProgram *m_simpleProg; + GLSLProgram *m_particleProg, *m_particleShadowProg; + GLSLProgram *m_displayTexProg, *m_blurProg; - // image buffers - int m_lightBufferSize; - GLuint m_lightTexture[2]; - int 
m_srcLightTexture; - GLuint m_lightDepthTexture; - FramebufferObject *m_lightFbo; + // image buffers + int m_lightBufferSize; + GLuint m_lightTexture[2]; + int m_srcLightTexture; + GLuint m_lightDepthTexture; + FramebufferObject *m_lightFbo; - GLuint m_imageTex, m_depthTex; - FramebufferObject *m_imageFbo; + GLuint m_imageTex, m_depthTex; + FramebufferObject *m_imageFbo; }; #endif diff --git a/Samples/5_Domain_Specific/smokeParticles/SmokeShaders.cpp b/Samples/5_Domain_Specific/smokeParticles/SmokeShaders.cpp index f0793e7b..c446ed1f 100644 --- a/Samples/5_Domain_Specific/smokeParticles/SmokeShaders.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/SmokeShaders.cpp @@ -38,231 +38,229 @@ const char *particleVS = STRINGIFY( \n vec4 wpos = vec4(gl_Vertex.xyz, 1.0); \n gl_Position = gl_ModelViewProjectionMatrix *wpos; \n - + // calculate window-space point size \n vec4 eyeSpacePos = gl_ModelViewMatrix *wpos; \n float dist = length(eyeSpacePos.xyz); \n gl_PointSize = pointRadius * (pointScale / dist); \n - + gl_TexCoord[0] = gl_MultiTexCoord0; // sprite texcoord \n gl_TexCoord[1] = eyeSpacePos; \n - + gl_FrontColor = gl_Color; \n } \n ); // motion blur shaders -const char *mblurVS = STRINGIFY( - uniform float timestep; \n - void main() \n - { - \n - vec3 pos = gl_Vertex.xyz; \n - vec3 vel = gl_MultiTexCoord0.xyz; \n - vec3 pos2 = (pos - vel*timestep).xyz; // previous position \n +const char *mblurVS = STRINGIFY(uniform float timestep; \n void + main() \n { + \n vec3 pos = gl_Vertex.xyz; + \n vec3 vel = gl_MultiTexCoord0.xyz; + \n vec3 pos2 = (pos - vel * timestep).xyz; // previous position \n - gl_Position = gl_ModelViewMatrix * vec4(pos, 1.0); \n // eye space - gl_TexCoord[0] = gl_ModelViewMatrix * vec4(pos2, 1.0); \n + gl_Position = gl_ModelViewMatrix * vec4(pos, 1.0); + \n // eye space + gl_TexCoord[0] = gl_ModelViewMatrix * vec4(pos2, 1.0); + \n - // aging \n - float lifetime = gl_MultiTexCoord0.w; \n - float age = gl_Vertex.w; \n - float phase = (lifetime > 0.0) ? (age / lifetime) : 1.0; \n // [0, 1] + // aging \n + float lifetime = gl_MultiTexCoord0.w; + \n float age = gl_Vertex.w; + \n float phase = (lifetime > 0.0) ? 
(age / lifetime) : 1.0; + \n // [0, 1] - gl_TexCoord[1].x = phase; \n - float fade = 1.0 - phase; \n - // float fade = 1.0; \n + gl_TexCoord[1] + .x = phase; + \n float fade = 1.0 - phase; + \n + // float fade = 1.0; \n - // gl_FrontColor = gl_Color; \n - gl_FrontColor = vec4(gl_Color.xyz, gl_Color.w*fade); \n - } \n - ); + // gl_FrontColor = gl_Color; \n + gl_FrontColor = vec4(gl_Color.xyz, gl_Color.w * fade); + \n + } \n); // motion blur geometry shader // - outputs stretched quad between previous and current positions -const char *mblurGS = - "#version 120\n" - "#extension GL_EXT_geometry_shader4 : enable\n" - STRINGIFY( - uniform float pointRadius; // point size in world space \n - void main() \n - { +const char *mblurGS = "#version 120\n" + "#extension GL_EXT_geometry_shader4 : enable\n" STRINGIFY( + uniform float pointRadius; // point size in world space \n + void main() \n { + \n + // aging \n + float phase = gl_TexCoordIn[0][1].x; + \n float radius = pointRadius; + \n + + // eye space \n + vec3 pos = gl_PositionIn[0].xyz; + \n vec3 pos2 = gl_TexCoordIn[0][0].xyz; + \n vec3 motion = pos - pos2; + \n vec3 dir = normalize(motion); + \n float len = length(motion); + \n + + vec3 x = dir * radius; + \n vec3 view = normalize(-pos); + \n vec3 y = normalize(cross(dir, view)) * radius; + \n float facing = dot(view, dir); + \n + + // check for very small motion to avoid jitter \n + float threshold = 0.01; + \n + + if ((len < threshold) || (facing > 0.95) || (facing < -0.95)) + { + \n pos2 = pos; + \n x = vec3(radius, 0.0, 0.0); + \n y = vec3(0.0, -radius, 0.0); + \n + } + \n + + // output quad \n + gl_FrontColor = gl_FrontColorIn[0]; + \n gl_TexCoord[0] = vec4(0, 0, 0, phase); + \n gl_TexCoord[1] = gl_PositionIn[0]; + \n gl_Position = gl_ProjectionMatrix * vec4(pos + x + y, 1); + \n EmitVertex(); + \n + + gl_TexCoord[0] = vec4(0, 1, 0, phase); + \n gl_TexCoord[1] = gl_PositionIn[0]; + \n gl_Position = gl_ProjectionMatrix * vec4(pos + x - y, 1); + \n EmitVertex(); + \n + + gl_TexCoord[0] = vec4(1, 0, 0, phase); + \n gl_TexCoord[1] = gl_PositionIn[0]; + \n gl_Position = gl_ProjectionMatrix * vec4(pos2 - x + y, 1); + \n EmitVertex(); + \n + + gl_TexCoord[0] = vec4(1, 1, 0, phase); + \n gl_TexCoord[1] = gl_PositionIn[0]; + \n gl_Position = gl_ProjectionMatrix * vec4(pos2 - x - y, 1); + \n EmitVertex(); + \n + } \n); + + +const char *simplePS = STRINGIFY(void main() \n { + \n gl_FragColor = gl_Color; \n - // aging \n - float phase = gl_TexCoordIn[0][1].x; \n - float radius = pointRadius; \n - - // eye space \n - vec3 pos = gl_PositionIn[0].xyz; \n - vec3 pos2 = gl_TexCoordIn[0][0].xyz; \n - vec3 motion = pos - pos2; \n - vec3 dir = normalize(motion); \n - float len = length(motion); \n - - vec3 x = dir *radius; \n - vec3 view = normalize(-pos); \n - vec3 y = normalize(cross(dir, view)) * radius; \n - float facing = dot(view, dir); \n - - // check for very small motion to avoid jitter \n - float threshold = 0.01; \n - - if ((len < threshold) || (facing > 0.95) || (facing < -0.95)) - { - \n - pos2 = pos; - \n - x = vec3(radius, 0.0, 0.0); - \n - y = vec3(0.0, -radius, 0.0); - \n - } \n - - // output quad \n - gl_FrontColor = gl_FrontColorIn[0]; \n - gl_TexCoord[0] = vec4(0, 0, 0, phase); \n - gl_TexCoord[1] = gl_PositionIn[0]; \n - gl_Position = gl_ProjectionMatrix * vec4(pos + x + y, 1); \n - EmitVertex(); \n - - gl_TexCoord[0] = vec4(0, 1, 0, phase); \n - gl_TexCoord[1] = gl_PositionIn[0]; \n - gl_Position = gl_ProjectionMatrix * vec4(pos + x - y, 1); \n - EmitVertex(); \n - - gl_TexCoord[0] = 
vec4(1, 0, 0, phase); \n - gl_TexCoord[1] = gl_PositionIn[0]; \n - gl_Position = gl_ProjectionMatrix * vec4(pos2 - x + y, 1); \n - EmitVertex(); \n - - gl_TexCoord[0] = vec4(1, 1, 0, phase); \n - gl_TexCoord[1] = gl_PositionIn[0]; \n - gl_Position = gl_ProjectionMatrix * vec4(pos2 - x - y, 1); \n - EmitVertex(); \n - } \n - ); - - -const char *simplePS = STRINGIFY( - void main() \n - { - \n - gl_FragColor = gl_Color; \n - } \n - ); +} \n); // render particle without shadows -const char *particlePS = STRINGIFY( - uniform float pointRadius; \n - void main() \n - { - \n - // calculate eye-space sphere normal from texture coordinates \n - vec3 N; \n - N.xy = gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0); \n - float r2 = dot(N.xy, N.xy); \n +const char *particlePS = STRINGIFY(uniform float pointRadius; \n void + main() \n { + \n + // calculate eye-space sphere normal from texture coordinates \n + vec3 N; + \n N.xy = gl_TexCoord[0].xy * vec2(2.0, -2.0) + vec2(-1.0, 1.0); + \n float r2 = dot(N.xy, N.xy); + \n - if (r2 > 1.0) discard; // kill pixels outside circle \n - N.z = sqrt(1.0-r2); \n + if (r2 > 1.0) discard; // kill pixels outside circle \n + N.z = sqrt(1.0 - r2); + \n - // float alpha = saturate(1.0 - r2); \n - float alpha = clamp((1.0 - r2), 0.0, 1.0); \n - alpha *= gl_Color.w; \n + // float alpha = saturate(1.0 - r2); \n + float alpha = clamp((1.0 - r2), 0.0, 1.0); + \n alpha *= gl_Color.w; + \n - gl_FragColor = vec4(gl_Color.xyz * alpha, alpha); \n - } \n - ); + gl_FragColor = vec4(gl_Color.xyz * alpha, alpha); + \n + } \n); // render particle including shadows const char *particleShadowPS = STRINGIFY( - uniform float pointRadius; \n - uniform sampler2D shadowTex; \n - uniform sampler2D depthTex; \n - void main() \n - { - \n - // calculate eye-space sphere normal from texture coordinates \n - vec3 N; \n - N.xy = gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0); \n - float r2 = dot(N.xy, N.xy); \n + uniform float pointRadius; \n uniform sampler2D shadowTex; \n uniform sampler2D depthTex; \n void + main() \n { + \n + // calculate eye-space sphere normal from texture coordinates \n + vec3 N; + \n N.xy = gl_TexCoord[0].xy * vec2(2.0, -2.0) + vec2(-1.0, 1.0); + \n float r2 = dot(N.xy, N.xy); + \n - if (r2 > 1.0) discard; \n // kill pixels outside circle - N.z = sqrt(1.0-r2); \n - vec4 eyeSpacePos = gl_TexCoord[1]; \n - vec4 eyeSpaceSpherePos = vec4(eyeSpacePos.xyz + N*pointRadius, 1.0); \n // point on sphere - vec4 shadowPos = gl_TextureMatrix[0] * eyeSpaceSpherePos; \n - vec3 shadow = vec3(1.0) - texture2DProj(shadowTex, shadowPos.xyw).xyz; \n - // float alpha = saturate(1.0 - r2); \n - float alpha = clamp((1.0 - r2), 0.0, 1.0); \n - alpha *= gl_Color.w; \n + if (r2 > 1.0) discard; + \n // kill pixels outside circle + N.z = sqrt(1.0 - r2); + \n vec4 eyeSpacePos = gl_TexCoord[1]; + \n vec4 eyeSpaceSpherePos = vec4(eyeSpacePos.xyz + N * pointRadius, 1.0); + \n // point on sphere + vec4 shadowPos = gl_TextureMatrix[0] * eyeSpaceSpherePos; + \n vec3 shadow = vec3(1.0) - texture2DProj(shadowTex, shadowPos.xyw).xyz; + \n + // float alpha = saturate(1.0 - r2); \n + float alpha = clamp((1.0 - r2), 0.0, 1.0); + \n alpha *= gl_Color.w; + \n - gl_FragColor = vec4(gl_Color.xyz *shadow * alpha, alpha); \n // premul alpha - } - ); + gl_FragColor = vec4(gl_Color.xyz * shadow * alpha, alpha); + \n // premul alpha + }); // render particle as lit sphere const char *particleSpherePS = STRINGIFY( - uniform float pointRadius; \n - uniform vec3 lightDir = vec3(0.577, 0.577, 0.577); \n - void main() \n 
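An aside on the STRINGIFY wrapper used for every shader source in this file: its definition is not part of this diff, but the usual pattern in these samples is a plain preprocessor stringizer. A minimal sketch under that assumption (only the macro name is taken from the sample; the tiny shader and main() are illustrative):

#include <cstdio>

// Assumed definition -- the sample defines STRINGIFY elsewhere; a plain
// stringizer like this is the conventional form.
#define STRINGIFY(A) #A

// Stringizing collapses real newlines to single spaces, which is why the
// sources above embed literal \n tokens: the backslash-n pair survives into
// the generated string literal, and the C++ compiler then turns it into a
// newline (this relies on the preprocessor tolerating the stray backslash
// token, which the toolchains these samples target do).
const char *tinyVS = STRINGIFY(void main() { gl_Position = gl_Vertex; });

int main()
{
    std::printf("%s\n", tinyVS); // prints roughly: void main() { gl_Position = gl_Vertex; }
    return 0;
}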
- { + uniform float pointRadius; \n uniform vec3 lightDir = vec3(0.577, 0.577, 0.577); \n void + main() \n { + \n + // calculate eye-space sphere normal from texture coordinates \n + vec3 N; + \n N.xy = gl_TexCoord[0].xy * vec2(2.0, -2.0) + vec2(-1.0, 1.0); + \n float r2 = dot(N.xy, N.xy); + \n + + if (r2 > 1.0) discard; // kill pixels outside circle \n + N.z = sqrt(1.0 - r2); + \n + + // calculate depth \n + vec4 eyeSpacePos = + vec4(gl_TexCoord[1].xyz + N * pointRadius, 1.0); // position of this pixel on sphere in eye space \n + vec4 clipSpacePos = gl_ProjectionMatrix * eyeSpacePos; + \n gl_FragDepth = (clipSpacePos.z / clipSpacePos.w) * 0.5 + 0.5; + \n + + float diffuse = max(0.0, dot(N, lightDir)); + \n + + gl_FragColor = diffuse * gl_Color; + \n + } \n); + +const char *passThruVS = STRINGIFY(void main() \n { + \n gl_Position = gl_Vertex; + \n gl_TexCoord[0] = gl_MultiTexCoord0; + \n gl_FrontColor = gl_Color; \n - // calculate eye-space sphere normal from texture coordinates \n - vec3 N; \n - N.xy = gl_TexCoord[0].xy*vec2(2.0, -2.0) + vec2(-1.0, 1.0); \n - float r2 = dot(N.xy, N.xy); \n +} \n); - if (r2 > 1.0) discard; // kill pixels outside circle \n - N.z = sqrt(1.0-r2); \n - - // calculate depth \n - vec4 eyeSpacePos = vec4(gl_TexCoord[1].xyz + N*pointRadius, 1.0); // position of this pixel on sphere in eye space \n - vec4 clipSpacePos = gl_ProjectionMatrix *eyeSpacePos; \n - gl_FragDepth = (clipSpacePos.z / clipSpacePos.w)*0.5+0.5; \n - - float diffuse = max(0.0, dot(N, lightDir)); \n - - gl_FragColor = diffuse *gl_Color; \n - } \n - ); - -const char *passThruVS = STRINGIFY( - void main() \n - { - \n - gl_Position = gl_Vertex; \n - gl_TexCoord[0] = gl_MultiTexCoord0; \n - gl_FrontColor = gl_Color; \n - } \n - ); - -const char *texture2DPS = STRINGIFY( - uniform sampler2D tex; \n - void main() \n - { - \n - gl_FragColor = texture2D(tex, gl_TexCoord[0].xy); \n - } \n - ); +const char *texture2DPS = STRINGIFY(uniform sampler2D tex; \n void + main() \n { + \n gl_FragColor = texture2D(tex, gl_TexCoord[0].xy); + \n + } \n); // 4 tap 3x3 gaussian blur const char *blurPS = STRINGIFY( - uniform sampler2D tex; \n - uniform vec2 texelSize; \n - uniform float blurRadius; \n - void main() \n - { - \n - vec4 c; \n - c = texture2D(tex, gl_TexCoord[0].xy + vec2(-0.5, -0.5)*texelSize*blurRadius); \n - c += texture2D(tex, gl_TexCoord[0].xy + vec2(0.5, -0.5)*texelSize*blurRadius); \n - c += texture2D(tex, gl_TexCoord[0].xy + vec2(0.5, 0.5)*texelSize*blurRadius); \n - c += texture2D(tex, gl_TexCoord[0].xy + vec2(-0.5, 0.5)*texelSize*blurRadius); \n - c *= 0.25; \n + uniform sampler2D tex; \n uniform vec2 texelSize; \n uniform float blurRadius; \n void + main() \n { + \n vec4 c; + \n c = texture2D(tex, gl_TexCoord[0].xy + vec2(-0.5, -0.5) * texelSize * blurRadius); + \n c += texture2D(tex, gl_TexCoord[0].xy + vec2(0.5, -0.5) * texelSize * blurRadius); + \n c += texture2D(tex, gl_TexCoord[0].xy + vec2(0.5, 0.5) * texelSize * blurRadius); + \n c += texture2D(tex, gl_TexCoord[0].xy + vec2(-0.5, 0.5) * texelSize * blurRadius); + \n c *= 0.25; + \n - gl_FragColor = c; \n - } \n - ); + gl_FragColor = c; + \n + } \n); // floor shader const char *floorVS = STRINGIFY( diff --git a/Samples/5_Domain_Specific/smokeParticles/framebufferObject.cpp b/Samples/5_Domain_Specific/smokeParticles/framebufferObject.cpp index 8681b181..5e02e67a 100644 --- a/Samples/5_Domain_Specific/smokeParticles/framebufferObject.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/framebufferObject.cpp @@ -67,289 +67,300 @@ */ #define 
HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION -#include #include "framebufferObject.h" + +#include #include using namespace std; FramebufferObject::FramebufferObject() - : m_fboId(_GenerateFboId()), m_savedFboId(0) { - // Bind this FBO so that it actually gets created now - _GuardedBind(); - _GuardedUnbind(); + : m_fboId(_GenerateFboId()) + , m_savedFboId(0) +{ + // Bind this FBO so that it actually gets created now + _GuardedBind(); + _GuardedUnbind(); } -FramebufferObject::~FramebufferObject() { - glDeleteFramebuffersEXT(1, &m_fboId); +FramebufferObject::~FramebufferObject() { glDeleteFramebuffersEXT(1, &m_fboId); } + +void FramebufferObject::Bind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_fboId); } + +void FramebufferObject::Disable() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); } + +void FramebufferObject::AttachTexture(GLenum texTarget, GLuint texId, GLenum attachment, int mipLevel, int zSlice) +{ + _GuardedBind(); + + /* + #ifndef NDEBUG + if( GetAttachedId(attachment) != texId ) { + #endif + */ + + _FramebufferTextureND(attachment, texTarget, texId, mipLevel, zSlice); + + /* + #ifndef NDEBUG + } + else { + cerr << "FramebufferObject::AttachTexture PERFORMANCE WARNING:\n" + << "\tRedundant bind of texture (id = " << texId << ").\n" + << "\tHINT : Compile with -DNDEBUG to remove this warning.\n"; + } + #endif + */ + + _GuardedUnbind(); } -void FramebufferObject::Bind() { - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_fboId); +void FramebufferObject::AttachTextures(int numTextures, + GLenum texTarget[], + GLuint texId[], + GLenum attachment[], + int mipLevel[], + int zSlice[]) +{ + for (int i = 0; i < numTextures; ++i) { + AttachTexture(texTarget[i], + texId[i], + attachment ? attachment[i] : (GL_COLOR_ATTACHMENT0_EXT + i), + mipLevel ? mipLevel[i] : 0, + zSlice ? zSlice[i] : 0); + } } -void FramebufferObject::Disable() { - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); -} +void FramebufferObject::AttachRenderBuffer(GLuint buffId, GLenum attachment) +{ + _GuardedBind(); -void FramebufferObject::AttachTexture(GLenum texTarget, GLuint texId, - GLenum attachment, int mipLevel, - int zSlice) { - _GuardedBind(); +#ifndef NDEBUG - /* - #ifndef NDEBUG - if( GetAttachedId(attachment) != texId ) { - #endif - */ + if (GetAttachedId(attachment) != buffId) { +#endif - _FramebufferTextureND(attachment, texTarget, texId, mipLevel, zSlice); + glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, attachment, GL_RENDERBUFFER_EXT, buffId); - /* - #ifndef NDEBUG +#ifndef NDEBUG } else { - cerr << "FramebufferObject::AttachTexture PERFORMANCE WARNING:\n" - << "\tRedundant bind of texture (id = " << texId << ").\n" - << "\tHINT : Compile with -DNDEBUG to remove this warning.\n"; + cerr << "FramebufferObject::AttachRenderBuffer PERFORMANCE WARNING:\n" + << "\tRedundant bind of Renderbuffer (id = " << buffId << ")\n" + << "\tHINT : Compile with -DNDEBUG to remove this warning.\n"; } - #endif - */ - - _GuardedUnbind(); -} - -void FramebufferObject::AttachTextures(int numTextures, GLenum texTarget[], - GLuint texId[], GLenum attachment[], - int mipLevel[], int zSlice[]) { - for (int i = 0; i < numTextures; ++i) { - AttachTexture(texTarget[i], texId[i], - attachment ? attachment[i] : (GL_COLOR_ATTACHMENT0_EXT + i), - mipLevel ? mipLevel[i] : 0, zSlice ? 
zSlice[i] : 0); - } -} - -void FramebufferObject::AttachRenderBuffer(GLuint buffId, GLenum attachment) { - _GuardedBind(); - -#ifndef NDEBUG - - if (GetAttachedId(attachment) != buffId) { -#endif - - glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, attachment, - GL_RENDERBUFFER_EXT, buffId); - -#ifndef NDEBUG - } else { - cerr << "FramebufferObject::AttachRenderBuffer PERFORMANCE WARNING:\n" - << "\tRedundant bind of Renderbuffer (id = " << buffId << ")\n" - << "\tHINT : Compile with -DNDEBUG to remove this warning.\n"; - } #endif - _GuardedUnbind(); + _GuardedUnbind(); } -void FramebufferObject::AttachRenderBuffers(int numBuffers, GLuint buffId[], - GLenum attachment[]) { - for (int i = 0; i < numBuffers; ++i) { - AttachRenderBuffer( - buffId[i], attachment ? attachment[i] : (GL_COLOR_ATTACHMENT0_EXT + i)); - } +void FramebufferObject::AttachRenderBuffers(int numBuffers, GLuint buffId[], GLenum attachment[]) +{ + for (int i = 0; i < numBuffers; ++i) { + AttachRenderBuffer(buffId[i], attachment ? attachment[i] : (GL_COLOR_ATTACHMENT0_EXT + i)); + } } -void FramebufferObject::Unattach(GLenum attachment) { - _GuardedBind(); - GLenum type = GetAttachedType(attachment); +void FramebufferObject::Unattach(GLenum attachment) +{ + _GuardedBind(); + GLenum type = GetAttachedType(attachment); - switch (type) { + switch (type) { case GL_NONE: - break; + break; case GL_RENDERBUFFER_EXT: - AttachRenderBuffer(0, attachment); - break; + AttachRenderBuffer(0, attachment); + break; case GL_TEXTURE: - AttachTexture(GL_TEXTURE_2D, 0, attachment); - break; + AttachTexture(GL_TEXTURE_2D, 0, attachment); + break; default: - cerr << "FramebufferObject::unbind_attachment ERROR: Unknown attached " - "resource type\n"; - } + cerr << "FramebufferObject::unbind_attachment ERROR: Unknown attached " + "resource type\n"; + } - _GuardedUnbind(); + _GuardedUnbind(); } -void FramebufferObject::UnattachAll() { - int numAttachments = GetMaxColorAttachments(); +void FramebufferObject::UnattachAll() +{ + int numAttachments = GetMaxColorAttachments(); - for (int i = 0; i < numAttachments; ++i) { - Unattach(GL_COLOR_ATTACHMENT0_EXT + i); - } + for (int i = 0; i < numAttachments; ++i) { + Unattach(GL_COLOR_ATTACHMENT0_EXT + i); + } } -int FramebufferObject::GetMaxColorAttachments() { - GLint maxAttach = 0; - glGetIntegerv(GL_MAX_COLOR_ATTACHMENTS_EXT, &maxAttach); - return maxAttach; +int FramebufferObject::GetMaxColorAttachments() +{ + GLint maxAttach = 0; + glGetIntegerv(GL_MAX_COLOR_ATTACHMENTS_EXT, &maxAttach); + return maxAttach; } -GLuint FramebufferObject::_GenerateFboId() { - GLuint id = 0; - glGenFramebuffersEXT(1, &id); - return id; +GLuint FramebufferObject::_GenerateFboId() +{ + GLuint id = 0; + glGenFramebuffersEXT(1, &id); + return id; } -void FramebufferObject::_GuardedBind() { - // Only binds if m_fboId is different than the currently bound FBO - glGetIntegerv(GL_FRAMEBUFFER_BINDING_EXT, &m_savedFboId); +void FramebufferObject::_GuardedBind() +{ + // Only binds if m_fboId is different than the currently bound FBO + glGetIntegerv(GL_FRAMEBUFFER_BINDING_EXT, &m_savedFboId); - if (m_fboId != (GLuint)m_savedFboId) { - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_fboId); - } + if (m_fboId != (GLuint)m_savedFboId) { + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, m_fboId); + } } -void FramebufferObject::_GuardedUnbind() { - // Returns FBO binding to the previously enabled FBO - if (m_fboId != (GLuint)m_savedFboId) { - glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, (GLuint)m_savedFboId); - } +void FramebufferObject::_GuardedUnbind() 
+{ + // Returns FBO binding to the previously enabled FBO + if (m_fboId != (GLuint)m_savedFboId) { + glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, (GLuint)m_savedFboId); + } } void FramebufferObject::_FramebufferTextureND(GLenum attachment, - GLenum texTarget, GLuint texId, - int mipLevel, int zSlice) { - if (texTarget == GL_TEXTURE_1D) { - glFramebufferTexture1DEXT(GL_FRAMEBUFFER_EXT, attachment, GL_TEXTURE_1D, - texId, mipLevel); - } else if (texTarget == GL_TEXTURE_3D) { - glFramebufferTexture3DEXT(GL_FRAMEBUFFER_EXT, attachment, GL_TEXTURE_3D, - texId, mipLevel, zSlice); - } else { - // Default is GL_TEXTURE_2D, GL_TEXTURE_RECTANGLE_ARB, or cube faces - glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachment, texTarget, texId, - mipLevel); - } + GLenum texTarget, + GLuint texId, + int mipLevel, + int zSlice) +{ + if (texTarget == GL_TEXTURE_1D) { + glFramebufferTexture1DEXT(GL_FRAMEBUFFER_EXT, attachment, GL_TEXTURE_1D, texId, mipLevel); + } + else if (texTarget == GL_TEXTURE_3D) { + glFramebufferTexture3DEXT(GL_FRAMEBUFFER_EXT, attachment, GL_TEXTURE_3D, texId, mipLevel, zSlice); + } + else { + // Default is GL_TEXTURE_2D, GL_TEXTURE_RECTANGLE_ARB, or cube faces + glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachment, texTarget, texId, mipLevel); + } } #ifndef NDEBUG -bool FramebufferObject::IsValid(ostream &ostr) { - _GuardedBind(); +bool FramebufferObject::IsValid(ostream &ostr) +{ + _GuardedBind(); - bool isOK = false; + bool isOK = false; - GLenum status; - status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT); + GLenum status; + status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT); - switch (status) { - case GL_FRAMEBUFFER_COMPLETE_EXT: // Everything's OK - isOK = true; - break; + switch (status) { + case GL_FRAMEBUFFER_COMPLETE_EXT: // Everything's OK + isOK = true; + break; case GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_EXT\n"; - isOK = false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_EXT\n"; + isOK = false; + break; case GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT\n"; - isOK = false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT\n"; + isOK = false; + break; case GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT\n"; - isOK = false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT\n"; + isOK = false; + break; case GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT\n"; - isOK = false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT\n"; + isOK = false; + break; case GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER_EXT\n"; - isOK = false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER_EXT\n"; + isOK = false; + break; case GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER_EXT\n"; - isOK = 
false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER_EXT\n"; + isOK = false; + break; case GL_FRAMEBUFFER_UNSUPPORTED_EXT: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "GL_FRAMEBUFFER_UNSUPPORTED_EXT\n"; - isOK = false; - break; + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "GL_FRAMEBUFFER_UNSUPPORTED_EXT\n"; + isOK = false; + break; default: - ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" - << "Unknown ERROR\n"; - isOK = false; - } + ostr << "glift::CheckFramebufferStatus() ERROR:\n\t" + << "Unknown ERROR\n"; + isOK = false; + } - _GuardedUnbind(); - return isOK; + _GuardedUnbind(); + return isOK; } -#endif // NDEBUG +#endif // NDEBUG /// Accessors -GLenum FramebufferObject::GetAttachedType(GLenum attachment) { - // Returns GL_RENDERBUFFER_EXT or GL_TEXTURE - _GuardedBind(); - GLint type = 0; - glGetFramebufferAttachmentParameterivEXT( - GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT, - &type); - _GuardedUnbind(); - return GLenum(type); +GLenum FramebufferObject::GetAttachedType(GLenum attachment) +{ + // Returns GL_RENDERBUFFER_EXT or GL_TEXTURE + _GuardedBind(); + GLint type = 0; + glGetFramebufferAttachmentParameterivEXT( + GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT, &type); + _GuardedUnbind(); + return GLenum(type); } -GLuint FramebufferObject::GetAttachedId(GLenum attachment) { - _GuardedBind(); - GLint id = 0; - glGetFramebufferAttachmentParameterivEXT( - GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT, - &id); - _GuardedUnbind(); - return GLuint(id); +GLuint FramebufferObject::GetAttachedId(GLenum attachment) +{ + _GuardedBind(); + GLint id = 0; + glGetFramebufferAttachmentParameterivEXT( + GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT, &id); + _GuardedUnbind(); + return GLuint(id); } -GLint FramebufferObject::GetAttachedMipLevel(GLenum attachment) { - _GuardedBind(); - GLint level = 0; - glGetFramebufferAttachmentParameterivEXT( - GL_FRAMEBUFFER_EXT, attachment, - GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT, &level); - _GuardedUnbind(); - return level; +GLint FramebufferObject::GetAttachedMipLevel(GLenum attachment) +{ + _GuardedBind(); + GLint level = 0; + glGetFramebufferAttachmentParameterivEXT( + GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT, &level); + _GuardedUnbind(); + return level; } -GLint FramebufferObject::GetAttachedCubeFace(GLenum attachment) { - _GuardedBind(); - GLint level = 0; - glGetFramebufferAttachmentParameterivEXT( - GL_FRAMEBUFFER_EXT, attachment, - GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT, &level); - _GuardedUnbind(); - return level; +GLint FramebufferObject::GetAttachedCubeFace(GLenum attachment) +{ + _GuardedBind(); + GLint level = 0; + glGetFramebufferAttachmentParameterivEXT( + GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT, &level); + _GuardedUnbind(); + return level; } -GLint FramebufferObject::GetAttachedZSlice(GLenum attachment) { - _GuardedBind(); - GLint slice = 0; - glGetFramebufferAttachmentParameterivEXT( - GL_FRAMEBUFFER_EXT, attachment, - GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT, &slice); - _GuardedUnbind(); - return slice; +GLint FramebufferObject::GetAttachedZSlice(GLenum attachment) +{ + _GuardedBind(); + GLint slice = 0; + glGetFramebufferAttachmentParameterivEXT( + GL_FRAMEBUFFER_EXT, attachment, GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT, 
&slice); + _GuardedUnbind(); + return slice; } diff --git a/Samples/5_Domain_Specific/smokeParticles/framebufferObject.h b/Samples/5_Domain_Specific/smokeParticles/framebufferObject.h index 78e804d4..3816a75d 100644 --- a/Samples/5_Domain_Specific/smokeParticles/framebufferObject.h +++ b/Samples/5_Domain_Specific/smokeParticles/framebufferObject.h @@ -69,6 +69,7 @@ #ifndef UCDAVIS_FRAMEBUFFER_OBJECT_H #define UCDAVIS_FRAMEBUFFER_OBJECT_H +#include #include /*! @@ -126,43 +127,47 @@ Performance Notes: // OpenGL framebuffer. FramebufferObject::Disable(); */ -class FramebufferObject { - public: - /// Ctor/Dtor - FramebufferObject(); - virtual ~FramebufferObject(); +class FramebufferObject +{ +public: + /// Ctor/Dtor + FramebufferObject(); + virtual ~FramebufferObject(); - /// Bind this FBO as current render target - void Bind(); + /// Bind this FBO as current render target + void Bind(); - /// Bind a texture to the "attachment" point of this FBO - virtual void AttachTexture(GLenum texTarget, GLuint texId, - GLenum attachment = GL_COLOR_ATTACHMENT0_EXT, - int mipLevel = 0, int zSlice = 0); + /// Bind a texture to the "attachment" point of this FBO + virtual void AttachTexture(GLenum texTarget, + GLuint texId, + GLenum attachment = GL_COLOR_ATTACHMENT0_EXT, + int mipLevel = 0, + int zSlice = 0); - /// Bind an array of textures to multiple "attachment" points of this FBO - /// - By default, the first 'numTextures' attachments are used, - /// starting with GL_COLOR_ATTACHMENT0_EXT - virtual void AttachTextures(int numTextures, GLenum texTarget[], - GLuint texId[], GLenum attachment[] = NULL, - int mipLevel[] = NULL, int zSlice[] = NULL); + /// Bind an array of textures to multiple "attachment" points of this FBO + /// - By default, the first 'numTextures' attachments are used, + /// starting with GL_COLOR_ATTACHMENT0_EXT + virtual void AttachTextures(int numTextures, + GLenum texTarget[], + GLuint texId[], + GLenum attachment[] = NULL, + int mipLevel[] = NULL, + int zSlice[] = NULL); - /// Bind a render buffer to the "attachment" point of this FBO - virtual void AttachRenderBuffer(GLuint buffId, - GLenum attachment = GL_COLOR_ATTACHMENT0_EXT); + /// Bind a render buffer to the "attachment" point of this FBO + virtual void AttachRenderBuffer(GLuint buffId, GLenum attachment = GL_COLOR_ATTACHMENT0_EXT); - /// Bind an array of render buffers to corresponding "attachment" points - /// of this FBO. - /// - By default, the first 'numBuffers' attachments are used, - /// starting with GL_COLOR_ATTACHMENT0_EXT - virtual void AttachRenderBuffers(int numBuffers, GLuint buffId[], - GLenum attachment[] = NULL); + /// Bind an array of render buffers to corresponding "attachment" points + /// of this FBO. + /// - By default, the first 'numBuffers' attachments are used, + /// starting with GL_COLOR_ATTACHMENT0_EXT + virtual void AttachRenderBuffers(int numBuffers, GLuint buffId[], GLenum attachment[] = NULL); - /// Free any resource bound to the "attachment" point of this FBO - void Unattach(GLenum attachment); + /// Free any resource bound to the "attachment" point of this FBO + void Unattach(GLenum attachment); - /// Free any resources bound to any attachment points of this FBO - void UnattachAll(); + /// Free any resources bound to any attachment points of this FBO + void UnattachAll(); /// Is this FBO currently a valid render target? 
/// - Sends output to std::cerr by default but can @@ -172,55 +177,54 @@ class FramebufferObject { /// mode but always returns "true" if NDEBUG is /// is defined (optimized builds) #ifndef NDEBUG - bool IsValid(std::ostream &ostr = std::cerr); + bool IsValid(std::ostream &ostr = std::cerr); #else - bool IsValid(std::ostream &ostr = std::cerr) { return true; } + bool IsValid(std::ostream &ostr = std::cerr) { return true; } #endif - /// BEGIN : Accessors - /// Is attached type GL_RENDERBUFFER_EXT or GL_TEXTURE? - GLenum GetAttachedType(GLenum attachment); + /// BEGIN : Accessors + /// Is attached type GL_RENDERBUFFER_EXT or GL_TEXTURE? + GLenum GetAttachedType(GLenum attachment); - /// What is the Id of Renderbuffer/texture currently - /// attached to "attachment?" - GLuint GetAttachedId(GLenum attachment); + /// What is the Id of Renderbuffer/texture currently + /// attached to "attachment?" + GLuint GetAttachedId(GLenum attachment); - /// Which mipmap level is currently attached to "attachment?" - GLint GetAttachedMipLevel(GLenum attachment); + /// Which mipmap level is currently attached to "attachment?" + GLint GetAttachedMipLevel(GLenum attachment); - /// Which cube face is currently attached to "attachment?" - GLint GetAttachedCubeFace(GLenum attachment); + /// Which cube face is currently attached to "attachment?" + GLint GetAttachedCubeFace(GLenum attachment); - /// Which z-slice is currently attached to "attachment?" - GLint GetAttachedZSlice(GLenum attachment); - /// END : Accessors + /// Which z-slice is currently attached to "attachment?" + GLint GetAttachedZSlice(GLenum attachment); + /// END : Accessors - /// BEGIN : Static methods global to all FBOs - /// Return number of color attachments permitted - static int GetMaxColorAttachments(); + /// BEGIN : Static methods global to all FBOs + /// Return number of color attachments permitted + static int GetMaxColorAttachments(); - /// Disable all FBO rendering and return to traditional, - /// windowing-system controlled framebuffer - /// NOTE: - /// This is NOT an "unbind" for this specific FBO, but rather - /// disables all FBO rendering. This call is intentionally "static" - /// and named "Disable" instead of "Unbind" for this reason. The - /// motivation for this strange semantic is performance. Providing - /// "Unbind" would likely lead to a large number of unnecessary - /// FBO enabling/disabling. - static void Disable(); - /// END : Static methods global to all FBOs + /// Disable all FBO rendering and return to traditional, + /// windowing-system controlled framebuffer + /// NOTE: + /// This is NOT an "unbind" for this specific FBO, but rather + /// disables all FBO rendering. This call is intentionally "static" + /// and named "Disable" instead of "Unbind" for this reason. The + /// motivation for this strange semantic is performance. Providing + /// "Unbind" would likely lead to a large number of unnecessary + /// FBO enabling/disabling. 
+ static void Disable(); + /// END : Static methods global to all FBOs - protected: - void _GuardedBind(); - void _GuardedUnbind(); - void _FramebufferTextureND(GLenum attachment, GLenum texTarget, GLuint texId, - int mipLevel, int zSlice); - static GLuint _GenerateFboId(); +protected: + void _GuardedBind(); + void _GuardedUnbind(); + void _FramebufferTextureND(GLenum attachment, GLenum texTarget, GLuint texId, int mipLevel, int zSlice); + static GLuint _GenerateFboId(); - private: - GLuint m_fboId; - GLint m_savedFboId; +private: + GLuint m_fboId; + GLint m_savedFboId; }; #endif diff --git a/Samples/5_Domain_Specific/smokeParticles/nvMath.h b/Samples/5_Domain_Specific/smokeParticles/nvMath.h index a86dd797..2d2472d6 100644 --- a/Samples/5_Domain_Specific/smokeParticles/nvMath.h +++ b/Samples/5_Domain_Specific/smokeParticles/nvMath.h @@ -61,29 +61,29 @@ #define NV_MATH_H #include - -#include #include #include +#include #define NV_PI float(3.1415926535897932384626433832795) namespace nv { -typedef vec2<float> vec2f; -typedef vec3<float> vec3f; -typedef vec3<int> vec3i; +typedef vec2<float> vec2f; +typedef vec3<float> vec3f; +typedef vec3<int> vec3i; typedef vec3<unsigned int> vec3ui; -typedef vec4<float> vec4f; -typedef matrix4<float> matrix4f; -typedef quaternion<float> quaternionf; +typedef vec4<float> vec4f; +typedef matrix4<float> matrix4f; +typedef quaternion<float> quaternionf; -inline void applyRotation(const quaternionf &r) { - float angle; - vec3f axis; - r.get_value(axis, angle); - glRotatef(angle / 3.1415926f * 180.0f, axis[0], axis[1], axis[2]); +inline void applyRotation(const quaternionf &r) +{ + float angle; + vec3f axis; + r.get_value(axis, angle); + glRotatef(angle / 3.1415926f * 180.0f, axis[0], axis[1], axis[2]); } -}; +}; // namespace nv #endif diff --git a/Samples/5_Domain_Specific/smokeParticles/nvMatrix.h b/Samples/5_Domain_Specific/smokeParticles/nvMatrix.h index 5b4db78c..8c073b32 100644 --- a/Samples/5_Domain_Specific/smokeParticles/nvMatrix.h +++ b/Samples/5_Domain_Specific/smokeParticles/nvMatrix.h @@ -64,383 +64,419 @@ namespace nv { -template <class T> -class vec2; -template <class T> -class vec3; -template <class T> -class vec4; +template <class T> class vec2; +template <class T> class vec3; +template <class T> class vec4; //////////////////////////////////////////////////////////////////////////////// // // Matrix // //////////////////////////////////////////////////////////////////////////////// -template <class T> -class matrix4 { - public: - matrix4() { make_identity(); } +template <class T> class matrix4 +{ +public: + matrix4() { make_identity(); } - matrix4(T t) { set_value(t); } + matrix4(T t) { set_value(t); } - matrix4(const T *m) { set_value(m); } + matrix4(const T *m) { set_value(m); } - matrix4(T a00, T a01, T a02, T a03, T a10, T a11, T a12, T a13, T a20, T a21, - T a22, T a23, T a30, T a31, T a32, T a33) - : _11(a00), - _12(a01), - _13(a02), - _14(a03), - _21(a10), - _22(a11), - _23(a12), - _24(a13), - _31(a20), - _32(a21), - _33(a22), - _34(a23), - _41(a30), - _42(a31), - _43(a32), - _44(a33) {} - - void get_value(T *mp) const { - int c = 0; - - for (int j = 0; j < 4; j++) - for (int i = 0; i < 4; i++) { - mp[c++] = element(i, j); - } - } - - const T *get_value() const { return _array; } - - void set_value(T *mp) { - int c = 0; - - for (int j = 0; j < 4; j++) - for (int i = 0; i < 4; i++) { - element(i, j) = mp[c++]; - } - } - - void set_value(T r) { - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) { - element(i, j) = r; - } - } - - void make_identity() { - element(0, 0) = 1.0; - element(0, 1) = 0.0; - element(0, 2) = 0.0; - element(0, 3) = 0.0; - - element(1, 0) = 0.0; - element(1, 1) = 1.0; - element(1, 
2) = 0.0; - element(1, 3) = 0.0; - - element(2, 0) = 0.0; - element(2, 1) = 0.0; - element(2, 2) = 1.0; - element(2, 3) = 0.0; - - element(3, 0) = 0.0; - element(3, 1) = 0.0; - element(3, 2) = 0.0; - element(3, 3) = 1.0; - } - - // set a uniform scale - void set_scale(T s) { - element(0, 0) = s; - element(1, 1) = s; - element(2, 2) = s; - } - - void set_scale(const vec3 &s) { - for (int i = 0; i < 3; i++) { - element(i, i) = s[i]; - } - } - - void set_translate(const vec3 &t) { - for (int i = 0; i < 3; i++) { - element(i, 3) = t[i]; - } - } - - void set_row(int r, const vec4 &t) { - for (int i = 0; i < 4; i++) { - element(r, i) = t[i]; - } - } - - void set_column(int c, const vec4 &t) { - for (int i = 0; i < 4; i++) { - element(i, c) = t[i]; - } - } - - vec4 get_row(int r) const { - vec4 v; - - for (int i = 0; i < 4; i++) { - v[i] = element(r, i); + matrix4(T a00, + T a01, + T a02, + T a03, + T a10, + T a11, + T a12, + T a13, + T a20, + T a21, + T a22, + T a23, + T a30, + T a31, + T a32, + T a33) + : _11(a00) + , _12(a01) + , _13(a02) + , _14(a03) + , _21(a10) + , _22(a11) + , _23(a12) + , _24(a13) + , _31(a20) + , _32(a21) + , _33(a22) + , _34(a23) + , _41(a30) + , _42(a31) + , _43(a32) + , _44(a33) + { } - return v; - } + void get_value(T *mp) const + { + int c = 0; - vec4 get_column(int c) const { - vec4 v; - - for (int i = 0; i < 4; i++) { - v[i] = element(i, c); + for (int j = 0; j < 4; j++) + for (int i = 0; i < 4; i++) { + mp[c++] = element(i, j); + } } - return v; - } + const T *get_value() const { return _array; } - friend matrix4 inverse(const matrix4 &m) { - matrix4 minv; + void set_value(T *mp) + { + int c = 0; - T r1[8], r2[8], r3[8], r4[8]; - T *s[4], *tmprow; + for (int j = 0; j < 4; j++) + for (int i = 0; i < 4; i++) { + element(i, j) = mp[c++]; + } + } - s[0] = &r1[0]; - s[1] = &r2[0]; - s[2] = &r3[0]; - s[3] = &r4[0]; + void set_value(T r) + { + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) { + element(i, j) = r; + } + } - register int i, j, p, jj; + void make_identity() + { + element(0, 0) = 1.0; + element(0, 1) = 0.0; + element(0, 2) = 0.0; + element(0, 3) = 0.0; - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - s[i][j] = m.element(i, j); + element(1, 0) = 0.0; + element(1, 1) = 1.0; + element(1, 2) = 0.0; + element(1, 3) = 0.0; - if (i == j) { - s[i][j + 4] = 1.0; - } else { - s[i][j + 4] = 0.0; + element(2, 0) = 0.0; + element(2, 1) = 0.0; + element(2, 2) = 1.0; + element(2, 3) = 0.0; + + element(3, 0) = 0.0; + element(3, 1) = 0.0; + element(3, 2) = 0.0; + element(3, 3) = 1.0; + } + + // set a uniform scale + void set_scale(T s) + { + element(0, 0) = s; + element(1, 1) = s; + element(2, 2) = s; + } + + void set_scale(const vec3 &s) + { + for (int i = 0; i < 3; i++) { + element(i, i) = s[i]; } - } } - T scp[4]; + void set_translate(const vec3 &t) + { + for (int i = 0; i < 3; i++) { + element(i, 3) = t[i]; + } + } - for (i = 0; i < 4; i++) { - scp[i] = T(fabs(s[i][0])); + void set_row(int r, const vec4 &t) + { + for (int i = 0; i < 4; i++) { + element(r, i) = t[i]; + } + } - for (j = 1; j < 4; j++) - if (T(fabs(s[i][j])) > scp[i]) { - scp[i] = T(fabs(s[i][j])); + void set_column(int c, const vec4 &t) + { + for (int i = 0; i < 4; i++) { + element(i, c) = t[i]; + } + } + + vec4 get_row(int r) const + { + vec4 v; + + for (int i = 0; i < 4; i++) { + v[i] = element(r, i); } - if (scp[i] == 0.0) { - return minv; // singular matrix! 
- } + return v; } - int pivot_to; - T scp_max; + vec4 get_column(int c) const + { + vec4 v; - for (i = 0; i < 4; i++) { - // select pivot row - pivot_to = i; - scp_max = T(fabs(s[i][i] / scp[i])); - - // find out which row should be on top - for (p = i + 1; p < 4; p++) - if (T(fabs(s[p][i] / scp[p])) > scp_max) { - scp_max = T(fabs(s[p][i] / scp[p])); - pivot_to = p; + for (int i = 0; i < 4; i++) { + v[i] = element(i, c); } - // Pivot if necessary - if (pivot_to != i) { - tmprow = s[i]; - s[i] = s[pivot_to]; - s[pivot_to] = tmprow; - T tmpscp; - tmpscp = scp[i]; - scp[i] = scp[pivot_to]; - scp[pivot_to] = tmpscp; - } - - T mji; - - // perform gaussian elimination - for (j = i + 1; j < 4; j++) { - mji = s[j][i] / s[i][i]; - s[j][i] = 0.0; - - for (jj = i + 1; jj < 8; jj++) { - s[j][jj] -= mji * s[i][jj]; - } - } + return v; } - if (s[3][3] == 0.0) { - return minv; // singular matrix! - } + friend matrix4 inverse(const matrix4 &m) + { + matrix4 minv; - // - // Now we have an upper triangular matrix. - // - // x x x x | y y y y - // 0 x x x | y y y y - // 0 0 x x | y y y y - // 0 0 0 x | y y y y - // - // we'll back substitute to get the inverse - // - // 1 0 0 0 | z z z z - // 0 1 0 0 | z z z z - // 0 0 1 0 | z z z z - // 0 0 0 1 | z z z z - // + T r1[8], r2[8], r3[8], r4[8]; + T *s[4], *tmprow; - T mij; + s[0] = &r1[0]; + s[1] = &r2[0]; + s[2] = &r3[0]; + s[3] = &r4[0]; - for (i = 3; i > 0; i--) { - for (j = i - 1; j > -1; j--) { - mij = s[j][i] / s[i][i]; + register int i, j, p, jj; - for (jj = j + 1; jj < 8; jj++) { - s[j][jj] -= mij * s[i][jj]; - } - } - } + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + s[i][j] = m.element(i, j); - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) { - minv(i, j) = s[i][j + 4] / s[i][i]; - } - - return minv; - } - - friend matrix4 transpose(const matrix4 &m) { - matrix4 mtrans; - - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) { - mtrans(i, j) = m.element(j, i); - } - - return mtrans; - } - - matrix4 &operator*=(const matrix4 &rhs) { - matrix4 mt(*this); - set_value(T(0)); - - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) - for (int c = 0; c < 4; c++) { - element(i, j) += mt(i, c) * rhs(c, j); + if (i == j) { + s[i][j + 4] = 1.0; + } + else { + s[i][j + 4] = 0.0; + } + } } - return *this; - } + T scp[4]; - friend matrix4 operator*(const matrix4 &lhs, const matrix4 &rhs) { - matrix4 r(T(0)); + for (i = 0; i < 4; i++) { + scp[i] = T(fabs(s[i][0])); - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) - for (int c = 0; c < 4; c++) { - r.element(i, j) += lhs(i, c) * rhs(c, j); + for (j = 1; j < 4; j++) + if (T(fabs(s[i][j])) > scp[i]) { + scp[i] = T(fabs(s[i][j])); + } + + if (scp[i] == 0.0) { + return minv; // singular matrix! 
+ } } - return r; - } + int pivot_to; + T scp_max; - // dst = M * src - vec4 operator*(const vec4 &src) const { - vec4 r; + for (i = 0; i < 4; i++) { + // select pivot row + pivot_to = i; + scp_max = T(fabs(s[i][i] / scp[i])); - for (int i = 0; i < 4; i++) - r[i] = (src[0] * element(i, 0) + src[1] * element(i, 1) + - src[2] * element(i, 2) + src[3] * element(i, 3)); + // find out which row should be on top + for (p = i + 1; p < 4; p++) + if (T(fabs(s[p][i] / scp[p])) > scp_max) { + scp_max = T(fabs(s[p][i] / scp[p])); + pivot_to = p; + } - return r; - } + // Pivot if necessary + if (pivot_to != i) { + tmprow = s[i]; + s[i] = s[pivot_to]; + s[pivot_to] = tmprow; + T tmpscp; + tmpscp = scp[i]; + scp[i] = scp[pivot_to]; + scp[pivot_to] = tmpscp; + } - // dst = src * M - friend vec4 operator*(const vec4 &lhs, const matrix4 &rhs) { - vec4 r; + T mji; - for (int i = 0; i < 4; i++) - r[i] = (lhs[0] * rhs.element(0, i) + lhs[1] * rhs.element(1, i) + - lhs[2] * rhs.element(2, i) + lhs[3] * rhs.element(3, i)); + // perform gaussian elimination + for (j = i + 1; j < 4; j++) { + mji = s[j][i] / s[i][i]; + s[j][i] = 0.0; - return r; - } + for (jj = i + 1; jj < 8; jj++) { + s[j][jj] -= mji * s[i][jj]; + } + } + } - T &operator()(int row, int col) { return element(row, col); } + if (s[3][3] == 0.0) { + return minv; // singular matrix! + } - const T &operator()(int row, int col) const { return element(row, col); } + // + // Now we have an upper triangular matrix. + // + // x x x x | y y y y + // 0 x x x | y y y y + // 0 0 x x | y y y y + // 0 0 0 x | y y y y + // + // we'll back substitute to get the inverse + // + // 1 0 0 0 | z z z z + // 0 1 0 0 | z z z z + // 0 0 1 0 | z z z z + // 0 0 0 1 | z z z z + // - T &element(int row, int col) { return _array[row | (col << 2)]; } + T mij; - const T &element(int row, int col) const { return _array[row | (col << 2)]; } + for (i = 3; i > 0; i--) { + for (j = i - 1; j > -1; j--) { + mij = s[j][i] / s[i][i]; - matrix4 &operator*=(const T &r) { - for (int i = 0; i < 4; ++i) { - element(0, i) *= r; - element(1, i) *= r; - element(2, i) *= r; - element(3, i) *= r; + for (jj = j + 1; jj < 8; jj++) { + s[j][jj] -= mij * s[i][jj]; + } + } + } + + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) { + minv(i, j) = s[i][j + 4] / s[i][i]; + } + + return minv; } - return *this; - } + friend matrix4 transpose(const matrix4 &m) + { + matrix4 mtrans; - matrix4 &operator+=(const matrix4 &mat) { - for (int i = 0; i < 4; ++i) { - element(0, i) += mat.element(0, i); - element(1, i) += mat.element(1, i); - element(2, i) += mat.element(2, i); - element(3, i) += mat.element(3, i); + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) { + mtrans(i, j) = m.element(j, i); + } + + return mtrans; } - return *this; - } + matrix4 &operator*=(const matrix4 &rhs) + { + matrix4 mt(*this); + set_value(T(0)); - friend bool operator==(const matrix4 &lhs, const matrix4 &rhs) { - bool r = true; + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) + for (int c = 0; c < 4; c++) { + element(i, j) += mt(i, c) * rhs(c, j); + } - for (int i = 0; i < 16; i++) { - r &= lhs._array[i] == rhs._array[i]; + return *this; } - return r; - } + friend matrix4 operator*(const matrix4 &lhs, const matrix4 &rhs) + { + matrix4 r(T(0)); - friend bool operator!=(const matrix4 &lhs, const matrix4 &rhs) { - bool r = true; + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) + for (int c = 0; c < 4; c++) { + r.element(i, j) += lhs(i, c) * rhs(c, j); + } - for (int i = 0; i < 16; i++) { - r &= lhs._array[i] 
!= rhs._array[i]; + return r; } - return r; - } + // dst = M * src + vec4 operator*(const vec4 &src) const + { + vec4 r; - union { - struct { - T _11, _12, _13, _14; // standard names for components - T _21, _22, _23, _24; // standard names for components - T _31, _32, _33, _34; // standard names for components - T _41, _42, _43, _44; // standard names for components + for (int i = 0; i < 4; i++) + r[i] = (src[0] * element(i, 0) + src[1] * element(i, 1) + src[2] * element(i, 2) + src[3] * element(i, 3)); + + return r; + } + + // dst = src * M + friend vec4 operator*(const vec4 &lhs, const matrix4 &rhs) + { + vec4 r; + + for (int i = 0; i < 4; i++) + r[i] = (lhs[0] * rhs.element(0, i) + lhs[1] * rhs.element(1, i) + lhs[2] * rhs.element(2, i) + + lhs[3] * rhs.element(3, i)); + + return r; + } + + T &operator()(int row, int col) { return element(row, col); } + + const T &operator()(int row, int col) const { return element(row, col); } + + T &element(int row, int col) { return _array[row | (col << 2)]; } + + const T &element(int row, int col) const { return _array[row | (col << 2)]; } + + matrix4 &operator*=(const T &r) + { + for (int i = 0; i < 4; ++i) { + element(0, i) *= r; + element(1, i) *= r; + element(2, i) *= r; + element(3, i) *= r; + } + + return *this; + } + + matrix4 &operator+=(const matrix4 &mat) + { + for (int i = 0; i < 4; ++i) { + element(0, i) += mat.element(0, i); + element(1, i) += mat.element(1, i); + element(2, i) += mat.element(2, i); + element(3, i) += mat.element(3, i); + } + + return *this; + } + + friend bool operator==(const matrix4 &lhs, const matrix4 &rhs) + { + bool r = true; + + for (int i = 0; i < 16; i++) { + r &= lhs._array[i] == rhs._array[i]; + } + + return r; + } + + friend bool operator!=(const matrix4 &lhs, const matrix4 &rhs) + { + bool r = true; + + for (int i = 0; i < 16; i++) { + r &= lhs._array[i] != rhs._array[i]; + } + + return r; + } + + union + { + struct + { + T _11, _12, _13, _14; // standard names for components + T _21, _22, _23, _24; // standard names for components + T _31, _32, _33, _34; // standard names for components + T _41, _42, _43, _44; // standard names for components + }; + T _array[16]; // array access }; - T _array[16]; // array access - }; -}; }; +}; // namespace nv #endif diff --git a/Samples/5_Domain_Specific/smokeParticles/nvQuaternion.h b/Samples/5_Domain_Specific/smokeParticles/nvQuaternion.h index d31ddaa2..9d53c179 100644 --- a/Samples/5_Domain_Specific/smokeParticles/nvQuaternion.h +++ b/Samples/5_Domain_Specific/smokeParticles/nvQuaternion.h @@ -63,12 +63,9 @@ namespace nv { -template -class vec2; -template -class vec3; -template -class vec4; +template class vec2; +template class vec3; +template class vec4; //////////////////////////////////////////////////////////////////////////////// // @@ -76,363 +73,383 @@ class vec4; // //////////////////////////////////////////////////////////////////////////////// -template -class quaternion { - public: - quaternion() : x(0.0), y(0.0), z(0.0), w(0.0) {} - - quaternion(const T v[4]) { set_value(v); } - - quaternion(T q0, T q1, T q2, T q3) { set_value(q0, q1, q2, q3); } - - quaternion(const matrix4 &m) { set_value(m); } - - quaternion(const vec3 &axis, T radians) { set_value(axis, radians); } - - quaternion(const vec3 &rotateFrom, const vec3 &rotateTo) { - set_value(rotateFrom, rotateTo); - } - - quaternion(const vec3 &from_look, const vec3 &from_up, - const vec3 &to_look, const vec3 &to_up) { - set_value(from_look, from_up, to_look, to_up); - } - - const T *get_value() const { 
return &_array[0]; } - - void get_value(T &q0, T &q1, T &q2, T &q3) const { - q0 = _array[0]; - q1 = _array[1]; - q2 = _array[2]; - q3 = _array[3]; - } - - quaternion &set_value(T q0, T q1, T q2, T q3) { - _array[0] = q0; - _array[1] = q1; - _array[2] = q2; - _array[3] = q3; - return *this; - } - - void get_value(vec3 &axis, T &radians) const { - radians = T(acos(_array[3]) * T(2.0)); - - if (radians == T(0.0)) { - axis = vec3(0.0, 0.0, 1.0); - } else { - axis[0] = _array[0]; - axis[1] = _array[1]; - axis[2] = _array[2]; - axis = normalize(axis); - } - } - - void get_value(matrix4 &m) const { - T s, xs, ys, zs, wx, wy, wz, xx, xy, xz, yy, yz, zz; - - T norm = _array[0] * _array[0] + _array[1] * _array[1] + - _array[2] * _array[2] + _array[3] * _array[3]; - - s = (norm == T(0.0)) ? T(0.0) : (T(2.0) / norm); - - xs = _array[0] * s; - ys = _array[1] * s; - zs = _array[2] * s; - - wx = _array[3] * xs; - wy = _array[3] * ys; - wz = _array[3] * zs; - - xx = _array[0] * xs; - xy = _array[0] * ys; - xz = _array[0] * zs; - - yy = _array[1] * ys; - yz = _array[1] * zs; - zz = _array[2] * zs; - - m(0, 0) = T(T(1.0) - (yy + zz)); - m(1, 0) = T(xy + wz); - m(2, 0) = T(xz - wy); - - m(0, 1) = T(xy - wz); - m(1, 1) = T(T(1.0) - (xx + zz)); - m(2, 1) = T(yz + wx); - - m(0, 2) = T(xz + wy); - m(1, 2) = T(yz - wx); - m(2, 2) = T(T(1.0) - (xx + yy)); - - m(3, 0) = m(3, 1) = m(3, 2) = m(0, 3) = m(1, 3) = m(2, 3) = T(0.0); - m(3, 3) = T(1.0); - } - - quaternion &set_value(const T *qp) { - for (int i = 0; i < 4; i++) { - _array[i] = qp[i]; +template class quaternion +{ +public: + quaternion() + : x(0.0) + , y(0.0) + , z(0.0) + , w(0.0) + { } - return *this; - } + quaternion(const T v[4]) { set_value(v); } - quaternion &set_value(const matrix4 &m) { - T tr, s; - int i, j, k; - const int nxt[3] = {1, 2, 0}; + quaternion(T q0, T q1, T q2, T q3) { set_value(q0, q1, q2, q3); } - tr = m(0, 0) + m(1, 1) + m(2, 2); + quaternion(const matrix4 &m) { set_value(m); } - if (tr > T(0)) { - s = T(sqrt(tr + m(3, 3))); - _array[3] = T(s * 0.5); - s = T(0.5) / s; + quaternion(const vec3 &axis, T radians) { set_value(axis, radians); } - _array[0] = T((m(1, 2) - m(2, 1)) * s); - _array[1] = T((m(2, 0) - m(0, 2)) * s); - _array[2] = T((m(0, 1) - m(1, 0)) * s); - } else { - i = 0; + quaternion(const vec3 &rotateFrom, const vec3 &rotateTo) { set_value(rotateFrom, rotateTo); } - if (m(1, 1) > m(0, 0)) { - i = 1; - } - - if (m(2, 2) > m(i, i)) { - i = 2; - } - - j = nxt[i]; - k = nxt[j]; - - s = T(sqrt((m(i, j) - (m(j, j) + m(k, k))) + T(1.0))); - - _array[i] = T(s * 0.5); - s = T(0.5 / s); - - _array[3] = T((m(j, k) - m(k, j)) * s); - _array[j] = T((m(i, j) + m(j, i)) * s); - _array[k] = T((m(i, k) + m(k, i)) * s); + quaternion(const vec3 &from_look, const vec3 &from_up, const vec3 &to_look, const vec3 &to_up) + { + set_value(from_look, from_up, to_look, to_up); } - return *this; - } + const T *get_value() const { return &_array[0]; } - quaternion &set_value(const vec3 &axis, T theta) { - T sqnorm = square_norm(axis); - - if (sqnorm == T(0.0)) { - // axis too small. 
- x = y = z = T(0.0); - w = T(1.0); - } else { - theta *= T(0.5); - T sin_theta = T(sin(theta)); - - if (sqnorm != T(1)) { - sin_theta /= T(sqrt(sqnorm)); - } - - x = sin_theta * axis[0]; - y = sin_theta * axis[1]; - z = sin_theta * axis[2]; - w = T(cos(theta)); + void get_value(T &q0, T &q1, T &q2, T &q3) const + { + q0 = _array[0]; + q1 = _array[1]; + q2 = _array[2]; + q3 = _array[3]; } - return *this; - } - - quaternion &set_value(const vec3 &rotateFrom, const vec3 &rotateTo) { - vec3 p1, p2; - T alpha; - - p1 = normalize(rotateFrom); - p2 = normalize(rotateTo); - - alpha = dot(p1, p2); - - if (alpha == T(1.0)) { - *this = quaternion(); - return *this; + quaternion &set_value(T q0, T q1, T q2, T q3) + { + _array[0] = q0; + _array[1] = q1; + _array[2] = q2; + _array[3] = q3; + return *this; } - // ensures that the anti-parallel case leads to a positive dot - if (alpha == T(-1.0)) { - vec3 v; + void get_value(vec3 &axis, T &radians) const + { + radians = T(acos(_array[3]) * T(2.0)); - if (p1[0] != p1[1] || p1[0] != p1[2]) { - v = vec3(p1[1], p1[2], p1[0]); - } else { - v = vec3(-p1[0], p1[1], p1[2]); - } - - v -= p1 * dot(p1, v); - v = normalize(v); - - set_value(v, T(3.1415926)); - return *this; + if (radians == T(0.0)) { + axis = vec3(0.0, 0.0, 1.0); + } + else { + axis[0] = _array[0]; + axis[1] = _array[1]; + axis[2] = _array[2]; + axis = normalize(axis); + } } - p1 = normalize(cross(p1, p2)); + void get_value(matrix4 &m) const + { + T s, xs, ys, zs, wx, wy, wz, xx, xy, xz, yy, yz, zz; - set_value(p1, T(acos(alpha))); + T norm = _array[0] * _array[0] + _array[1] * _array[1] + _array[2] * _array[2] + _array[3] * _array[3]; - return *this; - } + s = (norm == T(0.0)) ? T(0.0) : (T(2.0) / norm); - quaternion &set_value(const vec3 &from_look, const vec3 &from_up, - const vec3 &to_look, const vec3 &to_up) { - quaternion r_look = quaternion(from_look, to_look); + xs = _array[0] * s; + ys = _array[1] * s; + zs = _array[2] * s; - vec3 rotated_from_up(from_up); - r_look.mult_vec(rotated_from_up); + wx = _array[3] * xs; + wy = _array[3] * ys; + wz = _array[3] * zs; - quaternion r_twist = quaternion(rotated_from_up, to_up); + xx = _array[0] * xs; + xy = _array[0] * ys; + xz = _array[0] * zs; - *this = r_twist; - *this *= r_look; - return *this; - } + yy = _array[1] * ys; + yz = _array[1] * zs; + zz = _array[2] * zs; - quaternion &operator*=(const quaternion &qr) { - quaternion ql(*this); + m(0, 0) = T(T(1.0) - (yy + zz)); + m(1, 0) = T(xy + wz); + m(2, 0) = T(xz - wy); - w = ql.w * qr.w - ql.x * qr.x - ql.y * qr.y - ql.z * qr.z; - x = ql.w * qr.x + ql.x * qr.w + ql.y * qr.z - ql.z * qr.y; - y = ql.w * qr.y + ql.y * qr.w + ql.z * qr.x - ql.x * qr.z; - z = ql.w * qr.z + ql.z * qr.w + ql.x * qr.y - ql.y * qr.x; + m(0, 1) = T(xy - wz); + m(1, 1) = T(T(1.0) - (xx + zz)); + m(2, 1) = T(yz + wx); - return *this; - } + m(0, 2) = T(xz + wy); + m(1, 2) = T(yz - wx); + m(2, 2) = T(T(1.0) - (xx + yy)); - friend quaternion normalize(const quaternion &q) { - quaternion r(q); - T rnorm = T(1.0) / T(sqrt(q.w * q.w + q.x * q.x + q.y * q.y + q.z * q.z)); - - r.x *= rnorm; - r.y *= rnorm; - r.z *= rnorm; - r.w *= rnorm; - } - - friend quaternion conjugate(const quaternion &q) { - quaternion r(q); - r._array[0] *= T(-1.0); - r._array[1] *= T(-1.0); - r._array[2] *= T(-1.0); - return r; - } - - friend quaternion inverse(const quaternion &q) { return conjugate(q); } - - // - // Quaternion multiplication with cartesian vector - // v' = q*v*q(star) - // - void mult_vec(const vec3 &src, vec3 &dst) const { - T v_coef = w * 
w - x * x - y * y - z * z; - T u_coef = T(2.0) * (src[0] * x + src[1] * y + src[2] * z); - T c_coef = T(2.0) * w; - - dst.v[0] = - v_coef * src.v[0] + u_coef * x + c_coef * (y * src.v[2] - z * src.v[1]); - dst.v[1] = - v_coef * src.v[1] + u_coef * y + c_coef * (z * src.v[0] - x * src.v[2]); - dst.v[2] = - v_coef * src.v[2] + u_coef * z + c_coef * (x * src.v[1] - y * src.v[0]); - } - - void mult_vec(vec3 &src_and_dst) const { - mult_vec(vec3(src_and_dst), src_and_dst); - } - - void scale_angle(T scaleFactor) { - vec3 axis; - T radians; - - get_value(axis, radians); - radians *= scaleFactor; - set_value(axis, radians); - } - - friend quaternion slerp(const quaternion &p, const quaternion &q, - T alpha) { - quaternion r; - - T cos_omega = p.x * q.x + p.y * q.y + p.z * q.z + p.w * q.w; - // if B is on opposite hemisphere from A, use -B instead - - int bflip; - - if ((bflip = (cos_omega < T(0)))) { - cos_omega = -cos_omega; + m(3, 0) = m(3, 1) = m(3, 2) = m(0, 3) = m(1, 3) = m(2, 3) = T(0.0); + m(3, 3) = T(1.0); } - // complementary interpolation parameter - T beta = T(1) - alpha; + quaternion &set_value(const T *qp) + { + for (int i = 0; i < 4; i++) { + _array[i] = qp[i]; + } - if (cos_omega >= T(1)) { - return p; + return *this; } - T omega = T(acos(cos_omega)); - T one_over_sin_omega = T(1.0) / T(sin(omega)); + quaternion &set_value(const matrix4 &m) + { + T tr, s; + int i, j, k; + const int nxt[3] = {1, 2, 0}; - beta = T(sin(omega * beta) * one_over_sin_omega); - alpha = T(sin(omega * alpha) * one_over_sin_omega); + tr = m(0, 0) + m(1, 1) + m(2, 2); - if (bflip) { - alpha = -alpha; + if (tr > T(0)) { + s = T(sqrt(tr + m(3, 3))); + _array[3] = T(s * 0.5); + s = T(0.5) / s; + + _array[0] = T((m(1, 2) - m(2, 1)) * s); + _array[1] = T((m(2, 0) - m(0, 2)) * s); + _array[2] = T((m(0, 1) - m(1, 0)) * s); + } + else { + i = 0; + + if (m(1, 1) > m(0, 0)) { + i = 1; + } + + if (m(2, 2) > m(i, i)) { + i = 2; + } + + j = nxt[i]; + k = nxt[j]; + + s = T(sqrt((m(i, j) - (m(j, j) + m(k, k))) + T(1.0))); + + _array[i] = T(s * 0.5); + s = T(0.5 / s); + + _array[3] = T((m(j, k) - m(k, j)) * s); + _array[j] = T((m(i, j) + m(j, i)) * s); + _array[k] = T((m(i, k) + m(k, i)) * s); + } + + return *this; } - r.x = beta * p._array[0] + alpha * q._array[0]; - r.y = beta * p._array[1] + alpha * q._array[1]; - r.z = beta * p._array[2] + alpha * q._array[2]; - r.w = beta * p._array[3] + alpha * q._array[3]; - return r; - } + quaternion &set_value(const vec3 &axis, T theta) + { + T sqnorm = square_norm(axis); - T &operator[](int i) { return _array[i]; } + if (sqnorm == T(0.0)) { + // axis too small. 
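The branch here guards the axis-angle construction q = (a*sin(theta/2), cos(theta/2)) for a unit axis a: a near-zero axis falls back to the identity rotation (w = 1). A standalone numeric check of the non-degenerate case, using plain floats rather than the class (the names are illustrative):

#include <cmath>
#include <cstdio>

int main()
{
    // 90-degree rotation about the unit +Z axis: expect
    // q = (0, 0, sin(45deg), cos(45deg)) ~ (0, 0, 0.7071, 0.7071)
    const float theta    = 3.1415926f * 0.5f;
    const float half     = theta * 0.5f;
    const float axis[3]  = {0.0f, 0.0f, 1.0f}; // unit length, so no renormalization needed
    const float sin_half = std::sin(half);
    const float q[4]     = {sin_half * axis[0], sin_half * axis[1], sin_half * axis[2], std::cos(half)};

    std::printf("q = (%.4f, %.4f, %.4f, %.4f)\n", q[0], q[1], q[2], q[3]);
    return 0;
}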
+        x = y = z = T(0.0);
+        w = T(1.0);
+    }
+    else {
+        theta *= T(0.5);
+        T sin_theta = T(sin(theta));

-  const T &operator[](int i) const { return _array[i]; }
+        if (sqnorm != T(1)) {
+            sin_theta /= T(sqrt(sqnorm));
+        }

-  friend bool operator==(const quaternion &lhs, const quaternion &rhs) {
-    bool r = true;
+        x = sin_theta * axis[0];
+        y = sin_theta * axis[1];
+        z = sin_theta * axis[2];
+        w = T(cos(theta));
+    }

-    for (int i = 0; i < 4; i++) {
-      r &= lhs._array[i] == rhs._array[i];
+        return *this;
     }

-    return r;
-  }
+    quaternion &set_value(const vec3 &rotateFrom, const vec3 &rotateTo)
+    {
+        vec3 p1, p2;
+        T alpha;

-  friend bool operator!=(const quaternion &lhs, const quaternion &rhs) {
-    bool r = true;
+        p1 = normalize(rotateFrom);
+        p2 = normalize(rotateTo);

-    for (int i = 0; i < 4; i++) {
-      r &= lhs._array[i] == rhs._array[i];
+        alpha = dot(p1, p2);
+
+        if (alpha == T(1.0)) {
+            *this = quaternion();
+            return *this;
+        }
+
+        // ensures that the anti-parallel case leads to a positive dot
+        if (alpha == T(-1.0)) {
+            vec3 v;
+
+            if (p1[0] != p1[1] || p1[0] != p1[2]) {
+                v = vec3(p1[1], p1[2], p1[0]);
+            }
+            else {
+                v = vec3(-p1[0], p1[1], p1[2]);
+            }
+
+            v -= p1 * dot(p1, v);
+            v = normalize(v);
+
+            set_value(v, T(3.1415926));
+            return *this;
+        }
+
+        p1 = normalize(cross(p1, p2));
+
+        set_value(p1, T(acos(alpha)));
+
+        return *this;
     }

-    return r;
-  }
+    quaternion &
+    set_value(const vec3 &from_look, const vec3 &from_up, const vec3 &to_look, const vec3 &to_up)
+    {
+        quaternion r_look = quaternion(from_look, to_look);

-  friend quaternion operator*(const quaternion &lhs,
-                              const quaternion &rhs) {
-    quaternion r(lhs);
-    r *= rhs;
-    return r;
-  }
+        vec3 rotated_from_up(from_up);
+        r_look.mult_vec(rotated_from_up);

-  union {
-    struct {
-      T x;
-      T y;
-      T z;
-      T w;
+        quaternion r_twist = quaternion(rotated_from_up, to_up);
+
+        *this = r_twist;
+        *this *= r_look;
+        return *this;
+    }
+
+    quaternion &operator*=(const quaternion &qr)
+    {
+        quaternion ql(*this);
+
+        w = ql.w * qr.w - ql.x * qr.x - ql.y * qr.y - ql.z * qr.z;
+        x = ql.w * qr.x + ql.x * qr.w + ql.y * qr.z - ql.z * qr.y;
+        y = ql.w * qr.y + ql.y * qr.w + ql.z * qr.x - ql.x * qr.z;
+        z = ql.w * qr.z + ql.z * qr.w + ql.x * qr.y - ql.y * qr.x;
+
+        return *this;
+    }
+
+    friend quaternion normalize(const quaternion &q)
+    {
+        quaternion r(q);
+        T rnorm = T(1.0) / T(sqrt(q.w * q.w + q.x * q.x + q.y * q.y + q.z * q.z));
+
+        r.x *= rnorm;
+        r.y *= rnorm;
+        r.z *= rnorm;
+        r.w *= rnorm;
+        return r; // return the normalized copy
+    }
+
+    friend quaternion conjugate(const quaternion &q)
+    {
+        quaternion r(q);
+        r._array[0] *= T(-1.0);
+        r._array[1] *= T(-1.0);
+        r._array[2] *= T(-1.0);
+        return r;
+    }
+
+    friend quaternion inverse(const quaternion &q) { return conjugate(q); }
+
+    //
+    // Quaternion multiplication with cartesian vector
+    // v' = q*v*q(star)
+    //
+    void mult_vec(const vec3 &src, vec3 &dst) const
+    {
+        T v_coef = w * w - x * x - y * y - z * z;
+        T u_coef = T(2.0) * (src[0] * x + src[1] * y + src[2] * z);
+        T c_coef = T(2.0) * w;
+
+        dst.v[0] = v_coef * src.v[0] + u_coef * x + c_coef * (y * src.v[2] - z * src.v[1]);
+        dst.v[1] = v_coef * src.v[1] + u_coef * y + c_coef * (z * src.v[0] - x * src.v[2]);
+        dst.v[2] = v_coef * src.v[2] + u_coef * z + c_coef * (x * src.v[1] - y * src.v[0]);
+    }
+
+    void mult_vec(vec3 &src_and_dst) const { mult_vec(vec3(src_and_dst), src_and_dst); }
+
+    void scale_angle(T scaleFactor)
+    {
+        vec3 axis;
+        T radians;
+
+        get_value(axis, radians);
+        radians *= scaleFactor;
+        set_value(axis, radians);
+    }
+
+    friend quaternion slerp(const quaternion &p, const
quaternion &q, T alpha)
+    {
+        quaternion r;
+
+        T cos_omega = p.x * q.x + p.y * q.y + p.z * q.z + p.w * q.w;
+        // if B is on opposite hemisphere from A, use -B instead
+
+        int bflip;
+
+        if ((bflip = (cos_omega < T(0)))) {
+            cos_omega = -cos_omega;
+        }
+
+        // complementary interpolation parameter
+        T beta = T(1) - alpha;
+
+        if (cos_omega >= T(1)) {
+            return p;
+        }
+
+        T omega = T(acos(cos_omega));
+        T one_over_sin_omega = T(1.0) / T(sin(omega));
+
+        beta = T(sin(omega * beta) * one_over_sin_omega);
+        alpha = T(sin(omega * alpha) * one_over_sin_omega);
+
+        if (bflip) {
+            alpha = -alpha;
+        }
+
+        r.x = beta * p._array[0] + alpha * q._array[0];
+        r.y = beta * p._array[1] + alpha * q._array[1];
+        r.z = beta * p._array[2] + alpha * q._array[2];
+        r.w = beta * p._array[3] + alpha * q._array[3];
+        return r;
+    }
+
+    T &operator[](int i) { return _array[i]; }
+
+    const T &operator[](int i) const { return _array[i]; }
+
+    friend bool operator==(const quaternion &lhs, const quaternion &rhs)
+    {
+        bool r = true;
+
+        for (int i = 0; i < 4; i++) {
+            r &= lhs._array[i] == rhs._array[i];
+        }
+
+        return r;
+    }
+
+    friend bool operator!=(const quaternion &lhs, const quaternion &rhs)
+    {
+        // defined as the negation of operator== so the two cannot disagree;
+        // a component-wise loop using == here would merely repeat operator==
+        return !(lhs == rhs);
+    }
+
+    friend quaternion operator*(const quaternion &lhs, const quaternion &rhs)
+    {
+        quaternion r(lhs);
+        r *= rhs;
+        return r;
+    }
+
+    union
+    {
+        struct
+        {
+            T x;
+            T y;
+            T z;
+            T w;
+        };
+        T _array[4];
     };
-};
 };
+}; // namespace nv

 #endif
diff --git a/Samples/5_Domain_Specific/smokeParticles/nvVector.h b/Samples/5_Domain_Specific/smokeParticles/nvVector.h
index f43a2f9e..a7c046c8 100644
--- a/Samples/5_Domain_Specific/smokeParticles/nvVector.h
+++ b/Samples/5_Domain_Specific/smokeParticles/nvVector.h
@@ -63,237 +63,259 @@
 namespace nv {
-template
-class vec2;
-template
-class vec3;
-template
-class vec4;
+template class vec2;
+template class vec3;
+template class vec4;

 //////////////////////////////////////////////////////////////////////
 //
 // vec2 - template class for 2-tuple vector
 //
 //////////////////////////////////////////////////////////////////////
-template
-class vec2 {
- public:
-  typedef T value_type;
-  int size() const { return 2; }
+template class vec2
+{
+public:
+    typedef T value_type;
+    int size() const { return 2; }

-  ////////////////////////////////////////////////////////
-  //
-  // Constructors
-  //
-  ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+    //
+    // Constructors
+    //
+    ////////////////////////////////////////////////////////

-  // Default/scalar constructor
-  vec2(const T &t = T()) {
-    for (int i = 0; i < size(); i++) {
-      _array[i] = t;
-    }
-  }
-
-  // Construct from array
-  vec2(const T *tp) {
-    for (int i = 0; i < size(); i++) {
-      _array[i] = tp[i];
-    }
-  }
-
-  // Construct from explicit values
-  vec2(const T v0, const T v1) {
-    x = v0;
-    y = v1;
-  }
-
-  explicit vec2(const vec3 &u) {
-    for (int i = 0; i < size(); i++) {
-      _array[i] = u._array[i];
-    }
-  }
-
-  explicit vec2(const vec4 &u) {
-    for (int i = 0; i < size(); i++) {
-      _array[i] = u._array[i];
-    }
-  }
-
-  const T *get_value() const { return _array; }
-
-  vec2 &set_value(const T *rhs) {
-    for (int i = 0; i < size(); i++) {
-      _array[i] = rhs[i];
+    // Default/scalar constructor
+    vec2(const T &t = T())
+    {
+        for (int i = 0; i < size(); i++) {
+            _array[i] = t;
+        }
     }
-    return *this;
-  }
-
-  // indexing operators
-  T &operator[](int i) {
return _array[i]; } - - const T &operator[](int i) const { return _array[i]; } - - // type-cast operators - operator T *() { return _array; } - - operator const T *() const { return _array; } - - //////////////////////////////////////////////////////// - // - // Math operators - // - //////////////////////////////////////////////////////// - - // scalar multiply assign - friend vec2 &operator*=(vec2 &lhs, T d) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] *= d; + // Construct from array + vec2(const T *tp) + { + for (int i = 0; i < size(); i++) { + _array[i] = tp[i]; + } } - return lhs; - } - - // component-wise vector multiply assign - friend vec2 &operator*=(vec2 &lhs, const vec2 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] *= rhs[i]; + // Construct from explicit values + vec2(const T v0, const T v1) + { + x = v0; + y = v1; } - return lhs; - } - - // scalar divide assign - friend vec2 &operator/=(vec2 &lhs, T d) { - if (d == 0) { - return lhs; + explicit vec2(const vec3 &u) + { + for (int i = 0; i < size(); i++) { + _array[i] = u._array[i]; + } } - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] /= d; + explicit vec2(const vec4 &u) + { + for (int i = 0; i < size(); i++) { + _array[i] = u._array[i]; + } } - return lhs; - } + const T *get_value() const { return _array; } - // component-wise vector divide assign - friend vec2 &operator/=(vec2 &lhs, const vec2 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] /= rhs._array[i]; + vec2 &set_value(const T *rhs) + { + for (int i = 0; i < size(); i++) { + _array[i] = rhs[i]; + } + + return *this; } - return lhs; - } + // indexing operators + T &operator[](int i) { return _array[i]; } - // component-wise vector add assign - friend vec2 &operator+=(vec2 &lhs, const vec2 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] += rhs._array[i]; + const T &operator[](int i) const { return _array[i]; } + + // type-cast operators + operator T *() { return _array; } + + operator const T *() const { return _array; } + + //////////////////////////////////////////////////////// + // + // Math operators + // + //////////////////////////////////////////////////////// + + // scalar multiply assign + friend vec2 &operator*=(vec2 &lhs, T d) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] *= d; + } + + return lhs; } - return lhs; - } + // component-wise vector multiply assign + friend vec2 &operator*=(vec2 &lhs, const vec2 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] *= rhs[i]; + } - // component-wise vector subtract assign - friend vec2 &operator-=(vec2 &lhs, const vec2 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] -= rhs._array[i]; + return lhs; } - return lhs; - } + // scalar divide assign + friend vec2 &operator/=(vec2 &lhs, T d) + { + if (d == 0) { + return lhs; + } - // unary negate - friend vec2 operator-(const vec2 &rhs) { - vec2 rv; + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] /= d; + } - for (int i = 0; i < rhs.size(); i++) { - rv._array[i] = -rhs._array[i]; + return lhs; } - return rv; - } + // component-wise vector divide assign + friend vec2 &operator/=(vec2 &lhs, const vec2 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] /= rhs._array[i]; + } - // vector add - friend vec2 operator+(const vec2 &lhs, const vec2 &rhs) { - vec2 rt(lhs); - return rt += rhs; - } - - // vector subtract - friend vec2 operator-(const vec2 &lhs, const vec2 &rhs) { - vec2 rt(lhs); - return rt -= rhs; - } - - // scalar multiply - 
friend vec2 operator*(const vec2 &lhs, T rhs) { - vec2 rt(lhs); - return rt *= rhs; - } - - // scalar multiply - friend vec2 operator*(T lhs, const vec2 &rhs) { - vec2 rt(lhs); - return rt *= rhs; - } - - // vector component-wise multiply - friend vec2 operator*(const vec2 &lhs, const vec2 &rhs) { - vec2 rt(lhs); - return rt *= rhs; - } - - // scalar multiply - friend vec2 operator/(const vec2 &lhs, T rhs) { - vec2 rt(lhs); - return rt /= rhs; - } - - // vector component-wise multiply - friend vec2 operator/(const vec2 &lhs, const vec2 &rhs) { - vec2 rt(lhs); - return rt /= rhs; - } - - //////////////////////////////////////////////////////// - // - // Comparison operators - // - //////////////////////////////////////////////////////// - - // equality - friend bool operator==(const vec2 &lhs, const vec2 &rhs) { - bool r = true; - - for (int i = 0; i < lhs.size(); i++) { - r &= lhs._array[i] == rhs._array[i]; + return lhs; } - return r; - } + // component-wise vector add assign + friend vec2 &operator+=(vec2 &lhs, const vec2 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] += rhs._array[i]; + } - // inequality - friend bool operator!=(const vec2 &lhs, const vec2 &rhs) { - bool r = true; - - for (int i = 0; i < lhs.size(); i++) { - r &= lhs._array[i] != rhs._array[i]; + return lhs; } - return r; - } + // component-wise vector subtract assign + friend vec2 &operator-=(vec2 &lhs, const vec2 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] -= rhs._array[i]; + } - // data intentionally left public to allow vec2.x - union { - struct { - T x, y; // standard names for components + return lhs; + } + + // unary negate + friend vec2 operator-(const vec2 &rhs) + { + vec2 rv; + + for (int i = 0; i < rhs.size(); i++) { + rv._array[i] = -rhs._array[i]; + } + + return rv; + } + + // vector add + friend vec2 operator+(const vec2 &lhs, const vec2 &rhs) + { + vec2 rt(lhs); + return rt += rhs; + } + + // vector subtract + friend vec2 operator-(const vec2 &lhs, const vec2 &rhs) + { + vec2 rt(lhs); + return rt -= rhs; + } + + // scalar multiply + friend vec2 operator*(const vec2 &lhs, T rhs) + { + vec2 rt(lhs); + return rt *= rhs; + } + + // scalar multiply + friend vec2 operator*(T lhs, const vec2 &rhs) + { + vec2 rt(lhs); + return rt *= rhs; + } + + // vector component-wise multiply + friend vec2 operator*(const vec2 &lhs, const vec2 &rhs) + { + vec2 rt(lhs); + return rt *= rhs; + } + + // scalar multiply + friend vec2 operator/(const vec2 &lhs, T rhs) + { + vec2 rt(lhs); + return rt /= rhs; + } + + // vector component-wise multiply + friend vec2 operator/(const vec2 &lhs, const vec2 &rhs) + { + vec2 rt(lhs); + return rt /= rhs; + } + + //////////////////////////////////////////////////////// + // + // Comparison operators + // + //////////////////////////////////////////////////////// + + // equality + friend bool operator==(const vec2 &lhs, const vec2 &rhs) + { + bool r = true; + + for (int i = 0; i < lhs.size(); i++) { + r &= lhs._array[i] == rhs._array[i]; + } + + return r; + } + + // inequality + friend bool operator!=(const vec2 &lhs, const vec2 &rhs) + { + bool r = true; + + for (int i = 0; i < lhs.size(); i++) { + r &= lhs._array[i] != rhs._array[i]; + } + + return r; + } + + // data intentionally left public to allow vec2.x + union + { + struct + { + T x, y; // standard names for components + }; + struct + { + T s, t; // standard names for components + }; + T _array[2]; // array access }; - struct { - T s, t; // standard names for components - }; - T _array[2]; // 
array access - }; }; ////////////////////////////////////////////////////////////////////// @@ -301,243 +323,269 @@ class vec2 { // vec3 - template class for 3-tuple vector // ////////////////////////////////////////////////////////////////////// -template -class vec3 { - public: - typedef T value_type; - int size() const { return 3; } +template class vec3 +{ +public: + typedef T value_type; + int size() const { return 3; } - //////////////////////////////////////////////////////// - // - // Constructors - // - //////////////////////////////////////////////////////// + //////////////////////////////////////////////////////// + // + // Constructors + // + //////////////////////////////////////////////////////// - // Default/scalar constructor - vec3(const T &t = T()) { - for (int i = 0; i < size(); i++) { - _array[i] = t; - } - } - - // Construct from array - vec3(const T *tp) { - for (int i = 0; i < size(); i++) { - _array[i] = tp[i]; - } - } - - // Construct from explicit values - vec3(const T v0, const T v1, const T v2) { - x = v0; - y = v1; - z = v2; - } - - explicit vec3(const vec4 &u) { - for (int i = 0; i < size(); i++) { - _array[i] = u._array[i]; - } - } - - explicit vec3(const vec2 &u, T v0) { - x = u.x; - y = u.y; - z = v0; - } - - const T *get_value() const { return _array; } - - vec3 &set_value(const T *rhs) { - for (int i = 0; i < size(); i++) { - _array[i] = rhs[i]; + // Default/scalar constructor + vec3(const T &t = T()) + { + for (int i = 0; i < size(); i++) { + _array[i] = t; + } } - return *this; - } - - // indexing operators - T &operator[](int i) { return _array[i]; } - - const T &operator[](int i) const { return _array[i]; } - - // type-cast operators - operator T *() { return _array; } - - operator const T *() const { return _array; } - - //////////////////////////////////////////////////////// - // - // Math operators - // - //////////////////////////////////////////////////////// - - // scalar multiply assign - friend vec3 &operator*=(vec3 &lhs, T d) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] *= d; + // Construct from array + vec3(const T *tp) + { + for (int i = 0; i < size(); i++) { + _array[i] = tp[i]; + } } - return lhs; - } - - // component-wise vector multiply assign - friend vec3 &operator*=(vec3 &lhs, const vec3 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] *= rhs[i]; + // Construct from explicit values + vec3(const T v0, const T v1, const T v2) + { + x = v0; + y = v1; + z = v2; } - return lhs; - } - - // scalar divide assign - friend vec3 &operator/=(vec3 &lhs, T d) { - if (d == 0) { - return lhs; + explicit vec3(const vec4 &u) + { + for (int i = 0; i < size(); i++) { + _array[i] = u._array[i]; + } } - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] /= d; + explicit vec3(const vec2 &u, T v0) + { + x = u.x; + y = u.y; + z = v0; } - return lhs; - } + const T *get_value() const { return _array; } - // component-wise vector divide assign - friend vec3 &operator/=(vec3 &lhs, const vec3 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] /= rhs._array[i]; + vec3 &set_value(const T *rhs) + { + for (int i = 0; i < size(); i++) { + _array[i] = rhs[i]; + } + + return *this; } - return lhs; - } + // indexing operators + T &operator[](int i) { return _array[i]; } - // component-wise vector add assign - friend vec3 &operator+=(vec3 &lhs, const vec3 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] += rhs._array[i]; + const T &operator[](int i) const { return _array[i]; } + + // type-cast operators + 
operator T *() { return _array; } + + operator const T *() const { return _array; } + + //////////////////////////////////////////////////////// + // + // Math operators + // + //////////////////////////////////////////////////////// + + // scalar multiply assign + friend vec3 &operator*=(vec3 &lhs, T d) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] *= d; + } + + return lhs; } - return lhs; - } + // component-wise vector multiply assign + friend vec3 &operator*=(vec3 &lhs, const vec3 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] *= rhs[i]; + } - // component-wise vector subtract assign - friend vec3 &operator-=(vec3 &lhs, const vec3 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] -= rhs._array[i]; + return lhs; } - return lhs; - } + // scalar divide assign + friend vec3 &operator/=(vec3 &lhs, T d) + { + if (d == 0) { + return lhs; + } - // unary negate - friend vec3 operator-(const vec3 &rhs) { - vec3 rv; + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] /= d; + } - for (int i = 0; i < rhs.size(); i++) { - rv._array[i] = -rhs._array[i]; + return lhs; } - return rv; - } + // component-wise vector divide assign + friend vec3 &operator/=(vec3 &lhs, const vec3 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] /= rhs._array[i]; + } - // vector add - friend vec3 operator+(const vec3 &lhs, const vec3 &rhs) { - vec3 rt(lhs); - return rt += rhs; - } - - // vector subtract - friend vec3 operator-(const vec3 &lhs, const vec3 &rhs) { - vec3 rt(lhs); - return rt -= rhs; - } - - // scalar multiply - friend vec3 operator*(const vec3 &lhs, T rhs) { - vec3 rt(lhs); - return rt *= rhs; - } - - // scalar multiply - friend vec3 operator*(T lhs, const vec3 &rhs) { - vec3 rt(lhs); - return rt *= rhs; - } - - // vector component-wise multiply - friend vec3 operator*(const vec3 &lhs, const vec3 &rhs) { - vec3 rt(lhs); - return rt *= rhs; - } - - // scalar multiply - friend vec3 operator/(const vec3 &lhs, T rhs) { - vec3 rt(lhs); - return rt /= rhs; - } - - // vector component-wise multiply - friend vec3 operator/(const vec3 &lhs, const vec3 &rhs) { - vec3 rt(lhs); - return rt /= rhs; - } - - //////////////////////////////////////////////////////// - // - // Comparison operators - // - //////////////////////////////////////////////////////// - - // equality - friend bool operator==(const vec3 &lhs, const vec3 &rhs) { - bool r = true; - - for (int i = 0; i < lhs.size(); i++) { - r &= lhs._array[i] == rhs._array[i]; + return lhs; } - return r; - } + // component-wise vector add assign + friend vec3 &operator+=(vec3 &lhs, const vec3 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] += rhs._array[i]; + } - // inequality - friend bool operator!=(const vec3 &lhs, const vec3 &rhs) { - bool r = true; - - for (int i = 0; i < lhs.size(); i++) { - r &= lhs._array[i] != rhs._array[i]; + return lhs; } - return r; - } + // component-wise vector subtract assign + friend vec3 &operator-=(vec3 &lhs, const vec3 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] -= rhs._array[i]; + } - //////////////////////////////////////////////////////////////////////////////// - // - // dimension specific operations - // - //////////////////////////////////////////////////////////////////////////////// + return lhs; + } - // cross product - friend vec3 cross(const vec3 &lhs, const vec3 &rhs) { - vec3 r; + // unary negate + friend vec3 operator-(const vec3 &rhs) + { + vec3 rv; - r.x = lhs.y * rhs.z - lhs.z * rhs.y; - r.y = lhs.z * rhs.x 
- lhs.x * rhs.z; - r.z = lhs.x * rhs.y - lhs.y * rhs.x; + for (int i = 0; i < rhs.size(); i++) { + rv._array[i] = -rhs._array[i]; + } - return r; - } + return rv; + } - // data intentionally left public to allow vec2.x - union { - struct { - T x, y, z; // standard names for components + // vector add + friend vec3 operator+(const vec3 &lhs, const vec3 &rhs) + { + vec3 rt(lhs); + return rt += rhs; + } + + // vector subtract + friend vec3 operator-(const vec3 &lhs, const vec3 &rhs) + { + vec3 rt(lhs); + return rt -= rhs; + } + + // scalar multiply + friend vec3 operator*(const vec3 &lhs, T rhs) + { + vec3 rt(lhs); + return rt *= rhs; + } + + // scalar multiply + friend vec3 operator*(T lhs, const vec3 &rhs) + { + vec3 rt(lhs); + return rt *= rhs; + } + + // vector component-wise multiply + friend vec3 operator*(const vec3 &lhs, const vec3 &rhs) + { + vec3 rt(lhs); + return rt *= rhs; + } + + // scalar multiply + friend vec3 operator/(const vec3 &lhs, T rhs) + { + vec3 rt(lhs); + return rt /= rhs; + } + + // vector component-wise multiply + friend vec3 operator/(const vec3 &lhs, const vec3 &rhs) + { + vec3 rt(lhs); + return rt /= rhs; + } + + //////////////////////////////////////////////////////// + // + // Comparison operators + // + //////////////////////////////////////////////////////// + + // equality + friend bool operator==(const vec3 &lhs, const vec3 &rhs) + { + bool r = true; + + for (int i = 0; i < lhs.size(); i++) { + r &= lhs._array[i] == rhs._array[i]; + } + + return r; + } + + // inequality + friend bool operator!=(const vec3 &lhs, const vec3 &rhs) + { + bool r = true; + + for (int i = 0; i < lhs.size(); i++) { + r &= lhs._array[i] != rhs._array[i]; + } + + return r; + } + + //////////////////////////////////////////////////////////////////////////////// + // + // dimension specific operations + // + //////////////////////////////////////////////////////////////////////////////// + + // cross product + friend vec3 cross(const vec3 &lhs, const vec3 &rhs) + { + vec3 r; + + r.x = lhs.y * rhs.z - lhs.z * rhs.y; + r.y = lhs.z * rhs.x - lhs.x * rhs.z; + r.z = lhs.x * rhs.y - lhs.y * rhs.x; + + return r; + } + + // data intentionally left public to allow vec2.x + union + { + struct + { + T x, y, z; // standard names for components + }; + struct + { + T s, t, r; // standard names for components + }; + T _array[3]; // array access }; - struct { - T s, t, r; // standard names for components - }; - T _array[3]; // array access - }; }; ////////////////////////////////////////////////////////////////////// @@ -545,229 +593,254 @@ class vec3 { // vec4 - template class for 4-tuple vector // ////////////////////////////////////////////////////////////////////// -template -class vec4 { - public: - typedef T value_type; - int size() const { return 4; } +template class vec4 +{ +public: + typedef T value_type; + int size() const { return 4; } - //////////////////////////////////////////////////////// - // - // Constructors - // - //////////////////////////////////////////////////////// + //////////////////////////////////////////////////////// + // + // Constructors + // + //////////////////////////////////////////////////////// - // Default/scalar constructor - vec4(const T &t = T()) { - for (int i = 0; i < size(); i++) { - _array[i] = t; - } - } - - // Construct from array - vec4(const T *tp) { - for (int i = 0; i < size(); i++) { - _array[i] = tp[i]; - } - } - - // Construct from explicit values - vec4(const T v0, const T v1, const T v2, const T v3) { - x = v0; - y = v1; - z = v2; - w = v3; - } 
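// Editor's aside (illustrative sketch, not part of the patch): the anonymous
// union/struct blocks in these classes are what let the same component be
// reached as v.x, v.s, or v._array[0]. A minimal check, assuming nvVector.h
// as shown in this file:
#include "nvVector.h"
#include <cassert>

int main()
{
    nv::vec3<float> v(1.0f, 2.0f, 3.0f);
    assert(v.x == v[0]);        // position-style name vs operator[]
    assert(v.s == v._array[0]); // texture-style name vs raw array access
    return 0;
}
// Reading a union member other than the one last written is formally
// type-punning in C++, but the header relies on it by design ("data
// intentionally left public to allow vec2.x").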
- - explicit vec4(const vec3 &u, T v0) { - x = u.x; - y = u.y; - z = u.z; - w = v0; - } - - explicit vec4(const vec2 &u, T v0, T v1) { - x = u.x; - y = u.y; - z = v0; - w = v1; - } - - const T *get_value() const { return _array; } - - vec4 &set_value(const T *rhs) { - for (int i = 0; i < size(); i++) { - _array[i] = rhs[i]; + // Default/scalar constructor + vec4(const T &t = T()) + { + for (int i = 0; i < size(); i++) { + _array[i] = t; + } } - return *this; - } - - // indexing operators - T &operator[](int i) { return _array[i]; } - - const T &operator[](int i) const { return _array[i]; } - - // type-cast operators - operator T *() { return _array; } - - operator const T *() const { return _array; } - - //////////////////////////////////////////////////////// - // - // Math operators - // - //////////////////////////////////////////////////////// - - // scalar multiply assign - friend vec4 &operator*=(vec4 &lhs, T d) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] *= d; + // Construct from array + vec4(const T *tp) + { + for (int i = 0; i < size(); i++) { + _array[i] = tp[i]; + } } - return lhs; - } - - // component-wise vector multiply assign - friend vec4 &operator*=(vec4 &lhs, const vec4 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] *= rhs[i]; + // Construct from explicit values + vec4(const T v0, const T v1, const T v2, const T v3) + { + x = v0; + y = v1; + z = v2; + w = v3; } - return lhs; - } - - // scalar divide assign - friend vec4 &operator/=(vec4 &lhs, T d) { - if (d == 0) { - return lhs; + explicit vec4(const vec3 &u, T v0) + { + x = u.x; + y = u.y; + z = u.z; + w = v0; } - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] /= d; + explicit vec4(const vec2 &u, T v0, T v1) + { + x = u.x; + y = u.y; + z = v0; + w = v1; } - return lhs; - } + const T *get_value() const { return _array; } - // component-wise vector divide assign - friend vec4 &operator/=(vec4 &lhs, const vec4 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] /= rhs._array[i]; + vec4 &set_value(const T *rhs) + { + for (int i = 0; i < size(); i++) { + _array[i] = rhs[i]; + } + + return *this; } - return lhs; - } + // indexing operators + T &operator[](int i) { return _array[i]; } - // component-wise vector add assign - friend vec4 &operator+=(vec4 &lhs, const vec4 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] += rhs._array[i]; + const T &operator[](int i) const { return _array[i]; } + + // type-cast operators + operator T *() { return _array; } + + operator const T *() const { return _array; } + + //////////////////////////////////////////////////////// + // + // Math operators + // + //////////////////////////////////////////////////////// + + // scalar multiply assign + friend vec4 &operator*=(vec4 &lhs, T d) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] *= d; + } + + return lhs; } - return lhs; - } + // component-wise vector multiply assign + friend vec4 &operator*=(vec4 &lhs, const vec4 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] *= rhs[i]; + } - // component-wise vector subtract assign - friend vec4 &operator-=(vec4 &lhs, const vec4 &rhs) { - for (int i = 0; i < lhs.size(); i++) { - lhs._array[i] -= rhs._array[i]; + return lhs; } - return lhs; - } + // scalar divide assign + friend vec4 &operator/=(vec4 &lhs, T d) + { + if (d == 0) { + return lhs; + } - // unary negate - friend vec4 operator-(const vec4 &rhs) { - vec4 rv; + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] /= d; + } - for (int i = 0; i < 
rhs.size(); i++) { - rv._array[i] = -rhs._array[i]; + return lhs; } - return rv; - } + // component-wise vector divide assign + friend vec4 &operator/=(vec4 &lhs, const vec4 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] /= rhs._array[i]; + } - // vector add - friend vec4 operator+(const vec4 &lhs, const vec4 &rhs) { - vec4 rt(lhs); - return rt += rhs; - } - - // vector subtract - friend vec4 operator-(const vec4 &lhs, const vec4 &rhs) { - vec4 rt(lhs); - return rt -= rhs; - } - - // scalar multiply - friend vec4 operator*(const vec4 &lhs, T rhs) { - vec4 rt(lhs); - return rt *= rhs; - } - - // scalar multiply - friend vec4 operator*(T lhs, const vec4 &rhs) { - vec4 rt(lhs); - return rt *= rhs; - } - - // vector component-wise multiply - friend vec4 operator*(const vec4 &lhs, const vec4 &rhs) { - vec4 rt(lhs); - return rt *= rhs; - } - - // scalar multiply - friend vec4 operator/(const vec4 &lhs, T rhs) { - vec4 rt(lhs); - return rt /= rhs; - } - - // vector component-wise multiply - friend vec4 operator/(const vec4 &lhs, const vec4 &rhs) { - vec4 rt(lhs); - return rt /= rhs; - } - - //////////////////////////////////////////////////////// - // - // Comparison operators - // - //////////////////////////////////////////////////////// - - // equality - friend bool operator==(const vec4 &lhs, const vec4 &rhs) { - bool r = true; - - for (int i = 0; i < lhs.size(); i++) { - r &= lhs._array[i] == rhs._array[i]; + return lhs; } - return r; - } + // component-wise vector add assign + friend vec4 &operator+=(vec4 &lhs, const vec4 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] += rhs._array[i]; + } - // inequality - friend bool operator!=(const vec4 &lhs, const vec4 &rhs) { - bool r = true; - - for (int i = 0; i < lhs.size(); i++) { - r &= lhs._array[i] != rhs._array[i]; + return lhs; } - return r; - } + // component-wise vector subtract assign + friend vec4 &operator-=(vec4 &lhs, const vec4 &rhs) + { + for (int i = 0; i < lhs.size(); i++) { + lhs._array[i] -= rhs._array[i]; + } - // data intentionally left public to allow vec2.x - union { - struct { - T x, y, z, w; // standard names for components + return lhs; + } + + // unary negate + friend vec4 operator-(const vec4 &rhs) + { + vec4 rv; + + for (int i = 0; i < rhs.size(); i++) { + rv._array[i] = -rhs._array[i]; + } + + return rv; + } + + // vector add + friend vec4 operator+(const vec4 &lhs, const vec4 &rhs) + { + vec4 rt(lhs); + return rt += rhs; + } + + // vector subtract + friend vec4 operator-(const vec4 &lhs, const vec4 &rhs) + { + vec4 rt(lhs); + return rt -= rhs; + } + + // scalar multiply + friend vec4 operator*(const vec4 &lhs, T rhs) + { + vec4 rt(lhs); + return rt *= rhs; + } + + // scalar multiply + friend vec4 operator*(T lhs, const vec4 &rhs) + { + vec4 rt(lhs); + return rt *= rhs; + } + + // vector component-wise multiply + friend vec4 operator*(const vec4 &lhs, const vec4 &rhs) + { + vec4 rt(lhs); + return rt *= rhs; + } + + // scalar multiply + friend vec4 operator/(const vec4 &lhs, T rhs) + { + vec4 rt(lhs); + return rt /= rhs; + } + + // vector component-wise multiply + friend vec4 operator/(const vec4 &lhs, const vec4 &rhs) + { + vec4 rt(lhs); + return rt /= rhs; + } + + //////////////////////////////////////////////////////// + // + // Comparison operators + // + //////////////////////////////////////////////////////// + + // equality + friend bool operator==(const vec4 &lhs, const vec4 &rhs) + { + bool r = true; + + for (int i = 0; i < lhs.size(); i++) { + r &= lhs._array[i] == 
rhs._array[i]; + } + + return r; + } + + // inequality + friend bool operator!=(const vec4 &lhs, const vec4 &rhs) + { + bool r = true; + + for (int i = 0; i < lhs.size(); i++) { + r &= lhs._array[i] != rhs._array[i]; + } + + return r; + } + + // data intentionally left public to allow vec2.x + union + { + struct + { + T x, y, z, w; // standard names for components + }; + struct + { + T s, t, r, q; // standard names for components + }; + T _array[4]; // array access }; - struct { - T s, t, r, q; // standard names for components - }; - T _array[4]; // array access - }; }; //////////////////////////////////////////////////////////////////////////////// @@ -777,59 +850,59 @@ class vec4 { //////////////////////////////////////////////////////////////////////////////// // compute the dot product of two vectors -template -inline typename T::value_type dot(const T &lhs, const T &rhs) { - typename T::value_type r = 0; +template inline typename T::value_type dot(const T &lhs, const T &rhs) +{ + typename T::value_type r = 0; - for (int i = 0; i < lhs.size(); i++) { - r += lhs._array[i] * rhs._array[i]; - } + for (int i = 0; i < lhs.size(); i++) { + r += lhs._array[i] * rhs._array[i]; + } - return r; + return r; } // return the length of the provided vector -template -inline typename T::value_type length(const T &vec) { - typename T::value_type r = 0; +template inline typename T::value_type length(const T &vec) +{ + typename T::value_type r = 0; - for (int i = 0; i < vec.size(); i++) { - r += vec._array[i] * vec._array[i]; - } + for (int i = 0; i < vec.size(); i++) { + r += vec._array[i] * vec._array[i]; + } - return typename T::value_type(sqrt(r)); + return typename T::value_type(sqrt(r)); } // return the squared norm -template -inline typename T::value_type square_norm(const T &vec) { - typename T::value_type r = 0; +template inline typename T::value_type square_norm(const T &vec) +{ + typename T::value_type r = 0; - for (int i = 0; i < vec.size(); i++) { - r += vec._array[i] * vec._array[i]; - } + for (int i = 0; i < vec.size(); i++) { + r += vec._array[i] * vec._array[i]; + } - return r; + return r; } // return the normalized version of the vector -template -inline T normalize(const T &vec) { - typename T::value_type sum(0); - T r; +template inline T normalize(const T &vec) +{ + typename T::value_type sum(0); + T r; - for (int i = 0; i < vec.size(); i++) { - sum += vec._array[i] * vec._array[i]; - } - - sum = typename T::value_type(sqrt(sum)); - - if (sum > 0) for (int i = 0; i < vec.size(); i++) { - r._array[i] = vec._array[i] / sum; + sum += vec._array[i] * vec._array[i]; } - return r; + sum = typename T::value_type(sqrt(sum)); + + if (sum > 0) + for (int i = 0; i < vec.size(); i++) { + r._array[i] = vec._array[i] / sum; + } + + return r; } // In VC8 : min and max are already defined by a #define... 
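For reference, a minimal usage sketch of the free functions above (an editor's illustration, not part of the patch; it assumes nvVector.h as reformatted here). Note that normalize() of a zero-length vector fails the sum > 0 guard and returns the value-initialized, all-zero vector rather than dividing by zero:

    #include "nvVector.h"
    #include <cstdio>

    int main()
    {
        nv::vec3<float> a(3.0f, 0.0f, 4.0f);
        nv::vec3<float> u = normalize(a); // (0.6, 0, 0.8); found via ADL in namespace nv

        // length(a) = 5, length(u) = 1, dot(a, u) = |a| = 5
        std::printf("len(a)=%f  len(u)=%f  dot=%f\n", length(a), length(u), dot(a, u));
        return 0;
    }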
@@ -840,28 +913,28 @@ inline T normalize(const T &vec) { #undef max #endif // componentwise min -template -inline T min(const T &lhs, const T &rhs) { - T rt; +template inline T min(const T &lhs, const T &rhs) +{ + T rt; - for (int i = 0; i < lhs.size(); i++) { - rt._array[i] = std::min(lhs._array[i], rhs._array[i]); - } + for (int i = 0; i < lhs.size(); i++) { + rt._array[i] = std::min(lhs._array[i], rhs._array[i]); + } - return rt; + return rt; } // componentwise max -template -inline T max(const T &lhs, const T &rhs) { - T rt; +template inline T max(const T &lhs, const T &rhs) +{ + T rt; - for (int i = 0; i < lhs.size(); i++) { - rt._array[i] = std::max(lhs._array[i], rhs._array[i]); - } + for (int i = 0; i < lhs.size(); i++) { + rt._array[i] = std::max(lhs._array[i], rhs._array[i]); + } - return rt; + return rt; } -}; +}; // namespace nv #endif diff --git a/Samples/5_Domain_Specific/smokeParticles/particleDemo.cpp b/Samples/5_Domain_Specific/smokeParticles/particleDemo.cpp index 2b23cf22..fa34fb69 100644 --- a/Samples/5_Domain_Specific/smokeParticles/particleDemo.cpp +++ b/Samples/5_Domain_Specific/smokeParticles/particleDemo.cpp @@ -38,12 +38,11 @@ This file handles OpenGL initialization and the user interface. */ -#include -#include #include -#include - +#include +#include #include +#include #if defined(__APPLE__) || defined(__MACOSX) #pragma clang diagnostic ignored "-Wdeprecated-declarations" #include @@ -54,30 +53,29 @@ #include #endif -#include #include - +#include #include -#include "ParticleSystem.h" -#include "ParticleSystem.cuh" -#include "SmokeRenderer.h" -#include "paramgl.h" #include "GLSLProgram.h" +#include "ParticleSystem.cuh" +#include "ParticleSystem.h" +#include "SmokeRenderer.h" #include "SmokeShaders.h" +#include "paramgl.h" uint numParticles = 1 << 16; -ParticleSystem *psystem = 0; -SmokeRenderer *renderer = 0; -GLSLProgram *floorProg = 0; +ParticleSystem *psystem = 0; +SmokeRenderer *renderer = 0; +GLSLProgram *floorProg = 0; int winWidth = 1280, winHeight = 1024; int g_TotalErrors = 0; // view params -int ox, oy; -int buttonState = 0; +int ox, oy; +int buttonState = 0; bool keyDown[256]; vec3f cameraPos(0, -1, -4); @@ -89,855 +87,860 @@ vec3f cursorPosLag(cursorPos); vec3f lightPos(5.0, 5.0, -5.0); -const float inertia = 0.1f; +const float inertia = 0.1f; const float translateSpeed = 0.002f; -const float cursorSpeed = 0.01f; -const float rotateSpeed = 0.2f; -const float walkSpeed = 0.05f; +const float cursorSpeed = 0.01f; +const float rotateSpeed = 0.2f; +const float walkSpeed = 0.05f; enum { M_VIEW = 0, M_MOVE_CURSOR, M_MOVE_LIGHT }; -int mode = 0; +int mode = 0; int displayMode = (int)SmokeRenderer::VOLUMETRIC; // QA AutoTest bool g_bQAReadback = false; // toggles -bool displayEnabled = true; -bool paused = false; -bool displaySliders = false; -bool wireframe = false; -bool animateEmitter = true; -bool emitterOn = true; -bool sort = true; +bool displayEnabled = true; +bool paused = false; +bool displaySliders = false; +bool wireframe = false; +bool animateEmitter = true; +bool emitterOn = true; +bool sort = true; bool displayLightBuffer = false; -bool drawVectors = false; -bool doBlur = false; +bool drawVectors = false; +bool doBlur = false; -float emitterVel = 0.0f; -uint emitterRate = 1000; +float emitterVel = 0.0f; +uint emitterRate = 1000; float emitterRadius = 0.25; float emitterSpread = 0.0; -uint emitterIndex = 0; +uint emitterIndex = 0; // simulation parameters -float timestep = 0.5f; -float currentTime = 0.0f; -float spriteSize = 0.05f; -float alpha = 
0.1f; -float shadowAlpha = 0.02f; +float timestep = 0.5f; +float currentTime = 0.0f; +float spriteSize = 0.05f; +float alpha = 0.1f; +float shadowAlpha = 0.02f; float particleLifetime = (float)numParticles / (float)emitterRate; vec3f lightColor(1.0f, 1.0f, 0.8f); vec3f colorAttenuation(0.5f, 0.75f, 1.0f); float blurRadius = 2.0f; -int numSlices = 64; +int numSlices = 64; int numDisplayedSlices = numSlices; // fps -static int fpsCount = 0; -static int fpsLimit = 1; -StopWatchInterface *timer = NULL; +static int fpsCount = 0; +static int fpsLimit = 1; +StopWatchInterface *timer = NULL; -float modelView[16]; +float modelView[16]; ParamListGL *params; GLuint floorTex = 0; // CheckRender object for verification #define MAX_EPSILON_ERROR 10.0f -#define THRESHOLD 0.40f +#define THRESHOLD 0.40f // Define the files that are to be saved and the reference images for validation const char *sSDKsample = "CUDA Smoke Particles"; -const char *sRefBin[] = {"ref_smokePart_pos.bin", "ref_smokePart_vel.bin", - NULL}; +const char *sRefBin[] = {"ref_smokePart_pos.bin", "ref_smokePart_vel.bin", NULL}; void runEmitter(); // initialize particle system -void initParticles(int numParticles, bool bUseVBO, bool bUseGL) { - psystem = new ParticleSystem(numParticles, bUseVBO, bUseGL); - psystem->reset(ParticleSystem::CONFIG_RANDOM); +void initParticles(int numParticles, bool bUseVBO, bool bUseGL) +{ + psystem = new ParticleSystem(numParticles, bUseVBO, bUseGL); + psystem->reset(ParticleSystem::CONFIG_RANDOM); - if (bUseVBO) { - renderer = new SmokeRenderer(numParticles); - renderer->setLightTarget(vec3f(0.0, 1.0, 0.0)); + if (bUseVBO) { + renderer = new SmokeRenderer(numParticles); + renderer->setLightTarget(vec3f(0.0, 1.0, 0.0)); - sdkCreateTimer(&timer); - } + sdkCreateTimer(&timer); + } } -void cleanup() { - if (psystem) { - delete psystem; - } +void cleanup() +{ + if (psystem) { + delete psystem; + } - if (renderer) { - delete renderer; - } + if (renderer) { + delete renderer; + } - if (floorProg) { - delete floorProg; - } + if (floorProg) { + delete floorProg; + } - sdkDeleteTimer(&timer); + sdkDeleteTimer(&timer); - if (params) { - delete params; - } + if (params) { + delete params; + } - if (floorTex) { - glDeleteTextures(1, &floorTex); - } + if (floorTex) { + glDeleteTextures(1, &floorTex); + } } -void renderScene() { - glEnable(GL_DEPTH_TEST); - glDepthMask(GL_TRUE); +void renderScene() +{ + glEnable(GL_DEPTH_TEST); + glDepthMask(GL_TRUE); - // draw floor - floorProg->enable(); - floorProg->bindTexture("tex", floorTex, GL_TEXTURE_2D, 0); - floorProg->bindTexture("shadowTex", renderer->getShadowTexture(), - GL_TEXTURE_2D, 1); - floorProg->setUniformfv("lightPosEye", renderer->getLightPositionEyeSpace(), - 3); - floorProg->setUniformfv("lightColor", lightColor, 3); + // draw floor + floorProg->enable(); + floorProg->bindTexture("tex", floorTex, GL_TEXTURE_2D, 0); + floorProg->bindTexture("shadowTex", renderer->getShadowTexture(), GL_TEXTURE_2D, 1); + floorProg->setUniformfv("lightPosEye", renderer->getLightPositionEyeSpace(), 3); + floorProg->setUniformfv("lightColor", lightColor, 3); - // set shadow matrix as texture matrix - matrix4f shadowMatrix = renderer->getShadowMatrix(); - glActiveTexture(GL_TEXTURE0); - glMatrixMode(GL_TEXTURE); - glLoadMatrixf((GLfloat *)shadowMatrix.get_value()); + // set shadow matrix as texture matrix + matrix4f shadowMatrix = renderer->getShadowMatrix(); + glActiveTexture(GL_TEXTURE0); + glMatrixMode(GL_TEXTURE); + glLoadMatrixf((GLfloat *)shadowMatrix.get_value()); - glColor3f(1.0, 
1.0, 1.0); - glNormal3f(0.0, 1.0, 0.0); - glBegin(GL_QUADS); - { - float s = 20.f; - float rep = 20.f; - glTexCoord2f(0.f, 0.f); - glVertex3f(-s, 0, -s); - glTexCoord2f(rep, 0.f); - glVertex3f(s, 0, -s); - glTexCoord2f(rep, rep); - glVertex3f(s, 0, s); - glTexCoord2f(0.f, rep); - glVertex3f(-s, 0, s); - } - glEnd(); - floorProg->disable(); + glColor3f(1.0, 1.0, 1.0); + glNormal3f(0.0, 1.0, 0.0); + glBegin(GL_QUADS); + { + float s = 20.f; + float rep = 20.f; + glTexCoord2f(0.f, 0.f); + glVertex3f(-s, 0, -s); + glTexCoord2f(rep, 0.f); + glVertex3f(s, 0, -s); + glTexCoord2f(rep, rep); + glVertex3f(s, 0, s); + glTexCoord2f(0.f, rep); + glVertex3f(-s, 0, s); + } + glEnd(); + floorProg->disable(); - glMatrixMode(GL_TEXTURE); - glLoadIdentity(); + glMatrixMode(GL_TEXTURE); + glLoadIdentity(); - // draw light - glMatrixMode(GL_MODELVIEW); - glPushMatrix(); - glTranslatef(lightPos.x, lightPos.y, lightPos.z); - glColor3fv(&lightColor[0]); - glutSolidSphere(0.1, 10, 5); - glPopMatrix(); + // draw light + glMatrixMode(GL_MODELVIEW); + glPushMatrix(); + glTranslatef(lightPos.x, lightPos.y, lightPos.z); + glColor3fv(&lightColor[0]); + glutSolidSphere(0.1, 10, 5); + glPopMatrix(); } // main rendering loop -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // move camera - if (cameraPos[1] > 0.0f) { - cameraPos[1] = 0.0f; - } - - cameraPosLag += (cameraPos - cameraPosLag) * inertia; - cameraRotLag += (cameraRot - cameraRotLag) * inertia; - cursorPosLag += (cursorPos - cursorPosLag) * inertia; - - // view transform - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); - glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0); - glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0); - glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]); - - glGetFloatv(GL_MODELVIEW_MATRIX, modelView); - - // update the simulation - if (!paused) { - if (emitterOn) { - runEmitter(); + // move camera + if (cameraPos[1] > 0.0f) { + cameraPos[1] = 0.0f; } - SimParams &p = psystem->getParams(); - p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); + cameraPosLag += (cameraPos - cameraPosLag) * inertia; + cameraRotLag += (cameraRot - cameraRotLag) * inertia; + cursorPosLag += (cursorPos - cursorPosLag) * inertia; - psystem->step(timestep); - currentTime += timestep; - } + // view transform + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + glRotatef(cameraRotLag[0], 1.0, 0.0, 0.0); + glRotatef(cameraRotLag[1], 0.0, 1.0, 0.0); + glTranslatef(cameraPosLag[0], cameraPosLag[1], cameraPosLag[2]); - renderer->calcVectors(); - vec3f sortVector = renderer->getSortVector(); + glGetFloatv(GL_MODELVIEW_MATRIX, modelView); - psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z)); - psystem->setModelView(modelView); - psystem->setSorting(sort); - psystem->depthSort(); + // update the simulation + if (!paused) { + if (emitterOn) { + runEmitter(); + } - // render - glClearColor(0.0, 0.0, 0.0, 1.0); - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - renderScene(); + SimParams &p = psystem->getParams(); + p.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); - // draw particles - if (displayEnabled) { - // render scene to offscreen buffers to get correct occlusion - renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER); + psystem->step(timestep); + currentTime += timestep; + } + + renderer->calcVectors(); + vec3f sortVector = renderer->getSortVector(); + + psystem->setSortVector(make_float3(sortVector.x, sortVector.y, sortVector.z)); + 
psystem->setModelView(modelView); + psystem->setSorting(sort); + psystem->depthSort(); + + // render + glClearColor(0.0, 0.0, 0.0, 1.0); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); renderScene(); - renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER); - renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER); - renderScene(); - renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER); + // draw particles + if (displayEnabled) { + // render scene to offscreen buffers to get correct occlusion + renderer->beginSceneRender(SmokeRenderer::LIGHT_BUFFER); + renderScene(); + renderer->endSceneRender(SmokeRenderer::LIGHT_BUFFER); - renderer->setPositionBuffer(psystem->getPosBuffer()); - renderer->setVelocityBuffer(psystem->getVelBuffer()); - renderer->setIndexBuffer(psystem->getSortedIndexBuffer()); + renderer->beginSceneRender(SmokeRenderer::SCENE_BUFFER); + renderScene(); + renderer->endSceneRender(SmokeRenderer::SCENE_BUFFER); - renderer->setNumParticles(psystem->getNumParticles()); - renderer->setParticleRadius(spriteSize); - renderer->setDisplayLightBuffer(displayLightBuffer); - renderer->setAlpha(alpha); - renderer->setShadowAlpha(shadowAlpha); - renderer->setLightPosition(lightPos); - renderer->setColorAttenuation(colorAttenuation); - renderer->setLightColor(lightColor); - renderer->setNumSlices(numSlices); - renderer->setNumDisplayedSlices(numDisplayedSlices); - renderer->setBlurRadius(blurRadius); + renderer->setPositionBuffer(psystem->getPosBuffer()); + renderer->setVelocityBuffer(psystem->getVelBuffer()); + renderer->setIndexBuffer(psystem->getSortedIndexBuffer()); - renderer->render(); + renderer->setNumParticles(psystem->getNumParticles()); + renderer->setParticleRadius(spriteSize); + renderer->setDisplayLightBuffer(displayLightBuffer); + renderer->setAlpha(alpha); + renderer->setShadowAlpha(shadowAlpha); + renderer->setLightPosition(lightPos); + renderer->setColorAttenuation(colorAttenuation); + renderer->setLightColor(lightColor); + renderer->setNumSlices(numSlices); + renderer->setNumDisplayedSlices(numDisplayedSlices); + renderer->setBlurRadius(blurRadius); - if (drawVectors) { - renderer->debugVectors(); - } - } + renderer->render(); - // display sliders - if (displaySliders) { - glDisable(GL_DEPTH_TEST); - glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color - glEnable(GL_BLEND); - params->Render(0, 0); - glDisable(GL_BLEND); - glEnable(GL_DEPTH_TEST); - } - - glutSwapBuffers(); - glutReportErrors(); - sdkStopTimer(&timer); - - fpsCount++; - - // this displays the frame rate updated every second (independent of frame - // rate) - if (fpsCount >= fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, - ifps); - glutSetWindowTitle(fps); - fpsCount = 0; - fpsLimit = (ifps > 1.f) ? 
(int)ifps : 1; - - if (paused) { - fpsLimit = 0; + if (drawVectors) { + renderer->debugVectors(); + } } - sdkResetTimer(&timer); - } + // display sliders + if (displaySliders) { + glDisable(GL_DEPTH_TEST); + glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color + glEnable(GL_BLEND); + params->Render(0, 0); + glDisable(GL_BLEND); + glEnable(GL_DEPTH_TEST); + } + + glutSwapBuffers(); + glutReportErrors(); + sdkStopTimer(&timer); + + fpsCount++; + + // this displays the frame rate updated every second (independent of frame + // rate) + if (fpsCount >= fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "CUDA Smoke Particles (%d particles): %3.1f fps", numParticles, ifps); + glutSetWindowTitle(fps); + fpsCount = 0; + fpsLimit = (ifps > 1.f) ? (int)ifps : 1; + + if (paused) { + fpsLimit = 0; + } + + sdkResetTimer(&timer); + } } // GLUT callback functions -void reshape(int w, int h) { - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - gluPerspective(60.0, (float)w / (float)h, 0.01, 100.0); +void reshape(int w, int h) +{ + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (float)w / (float)h, 0.01, 100.0); - glMatrixMode(GL_MODELVIEW); - glViewport(0, 0, w, h); + glMatrixMode(GL_MODELVIEW); + glViewport(0, 0, w, h); - renderer->setFOV(60.0); - renderer->setWindowSize(w, h); + renderer->setFOV(60.0); + renderer->setWindowSize(w, h); } -void mouse(int button, int state, int x, int y) { - int mods; +void mouse(int button, int state, int x, int y) +{ + int mods; - if (state == GLUT_DOWN) { - buttonState |= 1 << button; - } else if (state == GLUT_UP) { - buttonState = 0; - } - - mods = glutGetModifiers(); - - if (mods & GLUT_ACTIVE_SHIFT) { - buttonState = 2; - } else if (mods & GLUT_ACTIVE_CTRL) { - buttonState = 3; - } - - ox = x; - oy = y; - - if (displaySliders) { - if (params->Mouse(x, y, button, state)) { - glutPostRedisplay(); - return; + if (state == GLUT_DOWN) { + buttonState |= 1 << button; + } + else if (state == GLUT_UP) { + buttonState = 0; } - } - glutPostRedisplay(); + mods = glutGetModifiers(); + + if (mods & GLUT_ACTIVE_SHIFT) { + buttonState = 2; + } + else if (mods & GLUT_ACTIVE_CTRL) { + buttonState = 3; + } + + ox = x; + oy = y; + + if (displaySliders) { + if (params->Mouse(x, y, button, state)) { + glutPostRedisplay(); + return; + } + } + + glutPostRedisplay(); } // transform vector by matrix -void xform(vec3f &v, vec3f &r, float *m) { - r.x = v.x * m[0] + v.y * m[4] + v.z * m[8] + m[12]; - r.y = v.x * m[1] + v.y * m[5] + v.z * m[9] + m[13]; - r.z = v.x * m[2] + v.y * m[6] + v.z * m[10] + m[14]; +void xform(vec3f &v, vec3f &r, float *m) +{ + r.x = v.x * m[0] + v.y * m[4] + v.z * m[8] + m[12]; + r.y = v.x * m[1] + v.y * m[5] + v.z * m[9] + m[13]; + r.z = v.x * m[2] + v.y * m[6] + v.z * m[10] + m[14]; } // transform vector by transpose of matrix (assuming orthonormal) -void ixform(vec3f &v, vec3f &r, float *m) { - r.x = v.x * m[0] + v.y * m[1] + v.z * m[2]; - r.y = v.x * m[4] + v.y * m[5] + v.z * m[6]; - r.z = v.x * m[8] + v.y * m[9] + v.z * m[10]; +void ixform(vec3f &v, vec3f &r, float *m) +{ + r.x = v.x * m[0] + v.y * m[1] + v.z * m[2]; + r.y = v.x * m[4] + v.y * m[5] + v.z * m[6]; + r.z = v.x * m[8] + v.y * m[9] + v.z * m[10]; } -void ixformPoint(vec3f &v, vec3f &r, float *m) { - vec3f x; - x.x = v.x - m[12]; - x.y = v.y - m[13]; - x.z = v.z - m[14]; - ixform(x, r, m); +void ixformPoint(vec3f &v, vec3f &r, float *m) +{ + vec3f x; + x.x = v.x - m[12]; + x.y = v.y - m[13]; + x.z = v.z - m[14]; 
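        // (editor's annotation, not a patch line: m is a column-major OpenGL
        //  matrix, so m[12..14] holds the translation t; the subtraction above
        //  plus the transposed rotation applied below compute r = R^T * (v - t),
        //  the exact inverse of an orthonormal modelview transform)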
+ ixform(x, r, m); } -void motion(int x, int y) { - float dx, dy; - dx = (float)(x - ox); - dy = (float)(y - oy); +void motion(int x, int y) +{ + float dx, dy; + dx = (float)(x - ox); + dy = (float)(y - oy); - if (displaySliders) { - if (params->Motion(x, y)) { - ox = x; - oy = y; - glutPostRedisplay(); - return; + if (displaySliders) { + if (params->Motion(x, y)) { + ox = x; + oy = y; + glutPostRedisplay(); + return; + } } - } - switch (mode) { + switch (mode) { case M_VIEW: { - if (buttonState == 1) { - // left = rotate - cameraRot[0] += dy * rotateSpeed; - cameraRot[1] += dx * rotateSpeed; - } + if (buttonState == 1) { + // left = rotate + cameraRot[0] += dy * rotateSpeed; + cameraRot[1] += dx * rotateSpeed; + } - if (buttonState == 2) { - // middle = translate - vec3f v = vec3f(dx * translateSpeed, -dy * translateSpeed, 0.0f); - vec3f r; - ixform(v, r, modelView); - cameraPos += r; - } + if (buttonState == 2) { + // middle = translate + vec3f v = vec3f(dx * translateSpeed, -dy * translateSpeed, 0.0f); + vec3f r; + ixform(v, r, modelView); + cameraPos += r; + } - if (buttonState == 3) { - // left+middle = zoom - vec3f v = vec3f(0.0, 0.0, dy * translateSpeed); - vec3f r; - ixform(v, r, modelView); - cameraPos += r; - } + if (buttonState == 3) { + // left+middle = zoom + vec3f v = vec3f(0.0, 0.0, dy * translateSpeed); + vec3f r; + ixform(v, r, modelView); + cameraPos += r; + } } break; case M_MOVE_CURSOR: { - if (buttonState == 1) { - vec3f v = vec3f(dx * cursorSpeed, -dy * cursorSpeed, 0.0f); - vec3f r; - ixform(v, r, modelView); - cursorPos += r; - } else if (buttonState == 2) { - vec3f v = vec3f(0.0f, 0.0f, dy * cursorSpeed); - vec3f r; - ixform(v, r, modelView); - cursorPos += r; - } + if (buttonState == 1) { + vec3f v = vec3f(dx * cursorSpeed, -dy * cursorSpeed, 0.0f); + vec3f r; + ixform(v, r, modelView); + cursorPos += r; + } + else if (buttonState == 2) { + vec3f v = vec3f(0.0f, 0.0f, dy * cursorSpeed); + vec3f r; + ixform(v, r, modelView); + cursorPos += r; + } } break; case M_MOVE_LIGHT: - if (buttonState == 1) { - vec3f v = vec3f(dx * cursorSpeed, -dy * cursorSpeed, 0.0f); - vec3f r; - ixform(v, r, modelView); - lightPos += r; - } else if (buttonState == 2) { - vec3f v = vec3f(0.0f, 0.0f, dy * cursorSpeed); - vec3f r; - ixform(v, r, modelView); - lightPos += r; - } + if (buttonState == 1) { + vec3f v = vec3f(dx * cursorSpeed, -dy * cursorSpeed, 0.0f); + vec3f r; + ixform(v, r, modelView); + lightPos += r; + } + else if (buttonState == 2) { + vec3f v = vec3f(0.0f, 0.0f, dy * cursorSpeed); + vec3f r; + ixform(v, r, modelView); + lightPos += r; + } - break; - } + break; + } - ox = x; - oy = y; - glutPostRedisplay(); + ox = x; + oy = y; + glutPostRedisplay(); } // commented out to remove unused parameter warnings in Linux -void key(unsigned char key, int /*x*/, int /*y*/) { - switch (key) { +void key(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) { case ' ': - paused = !paused; - break; + paused = !paused; + break; case 13: - psystem->step(timestep); - renderer->setPositionBuffer(psystem->getPosBuffer()); - renderer->setVelocityBuffer(psystem->getVelBuffer()); - break; + psystem->step(timestep); + renderer->setPositionBuffer(psystem->getPosBuffer()); + renderer->setVelocityBuffer(psystem->getVelBuffer()); + break; case '\033': - cleanup(); - exit(EXIT_SUCCESS); - break; + cleanup(); + exit(EXIT_SUCCESS); + break; case 'v': - mode = M_VIEW; - animateEmitter = true; - break; + mode = M_VIEW; + animateEmitter = true; + break; case 'm': - mode = M_MOVE_CURSOR; - 
animateEmitter = false; - break; + mode = M_MOVE_CURSOR; + animateEmitter = false; + break; case 'l': - mode = M_MOVE_LIGHT; - break; + mode = M_MOVE_LIGHT; + break; case 'r': - displayEnabled = !displayEnabled; - break; + displayEnabled = !displayEnabled; + break; case '1': - psystem->reset(ParticleSystem::CONFIG_RANDOM); - break; + psystem->reset(ParticleSystem::CONFIG_RANDOM); + break; case '2': - emitterOn ^= 1; - break; + emitterOn ^= 1; + break; case 'W': - wireframe = !wireframe; - break; + wireframe = !wireframe; + break; case 'h': - displaySliders = !displaySliders; - break; + displaySliders = !displaySliders; + break; case 'o': - sort ^= 1; - psystem->setSorting(sort); - break; + sort ^= 1; + psystem->setSorting(sort); + break; case 'D': - displayLightBuffer ^= 1; - break; + displayLightBuffer ^= 1; + break; case 'p': - displayMode = (displayMode + 1) % SmokeRenderer::NUM_MODES; - renderer->setDisplayMode((SmokeRenderer::DisplayMode)displayMode); - break; + displayMode = (displayMode + 1) % SmokeRenderer::NUM_MODES; + renderer->setDisplayMode((SmokeRenderer::DisplayMode)displayMode); + break; case 'P': - displayMode--; + displayMode--; - if (displayMode < 0) { - displayMode = SmokeRenderer::NUM_MODES - 1; - } + if (displayMode < 0) { + displayMode = SmokeRenderer::NUM_MODES - 1; + } - renderer->setDisplayMode((SmokeRenderer::DisplayMode)displayMode); - break; + renderer->setDisplayMode((SmokeRenderer::DisplayMode)displayMode); + break; case 'V': - drawVectors ^= 1; - break; + drawVectors ^= 1; + break; case '=': - numSlices *= 2; + numSlices *= 2; - if (numSlices > 256) { - numSlices = 256; - } + if (numSlices > 256) { + numSlices = 256; + } - numDisplayedSlices = numSlices; - break; + numDisplayedSlices = numSlices; + break; case '-': - if (numSlices > 1) { - numSlices /= 2; - } + if (numSlices > 1) { + numSlices /= 2; + } - numDisplayedSlices = numSlices; - break; + numDisplayedSlices = numSlices; + break; case 'b': - doBlur ^= 1; - renderer->setDoBlur(doBlur); - break; - } + doBlur ^= 1; + renderer->setDoBlur(doBlur); + break; + } - printf("numSlices = %d\n", numSlices); - keyDown[key] = true; + printf("numSlices = %d\n", numSlices); + keyDown[key] = true; - glutPostRedisplay(); + glutPostRedisplay(); } void keyUp(unsigned char key, int /*x*/, int /*y*/) { keyDown[key] = false; } -void runEmitter() { - vec3f vel = vec3f(0, emitterVel, 0); - vec3f vx(1, 0, 0); - vec3f vy(0, 0, 1); - vec3f spread(emitterSpread, 0.0f, emitterSpread); +void runEmitter() +{ + vec3f vel = vec3f(0, emitterVel, 0); + vec3f vx(1, 0, 0); + vec3f vy(0, 0, 1); + vec3f spread(emitterSpread, 0.0f, emitterSpread); - psystem->sphereEmitter(emitterIndex, cursorPosLag, vel, spread, emitterRadius, - ftoi(emitterRate * timestep), particleLifetime, - particleLifetime * 0.1f); + psystem->sphereEmitter(emitterIndex, + cursorPosLag, + vel, + spread, + emitterRadius, + ftoi(emitterRate * timestep), + particleLifetime, + particleLifetime * 0.1f); - if (emitterIndex > numParticles - 1) { - emitterIndex = 0; - } + if (emitterIndex > numParticles - 1) { + emitterIndex = 0; + } } -void special(int k, int x, int y) { - if (displaySliders) { - params->Special(k, x, y); - } +void special(int k, int x, int y) +{ + if (displaySliders) { + params->Special(k, x, y); + } } -void idle(void) { - // move camera in view direction - /* - 0 4 8 12 x - 1 5 9 13 y - 2 6 10 14 z - */ - if (keyDown['w']) { - cameraPos[0] += modelView[2] * walkSpeed; - cameraPos[1] += modelView[6] * walkSpeed; - cameraPos[2] += modelView[10] * walkSpeed; - } 
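// Editor's sketch (illustrative, not part of the patch): the idle() handler
// below walks the camera along rows of the column-major modelview, because
// row i of its upper-left 3x3 (m[i], m[i+4], m[i+8]) is the camera's i-th
// axis expressed in world space. A self-contained check:
#include <cstdio>

int main()
{
    // identity modelview: camera axes coincide with the world axes
    float m[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
    float fwd[3] = {m[2], m[6], m[10]}; // view z-axis, used by the 'w'/'s' keys
    std::printf("forward = (%g, %g, %g)\n", fwd[0], fwd[1], fwd[2]);
    return 0;
}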
+void idle(void)
+{
+    // move camera in view direction
+    /*
+       0   4   8  12  x
+       1   5   9  13  y
+       2   6  10  14  z
+    */
+    if (keyDown['w']) {
+        cameraPos[0] += modelView[2] * walkSpeed;
+        cameraPos[1] += modelView[6] * walkSpeed;
+        cameraPos[2] += modelView[10] * walkSpeed;
+    }

-  if (keyDown['s']) {
-    cameraPos[0] -= modelView[2] * walkSpeed;
-    cameraPos[1] -= modelView[6] * walkSpeed;
-    cameraPos[2] -= modelView[10] * walkSpeed;
-  }
+    if (keyDown['s']) {
+        cameraPos[0] -= modelView[2] * walkSpeed;
+        cameraPos[1] -= modelView[6] * walkSpeed;
+        cameraPos[2] -= modelView[10] * walkSpeed;
+    }

-  if (keyDown['a']) {
-    cameraPos[0] += modelView[0] * walkSpeed;
-    cameraPos[1] += modelView[4] * walkSpeed;
-    cameraPos[2] += modelView[8] * walkSpeed;
-  }
+    if (keyDown['a']) {
+        cameraPos[0] += modelView[0] * walkSpeed;
+        cameraPos[1] += modelView[4] * walkSpeed;
+        cameraPos[2] += modelView[8] * walkSpeed;
+    }

-  if (keyDown['d']) {
-    cameraPos[0] -= modelView[0] * walkSpeed;
-    cameraPos[1] -= modelView[4] * walkSpeed;
-    cameraPos[2] -= modelView[8] * walkSpeed;
-  }
+    if (keyDown['d']) {
+        cameraPos[0] -= modelView[0] * walkSpeed;
+        cameraPos[1] -= modelView[4] * walkSpeed;
+        cameraPos[2] -= modelView[8] * walkSpeed;
+    }

-  if (keyDown['e']) {
-    cameraPos[0] += modelView[1] * walkSpeed;
-    cameraPos[1] += modelView[5] * walkSpeed;
-    cameraPos[2] += modelView[9] * walkSpeed;
-  }
+    if (keyDown['e']) {
+        cameraPos[0] += modelView[1] * walkSpeed;
+        cameraPos[1] += modelView[5] * walkSpeed;
+        cameraPos[2] += modelView[9] * walkSpeed;
+    }

-  if (keyDown['q']) {
-    cameraPos[0] -= modelView[1] * walkSpeed;
-    cameraPos[1] -= modelView[5] * walkSpeed;
-    cameraPos[2] -= modelView[9] * walkSpeed;
-  }
+    if (keyDown['q']) {
+        cameraPos[0] -= modelView[1] * walkSpeed;
+        cameraPos[1] -= modelView[5] * walkSpeed;
+        cameraPos[2] -= modelView[9] * walkSpeed;
+    }

-  if (animateEmitter) {
-    const float speed = 0.02f;
-    cursorPos.x = sin(currentTime * speed) * 1.5f;
-    cursorPos.y = 1.5f + sin(currentTime * speed * 1.3f);
-    cursorPos.z = cos(currentTime * speed) * 1.5f;
-  }
+    if (animateEmitter) {
+        const float speed = 0.02f;
+        cursorPos.x       = sin(currentTime * speed) * 1.5f;
+        cursorPos.y       = 1.5f + sin(currentTime * speed * 1.3f);
+        cursorPos.z       = cos(currentTime * speed) * 1.5f;
+    }

-  glutPostRedisplay();
+    glutPostRedisplay();
 }

 // initialize sliders
-void initParams() {
-  // create a new parameter list
-  params = new ParamListGL("misc");
+void initParams()
+{
+    // create a new parameter list
+    params = new ParamListGL("misc");

-  params->AddParam(new Param<int>("displayed slices", numDisplayedSlices, 0,
-                                  256, 1, &numDisplayedSlices));
+    params->AddParam(new Param<int>("displayed slices", numDisplayedSlices, 0, 256, 1, &numDisplayedSlices));

-  params->AddParam(
-      new Param<float>("time step", timestep, 0.0f, 1.0f, 0.001f, &timestep));
+    params->AddParam(new Param<float>("time step", timestep, 0.0f, 1.0f, 0.001f, &timestep));

-  SimParams &p = psystem->getParams();
-  params->AddParam(
-      new Param<float>("damping", 0.99f, 0.0f, 1.0f, 0.001f, &p.globalDamping));
-  params->AddParam(
-      new Param<float>("gravity", 0.0f, 0.01f, -0.01f, 0.0001f, &p.gravity.y));
+    SimParams &p = psystem->getParams();
+    params->AddParam(new Param<float>("damping", 0.99f, 0.0f, 1.0f, 0.001f, &p.globalDamping));
+    params->AddParam(new Param<float>("gravity", 0.0f, 0.01f, -0.01f, 0.0001f, &p.gravity.y));

-  params->AddParam(
-      new Param<float>("noise freq", 0.1f, 0.0f, 1.0f, 0.001f, &p.noiseFreq));
-  params->AddParam(new Param<float>("noise strength", 0.001f, 0.0f, 0.01f,
-                                    0.001f, &p.noiseAmp));
-  params->AddParam(new Param<float>("noise anim", 0.0f, -0.001f, 0.001f,
-                                    0.0001f, &p.noiseSpeed.y));
+    params->AddParam(new Param<float>("noise freq", 0.1f, 0.0f, 1.0f, 0.001f, &p.noiseFreq));
+    params->AddParam(new Param<float>("noise strength", 0.001f, 0.0f, 0.01f, 0.001f, &p.noiseAmp));
+    params->AddParam(new Param<float>("noise anim", 0.0f, -0.001f, 0.001f, 0.0001f, &p.noiseSpeed.y));

-  params->AddParam(new Param<float>("sprite size", spriteSize, 0.0f, 0.1f,
-                                    0.001f, &spriteSize));
-  params->AddParam(
-      new Param<float>("alpha", alpha, 0.0f, 1.0f, 0.001f, &alpha));
+    params->AddParam(new Param<float>("sprite size", spriteSize, 0.0f, 0.1f, 0.001f, &spriteSize));
+    params->AddParam(new Param<float>("alpha", alpha, 0.0f, 1.0f, 0.001f, &alpha));

-  params->AddParam(new Param<float>("light color r", lightColor[0], 0.0f, 1.0f,
-                                    0.01f, &lightColor[0]));
-  params->AddParam(new Param<float>("light color g", lightColor[1], 0.0f, 1.0f,
-                                    0.01f, &lightColor[1]));
-  params->AddParam(new Param<float>("light color b", lightColor[2], 0.0f, 1.0f,
-                                    0.01f, &lightColor[2]));
+    params->AddParam(new Param<float>("light color r", lightColor[0], 0.0f, 1.0f, 0.01f, &lightColor[0]));
+    params->AddParam(new Param<float>("light color g", lightColor[1], 0.0f, 1.0f, 0.01f, &lightColor[1]));
+    params->AddParam(new Param<float>("light color b", lightColor[2], 0.0f, 1.0f, 0.01f, &lightColor[2]));

-  params->AddParam(new Param<float>("atten color r", colorAttenuation[0], 0.0f,
-                                    1.0f, 0.01f, &colorAttenuation[0]));
-  params->AddParam(new Param<float>("atten color g", colorAttenuation[1], 0.0f,
-                                    1.0f, 0.01f, &colorAttenuation[1]));
-  params->AddParam(new Param<float>("atten color b", colorAttenuation[2], 0.0f,
-                                    1.0f, 0.01f, &colorAttenuation[2]));
-  params->AddParam(new Param<float>("shadow alpha", shadowAlpha, 0.0f, 0.1f,
-                                    0.001f, &shadowAlpha));
+    params->AddParam(new Param<float>("atten color r", colorAttenuation[0], 0.0f, 1.0f, 0.01f, &colorAttenuation[0]));
+    params->AddParam(new Param<float>("atten color g", colorAttenuation[1], 0.0f, 1.0f, 0.01f, &colorAttenuation[1]));
+    params->AddParam(new Param<float>("atten color b", colorAttenuation[2], 0.0f, 1.0f, 0.01f, &colorAttenuation[2]));
+    params->AddParam(new Param<float>("shadow alpha", shadowAlpha, 0.0f, 0.1f, 0.001f, &shadowAlpha));

-  params->AddParam(new Param<float>("blur radius", blurRadius, 0.0f, 10.0f,
-                                    0.1f, &blurRadius));
+    params->AddParam(new Param<float>("blur radius", blurRadius, 0.0f, 10.0f, 0.1f, &blurRadius));

-  params->AddParam(new Param<float>("emitter radius", emitterRadius, 0.0f, 2.0f,
-                                    0.01f, &emitterRadius));
-  params->AddParam(
-      new Param<int>("emitter rate", emitterRate, 0, 10000, 1, &emitterRate));
-  params->AddParam(new Param<float>("emitter velocity", emitterVel, 0.0f, 0.1f,
-                                    0.001f, &emitterVel));
-  params->AddParam(new Param<float>("emitter spread", emitterSpread, 0.0f, 0.1f,
-                                    0.001f, &emitterSpread));
+    params->AddParam(new Param<float>("emitter radius", emitterRadius, 0.0f, 2.0f, 0.01f, &emitterRadius));
+    params->AddParam(new Param<int>("emitter rate", emitterRate, 0, 10000, 1, &emitterRate));
+    params->AddParam(new Param<float>("emitter velocity", emitterVel, 0.0f, 0.1f, 0.001f, &emitterVel));
+    params->AddParam(new Param<float>("emitter spread", emitterSpread, 0.0f, 0.1f, 0.001f, &emitterSpread));

-  params->AddParam(new Param<float>("particle lifetime", particleLifetime, 0.0f,
-                                    1000.0f, 1.0f, &particleLifetime));
+    params->AddParam(new Param<float>("particle lifetime", particleLifetime, 0.0f, 1000.0f, 1.0f, &particleLifetime));
 }

 void mainMenu(int i) { key((unsigned char)i, 0, 0); }

-void initMenus() {
-  glutCreateMenu(mainMenu);
-  glutAddMenuEntry("Reset block [1]", '1');
-  glutAddMenuEntry("Toggle emitter [2]", '2');
-  glutAddMenuEntry("Toggle animation [ ]", ' ');
-  glutAddMenuEntry("Step animation [ret]", 13);
animation [ret]", 13); - glutAddMenuEntry("View mode [v]", 'v'); - glutAddMenuEntry("Move cursor mode [m]", 'm'); - glutAddMenuEntry("Move light mode [l]", 'l'); - glutAddMenuEntry("Toggle point rendering [p]", 'p'); - glutAddMenuEntry("Toggle sliders [h]", 'h'); - glutAddMenuEntry("Toggle sorting [o]", 'o'); - glutAddMenuEntry("Toggle vectors [V]", 'V'); - glutAddMenuEntry("Display light buffer [D]", 'D'); - glutAddMenuEntry("Toggle shadow blur [b]", 'b'); - glutAddMenuEntry("Increase no. slices [=]", '='); - glutAddMenuEntry("Decrease no. slices [-]", '-'); - glutAddMenuEntry("Quit (esc)", '\033'); - glutAttachMenu(GLUT_RIGHT_BUTTON); +void initMenus() +{ + glutCreateMenu(mainMenu); + glutAddMenuEntry("Reset block [1]", '1'); + glutAddMenuEntry("Toggle emitter [2]", '2'); + glutAddMenuEntry("Toggle animation [ ]", ' '); + glutAddMenuEntry("Step animation [ret]", 13); + glutAddMenuEntry("View mode [v]", 'v'); + glutAddMenuEntry("Move cursor mode [m]", 'm'); + glutAddMenuEntry("Move light mode [l]", 'l'); + glutAddMenuEntry("Toggle point rendering [p]", 'p'); + glutAddMenuEntry("Toggle sliders [h]", 'h'); + glutAddMenuEntry("Toggle sorting [o]", 'o'); + glutAddMenuEntry("Toggle vectors [V]", 'V'); + glutAddMenuEntry("Display light buffer [D]", 'D'); + glutAddMenuEntry("Toggle shadow blur [b]", 'b'); + glutAddMenuEntry("Increase no. slices [=]", '='); + glutAddMenuEntry("Decrease no. slices [-]", '-'); + glutAddMenuEntry("Quit (esc)", '\033'); + glutAttachMenu(GLUT_RIGHT_BUTTON); } -GLuint createTexture(GLenum target, GLint internalformat, GLenum format, int w, - int h, void *data) { - GLuint tex; - glGenTextures(1, &tex); - glBindTexture(target, tex); - glTexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(target, GL_TEXTURE_WRAP_S, GL_REPEAT); - glTexParameteri(target, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexParameteri(target, GL_GENERATE_MIPMAP_SGIS, GL_TRUE); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glTexImage2D(target, 0, internalformat, w, h, 0, format, GL_UNSIGNED_BYTE, - data); - return tex; +GLuint createTexture(GLenum target, GLint internalformat, GLenum format, int w, int h, void *data) +{ + GLuint tex; + glGenTextures(1, &tex); + glBindTexture(target, tex); + glTexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(target, GL_TEXTURE_WRAP_S, GL_REPEAT); + glTexParameteri(target, GL_TEXTURE_WRAP_T, GL_REPEAT); + glTexParameteri(target, GL_GENERATE_MIPMAP_SGIS, GL_TRUE); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexImage2D(target, 0, internalformat, w, h, 0, format, GL_UNSIGNED_BYTE, data); + return tex; } -GLuint loadTexture(char *filename) { - unsigned char *data = 0; - unsigned int width, height; - sdkLoadPPM4ub(filename, &data, &width, &height); +GLuint loadTexture(char *filename) +{ + unsigned char *data = 0; + unsigned int width, height; + sdkLoadPPM4ub(filename, &data, &width, &height); - if (!data) { - printf("Error opening file '%s'\n", filename); - return 0; - } + if (!data) { + printf("Error opening file '%s'\n", filename); + return 0; + } - printf("Loaded '%s', %d x %d pixels\n", filename, width, height); + printf("Loaded '%s', %d x %d pixels\n", filename, width, height); - return createTexture(GL_TEXTURE_2D, GL_RGBA8, GL_RGBA, width, height, data); + return createTexture(GL_TEXTURE_2D, GL_RGBA8, GL_RGBA, width, height, data); } // initialize OpenGL -void initGL(int *argc, char **argv) { - glutInit(argc, 
-  glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
-  glutInitWindowSize(winWidth, winHeight);
-  glutCreateWindow("CUDA Smoke Particles");
+void initGL(int *argc, char **argv)
+{
+    glutInit(argc, argv);
+    glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
+    glutInitWindowSize(winWidth, winHeight);
+    glutCreateWindow("CUDA Smoke Particles");

-  if (!isGLVersionSupported(2, 0)) {
-    fprintf(stderr,
-            "The following required OpenGL extensions "
-            "missing:\n\tGL_VERSION_2_0\n\tGL_VERSION_1_5\n");
-    exit(EXIT_SUCCESS);
-  }
+    if (!isGLVersionSupported(2, 0)) {
+        fprintf(stderr,
+                "The following required OpenGL extensions "
+                "missing:\n\tGL_VERSION_2_0\n\tGL_VERSION_1_5\n");
+        exit(EXIT_SUCCESS);
+    }

-  if (!areGLExtensionsSupported("GL_ARB_multitexture "
-                                "GL_ARB_vertex_buffer_object "
-                                "GL_EXT_geometry_shader4")) {
-    fprintf(stderr,
-            "The following required OpenGL extensions "
-            "missing:\n\tGL_ARB_multitexture\n\tGL_ARB_vertex_buffer_"
-            "object\n\tGL_EXT_geometry_shader4.\n");
-    exit(EXIT_SUCCESS);
-  }
+    if (!areGLExtensionsSupported("GL_ARB_multitexture "
+                                  "GL_ARB_vertex_buffer_object "
+                                  "GL_EXT_geometry_shader4")) {
+        fprintf(stderr,
+                "The following required OpenGL extensions "
+                "missing:\n\tGL_ARB_multitexture\n\tGL_ARB_vertex_buffer_"
+                "object\n\tGL_EXT_geometry_shader4.\n");
+        exit(EXIT_SUCCESS);
+    }

 #if defined(WIN32)
-  if (wglewIsSupported("WGL_EXT_swap_control")) {
-    // disable vertical sync
-    wglSwapIntervalEXT(0);
-  }
+    if (wglewIsSupported("WGL_EXT_swap_control")) {
+        // disable vertical sync
+        wglSwapIntervalEXT(0);
+    }
 #endif

-  glEnable(GL_DEPTH_TEST);
+    glEnable(GL_DEPTH_TEST);

-  // load floor texture
-  char *imagePath = sdkFindFilePath("floortile.ppm", argv[0]);
+    // load floor texture
+    char *imagePath = sdkFindFilePath("floortile.ppm", argv[0]);

-  if (imagePath == NULL) {
-    fprintf(stderr, "Error finding floor image file\n");
-    exit(EXIT_FAILURE);
-  }
+    if (imagePath == NULL) {
+        fprintf(stderr, "Error finding floor image file\n");
+        exit(EXIT_FAILURE);
+    }

-  floorTex = loadTexture(imagePath);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
-                  GL_LINEAR_MIPMAP_LINEAR);
-  glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 16.0f);
+    floorTex = loadTexture(imagePath);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR);
+    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_ANISOTROPY_EXT, 16.0f);

-  floorProg = new GLSLProgram(floorVS, floorPS);
+    floorProg = new GLSLProgram(floorVS, floorPS);

-  glutReportErrors();
+    glutReportErrors();
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 #if defined(__linux__)
-  setenv("DISPLAY", ":0", 0);
+    setenv("DISPLAY", ":0", 0);
 #endif

-  printf("%s Starting...\n\n", sSDKsample);
+    printf("%s Starting...\n\n", sSDKsample);

-  printf(
-      "NOTE: The CUDA Samples are not meant for performance measurements. "
-      "Results may vary when GPU Boost is enabled.\n\n");
+    printf("NOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n\n");
" + "Results may vary when GPU Boost is enabled.\n\n"); - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "n")) { - numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + numParticles = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { + g_bQAReadback = true; + } } - if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { - g_bQAReadback = true; + if (g_bQAReadback) { + // For Automated testing, we do not use OpenGL/CUDA interop + findCudaDevice(argc, (const char **)argv); + + // This code path is used for Automated Testing + initParticles(numParticles, false, false); + initParams(); + + if (emitterOn) { + runEmitter(); + } + + SimParams ¶ms = psystem->getParams(); + params.cursorPos = make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); + + psystem->step(timestep); + + float4 *pos = NULL, *vel = NULL; + + psystem->dumpBin(&pos, &vel); + + sdkDumpBin(pos, numParticles * sizeof(float4), "smokeParticles_pos.bin"); + sdkDumpBin(vel, numParticles * sizeof(float4), "smokeParticles_vel.bin"); + + if (!sdkCompareBin2BinFloat("smokeParticles_pos.bin", + sRefBin[0], + numParticles * sizeof(float4), + MAX_EPSILON_ERROR, + THRESHOLD, + argv[0])) { + g_TotalErrors++; + } + + if (!sdkCompareBin2BinFloat("smokeParticles_vel.bin", + sRefBin[1], + numParticles * sizeof(float4), + MAX_EPSILON_ERROR, + THRESHOLD, + argv[0])) { + g_TotalErrors++; + } + + delete psystem; } - } + else { + // Normal smokeParticles rendering path + // 1st initialize OpenGL context, so we can properly set the GL for CUDA. + // This is needed to achieve optimal performance with OpenGL/CUDA interop. + initGL(&argc, argv); - if (g_bQAReadback) { - // For Automated testing, we do not use OpenGL/CUDA interop - findCudaDevice(argc, (const char **)argv); + findCudaDevice(argc, (const char **)argv); - // This code path is used for Automated Testing - initParticles(numParticles, false, false); - initParams(); + // This is the normal code path for SmokeParticles + initParticles(numParticles, true, true); + initParams(); + initMenus(); - if (emitterOn) { - runEmitter(); + glutDisplayFunc(display); + glutReshapeFunc(reshape); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutKeyboardFunc(key); + glutKeyboardUpFunc(keyUp); + glutSpecialFunc(special); + glutIdleFunc(idle); + + glutMainLoop(); } - SimParams ¶ms = psystem->getParams(); - params.cursorPos = - make_float3(cursorPosLag.x, cursorPosLag.y, cursorPosLag.z); - - psystem->step(timestep); - - float4 *pos = NULL, *vel = NULL; - - psystem->dumpBin(&pos, &vel); - - sdkDumpBin(pos, numParticles * sizeof(float4), "smokeParticles_pos.bin"); - sdkDumpBin(vel, numParticles * sizeof(float4), "smokeParticles_vel.bin"); - - if (!sdkCompareBin2BinFloat("smokeParticles_pos.bin", sRefBin[0], - numParticles * sizeof(float4), - MAX_EPSILON_ERROR, THRESHOLD, argv[0])) { - g_TotalErrors++; - } - - if (!sdkCompareBin2BinFloat("smokeParticles_vel.bin", sRefBin[1], - numParticles * sizeof(float4), - MAX_EPSILON_ERROR, THRESHOLD, argv[0])) { - g_TotalErrors++; - } - - delete psystem; - } else { - // Normal smokeParticles rendering path - // 1st initialize OpenGL context, so we can properly set the GL for CUDA. - // This is needed to achieve optimal performance with OpenGL/CUDA interop. 
diff --git a/Samples/5_Domain_Specific/smokeParticles/particles_kernel.cuh b/Samples/5_Domain_Specific/smokeParticles/particles_kernel.cuh
index e58fae1a..de0c7c92 100644
--- a/Samples/5_Domain_Specific/smokeParticles/particles_kernel.cuh
+++ b/Samples/5_Domain_Specific/smokeParticles/particles_kernel.cuh
@@ -31,19 +31,21 @@
 #include "vector_types.h"
 typedef unsigned int uint;

-struct SimParams {
-  float3 gravity;
-  float globalDamping;
-  float noiseFreq;
-  float noiseAmp;
-  float3 cursorPos;
+struct SimParams
+{
+    float3 gravity;
+    float  globalDamping;
+    float  noiseFreq;
+    float  noiseAmp;
+    float3 cursorPos;

-  float time;
-  float3 noiseSpeed;
+    float  time;
+    float3 noiseSpeed;
 };

-struct float4x4 {
-  float m[16];
+struct float4x4
+{
+    float m[16];
 };

 #endif
diff --git a/Samples/5_Domain_Specific/smokeParticles/particles_kernel_device.cuh b/Samples/5_Domain_Specific/smokeParticles/particles_kernel_device.cuh
index 0bf37c7d..8e6e449b 100644
--- a/Samples/5_Domain_Specific/smokeParticles/particles_kernel_device.cuh
+++ b/Samples/5_Domain_Specific/smokeParticles/particles_kernel_device.cuh
@@ -35,6 +35,10 @@
 #include "helper_math.h"
 #include "math_constants.h"
 #include "particles_kernel.cuh"
+#include "thrust/device_ptr.h"
+#include "thrust/for_each.h"
+#include "thrust/iterator/zip_iterator.h"
+#include "thrust/sort.h"

 cudaTextureObject_t noiseTex;
 // simulation parameters
@@ -43,76 +47,75 @@ __constant__ SimParams cudaParams;
 // look up in 3D noise texture
 __device__ float3 noise3D(float3 p, cudaTextureObject_t noiseTex)
 {
-  float4 n = tex3D<float4>(noiseTex, p.x, p.y, p.z);
-  return make_float3(n.x, n.y, n.z);
+    float4 n = tex3D<float4>(noiseTex, p.x, p.y, p.z);
+    return make_float3(n.x, n.y, n.z);
 }

 // integrate particle attributes
 struct integrate_functor
 {
-  float deltaTime;
-  cudaTextureObject_t noiseTex;
+    float               deltaTime;
+    cudaTextureObject_t noiseTex;

-  __host__ __device__ integrate_functor(float delta_time,
-                                        cudaTextureObject_t noise_Tex)
-      : deltaTime(delta_time), noiseTex(noise_Tex) {}
-
-  template <typename Tuple>
-  __device__ void operator()(Tuple t)
-  {
-    volatile float4 posData = thrust::get<2>(t);
-    volatile float4 velData = thrust::get<3>(t);
-
-    float3 pos = make_float3(posData.x, posData.y, posData.z);
-    float3 vel = make_float3(velData.x, velData.y, velData.z);
-
-    // update particle age
-    float age = posData.w;
-    float lifetime = velData.w;
-
-    if (age < lifetime)
+    __host__ __device__ integrate_functor(float delta_time, cudaTextureObject_t noise_Tex)
+        : deltaTime(delta_time)
+        , noiseTex(noise_Tex)
     {
-      age += deltaTime;
-    }
-    else
-    {
-      age = lifetime;
     }

-    // apply accelerations
-    vel += cudaParams.gravity * deltaTime;
+    template <typename Tuple> __device__ void operator()(Tuple t)
+    {
+        volatile float4 posData = thrust::get<2>(t);
+        volatile float4 velData = thrust::get<3>(t);

-    // apply procedural noise
-    float3 noise = noise3D(
-        pos * cudaParams.noiseFreq + cudaParams.time * cudaParams.noiseSpeed, noiseTex);
-    vel += noise * cudaParams.noiseAmp;
+        float3 pos = make_float3(posData.x, posData.y, posData.z);
+        float3 vel = make_float3(velData.x, velData.y, velData.z);

-    // new position = old position + velocity * deltaTime
-    pos += vel * deltaTime;
+        // update particle age
+        float age      = posData.w;
+        float lifetime = velData.w;

-    vel *= cudaParams.globalDamping;
+        if (age < lifetime) {
+            age += deltaTime;
+        }
+        else {
+            age = lifetime;
+        }

-    // store new position and velocity
-    thrust::get<0>(t) = make_float4(pos, age);
-    thrust::get<1>(t) = make_float4(vel, velData.w);
-  }
+        // apply accelerations
+        vel += cudaParams.gravity * deltaTime;
+
+        // apply procedural noise
+        float3 noise = noise3D(pos * cudaParams.noiseFreq + cudaParams.time * cudaParams.noiseSpeed, noiseTex);
+        vel += noise * cudaParams.noiseAmp;
+
+        // new position = old position + velocity * deltaTime
+        pos += vel * deltaTime;
+
+        vel *= cudaParams.globalDamping;
+
+        // store new position and velocity
+        thrust::get<0>(t) = make_float4(pos, age);
+        thrust::get<1>(t) = make_float4(vel, velData.w);
+    }
 };

 struct calcDepth_functor
 {
-  float3 sortVector;
+    float3 sortVector;

-  __host__ __device__ calcDepth_functor(float3 sort_vector)
-      : sortVector(sort_vector) {}
+    __host__ __device__ calcDepth_functor(float3 sort_vector)
+        : sortVector(sort_vector)
+    {
+    }

-  template <typename Tuple>
-  __host__ __device__ void operator()(Tuple t)
-  {
-    volatile float4 p = thrust::get<0>(t);
-    float key = -dot(make_float3(p.x, p.y, p.z),
-                     sortVector); // project onto sort vector
-    thrust::get<1>(t) = key;
-  }
+    template <typename Tuple> __host__ __device__ void operator()(Tuple t)
+    {
+        volatile float4 p   = thrust::get<0>(t);
+        float           key = -dot(make_float3(p.x, p.y, p.z),
+                                   sortVector); // project onto sort vector
+        thrust::get<1>(t)   = key;
+    }
 };

 #endif
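The integrate_functor above consumes one 4-tuple per particle: it reads the old position and velocity from tuple slots 2 and 3 and writes the updated values to slots 0 and 1. The host-side driver is not part of this hunk; a sketch of how such a functor is typically launched with thrust::for_each over a zip_iterator, assuming the four buffers are raw device pointers (names here are hypothetical):

#include <thrust/device_ptr.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>

// Wrap the raw device buffers and zip them so each invocation receives a
// (newPos, newVel, oldPos, oldVel) tuple matching the thrust::get<0..3> slots
// used by integrate_functor.
void integrateParticles(float4 *dNewPos, float4 *dNewVel, float4 *dOldPos, float4 *dOldVel,
                        float deltaTime, cudaTextureObject_t noiseTex, unsigned int numParticles)
{
    thrust::device_ptr<float4> newPos(dNewPos), newVel(dNewVel);
    thrust::device_ptr<float4> oldPos(dOldPos), oldVel(dOldVel);

    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(newPos, newVel, oldPos, oldVel)),
                     thrust::make_zip_iterator(thrust::make_tuple(newPos + numParticles,
                                                                  newVel + numParticles,
                                                                  oldPos + numParticles,
                                                                  oldVel + numParticles)),
                     integrate_functor(deltaTime, noiseTex));
}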
diff --git a/Samples/5_Domain_Specific/smokeParticles/renderbuffer.cpp b/Samples/5_Domain_Specific/smokeParticles/renderbuffer.cpp
index 4080406e..952b6437 100644
--- a/Samples/5_Domain_Specific/smokeParticles/renderbuffer.cpp
+++ b/Samples/5_Domain_Specific/smokeParticles/renderbuffer.cpp
@@ -50,14 +50,16 @@
  */
 #define HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION
-#include <helper_gl.h>
 #include "renderbuffer.h"
+
+#include <helper_gl.h>
 #include <iostream>

 using namespace std;

 Renderbuffer::Renderbuffer()
     : m_bufId(_CreateBufferId())
-{}
+{
+}

 Renderbuffer::Renderbuffer(GLenum internalFormat, int width, int height)
     : m_bufId(_CreateBufferId())
@@ -65,27 +67,17 @@ Renderbuffer::Renderbuffer(GLenum internalFormat, int width, int height)
     Set(internalFormat, width, height);
 }

-Renderbuffer::~Renderbuffer()
-{
-    glDeleteRenderbuffersEXT(1, &m_bufId);
-}
+Renderbuffer::~Renderbuffer() { glDeleteRenderbuffersEXT(1, &m_bufId); }

-void Renderbuffer::Bind()
-{
-    glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, m_bufId);
-}
+void Renderbuffer::Bind() { glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, m_bufId); }

-void Renderbuffer::Unbind()
-{
-    glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0);
-}
+void Renderbuffer::Unbind() { glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0); }

 void Renderbuffer::Set(GLenum internalFormat, int width, int height)
 {
     int maxSize = Renderbuffer::GetMaxSize();

-    if (width > maxSize || height > maxSize)
-    {
+    if (width > maxSize || height > maxSize) {
         cerr << "Renderbuffer::Renderbuffer() ERROR:\n\t" << "Size too big (" << width << ", " << height << ")\n";
         return;
@@ -95,8 +87,7 @@ void Renderbuffer::Set(GLenum internalFormat, int width, int height)
     GLint savedId = 0;
     glGetIntegerv(GL_RENDERBUFFER_BINDING_EXT, &savedId);

-    if (savedId != (GLint)m_bufId)
-    {
+    if (savedId != (GLint)m_bufId) {
         Bind();
     }

@@ -104,16 +95,12 @@ void Renderbuffer::Set(GLenum internalFormat, int width, int height)
     glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, internalFormat, width, height);

     // Guarded unbind
-    if (savedId != (GLint)m_bufId)
-    {
+    if (savedId != (GLint)m_bufId) {
         glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, savedId);
     }
 }

-GLuint Renderbuffer::GetId() const
-{
-    return m_bufId;
-}
+GLuint Renderbuffer::GetId() const { return m_bufId; }

 GLint Renderbuffer::GetMaxSize()
 {
@@ -128,4 +115,3 @@ GLuint Renderbuffer::_CreateBufferId()
     glGenRenderbuffersEXT(1, &id);
     return id;
 }
-
diff --git a/Samples/5_Domain_Specific/smokeParticles/renderbuffer.h b/Samples/5_Domain_Specific/smokeParticles/renderbuffer.h
index 7e94555d..17bc2204 100644
--- a/Samples/5_Domain_Specific/smokeParticles/renderbuffer.h
+++ b/Samples/5_Domain_Specific/smokeParticles/renderbuffer.h
@@ -85,23 +85,22 @@ Usage Notes:
 */
 class Renderbuffer
 {
- public:
-  /// Ctors/Dtors
-  Renderbuffer();
-  Renderbuffer(GLenum internalFormat, int width, int height);
-  ~Renderbuffer();
+public:
+    /// Ctors/Dtors
+    Renderbuffer();
+    Renderbuffer(GLenum internalFormat, int width, int height);
+    ~Renderbuffer();

-  void Bind();
-  void Unbind();
-  void Set(GLenum internalFormat, int width, int height);
-  GLuint GetId() const;
+    void   Bind();
+    void   Unbind();
+    void   Set(GLenum internalFormat, int width, int height);
+    GLuint GetId() const;

-  static GLint GetMaxSize();
+    static GLint GetMaxSize();

- private:
-  GLuint m_bufId;
-  static GLuint _CreateBufferId();
+private:
+    GLuint        m_bufId;
+    static GLuint _CreateBufferId();
 };

 #endif
-
diff --git a/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity.cu b/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity.cu
index f4a71e9e..e2569d3c 100644
--- a/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity.cu
+++ b/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity.cu
@@ -30,19 +30,20 @@
  */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes, kernels
 #include <cooperative_groups.h>
+
 #include "stereoDisparity_kernel.cuh"

 // includes, project
-#include <helper_functions.h>  // helper for shared that are common to CUDA Samples
-#include <helper_cuda.h>       // helper for checking cuda initialization and error checking
-#include <helper_string.h>     // helper functions for string parsing
+#include <helper_cuda.h>      // helper for checking cuda initialization and error checking
+#include <helper_functions.h> // helper for shared that are common to CUDA Samples
+#include <helper_string.h>    // helper functions for string parsing

 static const char *sSDKsample = "[stereoDisparity]\0";

@@ -55,228 +56,230 @@ void runTest(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  printf("%s Starting...\n\n", sSDKsample);
-  runTest(argc, argv);
+int main(int argc, char **argv)
+{
+    printf("%s Starting...\n\n", sSDKsample);
+    runTest(argc, argv);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! CUDA Sample for calculating depth maps
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
-  cudaDeviceProp deviceProp;
-  deviceProp.major = 0;
-  deviceProp.minor = 0;
-  int dev = 0;
+void runTest(int argc, char **argv)
+{
+    cudaDeviceProp deviceProp;
+    deviceProp.major = 0;
+    deviceProp.minor = 0;
+    int dev          = 0;

-  // This will pick the best possible CUDA capable device
-  dev = findCudaDevice(argc, (const char **)argv);
+    // This will pick the best possible CUDA capable device
+    dev = findCudaDevice(argc, (const char **)argv);

-  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

-  // Statistics about the GPU device
-  printf(
-      "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
-      deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
+    // Statistics about the GPU device
+    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
+           deviceProp.multiProcessorCount,
+           deviceProp.major,
+           deviceProp.minor);

-  StopWatchInterface *timer;
-  sdkCreateTimer(&timer);
+    StopWatchInterface *timer;
+    sdkCreateTimer(&timer);

-  // Search parameters
-  int minDisp = -16;
-  int maxDisp = 0;
+    // Search parameters
+    int minDisp = -16;
+    int maxDisp = 0;

-  // Load image data
-  // allocate mem for the images on host side
-  // initialize pointers to NULL to request lib call to allocate as needed
-  // PPM images are loaded into 4 byte/pixel memory (RGBX)
-  unsigned char *h_img0 = NULL;
-  unsigned char *h_img1 = NULL;
-  unsigned int w, h;
-  char *fname0 = sdkFindFilePath("stereo.im0.640x533.ppm", argv[0]);
-  char *fname1 = sdkFindFilePath("stereo.im1.640x533.ppm", argv[0]);
+    // Load image data
+    // allocate mem for the images on host side
+    // initialize pointers to NULL to request lib call to allocate as needed
+    // PPM images are loaded into 4 byte/pixel memory (RGBX)
+    unsigned char *h_img0 = NULL;
+    unsigned char *h_img1 = NULL;
+    unsigned int   w, h;
+    char          *fname0 = sdkFindFilePath("stereo.im0.640x533.ppm", argv[0]);
+    char          *fname1 = sdkFindFilePath("stereo.im1.640x533.ppm", argv[0]);

-  printf("Loaded <%s> as image 0\n", fname0);
+    printf("Loaded <%s> as image 0\n", fname0);

-  if (!sdkLoadPPM4ub(fname0, &h_img0, &w, &h)) {
-    fprintf(stderr, "Failed to load <%s>\n", fname0);
-  }
+    if (!sdkLoadPPM4ub(fname0, &h_img0, &w, &h)) {
+        fprintf(stderr, "Failed to load <%s>\n", fname0);
+    }

-  printf("Loaded <%s> as image 1\n", fname1);
+    printf("Loaded <%s> as image 1\n", fname1);

-  if (!sdkLoadPPM4ub(fname1, &h_img1, &w, &h)) {
-    fprintf(stderr, "Failed to load <%s>\n", fname1);
-  }
+    if (!sdkLoadPPM4ub(fname1, &h_img1, &w, &h)) {
+        fprintf(stderr, "Failed to load <%s>\n", fname1);
+    }

-  dim3 numThreads = dim3(blockSize_x, blockSize_y, 1);
-  dim3 numBlocks = dim3(iDivUp(w, numThreads.x), iDivUp(h, numThreads.y));
-  unsigned int numData = w * h;
-  unsigned int memSize = sizeof(int) * numData;
+    dim3 numThreads = dim3(blockSize_x, blockSize_y, 1);
+    dim3 numBlocks  = dim3(iDivUp(w, numThreads.x), iDivUp(h, numThreads.y));
+    unsigned int numData = w * h;
+    unsigned int memSize = sizeof(int) * numData;

-  // allocate mem for the result on host side
-  unsigned int *h_odata = (unsigned int *)malloc(memSize);
+    // allocate mem for the result on host side
+    unsigned int *h_odata = (unsigned int *)malloc(memSize);

-  // initialize the memory
-  for (unsigned int i = 0; i < numData; i++) h_odata[i] = 0;
+    // initialize the memory
+    for (unsigned int i = 0; i < numData; i++)
+        h_odata[i] = 0;
-  // allocate device memory for result
-  unsigned int *d_odata, *d_img0, *d_img1;
+    // allocate device memory for result
+    unsigned int *d_odata, *d_img0, *d_img1;

-  checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
-  checkCudaErrors(cudaMalloc((void **)&d_img0, memSize));
-  checkCudaErrors(cudaMalloc((void **)&d_img1, memSize));
+    checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
+    checkCudaErrors(cudaMalloc((void **)&d_img0, memSize));
+    checkCudaErrors(cudaMalloc((void **)&d_img1, memSize));

-  // copy host memory to device to initialize to zeros
-  checkCudaErrors(cudaMemcpy(d_img0, h_img0, memSize, cudaMemcpyHostToDevice));
-  checkCudaErrors(cudaMemcpy(d_img1, h_img1, memSize, cudaMemcpyHostToDevice));
-  checkCudaErrors(
-      cudaMemcpy(d_odata, h_odata, memSize, cudaMemcpyHostToDevice));
+    // copy host memory to device to initialize to zeros
+    checkCudaErrors(cudaMemcpy(d_img0, h_img0, memSize, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_img1, h_img1, memSize, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_odata, h_odata, memSize, cudaMemcpyHostToDevice));

-  cudaChannelFormatDesc ca_desc0 = cudaCreateChannelDesc<unsigned int>();
-  cudaChannelFormatDesc ca_desc1 = cudaCreateChannelDesc<unsigned int>();
+    cudaChannelFormatDesc ca_desc0 = cudaCreateChannelDesc<unsigned int>();
+    cudaChannelFormatDesc ca_desc1 = cudaCreateChannelDesc<unsigned int>();

-  cudaTextureObject_t tex2Dleft, tex2Dright;
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    cudaTextureObject_t tex2Dleft, tex2Dright;
+    cudaResourceDesc    texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));

-  texRes.resType = cudaResourceTypePitch2D;
-  texRes.res.pitch2D.devPtr = d_img0;
-  texRes.res.pitch2D.desc = ca_desc0;
-  texRes.res.pitch2D.width = w;
-  texRes.res.pitch2D.height = h;
-  texRes.res.pitch2D.pitchInBytes = w * 4;
+    texRes.resType                  = cudaResourceTypePitch2D;
+    texRes.res.pitch2D.devPtr       = d_img0;
+    texRes.res.pitch2D.desc         = ca_desc0;
+    texRes.res.pitch2D.width        = w;
+    texRes.res.pitch2D.height       = h;
+    texRes.res.pitch2D.pitchInBytes = w * 4;

-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));

-  texDescr.normalizedCoords = false;
-  texDescr.filterMode = cudaFilterModePoint;
-  texDescr.addressMode[0] = cudaAddressModeClamp;
-  texDescr.addressMode[1] = cudaAddressModeClamp;
-  texDescr.readMode = cudaReadModeElementType;
+    texDescr.normalizedCoords = false;
+    texDescr.filterMode       = cudaFilterModePoint;
+    texDescr.addressMode[0]   = cudaAddressModeClamp;
+    texDescr.addressMode[1]   = cudaAddressModeClamp;
+    texDescr.readMode         = cudaReadModeElementType;

-  checkCudaErrors(
-      cudaCreateTextureObject(&tex2Dleft, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&tex2Dleft, &texRes, &texDescr, NULL));

-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    memset(&texRes, 0, sizeof(cudaResourceDesc));

-  texRes.resType = cudaResourceTypePitch2D;
-  texRes.res.pitch2D.devPtr = d_img1;
-  texRes.res.pitch2D.desc = ca_desc1;
-  texRes.res.pitch2D.width = w;
-  texRes.res.pitch2D.height = h;
-  texRes.res.pitch2D.pitchInBytes = w * 4;
+    texRes.resType                  = cudaResourceTypePitch2D;
+    texRes.res.pitch2D.devPtr       = d_img1;
+    texRes.res.pitch2D.desc         = ca_desc1;
+    texRes.res.pitch2D.width        = w;
+    texRes.res.pitch2D.height       = h;
+    texRes.res.pitch2D.pitchInBytes = w * 4;

-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));
-  texDescr.normalizedCoords = false;
-  texDescr.filterMode = cudaFilterModePoint;
-  texDescr.addressMode[0] = cudaAddressModeClamp;
-  texDescr.addressMode[1] = cudaAddressModeClamp;
-  texDescr.readMode = cudaReadModeElementType;
+    texDescr.normalizedCoords = false;
+    texDescr.filterMode       = cudaFilterModePoint;
+    texDescr.addressMode[0]   = cudaAddressModeClamp;
+    texDescr.addressMode[1]   = cudaAddressModeClamp;
+    texDescr.readMode         = cudaReadModeElementType;

-  checkCudaErrors(
-      cudaCreateTextureObject(&tex2Dright, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&tex2Dright, &texRes, &texDescr, NULL));

-  // First run the warmup kernel (which we'll use to get the GPU in the correct
-  // max power state
-  stereoDisparityKernel<<<numBlocks, numThreads>>>(
-      d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);
-  cudaDeviceSynchronize();
+    // First run the warmup kernel (which we'll use to get the GPU in the correct
+    // max power state
+    stereoDisparityKernel<<<numBlocks, numThreads>>>(
+        d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);
+    cudaDeviceSynchronize();

-  // Allocate CUDA events that we'll use for timing
-  cudaEvent_t start, stop;
-  checkCudaErrors(cudaEventCreate(&start));
-  checkCudaErrors(cudaEventCreate(&stop));
+    // Allocate CUDA events that we'll use for timing
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));

-  printf("Launching CUDA stereoDisparityKernel()\n");
+    printf("Launching CUDA stereoDisparityKernel()\n");

-  // Record the start event
-  checkCudaErrors(cudaEventRecord(start, NULL));
+    // Record the start event
+    checkCudaErrors(cudaEventRecord(start, NULL));

-  // launch the stereoDisparity kernel
-  stereoDisparityKernel<<<numBlocks, numThreads>>>(
-      d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);
+    // launch the stereoDisparity kernel
+    stereoDisparityKernel<<<numBlocks, numThreads>>>(
+        d_img0, d_img1, d_odata, w, h, minDisp, maxDisp, tex2Dleft, tex2Dright);

-  // Record the stop event
-  checkCudaErrors(cudaEventRecord(stop, NULL));
+    // Record the stop event
+    checkCudaErrors(cudaEventRecord(stop, NULL));

-  // Wait for the stop event to complete
-  checkCudaErrors(cudaEventSynchronize(stop));
+    // Wait for the stop event to complete
+    checkCudaErrors(cudaEventSynchronize(stop));

-  // Check to make sure the kernel didn't fail
-  getLastCudaError("Kernel execution failed");
+    // Check to make sure the kernel didn't fail
+    getLastCudaError("Kernel execution failed");

-  float msecTotal = 0.0f;
-  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
+    float msecTotal = 0.0f;
+    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

-  // Copy result from device to host for verification
-  checkCudaErrors(
-      cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));
+    // Copy result from device to host for verification
+    checkCudaErrors(cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost));

-  printf("Input Size [%dx%d], ", w, h);
-  printf("Kernel size [%dx%d], ", (2 * RAD + 1), (2 * RAD + 1));
-  printf("Disparities [%d:%d]\n", minDisp, maxDisp);
+    printf("Input Size [%dx%d], ", w, h);
+    printf("Kernel size [%dx%d], ", (2 * RAD + 1), (2 * RAD + 1));
+    printf("Disparities [%d:%d]\n", minDisp, maxDisp);

-  printf("GPU processing time : %.4f (ms)\n", msecTotal);
-  printf("Pixel throughput : %.3f Mpixels/sec\n",
-         ((float)(w * h * 1000.f) / msecTotal) / 1000000);
+    printf("GPU processing time : %.4f (ms)\n", msecTotal);
+    printf("Pixel throughput : %.3f Mpixels/sec\n", ((float)(w * h * 1000.f) / msecTotal) / 1000000);
-  // calculate sum of resultant GPU image
-  unsigned int checkSum = 0;
+    // calculate sum of resultant GPU image
+    unsigned int checkSum = 0;

-  for (unsigned int i = 0; i < w * h; i++) {
-    checkSum += h_odata[i];
-  }
+    for (unsigned int i = 0; i < w * h; i++) {
+        checkSum += h_odata[i];
+    }

-  printf("GPU Checksum = %u, ", checkSum);
+    printf("GPU Checksum = %u, ", checkSum);

-  // write out the resulting disparity image.
-  unsigned char *dispOut = (unsigned char *)malloc(numData);
-  int mult = 20;
-  const char *fnameOut = "output_GPU.pgm";
+    // write out the resulting disparity image.
+    unsigned char *dispOut  = (unsigned char *)malloc(numData);
+    int            mult     = 20;
+    const char    *fnameOut = "output_GPU.pgm";

-  for (unsigned int i = 0; i < numData; i++) {
-    dispOut[i] = (int)h_odata[i] * mult;
-  }
+    for (unsigned int i = 0; i < numData; i++) {
+        dispOut[i] = (int)h_odata[i] * mult;
+    }

-  printf("GPU image: <%s>\n", fnameOut);
-  sdkSavePGM(fnameOut, dispOut, w, h);
+    printf("GPU image: <%s>\n", fnameOut);
+    sdkSavePGM(fnameOut, dispOut, w, h);

-  // compute reference solution
-  printf("Computing CPU reference...\n");
-  cpu_gold_stereo((unsigned int *)h_img0, (unsigned int *)h_img1,
-                  (unsigned int *)h_odata, w, h, minDisp, maxDisp);
-  unsigned int cpuCheckSum = 0;
+    // compute reference solution
+    printf("Computing CPU reference...\n");
+    cpu_gold_stereo((unsigned int *)h_img0, (unsigned int *)h_img1, (unsigned int *)h_odata, w, h, minDisp, maxDisp);
+    unsigned int cpuCheckSum = 0;

-  for (unsigned int i = 0; i < w * h; i++) {
-    cpuCheckSum += h_odata[i];
-  }
+    for (unsigned int i = 0; i < w * h; i++) {
+        cpuCheckSum += h_odata[i];
+    }

-  printf("CPU Checksum = %u, ", cpuCheckSum);
-  const char *cpuFnameOut = "output_CPU.pgm";
+    printf("CPU Checksum = %u, ", cpuCheckSum);
+    const char *cpuFnameOut = "output_CPU.pgm";

-  for (unsigned int i = 0; i < numData; i++) {
-    dispOut[i] = (int)h_odata[i] * mult;
-  }
+    for (unsigned int i = 0; i < numData; i++) {
+        dispOut[i] = (int)h_odata[i] * mult;
+    }

-  printf("CPU image: <%s>\n", cpuFnameOut);
-  sdkSavePGM(cpuFnameOut, dispOut, w, h);
+    printf("CPU image: <%s>\n", cpuFnameOut);
+    sdkSavePGM(cpuFnameOut, dispOut, w, h);

-  // cleanup memory
-  checkCudaErrors(cudaFree(d_odata));
-  checkCudaErrors(cudaFree(d_img0));
-  checkCudaErrors(cudaFree(d_img1));
+    // cleanup memory
+    checkCudaErrors(cudaFree(d_odata));
+    checkCudaErrors(cudaFree(d_img0));
+    checkCudaErrors(cudaFree(d_img1));

-  if (h_odata != NULL) free(h_odata);
+    if (h_odata != NULL)
+        free(h_odata);

-  if (h_img0 != NULL) free(h_img0);
+    if (h_img0 != NULL)
+        free(h_img0);

-  if (h_img1 != NULL) free(h_img1);
+    if (h_img1 != NULL)
+        free(h_img1);

-  if (dispOut != NULL) free(dispOut);
+    if (dispOut != NULL)
+        free(dispOut);

-  sdkDeleteTimer(&timer);
+    sdkDeleteTimer(&timer);

-  exit((checkSum == cpuCheckSum) ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit((checkSum == cpuCheckSum) ? EXIT_SUCCESS : EXIT_FAILURE);
 }
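The timing pattern in runTest generalizes to any kernel: one untimed warmup launch to bring the GPU to a stable clock state, then cudaEvent records bracketing the measured launch. A compact sketch; myKernel, grid, block, and args are placeholders, only the event API calls are the real pattern:

cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));

myKernel<<<grid, block>>>(args); // untimed warmup launch
checkCudaErrors(cudaDeviceSynchronize());

checkCudaErrors(cudaEventRecord(start, NULL)); // record on the default stream
myKernel<<<grid, block>>>(args);               // timed launch
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));   // wait until the stop event lands

float msec = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));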
diff --git a/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity_kernel.cuh b/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity_kernel.cuh
index 89f9b5f9..3b744f24 100644
--- a/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity_kernel.cuh
+++ b/Samples/5_Domain_Specific/stereoDisparity/stereoDisparity_kernel.cuh
@@ -54,18 +54,17 @@ namespace cg = cooperative_groups;
 // "ptx_isa_3.0K.pdf"
 // included in the NVIDIA GPU Computing Toolkit
 ////////////////////////////////////////////////////////////////////////////////
-__device__ unsigned int __usad4(unsigned int A, unsigned int B,
-                                unsigned int C = 0) {
-  unsigned int result;
+__device__ unsigned int __usad4(unsigned int A, unsigned int B, unsigned int C = 0)
+{
+    unsigned int result;

-  // Kepler (SM 3.x) and higher supports a 4 vector SAD SIMD
-  asm(
-      "vabsdiff4.u32.u32.u32.add"
-      " %0, %1, %2, %3;"
-      : "=r"(result)
-      : "r"(A), "r"(B), "r"(C));
+    // Kepler (SM 3.x) and higher supports a 4 vector SAD SIMD
+    asm("vabsdiff4.u32.u32.u32.add"
+        " %0, %1, %2, %3;"
+        : "=r"(result)
+        : "r"(A), "r"(B), "r"(C));

-  return result;
+    return result;
 }

 ////////////////////////////////////////////////////////////////////////////////
@@ -88,165 +87,178 @@ __device__ unsigned int __usad4(unsigned int A, unsigned int B,
 //! @param minDisparity leftmost search range
 //! @param maxDisparity rightmost search range
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void stereoDisparityKernel(unsigned int *g_img0,
-                                      unsigned int *g_img1,
-                                      unsigned int *g_odata, int w, int h,
-                                      int minDisparity, int maxDisparity,
+__global__ void stereoDisparityKernel(unsigned int       *g_img0,
+                                      unsigned int       *g_img1,
+                                      unsigned int       *g_odata,
+                                      int                 w,
+                                      int                 h,
+                                      int                 minDisparity,
+                                      int                 maxDisparity,
                                       cudaTextureObject_t tex2Dleft,
-                                      cudaTextureObject_t tex2Dright) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  // access thread id
-  const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
-  const int tidy = blockDim.y * blockIdx.y + threadIdx.y;
-  const unsigned int sidx = threadIdx.x + RAD;
-  const unsigned int sidy = threadIdx.y + RAD;
+                                      cudaTextureObject_t tex2Dright)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    // access thread id
+    const int          tidx = blockDim.x * blockIdx.x + threadIdx.x;
+    const int          tidy = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int sidx = threadIdx.x + RAD;
+    const unsigned int sidy = threadIdx.y + RAD;

-  unsigned int imLeft;
-  unsigned int imRight;
-  unsigned int cost;
-  unsigned int bestCost = 9999999;
-  unsigned int bestDisparity = 0;
-  __shared__ unsigned int diff[blockSize_y + 2 * RAD][blockSize_x + 2 * RAD];
+    unsigned int imLeft;
+    unsigned int imRight;
+    unsigned int cost;
+    unsigned int bestCost      = 9999999;
+    unsigned int bestDisparity = 0;
+    __shared__ unsigned int diff[blockSize_y + 2 * RAD][blockSize_x + 2 * RAD];

-  // store needed values for left image into registers (constant indexed local
-  // vars)
-  unsigned int imLeftA[STEPS];
-  unsigned int imLeftB[STEPS];
+    // store needed values for left image into registers (constant indexed local
+    // vars)
+    unsigned int imLeftA[STEPS];
+    unsigned int imLeftB[STEPS];

-  for (int i = 0; i < STEPS; i++) {
-    int offset = -RAD + i * RAD;
-    imLeftA[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD, tidy + offset);
-    imLeftB[i] =
-        tex2D<unsigned int>(tex2Dleft, tidx - RAD + blockSize_x, tidy + offset);
-  }
+    for (int i = 0; i < STEPS; i++) {
+        int offset = -RAD + i * RAD;
+        imLeftA[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD, tidy + offset);
+        imLeftB[i] = tex2D<unsigned int>(tex2Dleft, tidx - RAD + blockSize_x, tidy + offset);
+    }

-  // for a fixed camera system this could be hardcoded and loop unrolled
-  for (int d = minDisparity; d <= maxDisparity; d++) {
+    // for a fixed camera system this could be hardcoded and loop unrolled
+    for (int d = minDisparity; d <= maxDisparity; d++) {
 // LEFT
 #pragma unroll
-    for (int i = 0; i < STEPS; i++) {
-      int offset = -RAD + i * RAD;
-      // imLeft = tex2D( tex2Dleft, tidx-RAD, tidy+offset );
-      imLeft = imLeftA[i];
-      imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + d, tidy + offset);
-      cost = __usad4(imLeft, imRight);
-      diff[sidy + offset][sidx - RAD] = cost;
-    }
+        for (int i = 0; i < STEPS; i++) {
+            int offset = -RAD + i * RAD;
+            // imLeft = tex2D( tex2Dleft, tidx-RAD, tidy+offset );
+            imLeft  = imLeftA[i];
+            imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + d, tidy + offset);
+            cost    = __usad4(imLeft, imRight);
+            diff[sidy + offset][sidx - RAD] = cost;
+        }

 // RIGHT
 #pragma unroll
-    for (int i = 0; i < STEPS; i++) {
-      int offset = -RAD + i * RAD;
+        for (int i = 0; i < STEPS; i++) {
+            int offset = -RAD + i * RAD;

-      if (threadIdx.x < 2 * RAD) {
-        // imLeft = tex2D( tex2Dleft, tidx-RAD+blockSize_x, tidy+offset );
-        imLeft = imLeftB[i];
-        imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + blockSize_x + d,
-                                      tidy + offset);
-        cost = __usad4(imLeft, imRight);
-        diff[sidy + offset][sidx - RAD + blockSize_x] = cost;
-      }
-    }
+            if (threadIdx.x < 2 * RAD) {
+                // imLeft = tex2D( tex2Dleft, tidx-RAD+blockSize_x, tidy+offset );
+                imLeft  = imLeftB[i];
+                imRight = tex2D<unsigned int>(tex2Dright, tidx - RAD + blockSize_x + d, tidy + offset);
+                cost    = __usad4(imLeft, imRight);
+                diff[sidy + offset][sidx - RAD + blockSize_x] = cost;
+            }
+        }

-    cg::sync(cta);
+        cg::sync(cta);

 // sum cost horizontally
 #pragma unroll
-    for (int j = 0; j < STEPS; j++) {
-      int offset = -RAD + j * RAD;
-      cost = 0;
+        for (int j = 0; j < STEPS; j++) {
+            int offset = -RAD + j * RAD;
+            cost       = 0;
 #pragma unroll
-      for (int i = -RAD; i <= RAD; i++) {
-        cost += diff[sidy + offset][sidx + i];
-      }
-
-      cg::sync(cta);
-      diff[sidy + offset][sidx] = cost;
-      cg::sync(cta);
-    }
-
-    // sum cost vertically
-    cost = 0;
-#pragma unroll
-
-    for (int i = -RAD; i <= RAD; i++) {
-      cost += diff[sidy + i][sidx];
-    }
-
-    // see if it is better or not
-    if (cost < bestCost) {
-      bestCost = cost;
-      bestDisparity = d + 8;
-    }
-
-    cg::sync(cta);
-  }
-
-  if (tidy < h && tidx < w) {
-    g_odata[tidy * w + tidx] = bestDisparity;
-  }
-}
-
-void cpu_gold_stereo(unsigned int *img0, unsigned int *img1,
-                     unsigned int *odata, int w, int h, int minDisparity,
-                     int maxDisparity) {
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      unsigned int bestCost = 9999999;
-      unsigned int bestDisparity = 0;
-
-      for (int d = minDisparity; d <= maxDisparity; d++) {
-        unsigned int cost = 0;
-
-        for (int i = -RAD; i <= RAD; i++) {
-          for (int j = -RAD; j <= RAD; j++) {
-            // border clamping
-            int yy, xx, xxd;
-            yy = y + i;
-
-            if (yy < 0) yy = 0;
-
-            if (yy >= h) yy = h - 1;
-
-            xx = x + j;
-
-            if (xx < 0) xx = 0;
-
-            if (xx >= w) xx = w - 1;
-
-            xxd = x + j + d;
-
-            if (xxd < 0) xxd = 0;
-
-            if (xxd >= w) xxd = w - 1;
-
-            // sum abs diff across components
-            unsigned char *A = (unsigned char *)&img0[yy * w + xx];
-            unsigned char *B = (unsigned char *)&img1[yy * w + xxd];
-            unsigned int absdiff = 0;
-
-            for (int k = 0; k < 4; k++) {
-              absdiff += abs((int)(A[k] - B[k]));
+            for (int i = -RAD; i <= RAD; i++) {
+                cost += diff[sidy + offset][sidx + i];
             }

-            cost += absdiff;
-          }
+            cg::sync(cta);
+            diff[sidy + offset][sidx] = cost;
+            cg::sync(cta);
         }

+        // sum cost vertically
+        cost = 0;
+#pragma unroll
+
+        for (int i = -RAD; i <= RAD; i++) {
+            cost += diff[sidy + i][sidx];
+        }
+
+        // see if it is better or not
         if (cost < bestCost) {
-          bestCost = cost;
-          bestDisparity = d + 8;
+            bestCost      = cost;
+            bestDisparity = d + 8;
         }

-      }  // end for disparities
-
-      // store to best disparity
-      odata[y * w + x] = bestDisparity;
+        cg::sync(cta);
+    }
+
+    if (tidy < h && tidx < w) {
+        g_odata[tidy * w + tidx] = bestDisparity;
     }
-  }
 }
-#endif  // #ifndef _STEREODISPARITY_KERNEL_H_
+
+void cpu_gold_stereo(unsigned int *img0,
+                     unsigned int *img1,
+                     unsigned int *odata,
+                     int           w,
+                     int           h,
+                     int           minDisparity,
+                     int           maxDisparity)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            unsigned int bestCost      = 9999999;
+            unsigned int bestDisparity = 0;
+
+            for (int d = minDisparity; d <= maxDisparity; d++) {
+                unsigned int cost = 0;
+
+                for (int i = -RAD; i <= RAD; i++) {
+                    for (int j = -RAD; j <= RAD; j++) {
+                        // border clamping
+                        int yy, xx, xxd;
+                        yy = y + i;
+
+                        if (yy < 0)
+                            yy = 0;
+
+                        if (yy >= h)
+                            yy = h - 1;
+
+                        xx = x + j;
+
+                        if (xx < 0)
+                            xx = 0;
+
+                        if (xx >= w)
+                            xx = w - 1;
+
+                        xxd = x + j + d;
+
+                        if (xxd < 0)
+                            xxd = 0;
+
+                        if (xxd >= w)
+                            xxd = w - 1;
+
+                        // sum abs diff across components
+                        unsigned char *A       = (unsigned char *)&img0[yy * w + xx];
+                        unsigned char *B       = (unsigned char *)&img1[yy * w + xxd];
+                        unsigned int   absdiff = 0;
+
+                        for (int k = 0; k < 4; k++) {
+                            absdiff += abs((int)(A[k] - B[k]));
+                        }
+
+                        cost += absdiff;
+                    }
+                }
+
+                if (cost < bestCost) {
+                    bestCost      = cost;
+                    bestDisparity = d + 8;
+                }
+
+            } // end for disparities
+
+            // store to best disparity
+            odata[y * w + x] = bestDisparity;
+        }
+    }
+}
+#endif // #ifndef _STEREODISPARITY_KERNEL_H_
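For readers tracing the kernel against cpu_gold_stereo: the vabsdiff4 instruction wrapped by __usad4 computes a packed-byte sum of absolute differences, exactly what the CPU path reproduces byte by byte over the RGBX pixel. A scalar reference version (a hypothetical helper, not part of the sample):

// Treats each 32-bit word as four unsigned bytes and accumulates |a_k - b_k|
// on top of C, mirroring vabsdiff4.u32.u32.u32.add.
__host__ __device__ unsigned int usad4_reference(unsigned int A, unsigned int B, unsigned int C = 0)
{
    unsigned int sum = C;

    for (int k = 0; k < 4; k++) {
        unsigned int a = (A >> (8 * k)) & 0xff;
        unsigned int b = (B >> (8 * k)) & 0xff;
        sum += (a > b) ? (a - b) : (b - a);
    }

    return sum;
}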
diff --git a/Samples/5_Domain_Specific/volumeFiltering/README.md b/Samples/5_Domain_Specific/volumeFiltering/README.md
index 10bd00e0..96af932f 100644
--- a/Samples/5_Domain_Specific/volumeFiltering/README.md
+++ b/Samples/5_Domain_Specific/volumeFiltering/README.md
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
diff --git a/Samples/5_Domain_Specific/volumeFiltering/volume.cpp b/Samples/5_Domain_Specific/volumeFiltering/volume.cpp
index 221d6ac7..eb574592 100644
--- a/Samples/5_Domain_Specific/volumeFiltering/volume.cpp
+++ b/Samples/5_Domain_Specific/volumeFiltering/volume.cpp
@@ -31,62 +31,60 @@

 // Helper functions
 #include <helper_cuda.h>
 #include <helper_math.h>
+
 #include "volume.h"

-void Volume_init(Volume *vol, cudaExtent dataSize, void *h_data,
-                 int allowStore) {
-  // create 3D array
-  vol->channelDesc = cudaCreateChannelDesc<VolumeType>();
-  checkCudaErrors(
-      cudaMalloc3DArray(&vol->content, &vol->channelDesc, dataSize,
-                        allowStore ? cudaArraySurfaceLoadStore : 0));
-  vol->size = dataSize;
+void Volume_init(Volume *vol, cudaExtent dataSize, void *h_data, int allowStore)
+{
+    // create 3D array
+    vol->channelDesc = cudaCreateChannelDesc<VolumeType>();
+    checkCudaErrors(
+        cudaMalloc3DArray(&vol->content, &vol->channelDesc, dataSize, allowStore ? cudaArraySurfaceLoadStore : 0));
+    vol->size = dataSize;

-  if (h_data) {
-    // copy data to 3D array
-    cudaMemcpy3DParms copyParams = {0};
-    copyParams.srcPtr =
-        make_cudaPitchedPtr(h_data, dataSize.width * sizeof(VolumeType),
-                            dataSize.width, dataSize.height);
-    copyParams.dstArray = vol->content;
-    copyParams.extent = dataSize;
-    copyParams.kind = cudaMemcpyHostToDevice;
-    checkCudaErrors(cudaMemcpy3D(&copyParams));
-  }
+    if (h_data) {
+        // copy data to 3D array
+        cudaMemcpy3DParms copyParams = {0};
+        copyParams.srcPtr =
+            make_cudaPitchedPtr(h_data, dataSize.width * sizeof(VolumeType), dataSize.width, dataSize.height);
+        copyParams.dstArray = vol->content;
+        copyParams.extent   = dataSize;
+        copyParams.kind     = cudaMemcpyHostToDevice;
+        checkCudaErrors(cudaMemcpy3D(&copyParams));
+    }

-  if (allowStore) {
-    cudaResourceDesc surfRes;
-    memset(&surfRes, 0, sizeof(cudaResourceDesc));
-    surfRes.resType = cudaResourceTypeArray;
-    surfRes.res.array.array = vol->content;
+    if (allowStore) {
+        cudaResourceDesc surfRes;
+        memset(&surfRes, 0, sizeof(cudaResourceDesc));
+        surfRes.resType         = cudaResourceTypeArray;
+        surfRes.res.array.array = vol->content;

-    checkCudaErrors(cudaCreateSurfaceObject(&vol->volumeSurf, &surfRes));
-  }
+        checkCudaErrors(cudaCreateSurfaceObject(&vol->volumeSurf, &surfRes));
+    }

-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));

-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = vol->content;
+    texRes.resType         = cudaResourceTypeArray;
+    texRes.res.array.array = vol->content;

-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));

-  texDescr.normalizedCoords = true;
-  texDescr.filterMode = cudaFilterModeLinear;
-  texDescr.addressMode[0] = cudaAddressModeWrap;
-  texDescr.addressMode[1] = cudaAddressModeWrap;
-  texDescr.addressMode[2] = cudaAddressModeWrap;
-  texDescr.readMode =
-      cudaReadModeNormalizedFloat; // VolumeTypeInfo<VolumeType>::readMode;
+    texDescr.normalizedCoords = true;
+    texDescr.filterMode       = cudaFilterModeLinear;
+    texDescr.addressMode[0]   = cudaAddressModeWrap;
+    texDescr.addressMode[1]   = cudaAddressModeWrap;
+    texDescr.addressMode[2]   = cudaAddressModeWrap;
+    texDescr.readMode         = cudaReadModeNormalizedFloat; // VolumeTypeInfo<VolumeType>::readMode;

-  checkCudaErrors(
-      cudaCreateTextureObject(&vol->volumeTex, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&vol->volumeTex, &texRes, &texDescr, NULL));
 }

-void Volume_deinit(Volume *vol) {
-  checkCudaErrors(cudaDestroyTextureObject(vol->volumeTex));
-  checkCudaErrors(cudaDestroySurfaceObject(vol->volumeSurf));
-  checkCudaErrors(cudaFreeArray(vol->content));
-  vol->content = 0;
+void Volume_deinit(Volume *vol)
+{
+    checkCudaErrors(cudaDestroyTextureObject(vol->volumeTex));
+    checkCudaErrors(cudaDestroySurfaceObject(vol->volumeSurf));
+    checkCudaErrors(cudaFreeArray(vol->content));
+    vol->content = 0;
 }
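Volume_init wires one cudaArray to both a texture object (for filtered reads) and, when allowStore is set, a surface object (for writes). A short usage sketch under the declared API; the sizes and data are illustrative, not taken from the sample:

// Upload a 32^3 voxel volume readable through volIn.volumeTex, and create a
// surface-writable scratch volume for filter output.
unsigned char h_data[32 * 32 * 32] = {0}; // illustrative host data
Volume        volIn, volOut;

Volume_init(&volIn, make_cudaExtent(32, 32, 32), h_data, 0); // texture only
Volume_init(&volOut, make_cudaExtent(32, 32, 32), NULL, 1);  // texture + surface store

// ... launch kernels that sample volIn.volumeTex and surf3Dwrite to volOut.volumeSurf ...

Volume_deinit(&volOut);
Volume_deinit(&volIn);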
diff --git a/Samples/5_Domain_Specific/volumeFiltering/volume.h b/Samples/5_Domain_Specific/volumeFiltering/volume.h
index a9e60a17..eac6c1ba 100644
--- a/Samples/5_Domain_Specific/volumeFiltering/volume.h
+++ b/Samples/5_Domain_Specific/volumeFiltering/volume.h
@@ -32,18 +32,20 @@

 typedef unsigned char VolumeType;

-extern "C" {
+extern "C"
+{

-struct Volume {
-  cudaArray *content;
-  cudaExtent size;
-  cudaChannelFormatDesc channelDesc;
-  cudaTextureObject_t volumeTex;
-  cudaSurfaceObject_t volumeSurf;
-};
+    struct Volume
+    {
+        cudaArray            *content;
+        cudaExtent            size;
+        cudaChannelFormatDesc channelDesc;
+        cudaTextureObject_t   volumeTex;
+        cudaSurfaceObject_t   volumeSurf;
+    };

-void Volume_init(Volume *vol, cudaExtent size, void *data, int allowStore);
-void Volume_deinit(Volume *vol);
+    void Volume_init(Volume *vol, cudaExtent size, void *data, int allowStore);
+    void Volume_deinit(Volume *vol);
 };

 //////////////////////////////////////////////////////////////////////////
@@ -53,29 +55,32 @@ void Volume_deinit(Volume *vol);

 /*
   Helper class to do popular integer storage to float conversions if required
 */
-template <typename T>
-struct VolumeTypeInfo {};
-
-template <>
-struct VolumeTypeInfo<unsigned char> {
-  static const cudaTextureReadMode readMode = cudaReadModeNormalizedFloat;
-  static __inline__ __device__ unsigned char convert(float sampled) {
-    return (unsigned char)(__saturatef(sampled) * 255.0);
-  }
+template <typename T> struct VolumeTypeInfo
+{
 };

-template <>
-struct VolumeTypeInfo<unsigned short> {
-  static const cudaTextureReadMode readMode = cudaReadModeNormalizedFloat;
-  static __inline__ __device__ unsigned short convert(float sampled) {
-    return (unsigned short)(__saturatef(sampled) * 65535.0);
-  }
+template <> struct VolumeTypeInfo<unsigned char>
+{
+    static const cudaTextureReadMode readMode = cudaReadModeNormalizedFloat;
+    static __inline__ __device__ unsigned char convert(float sampled)
+    {
+        return (unsigned char)(__saturatef(sampled) * 255.0);
+    }
 };

-template <>
-struct VolumeTypeInfo<float> {
-  static const cudaTextureReadMode readMode = cudaReadModeElementType;
-  static __inline__ __device__ float convert(float sampled) { return sampled; }
+template <> struct VolumeTypeInfo<unsigned short>
+{
+    static const cudaTextureReadMode readMode = cudaReadModeNormalizedFloat;
+    static __inline__ __device__ unsigned short convert(float sampled)
+    {
+        return (unsigned short)(__saturatef(sampled) * 65535.0);
+    }
+};
+
+template <> struct VolumeTypeInfo<float>
+{
+    static const cudaTextureReadMode readMode = cudaReadModeElementType;
+    static __inline__ __device__ float convert(float sampled) { return sampled; }
 };

 #endif
diff --git a/Samples/5_Domain_Specific/volumeFiltering/volumeFilter.h b/Samples/5_Domain_Specific/volumeFiltering/volumeFilter.h
index 76ee5a3d..4244ab82 100644
--- a/Samples/5_Domain_Specific/volumeFiltering/volumeFilter.h
+++ b/Samples/5_Domain_Specific/volumeFiltering/volumeFilter.h
@@ -31,12 +31,18 @@

 #define VOLUMEFILTER_MAXWEIGHTS 125

 #include <cuda_runtime.h>
+
 #include "volume.h"

-extern "C" {
-Volume *VolumeFilter_runFilter(Volume *input, Volume *output0, Volume *output1,
-                               int iterations, int numWeights, float4 *weights,
-                               float postWeightOffset);
+extern "C"
+{
+    Volume *VolumeFilter_runFilter(Volume *input,
+                                   Volume *output0,
+                                   Volume *output1,
+                                   int     iterations,
+                                   int     numWeights,
+                                   float4 *weights,
+                                   float   postWeightOffset);
 };

 #endif
diff --git a/Samples/5_Domain_Specific/volumeFiltering/volumeFilter_kernel.cu b/Samples/5_Domain_Specific/volumeFiltering/volumeFilter_kernel.cu
index 4fc384ce..0a3b86b2 100644
--- a/Samples/5_Domain_Specific/volumeFiltering/volumeFilter_kernel.cu
+++ b/Samples/5_Domain_Specific/volumeFiltering/volumeFilter_kernel.cu
@@ -30,86 +30,88 @@

 #include <helper_cuda.h>
 #include <helper_math.h>
+
 #include "volumeFilter.h"

-typedef unsigned int uint;
-typedef unsigned char uchar;
+typedef unsigned int   uint;
+typedef unsigned char  uchar;
 typedef unsigned short ushort;

 __constant__ float4 c_filterData[VOLUMEFILTER_MAXWEIGHTS];

-__global__ void d_filter_surface3d(int filterSize, float filter_offset,
-                                   cudaExtent volumeSize,
+__global__ void d_filter_surface3d(int                 filterSize,
+                                   float               filter_offset,
cudaExtent volumeSize, cudaTextureObject_t volumeTexIn, - cudaSurfaceObject_t volumeTexOut) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - int z = blockIdx.z * blockDim.z + threadIdx.z; + cudaSurfaceObject_t volumeTexOut) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int z = blockIdx.z * blockDim.z + threadIdx.z; - if (x >= volumeSize.width || y >= volumeSize.height || - z >= volumeSize.depth) { - return; - } - - float filtered = 0; - float4 basecoord = make_float4(x, y, z, 0); - - for (int i = 0; i < filterSize; i++) { - float4 coord = basecoord + c_filterData[i]; - filtered += tex3D<float>(volumeTexIn, coord.x, coord.y, coord.z) * - c_filterData[i].w; - } - - filtered += filter_offset; - - VolumeType output = VolumeTypeInfo<VolumeType>::convert(filtered); - - // surface writes need byte offsets for x! - surf3Dwrite(output, volumeTexOut, x * sizeof(VolumeType), y, z); -} - -static unsigned int iDivUp(size_t a, size_t b) { - size_t val = (a % b != 0) ? (a / b + 1) : (a / b); - if (val > UINT_MAX) { - fprintf(stderr, "\nUINT_MAX limit exceeded in iDivUp() exiting.....\n"); - exit(EXIT_FAILURE); // val exceeds limit - } - - return static_cast<unsigned int>(val); -} - -extern "C" Volume *VolumeFilter_runFilter(Volume *input, Volume *output0, - Volume *output1, int iterations, - int numWeights, float4 *weights, - float postWeightOffset) { - Volume *swap = 0; - cudaExtent size = input->size; - unsigned int dim = 32 / sizeof(VolumeType); - dim3 blockSize(dim, dim, 1); - dim3 gridSize(iDivUp(size.width, blockSize.x), - iDivUp(size.height, blockSize.y), - iDivUp(size.depth, blockSize.z)); - - // set weights - checkCudaErrors( - cudaMemcpyToSymbol(c_filterData, weights, sizeof(float4) * numWeights)); - - for (int i = 0; i < iterations; i++) { - d_filter_surface3d<<<gridSize, blockSize>>>(numWeights, postWeightOffset, - size, input->volumeTex, - output0->volumeSurf); - - getLastCudaError("filter kernel failed"); - - swap = input; - input = output0; - output0 = swap; - - if (i == 0) { - output0 = output1; + if (x >= volumeSize.width || y >= volumeSize.height || z >= volumeSize.depth) { + return; } - } - return input; + float filtered = 0; + float4 basecoord = make_float4(x, y, z, 0); + + for (int i = 0; i < filterSize; i++) { + float4 coord = basecoord + c_filterData[i]; + filtered += tex3D<float>(volumeTexIn, coord.x, coord.y, coord.z) * c_filterData[i].w; + } + + filtered += filter_offset; + + VolumeType output = VolumeTypeInfo<VolumeType>::convert(filtered); + + // surface writes need byte offsets for x! + surf3Dwrite(output, volumeTexOut, x * sizeof(VolumeType), y, z); +} + +static unsigned int iDivUp(size_t a, size_t b) +{ + size_t val = (a % b != 0) ? 
(a / b + 1) : (a / b); + if (val > UINT_MAX) { + fprintf(stderr, "\nUINT_MAX limit exceeded in iDivUp() exiting.....\n"); + exit(EXIT_FAILURE); // val exceeds limit + } + + return static_cast<unsigned int>(val); +} + +extern "C" Volume *VolumeFilter_runFilter(Volume *input, + Volume *output0, + Volume *output1, + int iterations, + int numWeights, + float4 *weights, + float postWeightOffset) +{ + Volume *swap = 0; + cudaExtent size = input->size; + unsigned int dim = 32 / sizeof(VolumeType); + dim3 blockSize(dim, dim, 1); + dim3 gridSize(iDivUp(size.width, blockSize.x), iDivUp(size.height, blockSize.y), iDivUp(size.depth, blockSize.z)); + + // set weights + checkCudaErrors(cudaMemcpyToSymbol(c_filterData, weights, sizeof(float4) * numWeights)); + + for (int i = 0; i < iterations; i++) { + d_filter_surface3d<<<gridSize, blockSize>>>( + numWeights, postWeightOffset, size, input->volumeTex, output0->volumeSurf); + + getLastCudaError("filter kernel failed"); + + swap = input; + input = output0; + output0 = swap; + + if (i == 0) { + output0 = output1; + } + } + + return input; } #endif diff --git a/Samples/5_Domain_Specific/volumeFiltering/volumeFiltering.cpp b/Samples/5_Domain_Specific/volumeFiltering/volumeFiltering.cpp index 0218bf09..0924031d 100644 --- a/Samples/5_Domain_Specific/volumeFiltering/volumeFiltering.cpp +++ b/Samples/5_Domain_Specific/volumeFiltering/volumeFiltering.cpp @@ -26,19 +26,19 @@ // OpenGL Graphics includes #include -#if defined (__APPLE__) || defined(MACOSX) - #pragma clang diagnostic ignored "-Wdeprecated-declarations" - #include - #ifndef glutCloseFunc - #define glutCloseFunc glutWMCloseFunc - #endif +#if defined(__APPLE__) || defined(MACOSX) +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#include +#ifndef glutCloseFunc +#define glutCloseFunc glutWMCloseFunc +#endif #else #include #endif // CUDA Runtime and Interop -#include #include +#include // Helper functions #include @@ -47,7 +47,7 @@ // CUDA utilities and system includes #include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; #define MAX_EPSILON_ERROR 5.00f @@ -61,7 +61,7 @@ const char *sSDKsample = "CUDA 3D Volume Filtering"; #include "volumeRender.h" const char *volumeFilename = "Bucky.raw"; -cudaExtent volumeSize = make_cudaExtent(32, 32, 32); +cudaExtent volumeSize = make_cudaExtent(32, 32, 32); uint width = 512, height = 512; dim3 blockSize(16, 16); @@ -69,45 +69,45 @@ dim3 gridSize; float3 viewRotation; float3 viewTranslation = make_float3(0.0, 0.0, -4.0f); -float invViewMatrix[12]; +float invViewMatrix[12]; -float density = 0.05f; -float brightness = 1.0f; -float transferOffset = 0.0f; -float transferScale = 1.0f; -bool linearFiltering = true; -bool preIntegrated = true; -StopWatchInterface *animationTimer = NULL; +float density = 0.05f; +float brightness = 1.0f; +float transferOffset = 0.0f; +float transferScale = 1.0f; +bool linearFiltering = true; +bool preIntegrated = true; +StopWatchInterface *animationTimer = NULL; -float filterFactor = 0.0f; -bool filterAnimation = true; -int filterIterations = 2; -float filterTimeScale = 0.001f; -float filterBias = 0.0f; -float4 filterWeights[3*3*3]; +float filterFactor = 0.0f; +bool filterAnimation = true; +int filterIterations = 2; +float filterTimeScale = 0.001f; +float filterBias = 0.0f; +float4 filterWeights[3 * 3 * 3]; -Volume volumeOriginal; -Volume volumeFilter0; -Volume volumeFilter1; +Volume volumeOriginal; +Volume volumeFilter0; +Volume volumeFilter1; -GLuint pbo = 0; // OpenGL pixel buffer object -GLuint volumeTex = 0; // OpenGL 
texture object +GLuint pbo = 0; // OpenGL pixel buffer object +GLuint volumeTex = 0; // OpenGL texture object struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) StopWatchInterface *timer = 0; // Auto-Verification Code -const int frameCheckNumber = 2; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -unsigned int frameCount = 0; -unsigned int g_TotalErrors = 0; +const int frameCheckNumber = 2; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; -int *pArgc; +int *pArgc; char **pArgv; -#define MAX(a,b) ((a > b) ? a : b) +#define MAX(a, b) ((a > b) ? a : b) ////////////////////////////////////////////////////////////////////////// // QA RELATED @@ -117,9 +117,8 @@ void computeFPS() frameCount++; fpsCount++; - if (fpsCount == fpsLimit) - { - char fps[256]; + if (fpsCount == fpsLimit) { + char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "CUDA 3D Volume Filtering: %3.1f fps", ifps); @@ -134,78 +133,48 @@ void computeFPS() ////////////////////////////////////////////////////////////////////////// // 3D FILTER -static float filteroffsets[3*3*3][3] = -{ - {-1,-1,-1},{ 0,-1,-1},{ 1,-1,-1}, - {-1, 0,-1},{ 0, 0,-1},{ 1, 0,-1}, - {-1, 1,-1},{ 0, 1,-1},{ 1, 1,-1}, +static float filteroffsets[3 * 3 * 3][3] = { + {-1, -1, -1}, {0, -1, -1}, {1, -1, -1}, {-1, 0, -1}, {0, 0, -1}, {1, 0, -1}, {-1, 1, -1}, {0, 1, -1}, {1, 1, -1}, - {-1,-1, 0},{ 0,-1, 0},{ 1,-1, 0}, - {-1, 0, 0},{ 0, 0, 0},{ 1, 0, 0}, - {-1, 1, 0},{ 0, 1, 0},{ 1, 1, 0}, + {-1, -1, 0}, {0, -1, 0}, {1, -1, 0}, {-1, 0, 0}, {0, 0, 0}, {1, 0, 0}, {-1, 1, 0}, {0, 1, 0}, {1, 1, 0}, - {-1,-1, 1},{ 0,-1, 1},{ 1,-1, 1}, - {-1, 0, 1},{ 0, 0, 1},{ 1, 0, 1}, - {-1, 1, 1},{ 0, 1, 1},{ 1, 1, 1}, + {-1, -1, 1}, {0, -1, 1}, {1, -1, 1}, {-1, 0, 1}, {0, 0, 1}, {1, 0, 1}, {-1, 1, 1}, {0, 1, 1}, {1, 1, 1}, }; -static float filterblur[3*3*3] = -{ - 0,1,0, - 1,2,1, - 0,1,0, +static float filterblur[3 * 3 * 3] = { + 0, 1, 0, 1, 2, 1, 0, 1, 0, - 1,2,1, - 2,4,2, - 1,2,1, + 1, 2, 1, 2, 4, 2, 1, 2, 1, - 0,1,0, - 1,2,1, - 0,1,0, + 0, 1, 0, 1, 2, 1, 0, 1, 0, }; -static float filtersharpen[3*3*3] = -{ - 0,0,0, - 0,-2,0, - 0,0,0, +static float filtersharpen[3 * 3 * 3] = { + 0, 0, 0, 0, -2, 0, 0, 0, 0, - 0,-2,0, - -2,15,-2, - 0,-2,0, + 0, -2, 0, -2, 15, -2, 0, -2, 0, - 0,0,0, - 0,-2,0, - 0,0,0, + 0, 0, 0, 0, -2, 0, 0, 0, 0, }; -static float filterpassthru[3*3*3] = -{ - 0,0,0, - 0,0,0, - 0,0,0, +static float filterpassthru[3 * 3 * 3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0,0,0, - 0,1,0, - 0,0,0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0,0,0, - 0,0,0, - 0,0,0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; void FilterKernel_init() { - float sumblur = 0.0f; + float sumblur = 0.0f; float sumsharpen = 0.0f; - for (int i = 0; i < 3*3*3; i++) - { + for (int i = 0; i < 3 * 3 * 3; i++) { sumblur += filterblur[i]; sumsharpen += filtersharpen[i]; } - for (int i = 0; i < 3*3*3; i++) - { + for (int i = 0; i < 3 * 3 * 3; i++) { filterblur[i] /= sumblur; filtersharpen[i] /= sumsharpen; @@ -217,37 +186,30 @@ void FilterKernel_init() void FilterKernel_update(float blurfactor) { - if (blurfactor > 0.0f) - { - for (int i = 0; i < 3*3*3; i++) - { + if (blurfactor > 0.0f) { + for (int i = 0; i < 3 * 3 * 3; i++) { filterWeights[i].w = filterblur[i] * blurfactor + filterpassthru[i] * (1.0f - blurfactor); } } - else - { + else { blurfactor = -blurfactor; - for (int i = 
0; i < 3*3*3; i++) - { + for (int i = 0; i < 3 * 3 * 3; i++) { filterWeights[i].w = filtersharpen[i] * blurfactor + filterpassthru[i] * (1.0f - blurfactor); } } - } void filter() { - if (filterAnimation) - { + if (filterAnimation) { filterFactor = cosf(sdkGetTimerValue(&animationTimer) * filterTimeScale); } FilterKernel_update(filterFactor); - Volume *volumeRender = VolumeFilter_runFilter(&volumeOriginal,&volumeFilter0,&volumeFilter1, - filterIterations, 3*3*3,filterWeights,filterBias); - + Volume *volumeRender = VolumeFilter_runFilter( + &volumeOriginal, &volumeFilter0, &volumeFilter1, filterIterations, 3 * 3 * 3, filterWeights, filterBias); } ////////////////////////////////////////////////////////////////////////// @@ -257,22 +219,30 @@ void filter() void render() { - VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); + VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4) * 3); // map PBO to get CUDA device pointer uint *d_output; // map PBO to get CUDA device pointer checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, - cuda_pbo_resource)); - //printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); + // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // clear image - checkCudaErrors(cudaMemset(d_output, 0, width*height*4)); + checkCudaErrors(cudaMemset(d_output, 0, width * height * 4)); // call CUDA kernel, writing results to PBO - VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale, volumeOriginal.volumeTex); + VolumeRender_render(gridSize, + blockSize, + d_output, + width, + height, + density, + brightness, + transferOffset, + transferScale, + volumeOriginal.volumeTex); getLastCudaError("render kernel failed"); @@ -295,16 +265,16 @@ void display() glGetFloatv(GL_MODELVIEW_MATRIX, modelView); glPopMatrix(); - invViewMatrix[0] = modelView[0]; - invViewMatrix[1] = modelView[4]; - invViewMatrix[2] = modelView[8]; - invViewMatrix[3] = modelView[12]; - invViewMatrix[4] = modelView[1]; - invViewMatrix[5] = modelView[5]; - invViewMatrix[6] = modelView[9]; - invViewMatrix[7] = modelView[13]; - invViewMatrix[8] = modelView[2]; - invViewMatrix[9] = modelView[6]; + invViewMatrix[0] = modelView[0]; + invViewMatrix[1] = modelView[4]; + invViewMatrix[2] = modelView[8]; + invViewMatrix[3] = modelView[12]; + invViewMatrix[4] = modelView[1]; + invViewMatrix[5] = modelView[5]; + invViewMatrix[6] = modelView[9]; + invViewMatrix[7] = modelView[13]; + invViewMatrix[8] = modelView[2]; + invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; @@ -349,88 +319,86 @@ void display() computeFPS(); } -void idle() -{ - glutPostRedisplay(); -} +void idle() { glutPostRedisplay(); } ////////////////////////////////////////////////////////////////////////// // LOGIC void keyboard(unsigned char key, int x, int y) { - switch (key) - { - case 27: - #if defined (__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); - #else - glutDestroyWindow(glutGetWindow()); - return; - #endif - break; + switch (key) { + case 27: +#if defined(__APPLE__) || defined(MACOSX) + exit(EXIT_SUCCESS); +#else + glutDestroyWindow(glutGetWindow()); + return; +#endif + break; - case ' ': - filterAnimation = !filterAnimation; + case ' ': + filterAnimation = 
!filterAnimation; - if (!filterAnimation) - { - sdkStopTimer(&animationTimer); - } - else - { - sdkStartTimer(&animationTimer); - } + if (!filterAnimation) { + sdkStopTimer(&animationTimer); + } + else { + sdkStartTimer(&animationTimer); + } - break; + break; - case 'f': - linearFiltering = !linearFiltering; - VolumeRender_setTextureFilterMode(linearFiltering, &volumeOriginal); - break; + case 'f': + linearFiltering = !linearFiltering; + VolumeRender_setTextureFilterMode(linearFiltering, &volumeOriginal); + break; - case 'p': - preIntegrated = !preIntegrated; - VolumeRender_setPreIntegrated(preIntegrated); - break; + case 'p': + preIntegrated = !preIntegrated; + VolumeRender_setPreIntegrated(preIntegrated); + break; - case '+': - density += 0.01f; - break; + case '+': + density += 0.01f; + break; - case '-': - density -= 0.01f; - break; + case '-': + density -= 0.01f; + break; - case ']': - brightness += 0.1f; - break; + case ']': + brightness += 0.1f; + break; - case '[': - brightness -= 0.1f; - break; + case '[': + brightness -= 0.1f; + break; - case ';': - transferOffset += 0.01f; - break; + case ';': + transferOffset += 0.01f; + break; - case '\'': - transferOffset -= 0.01f; - break; + case '\'': + transferOffset -= 0.01f; + break; - case '.': - transferScale += 0.01f; - break; + case '.': + transferScale += 0.01f; + break; - case ',': - transferScale -= 0.01f; - break; + case ',': + transferScale -= 0.01f; + break; - default: - break; + default: + break; } - printf("density = %.2f, brightness = %.2f, transferOffset = %.2f, transferScale = %.2f\n", density, brightness, transferOffset, transferScale); + printf("density = %.2f, brightness = %.2f, transferOffset = %.2f, transferScale = %.2f\n", + density, + brightness, + transferOffset, + transferScale); glutPostRedisplay(); } @@ -439,12 +407,10 @@ int buttonState = 0; void mouse(int button, int state, int x, int y) { - if (state == GLUT_DOWN) - { - buttonState |= 1< + #include "volume.h" -extern "C" { -void VolumeRender_init(); -void VolumeRender_deinit(); +extern "C" +{ + void VolumeRender_init(); + void VolumeRender_deinit(); -void VolumeRender_setPreIntegrated(int state); -void VolumeRender_setTextureFilterMode(bool bLinearFilter, Volume *volume); -void VolumeRender_render(dim3 gridSize, dim3 blockSize, uint *d_output, - uint imageW, uint imageH, float density, - float brightness, float transferOffset, - float transferScale, cudaTextureObject_t tex); -void VolumeRender_copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix); + void VolumeRender_setPreIntegrated(int state); + void VolumeRender_setTextureFilterMode(bool bLinearFilter, Volume *volume); + void VolumeRender_render(dim3 gridSize, + dim3 blockSize, + uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, + cudaTextureObject_t tex); + void VolumeRender_copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix); }; #endif diff --git a/Samples/5_Domain_Specific/volumeFiltering/volumeRender_kernel.cu b/Samples/5_Domain_Specific/volumeFiltering/volumeRender_kernel.cu index 73904823..4d98d577 100644 --- a/Samples/5_Domain_Specific/volumeFiltering/volumeRender_kernel.cu +++ b/Samples/5_Domain_Specific/volumeFiltering/volumeRender_kernel.cu @@ -32,26 +32,27 @@ #include #include + #include "volumeRender.h" -#define VOLUMERENDER_TFS 2 -#define VOLUMERENDER_TF_PREINTSIZE 1024 +#define VOLUMERENDER_TFS 2 +#define VOLUMERENDER_TF_PREINTSIZE 1024 #define VOLUMERENDER_TF_PREINTSTEPS 1024 -#define 
VOLUMERENDER_TF_PREINTRAY 4 +#define VOLUMERENDER_TF_PREINTRAY 4 enum TFMode { - TF_SINGLE_1D = 0, // single 1D TF for everything - TF_LAYERED_2D_PREINT = 1, // layered 2D TF uses pre-integration - TF_LAYERED_2D = 2, // layered 2D TF without pre-integration behavior + TF_SINGLE_1D = 0, // single 1D TF for everything + TF_LAYERED_2D_PREINT = 1, // layered 2D TF uses pre-integration + TF_LAYERED_2D = 2, // layered 2D TF without pre-integration behavior }; -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; -static bool usePreInt = true; +static bool usePreInt = true; static cudaArray *d_transferIntegrate = 0; -static cudaArray *d_transferFunc = 0; -static cudaArray *d_transferArray = 0; +static cudaArray *d_transferFunc = 0; +static cudaArray *d_transferArray = 0; // 1D transfer function texture cudaTextureObject_t transferTex; @@ -62,508 +63,669 @@ cudaSurfaceObject_t transferIntegrateSurf; cudaTextureObject_t transferLayerPreintTex; cudaSurfaceObject_t transferLayerPreintSurf; -typedef struct { float4 m[3]; } float3x4; +typedef struct +{ + float4 m[3]; +} float3x4; -__constant__ float3x4 c_invViewMatrix; // inverse view matrix +__constant__ float3x4 c_invViewMatrix; // inverse view matrix -struct Ray { - float3 o; // origin - float3 d; // direction +struct Ray +{ + float3 o; // origin + float3 d; // direction }; // intersect ray with a box // http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm -__device__ int intersectBox(Ray r, float3 boxmin, float3 boxmax, float *tnear, - float *tfar) { - // compute intersection of ray with all six bbox planes - float3 invR = make_float3(1.0f) / r.d; - float3 tbot = invR * (boxmin - r.o); - float3 ttop = invR * (boxmax - r.o); +__device__ int intersectBox(Ray r, float3 boxmin, float3 boxmax, float *tnear, float *tfar) +{ + // compute intersection of ray with all six bbox planes + float3 invR = make_float3(1.0f) / r.d; + float3 tbot = invR * (boxmin - r.o); + float3 ttop = invR * (boxmax - r.o); - // re-order intersections to find smallest and largest on each axis - float3 tmin = fminf(ttop, tbot); - float3 tmax = fmaxf(ttop, tbot); + // re-order intersections to find smallest and largest on each axis + float3 tmin = fminf(ttop, tbot); + float3 tmax = fmaxf(ttop, tbot); - // find the largest tmin and the smallest tmax - float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); - float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z)); + // find the largest tmin and the smallest tmax + float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); + float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z)); - *tnear = largest_tmin; - *tfar = smallest_tmax; + *tnear = largest_tmin; + *tfar = smallest_tmax; - return smallest_tmax > largest_tmin; + return smallest_tmax > largest_tmin; } // transform vector by matrix (no translation) -__device__ float3 mul(const float3x4 &M, const float3 &v) { - float3 r; - r.x = dot(v, make_float3(M.m[0])); - r.y = dot(v, make_float3(M.m[1])); - r.z = dot(v, make_float3(M.m[2])); - return r; +__device__ float3 mul(const float3x4 &M, const float3 &v) +{ + float3 r; + r.x = dot(v, make_float3(M.m[0])); + r.y = dot(v, make_float3(M.m[1])); + r.z = dot(v, make_float3(M.m[2])); + return r; } // transform vector by matrix with translation -__device__ float4 mul(const float3x4 &M, const float4 &v) { - float4 r; - r.x = dot(v, M.m[0]); - r.y = dot(v, M.m[1]); - r.z = dot(v, M.m[2]); - r.w = 1.0f; - return r; 
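// Illustrative sketch, not part of the sample code: the intersectBox() hunk
// above implements the classic slab method. Per axis it computes the two
// plane distances and then intersects the three [entry, exit] intervals. A
// scalar, single-axis restatement (hypothetical helper, assuming only <math.h>):
static __device__ int intersectSlab1D(float o, float d, float bmin, float bmax, float *tn, float *tf)
{
    float inv = 1.0f / d;         // per-axis inverse ray direction
    float t0  = (bmin - o) * inv; // distance to one bounding plane
    float t1  = (bmax - o) * inv; // distance to the opposite plane
    *tn = fminf(t0, t1);          // entry distance along this axis
    *tf = fmaxf(t0, t1);          // exit distance along this axis
    return *tf > *tn;
}
// Taking the max of the three per-axis entries and the min of the three exits
// reproduces intersectBox(); the ray hits the box iff the final tfar > tnear.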
+__device__ float4 mul(const float3x4 &M, const float4 &v) +{ + float4 r; + r.x = dot(v, M.m[0]); + r.y = dot(v, M.m[1]); + r.z = dot(v, M.m[2]); + r.w = 1.0f; + return r; } -__device__ uint rgbaFloatToInt(float4 rgba) { - rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] - rgba.y = __saturatef(rgba.y); - rgba.z = __saturatef(rgba.z); - rgba.w = __saturatef(rgba.w); - return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | - (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); } template -__device__ void d_render(uint *d_output, uint imageW, uint imageH, - float density, float brightness, float transferOffset, - float transferScale, cudaTextureObject_t volumeTex, +__device__ void d_render(uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, + cudaTextureObject_t volumeTex, cudaTextureObject_t transferTex, cudaTextureObject_t transferLayerPreintTex, - float transferWeight = 0.0f) { - const float rayscale = - float(TFMODE != TF_SINGLE_1D ? VOLUMERENDER_TF_PREINTRAY : 1); - const int maxSteps = 512; - const float tstep = 0.01f * rayscale; - const float opacityThreshold = 0.95f; - const float3 boxMin = make_float3(-1.0f, -1.0f, -1.0f); - const float3 boxMax = make_float3(1.0f, 1.0f, 1.0f); + float transferWeight = 0.0f) +{ + const float rayscale = float(TFMODE != TF_SINGLE_1D ? VOLUMERENDER_TF_PREINTRAY : 1); + const int maxSteps = 512; + const float tstep = 0.01f * rayscale; + const float opacityThreshold = 0.95f; + const float3 boxMin = make_float3(-1.0f, -1.0f, -1.0f); + const float3 boxMax = make_float3(1.0f, 1.0f, 1.0f); - density *= rayscale; + density *= rayscale; - uint x = blockIdx.x * blockDim.x + threadIdx.x; - uint y = blockIdx.y * blockDim.y + threadIdx.y; + uint x = blockIdx.x * blockDim.x + threadIdx.x; + uint y = blockIdx.y * blockDim.y + threadIdx.y; - if ((x >= imageW) || (y >= imageH)) return; + if ((x >= imageW) || (y >= imageH)) + return; - float u = (x / (float)imageW) * 2.0f - 1.0f; - float v = (y / (float)imageH) * 2.0f - 1.0f; + float u = (x / (float)imageW) * 2.0f - 1.0f; + float v = (y / (float)imageH) * 2.0f - 1.0f; - // calculate eye ray in world space - Ray eyeRay; - eyeRay.o = - make_float3(mul(c_invViewMatrix, make_float4(0.0f, 0.0f, 0.0f, 1.0f))); - eyeRay.d = normalize(make_float3(u, v, -2.0f)); - eyeRay.d = mul(c_invViewMatrix, eyeRay.d); + // calculate eye ray in world space + Ray eyeRay; + eyeRay.o = make_float3(mul(c_invViewMatrix, make_float4(0.0f, 0.0f, 0.0f, 1.0f))); + eyeRay.d = normalize(make_float3(u, v, -2.0f)); + eyeRay.d = mul(c_invViewMatrix, eyeRay.d); - // find intersection with box - float tnear, tfar; - int hit = intersectBox(eyeRay, boxMin, boxMax, &tnear, &tfar); + // find intersection with box + float tnear, tfar; + int hit = intersectBox(eyeRay, boxMin, boxMax, &tnear, &tfar); - if (!hit) return; + if (!hit) + return; - if (tnear < 0.0f) tnear = 0.0f; // clamp to near plane + if (tnear < 0.0f) + tnear = 0.0f; // clamp to near plane - // march along ray from front to back, accumulating color - float4 sum = make_float4(0.0f); - float t = tnear; - float3 pos = eyeRay.o + eyeRay.d * tnear; - float3 step = eyeRay.d * tstep; + // march along ray from 
front to back, accumulating color + float4 sum = make_float4(0.0f); + float t = tnear; + float3 pos = eyeRay.o + eyeRay.d * tnear; + float3 step = eyeRay.d * tstep; - float lastsample = 0; + float lastsample = 0; - // lastsample = (lastsample-transferOffset)*transferScale; - for (int i = 0; i < maxSteps; i++) { - // read from 3D texture - // remap position to [0, 1] coordinates - float3 coord = make_float3(pos.x * 0.5f + 0.5f, pos.y * 0.5f + 0.5f, - pos.z * 0.5f + 0.5f); - float sample = tex3D<float>(volumeTex, coord.x, coord.y, coord.z); - // sample = (sample-transferOffset)*transferScale; - // sample *= 64.0f; // scale for 10-bit data + // lastsample = (lastsample-transferOffset)*transferScale; + for (int i = 0; i < maxSteps; i++) { + // read from 3D texture + // remap position to [0, 1] coordinates + float3 coord = make_float3(pos.x * 0.5f + 0.5f, pos.y * 0.5f + 0.5f, pos.z * 0.5f + 0.5f); + float sample = tex3D<float>(volumeTex, coord.x, coord.y, coord.z); + // sample = (sample-transferOffset)*transferScale; + // sample *= 64.0f; // scale for 10-bit data - // lookup in transfer function texture - float4 col; - int tfid = (pos.x < 0); + // lookup in transfer function texture + float4 col; + int tfid = (pos.x < 0); - if (TFMODE != TF_SINGLE_1D) { - col = tex2DLayered<float4>(transferLayerPreintTex, sample, - TFMODE == TF_LAYERED_2D ? sample : lastsample, - tfid); - col.w *= density; - lastsample = sample; - } else { - col = tex1D<float4>(transferTex, sample); - col.w *= 0; + if (TFMODE != TF_SINGLE_1D) { + col = tex2DLayered<float4>( + transferLayerPreintTex, sample, TFMODE == TF_LAYERED_2D ? sample : lastsample, tfid); + col.w *= density; + lastsample = sample; + } + else { + col = tex1D<float4>(transferTex, sample); + col.w *= 0; + } + + // "under" operator for back-to-front blending + // sum = lerp(sum, col, col.w); + + // pre-multiply alpha + col.x *= col.w; + col.y *= col.w; + col.z *= col.w; + // "over" operator for front-to-back blending + sum = sum + col * (1.0f - sum.w); + + // exit early if opaque + if (sum.w > opacityThreshold) + break; + + t += tstep; + + if (t > tfar) + break; + + pos += step; } - // "under" operator for back-to-front blending - // sum = lerp(sum, col, col.w); + sum *= brightness; - // pre-multiply alpha - col.x *= col.w; - col.y *= col.w; - col.z *= col.w; - // "over" operator for front-to-back blending - sum = sum + col * (1.0f - sum.w); - - // exit early if opaque - if (sum.w > opacityThreshold) break; - - t += tstep; - - if (t > tfar) break; - - pos += step; - } - - sum *= brightness; - - // write output color - d_output[y * imageW + x] = rgbaFloatToInt(sum); + // write output color + d_output[y * imageW + x] = rgbaFloatToInt(sum); } -__global__ void d_render_regular(uint *d_output, uint imageW, uint imageH, - float density, float brightness, - float transferOffset, float transferScale, +__global__ void d_render_regular(uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, cudaTextureObject_t volumeTex, cudaTextureObject_t transferTex, cudaTextureObject_t transferLayerPreintTex, - float transferWeight = 0.0f) { - d_render<TF_SINGLE_1D>(d_output, imageW, imageH, density, brightness, - transferOffset, transferScale, volumeTex, transferTex, - transferLayerPreintTex, transferWeight); + float transferWeight = 0.0f) +{ + d_render<TF_SINGLE_1D>(d_output, + imageW, + imageH, + density, + brightness, + transferOffset, + transferScale, + volumeTex, + transferTex, + transferLayerPreintTex, + transferWeight); } -__global__ void d_render_preint(uint *d_output, uint 
imageW, uint imageH, - float density, float brightness, - float transferOffset, float transferScale, +__global__ void d_render_preint(uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, cudaTextureObject_t volumeTex, cudaTextureObject_t transferTex, cudaTextureObject_t transferLayerPreintTex, - float transferWeight = 0.0f) { - d_render<TF_LAYERED_2D_PREINT>(d_output, imageW, imageH, density, brightness, - transferOffset, transferScale, volumeTex, - transferTex, transferLayerPreintTex, - transferWeight); + float transferWeight = 0.0f) +{ + d_render<TF_LAYERED_2D_PREINT>(d_output, + imageW, + imageH, + density, + brightness, + transferOffset, + transferScale, + volumeTex, + transferTex, + transferLayerPreintTex, + transferWeight); } -__global__ void d_render_preint_off(uint *d_output, uint imageW, uint imageH, - float density, float brightness, - float transferOffset, float transferScale, +__global__ void d_render_preint_off(uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, cudaTextureObject_t volumeTex, cudaTextureObject_t transferTex, cudaTextureObject_t transferLayerPreintTex, - float transferWeight = 0.0f) { - d_render<TF_LAYERED_2D>(d_output, imageW, imageH, density, brightness, - transferOffset, transferScale, volumeTex, transferTex, - transferLayerPreintTex, transferWeight); + float transferWeight = 0.0f) +{ + d_render<TF_LAYERED_2D>(d_output, + imageW, + imageH, + density, + brightness, + transferOffset, + transferScale, + volumeTex, + transferTex, + transferLayerPreintTex, + transferWeight); } ////////////////////////////////////////////////////////////////////////// -__global__ void d_integrate_trapezoidal( - cudaExtent extent, cudaTextureObject_t transferTex, - cudaSurfaceObject_t transferIntegrateSurf) { - uint x = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void +d_integrate_trapezoidal(cudaExtent extent, cudaTextureObject_t transferTex, cudaSurfaceObject_t transferIntegrateSurf) +{ + uint x = blockIdx.x * blockDim.x + threadIdx.x; - // for higher speed could use hierarchical approach for sum - if (x >= extent.width) { - return; - } + // for higher speed could use hierarchical approach for sum + if (x >= extent.width) { + return; + } - float stepsize = 1.0 / float(extent.width - 1); - float to = float(x) * stepsize; + float stepsize = 1.0 / float(extent.width - 1); + float to = float(x) * stepsize; - float4 outclr = make_float4(0, 0, 0, 0); - float incr = stepsize; + float4 outclr = make_float4(0, 0, 0, 0); + float incr = stepsize; - float4 lastval = tex1D<float4>(transferTex, 0); + float4 lastval = tex1D<float4>(transferTex, 0); - float cur = incr; + float cur = incr; - while (cur < to + incr * 0.5) { - float4 val = tex1D<float4>(transferTex, cur); - float4 trapezoid = (lastval + val) / 2.0f; - lastval = val; + while (cur < to + incr * 0.5) { + float4 val = tex1D<float4>(transferTex, cur); + float4 trapezoid = (lastval + val) / 2.0f; + lastval = val; - outclr += trapezoid; - cur += incr; - } + outclr += trapezoid; + cur += incr; + } - // surface writes need byte offsets for x! 
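// Illustrative sketch, not part of the sample code: each thread of
// d_integrate_trapezoidal() above recomputes the running trapezoidal integral
// of the 1D transfer function from 0 up to its own texel (O(n^2) total work,
// hence the "hierarchical approach" remark in the kernel). A host-side O(n)
// prefix-sum reference, assuming tf[] holds the same n samples the texture
// would return and the float4 operators from helper_math.h:
static void integrateTrapezoidalCPU(const float4 *tf, float4 *out, int n)
{
    float4 sum = make_float4(0, 0, 0, 0);
    out[0] = sum; // integral over the empty interval at the first texel
    for (int i = 1; i < n; i++) {
        sum += (tf[i - 1] + tf[i]) / 2.0f; // one trapezoid per sample step; the
        out[i] = sum;                      // 1/steps scaling is applied later in d_preintegrate()
    }
}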
+ surf1Dwrite(outclr, transferIntegrateSurf, x * sizeof(float4)); } -__global__ void d_preintegrate(int layer, float steps, cudaExtent extent, +__global__ void d_preintegrate(int layer, + float steps, + cudaExtent extent, cudaTextureObject_t transferTex, cudaTextureObject_t transferIntegrateTex, - cudaSurfaceObject_t transferLayerPreintSurf) { - uint x = blockIdx.x * blockDim.x + threadIdx.x; - uint y = blockIdx.y * blockDim.y + threadIdx.y; + cudaSurfaceObject_t transferLayerPreintSurf) +{ + uint x = blockIdx.x * blockDim.x + threadIdx.x; + uint y = blockIdx.y * blockDim.y + threadIdx.y; - if (x >= extent.width || y >= extent.height) { - return; - } + if (x >= extent.width || y >= extent.height) { + return; + } - float sx = float(x) / float(extent.width); - float sy = float(y) / float(extent.height); + float sx = float(x) / float(extent.width); + float sy = float(y) / float(extent.height); - float smax = max(sx, sy); - float smin = min(sx, sy); + float smax = max(sx, sy); + float smin = min(sx, sy); - float4 iv; + float4 iv; - if (x != y) { - // assumes square textures! - float fracc = smax - smin; - fracc = 1.0 / (fracc * steps); + if (x != y) { + // assumes square textures! + float fracc = smax - smin; + fracc = 1.0 / (fracc * steps); - float4 intmax = tex1D<float4>(transferIntegrateTex, smax); - float4 intmin = tex1D<float4>(transferIntegrateTex, smin); - iv.x = (intmax.x - intmin.x) * fracc; - iv.y = (intmax.y - intmin.y) * fracc; - iv.z = (intmax.z - intmin.z) * fracc; - // iv.w = (intmax.w - intmin.w)*fracc; - iv.w = (1.0 - exp(-(intmax.w - intmin.w) * fracc)); - } else { - float4 sample = tex1D<float4>(transferTex, smin); - iv.x = sample.x; - iv.y = sample.y; - iv.z = sample.z; - // iv.w = sample.w; - iv.w = (1.0 - exp(-sample.w)); - } + float4 intmax = tex1D<float4>(transferIntegrateTex, smax); + float4 intmin = tex1D<float4>(transferIntegrateTex, smin); + iv.x = (intmax.x - intmin.x) * fracc; + iv.y = (intmax.y - intmin.y) * fracc; + iv.z = (intmax.z - intmin.z) * fracc; + // iv.w = (intmax.w - intmin.w)*fracc; + iv.w = (1.0 - exp(-(intmax.w - intmin.w) * fracc)); + } + else { + float4 sample = tex1D<float4>(transferTex, smin); + iv.x = sample.x; + iv.y = sample.y; + iv.z = sample.z; + // iv.w = sample.w; + iv.w = (1.0 - exp(-sample.w)); + } - iv.x = __saturatef(iv.x); - iv.y = __saturatef(iv.y); - iv.z = __saturatef(iv.z); - iv.w = __saturatef(iv.w); + iv.x = __saturatef(iv.x); + iv.y = __saturatef(iv.y); + iv.z = __saturatef(iv.z); + iv.w = __saturatef(iv.w); - // surface writes need byte offsets for x! 
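// Illustrative sketch, not part of the sample code: for a (front, back) sample
// pair, d_preintegrate() above averages the integrated transfer function over
// [smin, smax] and converts the accumulated absorbance into opacity with a
// Beer-Lambert style mapping. The alpha channel in scalar form (hypothetical
// helper; intW_max/intW_min are the .w components of the integral samples):
static __device__ float preintAlpha(float intW_max, float intW_min, float smax, float smin, float steps)
{
    float fracc = 1.0f / ((smax - smin) * steps);       // average over the interval
    return 1.0f - expf(-(intW_max - intW_min) * fracc); // absorbance -> opacity
}
// The x == y branch point-samples instead, because on the table diagonal the
// averaging interval collapses to zero width.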
+ surf2DLayeredwrite(iv, transferLayerPreintSurf, x * sizeof(float4), y, layer); } ////////////////////////////////////////////////////////////////////////// -void VolumeRender_setTextureFilterMode(bool bLinearFilter, Volume *vol) { - if (vol->volumeTex) { - checkCudaErrors(cudaDestroyTextureObject(vol->volumeTex)); - } - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); +void VolumeRender_setTextureFilterMode(bool bLinearFilter, Volume *vol) +{ + if (vol->volumeTex) { + checkCudaErrors(cudaDestroyTextureObject(vol->volumeTex)); + } + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = vol->content; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = vol->content; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = true; - texDescr.filterMode = - bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint; + texDescr.normalizedCoords = true; + texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - texDescr.addressMode[2] = cudaAddressModeWrap; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + texDescr.addressMode[2] = cudaAddressModeWrap; - texDescr.readMode = VolumeTypeInfo<VolumeType>::readMode; + texDescr.readMode = VolumeTypeInfo<VolumeType>::readMode; - checkCudaErrors( - cudaCreateTextureObject(&vol->volumeTex, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&vol->volumeTex, &texRes, &texDescr, NULL)); } -static unsigned int iDivUp(size_t a, size_t b) { - size_t val = (a % b != 0) ? (a / b + 1) : (a / b); - if (val > UINT_MAX) { - fprintf(stderr, "\nUINT_MAX limit exceeded in iDivUp() exiting.....\n"); - exit(EXIT_FAILURE); // val exceeds limit - } +static unsigned int iDivUp(size_t a, size_t b) +{ + size_t val = (a % b != 0) ? 
(a / b + 1) : (a / b); + if (val > UINT_MAX) { + fprintf(stderr, "\nUINT_MAX limit exceeded in iDivUp() exiting.....\n"); + exit(EXIT_FAILURE); // val exceeds limit + } - return static_cast<unsigned int>(val); + return static_cast<unsigned int>(val); } -void VolumeRender_updateTF(int tfIdx, int numColors, float4 *colors) { - if (d_transferFunc) { +void VolumeRender_updateTF(int tfIdx, int numColors, float4 *colors) +{ + if (d_transferFunc) { + checkCudaErrors(cudaFreeArray(d_transferFunc)); + d_transferFunc = 0; + } + + cudaChannelFormatDesc channelFloat4 = cudaCreateChannelDesc<float4>(); + checkCudaErrors(cudaMallocArray(&d_transferFunc, &channelFloat4, numColors, 1)); + checkCudaErrors( + cudaMemcpy2DToArray(d_transferFunc, 0, 0, colors, 0, sizeof(float4) * numColors, 1, cudaMemcpyHostToDevice)); + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_transferFunc; + + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeElementType; + + checkCudaErrors(cudaCreateTextureObject(&transferTex, &texRes, &texDescr, NULL)); + + if (tfIdx < 0 || tfIdx >= VOLUMERENDER_TFS) { + return; + } + + { + cudaExtent extent = {VOLUMERENDER_TF_PREINTSTEPS, 0, 0}; + dim3 blockSize(32, 1, 1); + dim3 gridSize(iDivUp(extent.width, blockSize.x), 1, 1); + d_integrate_trapezoidal<<<gridSize, blockSize>>>(extent, transferTex, transferIntegrateSurf); + } + + { + cudaExtent extent = {VOLUMERENDER_TF_PREINTSIZE, VOLUMERENDER_TF_PREINTSIZE, VOLUMERENDER_TFS}; + dim3 blockSize(16, 16, 1); + dim3 gridSize(iDivUp(extent.width, blockSize.x), iDivUp(extent.height, blockSize.y), 1); + d_preintegrate<<<gridSize, blockSize>>>(tfIdx, + float(VOLUMERENDER_TF_PREINTSTEPS), + extent, + transferTex, + transferIntegrateTex, + transferLayerPreintSurf); + } +} + +void VolumeRender_init() +{ + cudaResourceDesc texRes; + cudaTextureDesc texDescr; + + cudaChannelFormatDesc channelFloat4 = cudaCreateChannelDesc<float4>(); + cudaExtent extent = {VOLUMERENDER_TF_PREINTSIZE, VOLUMERENDER_TF_PREINTSIZE, VOLUMERENDER_TFS}; + checkCudaErrors( + cudaMalloc3DArray(&d_transferArray, &channelFloat4, extent, cudaArrayLayered | cudaArraySurfaceLoadStore)); + + memset(&texRes, 0, sizeof(cudaResourceDesc)); + + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_transferArray; + + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.readMode = cudaReadModeElementType; + + checkCudaErrors(cudaCreateTextureObject(&transferLayerPreintTex, &texRes, &texDescr, NULL)); + + cudaResourceDesc surfRes; + memset(&surfRes, 0, sizeof(cudaResourceDesc)); + surfRes.resType = cudaResourceTypeArray; + surfRes.res.array.array = d_transferArray; + + checkCudaErrors(cudaCreateSurfaceObject(&transferLayerPreintSurf, &surfRes)); + + checkCudaErrors(cudaMallocArray( + &d_transferIntegrate, &channelFloat4, VOLUMERENDER_TF_PREINTSTEPS, 0, cudaArraySurfaceLoadStore)); + + memset(&texRes, 0, sizeof(cudaResourceDesc)); + + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_transferIntegrate; + + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + + texDescr.addressMode[0] = 
cudaAddressModeClamp; + texDescr.addressMode[1] = cudaAddressModeClamp; + texDescr.addressMode[2] = cudaAddressModeClamp; + + texDescr.readMode = cudaReadModeElementType; + + checkCudaErrors(cudaCreateTextureObject(&transferIntegrateTex, &texRes, &texDescr, NULL)); + + memset(&surfRes, 0, sizeof(cudaResourceDesc)); + surfRes.resType = cudaResourceTypeArray; + surfRes.res.array.array = d_transferIntegrate; + + checkCudaErrors(cudaCreateSurfaceObject(&transferIntegrateSurf, &surfRes)); + + // create transfer function texture + float4 transferFunc0[] = { + { + 0.0, + 0.0, + 0.0, + 0.0, + }, + { + 1.0, + 0.0, + 0.0, + 1.0, + }, + { + 1.0, + 0.5, + 0.0, + 1.0, + }, + { + 1.0, + 1.0, + 0.0, + 1.0, + }, + { + 0.0, + 1.0, + 0.0, + 1.0, + }, + { + 0.0, + 1.0, + 1.0, + 1.0, + }, + { + 0.0, + 0.0, + 1.0, + 1.0, + }, + { + 1.0, + 0.0, + 1.0, + 1.0, + }, + { + 0.0, + 0.0, + 0.0, + 0.0, + }, + }; + + float4 transferFunc1[] = { + { + 0.0, + 0.0, + 0.0, + 0.0, + }, + { + 0.0, + 1.0, + 0.0, + 0.125, + }, + { + 0.0, + 0.5, + 1.0, + 0.125, + }, + { + 0.0, + 1.0, + 1.0, + 0.125, + }, + { + 0.0, + 1.0, + 0.0, + 0.125, + }, + { + 0.25, + 0.75, + 0.0, + 1.0, + }, + { + 0.75, + 0.25, + 0.0, + 0.125, + }, + { + 1.0, + 0.75, + 0.0, + 0.125, + }, + { + 0.0, + 0.0, + 0.0, + 0.0, + }, + }; + + VolumeRender_updateTF(1, sizeof(transferFunc1) / sizeof(float4), transferFunc1); + VolumeRender_updateTF(0, sizeof(transferFunc0) / sizeof(float4), transferFunc0); +} + +void VolumeRender_deinit() +{ + checkCudaErrors(cudaDestroyTextureObject(transferTex)); + checkCudaErrors(cudaDestroyTextureObject(transferIntegrateTex)); + checkCudaErrors(cudaDestroySurfaceObject(transferIntegrateSurf)); + checkCudaErrors(cudaDestroyTextureObject(transferLayerPreintTex)); + checkCudaErrors(cudaDestroySurfaceObject(transferLayerPreintSurf)); checkCudaErrors(cudaFreeArray(d_transferFunc)); - d_transferFunc = 0; - } - - cudaChannelFormatDesc channelFloat4 = cudaCreateChannelDesc(); - checkCudaErrors( - cudaMallocArray(&d_transferFunc, &channelFloat4, numColors, 1)); - checkCudaErrors(cudaMemcpy2DToArray(d_transferFunc, 0, 0, colors, 0, - sizeof(float4) * numColors, 1, - cudaMemcpyHostToDevice)); - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_transferFunc; - - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeElementType; - - checkCudaErrors( - cudaCreateTextureObject(&transferTex, &texRes, &texDescr, NULL)); - - if (tfIdx < 0 || tfIdx >= VOLUMERENDER_TFS) { - return; - } - - { - cudaExtent extent = {VOLUMERENDER_TF_PREINTSTEPS, 0, 0}; - dim3 blockSize(32, 1, 1); - dim3 gridSize(iDivUp(extent.width, blockSize.x), 1, 1); - d_integrate_trapezoidal<<>>(extent, transferTex, - transferIntegrateSurf); - } - - { - cudaExtent extent = {VOLUMERENDER_TF_PREINTSIZE, VOLUMERENDER_TF_PREINTSIZE, - VOLUMERENDER_TFS}; - dim3 blockSize(16, 16, 1); - dim3 gridSize(iDivUp(extent.width, blockSize.x), - iDivUp(extent.height, blockSize.y), 1); - d_preintegrate<<>>( - tfIdx, float(VOLUMERENDER_TF_PREINTSTEPS), extent, transferTex, - transferIntegrateTex, transferLayerPreintSurf); - } -} - -void VolumeRender_init() { - cudaResourceDesc texRes; - cudaTextureDesc texDescr; - - cudaChannelFormatDesc channelFloat4 = cudaCreateChannelDesc(); - cudaExtent extent = 
{VOLUMERENDER_TF_PREINTSIZE, VOLUMERENDER_TF_PREINTSIZE, - VOLUMERENDER_TFS}; - checkCudaErrors( - cudaMalloc3DArray(&d_transferArray, &channelFloat4, extent, - cudaArrayLayered | cudaArraySurfaceLoadStore)); - - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_transferArray; - - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.readMode = cudaReadModeElementType; - - checkCudaErrors(cudaCreateTextureObject(&transferLayerPreintTex, &texRes, - &texDescr, NULL)); - - cudaResourceDesc surfRes; - memset(&surfRes, 0, sizeof(cudaResourceDesc)); - surfRes.resType = cudaResourceTypeArray; - surfRes.res.array.array = d_transferArray; - - checkCudaErrors(cudaCreateSurfaceObject(&transferLayerPreintSurf, &surfRes)); - - checkCudaErrors(cudaMallocArray(&d_transferIntegrate, &channelFloat4, - VOLUMERENDER_TF_PREINTSTEPS, 0, - cudaArraySurfaceLoadStore)); - - memset(&texRes, 0, sizeof(cudaResourceDesc)); - - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_transferIntegrate; - - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - - texDescr.addressMode[0] = cudaAddressModeClamp; - texDescr.addressMode[1] = cudaAddressModeClamp; - texDescr.addressMode[2] = cudaAddressModeClamp; - - texDescr.readMode = cudaReadModeElementType; - - checkCudaErrors( - cudaCreateTextureObject(&transferIntegrateTex, &texRes, &texDescr, NULL)); - - memset(&surfRes, 0, sizeof(cudaResourceDesc)); - surfRes.resType = cudaResourceTypeArray; - surfRes.res.array.array = d_transferIntegrate; - - checkCudaErrors(cudaCreateSurfaceObject(&transferIntegrateSurf, &surfRes)); - - // create transfer function texture - float4 transferFunc0[] = { - { 0.0, 0.0, 0.0, 0.0, }, - { 1.0, 0.0, 0.0, 1.0, }, - { 1.0, 0.5, 0.0, 1.0, }, - { 1.0, 1.0, 0.0, 1.0, }, - { 0.0, 1.0, 0.0, 1.0, }, - { 0.0, 1.0, 1.0, 1.0, }, - { 0.0, 0.0, 1.0, 1.0, }, - { 1.0, 0.0, 1.0, 1.0, }, - { 0.0, 0.0, 0.0, 0.0, }, - }; - - float4 transferFunc1[] = { - { 0.0, 0.0, 0.0, 0.0, }, - { 0.0, 1.0, 0.0, 0.125, }, - { 0.0, 0.5, 1.0, 0.125, }, - { 0.0, 1.0, 1.0, 0.125, }, - { 0.0, 1.0, 0.0, 0.125, }, - { 0.25, 0.75, 0.0, 1.0, }, - { 0.75, 0.25, 0.0, 0.125, }, - { 1.0, 0.75, 0.0, 0.125, }, - { 0.0, 0.0, 0.0, 0.0, }, - }; - - VolumeRender_updateTF(1, sizeof(transferFunc1) / sizeof(float4), - transferFunc1); - VolumeRender_updateTF(0, sizeof(transferFunc0) / sizeof(float4), - transferFunc0); -} - -void VolumeRender_deinit() { - checkCudaErrors(cudaDestroyTextureObject(transferTex)); - checkCudaErrors(cudaDestroyTextureObject(transferIntegrateTex)); - checkCudaErrors(cudaDestroySurfaceObject(transferIntegrateSurf)); - checkCudaErrors(cudaDestroyTextureObject(transferLayerPreintTex)); - checkCudaErrors(cudaDestroySurfaceObject(transferLayerPreintSurf)); - checkCudaErrors(cudaFreeArray(d_transferFunc)); - checkCudaErrors(cudaFreeArray(d_transferArray)); - checkCudaErrors(cudaFreeArray(d_transferIntegrate)); - d_transferArray = 0; - d_transferFunc = 0; - d_transferIntegrate = 0; + checkCudaErrors(cudaFreeArray(d_transferArray)); + checkCudaErrors(cudaFreeArray(d_transferIntegrate)); + d_transferArray = 0; + d_transferFunc = 0; + d_transferIntegrate = 0; } void VolumeRender_setPreIntegrated(int state) { usePreInt = !!state; } -void 
VolumeRender_render(dim3 gridSize, dim3 blockSize, uint *d_output, - uint imageW, uint imageH, float density, - float brightness, float transferOffset, - float transferScale, cudaTextureObject_t volumeTex) { - if (usePreInt) { - d_render_preint<<<gridSize, blockSize>>>( - d_output, imageW, imageH, density, brightness, transferOffset, - transferScale, volumeTex, transferTex, transferLayerPreintTex); - } else { - d_render_preint_off<<<gridSize, blockSize>>>( - d_output, imageW, imageH, density, brightness, transferOffset, - transferScale, volumeTex, transferTex, transferLayerPreintTex); - } +void VolumeRender_render(dim3 gridSize, + dim3 blockSize, + uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, + cudaTextureObject_t volumeTex) +{ + if (usePreInt) { + d_render_preint<<<gridSize, blockSize>>>(d_output, + imageW, + imageH, + density, + brightness, + transferOffset, + transferScale, + volumeTex, + transferTex, + transferLayerPreintTex); + } + else { + d_render_preint_off<<<gridSize, blockSize>>>(d_output, + imageW, + imageH, + density, + brightness, + transferOffset, + transferScale, + volumeTex, + transferTex, + transferLayerPreintTex); + } } -void VolumeRender_copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix) { - checkCudaErrors( - cudaMemcpyToSymbol(c_invViewMatrix, invViewMatrix, sizeofMatrix)); +void VolumeRender_copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_invViewMatrix, invViewMatrix, sizeofMatrix)); } -#endif // #ifndef _VOLUMERENDER_KERNEL_CU_ +#endif // #ifndef _VOLUMERENDER_KERNEL_CU_ diff --git a/Samples/5_Domain_Specific/volumeRender/README.md b/Samples/5_Domain_Specific/volumeRender/README.md index 3a452362..21077a67 100644 --- a/Samples/5_Domain_Specific/volumeRender/README.md +++ b/Samples/5_Domain_Specific/volumeRender/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
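For readers comparing the two volume samples: both renderers accumulate along the ray with the front-to-back "over" operator shown in d_render above. A minimal sketch of one compositing step (illustrative only, assuming the float4 operators from helper_math.h; not code from this patch):

// Blend one classified sample under the running, premultiplied-alpha sum.
__device__ float4 compositeOver(float4 sum, float4 col)
{
    col.x *= col.w; // pre-multiply alpha
    col.y *= col.w;
    col.z *= col.w;
    return sum + col * (1.0f - sum.w); // earlier samples occlude later ones
}
// The march exits early once sum.w exceeds the opacity threshold (0.95 here).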
## References (for more details) - diff --git a/Samples/5_Domain_Specific/volumeRender/volumeRender.cpp b/Samples/5_Domain_Specific/volumeRender/volumeRender.cpp index 6868fb6a..9b325584 100644 --- a/Samples/5_Domain_Specific/volumeRender/volumeRender.cpp +++ b/Samples/5_Domain_Specific/volumeRender/volumeRender.cpp @@ -53,12 +53,12 @@ #endif // CUDA Runtime, Interop, and includes -#include #include #include -#include -#include +#include #include +#include +#include // CUDA utilities #include @@ -68,11 +68,11 @@ #include #include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; #define MAX_EPSILON_ERROR 5.00f -#define THRESHOLD 0.30f +#define THRESHOLD 0.30f // Define the files that are to be save and the reference images for validation const char *sOriginal[] = {"volume.ppm", NULL}; @@ -81,8 +81,8 @@ const char *sReference[] = {"ref_volume.ppm", NULL}; const char *sSDKsample = "CUDA 3D Volume Render"; -const char *volumeFilename = "Bucky.raw"; -cudaExtent volumeSize = make_cudaExtent(32, 32, 32); +const char *volumeFilename = "Bucky.raw"; +cudaExtent volumeSize = make_cudaExtent(32, 32, 32); typedef unsigned char VolumeType; // char *volumeFilename = "mrt16_angio.raw"; @@ -95,29 +95,28 @@ dim3 gridSize; float3 viewRotation; float3 viewTranslation = make_float3(0.0, 0.0, -4.0f); -float invViewMatrix[12]; +float invViewMatrix[12]; -float density = 0.05f; -float brightness = 1.0f; -float transferOffset = 0.0f; -float transferScale = 1.0f; -bool linearFiltering = true; +float density = 0.05f; +float brightness = 1.0f; +float transferOffset = 0.0f; +float transferScale = 1.0f; +bool linearFiltering = true; -GLuint pbo = 0; // OpenGL pixel buffer object -GLuint tex = 0; // OpenGL texture object -struct cudaGraphicsResource - *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) +GLuint pbo = 0; // OpenGL pixel buffer object +GLuint tex = 0; // OpenGL texture object +struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO) StopWatchInterface *timer = 0; // Auto-Verification Code -const int frameCheckNumber = 2; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -unsigned int frameCount = 0; +const int frameCheckNumber = 2; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +unsigned int frameCount = 0; -int *pArgc; +int *pArgc; char **pArgv; #ifndef MAX @@ -127,93 +126,99 @@ char **pArgv; extern "C" void setTextureFilterMode(bool bLinearFilter); extern "C" void initCuda(void *h_volume, cudaExtent volumeSize); extern "C" void freeCudaBuffers(); -extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, - uint imageW, uint imageH, float density, - float brightness, float transferOffset, +extern "C" void render_kernel(dim3 gridSize, + dim3 blockSize, + uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, float transferScale); extern "C" void copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix); void initPixelBuffer(); -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - char fps[256]; - float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - sprintf(fps, "Volume Render: %3.1f fps", ifps); + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "Volume Render: %3.1f 
fps", ifps); - glutSetWindowTitle(fps); - fpsCount = 0; + glutSetWindowTitle(fps); + fpsCount = 0; - fpsLimit = (int)MAX(1.f, ifps); - sdkResetTimer(&timer); - } + fpsLimit = (int)MAX(1.f, ifps); + sdkResetTimer(&timer); + } } // render image using CUDA -void render() { - copyInvViewMatrix(invViewMatrix, sizeof(float4) * 3); +void render() +{ + copyInvViewMatrix(invViewMatrix, sizeof(float4) * 3); - // map PBO to get CUDA device pointer - uint *d_output; - // map PBO to get CUDA device pointer - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&d_output, &num_bytes, cuda_pbo_resource)); - // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); + // map PBO to get CUDA device pointer + uint *d_output; + // map PBO to get CUDA device pointer + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); + // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); - // clear image - checkCudaErrors(cudaMemset(d_output, 0, width * height * 4)); + // clear image + checkCudaErrors(cudaMemset(d_output, 0, width * height * 4)); - // call CUDA kernel, writing results to PBO - render_kernel(gridSize, blockSize, d_output, width, height, density, - brightness, transferOffset, transferScale); + // call CUDA kernel, writing results to PBO + render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); - getLastCudaError("kernel failed"); + getLastCudaError("kernel failed"); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); } // display results using OpenGL (called by GLUT) -void display() { - sdkStartTimer(&timer); +void display() +{ + sdkStartTimer(&timer); - // use OpenGL to build view matrix - GLfloat modelView[16]; - glMatrixMode(GL_MODELVIEW); - glPushMatrix(); - glLoadIdentity(); - glRotatef(-viewRotation.x, 1.0, 0.0, 0.0); - glRotatef(-viewRotation.y, 0.0, 1.0, 0.0); - glTranslatef(-viewTranslation.x, -viewTranslation.y, -viewTranslation.z); - glGetFloatv(GL_MODELVIEW_MATRIX, modelView); - glPopMatrix(); + // use OpenGL to build view matrix + GLfloat modelView[16]; + glMatrixMode(GL_MODELVIEW); + glPushMatrix(); + glLoadIdentity(); + glRotatef(-viewRotation.x, 1.0, 0.0, 0.0); + glRotatef(-viewRotation.y, 0.0, 1.0, 0.0); + glTranslatef(-viewTranslation.x, -viewTranslation.y, -viewTranslation.z); + glGetFloatv(GL_MODELVIEW_MATRIX, modelView); + glPopMatrix(); - invViewMatrix[0] = modelView[0]; - invViewMatrix[1] = modelView[4]; - invViewMatrix[2] = modelView[8]; - invViewMatrix[3] = modelView[12]; - invViewMatrix[4] = modelView[1]; - invViewMatrix[5] = modelView[5]; - invViewMatrix[6] = modelView[9]; - invViewMatrix[7] = modelView[13]; - invViewMatrix[8] = modelView[2]; - invViewMatrix[9] = modelView[6]; - invViewMatrix[10] = modelView[10]; - invViewMatrix[11] = modelView[14]; + invViewMatrix[0] = modelView[0]; + invViewMatrix[1] = modelView[4]; + invViewMatrix[2] = modelView[8]; + invViewMatrix[3] = modelView[12]; + invViewMatrix[4] = modelView[1]; + invViewMatrix[5] = modelView[5]; + invViewMatrix[6] = modelView[9]; + invViewMatrix[7] = modelView[13]; + invViewMatrix[8] = modelView[2]; + invViewMatrix[9] = modelView[6]; + invViewMatrix[10] = modelView[10]; + invViewMatrix[11] = modelView[14]; - 
render(); + render(); - // display results - glClear(GL_COLOR_BUFFER_BIT); + // display results + glClear(GL_COLOR_BUFFER_BIT); - // draw image from PBO - glDisable(GL_DEPTH_TEST); + // draw image from PBO + glDisable(GL_DEPTH_TEST); - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); #if 0 // draw using glDrawPixels (slower) glRasterPos2i(0, 0); @@ -221,417 +226,423 @@ void display() { glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); #else - // draw using texture + // draw using texture - // copy from pbo to texture - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBindTexture(GL_TEXTURE_2D, tex); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, - GL_UNSIGNED_BYTE, 0); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // copy from pbo to texture + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, tex); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // draw textured quad - glEnable(GL_TEXTURE_2D); - glBegin(GL_QUADS); - glTexCoord2f(0, 0); - glVertex2f(0, 0); - glTexCoord2f(1, 0); - glVertex2f(1, 0); - glTexCoord2f(1, 1); - glVertex2f(1, 1); - glTexCoord2f(0, 1); - glVertex2f(0, 1); - glEnd(); + // draw textured quad + glEnable(GL_TEXTURE_2D); + glBegin(GL_QUADS); + glTexCoord2f(0, 0); + glVertex2f(0, 0); + glTexCoord2f(1, 0); + glVertex2f(1, 0); + glTexCoord2f(1, 1); + glVertex2f(1, 1); + glTexCoord2f(0, 1); + glVertex2f(0, 1); + glEnd(); - glDisable(GL_TEXTURE_2D); - glBindTexture(GL_TEXTURE_2D, 0); + glDisable(GL_TEXTURE_2D); + glBindTexture(GL_TEXTURE_2D, 0); #endif - glutSwapBuffers(); - glutReportErrors(); + glutSwapBuffers(); + glutReportErrors(); - sdkStopTimer(&timer); + sdkStopTimer(&timer); - computeFPS(); + computeFPS(); } void idle() { glutPostRedisplay(); } -void keyboard(unsigned char key, int x, int y) { - switch (key) { +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS); #else - glutDestroyWindow(glutGetWindow()); - return; + glutDestroyWindow(glutGetWindow()); + return; #endif - break; + break; case 'f': - linearFiltering = !linearFiltering; - setTextureFilterMode(linearFiltering); - break; + linearFiltering = !linearFiltering; + setTextureFilterMode(linearFiltering); + break; case '+': - density += 0.01f; - break; + density += 0.01f; + break; case '-': - density -= 0.01f; - break; + density -= 0.01f; + break; case ']': - brightness += 0.1f; - break; + brightness += 0.1f; + break; case '[': - brightness -= 0.1f; - break; + brightness -= 0.1f; + break; case ';': - transferOffset += 0.01f; - break; + transferOffset += 0.01f; + break; case '\'': - transferOffset -= 0.01f; - break; + transferOffset -= 0.01f; + break; case '.': - transferScale += 0.01f; - break; + transferScale += 0.01f; + break; case ',': - transferScale -= 0.01f; - break; + transferScale -= 0.01f; + break; default: - break; - } + break; + } - printf( - "density = %.2f, brightness = %.2f, transferOffset = %.2f, transferScale " - "= %.2f\n", - density, brightness, transferOffset, transferScale); - glutPostRedisplay(); + printf("density = %.2f, brightness = %.2f, transferOffset = %.2f, transferScale " + "= %.2f\n", + density, + brightness, + transferOffset, + transferScale); + glutPostRedisplay(); } int ox, oy; int buttonState = 0; -void mouse(int button, int state, int x, int y) { - 
if (state == GLUT_DOWN) { - buttonState |= 1 << button; - } else if (state == GLUT_UP) { - buttonState = 0; - } +void mouse(int button, int state, int x, int y) +{ + if (state == GLUT_DOWN) { + buttonState |= 1 << button; + } + else if (state == GLUT_UP) { + buttonState = 0; + } - ox = x; - oy = y; - glutPostRedisplay(); + ox = x; + oy = y; + glutPostRedisplay(); } -void motion(int x, int y) { - float dx, dy; - dx = (float)(x - ox); - dy = (float)(y - oy); +void motion(int x, int y) +{ + float dx, dy; + dx = (float)(x - ox); + dy = (float)(y - oy); - if (buttonState == 4) { - // right = zoom - viewTranslation.z += dy / 100.0f; - } else if (buttonState == 2) { - // middle = translate - viewTranslation.x += dx / 100.0f; - viewTranslation.y -= dy / 100.0f; - } else if (buttonState == 1) { - // left = rotate - viewRotation.x += dy / 5.0f; - viewRotation.y += dx / 5.0f; - } + if (buttonState == 4) { + // right = zoom + viewTranslation.z += dy / 100.0f; + } + else if (buttonState == 2) { + // middle = translate + viewTranslation.x += dx / 100.0f; + viewTranslation.y -= dy / 100.0f; + } + else if (buttonState == 1) { + // left = rotate + viewRotation.x += dy / 5.0f; + viewRotation.y += dx / 5.0f; + } - ox = x; - oy = y; - glutPostRedisplay(); + ox = x; + oy = y; + glutPostRedisplay(); } int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } -void reshape(int w, int h) { - width = w; - height = h; - initPixelBuffer(); +void reshape(int w, int h) +{ + width = w; + height = h; + initPixelBuffer(); - // calculate new grid size - gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); + // calculate new grid size + gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); - glViewport(0, 0, w, h); + glViewport(0, 0, w, h); - glMatrixMode(GL_MODELVIEW); - glLoadIdentity(); + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } -void cleanup() { - sdkDeleteTimer(&timer); +void cleanup() +{ + sdkDeleteTimer(&timer); - freeCudaBuffers(); + freeCudaBuffers(); - if (pbo) { - cudaGraphicsUnregisterResource(cuda_pbo_resource); - glDeleteBuffers(1, &pbo); - glDeleteTextures(1, &tex); - } - // Calling cudaProfilerStop causes all profile data to be - // flushed before the application exits - checkCudaErrors(cudaProfilerStop()); + if (pbo) { + cudaGraphicsUnregisterResource(cuda_pbo_resource); + glDeleteBuffers(1, &pbo); + glDeleteTextures(1, &tex); + } + // Calling cudaProfilerStop causes all profile data to be + // flushed before the application exits + checkCudaErrors(cudaProfilerStop()); } -void initGL(int *argc, char **argv) { - // initialize GLUT callback functions - glutInit(argc, argv); - glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); - glutInitWindowSize(width, height); - glutCreateWindow("CUDA volume rendering"); +void initGL(int *argc, char **argv) +{ + // initialize GLUT callback functions + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow("CUDA volume rendering"); - if (!isGLVersionSupported(2, 0) || - !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { - printf("Required OpenGL extensions are missing."); - exit(EXIT_SUCCESS); - } + if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { + printf("Required OpenGL extensions are missing."); + 
exit(EXIT_SUCCESS); + } } -void initPixelBuffer() { - if (pbo) { - // unregister this buffer object from CUDA C - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); +void initPixelBuffer() +{ + if (pbo) { + // unregister this buffer object from CUDA C + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); - // delete old buffer - glDeleteBuffers(1, &pbo); - glDeleteTextures(1, &tex); - } + // delete old buffer + glDeleteBuffers(1, &pbo); + glDeleteTextures(1, &tex); + } - // create pixel buffer object for display - glGenBuffers(1, &pbo); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); - glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, - 0, GL_STREAM_DRAW_ARB); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + // create pixel buffer object for display + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); - // register this buffer object with CUDA - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); - // create texture for display - glGenTextures(1, &tex); - glBindTexture(GL_TEXTURE_2D, tex); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, - GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); + // create texture for display + glGenTextures(1, &tex); + glBindTexture(GL_TEXTURE_2D, tex); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); } // Load raw data from disk -void *loadRawFile(char *filename, size_t size) { - FILE *fp = fopen(filename, "rb"); +void *loadRawFile(char *filename, size_t size) +{ + FILE *fp = fopen(filename, "rb"); - if (!fp) { - fprintf(stderr, "Error opening file '%s'\n", filename); - return 0; - } - - void *data = malloc(size); - size_t read = fread(data, 1, size, fp); - fclose(fp); - -#if defined(_MSC_VER_) - printf("Read '%s', %Iu bytes\n", filename, read); -#else - printf("Read '%s', %zu bytes\n", filename, read); -#endif - - return data; -} - -void runSingleTest(const char *ref_file, const char *exec_path) { - bool bTestResult = true; - - uint *d_output; - checkCudaErrors( - cudaMalloc((void **)&d_output, width * height * sizeof(uint))); - checkCudaErrors(cudaMemset(d_output, 0, width * height * sizeof(uint))); - - float modelView[16] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f}; - - invViewMatrix[0] = modelView[0]; - invViewMatrix[1] = modelView[4]; - invViewMatrix[2] = modelView[8]; - invViewMatrix[3] = modelView[12]; - invViewMatrix[4] = modelView[1]; - invViewMatrix[5] = modelView[5]; - invViewMatrix[6] = modelView[9]; - invViewMatrix[7] = modelView[13]; - invViewMatrix[8] = modelView[2]; - invViewMatrix[9] = modelView[6]; - invViewMatrix[10] = modelView[10]; - invViewMatrix[11] = modelView[14]; - - // call CUDA kernel, writing results to PBO - copyInvViewMatrix(invViewMatrix, sizeof(float4) * 
3); - - // Start timer 0 and process n loops on the GPU - int nIter = 10; - - for (int i = -1; i < nIter; i++) { - if (i == 0) { - cudaDeviceSynchronize(); - sdkStartTimer(&timer); + if (!fp) { + fprintf(stderr, "Error opening file '%s'\n", filename); + return 0; } - render_kernel(gridSize, blockSize, d_output, width, height, density, - brightness, transferOffset, transferScale); - } + void *data = malloc(size); + size_t read = fread(data, 1, size, fp); + fclose(fp); - cudaDeviceSynchronize(); - sdkStopTimer(&timer); - // Get elapsed time and throughput, then log to sample and master logs - double dAvgTime = sdkGetTimerValue(&timer) / (nIter * 1000.0); - printf( - "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u " - "Texels, NumDevsUsed = %u, Workgroup = %u\n", - (1.0e-6 * width * height) / dAvgTime, dAvgTime, (width * height), 1, - blockSize.x * blockSize.y); +#if defined(_MSC_VER_) + printf("Read '%s', %Iu bytes\n", filename, read); +#else + printf("Read '%s', %zu bytes\n", filename, read); +#endif - getLastCudaError("Error: render_kernel() execution FAILED"); - checkCudaErrors(cudaDeviceSynchronize()); + return data; +} - unsigned char *h_output = (unsigned char *)malloc(width * height * 4); - checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * 4, - cudaMemcpyDeviceToHost)); +void runSingleTest(const char *ref_file, const char *exec_path) +{ + bool bTestResult = true; - sdkSavePPM4ub("volume.ppm", h_output, width, height); - bTestResult = - sdkComparePPM("volume.ppm", sdkFindFilePath(ref_file, exec_path), - MAX_EPSILON_ERROR, THRESHOLD, true); + uint *d_output; + checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(uint))); + checkCudaErrors(cudaMemset(d_output, 0, width * height * sizeof(uint))); - cudaFree(d_output); - free(h_output); - cleanup(); + float modelView[16] = { + 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f}; - exit(bTestResult ? 
EXIT_SUCCESS : EXIT_FAILURE); + invViewMatrix[0] = modelView[0]; + invViewMatrix[1] = modelView[4]; + invViewMatrix[2] = modelView[8]; + invViewMatrix[3] = modelView[12]; + invViewMatrix[4] = modelView[1]; + invViewMatrix[5] = modelView[5]; + invViewMatrix[6] = modelView[9]; + invViewMatrix[7] = modelView[13]; + invViewMatrix[8] = modelView[2]; + invViewMatrix[9] = modelView[6]; + invViewMatrix[10] = modelView[10]; + invViewMatrix[11] = modelView[14]; + + // call CUDA kernel, writing results to PBO + copyInvViewMatrix(invViewMatrix, sizeof(float4) * 3); + + // Start timer 0 and process n loops on the GPU + int nIter = 10; + + for (int i = -1; i < nIter; i++) { + if (i == 0) { + cudaDeviceSynchronize(); + sdkStartTimer(&timer); + } + + render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); + } + + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + // Get elapsed time and throughput, then log to sample and master logs + double dAvgTime = sdkGetTimerValue(&timer) / (nIter * 1000.0); + printf("volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u " + "Texels, NumDevsUsed = %u, Workgroup = %u\n", + (1.0e-6 * width * height) / dAvgTime, + dAvgTime, + (width * height), + 1, + blockSize.x * blockSize.y); + + getLastCudaError("Error: render_kernel() execution FAILED"); + checkCudaErrors(cudaDeviceSynchronize()); + + unsigned char *h_output = (unsigned char *)malloc(width * height * 4); + checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * 4, cudaMemcpyDeviceToHost)); + + sdkSavePPM4ub("volume.ppm", h_output, width, height); + bTestResult = sdkComparePPM("volume.ppm", sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, true); + + cudaFree(d_output); + free(h_output); + cleanup(); + + exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - pArgc = &argc; - pArgv = argv; +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; - char *ref_file = NULL; + char *ref_file = NULL; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - // start logs - printf("%s Starting...\n\n", sSDKsample); + // start logs + printf("%s Starting...\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); - fpsLimit = frameCheckNumber; - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + fpsLimit = frameCheckNumber; + } - if (ref_file) { - findCudaDevice(argc, (const char **)argv); - } else { - // First initialize OpenGL context, so we can properly set the GL for CUDA. - // This is necessary in order to achieve optimal performance with - // OpenGL/CUDA interop. - initGL(&argc, argv); + if (ref_file) { + findCudaDevice(argc, (const char **)argv); + } + else { + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with + // OpenGL/CUDA interop. 
+ initGL(&argc, argv); - findCudaDevice(argc, (const char **)argv); - } + findCudaDevice(argc, (const char **)argv); + } - // parse arguments - char *filename; + // parse arguments + char *filename; - if (getCmdLineArgumentString(argc, (const char **)argv, "volume", - &filename)) { - volumeFilename = filename; - } + if (getCmdLineArgumentString(argc, (const char **)argv, "volume", &filename)) { + volumeFilename = filename; + } - int n; + int n; - if (checkCmdLineFlag(argc, (const char **)argv, "size")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "size"); - volumeSize.width = volumeSize.height = volumeSize.depth = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "size")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "size"); + volumeSize.width = volumeSize.height = volumeSize.depth = n; + } - if (checkCmdLineFlag(argc, (const char **)argv, "xsize")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "xsize"); - volumeSize.width = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "xsize")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "xsize"); + volumeSize.width = n; + } - if (checkCmdLineFlag(argc, (const char **)argv, "ysize")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "ysize"); - volumeSize.height = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "ysize")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "ysize"); + volumeSize.height = n; + } - if (checkCmdLineFlag(argc, (const char **)argv, "zsize")) { - n = getCmdLineArgumentInt(argc, (const char **)argv, "zsize"); - volumeSize.depth = n; - } + if (checkCmdLineFlag(argc, (const char **)argv, "zsize")) { + n = getCmdLineArgumentInt(argc, (const char **)argv, "zsize"); + volumeSize.depth = n; + } - // load volume data - char *path = sdkFindFilePath(volumeFilename, argv[0]); + // load volume data + char *path = sdkFindFilePath(volumeFilename, argv[0]); - if (path == 0) { - printf("Error finding file '%s'\n", volumeFilename); - exit(EXIT_FAILURE); - } + if (path == 0) { + printf("Error finding file '%s'\n", volumeFilename); + exit(EXIT_FAILURE); + } - size_t size = volumeSize.width * volumeSize.height * volumeSize.depth * - sizeof(VolumeType); - void *h_volume = loadRawFile(path, size); + size_t size = volumeSize.width * volumeSize.height * volumeSize.depth * sizeof(VolumeType); + void *h_volume = loadRawFile(path, size); - initCuda(h_volume, volumeSize); - free(h_volume); + initCuda(h_volume, volumeSize); + free(h_volume); - sdkCreateTimer(&timer); + sdkCreateTimer(&timer); - printf( - "Press '+' and '-' to change density (0.01 increments)\n" - " ']' and '[' to change brightness\n" - " ';' and ''' to modify transfer function offset\n" - " '.' and ',' to modify transfer function scale\n\n"); + printf("Press '+' and '-' to change density (0.01 increments)\n" + " ']' and '[' to change brightness\n" + " ';' and ''' to modify transfer function offset\n" + " '.' 
and ',' to modify transfer function scale\n\n"); - // calculate new grid size - gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); + // calculate new grid size + gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); - if (ref_file) { - runSingleTest(ref_file, argv[0]); - } else { - // This is the normal rendering path for VolumeRender - glutDisplayFunc(display); - glutKeyboardFunc(keyboard); - glutMouseFunc(mouse); - glutMotionFunc(motion); - glutReshapeFunc(reshape); - glutIdleFunc(idle); + if (ref_file) { + runSingleTest(ref_file, argv[0]); + } + else { + // This is the normal rendering path for VolumeRender + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMouseFunc(mouse); + glutMotionFunc(motion); + glutReshapeFunc(reshape); + glutIdleFunc(idle); - initPixelBuffer(); + initPixelBuffer(); #if defined(__APPLE__) || defined(MACOSX) - atexit(cleanup); + atexit(cleanup); #else - glutCloseFunc(cleanup); + glutCloseFunc(cleanup); #endif - glutMainLoop(); - } + glutMainLoop(); + } } diff --git a/Samples/5_Domain_Specific/volumeRender/volumeRender_kernel.cu b/Samples/5_Domain_Specific/volumeRender/volumeRender_kernel.cu index 2f037947..34e05a66 100644 --- a/Samples/5_Domain_Specific/volumeRender/volumeRender_kernel.cu +++ b/Samples/5_Domain_Specific/volumeRender/volumeRender_kernel.cu @@ -33,7 +33,7 @@ #include #include -typedef unsigned int uint; +typedef unsigned int uint; typedef unsigned char uchar; cudaArray *d_volumeArray = 0; @@ -42,270 +42,328 @@ cudaArray *d_transferFuncArray; typedef unsigned char VolumeType; // typedef unsigned short VolumeType; -cudaTextureObject_t texObject; // For 3D texture -cudaTextureObject_t transferTex; // For 1D transfer function texture +cudaTextureObject_t texObject; // For 3D texture +cudaTextureObject_t transferTex; // For 1D transfer function texture -typedef struct { float4 m[3]; } float3x4; +typedef struct +{ + float4 m[3]; +} float3x4; -__constant__ float3x4 c_invViewMatrix; // inverse view matrix +__constant__ float3x4 c_invViewMatrix; // inverse view matrix -struct Ray { - float3 o; // origin - float3 d; // direction +struct Ray +{ + float3 o; // origin + float3 d; // direction }; // intersect ray with a box // http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm -__device__ int intersectBox(Ray r, float3 boxmin, float3 boxmax, float *tnear, - float *tfar) { - // compute intersection of ray with all six bbox planes - float3 invR = make_float3(1.0f) / r.d; - float3 tbot = invR * (boxmin - r.o); - float3 ttop = invR * (boxmax - r.o); +__device__ int intersectBox(Ray r, float3 boxmin, float3 boxmax, float *tnear, float *tfar) +{ + // compute intersection of ray with all six bbox planes + float3 invR = make_float3(1.0f) / r.d; + float3 tbot = invR * (boxmin - r.o); + float3 ttop = invR * (boxmax - r.o); - // re-order intersections to find smallest and largest on each axis - float3 tmin = fminf(ttop, tbot); - float3 tmax = fmaxf(ttop, tbot); + // re-order intersections to find smallest and largest on each axis + float3 tmin = fminf(ttop, tbot); + float3 tmax = fmaxf(ttop, tbot); - // find the largest tmin and the smallest tmax - float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); - float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z)); + // find the largest tmin and the smallest tmax + float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); + float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, 
tmax.z)); - *tnear = largest_tmin; - *tfar = smallest_tmax; + *tnear = largest_tmin; + *tfar = smallest_tmax; - return smallest_tmax > largest_tmin; + return smallest_tmax > largest_tmin; } // transform vector by matrix (no translation) -__device__ float3 mul(const float3x4 &M, const float3 &v) { - float3 r; - r.x = dot(v, make_float3(M.m[0])); - r.y = dot(v, make_float3(M.m[1])); - r.z = dot(v, make_float3(M.m[2])); - return r; +__device__ float3 mul(const float3x4 &M, const float3 &v) +{ + float3 r; + r.x = dot(v, make_float3(M.m[0])); + r.y = dot(v, make_float3(M.m[1])); + r.z = dot(v, make_float3(M.m[2])); + return r; } // transform vector by matrix with translation -__device__ float4 mul(const float3x4 &M, const float4 &v) { - float4 r; - r.x = dot(v, M.m[0]); - r.y = dot(v, M.m[1]); - r.z = dot(v, M.m[2]); - r.w = 1.0f; - return r; +__device__ float4 mul(const float3x4 &M, const float4 &v) +{ + float4 r; + r.x = dot(v, M.m[0]); + r.y = dot(v, M.m[1]); + r.z = dot(v, M.m[2]); + r.w = 1.0f; + return r; } -__device__ uint rgbaFloatToInt(float4 rgba) { - rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] - rgba.y = __saturatef(rgba.y); - rgba.z = __saturatef(rgba.z); - rgba.w = __saturatef(rgba.w); - return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | - (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); } -__global__ void d_render(uint *d_output, uint imageW, uint imageH, - float density, float brightness, float transferOffset, - float transferScale, cudaTextureObject_t tex, - cudaTextureObject_t transferTex) { - const int maxSteps = 500; - const float tstep = 0.01f; - const float opacityThreshold = 0.95f; - const float3 boxMin = make_float3(-1.0f, -1.0f, -1.0f); - const float3 boxMax = make_float3(1.0f, 1.0f, 1.0f); +__global__ void d_render(uint *d_output, + uint imageW, + uint imageH, + float density, + float brightness, + float transferOffset, + float transferScale, + cudaTextureObject_t tex, + cudaTextureObject_t transferTex) +{ + const int maxSteps = 500; + const float tstep = 0.01f; + const float opacityThreshold = 0.95f; + const float3 boxMin = make_float3(-1.0f, -1.0f, -1.0f); + const float3 boxMax = make_float3(1.0f, 1.0f, 1.0f); - uint x = blockIdx.x * blockDim.x + threadIdx.x; - uint y = blockIdx.y * blockDim.y + threadIdx.y; + uint x = blockIdx.x * blockDim.x + threadIdx.x; + uint y = blockIdx.y * blockDim.y + threadIdx.y; - if ((x >= imageW) || (y >= imageH)) return; + if ((x >= imageW) || (y >= imageH)) + return; - float u = (x / (float)imageW) * 2.0f - 1.0f; - float v = (y / (float)imageH) * 2.0f - 1.0f; + float u = (x / (float)imageW) * 2.0f - 1.0f; + float v = (y / (float)imageH) * 2.0f - 1.0f; - // calculate eye ray in world space - Ray eyeRay; - eyeRay.o = - make_float3(mul(c_invViewMatrix, make_float4(0.0f, 0.0f, 0.0f, 1.0f))); - eyeRay.d = normalize(make_float3(u, v, -2.0f)); - eyeRay.d = mul(c_invViewMatrix, eyeRay.d); + // calculate eye ray in world space + Ray eyeRay; + eyeRay.o = make_float3(mul(c_invViewMatrix, make_float4(0.0f, 0.0f, 0.0f, 1.0f))); + eyeRay.d = normalize(make_float3(u, v, -2.0f)); + eyeRay.d = mul(c_invViewMatrix, eyeRay.d); - // find intersection with box - float tnear, tfar; - int hit = intersectBox(eyeRay, boxMin, 
boxMax, &tnear, &tfar);

-  if (!hit) return;
+    if (!hit)
+        return;

-  if (tnear < 0.0f) tnear = 0.0f;  // clamp to near plane
+    if (tnear < 0.0f)
+        tnear = 0.0f; // clamp to near plane

-  // march along ray from front to back, accumulating color
-  float4 sum = make_float4(0.0f);
-  float t = tnear;
-  float3 pos = eyeRay.o + eyeRay.d * tnear;
-  float3 step = eyeRay.d * tstep;
+    // march along ray from front to back, accumulating color
+    float4 sum = make_float4(0.0f);
+    float t = tnear;
+    float3 pos = eyeRay.o + eyeRay.d * tnear;
+    float3 step = eyeRay.d * tstep;

-  for (int i = 0; i < maxSteps; i++) {
-    // read from 3D texture
-    // remap position to [0, 1] coordinates
-    float sample = tex3D<float>(tex, pos.x * 0.5f + 0.5f, pos.y * 0.5f + 0.5f,
-                                pos.z * 0.5f + 0.5f);
-    // sample *= 64.0f;  // scale for 10-bit data
+    for (int i = 0; i < maxSteps; i++) {
+        // read from 3D texture
+        // remap position to [0, 1] coordinates
+        float sample = tex3D<float>(tex, pos.x * 0.5f + 0.5f, pos.y * 0.5f + 0.5f, pos.z * 0.5f + 0.5f);
+        // sample *= 64.0f;  // scale for 10-bit data

-    // lookup in transfer function texture
-    float4 col =
-        tex1D<float4>(transferTex, (sample - transferOffset) * transferScale);
-    col.w *= density;
+        // lookup in transfer function texture
+        float4 col = tex1D<float4>(transferTex, (sample - transferOffset) * transferScale);
+        col.w *= density;

-    // "under" operator for back-to-front blending
-    // sum = lerp(sum, col, col.w);
+        // "under" operator for back-to-front blending
+        // sum = lerp(sum, col, col.w);

-    // pre-multiply alpha
-    col.x *= col.w;
-    col.y *= col.w;
-    col.z *= col.w;
-    // "over" operator for front-to-back blending
-    sum = sum + col * (1.0f - sum.w);
+        // pre-multiply alpha
+        col.x *= col.w;
+        col.y *= col.w;
+        col.z *= col.w;
+        // "over" operator for front-to-back blending
+        sum = sum + col * (1.0f - sum.w);

-    // exit early if opaque
-    if (sum.w > opacityThreshold) break;
+        // exit early if opaque
+        if (sum.w > opacityThreshold)
+            break;

-    t += tstep;
+        t += tstep;

-    if (t > tfar) break;
+        if (t > tfar)
+            break;

-    pos += step;
-  }
+        pos += step;
+    }

-  sum *= brightness;
+    sum *= brightness;

-  // write output color
-  d_output[y * imageW + x] = rgbaFloatToInt(sum);
+    // write output color
+    d_output[y * imageW + x] = rgbaFloatToInt(sum);
 }

-extern "C" void setTextureFilterMode(bool bLinearFilter) {
-  if (texObject) {
+extern "C" void setTextureFilterMode(bool bLinearFilter)
+{
+    if (texObject) {
+        checkCudaErrors(cudaDestroyTextureObject(texObject));
+    }
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));
+
+    texRes.resType = cudaResourceTypeArray;
+    texRes.res.array.array = d_volumeArray;
+
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));
+
+    texDescr.normalizedCoords = true;
+    texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
+
+    texDescr.addressMode[0] = cudaAddressModeWrap;
+    texDescr.addressMode[1] = cudaAddressModeWrap;
+    texDescr.addressMode[2] = cudaAddressModeWrap;
+
+    texDescr.readMode = cudaReadModeNormalizedFloat;
+
+    checkCudaErrors(cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL));
+}
+
+extern "C" void initCuda(void *h_volume, cudaExtent volumeSize)
+{
+    // create 3D array
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
+    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
+
+    // copy data to 3D array
+    cudaMemcpy3DParms copyParams = {0};
+    copyParams.srcPtr =
+        make_cudaPitchedPtr(h_volume, volumeSize.width * sizeof(VolumeType), volumeSize.width, volumeSize.height);
+    copyParams.dstArray = d_volumeArray;
+    copyParams.extent = volumeSize;
+    copyParams.kind = cudaMemcpyHostToDevice;
+    checkCudaErrors(cudaMemcpy3D(&copyParams));
+
+    cudaResourceDesc texRes;
+    memset(&texRes, 0, sizeof(cudaResourceDesc));
+
+    texRes.resType = cudaResourceTypeArray;
+    texRes.res.array.array = d_volumeArray;
+
+    cudaTextureDesc texDescr;
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));
+
+    texDescr.normalizedCoords = true;           // access with normalized texture coordinates
+    texDescr.filterMode = cudaFilterModeLinear; // linear interpolation
+
+    texDescr.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates
+    texDescr.addressMode[1] = cudaAddressModeClamp;
+    texDescr.addressMode[2] = cudaAddressModeClamp;
+
+    texDescr.readMode = cudaReadModeNormalizedFloat;
+
+    checkCudaErrors(cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL));
+
+    // create transfer function texture
+    float4 transferFunc[] = {
+        {
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        },
+        {
+            1.0,
+            0.0,
+            0.0,
+            1.0,
+        },
+        {
+            1.0,
+            0.5,
+            0.0,
+            1.0,
+        },
+        {
+            1.0,
+            1.0,
+            0.0,
+            1.0,
+        },
+        {
+            0.0,
+            1.0,
+            0.0,
+            1.0,
+        },
+        {
+            0.0,
+            1.0,
+            1.0,
+            1.0,
+        },
+        {
+            0.0,
+            0.0,
+            1.0,
+            1.0,
+        },
+        {
+            1.0,
+            0.0,
+            1.0,
+            1.0,
+        },
+        {
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        },
+    };
+
+    cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<float4>();
+    cudaArray *d_transferFuncArray;
+    checkCudaErrors(cudaMallocArray(&d_transferFuncArray, &channelDesc2, sizeof(transferFunc) / sizeof(float4), 1));
+    checkCudaErrors(cudaMemcpy2DToArray(
+        d_transferFuncArray, 0, 0, transferFunc, 0, sizeof(transferFunc), 1, cudaMemcpyHostToDevice));
+
+    memset(&texRes, 0, sizeof(cudaResourceDesc));
+
+    texRes.resType = cudaResourceTypeArray;
+    texRes.res.array.array = d_transferFuncArray;
+
+    memset(&texDescr, 0, sizeof(cudaTextureDesc));
+
+    texDescr.normalizedCoords = true; // access with normalized texture coordinates
+    texDescr.filterMode = cudaFilterModeLinear;
+
+    texDescr.addressMode[0] = cudaAddressModeClamp; // wrap texture coordinates
+
+    texDescr.readMode = cudaReadModeElementType;
+
+    checkCudaErrors(cudaCreateTextureObject(&transferTex, &texRes, &texDescr, NULL));
+}
+
+extern "C" void freeCudaBuffers()
+{
     checkCudaErrors(cudaDestroyTextureObject(texObject));
-  }
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
-
-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = d_volumeArray;
-
-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
-
-  texDescr.normalizedCoords = true;
-  texDescr.filterMode =
-      bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
-
-  texDescr.addressMode[0] = cudaAddressModeWrap;
-  texDescr.addressMode[1] = cudaAddressModeWrap;
-  texDescr.addressMode[2] = cudaAddressModeWrap;
-
-  texDescr.readMode = cudaReadModeNormalizedFloat;
-
-  checkCudaErrors(
-      cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaDestroyTextureObject(transferTex));
+    checkCudaErrors(cudaFreeArray(d_volumeArray));
+    checkCudaErrors(cudaFreeArray(d_transferFuncArray));
 }

-extern "C" void initCuda(void *h_volume, cudaExtent volumeSize) {
-  // create 3D array
-  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
-  checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
-
-  // copy data to 3D array
-  cudaMemcpy3DParms copyParams = {0};
-  copyParams.srcPtr =
-      make_cudaPitchedPtr(h_volume, volumeSize.width * sizeof(VolumeType),
-                          volumeSize.width, volumeSize.height);
-  copyParams.dstArray = d_volumeArray;
-  copyParams.extent = volumeSize;
-  copyParams.kind = cudaMemcpyHostToDevice;
-  checkCudaErrors(cudaMemcpy3D(&copyParams));
-
-  cudaResourceDesc texRes;
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
-
-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = d_volumeArray;
-
-  cudaTextureDesc texDescr;
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
-
-  texDescr.normalizedCoords =
-      true;  // access with normalized texture coordinates
-  texDescr.filterMode = cudaFilterModeLinear;  // linear interpolation
-
-  texDescr.addressMode[0] = cudaAddressModeClamp;  // clamp texture coordinates
-  texDescr.addressMode[1] = cudaAddressModeClamp;
-  texDescr.addressMode[2] = cudaAddressModeClamp;
-
-  texDescr.readMode = cudaReadModeNormalizedFloat;
-
-  checkCudaErrors(
-      cudaCreateTextureObject(&texObject, &texRes, &texDescr, NULL));
-
-  // create transfer function texture
-  float4 transferFunc[] = {
-      { 0.0, 0.0, 0.0, 0.0, },
-      { 1.0, 0.0, 0.0, 1.0, },
-      { 1.0, 0.5, 0.0, 1.0, },
-      { 1.0, 1.0, 0.0, 1.0, },
-      { 0.0, 1.0, 0.0, 1.0, },
-      { 0.0, 1.0, 1.0, 1.0, },
-      { 0.0, 0.0, 1.0, 1.0, },
-      { 1.0, 0.0, 1.0, 1.0, },
-      { 0.0, 0.0, 0.0, 0.0, },
-  };
-
-  cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<float4>();
-  cudaArray *d_transferFuncArray;
-  checkCudaErrors(cudaMallocArray(&d_transferFuncArray, &channelDesc2,
-                                  sizeof(transferFunc) / sizeof(float4), 1));
-  checkCudaErrors(cudaMemcpy2DToArray(d_transferFuncArray, 0, 0, transferFunc,
-                                      0, sizeof(transferFunc), 1,
-                                      cudaMemcpyHostToDevice));
-
-  memset(&texRes, 0, sizeof(cudaResourceDesc));
-
-  texRes.resType = cudaResourceTypeArray;
-  texRes.res.array.array = d_transferFuncArray;
-
-  memset(&texDescr, 0, sizeof(cudaTextureDesc));
-
-  texDescr.normalizedCoords =
-      true;  // access with normalized texture coordinates
-  texDescr.filterMode = cudaFilterModeLinear;
-
-  texDescr.addressMode[0] = cudaAddressModeClamp;  // wrap texture coordinates
-
-  texDescr.readMode = cudaReadModeElementType;
-
-  checkCudaErrors(
-      cudaCreateTextureObject(&transferTex, &texRes, &texDescr, NULL));
+extern "C" void render_kernel(dim3 gridSize,
+                              dim3 blockSize,
+                              uint *d_output,
+                              uint imageW,
+                              uint imageH,
+                              float density,
+                              float brightness,
+                              float transferOffset,
+                              float transferScale)
+{
+    d_render<<<gridSize, blockSize>>>(
+        d_output, imageW, imageH, density, brightness, transferOffset, transferScale, texObject, transferTex);
 }

-extern "C" void freeCudaBuffers() {
-  checkCudaErrors(cudaDestroyTextureObject(texObject));
-  checkCudaErrors(cudaDestroyTextureObject(transferTex));
-  checkCudaErrors(cudaFreeArray(d_volumeArray));
-  checkCudaErrors(cudaFreeArray(d_transferFuncArray));
+extern "C" void copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix)
+{
+    checkCudaErrors(cudaMemcpyToSymbol(c_invViewMatrix, invViewMatrix, sizeofMatrix));
 }

-extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
-                              uint imageW, uint imageH, float density,
-                              float brightness, float transferOffset,
-                              float transferScale) {
-  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, density,
-                                    brightness, transferOffset, transferScale,
-                                    texObject, transferTex);
-}
-
-extern "C" void copyInvViewMatrix(float *invViewMatrix, size_t sizeofMatrix) {
-  checkCudaErrors(
-      cudaMemcpyToSymbol(c_invViewMatrix, invViewMatrix, sizeofMatrix));
-}
-
-#endif  // #ifndef _VOLUMERENDER_KERNEL_CU_
+#endif // #ifndef _VOLUMERENDER_KERNEL_CU_
diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/Build_instructions.txt b/Samples/5_Domain_Specific/vulkanImageCUDA/Build_instructions.txt
index 48ce584a..787e98d4 100644
--- a/Samples/5_Domain_Specific/vulkanImageCUDA/Build_instructions.txt
+++ b/Samples/5_Domain_Specific/vulkanImageCUDA/Build_instructions.txt
@@ -21,7 +21,7 @@ For Linux:
 For Linux aarch64(L4T):
--- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 
+-- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3
 -- install above will also provide libvulkan-dev as dependencies
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/README.md b/Samples/5_Domain_Specific/vulkanImageCUDA/README.md
index 44ef3f62..9a742ce4 100644
--- a/Samples/5_Domain_Specific/vulkanImageCUDA/README.md
+++ b/Samples/5_Domain_Specific/vulkanImageCUDA/README.md
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads)
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.
 
 ## References (for more details)
-
diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/linmath.h b/Samples/5_Domain_Specific/vulkanImageCUDA/linmath.h
index b4d386cc..6369f224 100644
--- a/Samples/5_Domain_Specific/vulkanImageCUDA/linmath.h
+++ b/Samples/5_Domain_Specific/vulkanImageCUDA/linmath.h
@@ -29,114 +29,155 @@
 // Converts radians to degrees.
#define radiansToDegrees(angleRadians) (angleRadians * 180.0 / M_PI) -typedef float vec3[3]; -static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) { +typedef float vec3[3]; +static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) +{ int i; - for (i = 0; i < 3; ++i) r[i] = a[i] + b[i]; + for (i = 0; i < 3; ++i) + r[i] = a[i] + b[i]; } -static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) { +static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) +{ int i; - for (i = 0; i < 3; ++i) r[i] = a[i] - b[i]; + for (i = 0; i < 3; ++i) + r[i] = a[i] - b[i]; } -static inline void vec3_scale(vec3 r, vec3 const v, float const s) { +static inline void vec3_scale(vec3 r, vec3 const v, float const s) +{ int i; - for (i = 0; i < 3; ++i) r[i] = v[i] * s; + for (i = 0; i < 3; ++i) + r[i] = v[i] * s; } -static inline float vec3_mul_inner(vec3 const a, vec3 const b) { +static inline float vec3_mul_inner(vec3 const a, vec3 const b) +{ float p = 0.f; - int i; - for (i = 0; i < 3; ++i) p += b[i] * a[i]; + int i; + for (i = 0; i < 3; ++i) + p += b[i] * a[i]; return p; } -static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) { +static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) +{ r[0] = a[1] * b[2] - a[2] * b[1]; r[1] = a[2] * b[0] - a[0] * b[2]; r[2] = a[0] * b[1] - a[1] * b[0]; } static inline float vec3_len(vec3 const v) { return sqrtf(vec3_mul_inner(v, v)); } -static inline void vec3_norm(vec3 r, vec3 const v) { +static inline void vec3_norm(vec3 r, vec3 const v) +{ float k = 1.f / vec3_len(v); vec3_scale(r, v, k); } -static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) { +static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) +{ float p = 2.f * vec3_mul_inner(v, n); - int i; - for (i = 0; i < 3; ++i) r[i] = v[i] - p * n[i]; + int i; + for (i = 0; i < 3; ++i) + r[i] = v[i] - p * n[i]; } -typedef float vec4[4]; -static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) { +typedef float vec4[4]; +static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] + b[i]; } -static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) { +static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] - b[i]; } -static inline void vec4_scale(vec4 r, vec4 v, float s) { +static inline void vec4_scale(vec4 r, vec4 v, float s) +{ int i; - for (i = 0; i < 4; ++i) r[i] = v[i] * s; + for (i = 0; i < 4; ++i) + r[i] = v[i] * s; } -static inline float vec4_mul_inner(vec4 a, vec4 b) { +static inline float vec4_mul_inner(vec4 a, vec4 b) +{ float p = 0.f; - int i; - for (i = 0; i < 4; ++i) p += b[i] * a[i]; + int i; + for (i = 0; i < 4; ++i) + p += b[i] * a[i]; return p; } -static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) { +static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) +{ r[0] = a[1] * b[2] - a[2] * b[1]; r[1] = a[2] * b[0] - a[0] * b[2]; r[2] = a[0] * b[1] - a[1] * b[0]; r[3] = 1.f; } static inline float vec4_len(vec4 v) { return sqrtf(vec4_mul_inner(v, v)); } -static inline void vec4_norm(vec4 r, vec4 v) { +static inline void vec4_norm(vec4 r, vec4 v) +{ float k = 1.f / vec4_len(v); vec4_scale(r, v, k); } -static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) { +static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) +{ float p = 2.f * vec4_mul_inner(v, n); - int i; - for (i = 0; i < 4; ++i) r[i] = v[i] - p * 
n[i]; + int i; + for (i = 0; i < 4; ++i) + r[i] = v[i] - p * n[i]; } -typedef vec4 mat4x4[4]; -static inline void mat4x4_identity(mat4x4 M) { +typedef vec4 mat4x4[4]; +static inline void mat4x4_identity(mat4x4 M) +{ int i, j; for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) M[i][j] = i == j ? 1.f : 0.f; + for (j = 0; j < 4; ++j) + M[i][j] = i == j ? 1.f : 0.f; } -static inline void mat4x4_dup(mat4x4 M, mat4x4 N) { +static inline void mat4x4_dup(mat4x4 M, mat4x4 N) +{ int i, j; for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) M[i][j] = N[i][j]; + for (j = 0; j < 4; ++j) + M[i][j] = N[i][j]; } -static inline void mat4x4_row(vec4 r, mat4x4 M, int i) { +static inline void mat4x4_row(vec4 r, mat4x4 M, int i) +{ int k; - for (k = 0; k < 4; ++k) r[k] = M[k][i]; + for (k = 0; k < 4; ++k) + r[k] = M[k][i]; } -static inline void mat4x4_col(vec4 r, mat4x4 M, int i) { +static inline void mat4x4_col(vec4 r, mat4x4 M, int i) +{ int k; - for (k = 0; k < 4; ++k) r[k] = M[i][k]; + for (k = 0; k < 4; ++k) + r[k] = M[i][k]; } -static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) { +static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) +{ int i, j; for (j = 0; j < 4; ++j) - for (i = 0; i < 4; ++i) M[i][j] = N[j][i]; + for (i = 0; i < 4; ++i) + M[i][j] = N[j][i]; } -static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) { +static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) +{ int i; - for (i = 0; i < 4; ++i) vec4_add(M[i], a[i], b[i]); + for (i = 0; i < 4; ++i) + vec4_add(M[i], a[i], b[i]); } -static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) { +static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) +{ int i; - for (i = 0; i < 4; ++i) vec4_sub(M[i], a[i], b[i]); + for (i = 0; i < 4; ++i) + vec4_sub(M[i], a[i], b[i]); } -static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) { +static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) +{ int i; - for (i = 0; i < 4; ++i) vec4_scale(M[i], a[i], k); + for (i = 0; i < 4; ++i) + vec4_scale(M[i], a[i], k); } -static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) { +static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) +{ int i; vec4_scale(M[0], a[0], x); vec4_scale(M[1], a[1], y); @@ -145,45 +186,54 @@ static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, floa M[3][i] = a[3][i]; } } -static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) { +static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) +{ int k, r, c; for (c = 0; c < 4; ++c) for (r = 0; r < 4; ++r) { M[c][r] = 0.f; - for (k = 0; k < 4; ++k) M[c][r] += a[k][r] * b[c][k]; + for (k = 0; k < 4; ++k) + M[c][r] += a[k][r] * b[c][k]; } } -static inline void mat4x4_mul_vec4(vec4 r, mat4x4 M, vec4 v) { +static inline void mat4x4_mul_vec4(vec4 r, mat4x4 M, vec4 v) +{ int i, j; for (j = 0; j < 4; ++j) { r[j] = 0.f; - for (i = 0; i < 4; ++i) r[j] += M[i][j] * v[i]; + for (i = 0; i < 4; ++i) + r[j] += M[i][j] * v[i]; } } -static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) { +static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) +{ mat4x4_identity(T); T[3][0] = x; T[3][1] = y; T[3][2] = z; } -static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) { +static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) +{ vec4 t = {x, y, z, 0}; vec4 r; - int i; + int i; for (i = 0; i < 4; ++i) { mat4x4_row(r, M, i); M[3][i] += vec4_mul_inner(r, t); } } -static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, 
vec3 a, vec3 b) { +static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, vec3 a, vec3 b) +{ int i, j; for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f; + for (j = 0; j < 4; ++j) + M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f; } -static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) { +static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) +{ float s = sinf(angle); float c = cosf(angle); - vec3 u = {x, y, z}; + vec3 u = {x, y, z}; if (vec3_len(u) > 1e-4) { vec3_norm(u, u); @@ -204,29 +254,34 @@ static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, T[3][3] = 1.; mat4x4_mul(R, M, T); - } else { + } + else { mat4x4_dup(R, M); } } -static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) { - float s = sinf(angle); - float c = cosf(angle); +static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) +{ + float s = sinf(angle); + float c = cosf(angle); mat4x4 R = {{1.f, 0.f, 0.f, 0.f}, {0.f, c, s, 0.f}, {0.f, -s, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; mat4x4_mul(Q, M, R); } -static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) { - float s = sinf(angle); - float c = cosf(angle); +static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) +{ + float s = sinf(angle); + float c = cosf(angle); mat4x4 R = {{c, 0.f, s, 0.f}, {0.f, 1.f, 0.f, 0.f}, {-s, 0.f, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; mat4x4_mul(Q, M, R); } -static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) { - float s = sinf(angle); - float c = cosf(angle); +static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) +{ + float s = sinf(angle); + float c = cosf(angle); mat4x4 R = {{c, s, 0.f, 0.f}, {-s, c, 0.f, 0.f}, {0.f, 0.f, 1.f, 0.f}, {0.f, 0.f, 0.f, 1.f}}; mat4x4_mul(Q, M, R); } -static inline void mat4x4_invert(mat4x4 T, mat4x4 M) { +static inline void mat4x4_invert(mat4x4 T, mat4x4 M) +{ float s[6]; float c[6]; s[0] = M[0][0] * M[1][1] - M[1][0] * M[0][1]; @@ -266,10 +321,11 @@ static inline void mat4x4_invert(mat4x4 T, mat4x4 M) { T[3][2] = (-M[3][0] * s[3] + M[3][1] * s[1] - M[3][2] * s[0]) * idet; T[3][3] = (M[2][0] * s[3] - M[2][1] * s[1] + M[2][2] * s[0]) * idet; } -static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) { +static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) +{ mat4x4_dup(R, M); float s = 1.; - vec3 h; + vec3 h; vec3_norm(R[2], R[2]); @@ -289,7 +345,8 @@ static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) { vec3_norm(R[0], R[0]); } -static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) { +static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) +{ M[0][0] = 2.f * n / (r - l); M[0][1] = M[0][2] = M[0][3] = 0.f; @@ -304,7 +361,8 @@ static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, M[3][2] = -2.f * (f * n) / (f - n); M[3][0] = M[3][1] = M[3][3] = 0.f; } -static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) { +static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) +{ M[0][0] = 2.f / (r - l); M[0][1] = M[0][2] = M[0][3] = 0.f; @@ -319,7 +377,8 @@ static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, fl M[3][2] = -(f + n) / (f - n); M[3][3] = 1.f; } -static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) { +static inline void 
mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) +{ /* NOTE: Degrees are an unhandy unit to work with. * linmath.h uses radians for everything! */ float const a = (float)(1.f / tan(y_fov / 2.f)); @@ -344,7 +403,8 @@ static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float m[3][2] = -((2.f * f * n) / (f - n)); m[3][3] = 0.f; } -static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) { +static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) +{ /* Adapted from Android's OpenGL Matrix.java. */ /* See the OpenGL GLUT documentation for gluLookAt for a description */ /* of the algorithm. We implement it in a straightforward way: */ @@ -385,20 +445,26 @@ static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) { mat4x4_translate_in_place(m, -eye[0], -eye[1], -eye[2]); } -typedef float quat[4]; -static inline void quat_identity(quat q) { +typedef float quat[4]; +static inline void quat_identity(quat q) +{ q[0] = q[1] = q[2] = 0.f; - q[3] = 1.f; + q[3] = 1.f; } -static inline void quat_add(quat r, quat a, quat b) { +static inline void quat_add(quat r, quat a, quat b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] + b[i]; } -static inline void quat_sub(quat r, quat a, quat b) { +static inline void quat_sub(quat r, quat a, quat b) +{ int i; - for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; + for (i = 0; i < 4; ++i) + r[i] = a[i] - b[i]; } -static inline void quat_mul(quat r, quat p, quat q) { +static inline void quat_mul(quat r, quat p, quat q) +{ vec3 w; vec3_mul_cross(r, p, q); vec3_scale(w, p, q[3]); @@ -407,23 +473,30 @@ static inline void quat_mul(quat r, quat p, quat q) { vec3_add(r, r, w); r[3] = p[3] * q[3] - vec3_mul_inner(p, q); } -static inline void quat_scale(quat r, quat v, float s) { +static inline void quat_scale(quat r, quat v, float s) +{ int i; - for (i = 0; i < 4; ++i) r[i] = v[i] * s; + for (i = 0; i < 4; ++i) + r[i] = v[i] * s; } -static inline float quat_inner_product(quat a, quat b) { +static inline float quat_inner_product(quat a, quat b) +{ float p = 0.f; - int i; - for (i = 0; i < 4; ++i) p += b[i] * a[i]; + int i; + for (i = 0; i < 4; ++i) + p += b[i] * a[i]; return p; } -static inline void quat_conj(quat r, quat q) { +static inline void quat_conj(quat r, quat q) +{ int i; - for (i = 0; i < 3; ++i) r[i] = -q[i]; + for (i = 0; i < 3; ++i) + r[i] = -q[i]; r[3] = q[3]; } #define quat_norm vec4_norm -static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) { +static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) +{ quat v_ = {v[0], v[1], v[2], 0.f}; quat_conj(r, q); @@ -431,11 +504,12 @@ static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) { quat_mul(r, v_, r); quat_mul(r, q, r); } -static inline void mat4x4_from_quat(mat4x4 M, quat q) { - float a = q[3]; - float b = q[0]; - float c = q[1]; - float d = q[2]; +static inline void mat4x4_from_quat(mat4x4 M, quat q) +{ + float a = q[3]; + float b = q[0]; + float c = q[1]; + float d = q[2]; float a2 = a * a; float b2 = b * b; float c2 = c * c; @@ -457,10 +531,11 @@ static inline void mat4x4_from_quat(mat4x4 M, quat q) { M[2][3] = 0.f; M[3][0] = M[3][1] = M[3][2] = 0.f; - M[3][3] = 1.f; + M[3][3] = 1.f; } -static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) { +static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) +{ /* XXX: The way this is written only works for othogonal matrices. */ /* TODO: Take care of non-orthogonal case. 
*/
     quat_mul_vec3(R[0], q, M[0]);
@@ -468,18 +543,20 @@ static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) {
     quat_mul_vec3(R[2], q, M[2]);
     R[3][0] = R[3][1] = R[3][2] = 0.f;
-    R[3][3] = 1.f;
+    R[3][3]                     = 1.f;
 }
-static inline void quat_from_mat4x4(quat q, mat4x4 M) {
+static inline void quat_from_mat4x4(quat q, mat4x4 M)
+{
     float r = 0.f;
-    int i;
+    int   i;

-    int perm[] = {0, 1, 2, 0, 1};
-    int *p = perm;
+    int  perm[] = {0, 1, 2, 0, 1};
+    int *p      = perm;

     for (i = 0; i < 3; i++) {
         float m = M[i][i];
-        if (m < r) continue;
+        if (m < r)
+            continue;
         m = r;
         p = &perm[i];
     }
diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu b/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu
index caae09c5..f782bbb9 100644
--- a/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu
+++ b/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu
@@ -27,10 +27,10 @@
 #define GLFW_INCLUDE_VULKAN
 #ifdef _WIN64
+#include
 #include
 #include
 #include
-#include
 #define _USE_MATH_DEFINES
 #endif
@@ -45,28 +45,26 @@
 #include
 #include
 #include
+#include
+#include
 #include
+#include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
-#include
-#include
-#include
-#include
-
 #include "linmath.h"

-#define WIDTH 800
+#define WIDTH  800
 #define HEIGHT 600

 const int MAX_FRAMES = 4;

-const std::vector<const char*> validationLayers = {
-    "VK_LAYER_KHRONOS_validation"};
+const std::vector<const char *> validationLayers = {"VK_LAYER_KHRONOS_validation"};

 #ifdef NDEBUG
 const bool enableValidationLayers = true;
@@ -76,20 +74,21 @@ const bool enableValidationLayers = false;

 std::string execution_path;

-VkResult CreateDebugUtilsMessengerEXT(
-    VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo,
-    const VkAllocationCallbacks* pAllocator,
-    VkDebugUtilsMessengerEXT* pDebugMessenger) {
-  auto func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
-      instance, "vkCreateDebugUtilsMessengerEXT");
-  if (func != nullptr) {
-    return func(instance, pCreateInfo, pAllocator, pDebugMessenger);
-  } else {
-    return VK_ERROR_EXTENSION_NOT_PRESENT;
-  }
+VkResult CreateDebugUtilsMessengerEXT(VkInstance instance,
+                                      const VkDebugUtilsMessengerCreateInfoEXT *pCreateInfo,
+                                      const VkAllocationCallbacks *pAllocator,
+                                      VkDebugUtilsMessengerEXT *pDebugMessenger)
+{
+    auto func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");
+    if (func != nullptr) {
+        return func(instance, pCreateInfo, pAllocator, pDebugMessenger);
+    }
+    else {
+        return VK_ERROR_EXTENSION_NOT_PRESENT;
+    }
 };

-const std::vector<const char*> deviceExtensions = {
+const std::vector<const char *> deviceExtensions = {
     VK_KHR_SWAPCHAIN_EXTENSION_NAME,
     VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
     VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
@@ -103,2534 +102,2475 @@ const std::vector<const char *> deviceExtensions = {
 };

 #ifdef _WIN64
-class WindowsSecurityAttributes {
- protected:
-  SECURITY_ATTRIBUTES m_winSecurityAttributes;
-  PSECURITY_DESCRIPTOR m_winPSecurityDescriptor;
+class WindowsSecurityAttributes
+{
+protected:
+    SECURITY_ATTRIBUTES m_winSecurityAttributes;
+    PSECURITY_DESCRIPTOR m_winPSecurityDescriptor;

- public:
-  WindowsSecurityAttributes();
-  SECURITY_ATTRIBUTES* operator&();
-  ~WindowsSecurityAttributes();
+public:
+    WindowsSecurityAttributes();
+    SECURITY_ATTRIBUTES *operator&();
+    ~WindowsSecurityAttributes();
 };

-WindowsSecurityAttributes::WindowsSecurityAttributes() {
-  m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(
-      1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void**));
+WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); - PSID* ppSID = - (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - InitializeSecurityDescriptor(m_winPSecurityDescriptor, - SECURITY_DESCRIPTOR_REVISION); + InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); - SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = - SECURITY_WORLD_SID_AUTHORITY; - AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, - 0, 0, 0, 0, 0, ppSID); + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); - EXPLICIT_ACCESS explicitAccess; - ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); - explicitAccess.grfAccessPermissions = - STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; - explicitAccess.grfAccessMode = SET_ACCESS; - explicitAccess.grfInheritance = INHERIT_ONLY; - explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; - explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; - explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; - SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); - SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); - m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); - m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; - m_winSecurityAttributes.bInheritHandle = TRUE; + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; } -SECURITY_ATTRIBUTES* WindowsSecurityAttributes::operator&() { - return &m_winSecurityAttributes; -} +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() { return &m_winSecurityAttributes; } -WindowsSecurityAttributes::~WindowsSecurityAttributes() { - PSID* ppSID = - (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); - PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); - if (*ppSID) { - FreeSid(*ppSID); - } - if (*ppACL) { - LocalFree(*ppACL); - } - free(m_winPSecurityDescriptor); + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); } #endif -void DestroyDebugUtilsMessengerEXT(VkInstance instance, - VkDebugUtilsMessengerEXT debugMessenger, - const VkAllocationCallbacks* pAllocator) { - 
auto func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
-      instance, "vkDestroyDebugUtilsMessengerEXT");
-  if (func != nullptr) {
-    func(instance, debugMessenger, pAllocator);
-  }
+void DestroyDebugUtilsMessengerEXT(VkInstance instance,
+                                   VkDebugUtilsMessengerEXT debugMessenger,
+                                   const VkAllocationCallbacks *pAllocator)
+{
+    auto func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");
+    if (func != nullptr) {
+        func(instance, debugMessenger, pAllocator);
+    }
 }
 
-struct QueueFamilyIndices {
-  int graphicsFamily = -1;
-  int presentFamily = -1;
+struct QueueFamilyIndices
+{
+    int graphicsFamily = -1;
+    int presentFamily = -1;
 
-  bool isComplete() { return graphicsFamily >= 0 && presentFamily >= 0; }
+    bool isComplete() { return graphicsFamily >= 0 && presentFamily >= 0; }
 };
 
-struct SwapChainSupportDetails {
-  VkSurfaceCapabilitiesKHR capabilities;
-  std::vector<VkSurfaceFormatKHR> formats;
-  std::vector<VkPresentModeKHR> presentModes;
+struct SwapChainSupportDetails
+{
+    VkSurfaceCapabilitiesKHR capabilities;
+    std::vector<VkSurfaceFormatKHR> formats;
+    std::vector<VkPresentModeKHR> presentModes;
 };
 
 typedef float vec2[2];
 
-struct Vertex {
-  vec4 pos;
-  vec3 color;
-  vec2 texCoord;
+struct Vertex
+{
+    vec4 pos;
+    vec3 color;
+    vec2 texCoord;
 
-  static VkVertexInputBindingDescription getBindingDescription() {
-    VkVertexInputBindingDescription bindingDescription = {};
-    bindingDescription.binding = 0;
-    bindingDescription.stride = sizeof(Vertex);
-    bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+    static VkVertexInputBindingDescription getBindingDescription()
+    {
+        VkVertexInputBindingDescription bindingDescription = {};
+        bindingDescription.binding = 0;
+        bindingDescription.stride = sizeof(Vertex);
+        bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
 
-    return bindingDescription;
-  }
+        return bindingDescription;
+    }
 
-  static std::array<VkVertexInputAttributeDescription, 3>
-  getAttributeDescriptions() {
-    std::array<VkVertexInputAttributeDescription, 3> attributeDescriptions = {};
+    static std::array<VkVertexInputAttributeDescription, 3> getAttributeDescriptions()
+    {
+        std::array<VkVertexInputAttributeDescription, 3> attributeDescriptions = {};
 
-    attributeDescriptions[0].binding = 0;
-    attributeDescriptions[0].location = 0;
-    attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
-    attributeDescriptions[0].offset = offsetof(Vertex, pos);
+        attributeDescriptions[0].binding = 0;
+        attributeDescriptions[0].location = 0;
+        attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
+        attributeDescriptions[0].offset = offsetof(Vertex, pos);
 
-    attributeDescriptions[1].binding = 0;
-    attributeDescriptions[1].location = 1;
-    attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT;
-    attributeDescriptions[1].offset = offsetof(Vertex, color);
+        attributeDescriptions[1].binding = 0;
+        attributeDescriptions[1].location = 1;
+        attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT;
+        attributeDescriptions[1].offset = offsetof(Vertex, color);
 
-    attributeDescriptions[2].binding = 0;
-    attributeDescriptions[2].location = 2;
-    attributeDescriptions[2].format = VK_FORMAT_R32G32_SFLOAT;
-    attributeDescriptions[2].offset = offsetof(Vertex, texCoord);
+        attributeDescriptions[2].binding = 0;
+        attributeDescriptions[2].location = 2;
+        attributeDescriptions[2].format = VK_FORMAT_R32G32_SFLOAT;
+        attributeDescriptions[2].offset = offsetof(Vertex, texCoord);
 
-    return attributeDescriptions;
-  }
+        return attributeDescriptions;
+    }
 };
 
-struct UniformBufferObject {
-  alignas(16) mat4x4 model;
-  alignas(16) mat4x4 view;
-  alignas(16) mat4x4 proj;
+struct UniformBufferObject
+{
+    alignas(16) mat4x4 model;
+    alignas(16) mat4x4 view;
+    alignas(16) mat4x4 proj;
 };
 
-const std::vector<Vertex> vertices = {
-    {{-1.0f, -1.0f, 0.0f, 1.0f}, {1.0f, 0.0f, 0.0f}, {0.0f, 0.0f}},
-    {{1.0f, -1.0f, 0.0f, 1.0f}, {0.0f, 1.0f, 0.0f}, {1.0f, 0.0f}},
-    {{1.0f, 1.0f, 0.0f, 1.0f}, {0.0f, 0.0f, 1.0f}, {1.0f, 1.0f}},
-    {{-1.0f, 1.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f}, {0.0f, 1.0f}}};
+const std::vector<Vertex> vertices = {{{-1.0f, -1.0f, 0.0f, 1.0f}, {1.0f, 0.0f, 0.0f}, {0.0f, 0.0f}},
+                                      {{1.0f, -1.0f, 0.0f, 1.0f}, {0.0f, 1.0f, 0.0f}, {1.0f, 0.0f}},
+                                      {{1.0f, 1.0f, 0.0f, 1.0f}, {0.0f, 0.0f, 1.0f}, {1.0f, 1.0f}},
+                                      {{-1.0f, 1.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f}, {0.0f, 1.0f}}};
 
 const std::vector<uint16_t> indices = {0, 1, 2, 2, 3, 0};
 
 // convert floating point rgba color to 32-bit integer
-__device__ unsigned int rgbaFloatToInt(float4 rgba) {
-  rgba.x = __saturatef(rgba.x);  // clamp to [0.0, 1.0]
-  rgba.y = __saturatef(rgba.y);
-  rgba.z = __saturatef(rgba.z);
-  rgba.w = __saturatef(rgba.w);
-  return ((unsigned int)(rgba.w * 255.0f) << 24) |
-         ((unsigned int)(rgba.z * 255.0f) << 16) |
-         ((unsigned int)(rgba.y * 255.0f) << 8) |
-         ((unsigned int)(rgba.x * 255.0f));
+__device__ unsigned int rgbaFloatToInt(float4 rgba)
+{
+    rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0]
+    rgba.y = __saturatef(rgba.y);
+    rgba.z = __saturatef(rgba.z);
+    rgba.w = __saturatef(rgba.w);
+    return ((unsigned int)(rgba.w * 255.0f) << 24) | ((unsigned int)(rgba.z * 255.0f) << 16)
+         | ((unsigned int)(rgba.y * 255.0f) << 8) | ((unsigned int)(rgba.x * 255.0f));
 }
 
-__device__ float4 rgbaIntToFloat(unsigned int c) {
-  float4 rgba;
-  rgba.x = (c & 0xff) * 0.003921568627f;  // /255.0f;
-  rgba.y = ((c >> 8) & 0xff) * 0.003921568627f;  // /255.0f;
-  rgba.z = ((c >> 16) & 0xff) * 0.003921568627f;  // /255.0f;
-  rgba.w = ((c >> 24) & 0xff) * 0.003921568627f;  // /255.0f;
-  return rgba;
+__device__ float4 rgbaIntToFloat(unsigned int c)
+{
+    float4 rgba;
+    rgba.x = (c & 0xff) * 0.003921568627f;         // /255.0f;
+    rgba.y = ((c >> 8) & 0xff) * 0.003921568627f;  // /255.0f;
+    rgba.z = ((c >> 16) & 0xff) * 0.003921568627f; // /255.0f;
+    rgba.w = ((c >> 24) & 0xff) * 0.003921568627f; // /255.0f;
+    return rgba;
 }
 
 int filter_radius = 14;
 int g_nFilterSign = 1;
 
 // This varies the filter radius, so we can see automatic animation
-void varySigma() {
-  filter_radius += g_nFilterSign;
+void varySigma()
+{
+    filter_radius += g_nFilterSign;
 
-  if (filter_radius > 64) {
-    filter_radius = 64;  // clamp to 64 and then negate sign
-    g_nFilterSign = -1;
-  } else if (filter_radius < 0) {
-    filter_radius = 0;
-    g_nFilterSign = 1;
-  }
+    if (filter_radius > 64) {
+        filter_radius = 64; // clamp to 64 and then negate sign
+        g_nFilterSign = -1;
+    }
+    else if (filter_radius < 0) {
+        filter_radius = 0;
+        g_nFilterSign = 1;
+    }
 }
 
 // row pass using texture lookups
-__global__ void d_boxfilter_rgba_x(cudaSurfaceObject_t* dstSurfMipMapArray,
-                                   cudaTextureObject_t textureMipMapInput,
-                                   size_t baseWidth, size_t baseHeight,
-                                   size_t mipLevels, int filter_radius) {
-  float scale = 1.0f / (float)((filter_radius << 1) + 1);
-  unsigned int y = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void d_boxfilter_rgba_x(cudaSurfaceObject_t *dstSurfMipMapArray,
+                                   cudaTextureObject_t textureMipMapInput,
+                                   size_t baseWidth,
+                                   size_t baseHeight,
+                                   size_t mipLevels,
+                                   int filter_radius)
+{
+    float scale = 1.0f / (float)((filter_radius << 1) + 1);
+    unsigned int y = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if (y < baseHeight) {
-    for (uint32_t mipLevelIdx = 0; mipLevelIdx < mipLevels; mipLevelIdx++) {
-      uint32_t width =
-          (baseWidth >> mipLevelIdx) ?
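[Editor's note] The rgbaFloatToInt/rgbaIntToFloat pair above packs a float4 into 8-bit-per-channel RGBA and back; the constant 0.003921568627f is simply a precomputed 1/255. A minimal host-side sketch of the same round trip, with std::clamp standing in for the device-only __saturatef:

    #include <algorithm>
    #include <cstdio>

    // Host-side equivalent of the packing used by rgbaFloatToInt/rgbaIntToFloat.
    unsigned int packRgba(float r, float g, float b, float a)
    {
        auto to8 = [](float v) { return (unsigned int)(std::clamp(v, 0.0f, 1.0f) * 255.0f); };
        return (to8(a) << 24) | (to8(b) << 16) | (to8(g) << 8) | to8(r);
    }

    int main()
    {
        unsigned int c = packRgba(1.0f, 0.5f, 0.25f, 1.0f);
        // Unpack with the same 1/255 factor the kernels use.
        printf("r=%.3f g=%.3f b=%.3f a=%.3f\n",
               (c & 0xff) * 0.003921568627f,
               ((c >> 8) & 0xff) * 0.003921568627f,
               ((c >> 16) & 0xff) * 0.003921568627f,
               ((c >> 24) & 0xff) * 0.003921568627f);
        return 0;
    }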
(baseWidth >> mipLevelIdx) : 1; - uint32_t height = - (baseHeight >> mipLevelIdx) ? (baseHeight >> mipLevelIdx) : 1; - if (y < height && filter_radius < width) { - float px = 1.0 / width; - float py = 1.0 / height; - float4 t = make_float4(0.0f); - for (int x = -filter_radius; x <= filter_radius; x++) { - t += tex2DLod(textureMipMapInput, x * px, y * py, - (float)mipLevelIdx); + if (y < baseHeight) { + for (uint32_t mipLevelIdx = 0; mipLevelIdx < mipLevels; mipLevelIdx++) { + uint32_t width = (baseWidth >> mipLevelIdx) ? (baseWidth >> mipLevelIdx) : 1; + uint32_t height = (baseHeight >> mipLevelIdx) ? (baseHeight >> mipLevelIdx) : 1; + if (y < height && filter_radius < width) { + float px = 1.0 / width; + float py = 1.0 / height; + float4 t = make_float4(0.0f); + for (int x = -filter_radius; x <= filter_radius; x++) { + t += tex2DLod(textureMipMapInput, x * px, y * py, (float)mipLevelIdx); + } + + unsigned int dataB = rgbaFloatToInt(t * scale); + surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], 0, y); + + for (int x = 1; x < width; x++) { + t += tex2DLod(textureMipMapInput, (x + filter_radius) * px, y * py, (float)mipLevelIdx); + t -= tex2DLod(textureMipMapInput, (x - filter_radius - 1) * px, y * py, (float)mipLevelIdx); + unsigned int dataB = rgbaFloatToInt(t * scale); + surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], x * sizeof(uchar4), y); + } + } } - - unsigned int dataB = rgbaFloatToInt(t * scale); - surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], 0, y); - - for (int x = 1; x < width; x++) { - t += tex2DLod(textureMipMapInput, (x + filter_radius) * px, - y * py, (float)mipLevelIdx); - t -= - tex2DLod(textureMipMapInput, (x - filter_radius - 1) * px, - y * py, (float)mipLevelIdx); - unsigned int dataB = rgbaFloatToInt(t * scale); - surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], - x * sizeof(uchar4), y); - } - } } - } } // column pass using coalesced global memory reads -__global__ void d_boxfilter_rgba_y(cudaSurfaceObject_t* dstSurfMipMapArray, - cudaSurfaceObject_t* srcSurfMipMapArray, - size_t baseWidth, size_t baseHeight, - size_t mipLevels, int filter_radius) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - float scale = 1.0f / (float)((filter_radius << 1) + 1); +__global__ void d_boxfilter_rgba_y(cudaSurfaceObject_t *dstSurfMipMapArray, + cudaSurfaceObject_t *srcSurfMipMapArray, + size_t baseWidth, + size_t baseHeight, + size_t mipLevels, + int filter_radius) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + float scale = 1.0f / (float)((filter_radius << 1) + 1); - for (uint32_t mipLevelIdx = 0; mipLevelIdx < mipLevels; mipLevelIdx++) { - uint32_t width = - (baseWidth >> mipLevelIdx) ? (baseWidth >> mipLevelIdx) : 1; - uint32_t height = - (baseHeight >> mipLevelIdx) ? (baseHeight >> mipLevelIdx) : 1; + for (uint32_t mipLevelIdx = 0; mipLevelIdx < mipLevels; mipLevelIdx++) { + uint32_t width = (baseWidth >> mipLevelIdx) ? (baseWidth >> mipLevelIdx) : 1; + uint32_t height = (baseHeight >> mipLevelIdx) ? 
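[Editor's note] d_boxfilter_rgba_x does O(1) work per output pixel rather than O(radius): it sums the first window once, then slides it along the row, adding the sample that enters the window and subtracting the one that leaves. The same running-sum idea in a scalar CPU sketch; clamping at the edges is an illustrative choice here, since the kernel above gets its edge behavior from texture addressing instead:

    #include <vector>

    // 1D running-sum box filter over a row with radius r.
    std::vector<float> boxFilterRow(const std::vector<float> &in, int r)
    {
        int n = (int)in.size();
        std::vector<float> out(n);
        float scale = 1.0f / (float)(2 * r + 1);
        auto at = [&](int i) { return in[i < 0 ? 0 : (i >= n ? n - 1 : i)]; };

        float t = 0.0f;
        for (int x = -r; x <= r; x++) // build the first window once ...
            t += at(x);
        out[0] = t * scale;
        for (int x = 1; x < n; x++) { // ... then slide it: one add, one subtract
            t += at(x + r);
            t -= at(x - r - 1);
            out[x] = t * scale;
        }
        return out;
    }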
(baseHeight >> mipLevelIdx) : 1; - if (x < width && height > filter_radius) { - float4 t; - // do left edge - int colInBytes = x * sizeof(uchar4); - unsigned int pixFirst = surf2Dread( - srcSurfMipMapArray[mipLevelIdx], colInBytes, 0); - t = rgbaIntToFloat(pixFirst) * filter_radius; + if (x < width && height > filter_radius) { + float4 t; + // do left edge + int colInBytes = x * sizeof(uchar4); + unsigned int pixFirst = surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, 0); + t = rgbaIntToFloat(pixFirst) * filter_radius; - for (int y = 0; (y < (filter_radius + 1)) && (y < height); y++) { - unsigned int pix = surf2Dread( - srcSurfMipMapArray[mipLevelIdx], colInBytes, y); - t += rgbaIntToFloat(pix); - } + for (int y = 0; (y < (filter_radius + 1)) && (y < height); y++) { + unsigned int pix = surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, y); + t += rgbaIntToFloat(pix); + } - unsigned int dataB = rgbaFloatToInt(t * scale); - surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, 0); + unsigned int dataB = rgbaFloatToInt(t * scale); + surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, 0); - for (int y = 1; (y < filter_radius + 1) && ((y + filter_radius) < height); - y++) { - unsigned int pix = surf2Dread( - srcSurfMipMapArray[mipLevelIdx], colInBytes, y + filter_radius); - t += rgbaIntToFloat(pix); - t -= rgbaIntToFloat(pixFirst); + for (int y = 1; (y < filter_radius + 1) && ((y + filter_radius) < height); y++) { + unsigned int pix = + surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, y + filter_radius); + t += rgbaIntToFloat(pix); + t -= rgbaIntToFloat(pixFirst); - dataB = rgbaFloatToInt(t * scale); - surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, y); - } + dataB = rgbaFloatToInt(t * scale); + surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, y); + } - // main loop - for (int y = (filter_radius + 1); y < (height - filter_radius); y++) { - unsigned int pix = surf2Dread( - srcSurfMipMapArray[mipLevelIdx], colInBytes, y + filter_radius); - t += rgbaIntToFloat(pix); + // main loop + for (int y = (filter_radius + 1); y < (height - filter_radius); y++) { + unsigned int pix = + surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, y + filter_radius); + t += rgbaIntToFloat(pix); - pix = surf2Dread(srcSurfMipMapArray[mipLevelIdx], - colInBytes, y - filter_radius - 1); - t -= rgbaIntToFloat(pix); + pix = surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, y - filter_radius - 1); + t -= rgbaIntToFloat(pix); - dataB = rgbaFloatToInt(t * scale); - surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, y); - } + dataB = rgbaFloatToInt(t * scale); + surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, y); + } - // do right edge - unsigned int pixLast = surf2Dread( - srcSurfMipMapArray[mipLevelIdx], colInBytes, height - 1); - for (int y = height - filter_radius; - (y < height) && ((y - filter_radius - 1) > 1); y++) { - t += rgbaIntToFloat(pixLast); - unsigned int pix = surf2Dread( - srcSurfMipMapArray[mipLevelIdx], colInBytes, y - filter_radius - 1); - t -= rgbaIntToFloat(pix); - dataB = rgbaFloatToInt(t * scale); - surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, y); - } + // do right edge + unsigned int pixLast = surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, height - 1); + for (int y = height - filter_radius; (y < height) && ((y - filter_radius - 1) > 1); y++) { + t += rgbaIntToFloat(pixLast); + unsigned int pix = + surf2Dread(srcSurfMipMapArray[mipLevelIdx], colInBytes, y - 
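[Editor's note] Both kernels derive each mip level's extent by shifting the base extent right by the level index and flooring at 1, which matches how Vulkan and CUDA size mip chains. For reference:

    #include <cstdint>

    // Extent of mip level `lvl` for a given base extent: halved per level, never 0.
    // e.g. base 800 -> 800, 400, 200, 100, 50, 25, 12, 6, 3, 1, 1, ...
    static inline uint32_t mipExtent(uint32_t base, uint32_t lvl)
    {
        uint32_t e = base >> lvl;
        return e ? e : 1;
    }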
filter_radius - 1); + t -= rgbaIntToFloat(pix); + dataB = rgbaFloatToInt(t * scale); + surf2Dwrite(dataB, dstSurfMipMapArray[mipLevelIdx], colInBytes, y); + } + } } - } } -class vulkanImageCUDA { - public: - void loadImageData(const std::string& filename) { - // load image (needed so we can get the width and height before we create - // the window - char* image_path = - sdkFindFilePath(filename.c_str(), execution_path.c_str()); - - if (image_path == 0) { - printf("Error finding image file '%s'\n", filename.c_str()); - exit(EXIT_FAILURE); - } - - sdkLoadPPM4(image_path, (unsigned char**)&image_data, &imageWidth, - &imageHeight); - - if (!image_data) { - printf("Error opening file '%s'\n", image_path); - exit(EXIT_FAILURE); - } - - printf("Loaded '%s', %d x %d pixels\n", image_path, imageWidth, - imageHeight); - } - - void run() { - initWindow(); - initVulkan(); - initCuda(); - mainLoop(); - cleanup(); - } - - private: - GLFWwindow* window; - - VkInstance instance; - VkDebugUtilsMessengerEXT debugMessenger; - VkSurfaceKHR surface; - - VkPhysicalDevice physicalDevice = VK_NULL_HANDLE; - VkDevice device; - uint8_t vkDeviceUUID[VK_UUID_SIZE]; - - VkQueue graphicsQueue; - VkQueue presentQueue; - - VkSwapchainKHR swapChain; - std::vector swapChainImages; - VkFormat swapChainImageFormat; - VkExtent2D swapChainExtent; - std::vector swapChainImageViews; - std::vector swapChainFramebuffers; - - VkRenderPass renderPass; - VkDescriptorSetLayout descriptorSetLayout; - VkPipelineLayout pipelineLayout; - VkPipeline graphicsPipeline; - - VkCommandPool commandPool; - - VkImage textureImage; - VkDeviceMemory textureImageMemory; - VkImageView textureImageView; - VkSampler textureSampler; - - VkBuffer vertexBuffer; - VkDeviceMemory vertexBufferMemory; - VkBuffer indexBuffer; - VkDeviceMemory indexBufferMemory; - - std::vector uniformBuffers; - std::vector uniformBuffersMemory; - - VkDescriptorPool descriptorPool; - std::vector descriptorSets; - - std::vector commandBuffers; - - std::vector imageAvailableSemaphores; - std::vector renderFinishedSemaphores; - VkSemaphore cudaUpdateVkSemaphore, vkUpdateCudaSemaphore; - std::vector inFlightFences; - - size_t currentFrame = 0; - - bool framebufferResized = false; - -#ifdef _WIN64 - PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; - PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; -#else - PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR = NULL; - PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR = NULL; -#endif - - PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; - - unsigned int* image_data = NULL; - unsigned int imageWidth, imageHeight; - unsigned int mipLevels = 1; - size_t totalImageMemSize; - - // CUDA objects - cudaExternalMemory_t cudaExtMemImageBuffer; - cudaMipmappedArray_t cudaMipmappedImageArray, cudaMipmappedImageArrayTemp, - cudaMipmappedImageArrayOrig; - std::vector surfaceObjectList, surfaceObjectListTemp; - cudaSurfaceObject_t *d_surfaceObjectList, *d_surfaceObjectListTemp; - cudaTextureObject_t textureObjMipMapInput; - - cudaExternalSemaphore_t cudaExtCudaUpdateVkSemaphore; - cudaExternalSemaphore_t cudaExtVkUpdateCudaSemaphore; - cudaStream_t streamToRun; - - void initWindow() { - glfwInit(); - - glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); - - window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan Image CUDA Box Filter", - nullptr, nullptr); - glfwSetWindowUserPointer(window, this); - glfwSetFramebufferSizeCallback(window, framebufferResizeCallback); - } - - static void framebufferResizeCallback(GLFWwindow* window, int width, - 
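[Editor's note] The class members above declare fpGetMemoryFdKHR / fpGetSemaphoreFdKHR (or their Win32 twins), which only resolve when the logical device is created with the matching export extensions. The deviceExtensions list consumed by createLogicalDevice is outside this hunk; a plausible shape, assumed rather than quoted, is:

    #include <vector>
    #include <vulkan/vulkan.h>

    // Assumed device-extension list for Vulkan->CUDA export: external
    // memory/semaphore core extensions plus the per-platform handle types.
    // (On Windows the *_win32 macros need the platform Vulkan header.)
    const std::vector<const char *> deviceExtensions = {
        VK_KHR_SWAPCHAIN_EXTENSION_NAME,
        VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
        VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
    #ifdef _WIN64
        VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
        VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
    #else
        VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
        VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
    #endif
    };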
int height) { - auto app = - reinterpret_cast(glfwGetWindowUserPointer(window)); - app->framebufferResized = true; - } - - void initVulkan() { - createInstance(); - setupDebugMessenger(); - createSurface(); - pickPhysicalDevice(); - createLogicalDevice(); - getKhrExtensionsFn(); - createSwapChain(); - createImageViews(); - createRenderPass(); - createDescriptorSetLayout(); - createGraphicsPipeline(); - createFramebuffers(); - createCommandPool(); - createTextureImage(); - createTextureImageView(); - createTextureSampler(); - createVertexBuffer(); - createIndexBuffer(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); - createSyncObjects(); - createSyncObjectsExt(); - } - - void initCuda() { - setCudaVkDevice(); - checkCudaErrors(cudaStreamCreate(&streamToRun)); - cudaVkImportImageMem(); - cudaVkImportSemaphore(); - } - - void mainLoop() { - updateUniformBuffer(); - while (!glfwWindowShouldClose(window)) { - glfwPollEvents(); - drawFrame(); - } - - vkDeviceWaitIdle(device); - } - - void cleanupSwapChain() { - for (auto framebuffer : swapChainFramebuffers) { - vkDestroyFramebuffer(device, framebuffer, nullptr); - } - - vkFreeCommandBuffers(device, commandPool, - static_cast(commandBuffers.size()), - commandBuffers.data()); - - vkDestroyPipeline(device, graphicsPipeline, nullptr); - vkDestroyPipelineLayout(device, pipelineLayout, nullptr); - vkDestroyRenderPass(device, renderPass, nullptr); - - for (auto imageView : swapChainImageViews) { - vkDestroyImageView(device, imageView, nullptr); - } - - vkDestroySwapchainKHR(device, swapChain, nullptr); - - for (size_t i = 0; i < swapChainImages.size(); i++) { - vkDestroyBuffer(device, uniformBuffers[i], nullptr); - vkFreeMemory(device, uniformBuffersMemory[i], nullptr); - } - - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - } - - void cleanup() { - cleanupSwapChain(); - - vkDestroySampler(device, textureSampler, nullptr); - vkDestroyImageView(device, textureImageView, nullptr); - - for (int i = 0; i < mipLevels; i++) { - checkCudaErrors(cudaDestroySurfaceObject(surfaceObjectList[i])); - checkCudaErrors(cudaDestroySurfaceObject(surfaceObjectListTemp[i])); - } - - checkCudaErrors(cudaFree(d_surfaceObjectList)); - checkCudaErrors(cudaFree(d_surfaceObjectListTemp)); - checkCudaErrors(cudaFreeMipmappedArray(cudaMipmappedImageArrayTemp)); - checkCudaErrors(cudaFreeMipmappedArray(cudaMipmappedImageArrayOrig)); - checkCudaErrors(cudaFreeMipmappedArray(cudaMipmappedImageArray)); - checkCudaErrors(cudaDestroyTextureObject(textureObjMipMapInput)); - checkCudaErrors(cudaDestroyExternalMemory(cudaExtMemImageBuffer)); - checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtCudaUpdateVkSemaphore)); - checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtVkUpdateCudaSemaphore)); - - vkDestroyImage(device, textureImage, nullptr); - vkFreeMemory(device, textureImageMemory, nullptr); - - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); - - vkDestroyBuffer(device, indexBuffer, nullptr); - vkFreeMemory(device, indexBufferMemory, nullptr); - - vkDestroyBuffer(device, vertexBuffer, nullptr); - vkFreeMemory(device, vertexBufferMemory, nullptr); - - vkDestroySemaphore(device, cudaUpdateVkSemaphore, nullptr); - vkDestroySemaphore(device, vkUpdateCudaSemaphore, nullptr); - - for (size_t i = 0; i < MAX_FRAMES; i++) { - vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr); - vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr); - vkDestroyFence(device, inFlightFences[i], 
nullptr); - } - - vkDestroyCommandPool(device, commandPool, nullptr); - - vkDestroyDevice(device, nullptr); - - if (enableValidationLayers) { - DestroyDebugUtilsMessengerEXT(instance, debugMessenger, nullptr); - } - - vkDestroySurfaceKHR(instance, surface, nullptr); - vkDestroyInstance(instance, nullptr); - - glfwDestroyWindow(window); - - glfwTerminate(); - } - - void recreateSwapChain() { - int width = 0, height = 0; - while (width == 0 || height == 0) { - glfwGetFramebufferSize(window, &width, &height); - glfwWaitEvents(); - } - - vkDeviceWaitIdle(device); - - cleanupSwapChain(); - - createSwapChain(); - createImageViews(); - createRenderPass(); - createGraphicsPipeline(); - createFramebuffers(); - createUniformBuffers(); - createDescriptorPool(); - createDescriptorSets(); - createCommandBuffers(); - } - - void createInstance() { - if (enableValidationLayers && !checkValidationLayerSupport()) { - throw std::runtime_error( - "validation layers requested, but not available!"); - } - - VkApplicationInfo appInfo = {}; - appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - appInfo.pApplicationName = "Vulkan Image CUDA Interop"; - appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.pEngineName = "No Engine"; - appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); - appInfo.apiVersion = VK_API_VERSION_1_1; - - VkInstanceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.pApplicationInfo = &appInfo; - - auto extensions = getRequiredExtensions(); - createInfo.enabledExtensionCount = static_cast(extensions.size()); - createInfo.ppEnabledExtensionNames = extensions.data(); - - VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo; - if (enableValidationLayers) { - createInfo.enabledLayerCount = - static_cast(validationLayers.size()); - createInfo.ppEnabledLayerNames = validationLayers.data(); - - populateDebugMessengerCreateInfo(debugCreateInfo); - createInfo.pNext = (VkDebugUtilsMessengerCreateInfoEXT*)&debugCreateInfo; - } else { - createInfo.enabledLayerCount = 0; - - createInfo.pNext = nullptr; - } - - if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) { - throw std::runtime_error("failed to create instance!"); - } - - fpGetPhysicalDeviceProperties2 = - (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr( - instance, "vkGetPhysicalDeviceProperties2"); - if (fpGetPhysicalDeviceProperties2 == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " - "found.\n"); - } - -#ifdef _WIN64 - fpGetMemoryWin32HandleKHR = - (PFN_vkGetMemoryWin32HandleKHR)vkGetInstanceProcAddr( - instance, "vkGetMemoryWin32HandleKHR"); - if (fpGetMemoryWin32HandleKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetMemoryWin32HandleKHR\" not " - "found.\n"); - } -#else - fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr( - instance, "vkGetMemoryFdKHR"); - if (fpGetMemoryFdKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetMemoryFdKHR\" not found.\n"); - } else { - std::cout << "Vulkan proc address for vkGetMemoryFdKHR - " - << fpGetMemoryFdKHR << std::endl; - } -#endif - } - - void populateDebugMessengerCreateInfo( - VkDebugUtilsMessengerCreateInfoEXT& createInfo) { - createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; - createInfo.messageSeverity = - VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | - 
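[Editor's note] createInstance calls getRequiredExtensions() and checkValidationLayerSupport(), neither of which appears in this hunk. A minimal sketch of the conventional implementations, assuming the sample's usual enableValidationLayers flag and validationLayers list (typically {"VK_LAYER_KHRONOS_validation"}):

    std::vector<const char *> getRequiredExtensions()
    {
        uint32_t glfwExtensionCount = 0;
        const char **glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);
        // Start from what GLFW needs for surface creation, then add debug utils.
        std::vector<const char *> extensions(glfwExtensions, glfwExtensions + glfwExtensionCount);
        if (enableValidationLayers)
            extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
        return extensions;
    }

    bool checkValidationLayerSupport()
    {
        uint32_t layerCount;
        vkEnumerateInstanceLayerProperties(&layerCount, nullptr);
        std::vector<VkLayerProperties> availableLayers(layerCount);
        vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data());

        // Every requested layer must be present in the instance's layer list.
        for (const char *layerName : validationLayers) {
            bool found = false;
            for (const auto &props : availableLayers)
                found = found || (strcmp(layerName, props.layerName) == 0);
            if (!found)
                return false;
        }
        return true;
    }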
VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; - createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; - createInfo.pfnUserCallback = debugCallback; - } - - void setupDebugMessenger() { - if (!enableValidationLayers) return; - - VkDebugUtilsMessengerCreateInfoEXT createInfo; - populateDebugMessengerCreateInfo(createInfo); - - if (CreateDebugUtilsMessengerEXT(instance, &createInfo, nullptr, - &debugMessenger) != VK_SUCCESS) { - throw std::runtime_error("failed to set up debug messenger!"); - } - } - - void createSurface() { - if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != - VK_SUCCESS) { - throw std::runtime_error("failed to create window surface!"); - } - } - - void pickPhysicalDevice() { - uint32_t deviceCount = 0; - vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr); - - if (deviceCount == 0) { - throw std::runtime_error("failed to find GPUs with Vulkan support!"); - } - - std::vector devices(deviceCount); - vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data()); - - for (const auto& device : devices) { - if (isDeviceSuitable(device)) { - physicalDevice = device; - break; - } - } - - if (physicalDevice == VK_NULL_HANDLE) { - throw std::runtime_error("failed to find a suitable GPU!"); - } - - std::cout << "Selected physical device = " << physicalDevice << std::endl; - - VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; - vkPhysicalDeviceIDProperties.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; - vkPhysicalDeviceIDProperties.pNext = NULL; - - VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; - vkPhysicalDeviceProperties2.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; - - fpGetPhysicalDeviceProperties2(physicalDevice, - &vkPhysicalDeviceProperties2); - - memcpy(vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, - sizeof(vkDeviceUUID)); - } - - void getKhrExtensionsFn() { -#ifdef _WIN64 - - fpGetSemaphoreWin32HandleKHR = - (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr( - device, "vkGetSemaphoreWin32HandleKHR"); - if (fpGetSemaphoreWin32HandleKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetSemaphoreWin32HandleKHR\" not " - "found.\n"); - } -#else - fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr( - device, "vkGetSemaphoreFdKHR"); - if (fpGetSemaphoreFdKHR == NULL) { - throw std::runtime_error( - "Vulkan: Proc address for \"vkGetSemaphoreFdKHR\" not found.\n"); - } -#endif - } - - int setCudaVkDevice() { - int current_device = 0; - int device_count = 0; - int devices_prohibited = 0; - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the GPU which is selected by Vulkan - while (current_device < device_count) { - cudaGetDeviceProperties(&deviceProp, current_device); - - if ((deviceProp.computeMode != cudaComputeModeProhibited)) { - // Compare the cuda device UUID with vulkan UUID - int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE); - if (ret == 0) { - checkCudaErrors(cudaSetDevice(current_device)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, deviceProp.name, 
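[Editor's note] populateDebugMessengerCreateInfo wires pfnUserCallback to a debugCallback that this hunk does not show; the canonical shape, assumed here, is:

    #include <iostream>

    // Typical debug-utils callback: log the validation message, never abort
    // the Vulkan call that triggered it (hence VK_FALSE).
    static VKAPI_ATTR VkBool32 VKAPI_CALL
    debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
                  VkDebugUtilsMessageTypeFlagsEXT messageType,
                  const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData,
                  void *pUserData)
    {
        std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl;
        return VK_FALSE;
    }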
deviceProp.major, - deviceProp.minor); - - return current_device; +class vulkanImageCUDA +{ +public: + void loadImageData(const std::string &filename) + { + // load image (needed so we can get the width and height before we create + // the window + char *image_path = sdkFindFilePath(filename.c_str(), execution_path.c_str()); + + if (image_path == 0) { + printf("Error finding image file '%s'\n", filename.c_str()); + exit(EXIT_FAILURE); } - } else { - devices_prohibited++; - } + sdkLoadPPM4(image_path, (unsigned char **)&image_data, &imageWidth, &imageHeight); - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "CUDA error:" - " No Vulkan-CUDA Interop capable GPU found.\n"); - exit(EXIT_FAILURE); - } - - return -1; - } - - void createLogicalDevice() { - QueueFamilyIndices indices = findQueueFamilies(physicalDevice); - - std::vector queueCreateInfos; - std::set uniqueQueueFamilies = {indices.graphicsFamily, - indices.presentFamily}; - - float queuePriority = 1.0f; - for (int queueFamily : uniqueQueueFamilies) { - VkDeviceQueueCreateInfo queueCreateInfo = {}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueCreateInfo.queueFamilyIndex = queueFamily; - queueCreateInfo.queueCount = 1; - queueCreateInfo.pQueuePriorities = &queuePriority; - queueCreateInfos.push_back(queueCreateInfo); - } - - VkPhysicalDeviceFeatures deviceFeatures = {}; - deviceFeatures.samplerAnisotropy = VK_TRUE; - - VkDeviceCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - - createInfo.pQueueCreateInfos = queueCreateInfos.data(); - createInfo.queueCreateInfoCount = queueCreateInfos.size(); - - createInfo.pEnabledFeatures = &deviceFeatures; - std::vector enabledExtensionNameList; - - for (int i = 0; i < deviceExtensions.size(); i++) { - enabledExtensionNameList.push_back(deviceExtensions[i]); - } - if (enableValidationLayers) { - createInfo.enabledLayerCount = - static_cast(validationLayers.size()); - createInfo.ppEnabledLayerNames = validationLayers.data(); - } else { - createInfo.enabledLayerCount = 0; - } - createInfo.enabledExtensionCount = - static_cast(enabledExtensionNameList.size()); - createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); - - if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != - VK_SUCCESS) { - throw std::runtime_error("failed to create logical device!"); - } - vkGetDeviceQueue(device, indices.graphicsFamily, 0, &graphicsQueue); - vkGetDeviceQueue(device, indices.presentFamily, 0, &presentQueue); - } - - void createSwapChain() { - SwapChainSupportDetails swapChainSupport = - querySwapChainSupport(physicalDevice); - - VkSurfaceFormatKHR surfaceFormat = - chooseSwapSurfaceFormat(swapChainSupport.formats); - VkPresentModeKHR presentMode = - chooseSwapPresentMode(swapChainSupport.presentModes); - VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities); - - uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; - if (swapChainSupport.capabilities.maxImageCount > 0 && - imageCount > swapChainSupport.capabilities.maxImageCount) { - imageCount = swapChainSupport.capabilities.maxImageCount; - } - - VkSwapchainCreateInfoKHR createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - createInfo.surface = surface; - - createInfo.minImageCount = imageCount; - createInfo.imageFormat = surfaceFormat.format; - createInfo.imageColorSpace = surfaceFormat.colorSpace; - createInfo.imageExtent = extent; - createInfo.imageArrayLayers = 
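[Editor's note] createLogicalDevice (and several functions below) call findQueueFamilies(physicalDevice), which is defined outside this hunk. A sketch matching the QueueFamilyIndices struct above, written as a class member so it can use the sample's `surface` handle:

    QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device)
    {
        QueueFamilyIndices indices;

        uint32_t queueFamilyCount = 0;
        vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr);
        std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
        vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data());

        for (int i = 0; i < (int)queueFamilyCount; i++) {
            // Graphics-capable family for rendering ...
            if (queueFamilies[i].queueCount > 0 && (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT))
                indices.graphicsFamily = i;

            // ... and a family that can present to the window surface.
            VkBool32 presentSupport = false;
            vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport);
            if (queueFamilies[i].queueCount > 0 && presentSupport)
                indices.presentFamily = i;

            if (indices.isComplete())
                break;
        }
        return indices;
    }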
1; - createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - - QueueFamilyIndices indices = findQueueFamilies(physicalDevice); - uint32_t queueFamilyIndices[] = {(uint32_t)indices.graphicsFamily, - (uint32_t)indices.presentFamily}; - - if (indices.graphicsFamily != indices.presentFamily) { - createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; - createInfo.queueFamilyIndexCount = 2; - createInfo.pQueueFamilyIndices = queueFamilyIndices; - } else { - createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - } - - createInfo.preTransform = swapChainSupport.capabilities.currentTransform; - createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - createInfo.presentMode = presentMode; - createInfo.clipped = VK_TRUE; - - if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != - VK_SUCCESS) { - throw std::runtime_error("failed to create swap chain!"); - } - - vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr); - swapChainImages.resize(imageCount); - vkGetSwapchainImagesKHR(device, swapChain, &imageCount, - swapChainImages.data()); - - swapChainImageFormat = surfaceFormat.format; - swapChainExtent = extent; - } - - void createImageViews() { - swapChainImageViews.resize(swapChainImages.size()); - - for (size_t i = 0; i < swapChainImages.size(); i++) { - swapChainImageViews[i] = - createImageView(swapChainImages[i], swapChainImageFormat); - } - } - - void createRenderPass() { - VkAttachmentDescription colorAttachment = {}; - colorAttachment.format = swapChainImageFormat; - colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; - colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - - VkAttachmentReference colorAttachmentRef = {}; - colorAttachmentRef.attachment = 0; - colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass = {}; - subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass.colorAttachmentCount = 1; - subpass.pColorAttachments = &colorAttachmentRef; - - VkSubpassDependency dependency = {}; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - - VkRenderPassCreateInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderPassInfo.attachmentCount = 1; - renderPassInfo.pAttachments = &colorAttachment; - renderPassInfo.subpassCount = 1; - renderPassInfo.pSubpasses = &subpass; - renderPassInfo.dependencyCount = 1; - renderPassInfo.pDependencies = &dependency; - - if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != - VK_SUCCESS) { - throw std::runtime_error("failed to create render pass!"); - } - } - - void createDescriptorSetLayout() { - VkDescriptorSetLayoutBinding uboLayoutBinding = {}; - uboLayoutBinding.binding = 0; - uboLayoutBinding.descriptorCount = 1; - uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - uboLayoutBinding.pImmutableSamplers = nullptr; - 
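[Editor's note] The helpers createSwapChain leans on (querySwapChainSupport, chooseSwapSurfaceFormat, chooseSwapPresentMode, chooseSwapExtent) live outside this hunk. The extent chooser carries the only non-obvious logic, clamping the desired size into the surface's allowed range; a sketch assuming the sample's WIDTH/HEIGHT constants used in initWindow:

    #include <algorithm>
    #include <cstdint>

    VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR &capabilities)
    {
        // If the surface fixes currentExtent, we must use it as-is.
        if (capabilities.currentExtent.width != UINT32_MAX)
            return capabilities.currentExtent;

        VkExtent2D actualExtent = {WIDTH, HEIGHT};
        actualExtent.width  = std::max(capabilities.minImageExtent.width,
                                       std::min(capabilities.maxImageExtent.width, actualExtent.width));
        actualExtent.height = std::max(capabilities.minImageExtent.height,
                                       std::min(capabilities.maxImageExtent.height, actualExtent.height));
        return actualExtent;
    }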
uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - - VkDescriptorSetLayoutBinding samplerLayoutBinding = {}; - samplerLayoutBinding.binding = 1; - samplerLayoutBinding.descriptorCount = 1; - samplerLayoutBinding.descriptorType = - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - samplerLayoutBinding.pImmutableSamplers = nullptr; - samplerLayoutBinding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; - - std::array bindings = { - uboLayoutBinding, samplerLayoutBinding}; - VkDescriptorSetLayoutCreateInfo layoutInfo = {}; - layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - layoutInfo.bindingCount = static_cast(bindings.size()); - layoutInfo.pBindings = bindings.data(); - - if (vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, - &descriptorSetLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor set layout!"); - } - } - - void createGraphicsPipeline() { - auto vertShaderCode = readFile("vert.spv"); - auto fragShaderCode = readFile("frag.spv"); - - VkShaderModule vertShaderModule = createShaderModule(vertShaderCode); - VkShaderModule fragShaderModule = createShaderModule(fragShaderCode); - - VkPipelineShaderStageCreateInfo vertShaderStageInfo = {}; - vertShaderStageInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; - vertShaderStageInfo.module = vertShaderModule; - vertShaderStageInfo.pName = "main"; - - VkPipelineShaderStageCreateInfo fragShaderStageInfo = {}; - fragShaderStageInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; - fragShaderStageInfo.module = fragShaderModule; - fragShaderStageInfo.pName = "main"; - - VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, - fragShaderStageInfo}; - - VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; - vertexInputInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - - auto bindingDescription = Vertex::getBindingDescription(); - auto attributeDescriptions = Vertex::getAttributeDescriptions(); - - vertexInputInfo.vertexBindingDescriptionCount = 1; - vertexInputInfo.vertexAttributeDescriptionCount = - static_cast(attributeDescriptions.size()); - vertexInputInfo.pVertexBindingDescriptions = &bindingDescription; - vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data(); - - VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; - inputAssembly.sType = - VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - inputAssembly.primitiveRestartEnable = VK_FALSE; - - VkViewport viewport = {}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = (float)swapChainExtent.width; - viewport.height = (float)swapChainExtent.height; - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - - VkRect2D scissor = {}; - scissor.offset = {0, 0}; - scissor.extent = swapChainExtent; - - VkPipelineViewportStateCreateInfo viewportState = {}; - viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewportState.viewportCount = 1; - viewportState.pViewports = &viewport; - viewportState.scissorCount = 1; - viewportState.pScissors = &scissor; - - VkPipelineRasterizationStateCreateInfo rasterizer = {}; - rasterizer.sType = - VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterizer.depthClampEnable = VK_FALSE; - rasterizer.rasterizerDiscardEnable = VK_FALSE; - rasterizer.polygonMode = 
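[Editor's note] createGraphicsPipeline loads vert.spv/frag.spv through readFile and createShaderModule, both outside this hunk. Minimal sketches of the conventional implementations:

    #include <fstream>
    #include <stdexcept>
    #include <vector>

    static std::vector<char> readFile(const std::string &filename)
    {
        // Open at the end so tellg() gives the file size in one shot.
        std::ifstream file(filename, std::ios::ate | std::ios::binary);
        if (!file.is_open())
            throw std::runtime_error("failed to open file!");

        size_t fileSize = (size_t)file.tellg();
        std::vector<char> buffer(fileSize);
        file.seekg(0);
        file.read(buffer.data(), fileSize);
        return buffer;
    }

    VkShaderModule createShaderModule(const std::vector<char> &code)
    {
        VkShaderModuleCreateInfo createInfo = {};
        createInfo.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
        createInfo.codeSize = code.size();
        createInfo.pCode    = reinterpret_cast<const uint32_t *>(code.data());

        VkShaderModule shaderModule;
        if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS)
            throw std::runtime_error("failed to create shader module!");
        return shaderModule;
    }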
VK_POLYGON_MODE_FILL; - rasterizer.lineWidth = 1.0f; - rasterizer.cullMode = VK_CULL_MODE_BACK_BIT; - rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; - rasterizer.depthBiasEnable = VK_FALSE; - - VkPipelineMultisampleStateCreateInfo multisampling = {}; - multisampling.sType = - VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling.sampleShadingEnable = VK_FALSE; - multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - - VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; - colorBlendAttachment.colorWriteMask = - VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - colorBlendAttachment.blendEnable = VK_FALSE; - - VkPipelineColorBlendStateCreateInfo colorBlending = {}; - colorBlending.sType = - VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlending.logicOpEnable = VK_FALSE; - colorBlending.logicOp = VK_LOGIC_OP_COPY; - colorBlending.attachmentCount = 1; - colorBlending.pAttachments = &colorBlendAttachment; - colorBlending.blendConstants[0] = 0.0f; - colorBlending.blendConstants[1] = 0.0f; - colorBlending.blendConstants[2] = 0.0f; - colorBlending.blendConstants[3] = 0.0f; - - VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; - pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutInfo.setLayoutCount = 1; - pipelineLayoutInfo.pSetLayouts = &descriptorSetLayout; - - if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, - &pipelineLayout) != VK_SUCCESS) { - throw std::runtime_error("failed to create pipeline layout!"); - } - - VkGraphicsPipelineCreateInfo pipelineInfo = {}; - pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipelineInfo.stageCount = 2; - pipelineInfo.pStages = shaderStages; - pipelineInfo.pVertexInputState = &vertexInputInfo; - pipelineInfo.pInputAssemblyState = &inputAssembly; - pipelineInfo.pViewportState = &viewportState; - pipelineInfo.pRasterizationState = &rasterizer; - pipelineInfo.pMultisampleState = &multisampling; - pipelineInfo.pColorBlendState = &colorBlending; - pipelineInfo.layout = pipelineLayout; - pipelineInfo.renderPass = renderPass; - pipelineInfo.subpass = 0; - pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; - - if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, - nullptr, &graphicsPipeline) != VK_SUCCESS) { - throw std::runtime_error("failed to create graphics pipeline!"); - } - - vkDestroyShaderModule(device, fragShaderModule, nullptr); - vkDestroyShaderModule(device, vertShaderModule, nullptr); - } - - void createFramebuffers() { - swapChainFramebuffers.resize(swapChainImageViews.size()); - - for (size_t i = 0; i < swapChainImageViews.size(); i++) { - VkImageView attachments[] = {swapChainImageViews[i]}; - - VkFramebufferCreateInfo framebufferInfo = {}; - framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebufferInfo.renderPass = renderPass; - framebufferInfo.attachmentCount = 1; - framebufferInfo.pAttachments = attachments; - framebufferInfo.width = swapChainExtent.width; - framebufferInfo.height = swapChainExtent.height; - framebufferInfo.layers = 1; - - if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, - &swapChainFramebuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to create framebuffer!"); - } - } - } - - void createCommandPool() { - QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice); - - VkCommandPoolCreateInfo poolInfo = {}; - poolInfo.sType = 
VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily; - - if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != - VK_SUCCESS) { - throw std::runtime_error("failed to create graphics command pool!"); - } - } - - void createTextureImage() { - VkDeviceSize imageSize = imageWidth * imageHeight * 4; - mipLevels = static_cast( - std::floor(std::log2(std::max(imageWidth, imageHeight)))) + - 1; - printf("mipLevels = %d\n", mipLevels); - - if (!image_data) { - throw std::runtime_error("failed to load texture image!"); - } - - VkBuffer stagingBuffer; - VkDeviceMemory stagingBufferMemory; - createBuffer(imageSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - stagingBuffer, stagingBufferMemory); - - void* data; - vkMapMemory(device, stagingBufferMemory, 0, imageSize, 0, &data); - memcpy(data, image_data, static_cast(imageSize)); - vkUnmapMemory(device, stagingBufferMemory); - - // VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT - createImage( - imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UNORM, - VK_IMAGE_TILING_OPTIMAL, - VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | - VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, textureImage, textureImageMemory); - - transitionImageLayout(textureImage, VK_FORMAT_R8G8B8A8_UINT, - VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - copyBufferToImage(stagingBuffer, textureImage, - static_cast(imageWidth), - static_cast(imageHeight)); - - vkDestroyBuffer(device, stagingBuffer, nullptr); - vkFreeMemory(device, stagingBufferMemory, nullptr); - - generateMipmaps(textureImage, VK_FORMAT_R8G8B8A8_UNORM); - } - - void generateMipmaps(VkImage image, VkFormat imageFormat) { - VkFormatProperties formatProperties; - vkGetPhysicalDeviceFormatProperties(physicalDevice, imageFormat, - &formatProperties); - - if (!(formatProperties.optimalTilingFeatures & - VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT)) { - throw std::runtime_error( - "texture image format does not support linear blitting!"); - } - - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - - VkImageMemoryBarrier barrier = {}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.image = image; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - barrier.subresourceRange.levelCount = 1; - - int32_t mipWidth = imageWidth; - int32_t mipHeight = imageHeight; - - for (uint32_t i = 1; i < mipLevels; i++) { - barrier.subresourceRange.baseMipLevel = i - 1; - barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, - nullptr, 1, &barrier); - - VkImageBlit blit = {}; - blit.srcOffsets[0] = {0, 0, 0}; - blit.srcOffsets[1] = {mipWidth, mipHeight, 1}; - blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - blit.srcSubresource.mipLevel = i - 1; - blit.srcSubresource.baseArrayLayer = 0; - blit.srcSubresource.layerCount = 1; - blit.dstOffsets[0] = {0, 0, 
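[Editor's note] createTextureImage sizes the full mip chain as floor(log2(max(w, h))) + 1, i.e. enough levels to halve the larger axis down to 1. A tiny worked example of the same expression:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        unsigned int w = 512, h = 256;
        // Same formula as createTextureImage above: 512x256 -> 10 levels.
        unsigned int mipLevels =
            static_cast<unsigned int>(std::floor(std::log2(std::max(w, h)))) + 1;
        printf("%ux%u -> %u mip levels\n", w, h, mipLevels);
        return 0;
    }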
0}; - blit.dstOffsets[1] = {mipWidth > 1 ? mipWidth / 2 : 1, - mipHeight > 1 ? mipHeight / 2 : 1, 1}; - blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - blit.dstSubresource.mipLevel = i; - blit.dstSubresource.baseArrayLayer = 0; - blit.dstSubresource.layerCount = 1; - - vkCmdBlitImage(commandBuffer, image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, - image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &blit, - VK_FILTER_LINEAR); - - barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, nullptr, - 0, nullptr, 1, &barrier); - - if (mipWidth > 1) mipWidth /= 2; - if (mipHeight > 1) mipHeight /= 2; - } - - barrier.subresourceRange.baseMipLevel = mipLevels - 1; - barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, nullptr, - 0, nullptr, 1, &barrier); - - endSingleTimeCommands(commandBuffer); - } - -#ifdef _WIN64 // For windows - HANDLE getVkImageMemHandle( - VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) { - HANDLE handle; - - VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; - vkMemoryGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; - vkMemoryGetWin32HandleInfoKHR.pNext = NULL; - vkMemoryGetWin32HandleInfoKHR.memory = textureImageMemory; - vkMemoryGetWin32HandleInfoKHR.handleType = - (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; - - fpGetMemoryWin32HandleKHR(device, &vkMemoryGetWin32HandleInfoKHR, &handle); - return handle; - } - HANDLE getVkSemaphoreHandle( - VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, - VkSemaphore& semVkCuda) { - HANDLE handle; - - VkSemaphoreGetWin32HandleInfoKHR vulkanSemaphoreGetWin32HandleInfoKHR = {}; - vulkanSemaphoreGetWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; - vulkanSemaphoreGetWin32HandleInfoKHR.pNext = NULL; - vulkanSemaphoreGetWin32HandleInfoKHR.semaphore = semVkCuda; - vulkanSemaphoreGetWin32HandleInfoKHR.handleType = - externalSemaphoreHandleType; - - fpGetSemaphoreWin32HandleKHR(device, &vulkanSemaphoreGetWin32HandleInfoKHR, - &handle); - - return handle; - } -#else - int getVkImageMemHandle( - VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) { - if (externalMemoryHandleType == - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR) { - int fd; - - VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; - vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; - vkMemoryGetFdInfoKHR.pNext = NULL; - vkMemoryGetFdInfoKHR.memory = textureImageMemory; - vkMemoryGetFdInfoKHR.handleType = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; - - fpGetMemoryFdKHR(device, &vkMemoryGetFdInfoKHR, &fd); - - return fd; - } - return -1; - } - - int getVkSemaphoreHandle( - VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, - VkSemaphore& semVkCuda) { - if (externalSemaphoreHandleType == - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { - int fd; - - VkSemaphoreGetFdInfoKHR vulkanSemaphoreGetFdInfoKHR = 
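[Editor's note] generateMipmaps (and the transfer helpers further down) bracket their GPU work with beginSingleTimeCommands/endSingleTimeCommands, which are not part of this hunk. The conventional one-shot command buffer pattern they almost certainly follow:

    VkCommandBuffer beginSingleTimeCommands()
    {
        VkCommandBufferAllocateInfo allocInfo = {};
        allocInfo.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
        allocInfo.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
        allocInfo.commandPool        = commandPool;
        allocInfo.commandBufferCount = 1;

        VkCommandBuffer commandBuffer;
        vkAllocateCommandBuffers(device, &allocInfo, &commandBuffer);

        VkCommandBufferBeginInfo beginInfo = {};
        beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
        beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
        vkBeginCommandBuffer(commandBuffer, &beginInfo);
        return commandBuffer;
    }

    void endSingleTimeCommands(VkCommandBuffer commandBuffer)
    {
        vkEndCommandBuffer(commandBuffer);

        VkSubmitInfo submitInfo = {};
        submitInfo.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        submitInfo.commandBufferCount = 1;
        submitInfo.pCommandBuffers    = &commandBuffer;

        vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE);
        vkQueueWaitIdle(graphicsQueue); // serializing, but fine for setup work
        vkFreeCommandBuffers(device, commandPool, 1, &commandBuffer);
    }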
{}; - vulkanSemaphoreGetFdInfoKHR.sType = - VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; - vulkanSemaphoreGetFdInfoKHR.pNext = NULL; - vulkanSemaphoreGetFdInfoKHR.semaphore = semVkCuda; - vulkanSemaphoreGetFdInfoKHR.handleType = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; - - fpGetSemaphoreFdKHR(device, &vulkanSemaphoreGetFdInfoKHR, &fd); - - return fd; - } - return -1; - } -#endif - - void createTextureImageView() { - textureImageView = createImageView(textureImage, VK_FORMAT_R8G8B8A8_UNORM); - } - - void createTextureSampler() { - VkSamplerCreateInfo samplerInfo = {}; - samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - samplerInfo.magFilter = VK_FILTER_LINEAR; - samplerInfo.minFilter = VK_FILTER_LINEAR; - samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; - samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; - samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; - samplerInfo.anisotropyEnable = VK_TRUE; - samplerInfo.maxAnisotropy = 16; - samplerInfo.borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK; - samplerInfo.unnormalizedCoordinates = VK_FALSE; - samplerInfo.compareEnable = VK_FALSE; - samplerInfo.compareOp = VK_COMPARE_OP_ALWAYS; - samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; - samplerInfo.minLod = 0; // Optional - samplerInfo.maxLod = static_cast(mipLevels); - samplerInfo.mipLodBias = 0; // Optional - - if (vkCreateSampler(device, &samplerInfo, nullptr, &textureSampler) != - VK_SUCCESS) { - throw std::runtime_error("failed to create texture sampler!"); - } - } - - VkImageView createImageView(VkImage image, VkFormat format) { - VkImageViewCreateInfo viewInfo = {}; - viewInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - viewInfo.image = image; - viewInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - viewInfo.format = format; - viewInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - viewInfo.subresourceRange.baseMipLevel = 0; - viewInfo.subresourceRange.levelCount = mipLevels; - viewInfo.subresourceRange.baseArrayLayer = 0; - viewInfo.subresourceRange.layerCount = 1; - - VkImageView imageView; - if (vkCreateImageView(device, &viewInfo, nullptr, &imageView) != - VK_SUCCESS) { - throw std::runtime_error("failed to create texture image view!"); - } - - return imageView; - } - - void createImage(uint32_t width, uint32_t height, VkFormat format, - VkImageTiling tiling, VkImageUsageFlags usage, - VkMemoryPropertyFlags properties, VkImage& image, - VkDeviceMemory& imageMemory) { - VkImageCreateInfo imageInfo = {}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = VK_IMAGE_TYPE_2D; - imageInfo.extent.width = width; - imageInfo.extent.height = height; - imageInfo.extent.depth = 1; - imageInfo.mipLevels = mipLevels; - imageInfo.arrayLayers = 1; - imageInfo.format = format; - imageInfo.tiling = tiling; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = usage; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - VkExternalMemoryImageCreateInfo vkExternalMemImageCreateInfo = {}; - vkExternalMemImageCreateInfo.sType = - VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO; - vkExternalMemImageCreateInfo.pNext = NULL; -#ifdef _WIN64 - vkExternalMemImageCreateInfo.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; -#else - vkExternalMemImageCreateInfo.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; -#endif - - imageInfo.pNext = &vkExternalMemImageCreateInfo; - - if (vkCreateImage(device, &imageInfo, nullptr, 
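[Editor's note] createImage chains a VkExternalMemoryImageCreateInfo whose handleTypes must agree with the VkExportMemoryAllocateInfoKHR used at allocation time and with the CUDA import descriptor later on. A hypothetical helper (not in the sample) that makes this invariant explicit:

    // Hypothetical: keep the export handle type in one place so image creation,
    // memory allocation, and the CUDA import cannot drift apart.
    static VkExternalMemoryHandleTypeFlagBits exportHandleType()
    {
    #ifdef _WIN64
        return IsWindows8OrGreater() ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT
                                     : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT;
    #else
        return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
    #endif
    }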
&image) != VK_SUCCESS) { - throw std::runtime_error("failed to create image!"); - } - - VkMemoryRequirements memRequirements; - vkGetImageMemoryRequirements(device, image, &memRequirements); - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; - vulkanExportMemoryWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; - vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; - vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; - vulkanExportMemoryWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; -#endif - VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; - vulkanExportMemoryAllocateInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; -#ifdef _WIN64 - vulkanExportMemoryAllocateInfoKHR.pNext = - IsWindows8OrGreater() ? &vulkanExportMemoryWin32HandleInfoKHR : NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = - IsWindows8OrGreater() - ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; -#else - vulkanExportMemoryAllocateInfoKHR.pNext = NULL; - vulkanExportMemoryAllocateInfoKHR.handleTypes = - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; -#endif - - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memRequirements.size; - allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; - allocInfo.memoryTypeIndex = - findMemoryType(memRequirements.memoryTypeBits, properties); - - VkMemoryRequirements vkMemoryRequirements = {}; - vkGetImageMemoryRequirements(device, image, &vkMemoryRequirements); - totalImageMemSize = vkMemoryRequirements.size; - - if (vkAllocateMemory(device, &allocInfo, nullptr, &textureImageMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate image memory!"); - } - - vkBindImageMemory(device, image, textureImageMemory, 0); - } - - void cudaVkImportSemaphore() { - cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; - memset(&externalSemaphoreHandleDesc, 0, - sizeof(externalSemaphoreHandleDesc)); -#ifdef _WIN64 - externalSemaphoreHandleDesc.type = - IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 - : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle( - IsWindows8OrGreater() - ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, - cudaUpdateVkSemaphore); -#else - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; - externalSemaphoreHandleDesc.handle.fd = getVkSemaphoreHandle( - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, cudaUpdateVkSemaphore); -#endif - externalSemaphoreHandleDesc.flags = 0; - - checkCudaErrors(cudaImportExternalSemaphore(&cudaExtCudaUpdateVkSemaphore, - &externalSemaphoreHandleDesc)); - - memset(&externalSemaphoreHandleDesc, 0, - sizeof(externalSemaphoreHandleDesc)); -#ifdef _WIN64 - externalSemaphoreHandleDesc.type = - IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 - : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; - ; - externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle( - IsWindows8OrGreater() - ? 
VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, - vkUpdateCudaSemaphore); -#else - externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; - externalSemaphoreHandleDesc.handle.fd = getVkSemaphoreHandle( - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, vkUpdateCudaSemaphore); -#endif - externalSemaphoreHandleDesc.flags = 0; - checkCudaErrors(cudaImportExternalSemaphore(&cudaExtVkUpdateCudaSemaphore, - &externalSemaphoreHandleDesc)); - printf("CUDA Imported Vulkan semaphore\n"); - } - - void cudaVkImportImageMem() { - cudaExternalMemoryHandleDesc cudaExtMemHandleDesc; - memset(&cudaExtMemHandleDesc, 0, sizeof(cudaExtMemHandleDesc)); -#ifdef _WIN64 - cudaExtMemHandleDesc.type = - IsWindows8OrGreater() ? cudaExternalMemoryHandleTypeOpaqueWin32 - : cudaExternalMemoryHandleTypeOpaqueWin32Kmt; - cudaExtMemHandleDesc.handle.win32.handle = getVkImageMemHandle( - IsWindows8OrGreater() - ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT); -#else - cudaExtMemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; - - cudaExtMemHandleDesc.handle.fd = - getVkImageMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR); -#endif - cudaExtMemHandleDesc.size = totalImageMemSize; - - checkCudaErrors(cudaImportExternalMemory(&cudaExtMemImageBuffer, - &cudaExtMemHandleDesc)); - - cudaExternalMemoryMipmappedArrayDesc externalMemoryMipmappedArrayDesc; - - memset(&externalMemoryMipmappedArrayDesc, 0, - sizeof(externalMemoryMipmappedArrayDesc)); - - cudaExtent extent = make_cudaExtent(imageWidth, imageHeight, 0); - cudaChannelFormatDesc formatDesc; - formatDesc.x = 8; - formatDesc.y = 8; - formatDesc.z = 8; - formatDesc.w = 8; - formatDesc.f = cudaChannelFormatKindUnsigned; - - externalMemoryMipmappedArrayDesc.offset = 0; - externalMemoryMipmappedArrayDesc.formatDesc = formatDesc; - externalMemoryMipmappedArrayDesc.extent = extent; - externalMemoryMipmappedArrayDesc.flags = 0; - externalMemoryMipmappedArrayDesc.numLevels = mipLevels; - - checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray( - &cudaMipmappedImageArray, cudaExtMemImageBuffer, - &externalMemoryMipmappedArrayDesc)); - - checkCudaErrors(cudaMallocMipmappedArray(&cudaMipmappedImageArrayTemp, - &formatDesc, extent, mipLevels)); - checkCudaErrors(cudaMallocMipmappedArray(&cudaMipmappedImageArrayOrig, - &formatDesc, extent, mipLevels)); - - for (int mipLevelIdx = 0; mipLevelIdx < mipLevels; mipLevelIdx++) { - cudaArray_t cudaMipLevelArray, cudaMipLevelArrayTemp, - cudaMipLevelArrayOrig; - cudaResourceDesc resourceDesc; - - checkCudaErrors(cudaGetMipmappedArrayLevel( - &cudaMipLevelArray, cudaMipmappedImageArray, mipLevelIdx)); - checkCudaErrors(cudaGetMipmappedArrayLevel( - &cudaMipLevelArrayTemp, cudaMipmappedImageArrayTemp, mipLevelIdx)); - checkCudaErrors(cudaGetMipmappedArrayLevel( - &cudaMipLevelArrayOrig, cudaMipmappedImageArrayOrig, mipLevelIdx)); - - uint32_t width = - (imageWidth >> mipLevelIdx) ? (imageWidth >> mipLevelIdx) : 1; - uint32_t height = - (imageHeight >> mipLevelIdx) ? 
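[Editor's note] The two semaphores imported here are driven by cudaVkSemaphoreWait/cudaVkSemaphoreSignal in cudaUpdateVkImage below, and those wrappers are not in this hunk. They are plausibly thin wrappers over the CUDA external-semaphore API, roughly:

    void cudaVkSemaphoreSignal(cudaExternalSemaphore_t &extSemaphore)
    {
        cudaExternalSemaphoreSignalParams params;
        memset(&params, 0, sizeof(params));
        params.params.fence.value = 0; // binary semaphore: value unused
        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSemaphore, &params, 1, streamToRun));
    }

    void cudaVkSemaphoreWait(cudaExternalSemaphore_t &extSemaphore)
    {
        cudaExternalSemaphoreWaitParams params;
        memset(&params, 0, sizeof(params));
        params.params.fence.value = 0;
        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSemaphore, &params, 1, streamToRun));
    }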
(imageHeight >> mipLevelIdx) : 1; - checkCudaErrors(cudaMemcpy2DArrayToArray( - cudaMipLevelArrayOrig, 0, 0, cudaMipLevelArray, 0, 0, - width * sizeof(uchar4), height, cudaMemcpyDeviceToDevice)); - - memset(&resourceDesc, 0, sizeof(resourceDesc)); - resourceDesc.resType = cudaResourceTypeArray; - resourceDesc.res.array.array = cudaMipLevelArray; - - cudaSurfaceObject_t surfaceObject; - checkCudaErrors(cudaCreateSurfaceObject(&surfaceObject, &resourceDesc)); - - surfaceObjectList.push_back(surfaceObject); - - memset(&resourceDesc, 0, sizeof(resourceDesc)); - resourceDesc.resType = cudaResourceTypeArray; - resourceDesc.res.array.array = cudaMipLevelArrayTemp; - - cudaSurfaceObject_t surfaceObjectTemp; - checkCudaErrors( - cudaCreateSurfaceObject(&surfaceObjectTemp, &resourceDesc)); - surfaceObjectListTemp.push_back(surfaceObjectTemp); - } - - cudaResourceDesc resDescr; - memset(&resDescr, 0, sizeof(cudaResourceDesc)); - - resDescr.resType = cudaResourceTypeMipmappedArray; - resDescr.res.mipmap.mipmap = cudaMipmappedImageArrayOrig; - - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - - texDescr.normalizedCoords = true; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.mipmapFilterMode = cudaFilterModeLinear; - - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.addressMode[1] = cudaAddressModeWrap; - - texDescr.maxMipmapLevelClamp = float(mipLevels - 1); - - texDescr.readMode = cudaReadModeNormalizedFloat; - - checkCudaErrors(cudaCreateTextureObject(&textureObjMipMapInput, &resDescr, - &texDescr, NULL)); - - checkCudaErrors(cudaMalloc((void**)&d_surfaceObjectList, - sizeof(cudaSurfaceObject_t) * mipLevels)); - checkCudaErrors(cudaMalloc((void**)&d_surfaceObjectListTemp, - sizeof(cudaSurfaceObject_t) * mipLevels)); - - checkCudaErrors(cudaMemcpy(d_surfaceObjectList, surfaceObjectList.data(), - sizeof(cudaSurfaceObject_t) * mipLevels, - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy( - d_surfaceObjectListTemp, surfaceObjectListTemp.data(), - sizeof(cudaSurfaceObject_t) * mipLevels, cudaMemcpyHostToDevice)); - - printf("CUDA Kernel Vulkan image buffer\n"); - } - - void cudaUpdateVkImage() { - cudaVkSemaphoreWait(cudaExtVkUpdateCudaSemaphore); - - int nthreads = 128; - - /*Perform 2D box filter on image using CUDA */ - d_boxfilter_rgba_x<<>>( - d_surfaceObjectListTemp, textureObjMipMapInput, imageWidth, imageHeight, - mipLevels, filter_radius); - - d_boxfilter_rgba_y<<>>( - d_surfaceObjectList, d_surfaceObjectListTemp, imageWidth, imageHeight, - mipLevels, filter_radius); - - varySigma(); - - cudaVkSemaphoreSignal(cudaExtCudaUpdateVkSemaphore); - } - - void transitionImageLayout(VkImage image, VkFormat format, - VkImageLayout oldLayout, VkImageLayout newLayout) { - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - - VkImageMemoryBarrier barrier = {}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = mipLevels; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - - VkPipelineStageFlags sourceStage; - VkPipelineStageFlags destinationStage; - - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) 
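[Editor's note] The kernel launches in cudaUpdateVkImage render above as "<<>>"; the launch configuration between the angle brackets was lost in extraction. A plausible reconstruction (an assumption, including the exact grid math) launches one thread per row for the x pass and one per column for the y pass, on streamToRun:

    // Assumed reconstruction of the mangled launch configurations above.
    void launchBoxFilter(cudaSurfaceObject_t *dst, cudaSurfaceObject_t *tmp,
                         cudaTextureObject_t texIn,
                         unsigned int w, unsigned int h, unsigned int levels,
                         int radius, cudaStream_t stream)
    {
        int nthreads = 128;
        d_boxfilter_rgba_x<<<(h + nthreads - 1) / nthreads, nthreads, 0, stream>>>(
            tmp, texIn, w, h, levels, radius);
        d_boxfilter_rgba_y<<<(w + nthreads - 1) / nthreads, nthreads, 0, stream>>>(
            dst, tmp, w, h, levels, radius);
    }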
{ - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - - sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - } else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - - sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } else { - throw std::invalid_argument("unsupported layout transition!"); - } - - vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, - nullptr, 0, nullptr, 1, &barrier); - - endSingleTimeCommands(commandBuffer); - } - - void copyBufferToImage(VkBuffer buffer, VkImage image, uint32_t width, - uint32_t height) { - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - - VkBufferImageCopy region = {}; - region.bufferOffset = 0; - region.bufferRowLength = 0; - region.bufferImageHeight = 0; - region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - region.imageSubresource.mipLevel = 0; - region.imageSubresource.baseArrayLayer = 0; - region.imageSubresource.layerCount = 1; - region.imageOffset = {0, 0, 0}; - region.imageExtent = {width, height, 1}; - - vkCmdCopyBufferToImage(commandBuffer, buffer, image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region); - - endSingleTimeCommands(commandBuffer); - } - - void createVertexBuffer() { - VkDeviceSize bufferSize = sizeof(vertices[0]) * vertices.size(); - - VkBuffer stagingBuffer; - VkDeviceMemory stagingBufferMemory; - createBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - stagingBuffer, stagingBufferMemory); - - void* data; - vkMapMemory(device, stagingBufferMemory, 0, bufferSize, 0, &data); - memcpy(data, vertices.data(), (size_t)bufferSize); - vkUnmapMemory(device, stagingBufferMemory); - - createBuffer( - bufferSize, - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, vertexBuffer, vertexBufferMemory); - - copyBuffer(stagingBuffer, vertexBuffer, bufferSize); - - vkDestroyBuffer(device, stagingBuffer, nullptr); - vkFreeMemory(device, stagingBufferMemory, nullptr); - } - - void createIndexBuffer() { - VkDeviceSize bufferSize = sizeof(indices[0]) * indices.size(); - - VkBuffer stagingBuffer; - VkDeviceMemory stagingBufferMemory; - createBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - stagingBuffer, stagingBufferMemory); - - void* data; - vkMapMemory(device, stagingBufferMemory, 0, bufferSize, 0, &data); - memcpy(data, indices.data(), (size_t)bufferSize); - vkUnmapMemory(device, stagingBufferMemory); - - createBuffer( - bufferSize, - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, indexBuffer, indexBufferMemory); - - copyBuffer(stagingBuffer, indexBuffer, bufferSize); - - vkDestroyBuffer(device, stagingBuffer, nullptr); - vkFreeMemory(device, stagingBufferMemory, nullptr); - } - - void createUniformBuffers() { - VkDeviceSize bufferSize = sizeof(UniformBufferObject); - - uniformBuffers.resize(swapChainImages.size()); - uniformBuffersMemory.resize(swapChainImages.size()); - - for (size_t i = 0; i < swapChainImages.size(); i++) { - createBuffer(bufferSize, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, -
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - uniformBuffers[i], uniformBuffersMemory[i]); - } - } - - void createDescriptorPool() { - std::array<VkDescriptorPoolSize, 2> poolSizes = {}; - poolSizes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - poolSizes[0].descriptorCount = - static_cast<uint32_t>(swapChainImages.size()); - poolSizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - poolSizes[1].descriptorCount = - static_cast<uint32_t>(swapChainImages.size()); - - VkDescriptorPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - poolInfo.poolSizeCount = static_cast<uint32_t>(poolSizes.size()); - poolInfo.pPoolSizes = poolSizes.data(); - poolInfo.maxSets = static_cast<uint32_t>(swapChainImages.size()); - - if (vkCreateDescriptorPool(device, &poolInfo, nullptr, &descriptorPool) != - VK_SUCCESS) { - throw std::runtime_error("failed to create descriptor pool!"); - } - } - - void createDescriptorSets() { - std::vector<VkDescriptorSetLayout> layouts(swapChainImages.size(), - descriptorSetLayout); - VkDescriptorSetAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocInfo.descriptorPool = descriptorPool; - allocInfo.descriptorSetCount = - static_cast<uint32_t>(swapChainImages.size()); - allocInfo.pSetLayouts = layouts.data(); - - descriptorSets.resize(swapChainImages.size()); - if (vkAllocateDescriptorSets(device, &allocInfo, descriptorSets.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate descriptor sets!"); - } - - for (size_t i = 0; i < swapChainImages.size(); i++) { - VkDescriptorBufferInfo bufferInfo = {}; - bufferInfo.buffer = uniformBuffers[i]; - bufferInfo.offset = 0; - bufferInfo.range = sizeof(UniformBufferObject); - - VkDescriptorImageInfo imageInfo = {}; - imageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - imageInfo.imageView = textureImageView; - imageInfo.sampler = textureSampler; - - std::array<VkWriteDescriptorSet, 2> descriptorWrites = {}; - - descriptorWrites[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrites[0].dstSet = descriptorSets[i]; - descriptorWrites[0].dstBinding = 0; - descriptorWrites[0].dstArrayElement = 0; - descriptorWrites[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorWrites[0].descriptorCount = 1; - descriptorWrites[0].pBufferInfo = &bufferInfo; - - descriptorWrites[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrites[1].dstSet = descriptorSets[i]; - descriptorWrites[1].dstBinding = 1; - descriptorWrites[1].dstArrayElement = 0; - descriptorWrites[1].descriptorType = - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - descriptorWrites[1].descriptorCount = 1; - descriptorWrites[1].pImageInfo = &imageInfo; - - vkUpdateDescriptorSets(device, - static_cast<uint32_t>(descriptorWrites.size()), - descriptorWrites.data(), 0, nullptr); - } - } - - void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, - VkMemoryPropertyFlags properties, VkBuffer& buffer, - VkDeviceMemory& bufferMemory) { - VkBufferCreateInfo bufferInfo = {}; - bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferInfo.size = size; - bufferInfo.usage = usage; - bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - - if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { - throw std::runtime_error("failed to create buffer!"); - } - - VkMemoryRequirements memRequirements; - vkGetBufferMemoryRequirements(device, buffer, &memRequirements); - - VkMemoryAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize =
memRequirements.size; - allocInfo.memoryTypeIndex = - findMemoryType(memRequirements.memoryTypeBits, properties); - - if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate buffer memory!"); - } - - vkBindBufferMemory(device, buffer, bufferMemory, 0); - } - - VkCommandBuffer beginSingleTimeCommands() { - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandPool = commandPool; - allocInfo.commandBufferCount = 1; - - VkCommandBuffer commandBuffer; - vkAllocateCommandBuffers(device, &allocInfo, &commandBuffer); - - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - - vkBeginCommandBuffer(commandBuffer, &beginInfo); - - return commandBuffer; - } - - void endSingleTimeCommands(VkCommandBuffer commandBuffer) { - vkEndCommandBuffer(commandBuffer); - - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - - vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); - vkQueueWaitIdle(graphicsQueue); - - vkFreeCommandBuffers(device, commandPool, 1, &commandBuffer); - } - - void copyBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size) { - VkCommandBuffer commandBuffer = beginSingleTimeCommands(); - - VkBufferCopy copyRegion = {}; - copyRegion.size = size; - vkCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, 1, &copyRegion); - - endSingleTimeCommands(commandBuffer); - } - - uint32_t findMemoryType(uint32_t typeFilter, - VkMemoryPropertyFlags properties) { - VkPhysicalDeviceMemoryProperties memProperties; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); - - for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { - if ((typeFilter & (1 << i)) && - (memProperties.memoryTypes[i].propertyFlags & properties) == - properties) { - return i; - } - } - - throw std::runtime_error("failed to find suitable memory type!"); - } - - void createCommandBuffers() { - commandBuffers.resize(swapChainFramebuffers.size()); - - VkCommandBufferAllocateInfo allocInfo = {}; - allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - allocInfo.commandPool = commandPool; - allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - allocInfo.commandBufferCount = (uint32_t)commandBuffers.size(); - - if (vkAllocateCommandBuffers(device, &allocInfo, commandBuffers.data()) != - VK_SUCCESS) { - throw std::runtime_error("failed to allocate command buffers!"); - } - - for (size_t i = 0; i < commandBuffers.size(); i++) { - VkCommandBufferBeginInfo beginInfo = {}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; - - if (vkBeginCommandBuffer(commandBuffers[i], &beginInfo) != VK_SUCCESS) { - throw std::runtime_error("failed to begin recording command buffer!"); - } - - VkRenderPassBeginInfo renderPassInfo = {}; - renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderPassInfo.renderPass = renderPass; - renderPassInfo.framebuffer = swapChainFramebuffers[i]; - renderPassInfo.renderArea.offset = {0, 0}; - renderPassInfo.renderArea.extent = swapChainExtent; - - VkClearValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f}; - renderPassInfo.clearValueCount = 1; -
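findMemoryType above is the standard Vulkan memory-type search: typeFilter arrives as VkMemoryRequirements::memoryTypeBits, where bit i set means memory type i may back the resource, and the flags test demands a superset of the requested properties rather than a mere overlap. Restated as a hypothetical predicate for clarity:

// Equivalent test, assuming flags = memProperties.memoryTypes[i].propertyFlags:
static bool memoryTypeUsable(uint32_t typeFilter, uint32_t i,
                             VkMemoryPropertyFlags flags, VkMemoryPropertyFlags wanted)
{
    return (typeFilter & (1u << i)) != 0 // type i is legal for this resource
        && (flags & wanted) == wanted;   // and carries every requested property
}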
renderPassInfo.pClearValues = &clearColor; - - vkCmdBeginRenderPass(commandBuffers[i], &renderPassInfo, - VK_SUBPASS_CONTENTS_INLINE); - - vkCmdBindPipeline(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, - graphicsPipeline); - - VkBuffer vertexBuffers[] = {vertexBuffer}; - VkDeviceSize offsets[] = {0}; - vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, vertexBuffers, offsets); - - vkCmdBindIndexBuffer(commandBuffers[i], indexBuffer, 0, - VK_INDEX_TYPE_UINT16); - - vkCmdBindDescriptorSets(commandBuffers[i], - VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, - 0, 1, &descriptorSets[i], 0, nullptr); - - vkCmdDrawIndexed(commandBuffers[i], static_cast<uint32_t>(indices.size()), - 1, 0, 0, 0); - // vkCmdDraw(commandBuffers[i], static_cast<uint32_t>(vertices.size()), 1, - // 0, 0); - - vkCmdEndRenderPass(commandBuffers[i]); - - if (vkEndCommandBuffer(commandBuffers[i]) != VK_SUCCESS) { - throw std::runtime_error("failed to record command buffer!"); - } - } - } - - void createSyncObjects() { - imageAvailableSemaphores.resize(MAX_FRAMES); - renderFinishedSemaphores.resize(MAX_FRAMES); - inFlightFences.resize(MAX_FRAMES); - - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - - VkFenceCreateInfo fenceInfo = {}; - fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; - - for (size_t i = 0; i < MAX_FRAMES; i++) { - if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &imageAvailableSemaphores[i]) != VK_SUCCESS || - vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &renderFinishedSemaphores[i]) != VK_SUCCESS || - vkCreateFence(device, &fenceInfo, nullptr, &inFlightFences[i]) != - VK_SUCCESS) { - throw std::runtime_error( - "failed to create synchronization objects for a frame!"); - } - } - } - - void createSyncObjectsExt() { - VkSemaphoreCreateInfo semaphoreInfo = {}; - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - - memset(&semaphoreInfo, 0, sizeof(semaphoreInfo)); - semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - -#ifdef _WIN64 - WindowsSecurityAttributes winSecurityAttributes; - - VkExportSemaphoreWin32HandleInfoKHR - vulkanExportSemaphoreWin32HandleInfoKHR = {}; - vulkanExportSemaphoreWin32HandleInfoKHR.sType = - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; - vulkanExportSemaphoreWin32HandleInfoKHR.pNext = NULL; - vulkanExportSemaphoreWin32HandleInfoKHR.pAttributes = - &winSecurityAttributes; - vulkanExportSemaphoreWin32HandleInfoKHR.dwAccess = - DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; - vulkanExportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; -#endif - VkExportSemaphoreCreateInfoKHR vulkanExportSemaphoreCreateInfo = {}; - vulkanExportSemaphoreCreateInfo.sType = - VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; -#ifdef _WIN64 - vulkanExportSemaphoreCreateInfo.pNext = - IsWindows8OrGreater() ? &vulkanExportSemaphoreWin32HandleInfoKHR : NULL; - vulkanExportSemaphoreCreateInfo.handleTypes = - IsWindows8OrGreater() - ?
VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT - : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; -#else - vulkanExportSemaphoreCreateInfo.pNext = NULL; - vulkanExportSemaphoreCreateInfo.handleTypes = - VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; -#endif - semaphoreInfo.pNext = &vulkanExportSemaphoreCreateInfo; - - if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &cudaUpdateVkSemaphore) != VK_SUCCESS || - vkCreateSemaphore(device, &semaphoreInfo, nullptr, - &vkUpdateCudaSemaphore) != VK_SUCCESS) { - throw std::runtime_error( - "failed to create synchronization objects for a CUDA-Vulkan!"); - } - } - - void updateUniformBuffer() { - UniformBufferObject ubo = {}; - - mat4x4_identity(ubo.model); - mat4x4 Model; - mat4x4_dup(Model, ubo.model); - mat4x4_rotate(ubo.model, Model, 0.0f, 0.0f, 1.0f, degreesToRadians(135.0f)); - - vec3 eye = {2.0f, 2.0f, 2.0f}; - vec3 center = {0.0f, 0.0f, 0.0f}; - vec3 up = {0.0f, 0.0f, 1.0f}; - mat4x4_look_at(ubo.view, eye, center, up); - - mat4x4_perspective(ubo.proj, degreesToRadians(45.0f), - swapChainExtent.width / (float)swapChainExtent.height, - 0.1f, 10.0f); - ubo.proj[1][1] *= -1; - - for (size_t i = 0; i < swapChainImages.size(); i++) { - void* data; - vkMapMemory(device, uniformBuffersMemory[i], 0, sizeof(ubo), 0, &data); - memcpy(data, &ubo, sizeof(ubo)); - vkUnmapMemory(device, uniformBuffersMemory[i]); - } - } - - void drawFrame() { - static int startSubmit = 0; - - vkWaitForFences(device, 1, &inFlightFences[currentFrame], VK_TRUE, - std::numeric_limits<uint64_t>::max()); - - uint32_t imageIndex; - VkResult result = vkAcquireNextImageKHR( - device, swapChain, std::numeric_limits<uint64_t>::max(), - imageAvailableSemaphores[currentFrame], VK_NULL_HANDLE, &imageIndex); - - if (result == VK_ERROR_OUT_OF_DATE_KHR) { - recreateSwapChain(); - return; - } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { - throw std::runtime_error("failed to acquire swap chain image!"); - } - - vkResetFences(device, 1, &inFlightFences[currentFrame]); - - if (!startSubmit) { - submitVulkan(imageIndex); - startSubmit = 1; - } else { - submitVulkanCuda(imageIndex); - } - - VkPresentInfoKHR presentInfo = {}; - presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - - VkSemaphore signalSemaphores[] = {renderFinishedSemaphores[currentFrame]}; - - presentInfo.waitSemaphoreCount = 1; - presentInfo.pWaitSemaphores = signalSemaphores; - - VkSwapchainKHR swapChains[] = {swapChain}; - presentInfo.swapchainCount = 1; - presentInfo.pSwapchains = swapChains; - presentInfo.pImageIndices = &imageIndex; - presentInfo.pResults = nullptr; // Optional - - result = vkQueuePresentKHR(presentQueue, &presentInfo); - - if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || - framebufferResized) { - framebufferResized = false; - recreateSwapChain(); - } else if (result != VK_SUCCESS) { - throw std::runtime_error("failed to present swap chain image!"); - } - - cudaUpdateVkImage(); - - currentFrame = (currentFrame + 1) % MAX_FRAMES; - // Added sleep of 10 millisecs so that CPU does not submit too much work to - // GPU - std::this_thread::sleep_for(std::chrono::microseconds(10000)); - char title[256]; - sprintf(title, "Vulkan Image CUDA Box Filter (radius=%d)", filter_radius); - glfwSetWindowTitle(window, title); - } - - void cudaVkSemaphoreSignal(cudaExternalSemaphore_t& extSemaphore) { - cudaExternalSemaphoreSignalParams extSemaphoreSignalParams; - memset(&extSemaphoreSignalParams, 0, sizeof(extSemaphoreSignalParams)); - -
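drawFrame above hands the image back and forth between the graphics queue and streamToRun on every frame after the first; the two exported semaphores serialize that exchange. A sketch of the ordering, assuming the imports pair vkUpdateCudaSemaphore with cudaExtVkUpdateCudaSemaphore and cudaUpdateVkSemaphore with cudaExtCudaUpdateVkSemaphore:

// 1. vkQueueSubmit signals vkUpdateCudaSemaphore once rendering is done.
// 2. cudaWaitExternalSemaphoresAsync blocks streamToRun on that signal.
// 3. d_boxfilter_rgba_x / d_boxfilter_rgba_y update the image on streamToRun.
// 4. cudaSignalExternalSemaphoresAsync signals cudaUpdateVkSemaphore.
// 5. The next submitVulkanCuda waits on it before the image is sampled again.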
extSemaphoreSignalParams.params.fence.value = 0; - extSemaphoreSignalParams.flags = 0; - checkCudaErrors(cudaSignalExternalSemaphoresAsync( - &extSemaphore, &extSemaphoreSignalParams, 1, streamToRun)); - } - - void cudaVkSemaphoreWait(cudaExternalSemaphore_t& extSemaphore) { - cudaExternalSemaphoreWaitParams extSemaphoreWaitParams; - - memset(&extSemaphoreWaitParams, 0, sizeof(extSemaphoreWaitParams)); - - extSemaphoreWaitParams.params.fence.value = 0; - extSemaphoreWaitParams.flags = 0; - - checkCudaErrors(cudaWaitExternalSemaphoresAsync( - &extSemaphore, &extSemaphoreWaitParams, 1, streamToRun)); - } - - void submitVulkan(uint32_t imageIndex) { - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - - VkSemaphore waitSemaphores[] = {imageAvailableSemaphores[currentFrame]}; - VkPipelineStageFlags waitStages[] = { - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; - submitInfo.waitSemaphoreCount = 1; - submitInfo.pWaitSemaphores = waitSemaphores; - submitInfo.pWaitDstStageMask = waitStages; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; - - VkSemaphore signalSemaphores[] = {renderFinishedSemaphores[currentFrame], - vkUpdateCudaSemaphore}; - - submitInfo.signalSemaphoreCount = 2; - submitInfo.pSignalSemaphores = signalSemaphores; - - if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, - inFlightFences[currentFrame]) != VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } - } - - void submitVulkanCuda(uint32_t imageIndex) { - VkSubmitInfo submitInfo = {}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - - VkSemaphore waitSemaphores[] = {imageAvailableSemaphores[currentFrame], - cudaUpdateVkSemaphore}; - VkPipelineStageFlags waitStages[] = { - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; - submitInfo.waitSemaphoreCount = 2; - submitInfo.pWaitSemaphores = waitSemaphores; - submitInfo.pWaitDstStageMask = waitStages; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; - - VkSemaphore signalSemaphores[] = {renderFinishedSemaphores[currentFrame], - vkUpdateCudaSemaphore}; - - submitInfo.signalSemaphoreCount = 2; - submitInfo.pSignalSemaphores = signalSemaphores; - - if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, - inFlightFences[currentFrame]) != VK_SUCCESS) { - throw std::runtime_error("failed to submit draw command buffer!"); - } - } - - VkShaderModule createShaderModule(const std::vector<char>& code) { - VkShaderModuleCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.codeSize = code.size(); - createInfo.pCode = reinterpret_cast<const uint32_t*>(code.data()); - - VkShaderModule shaderModule; - if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != - VK_SUCCESS) { - throw std::runtime_error("failed to create shader module!"); - } - - return shaderModule; - } - - VkSurfaceFormatKHR chooseSwapSurfaceFormat( - const std::vector<VkSurfaceFormatKHR>& availableFormats) { - if (availableFormats.size() == 1 && - availableFormats[0].format == VK_FORMAT_UNDEFINED) { - return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; - } - - for (const auto& availableFormat : availableFormats) { - if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && - availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { - return availableFormat; - } - } - - return availableFormats[0]; - } - - VkPresentModeKHR chooseSwapPresentMode( - const std::vector<VkPresentModeKHR>&
availablePresentModes) { - VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; - - for (const auto& availablePresentMode : availablePresentModes) { - if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { - return availablePresentMode; - } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { - bestMode = availablePresentMode; - } - } - - return bestMode; - } - - VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) { - if (capabilities.currentExtent.width != - std::numeric_limits<uint32_t>::max()) { - return capabilities.currentExtent; - } else { - int width, height; - glfwGetFramebufferSize(window, &width, &height); - - VkExtent2D actualExtent = {static_cast<uint32_t>(width), - static_cast<uint32_t>(height)}; - - actualExtent.width = std::max( - capabilities.minImageExtent.width, - std::min(capabilities.maxImageExtent.width, actualExtent.width)); - actualExtent.height = std::max( - capabilities.minImageExtent.height, - std::min(capabilities.maxImageExtent.height, actualExtent.height)); - - return actualExtent; - } - } - - SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) { - SwapChainSupportDetails details; - - vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, - &details.capabilities); - - uint32_t formatCount; - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, - nullptr); - - if (formatCount != 0) { - details.formats.resize(formatCount); - vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, - details.formats.data()); - } - - uint32_t presentModeCount; - vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, - &presentModeCount, nullptr); - - if (presentModeCount != 0) { - details.presentModes.resize(presentModeCount); - vkGetPhysicalDeviceSurfacePresentModesKHR( - device, surface, &presentModeCount, details.presentModes.data()); - } - - return details; - } - - bool isDeviceSuitable(VkPhysicalDevice device) { - QueueFamilyIndices indices = findQueueFamilies(device); - - bool extensionsSupported = checkDeviceExtensionSupport(device); - - bool swapChainAdequate = false; - if (extensionsSupported) { - SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device); - swapChainAdequate = !swapChainSupport.formats.empty() && - !swapChainSupport.presentModes.empty(); - } - - VkPhysicalDeviceFeatures supportedFeatures; - vkGetPhysicalDeviceFeatures(device, &supportedFeatures); - - return indices.isComplete() && extensionsSupported && swapChainAdequate && - supportedFeatures.samplerAnisotropy; - } - - bool checkDeviceExtensionSupport(VkPhysicalDevice device) { - uint32_t extensionCount; - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, - nullptr); - - std::vector<VkExtensionProperties> availableExtensions(extensionCount); - vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, - availableExtensions.data()); - - std::set<std::string> requiredExtensions(deviceExtensions.begin(), - deviceExtensions.end()); - - for (const auto& extension : availableExtensions) { - requiredExtensions.erase(extension.extensionName); - } - - return requiredExtensions.empty(); - } - - QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) { - QueueFamilyIndices indices; - - uint32_t queueFamilyCount = 0; - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, - nullptr); - - std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, - queueFamilies.data()); - - int i = 0; - for (const auto& queueFamily : queueFamilies) { - if (queueFamily.queueCount > 0 &&
- queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) { - indices.graphicsFamily = i; - } - - VkBool32 presentSupport = false; - vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); - - if (queueFamily.queueCount > 0 && presentSupport) { - indices.presentFamily = i; - } - - if (indices.isComplete()) { - break; - } - - i++; - } - - return indices; - } - - std::vector<const char*> getRequiredExtensions() { - uint32_t glfwExtensionCount = 0; - const char** glfwExtensions; - glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); - - std::vector<const char*> extensions(glfwExtensions, - glfwExtensions + glfwExtensionCount); - - if (enableValidationLayers) { - extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); - } - - return extensions; - } - - bool checkValidationLayerSupport() { - uint32_t layerCount; - vkEnumerateInstanceLayerProperties(&layerCount, nullptr); - - std::vector<VkLayerProperties> availableLayers(layerCount); - vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); - - for (const char* layerName : validationLayers) { - bool layerFound = false; - - for (const auto& layerProperties : availableLayers) { - if (strcmp(layerName, layerProperties.layerName) == 0) { - layerFound = true; - break; + if (!image_data) { + printf("Error opening file '%s'\n", image_path); + exit(EXIT_FAILURE); } - } - if (!layerFound) { - return false; - } + printf("Loaded '%s', %d x %d pixels\n", image_path, imageWidth, imageHeight); } - return true; - } - - static std::vector<char> readFile(const std::string& filename) { - char* file_path = sdkFindFilePath(filename.c_str(), execution_path.c_str()); - std::ifstream file(file_path, std::ios::ate | std::ios::binary); - - if (!file.is_open()) { - throw std::runtime_error("failed to open file!"); + void run() + { + initWindow(); + initVulkan(); + initCuda(); + mainLoop(); + cleanup(); } - - size_t fileSize = (size_t)file.tellg(); - std::vector<char> buffer(fileSize); +private: + GLFWwindow *window; - file.seekg(0); - file.read(buffer.data(), fileSize); + VkInstance instance; + VkDebugUtilsMessengerEXT debugMessenger; + VkSurfaceKHR surface; - file.close(); + VkPhysicalDevice physicalDevice = VK_NULL_HANDLE; + VkDevice device; + uint8_t vkDeviceUUID[VK_UUID_SIZE]; - return buffer; - } + VkQueue graphicsQueue; + VkQueue presentQueue; - static VKAPI_ATTR VkBool32 VKAPI_CALL - debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, - VkDebugUtilsMessageTypeFlagsEXT messageType, - const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, - void* pUserData) { - std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; + VkSwapchainKHR swapChain; + std::vector<VkImage> swapChainImages; + VkFormat swapChainImageFormat; + VkExtent2D swapChainExtent; + std::vector<VkImageView> swapChainImageViews; + std::vector<VkFramebuffer> swapChainFramebuffers; - return VK_FALSE; - } + VkRenderPass renderPass; + VkDescriptorSetLayout descriptorSetLayout; + VkPipelineLayout pipelineLayout; + VkPipeline graphicsPipeline; + + VkCommandPool commandPool; + + VkImage textureImage; + VkDeviceMemory textureImageMemory; + VkImageView textureImageView; + VkSampler textureSampler; + + VkBuffer vertexBuffer; + VkDeviceMemory vertexBufferMemory; + VkBuffer indexBuffer; + VkDeviceMemory indexBufferMemory; + + std::vector<VkBuffer> uniformBuffers; + std::vector<VkDeviceMemory> uniformBuffersMemory; + + VkDescriptorPool descriptorPool; + std::vector<VkDescriptorSet> descriptorSets; + + std::vector<VkCommandBuffer> commandBuffers; + + std::vector<VkSemaphore> imageAvailableSemaphores; + std::vector<VkSemaphore> renderFinishedSemaphores; + VkSemaphore cudaUpdateVkSemaphore,
vkUpdateCudaSemaphore; + std::vector<VkFence> inFlightFences; + + size_t currentFrame = 0; + + bool framebufferResized = false; + +#ifdef _WIN64 + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; +#else + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR = NULL; + PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR = NULL; +#endif + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + + unsigned int *image_data = NULL; + unsigned int imageWidth, imageHeight; + unsigned int mipLevels = 1; + size_t totalImageMemSize; + + // CUDA objects + cudaExternalMemory_t cudaExtMemImageBuffer; + cudaMipmappedArray_t cudaMipmappedImageArray, cudaMipmappedImageArrayTemp, cudaMipmappedImageArrayOrig; + std::vector<cudaSurfaceObject_t> surfaceObjectList, surfaceObjectListTemp; + cudaSurfaceObject_t *d_surfaceObjectList, *d_surfaceObjectListTemp; + cudaTextureObject_t textureObjMipMapInput; + + cudaExternalSemaphore_t cudaExtCudaUpdateVkSemaphore; + cudaExternalSemaphore_t cudaExtVkUpdateCudaSemaphore; + cudaStream_t streamToRun; + + void initWindow() + { + glfwInit(); + + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + + window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan Image CUDA Box Filter", nullptr, nullptr); + glfwSetWindowUserPointer(window, this); + glfwSetFramebufferSizeCallback(window, framebufferResizeCallback); + } + + static void framebufferResizeCallback(GLFWwindow *window, int width, int height) + { + auto app = reinterpret_cast<vulkanImageCUDA *>(glfwGetWindowUserPointer(window)); + app->framebufferResized = true; + } + + void initVulkan() + { + createInstance(); + setupDebugMessenger(); + createSurface(); + pickPhysicalDevice(); + createLogicalDevice(); + getKhrExtensionsFn(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createFramebuffers(); + createCommandPool(); + createTextureImage(); + createTextureImageView(); + createTextureSampler(); + createVertexBuffer(); + createIndexBuffer(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + createSyncObjects(); + createSyncObjectsExt(); + } + + void initCuda() + { + setCudaVkDevice(); + checkCudaErrors(cudaStreamCreate(&streamToRun)); + cudaVkImportImageMem(); + cudaVkImportSemaphore(); + } + + void mainLoop() + { + updateUniformBuffer(); + while (!glfwWindowShouldClose(window)) { + glfwPollEvents(); + drawFrame(); + } + + vkDeviceWaitIdle(device); + } + + void cleanupSwapChain() + { + for (auto framebuffer : swapChainFramebuffers) { + vkDestroyFramebuffer(device, framebuffer, nullptr); + } + + vkFreeCommandBuffers(device, commandPool, static_cast<uint32_t>(commandBuffers.size()), commandBuffers.data()); + + vkDestroyPipeline(device, graphicsPipeline, nullptr); + vkDestroyPipelineLayout(device, pipelineLayout, nullptr); + vkDestroyRenderPass(device, renderPass, nullptr); + + for (auto imageView : swapChainImageViews) { + vkDestroyImageView(device, imageView, nullptr); + } + + vkDestroySwapchainKHR(device, swapChain, nullptr); + + for (size_t i = 0; i < swapChainImages.size(); i++) { + vkDestroyBuffer(device, uniformBuffers[i], nullptr); + vkFreeMemory(device, uniformBuffersMemory[i], nullptr); + } + + vkDestroyDescriptorPool(device, descriptorPool, nullptr); + } + + void cleanup() + { + cleanupSwapChain(); + + vkDestroySampler(device, textureSampler, nullptr); + vkDestroyImageView(device, textureImageView, nullptr); + + for (int i = 0; i < mipLevels; i++) { +
checkCudaErrors(cudaDestroySurfaceObject(surfaceObjectList[i])); + checkCudaErrors(cudaDestroySurfaceObject(surfaceObjectListTemp[i])); + } + + checkCudaErrors(cudaFree(d_surfaceObjectList)); + checkCudaErrors(cudaFree(d_surfaceObjectListTemp)); + checkCudaErrors(cudaFreeMipmappedArray(cudaMipmappedImageArrayTemp)); + checkCudaErrors(cudaFreeMipmappedArray(cudaMipmappedImageArrayOrig)); + checkCudaErrors(cudaFreeMipmappedArray(cudaMipmappedImageArray)); + checkCudaErrors(cudaDestroyTextureObject(textureObjMipMapInput)); + checkCudaErrors(cudaDestroyExternalMemory(cudaExtMemImageBuffer)); + checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtCudaUpdateVkSemaphore)); + checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtVkUpdateCudaSemaphore)); + + vkDestroyImage(device, textureImage, nullptr); + vkFreeMemory(device, textureImageMemory, nullptr); + + vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); + + vkDestroyBuffer(device, indexBuffer, nullptr); + vkFreeMemory(device, indexBufferMemory, nullptr); + + vkDestroyBuffer(device, vertexBuffer, nullptr); + vkFreeMemory(device, vertexBufferMemory, nullptr); + + vkDestroySemaphore(device, cudaUpdateVkSemaphore, nullptr); + vkDestroySemaphore(device, vkUpdateCudaSemaphore, nullptr); + + for (size_t i = 0; i < MAX_FRAMES; i++) { + vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr); + vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr); + vkDestroyFence(device, inFlightFences[i], nullptr); + } + + vkDestroyCommandPool(device, commandPool, nullptr); + + vkDestroyDevice(device, nullptr); + + if (enableValidationLayers) { + DestroyDebugUtilsMessengerEXT(instance, debugMessenger, nullptr); + } + + vkDestroySurfaceKHR(instance, surface, nullptr); + vkDestroyInstance(instance, nullptr); + + glfwDestroyWindow(window); + + glfwTerminate(); + } + + void recreateSwapChain() + { + int width = 0, height = 0; + while (width == 0 || height == 0) { + glfwGetFramebufferSize(window, &width, &height); + glfwWaitEvents(); + } + + vkDeviceWaitIdle(device); + + cleanupSwapChain(); + + createSwapChain(); + createImageViews(); + createRenderPass(); + createGraphicsPipeline(); + createFramebuffers(); + createUniformBuffers(); + createDescriptorPool(); + createDescriptorSets(); + createCommandBuffers(); + } + + void createInstance() + { + if (enableValidationLayers && !checkValidationLayerSupport()) { + throw std::runtime_error("validation layers requested, but not available!"); + } + + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = "Vulkan Image CUDA Interop"; + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_1; + + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; + + auto extensions = getRequiredExtensions(); + createInfo.enabledExtensionCount = static_cast<uint32_t>(extensions.size()); + createInfo.ppEnabledExtensionNames = extensions.data(); + + VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo; + if (enableValidationLayers) { + createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + + populateDebugMessengerCreateInfo(debugCreateInfo); + createInfo.pNext = (VkDebugUtilsMessengerCreateInfoEXT *)&debugCreateInfo; + } + else { + createInfo.enabledLayerCount = 0; + +
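cleanup() above and setupDebugMessenger() below call DestroyDebugUtilsMessengerEXT and CreateDebugUtilsMessengerEXT, free helpers the sample defines outside the class; VK_EXT_debug_utils entry points are not exported by the loader, so they are resolved through vkGetInstanceProcAddr. A sketch of that trampoline, with the signature taken from the extension spec:

VkResult CreateDebugUtilsMessengerEXT(VkInstance instance,
                                      const VkDebugUtilsMessengerCreateInfoEXT *pCreateInfo,
                                      const VkAllocationCallbacks *pAllocator,
                                      VkDebugUtilsMessengerEXT *pDebugMessenger)
{
    // Resolve the extension entry point at runtime; nullptr means the
    // extension (and thus the validation tooling) is unavailable.
    auto func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
        instance, "vkCreateDebugUtilsMessengerEXT");
    return func ? func(instance, pCreateInfo, pAllocator, pDebugMessenger)
                : VK_ERROR_EXTENSION_NOT_PRESENT;
}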
createInfo.pNext = nullptr; + } + + if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) { + throw std::runtime_error("failed to create instance!"); + } + + fpGetPhysicalDeviceProperties2 = + (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not " + "found.\n"); + } + +#ifdef _WIN64 + fpGetMemoryWin32HandleKHR = + (PFN_vkGetMemoryWin32HandleKHR)vkGetInstanceProcAddr(instance, "vkGetMemoryWin32HandleKHR"); + if (fpGetMemoryWin32HandleKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetMemoryWin32HandleKHR\" not " + "found.\n"); + } +#else + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr(instance, "vkGetMemoryFdKHR"); + if (fpGetMemoryFdKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetMemoryFdKHR\" not found.\n"); + } + else { + std::cout << "Vulkan proc address for vkGetMemoryFdKHR - " << fpGetMemoryFdKHR << std::endl; + } +#endif + } + + void populateDebugMessengerCreateInfo(VkDebugUtilsMessengerCreateInfoEXT &createInfo) + { + createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT + | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + createInfo.pfnUserCallback = debugCallback; + } + + void setupDebugMessenger() + { + if (!enableValidationLayers) + return; + + VkDebugUtilsMessengerCreateInfoEXT createInfo; + populateDebugMessengerCreateInfo(createInfo); + + if (CreateDebugUtilsMessengerEXT(instance, &createInfo, nullptr, &debugMessenger) != VK_SUCCESS) { + throw std::runtime_error("failed to set up debug messenger!"); + } + } + + void createSurface() + { + if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } + } + + void pickPhysicalDevice() + { + uint32_t deviceCount = 0; + vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr); + + if (deviceCount == 0) { + throw std::runtime_error("failed to find GPUs with Vulkan support!"); + } + + std::vector<VkPhysicalDevice> devices(deviceCount); + vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data()); + + for (const auto &device : devices) { + if (isDeviceSuitable(device)) { + physicalDevice = device; + break; + } + } + + if (physicalDevice == VK_NULL_HANDLE) { + throw std::runtime_error("failed to find a suitable GPU!"); + } + + std::cout << "Selected physical device = " << physicalDevice << std::endl; + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + fpGetPhysicalDeviceProperties2(physicalDevice, &vkPhysicalDeviceProperties2); + + memcpy(vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, sizeof(vkDeviceUUID)); + } + + void getKhrExtensionsFn() + { +#ifdef _WIN64 + +
fpGetSemaphoreWin32HandleKHR = + (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(device, "vkGetSemaphoreWin32HandleKHR"); + if (fpGetSemaphoreWin32HandleKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetSemaphoreWin32HandleKHR\" not " + "found.\n"); + } +#else + fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(device, "vkGetSemaphoreFdKHR"); + if (fpGetSemaphoreFdKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetSemaphoreFdKHR\" not found.\n"); + } +#endif + } + + int setCudaVkDevice() + { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the GPU which is selected by Vulkan + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + // Compare the cuda device UUID with vulkan UUID + int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE); + if (ret == 0) { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, + deviceProp.name, + deviceProp.major, + deviceProp.minor); + + return current_device; + } + } + else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No Vulkan-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; + } + + void createLogicalDevice() + { + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + + std::vector<VkDeviceQueueCreateInfo> queueCreateInfos; + std::set<int> uniqueQueueFamilies = {indices.graphicsFamily, indices.presentFamily}; + + float queuePriority = 1.0f; + for (int queueFamily : uniqueQueueFamilies) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + deviceFeatures.samplerAnisotropy = VK_TRUE; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = queueCreateInfos.size(); + + createInfo.pEnabledFeatures = &deviceFeatures; + std::vector<const char *> enabledExtensionNameList; + + for (int i = 0; i < deviceExtensions.size(); i++) { + enabledExtensionNameList.push_back(deviceExtensions[i]); + } + if (enableValidationLayers) { + createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + } + else { + createInfo.enabledLayerCount = 0; + } + createInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensionNameList.size()); + createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); + + if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + vkGetDeviceQueue(device, indices.graphicsFamily, 0, &graphicsQueue); + vkGetDeviceQueue(device, indices.presentFamily, 0, &presentQueue); + } + + void
createSwapChain() + { + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice); + + VkSurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats); + VkPresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes); + VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities); + + uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; + if (swapChainSupport.capabilities.maxImageCount > 0 + && imageCount > swapChainSupport.capabilities.maxImageCount) { + imageCount = swapChainSupport.capabilities.maxImageCount; + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = surface; + + createInfo.minImageCount = imageCount; + createInfo.imageFormat = surfaceFormat.format; + createInfo.imageColorSpace = surfaceFormat.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + uint32_t queueFamilyIndices[] = {(uint32_t)indices.graphicsFamily, (uint32_t)indices.presentFamily}; + + if (indices.graphicsFamily != indices.presentFamily) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = 2; + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } + else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + } + + createInfo.preTransform = swapChainSupport.capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + + if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + + vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr); + swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(device, swapChain, &imageCount, swapChainImages.data()); + + swapChainImageFormat = surfaceFormat.format; + swapChainExtent = extent; + } + + void createImageViews() + { + swapChainImageViews.resize(swapChainImages.size()); + + for (size_t i = 0; i < swapChainImages.size(); i++) { + swapChainImageViews[i] = createImageView(swapChainImages[i], swapChainImageFormat); + } + } + + void createRenderPass() + { + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = swapChainImageFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = 
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = 1; + renderPassInfo.pAttachments = &colorAttachment; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } + } + + void createDescriptorSetLayout() + { + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.pImmutableSamplers = nullptr; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + VkDescriptorSetLayoutBinding samplerLayoutBinding = {}; + samplerLayoutBinding.binding = 1; + samplerLayoutBinding.descriptorCount = 1; + samplerLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + samplerLayoutBinding.pImmutableSamplers = nullptr; + samplerLayoutBinding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + + std::array<VkDescriptorSetLayoutBinding, 2> bindings = {uboLayoutBinding, samplerLayoutBinding}; + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = static_cast<uint32_t>(bindings.size()); + layoutInfo.pBindings = bindings.data(); + + if (vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &descriptorSetLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor set layout!"); + } + } + + void createGraphicsPipeline() + { + auto vertShaderCode = readFile("vert.spv"); + auto fragShaderCode = readFile("frag.spv"); + + VkShaderModule vertShaderModule = createShaderModule(vertShaderCode); + VkShaderModule fragShaderModule = createShaderModule(fragShaderCode); + + VkPipelineShaderStageCreateInfo vertShaderStageInfo = {}; + vertShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; + vertShaderStageInfo.module = vertShaderModule; + vertShaderStageInfo.pName = "main"; + + VkPipelineShaderStageCreateInfo fragShaderStageInfo = {}; + fragShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; + fragShaderStageInfo.module = fragShaderModule; + fragShaderStageInfo.pName = "main"; + + VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; + vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + + auto bindingDescription = Vertex::getBindingDescription(); + auto attributeDescriptions = Vertex::getAttributeDescriptions(); + + vertexInputInfo.vertexBindingDescriptionCount = 1; + vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size()); + vertexInputInfo.pVertexBindingDescriptions = &bindingDescription; + vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + inputAssembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + inputAssembly.topology =
VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + inputAssembly.primitiveRestartEnable = VK_FALSE; + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)swapChainExtent.width; + viewport.height = (float)swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = {0, 0}; + scissor.extent = swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_FILL; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_BACK_BIT; + rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; + colorBlending.blendConstants[1] = 0.0f; + colorBlending.blendConstants[2] = 0.0f; + colorBlending.blendConstants[3] = 0.0f; + + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; + pipelineLayoutInfo.pSetLayouts = &descriptorSetLayout; + + if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, &pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = 2; + pipelineInfo.pStages = shaderStages; + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.layout = pipelineLayout; + pipelineInfo.renderPass = renderPass; + pipelineInfo.subpass = 0; + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; + + if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline) + != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + + vkDestroyShaderModule(device, fragShaderModule, nullptr); + vkDestroyShaderModule(device, vertShaderModule, nullptr); + } + + void 
createFramebuffers() + { + swapChainFramebuffers.resize(swapChainImageViews.size()); + + for (size_t i = 0; i < swapChainImageViews.size(); i++) { + VkImageView attachments[] = {swapChainImageViews[i]}; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = renderPass; + framebufferInfo.attachmentCount = 1; + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = swapChainExtent.width; + framebufferInfo.height = swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, &swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } + } + + void createCommandPool() + { + QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice); + + VkCommandPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily; + + if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics command pool!"); + } + } + + void createTextureImage() + { + VkDeviceSize imageSize = imageWidth * imageHeight * 4; + mipLevels = static_cast<uint32_t>(std::floor(std::log2(std::max(imageWidth, imageHeight)))) + 1; + printf("mipLevels = %d\n", mipLevels); + + if (!image_data) { + throw std::runtime_error("failed to load texture image!"); + } + + VkBuffer stagingBuffer; + VkDeviceMemory stagingBufferMemory; + createBuffer(imageSize, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + stagingBuffer, + stagingBufferMemory); + + void *data; + vkMapMemory(device, stagingBufferMemory, 0, imageSize, 0, &data); + memcpy(data, image_data, static_cast<size_t>(imageSize)); + vkUnmapMemory(device, stagingBufferMemory); + + // VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT + createImage(imageWidth, + imageHeight, + VK_FORMAT_R8G8B8A8_UNORM, + VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT + | VK_IMAGE_USAGE_SAMPLED_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + textureImage, + textureImageMemory); + + transitionImageLayout( + textureImage, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + copyBufferToImage( + stagingBuffer, textureImage, static_cast<uint32_t>(imageWidth), static_cast<uint32_t>(imageHeight)); + + vkDestroyBuffer(device, stagingBuffer, nullptr); + vkFreeMemory(device, stagingBufferMemory, nullptr); + + generateMipmaps(textureImage, VK_FORMAT_R8G8B8A8_UNORM); + } + + void generateMipmaps(VkImage image, VkFormat imageFormat) + { + VkFormatProperties formatProperties; + vkGetPhysicalDeviceFormatProperties(physicalDevice, imageFormat, &formatProperties); + + if (!(formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT)) { + throw std::runtime_error("texture image format does not support linear blitting!"); + } + + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); + + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.image = image; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseArrayLayer = 0; +
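createTextureImage above sizes the chain as floor(log2(max(width, height))) + 1: one level per halving, plus the base level. Worked through on two sizes:

//   512 x 512: log2(512) = 9, so mipLevels = 10 (512, 256, ..., 1)
//   800 x 600: log2(800) ~ 9.64, floor = 9, so mipLevels = 10 (800, 400, ..., 1)
// generateMipmaps below halves mipWidth/mipHeight per iteration, clamping at 1,
// which yields exactly the same count.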
barrier.subresourceRange.layerCount = 1; + barrier.subresourceRange.levelCount = 1; + + int32_t mipWidth = imageWidth; + int32_t mipHeight = imageHeight; + + for (uint32_t i = 1; i < mipLevels; i++) { + barrier.subresourceRange.baseMipLevel = i - 1; + barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + + vkCmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, + 0, + nullptr, + 0, + nullptr, + 1, + &barrier); + + VkImageBlit blit = {}; + blit.srcOffsets[0] = {0, 0, 0}; + blit.srcOffsets[1] = {mipWidth, mipHeight, 1}; + blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + blit.srcSubresource.mipLevel = i - 1; + blit.srcSubresource.baseArrayLayer = 0; + blit.srcSubresource.layerCount = 1; + blit.dstOffsets[0] = {0, 0, 0}; + blit.dstOffsets[1] = {mipWidth > 1 ? mipWidth / 2 : 1, mipHeight > 1 ? mipHeight / 2 : 1, 1}; + blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + blit.dstSubresource.mipLevel = i; + blit.dstSubresource.baseArrayLayer = 0; + blit.dstSubresource.layerCount = 1; + + vkCmdBlitImage(commandBuffer, + image, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + 1, + &blit, + VK_FILTER_LINEAR); + + barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + vkCmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + 0, + 0, + nullptr, + 0, + nullptr, + 1, + &barrier); + + if (mipWidth > 1) + mipWidth /= 2; + if (mipHeight > 1) + mipHeight /= 2; + } + + barrier.subresourceRange.baseMipLevel = mipLevels - 1; + barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + vkCmdPipelineBarrier(commandBuffer, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + 0, + 0, + nullptr, + 0, + nullptr, + 1, + &barrier); + + endSingleTimeCommands(commandBuffer); + } + +#ifdef _WIN64 // For windows + HANDLE getVkImageMemHandle(VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) + { + HANDLE handle; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = textureImageMemory; + vkMemoryGetWin32HandleInfoKHR.handleType = (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; + + fpGetMemoryWin32HandleKHR(device, &vkMemoryGetWin32HandleInfoKHR, &handle); + return handle; + } + HANDLE getVkSemaphoreHandle(VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, + VkSemaphore &semVkCuda) + { + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR vulkanSemaphoreGetWin32HandleInfoKHR = {}; + vulkanSemaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + vulkanSemaphoreGetWin32HandleInfoKHR.pNext = NULL; + vulkanSemaphoreGetWin32HandleInfoKHR.semaphore = semVkCuda; + vulkanSemaphoreGetWin32HandleInfoKHR.handleType = externalSemaphoreHandleType; + 
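The opaque handles produced by getVkImageMemHandle and getVkSemaphoreHandle are what the CUDA side consumes; the sample's cudaVkImportImageMem() (called from initCuda()) pairs the memory handle with cudaImportExternalMemory. A sketch of the Linux pairing, mirroring that call pattern:

// Import the exported POSIX fd into CUDA; CUDA assumes ownership of the fd.
cudaExternalMemoryHandleDesc memDesc = {};
memDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
memDesc.handle.fd = getVkImageMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR);
memDesc.size = totalImageMemSize;
checkCudaErrors(cudaImportExternalMemory(&cudaExtMemImageBuffer, &memDesc));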
+ fpGetSemaphoreWin32HandleKHR(device, &vulkanSemaphoreGetWin32HandleInfoKHR, &handle); + + return handle; + } +#else + int getVkImageMemHandle(VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) + { + if (externalMemoryHandleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR) { + int fd; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = textureImageMemory; + vkMemoryGetFdInfoKHR.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + fpGetMemoryFdKHR(device, &vkMemoryGetFdInfoKHR, &fd); + + return fd; + } + return -1; + } + + int getVkSemaphoreHandle(VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, + VkSemaphore &semVkCuda) + { + if (externalSemaphoreHandleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + int fd; + + VkSemaphoreGetFdInfoKHR vulkanSemaphoreGetFdInfoKHR = {}; + vulkanSemaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + vulkanSemaphoreGetFdInfoKHR.pNext = NULL; + vulkanSemaphoreGetFdInfoKHR.semaphore = semVkCuda; + vulkanSemaphoreGetFdInfoKHR.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + fpGetSemaphoreFdKHR(device, &vulkanSemaphoreGetFdInfoKHR, &fd); + + return fd; + } + return -1; + } +#endif + + void createTextureImageView() { textureImageView = createImageView(textureImage, VK_FORMAT_R8G8B8A8_UNORM); } + + void createTextureSampler() + { + VkSamplerCreateInfo samplerInfo = {}; + samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + samplerInfo.magFilter = VK_FILTER_LINEAR; + samplerInfo.minFilter = VK_FILTER_LINEAR; + samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; + samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; + samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + samplerInfo.anisotropyEnable = VK_TRUE; + samplerInfo.maxAnisotropy = 16; + samplerInfo.borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK; + samplerInfo.unnormalizedCoordinates = VK_FALSE; + samplerInfo.compareEnable = VK_FALSE; + samplerInfo.compareOp = VK_COMPARE_OP_ALWAYS; + samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; + samplerInfo.minLod = 0; // Optional + samplerInfo.maxLod = static_cast<float>(mipLevels); + samplerInfo.mipLodBias = 0; // Optional + + if (vkCreateSampler(device, &samplerInfo, nullptr, &textureSampler) != VK_SUCCESS) { + throw std::runtime_error("failed to create texture sampler!"); + } + } + + VkImageView createImageView(VkImage image, VkFormat format) + { + VkImageViewCreateInfo viewInfo = {}; + viewInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + viewInfo.image = image; + viewInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + viewInfo.format = format; + viewInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + viewInfo.subresourceRange.baseMipLevel = 0; + viewInfo.subresourceRange.levelCount = mipLevels; + viewInfo.subresourceRange.baseArrayLayer = 0; + viewInfo.subresourceRange.layerCount = 1; + + VkImageView imageView; + if (vkCreateImageView(device, &viewInfo, nullptr, &imageView) != VK_SUCCESS) { + throw std::runtime_error("failed to create texture image view!"); + } + + return imageView; + } + + void createImage(uint32_t width, + uint32_t height, + VkFormat format, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkMemoryPropertyFlags properties, + VkImage &image, + VkDeviceMemory &imageMemory) + { + VkImageCreateInfo imageInfo = {}; + imageInfo.sType =
VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + imageInfo.imageType = VK_IMAGE_TYPE_2D; + imageInfo.extent.width = width; + imageInfo.extent.height = height; + imageInfo.extent.depth = 1; + imageInfo.mipLevels = mipLevels; + imageInfo.arrayLayers = 1; + imageInfo.format = format; + imageInfo.tiling = tiling; + imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageInfo.usage = usage; + imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + VkExternalMemoryImageCreateInfo vkExternalMemImageCreateInfo = {}; + vkExternalMemImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO; + vkExternalMemImageCreateInfo.pNext = NULL; +#ifdef _WIN64 + vkExternalMemImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT; +#else + vkExternalMemImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; +#endif + + imageInfo.pNext = &vkExternalMemImageCreateInfo; + + if (vkCreateImage(device, &imageInfo, nullptr, &image) != VK_SUCCESS) { + throw std::runtime_error("failed to create image!"); + } + + VkMemoryRequirements memRequirements; + vkGetImageMemoryRequirements(device, image, &memRequirements); + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = IsWindows8OrGreater() ? &vulkanExportMemoryWin32HandleInfoKHR : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = IsWindows8OrGreater() + ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; +#endif + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + VkMemoryRequirements vkMemoryRequirements = {}; + vkGetImageMemoryRequirements(device, image, &vkMemoryRequirements); + totalImageMemSize = vkMemoryRequirements.size; + + if (vkAllocateMemory(device, &allocInfo, nullptr, &textureImageMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate image memory!"); + } + + vkBindImageMemory(device, image, textureImageMemory, 0); + } + + void cudaVkImportSemaphore() + { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; + memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc)); +#ifdef _WIN64 + externalSemaphoreHandleDesc.type = IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 + : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + externalSemaphoreHandleDesc.handle.win32.handle = + getVkSemaphoreHandle(IsWindows8OrGreater() ? 
VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + cudaUpdateVkSemaphore); +#else + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + externalSemaphoreHandleDesc.handle.fd = + getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, cudaUpdateVkSemaphore); +#endif + externalSemaphoreHandleDesc.flags = 0; + + checkCudaErrors(cudaImportExternalSemaphore(&cudaExtCudaUpdateVkSemaphore, &externalSemaphoreHandleDesc)); + + memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc)); +#ifdef _WIN64 + externalSemaphoreHandleDesc.type = IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 + : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + externalSemaphoreHandleDesc.handle.win32.handle = + getVkSemaphoreHandle(IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + vkUpdateCudaSemaphore); +#else + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + externalSemaphoreHandleDesc.handle.fd = + getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, vkUpdateCudaSemaphore); +#endif + externalSemaphoreHandleDesc.flags = 0; + checkCudaErrors(cudaImportExternalSemaphore(&cudaExtVkUpdateCudaSemaphore, &externalSemaphoreHandleDesc)); + printf("CUDA Imported Vulkan semaphore\n"); + } + + void cudaVkImportImageMem() + { + cudaExternalMemoryHandleDesc cudaExtMemHandleDesc; + memset(&cudaExtMemHandleDesc, 0, sizeof(cudaExtMemHandleDesc)); +#ifdef _WIN64 + cudaExtMemHandleDesc.type = IsWindows8OrGreater() ? cudaExternalMemoryHandleTypeOpaqueWin32 + : cudaExternalMemoryHandleTypeOpaqueWin32Kmt; + cudaExtMemHandleDesc.handle.win32.handle = + getVkImageMemHandle(IsWindows8OrGreater() ?
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT); +#else + cudaExtMemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; + + cudaExtMemHandleDesc.handle.fd = getVkImageMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR); +#endif + cudaExtMemHandleDesc.size = totalImageMemSize; + + checkCudaErrors(cudaImportExternalMemory(&cudaExtMemImageBuffer, &cudaExtMemHandleDesc)); + + cudaExternalMemoryMipmappedArrayDesc externalMemoryMipmappedArrayDesc; + + memset(&externalMemoryMipmappedArrayDesc, 0, sizeof(externalMemoryMipmappedArrayDesc)); + + cudaExtent extent = make_cudaExtent(imageWidth, imageHeight, 0); + cudaChannelFormatDesc formatDesc; + formatDesc.x = 8; + formatDesc.y = 8; + formatDesc.z = 8; + formatDesc.w = 8; + formatDesc.f = cudaChannelFormatKindUnsigned; + + externalMemoryMipmappedArrayDesc.offset = 0; + externalMemoryMipmappedArrayDesc.formatDesc = formatDesc; + externalMemoryMipmappedArrayDesc.extent = extent; + externalMemoryMipmappedArrayDesc.flags = 0; + externalMemoryMipmappedArrayDesc.numLevels = mipLevels; + + checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray( + &cudaMipmappedImageArray, cudaExtMemImageBuffer, &externalMemoryMipmappedArrayDesc)); + + checkCudaErrors(cudaMallocMipmappedArray(&cudaMipmappedImageArrayTemp, &formatDesc, extent, mipLevels)); + checkCudaErrors(cudaMallocMipmappedArray(&cudaMipmappedImageArrayOrig, &formatDesc, extent, mipLevels)); + + for (int mipLevelIdx = 0; mipLevelIdx < mipLevels; mipLevelIdx++) { + cudaArray_t cudaMipLevelArray, cudaMipLevelArrayTemp, cudaMipLevelArrayOrig; + cudaResourceDesc resourceDesc; + + checkCudaErrors(cudaGetMipmappedArrayLevel(&cudaMipLevelArray, cudaMipmappedImageArray, mipLevelIdx)); + checkCudaErrors( + cudaGetMipmappedArrayLevel(&cudaMipLevelArrayTemp, cudaMipmappedImageArrayTemp, mipLevelIdx)); + checkCudaErrors( + cudaGetMipmappedArrayLevel(&cudaMipLevelArrayOrig, cudaMipmappedImageArrayOrig, mipLevelIdx)); + + uint32_t width = (imageWidth >> mipLevelIdx) ? (imageWidth >> mipLevelIdx) : 1; + uint32_t height = (imageHeight >> mipLevelIdx) ? 
(imageHeight >> mipLevelIdx) : 1; + checkCudaErrors(cudaMemcpy2DArrayToArray(cudaMipLevelArrayOrig, + 0, + 0, + cudaMipLevelArray, + 0, + 0, + width * sizeof(uchar4), + height, + cudaMemcpyDeviceToDevice)); + + memset(&resourceDesc, 0, sizeof(resourceDesc)); + resourceDesc.resType = cudaResourceTypeArray; + resourceDesc.res.array.array = cudaMipLevelArray; + + cudaSurfaceObject_t surfaceObject; + checkCudaErrors(cudaCreateSurfaceObject(&surfaceObject, &resourceDesc)); + + surfaceObjectList.push_back(surfaceObject); + + memset(&resourceDesc, 0, sizeof(resourceDesc)); + resourceDesc.resType = cudaResourceTypeArray; + resourceDesc.res.array.array = cudaMipLevelArrayTemp; + + cudaSurfaceObject_t surfaceObjectTemp; + checkCudaErrors(cudaCreateSurfaceObject(&surfaceObjectTemp, &resourceDesc)); + surfaceObjectListTemp.push_back(surfaceObjectTemp); + } + + cudaResourceDesc resDescr; + memset(&resDescr, 0, sizeof(cudaResourceDesc)); + + resDescr.resType = cudaResourceTypeMipmappedArray; + resDescr.res.mipmap.mipmap = cudaMipmappedImageArrayOrig; + + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + + texDescr.normalizedCoords = true; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.mipmapFilterMode = cudaFilterModeLinear; + + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.addressMode[1] = cudaAddressModeWrap; + + texDescr.maxMipmapLevelClamp = float(mipLevels - 1); + + texDescr.readMode = cudaReadModeNormalizedFloat; + + checkCudaErrors(cudaCreateTextureObject(&textureObjMipMapInput, &resDescr, &texDescr, NULL)); + + checkCudaErrors(cudaMalloc((void **)&d_surfaceObjectList, sizeof(cudaSurfaceObject_t) * mipLevels)); + checkCudaErrors(cudaMalloc((void **)&d_surfaceObjectListTemp, sizeof(cudaSurfaceObject_t) * mipLevels)); + + checkCudaErrors(cudaMemcpy(d_surfaceObjectList, + surfaceObjectList.data(), + sizeof(cudaSurfaceObject_t) * mipLevels, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_surfaceObjectListTemp, + surfaceObjectListTemp.data(), + sizeof(cudaSurfaceObject_t) * mipLevels, + cudaMemcpyHostToDevice)); + + printf("CUDA Kernel Vulkan image buffer\n"); + } + + void cudaUpdateVkImage() + { + cudaVkSemaphoreWait(cudaExtVkUpdateCudaSemaphore); + + int nthreads = 128; + + /* Perform 2D box filter on the image using CUDA */ + d_boxfilter_rgba_x<<<imageHeight / nthreads, nthreads, 0, streamToRun>>>( + d_surfaceObjectListTemp, textureObjMipMapInput, imageWidth, imageHeight, mipLevels, filter_radius); + + d_boxfilter_rgba_y<<<imageWidth / nthreads, nthreads, 0, streamToRun>>>( + d_surfaceObjectList, d_surfaceObjectListTemp, imageWidth, imageHeight, mipLevels, filter_radius); + + varySigma(); + + cudaVkSemaphoreSignal(cudaExtCudaUpdateVkSemaphore); + } + + void transitionImageLayout(VkImage image, VkFormat format, VkImageLayout oldLayout, VkImageLayout newLayout) + { + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); + + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = mipLevels; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags sourceStage; + VkPipelineStageFlags destinationStage; + + if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && newLayout ==
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + + sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + && newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + + sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT; + destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + } + else { + throw std::invalid_argument("unsupported layout transition!"); + } + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, nullptr, 0, nullptr, 1, &barrier); + + endSingleTimeCommands(commandBuffer); + } + + void copyBufferToImage(VkBuffer buffer, VkImage image, uint32_t width, uint32_t height) + { + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); + + VkBufferImageCopy region = {}; + region.bufferOffset = 0; + region.bufferRowLength = 0; + region.bufferImageHeight = 0; + region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + region.imageSubresource.mipLevel = 0; + region.imageSubresource.baseArrayLayer = 0; + region.imageSubresource.layerCount = 1; + region.imageOffset = {0, 0, 0}; + region.imageExtent = {width, height, 1}; + + vkCmdCopyBufferToImage(commandBuffer, buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region); + + endSingleTimeCommands(commandBuffer); + } + + void createVertexBuffer() + { + VkDeviceSize bufferSize = sizeof(vertices[0]) * vertices.size(); + + VkBuffer stagingBuffer; + VkDeviceMemory stagingBufferMemory; + createBuffer(bufferSize, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + stagingBuffer, + stagingBufferMemory); + + void *data; + vkMapMemory(device, stagingBufferMemory, 0, bufferSize, 0, &data); + memcpy(data, vertices.data(), (size_t)bufferSize); + vkUnmapMemory(device, stagingBufferMemory); + + createBuffer(bufferSize, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + vertexBuffer, + vertexBufferMemory); + + copyBuffer(stagingBuffer, vertexBuffer, bufferSize); + + vkDestroyBuffer(device, stagingBuffer, nullptr); + vkFreeMemory(device, stagingBufferMemory, nullptr); + } + + void createIndexBuffer() + { + VkDeviceSize bufferSize = sizeof(indices[0]) * indices.size(); + + VkBuffer stagingBuffer; + VkDeviceMemory stagingBufferMemory; + createBuffer(bufferSize, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + stagingBuffer, + stagingBufferMemory); + + void *data; + vkMapMemory(device, stagingBufferMemory, 0, bufferSize, 0, &data); + memcpy(data, indices.data(), (size_t)bufferSize); + vkUnmapMemory(device, stagingBufferMemory); + + createBuffer(bufferSize, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + indexBuffer, + indexBufferMemory); + + copyBuffer(stagingBuffer, indexBuffer, bufferSize); + + vkDestroyBuffer(device, stagingBuffer, nullptr); + vkFreeMemory(device, stagingBufferMemory, nullptr); + } + + void createUniformBuffers() + { + VkDeviceSize bufferSize = sizeof(UniformBufferObject); + + uniformBuffers.resize(swapChainImages.size()); + uniformBuffersMemory.resize(swapChainImages.size()); + + for (size_t i = 0; i < swapChainImages.size(); i++) {
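+ // One host-visible, host-coherent uniform buffer per swap-chain image, so updateUniformBuffer() can map and rewrite each frame's copy independently.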
+ createBuffer(bufferSize, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + uniformBuffers[i], + uniformBuffersMemory[i]); + } + } + + void createDescriptorPool() + { + std::array<VkDescriptorPoolSize, 2> poolSizes = {}; + poolSizes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSizes[0].descriptorCount = static_cast<uint32_t>(swapChainImages.size()); + poolSizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + poolSizes[1].descriptorCount = static_cast<uint32_t>(swapChainImages.size()); + + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = static_cast<uint32_t>(poolSizes.size()); + poolInfo.pPoolSizes = poolSizes.data(); + poolInfo.maxSets = static_cast<uint32_t>(swapChainImages.size()); + + if (vkCreateDescriptorPool(device, &poolInfo, nullptr, &descriptorPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } + } + + void createDescriptorSets() + { + std::vector<VkDescriptorSetLayout> layouts(swapChainImages.size(), descriptorSetLayout); + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = descriptorPool; + allocInfo.descriptorSetCount = static_cast<uint32_t>(swapChainImages.size()); + allocInfo.pSetLayouts = layouts.data(); + + descriptorSets.resize(swapChainImages.size()); + if (vkAllocateDescriptorSets(device, &allocInfo, descriptorSets.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor sets!"); + } + + for (size_t i = 0; i < swapChainImages.size(); i++) { + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.buffer = uniformBuffers[i]; + bufferInfo.offset = 0; + bufferInfo.range = sizeof(UniformBufferObject); + + VkDescriptorImageInfo imageInfo = {}; + imageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + imageInfo.imageView = textureImageView; + imageInfo.sampler = textureSampler; + + std::array<VkWriteDescriptorSet, 2> descriptorWrites = {}; + + descriptorWrites[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrites[0].dstSet = descriptorSets[i]; + descriptorWrites[0].dstBinding = 0; + descriptorWrites[0].dstArrayElement = 0; + descriptorWrites[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrites[0].descriptorCount = 1; + descriptorWrites[0].pBufferInfo = &bufferInfo; + + descriptorWrites[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrites[1].dstSet = descriptorSets[i]; + descriptorWrites[1].dstBinding = 1; + descriptorWrites[1].dstArrayElement = 0; + descriptorWrites[1].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + descriptorWrites[1].descriptorCount = 1; + descriptorWrites[1].pImageInfo = &imageInfo; + + vkUpdateDescriptorSets( + device, static_cast<uint32_t>(descriptorWrites.size()), descriptorWrites.data(), 0, nullptr); + } + } + + void createBuffer(VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, + VkBuffer &buffer, + VkDeviceMemory &bufferMemory) + { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType =
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(device, buffer, bufferMemory, 0); + } + + VkCommandBuffer beginSingleTimeCommands() + { + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandPool = commandPool; + allocInfo.commandBufferCount = 1; + + VkCommandBuffer commandBuffer; + vkAllocateCommandBuffers(device, &allocInfo, &commandBuffer); + + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(commandBuffer, &beginInfo); + + return commandBuffer; + } + + void endSingleTimeCommands(VkCommandBuffer commandBuffer) + { + vkEndCommandBuffer(commandBuffer); + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffer; + + vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE); + vkQueueWaitIdle(graphicsQueue); + + vkFreeCommandBuffers(device, commandPool, 1, &commandBuffer); + } + + void copyBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size) + { + VkCommandBuffer commandBuffer = beginSingleTimeCommands(); + + VkBufferCopy copyRegion = {}; + copyRegion.size = size; + vkCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, 1, &copyRegion); + + endSingleTimeCommands(commandBuffer); + } + + uint32_t findMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) + { + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if ((typeFilter & (1 << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + + throw std::runtime_error("failed to find suitable memory type!"); + } + + void createCommandBuffers() + { + commandBuffers.resize(swapChainFramebuffers.size()); + + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = (uint32_t)commandBuffers.size(); + + if (vkAllocateCommandBuffers(device, &allocInfo, commandBuffers.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + + if (vkBeginCommandBuffer(commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = renderPass; + renderPassInfo.framebuffer = swapChainFramebuffers[i]; + renderPassInfo.renderArea.offset = {0, 0}; + renderPassInfo.renderArea.extent = swapChainExtent; + + VkClearValue clearColor = {0.0f, 0.0f,
0.0f, 1.0f}; + renderPassInfo.clearValueCount = 1; + renderPassInfo.pClearValues = &clearColor; + + vkCmdBeginRenderPass(commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); + + vkCmdBindPipeline(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline); + + VkBuffer vertexBuffers[] = {vertexBuffer}; + VkDeviceSize offsets[] = {0}; + vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, vertexBuffers, offsets); + + vkCmdBindIndexBuffer(commandBuffers[i], indexBuffer, 0, VK_INDEX_TYPE_UINT16); + + vkCmdBindDescriptorSets(commandBuffers[i], + VK_PIPELINE_BIND_POINT_GRAPHICS, + pipelineLayout, + 0, + 1, + &descriptorSets[i], + 0, + nullptr); + + vkCmdDrawIndexed(commandBuffers[i], static_cast<uint32_t>(indices.size()), 1, 0, 0, 0); + // vkCmdDraw(commandBuffers[i], static_cast<uint32_t>(vertices.size()), 1, + // 0, 0); + + vkCmdEndRenderPass(commandBuffers[i]); + + if (vkEndCommandBuffer(commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } + } + + void createSyncObjects() + { + imageAvailableSemaphores.resize(MAX_FRAMES); + renderFinishedSemaphores.resize(MAX_FRAMES); + inFlightFences.resize(MAX_FRAMES); + + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + + VkFenceCreateInfo fenceInfo = {}; + fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT; + + for (size_t i = 0; i < MAX_FRAMES; i++) { + if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &imageAvailableSemaphores[i]) != VK_SUCCESS + || vkCreateSemaphore(device, &semaphoreInfo, nullptr, &renderFinishedSemaphores[i]) != VK_SUCCESS + || vkCreateFence(device, &fenceInfo, nullptr, &inFlightFences[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create synchronization objects for a frame!"); + } + } + } + + void createSyncObjectsExt() + { + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + + memset(&semaphoreInfo, 0, sizeof(semaphoreInfo)); + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportSemaphoreWin32HandleInfoKHR vulkanExportSemaphoreWin32HandleInfoKHR = {}; + vulkanExportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + vulkanExportSemaphoreWin32HandleInfoKHR.pNext = NULL; + vulkanExportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportSemaphoreCreateInfoKHR vulkanExportSemaphoreCreateInfo = {}; + vulkanExportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportSemaphoreCreateInfo.pNext = IsWindows8OrGreater() ? &vulkanExportSemaphoreWin32HandleInfoKHR : NULL; + vulkanExportSemaphoreCreateInfo.handleTypes = IsWindows8OrGreater() + ?
VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT + : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + vulkanExportSemaphoreCreateInfo.pNext = NULL; + vulkanExportSemaphoreCreateInfo.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + semaphoreInfo.pNext = &vulkanExportSemaphoreCreateInfo; + + if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &cudaUpdateVkSemaphore) != VK_SUCCESS + || vkCreateSemaphore(device, &semaphoreInfo, nullptr, &vkUpdateCudaSemaphore) != VK_SUCCESS) { + throw std::runtime_error("failed to create synchronization objects for CUDA-Vulkan interop!"); + } + } + + void updateUniformBuffer() + { + UniformBufferObject ubo = {}; + + mat4x4_identity(ubo.model); + mat4x4 Model; + mat4x4_dup(Model, ubo.model); + mat4x4_rotate(ubo.model, Model, 0.0f, 0.0f, 1.0f, degreesToRadians(135.0f)); + + vec3 eye = {2.0f, 2.0f, 2.0f}; + vec3 center = {0.0f, 0.0f, 0.0f}; + vec3 up = {0.0f, 0.0f, 1.0f}; + mat4x4_look_at(ubo.view, eye, center, up); + + mat4x4_perspective( + ubo.proj, degreesToRadians(45.0f), swapChainExtent.width / (float)swapChainExtent.height, 0.1f, 10.0f); + ubo.proj[1][1] *= -1; + + for (size_t i = 0; i < swapChainImages.size(); i++) { + void *data; + vkMapMemory(device, uniformBuffersMemory[i], 0, sizeof(ubo), 0, &data); + memcpy(data, &ubo, sizeof(ubo)); + vkUnmapMemory(device, uniformBuffersMemory[i]); + } + } + + void drawFrame() + { + static int startSubmit = 0; + + vkWaitForFences(device, 1, &inFlightFences[currentFrame], VK_TRUE, std::numeric_limits<uint64_t>::max()); + + uint32_t imageIndex; + VkResult result = vkAcquireNextImageKHR(device, + swapChain, + std::numeric_limits<uint64_t>::max(), + imageAvailableSemaphores[currentFrame], + VK_NULL_HANDLE, + &imageIndex); + + if (result == VK_ERROR_OUT_OF_DATE_KHR) { + recreateSwapChain(); + return; + } + else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { + throw std::runtime_error("failed to acquire swap chain image!"); + } + + vkResetFences(device, 1, &inFlightFences[currentFrame]); + + if (!startSubmit) { + submitVulkan(imageIndex); + startSubmit = 1; + } + else { + submitVulkanCuda(imageIndex); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphores[currentFrame]}; + + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = signalSemaphores; + + VkSwapchainKHR swapChains[] = {swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + presentInfo.pResults = nullptr; // Optional + + result = vkQueuePresentKHR(presentQueue, &presentInfo); + + if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || framebufferResized) { + framebufferResized = false; + recreateSwapChain(); + } + else if (result != VK_SUCCESS) { + throw std::runtime_error("failed to present swap chain image!"); + } + + cudaUpdateVkImage(); + + currentFrame = (currentFrame + 1) % MAX_FRAMES; + // Sleep for 10 ms so that the CPU does not submit too much work to the GPU + std::this_thread::sleep_for(std::chrono::microseconds(10000)); + char title[256]; + sprintf(title, "Vulkan Image CUDA Box Filter (radius=%d)", filter_radius); + glfwSetWindowTitle(window, title); + } + + void cudaVkSemaphoreSignal(cudaExternalSemaphore_t &extSemaphore) + { + cudaExternalSemaphoreSignalParams extSemaphoreSignalParams; + memset(&extSemaphoreSignalParams, 0, sizeof(extSemaphoreSignalParams)); +
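+ // These are binary (opaque) semaphores, so the fence value is unused and left at zero.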
+ extSemaphoreSignalParams.params.fence.value = 0; + extSemaphoreSignalParams.flags = 0; + checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSemaphore, &extSemaphoreSignalParams, 1, streamToRun)); + } + + void cudaVkSemaphoreWait(cudaExternalSemaphore_t &extSemaphore) + { + cudaExternalSemaphoreWaitParams extSemaphoreWaitParams; + + memset(&extSemaphoreWaitParams, 0, sizeof(extSemaphoreWaitParams)); + + extSemaphoreWaitParams.params.fence.value = 0; + extSemaphoreWaitParams.flags = 0; + + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSemaphore, &extSemaphoreWaitParams, 1, streamToRun)); + } + + void submitVulkan(uint32_t imageIndex) + { + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + VkSemaphore waitSemaphores[] = {imageAvailableSemaphores[currentFrame]}; + VkPipelineStageFlags waitStages[] = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; + submitInfo.waitSemaphoreCount = 1; + submitInfo.pWaitSemaphores = waitSemaphores; + submitInfo.pWaitDstStageMask = waitStages; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphores[currentFrame], vkUpdateCudaSemaphore}; + + submitInfo.signalSemaphoreCount = 2; + submitInfo.pSignalSemaphores = signalSemaphores; + + if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + } + + void submitVulkanCuda(uint32_t imageIndex) + { + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + VkSemaphore waitSemaphores[] = {imageAvailableSemaphores[currentFrame], cudaUpdateVkSemaphore}; + VkPipelineStageFlags waitStages[] = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; + submitInfo.waitSemaphoreCount = 2; + submitInfo.pWaitSemaphores = waitSemaphores; + submitInfo.pWaitDstStageMask = waitStages; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphores[currentFrame], vkUpdateCudaSemaphore}; + + submitInfo.signalSemaphoreCount = 2; + submitInfo.pSignalSemaphores = signalSemaphores; + + if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + } + + VkShaderModule createShaderModule(const std::vector<char> &code) + { + VkShaderModuleCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = code.size(); + createInfo.pCode = reinterpret_cast<const uint32_t *>(code.data()); + + VkShaderModule shaderModule; + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { + throw std::runtime_error("failed to create shader module!"); + } + + return shaderModule; + } + + VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector<VkSurfaceFormatKHR> &availableFormats) + { + if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; + } + + for (const auto &availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM + && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; + } + + VkPresentModeKHR chooseSwapPresentMode(const std::vector<VkPresentModeKHR> &availablePresentModes) + {
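+ // Prefer MAILBOX for low-latency triple buffering, then IMMEDIATE; FIFO is the only mode guaranteed to be available.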
+ VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto &availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } + else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; + } + + VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR &capabilities) + { + if (capabilities.currentExtent.width != std::numeric_limits<uint32_t>::max()) { + return capabilities.currentExtent; + } + else { + int width, height; + glfwGetFramebufferSize(window, &width, &height); + + VkExtent2D actualExtent = {static_cast<uint32_t>(width), static_cast<uint32_t>(height)}; + + actualExtent.width = std::max(capabilities.minImageExtent.width, + std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max(capabilities.minImageExtent.height, + std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } + } + + SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) + { + SwapChainSupportDetails details; + + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &details.capabilities); + + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + + if (formatCount != 0) { + details.formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, details.formats.data()); + } + + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); + + if (presentModeCount != 0) { + details.presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, details.presentModes.data()); + } + + return details; + } + + bool isDeviceSuitable(VkPhysicalDevice device) + { + QueueFamilyIndices indices = findQueueFamilies(device); + + bool extensionsSupported = checkDeviceExtensionSupport(device); + + bool swapChainAdequate = false; + if (extensionsSupported) { + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device); + swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty(); + } + + VkPhysicalDeviceFeatures supportedFeatures; + vkGetPhysicalDeviceFeatures(device, &supportedFeatures); + + return indices.isComplete() && extensionsSupported && swapChainAdequate && supportedFeatures.samplerAnisotropy; + } + + bool checkDeviceExtensionSupport(VkPhysicalDevice device) + { + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); + + std::vector<VkExtensionProperties> availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); + + std::set<std::string> requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); + + for (const auto &extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } + + return requiredExtensions.empty(); + } + + QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) + { + QueueFamilyIndices indices; + + uint32_t queueFamilyCount = 0; + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); + + std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); + + int i = 0; + for (const auto &queueFamily : queueFamilies) { + if (queueFamily.queueCount > 0 && queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT)
{ + indices.graphicsFamily = i; + } + + VkBool32 presentSupport = false; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + + if (queueFamily.queueCount > 0 && presentSupport) { + indices.presentFamily = i; + } + + if (indices.isComplete()) { + break; + } + + i++; + } + + return indices; + } + + std::vector<const char *> getRequiredExtensions() + { + uint32_t glfwExtensionCount = 0; + const char **glfwExtensions; + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + + std::vector<const char *> extensions(glfwExtensions, glfwExtensions + glfwExtensionCount); + + if (enableValidationLayers) { + extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + + return extensions; + } + + bool checkValidationLayerSupport() + { + uint32_t layerCount; + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + + std::vector<VkLayerProperties> availableLayers(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + + for (const char *layerName : validationLayers) { + bool layerFound = false; + + for (const auto &layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; + } + + static std::vector<char> readFile(const std::string &filename) + { + char *file_path = sdkFindFilePath(filename.c_str(), execution_path.c_str()); + std::ifstream file(file_path, std::ios::ate | std::ios::binary); + + if (!file.is_open()) { + throw std::runtime_error("failed to open file!"); + } + + size_t fileSize = (size_t)file.tellg(); + std::vector<char> buffer(fileSize); + + file.seekg(0); + file.read(buffer.data(), fileSize); + + file.close(); + + return buffer; + } + + static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + VkDebugUtilsMessageTypeFlagsEXT messageType, + const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, + void *pUserData) + { + std::cerr << "validation layer: " << pCallbackData->pMessage << std::endl; + + return VK_FALSE; + } }; -int main(int argc, char** argv) { - execution_path = argv[0]; - std::string image_filename = "teapot1024.ppm"; +int main(int argc, char **argv) +{ + execution_path = argv[0]; + std::string image_filename = "teapot1024.ppm"; - if (checkCmdLineFlag(argc, (const char**)argv, "file")) { - getCmdLineArgumentString(argc, (const char**)argv, "file", - (char**)&image_filename); - } + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&image_filename); + } - vulkanImageCUDA app; + vulkanImageCUDA app; - try { - // This app only works on ppm images - app.loadImageData(image_filename); - app.run(); - } catch (const std::exception& e) { - std::cerr << e.what() << std::endl; - return EXIT_FAILURE; - } + try { + // This app only works on ppm images + app.loadImageData(image_filename); + app.run(); + } + catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return EXIT_FAILURE; + } - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/Samples/6_Performance/LargeKernelParameter/LargeKernelParameter.cu b/Samples/6_Performance/LargeKernelParameter/LargeKernelParameter.cu index 1b686a8a..177945a4 100644 --- a/Samples/6_Performance/LargeKernelParameter/LargeKernelParameter.cu +++ b/Samples/6_Performance/LargeKernelParameter/LargeKernelParameter.cu @@ -29,9 +29,9 @@ * This is a simple test showing performance and usability * improvements with large kernel parameters introduced in CUDA 12.1 */
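+// Note: CUDA 12.1 raises the per-launch kernel parameter limit from 4,096 bytes to 32,764 bytes, which is what the two kernels below compare.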
+#include <cassert> #include <chrono> #include <iostream> -#include <cassert> // Utility includes #include <helper_cuda.h> @@ -40,134 +40,141 @@ using namespace std; using namespace std::chrono; #define TEST_ITERATIONS (1000) -#define TOTAL_PARAMS (8000) // ints -#define KERNEL_PARAM_LIMIT (1024) // ints +#define TOTAL_PARAMS (8000) // ints +#define KERNEL_PARAM_LIMIT (1024) // ints #define CONST_COPIED_PARAMS (TOTAL_PARAMS - KERNEL_PARAM_LIMIT) __constant__ int excess_params[CONST_COPIED_PARAMS]; -typedef struct { - int param[KERNEL_PARAM_LIMIT]; +typedef struct +{ + int param[KERNEL_PARAM_LIMIT]; } param_t; -typedef struct { - int param[TOTAL_PARAMS]; +typedef struct +{ + int param[TOTAL_PARAMS]; } param_large_t; // Kernel with 4KB kernel parameter limit -__global__ void kernelDefault(__grid_constant__ const param_t p, int *result) { - int tmp = 0; +__global__ void kernelDefault(__grid_constant__ const param_t p, int *result) +{ + int tmp = 0; - // accumulate kernel parameters - for (int i = 0; i < KERNEL_PARAM_LIMIT; ++i) { - tmp += p.param[i]; - } + // accumulate kernel parameters + for (int i = 0; i < KERNEL_PARAM_LIMIT; ++i) { + tmp += p.param[i]; + } - // accumulate excess values passed via const memory - for (int i = 0; i < CONST_COPIED_PARAMS; ++i) { - tmp += excess_params[i]; - } + // accumulate excess values passed via const memory + for (int i = 0; i < CONST_COPIED_PARAMS; ++i) { + tmp += excess_params[i]; + } - *result = tmp; + *result = tmp; } // Kernel with 32,764 byte kernel parameter limit -__global__ void kernelLargeParam(__grid_constant__ const param_large_t p, int *result) { - int tmp = 0; +__global__ void kernelLargeParam(__grid_constant__ const param_large_t p, int *result) +{ + int tmp = 0; - // accumulate kernel parameters - for (int i = 0; i < TOTAL_PARAMS; ++i) { - tmp += p.param[i]; - } + // accumulate kernel parameters + for (int i = 0; i < TOTAL_PARAMS; ++i) { + tmp += p.param[i]; + } - *result = tmp; + *result = tmp; } static void report_time(std::chrono::time_point<std::chrono::steady_clock> start, std::chrono::time_point<std::chrono::steady_clock> end, - int iters) { - auto usecs = duration_cast<duration<double, micro>>(end - start); - cout << usecs.count() / iters << endl; + int iters) + { + auto usecs = duration_cast<duration<double, micro>>(end - start); + cout << usecs.count() / iters << endl; } -int main() { - int rc; - cudaFree(0); +int main() +{ + int rc; + cudaFree(0); - param_t p; - param_large_t p_large; + param_t p; + param_large_t p_large; - // pageable host memory that holds excess constants passed via constant memory - int *copied_params = (int *)malloc(CONST_COPIED_PARAMS * sizeof(int)); - assert(copied_params); + // pageable host memory that holds excess constants passed via constant memory + int *copied_params = (int *)malloc(CONST_COPIED_PARAMS * sizeof(int)); + assert(copied_params); - // storage for computed result - int *d_result; - int h_result; - checkCudaErrors(cudaMalloc(&d_result, sizeof(int))); + // storage for computed result + int *d_result; + int h_result; + checkCudaErrors(cudaMalloc(&d_result, sizeof(int))); - int expected_result = 0; + int expected_result = 0; - // fill in data for validation - for (int i = 0; i < KERNEL_PARAM_LIMIT; ++i) { - p.param[i] = (i & 0xFF); - } - for (int i = KERNEL_PARAM_LIMIT; i < TOTAL_PARAMS; ++i) { - copied_params[i - KERNEL_PARAM_LIMIT] = (i & 0xFF); - } - for (int i = 0; i < TOTAL_PARAMS; ++i) { - p_large.param[i] = (i & 0xFF); - expected_result += (i & 0xFF); - } - - // warmup, verify correctness - checkCudaErrors(cudaMemcpyToSymbol(excess_params, copied_params, CONST_COPIED_PARAMS * sizeof(int), 0,
cudaMemcpyHostToDevice)); - kernelDefault<<<1,1>>>(p, d_result); - checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaDeviceSynchronize()); - if(h_result != expected_result) { - std::cout << "Test failed" << std::endl; - rc=-1; - goto Exit; - } - - kernelLargeParam<<<1,1>>>(p_large, d_result); - checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaDeviceSynchronize()); - if(h_result != expected_result) { - std::cout << "Test failed" << std::endl; - rc=-1; - goto Exit; - } - - // benchmark default kernel parameter limit - { - auto start = steady_clock::now(); - for (int i = 0; i < TEST_ITERATIONS; ++i) { - checkCudaErrors(cudaMemcpyToSymbol(excess_params, copied_params, CONST_COPIED_PARAMS * sizeof(int), 0, cudaMemcpyHostToDevice)); - kernelDefault<<<1, 1>>>(p, d_result); + // fill in data for validation + for (int i = 0; i < KERNEL_PARAM_LIMIT; ++i) { + p.param[i] = (i & 0xFF); + } + for (int i = KERNEL_PARAM_LIMIT; i < TOTAL_PARAMS; ++i) { + copied_params[i - KERNEL_PARAM_LIMIT] = (i & 0xFF); + } + for (int i = 0; i < TOTAL_PARAMS; ++i) { + p_large.param[i] = (i & 0xFF); + expected_result += (i & 0xFF); } - checkCudaErrors(cudaDeviceSynchronize()); - auto end = steady_clock::now(); - std::cout << "Kernel 4KB parameter limit - time (us):"; - report_time(start, end, TEST_ITERATIONS); - // benchmark large kernel parameter limit - start = steady_clock::now(); - for (int i = 0; i < TEST_ITERATIONS; ++i) { - kernelLargeParam<<<1, 1>>>(p_large, d_result); - } + // warmup, verify correctness + checkCudaErrors( + cudaMemcpyToSymbol(excess_params, copied_params, CONST_COPIED_PARAMS * sizeof(int), 0, cudaMemcpyHostToDevice)); + kernelDefault<<<1, 1>>>(p, d_result); + checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaDeviceSynchronize()); - end = steady_clock::now(); - std::cout << "Kernel 32,764 byte parameter limit - time (us):"; - report_time(start, end, TEST_ITERATIONS); - } - std::cout << "Test passed!" << std::endl; - rc=0; + if (h_result != expected_result) { + std::cout << "Test failed" << std::endl; + rc = -1; + goto Exit; + } + + kernelLargeParam<<<1, 1>>>(p_large, d_result); + checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaDeviceSynchronize()); + if (h_result != expected_result) { + std::cout << "Test failed" << std::endl; + rc = -1; + goto Exit; + } + + // benchmark default kernel parameter limit + { + auto start = steady_clock::now(); + for (int i = 0; i < TEST_ITERATIONS; ++i) { + checkCudaErrors(cudaMemcpyToSymbol( + excess_params, copied_params, CONST_COPIED_PARAMS * sizeof(int), 0, cudaMemcpyHostToDevice)); + kernelDefault<<<1, 1>>>(p, d_result); + } + checkCudaErrors(cudaDeviceSynchronize()); + auto end = steady_clock::now(); + std::cout << "Kernel 4KB parameter limit - time (us):"; + report_time(start, end, TEST_ITERATIONS); + + // benchmark large kernel parameter limit + start = steady_clock::now(); + for (int i = 0; i < TEST_ITERATIONS; ++i) { + kernelLargeParam<<<1, 1>>>(p_large, d_result); + } + checkCudaErrors(cudaDeviceSynchronize()); + end = steady_clock::now(); + std::cout << "Kernel 32,764 byte parameter limit - time (us):"; + report_time(start, end, TEST_ITERATIONS); + } + std::cout << "Test passed!" 
<< std::endl; + rc = 0; Exit: - // cleanup - cudaFree(d_result); - free(copied_params); - return rc; + // cleanup + cudaFree(d_result); + free(copied_params); + return rc; } diff --git a/Samples/6_Performance/UnifiedMemoryPerf/commonDefs.hpp b/Samples/6_Performance/UnifiedMemoryPerf/commonDefs.hpp index 9efbdc5d..7fd9dfcd 100644 --- a/Samples/6_Performance/UnifiedMemoryPerf/commonDefs.hpp +++ b/Samples/6_Performance/UnifiedMemoryPerf/commonDefs.hpp @@ -33,33 +33,28 @@ #define ONE_MB (ONE_KB * ONE_KB) extern size_t maxSampleSizeInMb; -extern int numKernelRuns; -extern int verboseResults; +extern int numKernelRuns; +extern int verboseResults; -extern unsigned int findNumSizesToTest(unsigned int minSize, - unsigned int maxSize, - unsigned int multiplier); +extern unsigned int findNumSizesToTest(unsigned int minSize, unsigned int maxSize, unsigned int multiplier); // For Tracking the different memory allocation types typedef enum memAllocType_enum { - MEMALLOC_TYPE_START, - USE_MANAGED_MEMORY_WITH_HINTS = MEMALLOC_TYPE_START, - USE_MANAGED_MEMORY_WITH_HINTS_ASYNC, - USE_MANAGED_MEMORY, - USE_ZERO_COPY, - USE_HOST_PAGEABLE_AND_DEVICE_MEMORY, - USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC, - USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY, - USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC, - MEMALLOC_TYPE_END = USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC, - MEMALLOC_TYPE_INVALID, - MEMALLOC_TYPE_COUNT = MEMALLOC_TYPE_INVALID + MEMALLOC_TYPE_START, + USE_MANAGED_MEMORY_WITH_HINTS = MEMALLOC_TYPE_START, + USE_MANAGED_MEMORY_WITH_HINTS_ASYNC, + USE_MANAGED_MEMORY, + USE_ZERO_COPY, + USE_HOST_PAGEABLE_AND_DEVICE_MEMORY, + USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC, + USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY, + USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC, + MEMALLOC_TYPE_END = USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC, + MEMALLOC_TYPE_INVALID, + MEMALLOC_TYPE_COUNT = MEMALLOC_TYPE_INVALID } MemAllocType; -typedef enum bandwidthType_enum { - READ_BANDWIDTH, - WRITE_BANDWIDTH -} BandwidthType; +typedef enum bandwidthType_enum { READ_BANDWIDTH, WRITE_BANDWIDTH } BandwidthType; extern const char *memAllocTypeStr[]; extern const char *memAllocTypeShortStr[]; @@ -67,22 +62,20 @@ extern const char *memAllocTypeShortStr[]; struct resultsData; struct testResults; -void createAndInitTestResults(struct testResults **results, - const char *testName, - unsigned int numMeasurements, - unsigned int numSizesToTest); +void createAndInitTestResults(struct testResults **results, + const char *testName, + unsigned int numMeasurements, + unsigned int numSizesToTest); unsigned long *getPtrSizesToTest(struct testResults *results); void freeTestResultsAndAllResultsData(struct testResults *results); -void createResultDataAndAddToTestResults(struct resultsData **ptrData, - struct testResults *results, - const char *resultsName, - bool printOnlyInVerbose, - bool reportAsBandwidth); -double *getPtrRunTimesInMs(struct resultsData *data, int allocType, - int sizeIndex); +void createResultDataAndAddToTestResults(struct resultsData **ptrData, + struct testResults *results, + const char *resultsName, + bool printOnlyInVerbose, + bool reportAsBandwidth); +double *getPtrRunTimesInMs(struct resultsData *data, int allocType, int sizeIndex); -void printResults(struct testResults *results, - bool print_launch_transfer_results, bool print_std_deviation); +void printResults(struct testResults *results, bool print_launch_transfer_results, bool print_std_deviation); #endif diff --git a/Samples/6_Performance/UnifiedMemoryPerf/commonKernels.cu 
b/Samples/6_Performance/UnifiedMemoryPerf/commonKernels.cu index 73fecbc1..96e2e695 100644 --- a/Samples/6_Performance/UnifiedMemoryPerf/commonKernels.cu +++ b/Samples/6_Performance/UnifiedMemoryPerf/commonKernels.cu @@ -27,7 +27,8 @@ #include "commonKernels.hpp" -__global__ void spinWhileLessThanOne(volatile unsigned int *latch) { - while (latch[0] < 1) - ; +__global__ void spinWhileLessThanOne(volatile unsigned int *latch) +{ + while (latch[0] < 1) + ; } diff --git a/Samples/6_Performance/UnifiedMemoryPerf/helperFunctions.cpp b/Samples/6_Performance/UnifiedMemoryPerf/helperFunctions.cpp index d9483e06..99d7f5bf 100644 --- a/Samples/6_Performance/UnifiedMemoryPerf/helperFunctions.cpp +++ b/Samples/6_Performance/UnifiedMemoryPerf/helperFunctions.cpp @@ -27,277 +27,267 @@ #include #include + #include "commonDefs.hpp" #define CU_INIT_UUID #include #define UNITS_Time "ms" -#define UNITS_BW "MB/s" -#define KB_str "KB" -#define MB_str "MB" +#define UNITS_BW "MB/s" +#define KB_str "KB" +#define MB_str "MB" -struct resultsData { - char resultsName[64]; - struct testResults *results; - // this has MEMALLOC_TYPE_COUNT * results->numSizesToTest * - // results->numMeasurements elements - double **runTimesInMs[MEMALLOC_TYPE_COUNT]; - double *averageRunTimesInMs[MEMALLOC_TYPE_COUNT]; - double *stdDevRunTimesInMs[MEMALLOC_TYPE_COUNT]; - double *stdDevBandwidthInMBps[MEMALLOC_TYPE_COUNT]; - bool printOnlyInVerbose; - bool reportAsBandwidth; - struct resultsData *next; +struct resultsData +{ + char resultsName[64]; + struct testResults *results; + // this has MEMALLOC_TYPE_COUNT * results->numSizesToTest * + // results->numMeasurements elements + double **runTimesInMs[MEMALLOC_TYPE_COUNT]; + double *averageRunTimesInMs[MEMALLOC_TYPE_COUNT]; + double *stdDevRunTimesInMs[MEMALLOC_TYPE_COUNT]; + double *stdDevBandwidthInMBps[MEMALLOC_TYPE_COUNT]; + bool printOnlyInVerbose; + bool reportAsBandwidth; + struct resultsData *next; }; -struct testResults { - char testName[64]; - unsigned int numMeasurements; - unsigned long *sizesToTest; - unsigned int numSizesToTest; - struct resultsData *resultsDataHead; - struct resultsData *resultsDataTail; +struct testResults +{ + char testName[64]; + unsigned int numMeasurements; + unsigned long *sizesToTest; + unsigned int numSizesToTest; + struct resultsData *resultsDataHead; + struct resultsData *resultsDataTail; }; -unsigned int findNumSizesToTest(unsigned int minSize, unsigned int maxSize, - unsigned int multiplier) { - unsigned int numSizesToTest = 0; - while (minSize <= maxSize) { - numSizesToTest++; - minSize *= multiplier; - } - return numSizesToTest; +unsigned int findNumSizesToTest(unsigned int minSize, unsigned int maxSize, unsigned int multiplier) +{ + unsigned int numSizesToTest = 0; + while (minSize <= maxSize) { + numSizesToTest++; + minSize *= multiplier; + } + return numSizesToTest; } -int compareDoubles(const void *ptr1, const void *ptr2) { - return (*(double *)ptr1 > *(double *)ptr2) ? 1 : -1; -} +int compareDoubles(const void *ptr1, const void *ptr2) { return (*(double *)ptr1 > *(double *)ptr2) ? 1 : -1; } -static inline double getTimeOrBandwidth(double runTimeInMs, unsigned long size, - bool getBandwidth) { - return (getBandwidth) ? (1000 * (size / runTimeInMs)) / ONE_MB : runTimeInMs; +static inline double getTimeOrBandwidth(double runTimeInMs, unsigned long size, bool getBandwidth) +{ + return (getBandwidth) ? 
(1000 * (size / runTimeInMs)) / ONE_MB : runTimeInMs; } void createAndInitTestResults(struct testResults **ptrResults, - const char *testName, - unsigned int numMeasurements, - unsigned int numSizesToTest) { - unsigned int i; - struct testResults *results; - results = (struct testResults *)malloc(sizeof(struct testResults)); - memset(results, 0, sizeof(struct testResults)); - strcpy(results->testName, testName); - results->numMeasurements = numMeasurements; - results->numSizesToTest = numSizesToTest; - results->sizesToTest = - (unsigned long *)malloc(numSizesToTest * sizeof(unsigned long)); - results->resultsDataHead = NULL; - results->resultsDataTail = NULL; + const char *testName, + unsigned int numMeasurements, + unsigned int numSizesToTest) +{ + unsigned int i; + struct testResults *results; + results = (struct testResults *)malloc(sizeof(struct testResults)); + memset(results, 0, sizeof(struct testResults)); + strcpy(results->testName, testName); + results->numMeasurements = numMeasurements; + results->numSizesToTest = numSizesToTest; + results->sizesToTest = (unsigned long *)malloc(numSizesToTest * sizeof(unsigned long)); + results->resultsDataHead = NULL; + results->resultsDataTail = NULL; - *ptrResults = results; + *ptrResults = results; } -unsigned long *getPtrSizesToTest(struct testResults *results) { - return results->sizesToTest; -} +unsigned long *getPtrSizesToTest(struct testResults *results) { return results->sizesToTest; } void createResultDataAndAddToTestResults(struct resultsData **ptrData, - struct testResults *results, - const char *resultsName, - bool printOnlyInVerbose, - bool reportAsBandwidth) { - unsigned int i, j; - struct resultsData *data; - data = (struct resultsData *)malloc(sizeof(struct resultsData)); - memset(data, 0, sizeof(struct resultsData)); - strcpy(data->resultsName, resultsName); - data->results = results; - for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { - data->runTimesInMs[i] = - (double **)malloc(results->numSizesToTest * sizeof(double *)); - for (j = 0; j < results->numSizesToTest; j++) { - data->runTimesInMs[i][j] = - (double *)malloc(results->numMeasurements * sizeof(double)); - } - data->averageRunTimesInMs[i] = - (double *)malloc(results->numSizesToTest * sizeof(double)); - data->stdDevRunTimesInMs[i] = - (double *)malloc(results->numSizesToTest * sizeof(double)); - data->stdDevBandwidthInMBps[i] = - (double *)malloc(results->numSizesToTest * sizeof(double)); - } - data->printOnlyInVerbose = printOnlyInVerbose; - data->reportAsBandwidth = reportAsBandwidth; - data->next = NULL; - *ptrData = data; - if (results->resultsDataHead == NULL) { - results->resultsDataHead = data; - results->resultsDataTail = data; - } else { - results->resultsDataTail->next = data; - results->resultsDataTail = data; - } -} - -double *getPtrRunTimesInMs(struct resultsData *data, int allocType, - int sizeIndex) { - return data->runTimesInMs[allocType][sizeIndex]; -} - -void freeTestResultsAndAllResultsData(struct testResults *results) { - struct resultsData *data, *dataToFree; - unsigned int i, j; - for (data = results->resultsDataHead; data != NULL;) { + struct testResults *results, + const char *resultsName, + bool printOnlyInVerbose, + bool reportAsBandwidth) +{ + unsigned int i, j; + struct resultsData *data; + data = (struct resultsData *)malloc(sizeof(struct resultsData)); + memset(data, 0, sizeof(struct resultsData)); + strcpy(data->resultsName, resultsName); + data->results = results; for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { - for (j = 0; j < 
results->numSizesToTest; j++) { - free(data->runTimesInMs[i][j]); - } - free(data->runTimesInMs[i]); - free(data->averageRunTimesInMs[i]); - free(data->stdDevRunTimesInMs[i]); - free(data->stdDevBandwidthInMBps[i]); - } - dataToFree = data; - data = data->next; - free(dataToFree); - } - free(results->sizesToTest); - free(results); -} - -void calculateAverageAndStdDev(double *pAverage, double *pStdDev, - double *allResults, unsigned int count) { - unsigned int i; - double average = 0.0; - double stdDev = 0.0; - for (i = 0; i < count; i++) { - average += allResults[i]; - } - average /= count; - for (i = 0; i < count; i++) { - stdDev += (allResults[i] - average) * (allResults[i] - average); - } - stdDev /= count; - stdDev = sqrt(stdDev); - *pAverage = average; - *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average); -} - -void calculateStdDevBandwidth(double *pStdDev, double *allResults, - unsigned int count, unsigned long size) { - unsigned int i; - double bandwidth; - double average = 0.0; - double stdDev = 0.0; - for (i = 0; i < count; i++) { - bandwidth = (1000 * (size / allResults[i])) / ONE_MB; - average += bandwidth; - } - average /= count; - for (i = 0; i < count; i++) { - bandwidth = (1000 * (size / allResults[i])) / ONE_MB; - stdDev += (bandwidth - average) * (bandwidth - average); - } - stdDev /= count; - stdDev = sqrt(stdDev); - *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average); -} - -void printTimesInTableFormat(struct testResults *results, - struct resultsData *data, bool printAverage, - bool printStdDev) { - unsigned int i, j; - bool printStdDevBandwidth = printStdDev && data->reportAsBandwidth; - printf("Size_KB"); - for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { - printf("\t%7s", memAllocTypeShortStr[i]); - } - printf("\n"); - for (j = 0; j < results->numSizesToTest; j++) { - printf("%lu", results->sizesToTest[j] / ONE_KB); - for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { - printf(data->reportAsBandwidth ? "\t%7.2lf" : "\t%7.3lf", - printStdDevBandwidth - ? data->stdDevBandwidthInMBps[i][j] - : getTimeOrBandwidth( - printAverage ? data->averageRunTimesInMs[i][j] - : data->stdDevRunTimesInMs[i][j], - results->sizesToTest[j], data->reportAsBandwidth)); - } - printf("\n"); - } -} - -void printAllResultsInVerboseMode(struct testResults *results, - struct resultsData *data) { - unsigned int i, j, k; - for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { - printf("Verbose mode, printing all results for %s\n", memAllocTypeStr[i]); - printf("Instance"); - for (j = 0; j < results->numSizesToTest; j++) { - printf("\t%lu", results->sizesToTest[j] / ONE_KB); - } - printf("\n"); - for (k = 0; k < results->numMeasurements; k++) { - printf("%u", k); - for (j = 0; j < results->numSizesToTest; j++) { - printf(data->reportAsBandwidth ? 
"\t%7.2lf" : "\t%7.3lf", - getTimeOrBandwidth(data->runTimesInMs[i][j][k], - results->sizesToTest[j], - data->reportAsBandwidth)); - } - printf("\n"); - } - } -} - -void printResults(struct testResults *results, - bool print_launch_transfer_results, - bool print_std_deviation) { - char vulcanPrint[256]; - char resultNameNoSpaces[64]; - unsigned int i, j, k; - struct resultsData *resultsIter; - bool sizeGreaterThan1MB; - for (resultsIter = results->resultsDataHead; resultsIter != NULL; - resultsIter = resultsIter->next) { - if (!verboseResults && resultsIter->printOnlyInVerbose) { - continue; - } - if (!print_launch_transfer_results) { - if (!(strcmp(resultsIter->resultsName, "Overall Time") == 0)) { - continue; - } - } - // regular print - printf("\n%s For %s ", resultsIter->resultsName, results->testName); - printf("\n"); - for (j = 0; j < results->numSizesToTest; j++) { - for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { - calculateAverageAndStdDev(&resultsIter->averageRunTimesInMs[i][j], - &resultsIter->stdDevRunTimesInMs[i][j], - resultsIter->runTimesInMs[i][j], - results->numMeasurements); - if (resultsIter->reportAsBandwidth) { - calculateStdDevBandwidth(&resultsIter->stdDevBandwidthInMBps[i][j], - resultsIter->runTimesInMs[i][j], - results->numMeasurements, - results->sizesToTest[j]); + data->runTimesInMs[i] = (double **)malloc(results->numSizesToTest * sizeof(double *)); + for (j = 0; j < results->numSizesToTest; j++) { + data->runTimesInMs[i][j] = (double *)malloc(results->numMeasurements * sizeof(double)); } - } + data->averageRunTimesInMs[i] = (double *)malloc(results->numSizesToTest * sizeof(double)); + data->stdDevRunTimesInMs[i] = (double *)malloc(results->numSizesToTest * sizeof(double)); + data->stdDevBandwidthInMBps[i] = (double *)malloc(results->numSizesToTest * sizeof(double)); } - printf("\nPrinting Average of %u measurements in (%s)\n", - results->numMeasurements, - resultsIter->reportAsBandwidth ? 
UNITS_BW : UNITS_Time); - printTimesInTableFormat(results, resultsIter, true, false); - if (print_std_deviation) { - printf( - "\nPrinting Standard Deviation as %% of average of %u measurements\n", - results->numMeasurements); - printTimesInTableFormat(results, resultsIter, false, true); + data->printOnlyInVerbose = printOnlyInVerbose; + data->reportAsBandwidth = reportAsBandwidth; + data->next = NULL; + *ptrData = data; + if (results->resultsDataHead == NULL) { + results->resultsDataHead = data; + results->resultsDataTail = data; } - if (verboseResults) { - printAllResultsInVerboseMode(results, resultsIter); + else { + results->resultsDataTail->next = data; + results->resultsDataTail = data; + } +} + +double *getPtrRunTimesInMs(struct resultsData *data, int allocType, int sizeIndex) +{ + return data->runTimesInMs[allocType][sizeIndex]; +} + +void freeTestResultsAndAllResultsData(struct testResults *results) +{ + struct resultsData *data, *dataToFree; + unsigned int i, j; + for (data = results->resultsDataHead; data != NULL;) { + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + for (j = 0; j < results->numSizesToTest; j++) { + free(data->runTimesInMs[i][j]); + } + free(data->runTimesInMs[i]); + free(data->averageRunTimesInMs[i]); + free(data->stdDevRunTimesInMs[i]); + free(data->stdDevBandwidthInMBps[i]); + } + dataToFree = data; + data = data->next; + free(dataToFree); + } + free(results->sizesToTest); + free(results); +} + +void calculateAverageAndStdDev(double *pAverage, double *pStdDev, double *allResults, unsigned int count) +{ + unsigned int i; + double average = 0.0; + double stdDev = 0.0; + for (i = 0; i < count; i++) { + average += allResults[i]; + } + average /= count; + for (i = 0; i < count; i++) { + stdDev += (allResults[i] - average) * (allResults[i] - average); + } + stdDev /= count; + stdDev = sqrt(stdDev); + *pAverage = average; + *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average); +} + +void calculateStdDevBandwidth(double *pStdDev, double *allResults, unsigned int count, unsigned long size) +{ + unsigned int i; + double bandwidth; + double average = 0.0; + double stdDev = 0.0; + for (i = 0; i < count; i++) { + bandwidth = (1000 * (size / allResults[i])) / ONE_MB; + average += bandwidth; + } + average /= count; + for (i = 0; i < count; i++) { + bandwidth = (1000 * (size / allResults[i])) / ONE_MB; + stdDev += (bandwidth - average) * (bandwidth - average); + } + stdDev /= count; + stdDev = sqrt(stdDev); + *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average); +} + +void printTimesInTableFormat(struct testResults *results, struct resultsData *data, bool printAverage, bool printStdDev) +{ + unsigned int i, j; + bool printStdDevBandwidth = printStdDev && data->reportAsBandwidth; + printf("Size_KB"); + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + printf("\t%7s", memAllocTypeShortStr[i]); + } + printf("\n"); + for (j = 0; j < results->numSizesToTest; j++) { + printf("%lu", results->sizesToTest[j] / ONE_KB); + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + printf(data->reportAsBandwidth ? "\t%7.2lf" : "\t%7.3lf", + printStdDevBandwidth ? data->stdDevBandwidthInMBps[i][j] + : getTimeOrBandwidth(printAverage ? 
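// [Illustrative aside] calculateAverageAndStdDev above reports the population
// standard deviation as a percentage of the mean (a coefficient of variation):
//
//     mean = (1/n) * sum(x_i)
//     sd   = sqrt((1/n) * sum((x_i - mean)^2))
//     *pStdDev = (mean == 0.0) ? 0.0 : 100.0 * sd / mean
//
// e.g. for three runs of {9, 10, 11} ms: mean = 10, sd ~= 0.816, so the
// reported deviation is ~8.2% of the average.
//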
data->averageRunTimesInMs[i][j] + : data->stdDevRunTimesInMs[i][j], + results->sizesToTest[j], + data->reportAsBandwidth)); + } + printf("\n"); + } +} + +void printAllResultsInVerboseMode(struct testResults *results, struct resultsData *data) +{ + unsigned int i, j, k; + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + printf("Verbose mode, printing all results for %s\n", memAllocTypeStr[i]); + printf("Instance"); + for (j = 0; j < results->numSizesToTest; j++) { + printf("\t%lu", results->sizesToTest[j] / ONE_KB); + } + printf("\n"); + for (k = 0; k < results->numMeasurements; k++) { + printf("%u", k); + for (j = 0; j < results->numSizesToTest; j++) { + printf( + data->reportAsBandwidth ? "\t%7.2lf" : "\t%7.3lf", + getTimeOrBandwidth(data->runTimesInMs[i][j][k], results->sizesToTest[j], data->reportAsBandwidth)); + } + printf("\n"); + } + } +} + +void printResults(struct testResults *results, bool print_launch_transfer_results, bool print_std_deviation) +{ + char vulcanPrint[256]; + char resultNameNoSpaces[64]; + unsigned int i, j, k; + struct resultsData *resultsIter; + bool sizeGreaterThan1MB; + for (resultsIter = results->resultsDataHead; resultsIter != NULL; resultsIter = resultsIter->next) { + if (!verboseResults && resultsIter->printOnlyInVerbose) { + continue; + } + if (!print_launch_transfer_results) { + if (!(strcmp(resultsIter->resultsName, "Overall Time") == 0)) { + continue; + } + } + // regular print + printf("\n%s For %s ", resultsIter->resultsName, results->testName); + printf("\n"); + for (j = 0; j < results->numSizesToTest; j++) { + for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) { + calculateAverageAndStdDev(&resultsIter->averageRunTimesInMs[i][j], + &resultsIter->stdDevRunTimesInMs[i][j], + resultsIter->runTimesInMs[i][j], + results->numMeasurements); + if (resultsIter->reportAsBandwidth) { + calculateStdDevBandwidth(&resultsIter->stdDevBandwidthInMBps[i][j], + resultsIter->runTimesInMs[i][j], + results->numMeasurements, + results->sizesToTest[j]); + } + } + } + printf("\nPrinting Average of %u measurements in (%s)\n", + results->numMeasurements, + resultsIter->reportAsBandwidth ? 
UNITS_BW : UNITS_Time); + printTimesInTableFormat(results, resultsIter, true, false); + if (print_std_deviation) { + printf("\nPrinting Standard Deviation as %% of average of %u measurements\n", results->numMeasurements); + printTimesInTableFormat(results, resultsIter, false, true); + } + if (verboseResults) { + printAllResultsInVerboseMode(results, resultsIter); + } } - } } diff --git a/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu b/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu index 6cccbd17..b260fbcc 100644 --- a/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu +++ b/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu @@ -27,671 +27,641 @@ #include #include + #include "commonDefs.hpp" #include "commonKernels.hpp" #define VERIFY_GPU_CORRECTNESS 0 size_t maxSampleSizeInMb = 64; -int numKernelRuns = 20; -int verboseResults = 0; +int numKernelRuns = 20; +int verboseResults = 0; -const char *memAllocTypeStr[MEMALLOC_TYPE_COUNT] = { - "Managed_Memory_With_Hints", - "Managed_Memory_With_Hints_FullyAsync", - "Managed_Memory_NoHints", - "Zero_Copy", - "Memcpy_HostMalloc_DeviceCudaMalloc", - "MemcpyAsync_HostMalloc_DeviceCudaMalloc", - "Memcpy_HostCudaHostAlloc_DeviceCudaMalloc", - "MemcpyAsync_HostCudaHostAlloc_DeviceCudaMalloc"}; +const char *memAllocTypeStr[MEMALLOC_TYPE_COUNT] = {"Managed_Memory_With_Hints", + "Managed_Memory_With_Hints_FullyAsync", + "Managed_Memory_NoHints", + "Zero_Copy", + "Memcpy_HostMalloc_DeviceCudaMalloc", + "MemcpyAsync_HostMalloc_DeviceCudaMalloc", + "Memcpy_HostCudaHostAlloc_DeviceCudaMalloc", + "MemcpyAsync_HostCudaHostAlloc_DeviceCudaMalloc"}; const char *memAllocTypeShortStr[MEMALLOC_TYPE_COUNT] = { - "UMhint", // Managed Memory With Hints - "UMhntAs", // Managed Memory With_Hints Async - "UMeasy", // Managed_Memory with No Hints - "0Copy", // Zero Copy - "MemCopy", // USE HOST PAGEABLE AND DEVICE_MEMORY - "CpAsync", // USE HOST PAGEABLE AND DEVICE_MEMORY ASYNC - "CpHpglk", // USE HOST PAGELOCKED AND DEVICE MEMORY - "CpPglAs" // USE HOST PAGELOCKED AND DEVICE MEMORY ASYNC + "UMhint", // Managed Memory With Hints + "UMhntAs", // Managed Memory With_Hints Async + "UMeasy", // Managed_Memory with No Hints + "0Copy", // Zero Copy + "MemCopy", // USE HOST PAGEABLE AND DEVICE_MEMORY + "CpAsync", // USE HOST PAGEABLE AND DEVICE_MEMORY ASYNC + "CpHpglk", // USE HOST PAGELOCKED AND DEVICE MEMORY + "CpPglAs" // USE HOST PAGELOCKED AND DEVICE MEMORY ASYNC }; -static float RandFloat(float low, float high) { - float t = (float)rand() / (float)RAND_MAX; - return (1.0f - t) * low + t * high; +static float RandFloat(float low, float high) +{ + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; } -void fillMatrixWithRandomValues(float *matrix, unsigned int matrixDim) { - unsigned int i, j; - for (i = 0; i < matrixDim; ++i) { - for (j = 0; j < matrixDim; ++j) { - matrix[j + i * matrixDim] = RandFloat(0.0f, 10.0f); +void fillMatrixWithRandomValues(float *matrix, unsigned int matrixDim) +{ + unsigned int i, j; + for (i = 0; i < matrixDim; ++i) { + for (j = 0; j < matrixDim; ++j) { + matrix[j + i * matrixDim] = RandFloat(0.0f, 10.0f); + } } - } } #if VERIFY_GPU_CORRECTNESS -void verifyMatrixMultiplyCorrectness(float *C, float *A, float *B, - unsigned int matrixDim) { - unsigned int i, j, k, numErrors = 0; - for (i = 0; i < matrixDim; ++i) { - for (j = 0; j < matrixDim; ++j) { - float result = 0.0f; - for (k = 0; k < matrixDim; ++k) { - result += A[k + i * matrixDim] * B[j + k * matrixDim]; - } - if 
(fabs(C[j + i * matrixDim] - result) > 0.001 * matrixDim) {
- printf("At [%u, %u]: Expected %f, Found %f\n", i, j, result, - C[j + i * matrixDim]);
- ++numErrors;
- }
+void verifyMatrixMultiplyCorrectness(float *C, float *A, float *B, unsigned int matrixDim)
+{
+ unsigned int i, j, k, numErrors = 0;
+ for (i = 0; i < matrixDim; ++i) {
+ for (j = 0; j < matrixDim; ++j) {
+ float result = 0.0f;
+ for (k = 0; k < matrixDim; ++k) {
+ result += A[k + i * matrixDim] * B[j + k * matrixDim];
+ }
+ if (fabs(C[j + i * matrixDim] - result) > 0.001 * matrixDim) {
+ printf("At [%u, %u]: Expected %f, Found %f\n", i, j, result, C[j + i * matrixDim]);
+ ++numErrors;
+ }
+ }
+ }
+ if (numErrors != 0) {
+ printf("%d value mismatches occurred\n", numErrors);
+ fflush(stdout);
+ exit(EXIT_FAILURE); // exit since value mismatches occurred
}
- }
- if (numErrors != 0) {
- printf("%d value mismatches occured\n", numErrors);
- fflush(stdout);
- exit(EXIT_FAILURE); // exit since value mismatches occured
- }
}
#endif
-void copyMatrix(float *dstMatrix, float *srcMatrix, unsigned int matrixDim) {
- size_t size = matrixDim * matrixDim * sizeof(float);
- memcpy(dstMatrix, srcMatrix, size);
+void copyMatrix(float *dstMatrix, float *srcMatrix, unsigned int matrixDim)
+{
+ size_t size = matrixDim * matrixDim * sizeof(float);
+ memcpy(dstMatrix, srcMatrix, size);
}
-void verifyMatrixData(float *expectedData, float *observedData, - unsigned int matrixDim) {
- unsigned int i, j, numErrors = 0;
- for (i = 0; i < matrixDim; ++i) {
- for (j = 0; j < matrixDim; ++j) {
- if (expectedData[j + i * matrixDim] != observedData[j + i * matrixDim]) {
- ++numErrors;
- if (verboseResults) {
- printf("At [%u, %u]: Expected %f, Found %f\n", i, j, - expectedData[j + i * matrixDim], - observedData[j + i * matrixDim]);
+void verifyMatrixData(float *expectedData, float *observedData, unsigned int matrixDim)
+{
+ unsigned int i, j, numErrors = 0;
+ for (i = 0; i < matrixDim; ++i) {
+ for (j = 0; j < matrixDim; ++j) {
+ if (expectedData[j + i * matrixDim] != observedData[j + i * matrixDim]) {
+ ++numErrors;
+ if (verboseResults) {
+ printf("At [%u, %u]: Expected %f, Found %f\n",
+ i,
+ j,
+ expectedData[j + i * matrixDim],
+ observedData[j + i * matrixDim]);
+ }
+ }
+ }
}
- }
}
- }
- if (numErrors != 0) {
- printf("%d value mismatches occured\n", numErrors);
- fflush(stdout);
- exit(EXIT_FAILURE); // exit since value mismatches occured
- }
+ if (numErrors != 0) {
+ printf("%d value mismatches occurred\n", numErrors);
+ fflush(stdout);
+ exit(EXIT_FAILURE); // exit since value mismatches occurred
+ }
}
#define BLOCK_SIZE 32
-__global__ void matrixMultiplyKernel(float *C, float *A, float *B, - unsigned int matrixDim) {
- // Block index
- int bx = blockIdx.x;
- int by = blockIdx.y;
+__global__ void matrixMultiplyKernel(float *C, float *A, float *B, unsigned int matrixDim)
+{
+ // Block index
+ int bx = blockIdx.x;
+ int by = blockIdx.y;
- // Thread index
- int tx = threadIdx.x;
- int ty = threadIdx.y;
+ // Thread index
+ int tx = threadIdx.x;
+ int ty = threadIdx.y;
- unsigned int wA = matrixDim;
- unsigned int wB = matrixDim;
+ unsigned int wA = matrixDim;
+ unsigned int wB = matrixDim;
- // Index of the first sub-matrix of A processed by the block
- int aBegin = matrixDim * BLOCK_SIZE * by;
+ // Index of the first sub-matrix of A processed by the block
+ int aBegin = matrixDim * BLOCK_SIZE * by;
- // Index of the last sub-matrix of A processed by the block
- int aEnd = aBegin + wA - 1;
+ // Index of the last sub-matrix of A processed by the block
+ int 
aEnd = aBegin + wA - 1; - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * bx; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * bx; - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - As[ty][tx] = A[a + wA * ty + tx]; - Bs[ty][tx] = B[b + wB * ty + tx]; + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[ty][tx] = A[a + wA * ty + tx]; + Bs[ty][tx] = B[b + wB * ty + tx]; - // Synchronize to make sure the matrices are loaded - __syncthreads(); + // Synchronize to make sure the matrices are loaded + __syncthreads(); - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[ty][k] * Bs[k][tx]; + for (int k = 0; k < BLOCK_SIZE; ++k) { + Csub += As[ty][k] * Bs[k][tx]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); } - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; - C[c + wB * ty + tx] = Csub; + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; + C[c + wB * ty + tx] = Csub; } -void runMatrixMultiplyKernel(unsigned int matrixDim, int allocType, - unsigned int numLoops, double *gpuLaunchCallsTimes, - double *gpuTransferToCallsTimes, - double *gpuTransferFromCallsTimes, - double *gpuLaunchAndTransferCallsTimes, - double *gpuLaunchTransferSyncTimes, - double *cpuAccessTimes, 
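// [Illustrative aside] Concrete index arithmetic for matrixMultiplyKernel
// above, assuming matrixDim = 64 (grid = 2x2 blocks of 32x32 threads) and
// looking at block (bx = 1, by = 1):
//
//     aBegin = 64 * 32 * 1 = 2048;      // first A tile of row-block 1
//     aEnd   = 2048 + 64 - 1 = 2111;    // last index in that tile row
//     bBegin = 32 * 1 = 32;             // first B tile of column-block 1
//     // the tile loop runs matrixDim / BLOCK_SIZE = 2 times (a = 2048, 2080),
//     // each pass staging one 32x32 tile of A and of B through shared memory.
//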
double *overallTimes, - int device_id) { - float *dptrA = NULL, *hptrA = NULL; - float *dptrB = NULL, *hptrB = NULL; - float *dptrC = NULL, *hptrC = NULL; - float *randValuesX = NULL, *randValuesY = NULL; - float *randValuesVerifyXmulY = NULL, *randValuesVerifyYmulX = NULL; - bool copyRequired = false, hintsRequired = false; - bool someTransferOpRequired; - bool isAsync = false; - cudaStream_t streamToRunOn; - unsigned int *latch; - size_t size = matrixDim * matrixDim * sizeof(float); - dim3 threads(32, 32); - dim3 grid(matrixDim / threads.x, matrixDim / threads.y); - StopWatchInterface *gpuLaunchCallsTimer = 0, *gpuTransferCallsTimer = 0; - StopWatchInterface *gpuSyncTimer = 0, *cpuAccessTimer = 0; - sdkCreateTimer(&gpuLaunchCallsTimer); - sdkCreateTimer(&gpuTransferCallsTimer); - sdkCreateTimer(&gpuSyncTimer); - sdkCreateTimer(&cpuAccessTimer); - unsigned int i; +void runMatrixMultiplyKernel(unsigned int matrixDim, + int allocType, + unsigned int numLoops, + double *gpuLaunchCallsTimes, + double *gpuTransferToCallsTimes, + double *gpuTransferFromCallsTimes, + double *gpuLaunchAndTransferCallsTimes, + double *gpuLaunchTransferSyncTimes, + double *cpuAccessTimes, + double *overallTimes, + int device_id) +{ + float *dptrA = NULL, *hptrA = NULL; + float *dptrB = NULL, *hptrB = NULL; + float *dptrC = NULL, *hptrC = NULL; + float *randValuesX = NULL, *randValuesY = NULL; + float *randValuesVerifyXmulY = NULL, *randValuesVerifyYmulX = NULL; + bool copyRequired = false, hintsRequired = false; + bool someTransferOpRequired; + bool isAsync = false; + cudaStream_t streamToRunOn; + unsigned int *latch; + size_t size = matrixDim * matrixDim * sizeof(float); + dim3 threads(32, 32); + dim3 grid(matrixDim / threads.x, matrixDim / threads.y); + StopWatchInterface *gpuLaunchCallsTimer = 0, *gpuTransferCallsTimer = 0; + StopWatchInterface *gpuSyncTimer = 0, *cpuAccessTimer = 0; + sdkCreateTimer(&gpuLaunchCallsTimer); + sdkCreateTimer(&gpuTransferCallsTimer); + sdkCreateTimer(&gpuSyncTimer); + sdkCreateTimer(&cpuAccessTimer); + unsigned int i; - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device_id)); - checkCudaErrors(cudaStreamCreate(&streamToRunOn)); + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device_id)); + checkCudaErrors(cudaStreamCreate(&streamToRunOn)); - randValuesX = (float *)malloc(size); - if (!randValuesX) { - exit(EXIT_FAILURE); // exit since memory allocation error - } - randValuesY = (float *)malloc(size); - if (!randValuesY) { - exit(EXIT_FAILURE); // exit since memory allocation error - } - randValuesVerifyXmulY = (float *)malloc(size); - if (!randValuesVerifyXmulY) { - exit(EXIT_FAILURE); // exit since memory allocation error - } - randValuesVerifyYmulX = (float *)malloc(size); - if (!randValuesVerifyYmulX) { - exit(EXIT_FAILURE); // exit since memory allocation error - } - checkCudaErrors(cudaMalloc(&dptrA, size)); - checkCudaErrors(cudaMalloc(&dptrB, size)); - checkCudaErrors(cudaMalloc(&dptrC, size)); + randValuesX = (float *)malloc(size); + if (!randValuesX) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + randValuesY = (float *)malloc(size); + if (!randValuesY) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + randValuesVerifyXmulY = (float *)malloc(size); + if (!randValuesVerifyXmulY) { + exit(EXIT_FAILURE); // exit since memory allocation error + } + randValuesVerifyYmulX = (float *)malloc(size); + if (!randValuesVerifyYmulX) { + exit(EXIT_FAILURE); // exit since 
memory allocation error
+ }
+ checkCudaErrors(cudaMalloc(&dptrA, size));
+ checkCudaErrors(cudaMalloc(&dptrB, size));
+ checkCudaErrors(cudaMalloc(&dptrC, size));
- fillMatrixWithRandomValues(randValuesX, matrixDim);
- fillMatrixWithRandomValues(randValuesY, matrixDim);
+ fillMatrixWithRandomValues(randValuesX, matrixDim);
+ fillMatrixWithRandomValues(randValuesY, matrixDim);
- checkCudaErrors( - cudaMemcpyAsync(dptrA, randValuesX, size, cudaMemcpyHostToDevice));
- checkCudaErrors( - cudaMemcpyAsync(dptrB, randValuesY, size, cudaMemcpyHostToDevice));
- matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrA, dptrB, matrixDim);
- checkCudaErrors(cudaMemcpyAsync(randValuesVerifyXmulY, dptrC, size, - cudaMemcpyDeviceToHost));
- checkCudaErrors(cudaStreamSynchronize(NULL));
- matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrB, dptrA, matrixDim);
- checkCudaErrors(cudaMemcpyAsync(randValuesVerifyYmulX, dptrC, size, - cudaMemcpyDeviceToHost));
- checkCudaErrors(cudaStreamSynchronize(NULL));
+ checkCudaErrors(cudaMemcpyAsync(dptrA, randValuesX, size, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpyAsync(dptrB, randValuesY, size, cudaMemcpyHostToDevice));
+ matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrA, dptrB, matrixDim);
+ checkCudaErrors(cudaMemcpyAsync(randValuesVerifyXmulY, dptrC, size, cudaMemcpyDeviceToHost));
+ checkCudaErrors(cudaStreamSynchronize(NULL));
+ matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrB, dptrA, matrixDim);
+ checkCudaErrors(cudaMemcpyAsync(randValuesVerifyYmulX, dptrC, size, cudaMemcpyDeviceToHost));
+ checkCudaErrors(cudaStreamSynchronize(NULL));
#if VERIFY_GPU_CORRECTNESS
- verifyMatrixMultiplyCorrectness(randValuesVerifyXmulY, randValuesX, - randValuesY, matrixDim);
- verifyMatrixMultiplyCorrectness(randValuesVerifyYmulX, randValuesY, - randValuesX, matrixDim);
+ verifyMatrixMultiplyCorrectness(randValuesVerifyXmulY, randValuesX, randValuesY, matrixDim);
+ verifyMatrixMultiplyCorrectness(randValuesVerifyYmulX, randValuesY, randValuesX, matrixDim);
#endif
- checkCudaErrors(cudaFree(dptrA));
- checkCudaErrors(cudaFree(dptrB));
- checkCudaErrors(cudaFree(dptrC));
+ checkCudaErrors(cudaFree(dptrA));
+ checkCudaErrors(cudaFree(dptrB));
+ checkCudaErrors(cudaFree(dptrC));
- checkCudaErrors(cudaMallocHost(&latch, sizeof(unsigned int)));
+ checkCudaErrors(cudaMallocHost(&latch, sizeof(unsigned int)));
- switch (allocType) {
+ switch (allocType) {
case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY: case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
- hptrA = (float *)malloc(size);
- if (!hptrA) { - exit(EXIT_FAILURE); // exit since memory allocation error - }
- hptrB = (float *)malloc(size);
- if (!hptrB) { - exit(EXIT_FAILURE); // exit since memory allocation error - }
- hptrC = (float *)malloc(size);
- if (!hptrC) { - exit(EXIT_FAILURE); // exit since memory allocation error - }
- checkCudaErrors(cudaMalloc(&dptrA, size));
- checkCudaErrors(cudaMalloc(&dptrB, size));
- checkCudaErrors(cudaMalloc(&dptrC, size));
- copyRequired = true;
- break;
+ hptrA = (float *)malloc(size);
+ if (!hptrA) { + exit(EXIT_FAILURE); // exit since memory allocation error + }
+ hptrB = (float *)malloc(size);
+ if (!hptrB) { + exit(EXIT_FAILURE); // exit since memory allocation error + }
+ hptrC = (float *)malloc(size);
+ if (!hptrC) { + exit(EXIT_FAILURE); // exit since memory allocation error + }
+ checkCudaErrors(cudaMalloc(&dptrA, size));
+ checkCudaErrors(cudaMalloc(&dptrB, size));
+ checkCudaErrors(cudaMalloc(&dptrC, size));
+ copyRequired = true;
+ break;
case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY: case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC: 
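// [Illustrative aside] The pageable and page-locked cases are benchmarked
// separately because cudaMemcpyAsync generally only overlaps with other work
// when the host buffer is page-locked; with plain malloc memory the copy is
// staged through an internal pinned buffer. A minimal sketch of the
// page-locked pattern (pointer and stream names hypothetical):
//
//     float *h = NULL;
//     checkCudaErrors(cudaMallocHost(&h, size));   // page-locked host memory
//     checkCudaErrors(cudaMemcpyAsync(d, h, size, cudaMemcpyHostToDevice, s));
//     checkCudaErrors(cudaStreamSynchronize(s));   // copy complete from here
//     checkCudaErrors(cudaFreeHost(h));
//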
- checkCudaErrors(cudaMallocHost(&hptrA, size)); - checkCudaErrors(cudaMallocHost(&hptrB, size)); - checkCudaErrors(cudaMallocHost(&hptrC, size)); - checkCudaErrors(cudaMalloc(&dptrA, size)); - checkCudaErrors(cudaMalloc(&dptrB, size)); - checkCudaErrors(cudaMalloc(&dptrC, size)); - copyRequired = true; - break; + checkCudaErrors(cudaMallocHost(&hptrA, size)); + checkCudaErrors(cudaMallocHost(&hptrB, size)); + checkCudaErrors(cudaMallocHost(&hptrC, size)); + checkCudaErrors(cudaMalloc(&dptrA, size)); + checkCudaErrors(cudaMalloc(&dptrB, size)); + checkCudaErrors(cudaMalloc(&dptrC, size)); + copyRequired = true; + break; case USE_ZERO_COPY: - checkCudaErrors(cudaMallocHost(&hptrA, size)); - checkCudaErrors(cudaMallocHost(&hptrB, size)); - checkCudaErrors(cudaMallocHost(&hptrC, size)); - checkCudaErrors(cudaHostGetDevicePointer(&dptrA, hptrA, 0)); - checkCudaErrors(cudaHostGetDevicePointer(&dptrB, hptrB, 0)); - checkCudaErrors(cudaHostGetDevicePointer(&dptrC, hptrC, 0)); - break; + checkCudaErrors(cudaMallocHost(&hptrA, size)); + checkCudaErrors(cudaMallocHost(&hptrB, size)); + checkCudaErrors(cudaMallocHost(&hptrC, size)); + checkCudaErrors(cudaHostGetDevicePointer(&dptrA, hptrA, 0)); + checkCudaErrors(cudaHostGetDevicePointer(&dptrB, hptrB, 0)); + checkCudaErrors(cudaHostGetDevicePointer(&dptrC, hptrC, 0)); + break; case USE_MANAGED_MEMORY: - checkCudaErrors(cudaMallocManaged(&dptrA, size)); - checkCudaErrors(cudaMallocManaged(&dptrB, size)); - checkCudaErrors(cudaMallocManaged(&dptrC, size)); - hptrA = dptrA; - hptrB = dptrB; - hptrC = dptrC; - break; - - case USE_MANAGED_MEMORY_WITH_HINTS: - case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC: - if (deviceProp.concurrentManagedAccess) { checkCudaErrors(cudaMallocManaged(&dptrA, size)); checkCudaErrors(cudaMallocManaged(&dptrB, size)); checkCudaErrors(cudaMallocManaged(&dptrC, size)); - checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); - } else { - checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost)); - checkCudaErrors(cudaMallocManaged(&dptrB, size, cudaMemAttachHost)); - checkCudaErrors(cudaMallocManaged(&dptrC, size, cudaMemAttachHost)); - } - hptrA = dptrA; - hptrB = dptrB; - hptrC = dptrC; - hintsRequired = true; - break; + hptrA = dptrA; + hptrB = dptrB; + hptrC = dptrC; + break; + + case USE_MANAGED_MEMORY_WITH_HINTS: + case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC: + if (deviceProp.concurrentManagedAccess) { + checkCudaErrors(cudaMallocManaged(&dptrA, size)); + checkCudaErrors(cudaMallocManaged(&dptrB, size)); + checkCudaErrors(cudaMallocManaged(&dptrC, size)); + checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); + checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); + } + else { + checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost)); + checkCudaErrors(cudaMallocManaged(&dptrB, size, cudaMemAttachHost)); + checkCudaErrors(cudaMallocManaged(&dptrC, size, cudaMemAttachHost)); + } + hptrA = dptrA; + hptrB = dptrB; + hptrC = dptrC; + hintsRequired = true; + break; default: - exit(EXIT_FAILURE); // exit with error - } - - if (allocType == USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC || - allocType == USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC || - allocType == USE_MANAGED_MEMORY_WITH_HINTS_ASYNC) { - isAsync = true; - } - - someTransferOpRequired = copyRequired 
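// [Illustrative aside] The two managed-memory hint paths this function
// switches between, condensed (both appear in full below):
//
//     if (deviceProp.concurrentManagedAccess) {
//         // devices with concurrent access: migrate pages explicitly
//         cudaMemPrefetchAsync(ptr, size, device_id, stream); // GPU phase
//         cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId);   // CPU phase
//     } else {
//         // otherwise: re-associate the allocation with the stream/host
//         cudaStreamAttachMemAsync(stream, ptr, 0, cudaMemAttachGlobal);
//         cudaStreamAttachMemAsync(stream, ptr, 0, cudaMemAttachHost);
//     }
//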
|| hintsRequired;
-
- // fill buffers with 0 to avoid any first access page-fault overheads.
- memset(hptrA, 0, size);
- memset(hptrB, 0, size);
- memset(hptrC, 0, size);
-
- for (i = 0; i < numLoops; i++) {
- cpuAccessTimes[i] = 0.0;
- gpuLaunchCallsTimes[i] = 0.0;
- gpuTransferToCallsTimes[i] = 0.0;
- gpuTransferFromCallsTimes[i] = 0.0;
-
- sdkStartTimer(&cpuAccessTimer);
- {
- copyMatrix(hptrA, (i & 0x1 == 0) ? randValuesX : randValuesY, matrixDim);
- copyMatrix(hptrB, (i & 0x1 == 0) ? randValuesY : randValuesX, matrixDim);
- }
- sdkStopTimer(&cpuAccessTimer);
- cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
- sdkResetTimer(&cpuAccessTimer);
-
- if (isAsync && hintsRequired) {
- *latch = 0;
- // Prevent any work on stream from starting until all work is pushed
- spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch);
+ exit(EXIT_FAILURE); // exit with error
}
- if (someTransferOpRequired) {
- sdkStartTimer(&gpuTransferCallsTimer);
- if (copyRequired) {
+ if (allocType == USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC
+ || allocType == USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC
+ || allocType == USE_MANAGED_MEMORY_WITH_HINTS_ASYNC) {
+ isAsync = true;
+ }
+
+ someTransferOpRequired = copyRequired || hintsRequired;
+
+ // fill buffers with 0 to avoid any first access page-fault overheads.
+ memset(hptrA, 0, size);
+ memset(hptrB, 0, size);
+ memset(hptrC, 0, size);
+
+ for (i = 0; i < numLoops; i++) {
+ cpuAccessTimes[i] = 0.0;
+ gpuLaunchCallsTimes[i] = 0.0;
+ gpuTransferToCallsTimes[i] = 0.0;
+ gpuTransferFromCallsTimes[i] = 0.0;
+
+ sdkStartTimer(&cpuAccessTimer);
+ {
+ copyMatrix(hptrA, ((i & 0x1) == 0) ? randValuesX : randValuesY, matrixDim);
+ copyMatrix(hptrB, ((i & 0x1) == 0) ? randValuesY : randValuesX, matrixDim);
+ }
+ sdkStopTimer(&cpuAccessTimer);
+ cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
+ sdkResetTimer(&cpuAccessTimer);
+
+ if (isAsync && hintsRequired) {
+ *latch = 0;
+ // Prevent any work on stream from starting until all work is pushed
+ spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch);
+ }
+
+ if (someTransferOpRequired) {
+ sdkStartTimer(&gpuTransferCallsTimer);
+ if (copyRequired) {
+ if (isAsync) {
+ checkCudaErrors(cudaMemcpyAsync(dptrA, hptrA, size, cudaMemcpyHostToDevice, streamToRunOn));
+ checkCudaErrors(cudaMemcpyAsync(dptrB, hptrB, size, cudaMemcpyHostToDevice, streamToRunOn));
+ }
+ else {
+ checkCudaErrors(cudaMemcpy(dptrA, hptrA, size, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(dptrB, hptrB, size, cudaMemcpyHostToDevice));
+ }
+ }
+ if (hintsRequired) {
+ if (deviceProp.concurrentManagedAccess) {
+ checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn));
+ checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn));
+ checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn));
+ }
+ else {
+ checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, cudaMemAttachGlobal));
+ checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0, cudaMemAttachGlobal));
+ checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0, cudaMemAttachGlobal));
+ }
+ if (!isAsync) {
+ checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+ }
+ }
+
+ sdkStopTimer(&gpuTransferCallsTimer);
+ gpuTransferToCallsTimes[i] += sdkGetAverageTimerValue(&gpuTransferCallsTimer);
+ sdkResetTimer(&gpuTransferCallsTimer);
+ }
+
+ sdkStartTimer(&gpuLaunchCallsTimer);
+ {
+ matrixMultiplyKernel<<<grid, threads, 0, streamToRunOn>>>(dptrC, dptrA, dptrB, matrixDim);
+ if (!isAsync) {
+ 
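// [Illustrative aside] How the fully-async variants are timed: the latch word
// (allocated with cudaMallocHost earlier) keeps streamToRunOn busy until every
// operation has been enqueued, so enqueue cost and GPU execution are measured
// separately:
//
//     *latch = 0;
//     spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch); // blocks stream
//     // ...enqueue copies, kernel, prefetches on streamToRunOn...
//     *latch = 1;                                  // release the spin kernel
//     cudaStreamSynchronize(streamToRunOn);        // time GPU work only
//
// The kernel polls through a volatile pointer, so the host store to the
// page-locked word becomes visible to the spinning GPU thread.
//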
checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+ }
+ }
+ sdkStopTimer(&gpuLaunchCallsTimer);
+
+ gpuLaunchCallsTimes[i] += sdkGetAverageTimerValue(&gpuLaunchCallsTimer);
+ sdkResetTimer(&gpuLaunchCallsTimer);
+
+ if (someTransferOpRequired) {
+ sdkStartTimer(&gpuTransferCallsTimer);
+ if (hintsRequired) {
+ if (deviceProp.concurrentManagedAccess) {
+ checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
+ checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
+ checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
+ }
+ else {
+ checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, cudaMemAttachHost));
+ checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0, cudaMemAttachHost));
+ checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0, cudaMemAttachHost));
+ }
+ if (!isAsync) {
+ checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+ }
+ }
+ if (copyRequired) {
+ if (isAsync) {
+ checkCudaErrors(cudaMemcpyAsync(hptrC, dptrC, size, cudaMemcpyDeviceToHost, streamToRunOn));
+ }
+ else {
+ checkCudaErrors(cudaMemcpy(hptrC, dptrC, size, cudaMemcpyDeviceToHost));
+ }
+ }
+ sdkStopTimer(&gpuTransferCallsTimer);
+ gpuTransferFromCallsTimes[i] += sdkGetAverageTimerValue(&gpuTransferCallsTimer);
+ sdkResetTimer(&gpuTransferCallsTimer);
+ }
+ gpuLaunchAndTransferCallsTimes[i] =
+ gpuLaunchCallsTimes[i] + gpuTransferToCallsTimes[i] + gpuTransferFromCallsTimes[i];
+ gpuLaunchTransferSyncTimes[i] = gpuLaunchAndTransferCallsTimes[i];
if (isAsync) {
- checkCudaErrors(cudaMemcpyAsync( - dptrA, hptrA, size, cudaMemcpyHostToDevice, streamToRunOn));
- checkCudaErrors(cudaMemcpyAsync( - dptrB, hptrB, size, cudaMemcpyHostToDevice, streamToRunOn));
- } else {
- checkCudaErrors( - cudaMemcpy(dptrA, hptrA, size, cudaMemcpyHostToDevice));
- checkCudaErrors( - cudaMemcpy(dptrB, hptrB, size, cudaMemcpyHostToDevice));
+ sdkStartTimer(&gpuSyncTimer);
+ {
+ if (hintsRequired) {
+ *latch = 1;
+ }
+ checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+ }
+ sdkStopTimer(&gpuSyncTimer);
+ gpuLaunchTransferSyncTimes[i] += sdkGetAverageTimerValue(&gpuSyncTimer);
+ sdkResetTimer(&gpuSyncTimer);
}
- if (hintsRequired) {
- if (deviceProp.concurrentManagedAccess) {
- checkCudaErrors( - cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn));
- checkCudaErrors( - cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn));
- checkCudaErrors( - cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn));
- } else {
- checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, - cudaMemAttachGlobal));
- checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0, - cudaMemAttachGlobal));
- checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0, - cudaMemAttachGlobal));
- }
- if (!isAsync) {
- checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
- }
- }
- sdkStopTimer(&gpuTransferCallsTimer);
- gpuTransferToCallsTimes[i] += - sdkGetAverageTimerValue(&gpuTransferCallsTimer);
- sdkResetTimer(&gpuTransferCallsTimer);
+ sdkStartTimer(&cpuAccessTimer);
+ {
+ verifyMatrixData(((i & 0x1) == 0) ? 
randValuesVerifyXmulY : randValuesVerifyYmulX, hptrC, matrixDim); + } + sdkStopTimer(&cpuAccessTimer); + cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer); + sdkResetTimer(&cpuAccessTimer); + overallTimes[i] = cpuAccessTimes[i] + gpuLaunchTransferSyncTimes[i]; } - sdkStartTimer(&gpuLaunchCallsTimer); - { - matrixMultiplyKernel<<>>( - dptrC, dptrA, dptrB, matrixDim); - if (!isAsync) { - checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); - } - } - sdkStopTimer(&gpuLaunchCallsTimer); - - gpuLaunchCallsTimes[i] += sdkGetAverageTimerValue(&gpuLaunchCallsTimer); - sdkResetTimer(&gpuLaunchCallsTimer); - - if (someTransferOpRequired) { - sdkStartTimer(&gpuTransferCallsTimer); - if (hintsRequired) { - if (deviceProp.concurrentManagedAccess) { - checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); - } else { - checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, - cudaMemAttachHost)); - checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0, - cudaMemAttachHost)); - checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0, - cudaMemAttachHost)); - } - if (!isAsync) { - checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); - } - } - if (copyRequired) { - if (isAsync) { - checkCudaErrors(cudaMemcpyAsync( - hptrC, dptrC, size, cudaMemcpyDeviceToHost, streamToRunOn)); - } else { - checkCudaErrors( - cudaMemcpy(hptrC, dptrC, size, cudaMemcpyDeviceToHost)); - } - } - sdkStopTimer(&gpuTransferCallsTimer); - gpuTransferFromCallsTimes[i] += - sdkGetAverageTimerValue(&gpuTransferCallsTimer); - sdkResetTimer(&gpuTransferCallsTimer); - } - gpuLaunchAndTransferCallsTimes[i] = gpuLaunchCallsTimes[i] + - gpuTransferToCallsTimes[i] + - gpuTransferFromCallsTimes[i]; - gpuLaunchTransferSyncTimes[i] = gpuLaunchAndTransferCallsTimes[i]; - if (isAsync) { - sdkStartTimer(&gpuSyncTimer); - { - if (hintsRequired) { - *latch = 1; - } - checkCudaErrors(cudaStreamSynchronize(streamToRunOn)); - } - sdkStopTimer(&gpuSyncTimer); - gpuLaunchTransferSyncTimes[i] += sdkGetAverageTimerValue(&gpuSyncTimer); - sdkResetTimer(&gpuSyncTimer); - } - - sdkStartTimer(&cpuAccessTimer); - { - verifyMatrixData( - (i & 0x1 == 0) ? 
randValuesVerifyXmulY : randValuesVerifyYmulX, hptrC, - matrixDim); - } - sdkStopTimer(&cpuAccessTimer); - cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer); - sdkResetTimer(&cpuAccessTimer); - overallTimes[i] = cpuAccessTimes[i] + gpuLaunchTransferSyncTimes[i]; - } - - switch (allocType) { + switch (allocType) { case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY: case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC: - free(hptrA); - free(hptrB); - free(hptrC); - checkCudaErrors(cudaFree(dptrA)); - checkCudaErrors(cudaFree(dptrB)); - checkCudaErrors(cudaFree(dptrC)); - break; + free(hptrA); + free(hptrB); + free(hptrC); + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + break; case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY: case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC: - checkCudaErrors(cudaFreeHost(hptrA)); - checkCudaErrors(cudaFreeHost(hptrB)); - checkCudaErrors(cudaFreeHost(hptrC)); - checkCudaErrors(cudaFree(dptrA)); - checkCudaErrors(cudaFree(dptrB)); - checkCudaErrors(cudaFree(dptrC)); - break; + checkCudaErrors(cudaFreeHost(hptrA)); + checkCudaErrors(cudaFreeHost(hptrB)); + checkCudaErrors(cudaFreeHost(hptrC)); + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + break; case USE_ZERO_COPY: - checkCudaErrors(cudaFreeHost(hptrA)); - checkCudaErrors(cudaFreeHost(hptrB)); - checkCudaErrors(cudaFreeHost(hptrC)); - break; + checkCudaErrors(cudaFreeHost(hptrA)); + checkCudaErrors(cudaFreeHost(hptrB)); + checkCudaErrors(cudaFreeHost(hptrC)); + break; case USE_MANAGED_MEMORY: case USE_MANAGED_MEMORY_WITH_HINTS: case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC: - checkCudaErrors(cudaFree(dptrA)); - checkCudaErrors(cudaFree(dptrB)); - checkCudaErrors(cudaFree(dptrC)); - break; + checkCudaErrors(cudaFree(dptrA)); + checkCudaErrors(cudaFree(dptrB)); + checkCudaErrors(cudaFree(dptrC)); + break; default: - exit(EXIT_FAILURE); // exit due to error - } + exit(EXIT_FAILURE); // exit due to error + } - checkCudaErrors(cudaStreamDestroy(streamToRunOn)); - checkCudaErrors(cudaFreeHost(latch)); - free(randValuesX); - free(randValuesY); - free(randValuesVerifyXmulY); - free(randValuesVerifyYmulX); - sdkDeleteTimer(&gpuLaunchCallsTimer); - sdkDeleteTimer(&gpuTransferCallsTimer); - sdkDeleteTimer(&gpuSyncTimer); - sdkDeleteTimer(&cpuAccessTimer); + checkCudaErrors(cudaStreamDestroy(streamToRunOn)); + checkCudaErrors(cudaFreeHost(latch)); + free(randValuesX); + free(randValuesY); + free(randValuesVerifyXmulY); + free(randValuesVerifyYmulX); + sdkDeleteTimer(&gpuLaunchCallsTimer); + sdkDeleteTimer(&gpuTransferCallsTimer); + sdkDeleteTimer(&gpuSyncTimer); + sdkDeleteTimer(&cpuAccessTimer); } void matrixMultiplyPerfRunner(bool reportAsBandwidth, bool print_launch_transfer_results, - bool print_std_deviation, int device_id) { - int i; - unsigned int minMatrixDim = 32; - unsigned int multiplierDim = 2; - unsigned int matrixDim; - unsigned int minSize = minMatrixDim * minMatrixDim * sizeof(float); - unsigned int maxSize = - (maxSampleSizeInMb * ONE_MB) / - 4; // 3 buffers are used, but dividing by 4 (power of 2) - unsigned int multiplier = multiplierDim * multiplierDim; - unsigned int numSizesToTest; + bool print_std_deviation, + int device_id) +{ + int i; + unsigned int minMatrixDim = 32; + unsigned int multiplierDim = 2; + unsigned int matrixDim; + unsigned int minSize = minMatrixDim * minMatrixDim * sizeof(float); + unsigned int maxSize = (maxSampleSizeInMb * ONE_MB) / 4; // 3 buffers are used, but dividing 
by 4 (power of 2) + unsigned int multiplier = multiplierDim * multiplierDim; + unsigned int numSizesToTest; - struct testResults *results; - struct resultsData *gpuLaunchCallsTimes; - struct resultsData *gpuTransferToCallsTimes; - struct resultsData *gpuTransferFromCallsTimes; - struct resultsData *gpuLaunchAndTransferCallsTimes; - struct resultsData *gpuLaunchTransferSyncTimes; - struct resultsData *cpuAccessTimes; - struct resultsData *overallTimes; - unsigned long *sizesToTest; - unsigned int j; + struct testResults *results; + struct resultsData *gpuLaunchCallsTimes; + struct resultsData *gpuTransferToCallsTimes; + struct resultsData *gpuTransferFromCallsTimes; + struct resultsData *gpuLaunchAndTransferCallsTimes; + struct resultsData *gpuLaunchTransferSyncTimes; + struct resultsData *cpuAccessTimes; + struct resultsData *overallTimes; + unsigned long *sizesToTest; + unsigned int j; - numSizesToTest = findNumSizesToTest(minSize, maxSize, multiplier); + numSizesToTest = findNumSizesToTest(minSize, maxSize, multiplier); - createAndInitTestResults(&results, "matrixMultiplyPerf", numKernelRuns, - numSizesToTest); + createAndInitTestResults(&results, "matrixMultiplyPerf", numKernelRuns, numSizesToTest); - sizesToTest = getPtrSizesToTest(results); + sizesToTest = getPtrSizesToTest(results); - createResultDataAndAddToTestResults(&gpuLaunchCallsTimes, results, - "GPU Kernel Launch Call Time", false, - reportAsBandwidth); - createResultDataAndAddToTestResults(&gpuTransferToCallsTimes, results, - "CPU to GPU Transfer Calls Time", false, - reportAsBandwidth); - createResultDataAndAddToTestResults(&gpuTransferFromCallsTimes, results, - "GPU to CPU Transfer Calls Time", false, - reportAsBandwidth); - createResultDataAndAddToTestResults(&gpuLaunchAndTransferCallsTimes, results, - "GPU Launch and Transfer Calls Time", - false, reportAsBandwidth); - createResultDataAndAddToTestResults(&gpuLaunchTransferSyncTimes, results, - "GPU Launch Transfer and Sync Time", - false, reportAsBandwidth); - createResultDataAndAddToTestResults( - &cpuAccessTimes, results, "CPU Access Time", false, reportAsBandwidth); - createResultDataAndAddToTestResults(&overallTimes, results, "Overall Time", - false, reportAsBandwidth); + createResultDataAndAddToTestResults( + &gpuLaunchCallsTimes, results, "GPU Kernel Launch Call Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults( + &gpuTransferToCallsTimes, results, "CPU to GPU Transfer Calls Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults( + &gpuTransferFromCallsTimes, results, "GPU to CPU Transfer Calls Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults( + &gpuLaunchAndTransferCallsTimes, results, "GPU Launch and Transfer Calls Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults( + &gpuLaunchTransferSyncTimes, results, "GPU Launch Transfer and Sync Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults(&cpuAccessTimes, results, "CPU Access Time", false, reportAsBandwidth); + createResultDataAndAddToTestResults(&overallTimes, results, "Overall Time", false, reportAsBandwidth); - printf("Running "); - for (matrixDim = minMatrixDim, j = 0; - matrixDim * matrixDim <= maxSize / sizeof(float); - matrixDim *= multiplierDim, ++j) { - sizesToTest[j] = matrixDim * matrixDim * sizeof(float); - for (i = MEMALLOC_TYPE_START; i <= MEMALLOC_TYPE_END; i++) { - printf("."); - fflush(stdout); - runMatrixMultiplyKernel( - matrixDim, i, numKernelRuns, - getPtrRunTimesInMs(gpuLaunchCallsTimes, i, 
j), - getPtrRunTimesInMs(gpuTransferToCallsTimes, i, j), - getPtrRunTimesInMs(gpuTransferFromCallsTimes, i, j), - getPtrRunTimesInMs(gpuLaunchAndTransferCallsTimes, i, j), - getPtrRunTimesInMs(gpuLaunchTransferSyncTimes, i, j), - getPtrRunTimesInMs(cpuAccessTimes, i, j), - getPtrRunTimesInMs(overallTimes, i, j), device_id); + printf("Running "); + for (matrixDim = minMatrixDim, j = 0; matrixDim * matrixDim <= maxSize / sizeof(float); + matrixDim *= multiplierDim, ++j) { + sizesToTest[j] = matrixDim * matrixDim * sizeof(float); + for (i = MEMALLOC_TYPE_START; i <= MEMALLOC_TYPE_END; i++) { + printf("."); + fflush(stdout); + runMatrixMultiplyKernel(matrixDim, + i, + numKernelRuns, + getPtrRunTimesInMs(gpuLaunchCallsTimes, i, j), + getPtrRunTimesInMs(gpuTransferToCallsTimes, i, j), + getPtrRunTimesInMs(gpuTransferFromCallsTimes, i, j), + getPtrRunTimesInMs(gpuLaunchAndTransferCallsTimes, i, j), + getPtrRunTimesInMs(gpuLaunchTransferSyncTimes, i, j), + getPtrRunTimesInMs(cpuAccessTimes, i, j), + getPtrRunTimesInMs(overallTimes, i, j), + device_id); + } } - } - printf("\n"); - printResults(results, print_launch_transfer_results, print_std_deviation); - freeTestResultsAndAllResultsData(results); + printf("\n"); + printResults(results, print_launch_transfer_results, print_std_deviation); + freeTestResultsAndAllResultsData(results); } -static void usage() { - printf( - "./cudaMemoryTypesPerf [-device=] [-reportAsBandwidth] " - "[-print-launch-transfer-results] [-print-std-deviation] [-verbose]\n"); - printf("Options:\n"); - printf( - "-reportAsBandwidth: By default time taken is printed, this " - "option allows to instead print bandwidth.\n"); - printf( - "-print-launch-transfer-results: By default overall results are printed, " - "this option allows to print data transfers and kernel time as well.\n"); - printf( - "-print-std-deviation: Prints std deviation of the results.\n"); - printf( - "-kernel-iterations=: Number of times the kernel tests should " - "be run[default is 100 iterations].\n"); - printf( - "-device=: Allows to pass GPU Device ID on which " - "the tests will be run.\n"); - printf("-verbose: Prints highly verbose output.\n"); +static void usage() +{ + printf("./cudaMemoryTypesPerf [-device=] [-reportAsBandwidth] " + "[-print-launch-transfer-results] [-print-std-deviation] [-verbose]\n"); + printf("Options:\n"); + printf("-reportAsBandwidth: By default time taken is printed, this " + "option allows to instead print bandwidth.\n"); + printf("-print-launch-transfer-results: By default overall results are printed, " + "this option allows to print data transfers and kernel time as well.\n"); + printf("-print-std-deviation: Prints std deviation of the results.\n"); + printf("-kernel-iterations=: Number of times the kernel tests should " + "be run[default is 100 iterations].\n"); + printf("-device=: Allows to pass GPU Device ID on which " + "the tests will be run.\n"); + printf("-verbose: Prints highly verbose output.\n"); } -int main(int argc, char **argv) { - bool reportAsBandwidth = false; - bool print_launch_transfer_results = false; - bool print_std_deviation = false; +int main(int argc, char **argv) +{ + bool reportAsBandwidth = false; + bool print_launch_transfer_results = false; + bool print_std_deviation = false; - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "h")) { - usage(); - printf("&&&& %s WAIVED\n", argv[0]); - exit(EXIT_WAIVED); - } + if (checkCmdLineFlag(argc, (const char **)argv, "help") || 
checkCmdLineFlag(argc, (const char **)argv, "h")) { + usage(); + printf("&&&& %s WAIVED\n", argv[0]); + exit(EXIT_WAIVED); + } - if (checkCmdLineFlag(argc, (const char **)argv, "reportAsBandwidth")) { - reportAsBandwidth = true; - } + if (checkCmdLineFlag(argc, (const char **)argv, "reportAsBandwidth")) { + reportAsBandwidth = true; + } - if (checkCmdLineFlag(argc, (const char **)argv, - "print-launch-transfer-results")) { - print_launch_transfer_results = true; - } + if (checkCmdLineFlag(argc, (const char **)argv, "print-launch-transfer-results")) { + print_launch_transfer_results = true; + } - if (checkCmdLineFlag(argc, (const char **)argv, "print-std-deviation")) { - print_std_deviation = true; - } + if (checkCmdLineFlag(argc, (const char **)argv, "print-std-deviation")) { + print_std_deviation = true; + } - if (checkCmdLineFlag(argc, (const char **)argv, "kernel-iterations")) { - numKernelRuns = - getCmdLineArgumentInt(argc, (const char **)argv, "kernel-iterations"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "kernel-iterations")) { + numKernelRuns = getCmdLineArgumentInt(argc, (const char **)argv, "kernel-iterations"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) { - verboseResults = 1; - } + if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) { + verboseResults = 1; + } - int device_id = findCudaDevice(argc, (const char **)argv); + int device_id = findCudaDevice(argc, (const char **)argv); - matrixMultiplyPerfRunner(reportAsBandwidth, print_launch_transfer_results, - print_std_deviation, device_id); + matrixMultiplyPerfRunner(reportAsBandwidth, print_launch_transfer_results, print_std_deviation, device_id); - printf( - "\nNOTE: The CUDA Samples are not meant for performance measurements. " - "Results may vary when GPU Boost is enabled.\n"); - exit(EXIT_SUCCESS); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/6_Performance/alignedTypes/alignedTypes.cu b/Samples/6_Performance/alignedTypes/alignedTypes.cu index 87332f00..d7226317 100644 --- a/Samples/6_Performance/alignedTypes/alignedTypes.cu +++ b/Samples/6_Performance/alignedTypes/alignedTypes.cu @@ -41,8 +41,8 @@ #include // includes, project -#include // helper functions for CUDA error checking and initialization -#include // helper utility functions +#include // helper functions for CUDA error checking and initialization +#include // helper utility functions //////////////////////////////////////////////////////////////////////////////// // Misaligned types @@ -51,46 +51,50 @@ typedef unsigned char uint8; typedef unsigned short int uint16; -typedef struct { - unsigned char r, g, b, a; +typedef struct +{ + unsigned char r, g, b, a; } RGBA8_misaligned; -typedef struct { - unsigned int l, a; +typedef struct +{ + unsigned int l, a; } LA32_misaligned; -typedef struct { - unsigned int r, g, b; +typedef struct +{ + unsigned int r, g, b; } RGB32_misaligned; -typedef struct { - unsigned int r, g, b, a; +typedef struct +{ + unsigned int r, g, b, a; } RGBA32_misaligned; //////////////////////////////////////////////////////////////////////////////// // Aligned types //////////////////////////////////////////////////////////////////////////////// -typedef struct __align__(4) { - unsigned char r, g, b, a; -} -RGBA8; +typedef struct __align__(4) +{ + unsigned char r, g, b, a; +} RGBA8; typedef unsigned int I32; -typedef struct __align__(8) { - unsigned int l, a; -} -LA32; +typedef struct __align__(8) +{ + unsigned int l, a; +} LA32; -typedef struct __align__(16) { - unsigned int r, g, b; -} -RGB32; +typedef struct __align__(16) +{ + unsigned int r, g, b; +} RGB32; -typedef struct __align__(16) { - unsigned int r, g, b, a; -} -RGBA32; +typedef struct __align__(16) +{ + unsigned int r, g, b, a; +} RGBA32; //////////////////////////////////////////////////////////////////////////////// // Because G80 class hardware natively supports global memory operations @@ -101,10 +105,10 @@ RGBA32; // "Structure of arrays" storage strategy offers best performance // in general case. See section 5.1.2 of the Programming Guide. //////////////////////////////////////////////////////////////////////////////// -typedef struct __align__(16) { - RGBA32 c1, c2; -} -RGBA32_2; +typedef struct __align__(16) +{ + RGBA32 c1, c2; +} RGBA32_2; //////////////////////////////////////////////////////////////////////////////// // Common host and device functions @@ -126,14 +130,14 @@ int iAlignDown(int a, int b) { return a - a % b; } // Copy is carried out on per-element basis, // so it's not per-byte in case of padded structures. 
////////////////////////////////////////////////////////////////////////////////
-template <class TData>
-__global__ void testKernel(TData *d_odata, TData *d_idata, int numElements) {
- const int tid = blockDim.x * blockIdx.x + threadIdx.x;
- const int numThreads = blockDim.x * gridDim.x;
+template <class TData> __global__ void testKernel(TData *d_odata, TData *d_idata, int numElements)
+{
+ const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+ const int numThreads = blockDim.x * gridDim.x;
- for (int pos = tid; pos < numElements; pos += numThreads) {
- d_odata[pos] = d_idata[pos];
- }
+ for (int pos = tid; pos < numElements; pos += numThreads) {
+ d_odata[pos] = d_idata[pos];
+ }
}
////////////////////////////////////////////////////////////////////////////////
@@ -144,171 +148,166 @@ __global__ void testKernel(TData *d_odata, TData *d_idata, int numElements) {
// is undefined, since padding is merely a placeholder
// and doesn't contain any user data.
////////////////////////////////////////////////////////////////////////////////
-template <class TData>
-int testCPU(TData *h_odata, TData *h_idata, int numElements, - int packedElementSize) {
- for (int pos = 0; pos < numElements; pos++) {
- TData src = h_idata[pos];
- TData dst = h_odata[pos];
+template <class TData> int testCPU(TData *h_odata, TData *h_idata, int numElements, int packedElementSize)
+{
+ for (int pos = 0; pos < numElements; pos++) {
+ TData src = h_idata[pos];
+ TData dst = h_odata[pos];
- for (int i = 0; i < packedElementSize; i++)
- if (((char *)&src)[i] != ((char *)&dst)[i]) { - return 0; - }
- }
+ for (int i = 0; i < packedElementSize; i++)
+ if (((char *)&src)[i] != ((char *)&dst)[i]) { + return 0; + }
+ }
- return 1;
+ return 1;
}
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Memory chunk size in bytes. 
Reused for test -const int MEM_SIZE = 50000000; +const int MEM_SIZE = 50000000; const int NUM_ITERATIONS = 32; // GPU input and output data unsigned char *d_idata, *d_odata; // CPU input data and instance of GPU output data -unsigned char *h_idataCPU, *h_odataGPU; +unsigned char *h_idataCPU, *h_odataGPU; StopWatchInterface *hTimer = NULL; -template -int runTest(int packedElementSize, int memory_size) { - const int totalMemSizeAligned = iAlignDown(memory_size, sizeof(TData)); - const int numElements = iDivDown(memory_size, sizeof(TData)); +template int runTest(int packedElementSize, int memory_size) +{ + const int totalMemSizeAligned = iAlignDown(memory_size, sizeof(TData)); + const int numElements = iDivDown(memory_size, sizeof(TData)); - // Clean output buffer before current test - checkCudaErrors(cudaMemset(d_odata, 0, memory_size)); - // Run test - checkCudaErrors(cudaDeviceSynchronize()); - sdkResetTimer(&hTimer); - sdkStartTimer(&hTimer); + // Clean output buffer before current test + checkCudaErrors(cudaMemset(d_odata, 0, memory_size)); + // Run test + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); - for (int i = 0; i < NUM_ITERATIONS; i++) { - testKernel - <<<64, 256>>>((TData *)d_odata, (TData *)d_idata, numElements); - getLastCudaError("testKernel() execution failed\n"); - } + for (int i = 0; i < NUM_ITERATIONS; i++) { + testKernel<<<64, 256>>>((TData *)d_odata, (TData *)d_idata, numElements); + getLastCudaError("testKernel() execution failed\n"); + } - checkCudaErrors(cudaDeviceSynchronize()); - sdkStopTimer(&hTimer); - double gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", gpuTime, - (double)totalMemSizeAligned / (gpuTime * 0.001 * 1073741824.0)); + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + double gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; + printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", + gpuTime, + (double)totalMemSizeAligned / (gpuTime * 0.001 * 1073741824.0)); - // Read back GPU results and run validation - checkCudaErrors( - cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost)); - int flag = testCPU((TData *)h_odataGPU, (TData *)h_idataCPU, numElements, - packedElementSize); + // Read back GPU results and run validation + checkCudaErrors(cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost)); + int flag = testCPU((TData *)h_odataGPU, (TData *)h_idataCPU, numElements, packedElementSize); - printf(flag ? "\tTEST OK\n" : "\tTEST FAILURE\n"); + printf(flag ? 
"\tTEST OK\n" : "\tTEST FAILURE\n"); - return !flag; + return !flag; } -int main(int argc, char **argv) { - int i, nTotalFailures = 0; +int main(int argc, char **argv) +{ + int i, nTotalFailures = 0; - int devID; - cudaDeviceProp deviceProp; - printf("[%s] - Starting...\n", argv[0]); + int devID; + cudaDeviceProp deviceProp; + printf("[%s] - Starting...\n", argv[0]); - // find first CUDA device - devID = findCudaDevice(argc, (const char **)argv); + // find first CUDA device + devID = findCudaDevice(argc, (const char **)argv); - // get number of SMs on this GPU - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name, - deviceProp.multiProcessorCount, - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - deviceProp.multiProcessorCount); + // get number of SMs on this GPU + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", + deviceProp.name, + deviceProp.multiProcessorCount, + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); - // Anything that is less than 192 Cores will have a scaled down workload - float scale_factor = - max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - (float)deviceProp.multiProcessorCount)), - 1.0f); + // Anything that is less than 192 Cores will have a scaled down workload + float scale_factor = max( + (192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), + 1.0f); - int MemorySize = (int)(MEM_SIZE / scale_factor) & - 0xffffff00; // force multiple of 256 bytes + int MemorySize = (int)(MEM_SIZE / scale_factor) & 0xffffff00; // force multiple of 256 bytes - printf("> Compute scaling value = %4.2f\n", scale_factor); - printf("> Memory Size = %d\n", MemorySize); + printf("> Compute scaling value = %4.2f\n", scale_factor); + printf("> Memory Size = %d\n", MemorySize); - sdkCreateTimer(&hTimer); + sdkCreateTimer(&hTimer); - printf("Allocating memory...\n"); - h_idataCPU = (unsigned char *)malloc(MemorySize); - h_odataGPU = (unsigned char *)malloc(MemorySize); - checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize)); - checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize)); + printf("Allocating memory...\n"); + h_idataCPU = (unsigned char *)malloc(MemorySize); + h_odataGPU = (unsigned char *)malloc(MemorySize); + checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize)); + checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize)); - printf("Generating host input data array...\n"); + printf("Generating host input data array...\n"); - for (i = 0; i < MemorySize; i++) { - h_idataCPU[i] = (i & 0xFF) + 1; - } + for (i = 0; i < MemorySize; i++) { + h_idataCPU[i] = (i & 0xFF) + 1; + } - printf("Uploading input data to GPU memory...\n"); - checkCudaErrors( - cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice)); + printf("Uploading input data to GPU memory...\n"); + checkCudaErrors(cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice)); - printf("Testing misaligned types...\n"); - printf("uint8...\n"); - nTotalFailures += runTest(1, MemorySize); + printf("Testing misaligned types...\n"); + printf("uint8...\n"); + nTotalFailures += runTest(1, MemorySize); - printf("uint16...\n"); - nTotalFailures += runTest(2, MemorySize); + printf("uint16...\n"); + 
nTotalFailures += runTest<uint16>(2, MemorySize); - printf("RGBA8_misaligned...\n"); - nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize); + printf("RGBA8_misaligned...\n"); + nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize); - printf("LA32_misaligned...\n"); - nTotalFailures += runTest<LA32_misaligned>(8, MemorySize); + printf("LA32_misaligned...\n"); + nTotalFailures += runTest<LA32_misaligned>(8, MemorySize); - printf("RGB32_misaligned...\n"); - nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize); + printf("RGB32_misaligned...\n"); + nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize); - printf("RGBA32_misaligned...\n"); - nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize); + printf("RGBA32_misaligned...\n"); + nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize); - printf("Testing aligned types...\n"); - printf("RGBA8...\n"); - nTotalFailures += runTest<RGBA8>(4, MemorySize); + printf("Testing aligned types...\n"); + printf("RGBA8...\n"); + nTotalFailures += runTest<RGBA8>(4, MemorySize); - printf("I32...\n"); - nTotalFailures += runTest<I32>(4, MemorySize); + printf("I32...\n"); + nTotalFailures += runTest<I32>(4, MemorySize); - printf("LA32...\n"); - nTotalFailures += runTest<LA32>(8, MemorySize); + printf("LA32...\n"); + nTotalFailures += runTest<LA32>(8, MemorySize); - printf("RGB32...\n"); - nTotalFailures += runTest<RGB32>(12, MemorySize); + printf("RGB32...\n"); + nTotalFailures += runTest<RGB32>(12, MemorySize); - printf("RGBA32...\n"); - nTotalFailures += runTest<RGBA32>(16, MemorySize); + printf("RGBA32...\n"); + nTotalFailures += runTest<RGBA32>(16, MemorySize); - printf("RGBA32_2...\n"); - nTotalFailures += runTest<RGBA32_2>(32, MemorySize); + printf("RGBA32_2...\n"); + nTotalFailures += runTest<RGBA32_2>(32, MemorySize); - printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures); + printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures); - printf("Shutting down...\n"); - checkCudaErrors(cudaFree(d_idata)); - checkCudaErrors(cudaFree(d_odata)); - free(h_odataGPU); - free(h_idataCPU); + printf("Shutting down...\n"); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); + free(h_odataGPU); + free(h_idataCPU); - sdkDeleteTimer(&hTimer); + sdkDeleteTimer(&hTimer); - if (nTotalFailures != 0) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + if (nTotalFailures != 0) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } - printf("Test passed\n"); - exit(EXIT_SUCCESS); + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/6_Performance/alignedTypes/doc/alignedTypes.txt b/Samples/6_Performance/alignedTypes/doc/alignedTypes.txt index 9c959e35..4073a143 100644 --- a/Samples/6_Performance/alignedTypes/doc/alignedTypes.txt +++ b/Samples/6_Performance/alignedTypes/doc/alignedTypes.txt @@ -6,5 +6,5 @@ typedef struct{ float b; } testStructure; -Without alignment specification the compiler will not automatically use a single 64-bit global memory load/store instruction, but will emit two 32-bit load instructions instead. -This significantly impacts aggregate load/store bandwidth, since the latter breaks coalescing rules because of incontiguous memory access pattern. Refer to section 5.1.2.1 of the Programming Guide. \ No newline at end of file +Without an alignment specification the compiler will not automatically use a single 64-bit global memory load/store instruction, but will emit two 32-bit load instructions instead. +This significantly impacts aggregate load/store bandwidth, since the latter breaks coalescing rules because of the non-contiguous memory access pattern. Refer to section 5.1.2.1 of the Programming Guide.
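For context on the __align__ qualifiers reformatted above: without an alignment specifier, an 8-byte struct such as the testStructure in doc/alignedTypes.txt is moved with two 32-bit transactions, which breaks coalescing; __align__(8) lets the compiler issue a single 64-bit load/store per element. A minimal CUDA sketch, not part of this patch (the copyAligned kernel name and grid-stride loop are illustrative):

#include <cuda_runtime.h>

// Without the qualifier, the compiler may split each element into two
// 32-bit global memory transactions.
typedef struct {
    float a, b;
} testStructure;

// __align__(8) guarantees 8-byte alignment, enabling one 64-bit load/store.
typedef struct __align__(8) {
    float a, b;
} testStructureAligned;

// Grid-stride copy: with the aligned type, each assignment compiles to a
// single 64-bit global load and store, keeping accesses coalesced.
__global__ void copyAligned(testStructureAligned *dst, const testStructureAligned *src, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
        dst[i] = src[i];
    }
}

The same reasoning drives the misaligned-versus-aligned test pairs in alignedTypes.cu: each pair copies the same number of bytes, so any throughput gap comes purely from the access pattern.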
diff --git a/Samples/6_Performance/cudaGraphsPerfScaling/README.md b/Samples/6_Performance/cudaGraphsPerfScaling/README.md index 23cee264..494c9433 100644 --- a/Samples/6_Performance/cudaGraphsPerfScaling/README.md +++ b/Samples/6_Performance/cudaGraphsPerfScaling/README.md @@ -30,4 +30,3 @@ cudaStreamBeginCapture, cudaGraphInstantiate, cudaGraphLaunch, cudaGraphUpload Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## References (for more details) - diff --git a/Samples/6_Performance/cudaGraphsPerfScaling/cudaGraphPerfScaling.cu b/Samples/6_Performance/cudaGraphsPerfScaling/cudaGraphPerfScaling.cu index 9699c83f..55372bdf 100644 --- a/Samples/6_Performance/cudaGraphsPerfScaling/cudaGraphPerfScaling.cu +++ b/Samples/6_Performance/cudaGraphsPerfScaling/cudaGraphPerfScaling.cu @@ -31,10 +31,10 @@ #define USE_NVTX +#include #include #include #include -#include typedef volatile int LatchType; @@ -43,75 +43,72 @@ std::chrono::time_point getCpuTime() return std::chrono::high_resolution_clock::now(); } -template -float getMicroSecondDuration(T start, T end) +template float getMicroSecondDuration(T start, T end) { - return std::chrono::duration_cast(end-start).count() *.001f; + return std::chrono::duration_cast(end - start).count() * .001f; } float getAsyncMicroSecondDuration(cudaEvent_t start, cudaEvent_t end) { float ms; cudaEventElapsedTime(&ms, start, end); - return ms*1000; + return ms * 1000; } #ifdef USE_NVTX #include -class Tracer { +class Tracer +{ public: - Tracer(const char* name) { - nvtxRangePushA(name); - } - ~Tracer() { - nvtxRangePop(); - } + Tracer(const char *name) { nvtxRangePushA(name); } + ~Tracer() { nvtxRangePop(); } }; -#define RANGE(name) Tracer uniq_name_using_macros(name); +#define RANGE(name) Tracer uniq_name_using_macros(name); #define RANGE_PUSH(name) nvtxRangePushA(name) -#define RANGE_POP() nvtxRangePop(); +#define RANGE_POP() nvtxRangePop(); #else #define RANGE(name) #endif std::vector stream; -cudaEvent_t event[1]; -cudaEvent_t timingEvent[2]; +cudaEvent_t event[1]; +cudaEvent_t timingEvent[2]; -struct hostData { +struct hostData +{ long long timeElapsed; - bool timeoutDetected; + bool timeoutDetected; long long timeElapsed2; - bool timeoutDetected2; + bool timeoutDetected2; LatchType latch; LatchType latch2; }; struct hostData *hostData; -__global__ void empty() -{ -} +__global__ void empty() {} // Function to read the GPU nanosecond timer in a kernel -__device__ __forceinline__ unsigned long long __globaltimer() { - unsigned long long globaltimer; - asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(globaltimer)); - return globaltimer; +__device__ __forceinline__ unsigned long long __globaltimer() +{ + unsigned long long globaltimer; + asm volatile("mov.u64 %0, %globaltimer;" : "=l"(globaltimer)); + return globaltimer; } __global__ void delay(long long ticks) { long long endTime = clock64() + ticks; - while (clock64() < endTime); + while (clock64() < endTime) + ; } -__global__ void waitWithTimeout(long long nanoseconds, bool* timeoutDetected, long long *timeElapsed, LatchType* latch) +__global__ void waitWithTimeout(long long nanoseconds, bool *timeoutDetected, long long *timeElapsed, LatchType *latch) { long long startTime = __globaltimer(); - long long endTime = startTime + nanoseconds; - long long time = 0; + long long endTime = startTime + nanoseconds; + long long time = 0; do { time = __globaltimer(); } while (time < endTime && (latch == NULL || *latch == 0)); @@ -124,13 +121,9 @@ 
__global__ void waitWithTimeout(long long nanoseconds, bool* timeoutDetected, lo } } -__global__ void preUploadAnnotation() -{ -} +__global__ void preUploadAnnotation() {} -__global__ void postUploadAnnotation() -{ -} +__global__ void postUploadAnnotation() {} cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false) { @@ -138,9 +131,9 @@ cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false) RANGE("capture"); cudaGraph_t graph; cudaStreamBeginCapture(stream[0], cudaStreamCaptureModeGlobal); - int streamIdx = 0; + int streamIdx = 0; if (singleEntry) { - empty<<<1,1,0,stream[streamIdx]>>>(); + empty<<<1, 1, 0, stream[streamIdx]>>>(); } cudaEventRecord(event[0], stream[0]); @@ -151,7 +144,7 @@ cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false) for (int i = 0; i < width; i++) { streamIdx = i; for (int j = 0; j < length; j++) { - empty<<<1,1,0,stream[streamIdx]>>>(); + empty<<<1, 1, 0, stream[streamIdx]>>>(); } } @@ -164,10 +157,10 @@ cudaGraph_t createParallelChain(int length, int width, bool singleEntry = false) return graph; } -std::vector metricName; -std::vector metricValue; +std::vector metricName; +std::vector metricValue; -int counter2 = 0; +int counter2 = 0; void runDemo(cudaGraph_t graph, int length, int width) { cudaGraphExec_t graphExec; @@ -203,12 +196,13 @@ void runDemo(cudaGraph_t graph, int length, int width) metricValue.push_back(getMicroSecondDuration(start, streamSync)); } { - // re-instantiating the exec to simulate first launch into a busy stream. + // re-instantiating the exec to simulate first launch into a busy stream. cudaGraphExecDestroy(graphExec); cudaGraphInstantiateWithFlags(&graphExec, graph, 0); - long long maxTimeoutNanoSeconds = 4000 + 500*length*width; - waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch); + long long maxTimeoutNanoSeconds = 4000 + 500 * length * width; + waitWithTimeout<<<1, 1, 0, stream[0]>>>( + maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch); RANGE("launch including upload in busy stream"); cudaEventRecord(timingEvent[0], stream[0]); @@ -222,13 +216,14 @@ void runDemo(cudaGraph_t graph, int length, int width) metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1])); metricName.push_back("blockingKernelTimeoutDetected"); metricValue.push_back(hostData->timeoutDetected); - hostData->latch = 0; + hostData->latch = 0; hostData->timeoutDetected = 0; } { RANGE("repeat lauch in busy stream"); - long long maxTimeoutNanoSeconds = 4000 + 500*length*width; - waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch); + long long maxTimeoutNanoSeconds = 4000 + 500 * length * width; + waitWithTimeout<<<1, 1, 0, stream[0]>>>( + maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch); cudaEventRecord(timingEvent[0], stream[0]); cudaGraphLaunch(graphExec, stream[0]); cudaEventRecord(timingEvent[1], stream[0]); @@ -240,34 +235,37 @@ void runDemo(cudaGraph_t graph, int length, int width) metricValue.push_back(getAsyncMicroSecondDuration(timingEvent[0], timingEvent[1])); metricName.push_back("blockingKernelTimeoutDetected"); metricValue.push_back(hostData->timeoutDetected); - hostData->latch = 0; + hostData->latch = 0; hostData->timeoutDetected = 0; } { // re-instantiating the exec to provide upload with work to do. 
cudaGraphExecDestroy(graphExec); cudaGraphInstantiateWithFlags(&graphExec, graph, 0); - long long maxTimeoutNanoSeconds = 4000 + 1000*length*width; - waitWithTimeout<<<1,1,0,stream[0]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2); - maxTimeoutNanoSeconds = 2000 + 500*length*width; - waitWithTimeout<<<1,1,0,stream[1]>>>(maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch); + long long maxTimeoutNanoSeconds = 4000 + 1000 * length * width; + waitWithTimeout<<<1, 1, 0, stream[0]>>>( + maxTimeoutNanoSeconds, &hostData->timeoutDetected2, &hostData->timeElapsed2, &hostData->latch2); + maxTimeoutNanoSeconds = 2000 + 500 * length * width; + waitWithTimeout<<<1, 1, 0, stream[1]>>>( + maxTimeoutNanoSeconds, &hostData->timeoutDetected, &hostData->timeElapsed, &hostData->latch); RANGE("uploading a graph off of the critical path"); - preUploadAnnotation<<<1,1,0,stream[1]>>>(); + preUploadAnnotation<<<1, 1, 0, stream[1]>>>(); cudaEventRecord(timingEvent[0], stream[0]); auto start = getCpuTime(); cudaGraphUpload(graphExec, stream[1]); auto apiReturn = getCpuTime(); - cudaEventRecord(event[0],stream[1]); + cudaEventRecord(event[0], stream[1]); cudaEventRecord(timingEvent[1], stream[0]); - postUploadAnnotation<<<1,1,0,stream[1]>>>(); + postUploadAnnotation<<<1, 1, 0, stream[1]>>>(); hostData->latch = 1; // release the blocking kernel for the upload - cudaStreamWaitEvent(stream[0],event[0]); + cudaStreamWaitEvent(stream[0], event[0]); cudaGraphLaunch(graphExec, stream[0]); - cudaEventSynchronize(event[0]); // upload done, similuate critical path being ready for the graph to run by the release of the second latch + cudaEventSynchronize(event[0]); // upload done, similuate critical path being ready for the graph to run by the + // release of the second latch - hostData->latch2 = 1; // release the work + hostData->latch2 = 1; // release the work cudaStreamSynchronize(stream[0]); metricName.push_back("upload_api_time"); @@ -277,9 +275,9 @@ void runDemo(cudaGraph_t graph, int length, int width) metricName.push_back("blockingKernelTimeoutDetected"); metricValue.push_back(hostData->timeoutDetected); - hostData->latch = 0; - hostData->latch2 = 0; - hostData->timeoutDetected = 0; + hostData->latch = 0; + hostData->latch2 = 0; + hostData->timeoutDetected = 0; hostData->timeoutDetected2 = 0; } cudaGraphExecDestroy(graphExec); @@ -287,7 +285,8 @@ void runDemo(cudaGraph_t graph, int length, int width) RANGE_POP(); } -void usage() { +void usage() +{ printf("programName [outputFmt] [numTrials] [length] [width] [pattern] [stride] [maxLength] \n"); printf("\toutputFmt - program output, default=3 (see below)\n"); printf("\tnumTrials (per length)\n"); @@ -312,34 +311,36 @@ void usage() { int main(int argc, char **argv) { - if(argc < 1) { + if (argc < 1) { usage(); return 0; } - int numTrials=1, length=20, width=1, outputFmt=3, pattern=0, stride = 1; - if(argc > 1) outputFmt = atoi(argv[1]); - if(argc > 2) numTrials = atoi(argv[2]); - if(argc > 3) length= atoi(argv[3]); - if(argc > 4) width= atoi(argv[4]); - if(argc > 5) pattern = atoi(argv[5]); - if(argc > 6) stride = atoi(argv[6]); + int numTrials = 1, length = 20, width = 1, outputFmt = 3, pattern = 0, stride = 1; + if (argc > 1) + outputFmt = atoi(argv[1]); + if (argc > 2) + numTrials = atoi(argv[2]); + if (argc > 3) + length = atoi(argv[3]); + if (argc > 4) + width = atoi(argv[4]); + if (argc > 5) + pattern = atoi(argv[5]); + if (argc > 6) + stride = atoi(argv[6]); int maxLength = 
length; - if(argc > 7) maxLength = atoi(argv[7]); + if (argc > 7) + maxLength = atoi(argv[7]); if (maxLength < length) { maxLength = length; } - if((outputFmt & 4) && (outputFmt & 2)) { + if ((outputFmt & 4) && (outputFmt & 2)) { printf("printing average and all samples doesn't make sense\n"); } - if(length == 0 || - width == 0 || - outputFmt == 0 || - outputFmt > 5 || - pattern > 1) - { + if (length == 0 || width == 0 || outputFmt == 0 || outputFmt > 5 || pattern > 1) { usage(); return 0; } @@ -351,10 +352,10 @@ int main(int argc, char **argv) cudaFree(0); cudaMallocHost(&hostData, sizeof(*hostData)); int numStreams = width; - if (numStreams == 1) numStreams = 2; // demo needs two streams even if capture only needs 1. + if (numStreams == 1) + numStreams = 2; // demo needs two streams even if capture only needs 1. stream.resize(numStreams); - for (int i = 0; i < numStreams; i++) - { + for (int i = 0; i < numStreams; i++) { cudaStreamCreate(&stream[i]); } @@ -364,15 +365,14 @@ int main(int argc, char **argv) { RANGE("warmup"); - for (int i = 0; i < width; i++) - { - empty<<<1,1,0,stream[i]>>>(); + for (int i = 0; i < width; i++) { + empty<<<1, 1, 0, stream[i]>>>(); } cudaStreamSynchronize(stream[0]); auto start = getCpuTime(); - graph = createParallelChain(length, width, singleEntry); - auto end = getCpuTime(); + graph = createParallelChain(length, width, singleEntry); + auto end = getCpuTime(); metricValue.push_back(getMicroSecondDuration(start, end)); metricName.push_back("capture"); runDemo(graph, length, width); @@ -382,7 +382,7 @@ int main(int argc, char **argv) printf("length, width, pattern, "); for (int i = 0; i < metricName.size(); i++) { printf("%s, ", metricName[i]); - } + } printf("\r\n"); } @@ -390,7 +390,7 @@ int main(int argc, char **argv) printf("skipping trials since no output is expected\n"); return 1; } - + std::vector metricTotal; metricTotal.resize(metricValue.size()); @@ -399,32 +399,32 @@ int main(int argc, char **argv) metricName.clear(); metricValue.clear(); auto start = getCpuTime(); - graph = createParallelChain(length, width, singleEntry); - auto end = getCpuTime(); + graph = createParallelChain(length, width, singleEntry); + auto end = getCpuTime(); metricValue.push_back(getMicroSecondDuration(start, end)); runDemo(graph, length, width); if (outputFmt & 2) { - printf("%d, %d, %d, ",length, width, pattern); + printf("%d, %d, %d, ", length, width, pattern); for (int i = 0; i < metricValue.size(); i++) { printf("%0.3f, ", metricValue[i]); - } + } printf("\r\n"); } if (outputFmt & 4) { for (int i = 0; i < metricTotal.size(); i++) { metricTotal[i] += metricValue[i]; - } + } } } if (outputFmt & 4) { - printf("%d, %d, %d, ",length, width, pattern); + printf("%d, %d, %d, ", length, width, pattern); for (int i = 0; i < metricTotal.size(); i++) { - printf("%0.3f, ", metricTotal[i]/numTrials); + printf("%0.3f, ", metricTotal[i] / numTrials); metricTotal[i] = 0; - } + } printf("\r\n"); } diff --git a/Samples/6_Performance/transpose/transpose.cu b/Samples/6_Performance/transpose/transpose.cu index 1052be14..068aa02e 100644 --- a/Samples/6_Performance/transpose/transpose.cu +++ b/Samples/6_Performance/transpose/transpose.cu @@ -42,9 +42,9 @@ namespace cg = cooperative_groups; // Utilities and system includes -#include // helper for string parsing -#include // helper for image and data comparison -#include // helper for cuda error checking functions +#include // helper for cuda error checking functions +#include // helper for image and data comparison +#include // helper for 
string parsing const char *sSDKsample = "Transpose"; @@ -53,20 +53,19 @@ const char *sSDKsample = "Transpose"; // TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of // BLOCK_ROWS -#define TILE_DIM 32 +#define TILE_DIM 32 #define BLOCK_ROWS 16 // This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y int MATRIX_SIZE_X = 1024; int MATRIX_SIZE_Y = 1024; -int MUL_FACTOR = TILE_DIM; +int MUL_FACTOR = TILE_DIM; #define FLOOR(a, b) (a - (a % b)) // Compute the tile size necessary to illustrate performance cases for SM20+ // hardware -int MAX_TILES = (FLOOR(MATRIX_SIZE_X, 512) * FLOOR(MATRIX_SIZE_Y, 512)) / - (TILE_DIM * TILE_DIM); +int MAX_TILES = (FLOOR(MATRIX_SIZE_X, 512) * FLOOR(MATRIX_SIZE_Y, 512)) / (TILE_DIM * TILE_DIM); // Number of repetitions used for timing. Two sets of repetitions are // performed: 1) over kernel launches and 2) inside the kernel over just the @@ -79,41 +78,42 @@ int MAX_TILES = (FLOOR(MATRIX_SIZE_X, 512) * FLOOR(MATRIX_SIZE_Y, 512)) / // width and height must be integral multiples of TILE_DIM // ------------------------------------------------------- -__global__ void copy(float *odata, float *idata, int width, int height) { - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; +__global__ void copy(float *odata, float *idata, int width, int height) +{ + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index = xIndex + width * yIndex; + int index = xIndex + width * yIndex; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index + i * width] = idata[index + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index + i * width] = idata[index + i * width]; + } } -__global__ void copySharedMem(float *odata, float *idata, int width, - int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float tile[TILE_DIM][TILE_DIM]; +__global__ void copySharedMem(float *odata, float *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float tile[TILE_DIM][TILE_DIM]; - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index = xIndex + width * yIndex; + int index = xIndex + width * yIndex; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - if (xIndex < width && yIndex < height) { - tile[threadIdx.y][threadIdx.x] = idata[index]; + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + if (xIndex < width && yIndex < height) { + tile[threadIdx.y][threadIdx.x] = idata[index]; + } } - } - cg::sync(cta); + cg::sync(cta); - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - if (xIndex < height && yIndex < width) { - odata[index] = tile[threadIdx.y][threadIdx.x]; + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + if (xIndex < height && yIndex < width) { + odata[index] = tile[threadIdx.y][threadIdx.x]; + } } - } } // ------------------------------------------------------- @@ -121,71 +121,71 @@ __global__ void copySharedMem(float *odata, float *idata, int width, // width and height must be integral multiples of TILE_DIM // ------------------------------------------------------- -__global__ void transposeNaive(float *odata, float *idata, int width, - int height) { - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM 
+ threadIdx.y; +__global__ void transposeNaive(float *odata, float *idata, int width, int height) +{ + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index_in = xIndex + width * yIndex; - int index_out = yIndex + height * xIndex; + int index_in = xIndex + width * yIndex; + int index_out = yIndex + height * xIndex; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index_out + i] = idata[index_in + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i] = idata[index_in + i * width]; + } } // coalesced transpose (with bank conflicts) -__global__ void transposeCoalesced(float *odata, float *idata, int width, - int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float tile[TILE_DIM][TILE_DIM]; +__global__ void transposeCoalesced(float *odata, float *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float tile[TILE_DIM][TILE_DIM]; - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index_in = xIndex + (yIndex)*width; + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int index_in = xIndex + (yIndex)*width; - xIndex = blockIdx.y * TILE_DIM + threadIdx.x; - yIndex = blockIdx.x * TILE_DIM + threadIdx.y; - int index_out = xIndex + (yIndex)*height; + xIndex = blockIdx.y * TILE_DIM + threadIdx.x; + yIndex = blockIdx.x * TILE_DIM + threadIdx.y; + int index_out = xIndex + (yIndex)*height; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; + } - cg::sync(cta); + cg::sync(cta); - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; + } } // Coalesced transpose with no bank conflicts -__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, - int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float tile[TILE_DIM][TILE_DIM + 1]; +__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float tile[TILE_DIM][TILE_DIM + 1]; - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index_in = xIndex + (yIndex)*width; + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int index_in = xIndex + (yIndex)*width; - xIndex = blockIdx.y * TILE_DIM + threadIdx.x; - yIndex = blockIdx.x * TILE_DIM + threadIdx.y; - int index_out = xIndex + (yIndex)*height; + xIndex = blockIdx.y * TILE_DIM + threadIdx.x; + yIndex = blockIdx.x * TILE_DIM + threadIdx.y; + int index_out = xIndex + (yIndex)*height; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; + } - cg::sync(cta); + cg::sync(cta); - for (int i 
= 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; + } } // Transpose that effectively reorders execution of thread blocks along @@ -200,44 +200,45 @@ __global__ void transposeNoBankConflicts(float *odata, float *idata, int width, // blockIdx_y and replacement of blockIdx.x and bloclIdx.y with the subscripted // versions in the remaining code -__global__ void transposeDiagonal(float *odata, float *idata, int width, - int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float tile[TILE_DIM][TILE_DIM + 1]; +__global__ void transposeDiagonal(float *odata, float *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float tile[TILE_DIM][TILE_DIM + 1]; - int blockIdx_x, blockIdx_y; + int blockIdx_x, blockIdx_y; - // do diagonal reordering - if (width == height) { - blockIdx_y = blockIdx.x; - blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; - } else { - int bid = blockIdx.x + gridDim.x * blockIdx.y; - blockIdx_y = bid % gridDim.y; - blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x; - } + // do diagonal reordering + if (width == height) { + blockIdx_y = blockIdx.x; + blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; + } + else { + int bid = blockIdx.x + gridDim.x * blockIdx.y; + blockIdx_y = bid % gridDim.y; + blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x; + } - // from here on the code is same as previous kernel except blockIdx_x replaces - // blockIdx.x and similarly for y + // from here on the code is same as previous kernel except blockIdx_x replaces + // blockIdx.x and similarly for y - int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; - int index_in = xIndex + (yIndex)*width; + int xIndex = blockIdx_x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx_y * TILE_DIM + threadIdx.y; + int index_in = xIndex + (yIndex)*width; - xIndex = blockIdx_y * TILE_DIM + threadIdx.x; - yIndex = blockIdx_x * TILE_DIM + threadIdx.y; - int index_out = xIndex + (yIndex)*height; + xIndex = blockIdx_y * TILE_DIM + threadIdx.x; + yIndex = blockIdx_x * TILE_DIM + threadIdx.y; + int index_out = xIndex + (yIndex)*height; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; + } - cg::sync(cta); + cg::sync(cta); - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i]; + } } // -------------------------------------------------------------------- @@ -249,365 +250,369 @@ __global__ void transposeDiagonal(float *odata, float *idata, int width, // components of a full transpose // -------------------------------------------------------------------- -__global__ void transposeFineGrained(float *odata, float *idata, int width, - int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float block[TILE_DIM][TILE_DIM + 1]; +__global__ void transposeFineGrained(float *odata, float *idata, int width, int height) +{ + // 
Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float block[TILE_DIM][TILE_DIM + 1]; - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index = xIndex + (yIndex)*width; + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int index = xIndex + (yIndex)*width; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - block[threadIdx.y + i][threadIdx.x] = idata[index + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + block[threadIdx.y + i][threadIdx.x] = idata[index + i * width]; + } - cg::sync(cta); + cg::sync(cta); - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index + i * height] = block[threadIdx.x][threadIdx.y + i]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index + i * height] = block[threadIdx.x][threadIdx.y + i]; + } } -__global__ void transposeCoarseGrained(float *odata, float *idata, int width, - int height) { - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - __shared__ float block[TILE_DIM][TILE_DIM + 1]; +__global__ void transposeCoarseGrained(float *odata, float *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + __shared__ float block[TILE_DIM][TILE_DIM + 1]; - int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; - int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; - int index_in = xIndex + (yIndex)*width; + int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; + int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; + int index_in = xIndex + (yIndex)*width; - xIndex = blockIdx.y * TILE_DIM + threadIdx.x; - yIndex = blockIdx.x * TILE_DIM + threadIdx.y; - int index_out = xIndex + (yIndex)*height; + xIndex = blockIdx.y * TILE_DIM + threadIdx.x; + yIndex = blockIdx.x * TILE_DIM + threadIdx.y; + int index_out = xIndex + (yIndex)*height; - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - block[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + block[threadIdx.y + i][threadIdx.x] = idata[index_in + i * width]; + } - cg::sync(cta); + cg::sync(cta); - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { - odata[index_out + i * height] = block[threadIdx.y + i][threadIdx.x]; - } + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { + odata[index_out + i * height] = block[threadIdx.y + i][threadIdx.x]; + } } // --------------------- // host utility routines // --------------------- -void computeTransposeGold(float *gold, float *idata, const int size_x, - const int size_y) { - for (int y = 0; y < size_y; ++y) { - for (int x = 0; x < size_x; ++x) { - gold[(x * size_y) + y] = idata[(y * size_x) + x]; +void computeTransposeGold(float *gold, float *idata, const int size_x, const int size_y) +{ + for (int y = 0; y < size_y; ++y) { + for (int x = 0; x < size_x; ++x) { + gold[(x * size_y) + y] = idata[(y * size_x) + x]; + } } - } } -void getParams(int argc, char **argv, cudaDeviceProp &deviceProp, int &size_x, - int &size_y, int max_tile_dim) { - // set matrix size (if (x,y) dim of matrix is not square, then this will have - // to be modified - if (checkCmdLineFlag(argc, (const char **)argv, "dimX")) { - size_x = getCmdLineArgumentInt(argc, (const char **)argv, "dimX"); +void getParams(int argc, char **argv, cudaDeviceProp &deviceProp, int &size_x, int &size_y, int max_tile_dim) +{ + // set matrix size (if (x,y) dim of matrix is not square, then 
this will have + // to be modified + if (checkCmdLineFlag(argc, (const char **)argv, "dimX")) { + size_x = getCmdLineArgumentInt(argc, (const char **)argv, "dimX"); - if (size_x > max_tile_dim) { - printf("> MatrixSize X = %d is greater than the recommended size = %d\n", - size_x, max_tile_dim); - } else { - printf("> MatrixSize X = %d\n", size_x); + if (size_x > max_tile_dim) { + printf("> MatrixSize X = %d is greater than the recommended size = %d\n", size_x, max_tile_dim); + } + else { + printf("> MatrixSize X = %d\n", size_x); + } } - } else { - size_x = max_tile_dim; - size_x = FLOOR(size_x, 512); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "dimY")) { - size_y = getCmdLineArgumentInt(argc, (const char **)argv, "dimY"); - - if (size_y > max_tile_dim) { - printf("> MatrixSize Y = %d is greater than the recommended size = %d\n", - size_y, max_tile_dim); - } else { - printf("> MatrixSize Y = %d\n", size_y); + else { + size_x = max_tile_dim; + size_x = FLOOR(size_x, 512); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "dimY")) { + size_y = getCmdLineArgumentInt(argc, (const char **)argv, "dimY"); + + if (size_y > max_tile_dim) { + printf("> MatrixSize Y = %d is greater than the recommended size = %d\n", size_y, max_tile_dim); + } + else { + printf("> MatrixSize Y = %d\n", size_y); + } + } + else { + size_y = max_tile_dim; + size_y = FLOOR(size_y, 512); } - } else { - size_y = max_tile_dim; - size_y = FLOOR(size_y, 512); - } } -void showHelp() { - printf("\n%s : Command line options\n", sSDKsample); - printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n"); - printf("> The default matrix size can be overridden with these parameters\n"); - printf("\t-dimX=row_dim_size (matrix row dimensions)\n"); - printf("\t-dimY=col_dim_size (matrix column dimensions)\n"); +void showHelp() +{ + printf("\n%s : Command line options\n", sSDKsample); + printf("\t-device=n (where n=0,1,2.... 
for the GPU device)\n\n"); + printf("> The default matrix size can be overridden with these parameters\n"); + printf("\t-dimX=row_dim_size (matrix row dimensions)\n"); + printf("\t-dimY=col_dim_size (matrix column dimensions)\n"); } // ---- // main // ---- -int main(int argc, char **argv) { - // Start logs - printf("%s Starting...\n\n", sSDKsample); +int main(int argc, char **argv) +{ + // Start logs + printf("%s Starting...\n\n", sSDKsample); - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - showHelp(); - return 0; - } - - int devID = findCudaDevice(argc, (const char **)argv); - cudaDeviceProp deviceProp; - - // get number of SMs on this GPU - checkCudaErrors(cudaGetDevice(&devID)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - // compute the scaling factor (for GPUs with fewer MPs) - float scale_factor, total_tiles; - scale_factor = - max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - (float)deviceProp.multiProcessorCount)), - 1.0f); - - printf("> Device %d: \"%s\"\n", devID, deviceProp.name); - printf("> SM Capability %d.%d detected:\n", deviceProp.major, - deviceProp.minor); - - // Calculate number of tiles we will run for the Matrix Transpose performance - // tests - int size_x, size_y, max_matrix_dim, matrix_size_test; - - matrix_size_test = 512; // we round down max_matrix_dim for this perf test - total_tiles = (float)MAX_TILES / scale_factor; - - max_matrix_dim = - FLOOR((int)(floor(sqrt(total_tiles)) * TILE_DIM), matrix_size_test); - - // This is the minimum size allowed - if (max_matrix_dim == 0) { - max_matrix_dim = matrix_size_test; - } - - printf("> [%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name, - deviceProp.multiProcessorCount, - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), - _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * - deviceProp.multiProcessorCount); - - printf("> Compute performance scaling factor = %4.2f\n", scale_factor); - - // Extract parameters if there are any, command line -dimx and -dimy can - // override any of these settings - getParams(argc, argv, deviceProp, size_x, size_y, max_matrix_dim); - - if (size_x != size_y) { - printf( - "\n[%s] does not support non-square matrices (row_dim_size(%d) != " - "col_dim_size(%d))\nExiting...\n\n", - sSDKsample, size_x, size_y); - exit(EXIT_FAILURE); - } - - if (size_x % TILE_DIM != 0 || size_y % TILE_DIM != 0) { - printf( - "[%s] Matrix size must be integral multiple of tile " - "size\nExiting...\n\n", - sSDKsample); - exit(EXIT_FAILURE); - } - - // kernel pointer and descriptor - void (*kernel)(float *, float *, int, int); - const char *kernelName; - - // execution configuration parameters - dim3 grid(size_x / TILE_DIM, size_y / TILE_DIM), - threads(TILE_DIM, BLOCK_ROWS); - - if (grid.x < 1 || grid.y < 1) { - printf("[%s] grid size computation incorrect in test \nExiting...\n\n", - sSDKsample); - exit(EXIT_FAILURE); - } - - // CUDA events - cudaEvent_t start, stop; - - // size of memory required to store the matrix - size_t mem_size = static_cast(sizeof(float) * size_x * size_y); - - if (2 * mem_size > deviceProp.totalGlobalMem) { - printf("Input matrix size is larger than the available device memory!\n"); - printf("Please choose a smaller size matrix\n"); - exit(EXIT_FAILURE); - } - - // allocate host memory - float *h_idata = (float *)malloc(mem_size); - float *h_odata = (float *)malloc(mem_size); - float *transposeGold = (float *)malloc(mem_size); - float *gold; - - // allocate device memory - float *d_idata, 
*d_odata; - checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); - checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); - - // initialize host data - for (int i = 0; i < (size_x * size_y); ++i) { - h_idata[i] = (float)i; - } - - // copy host data to device - checkCudaErrors( - cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); - - // Compute reference transpose solution - computeTransposeGold(transposeGold, h_idata, size_x, size_y); - - // print out common data for all kernels - printf( - "\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: " - "%dx%d\n\n", - size_x, size_y, size_x / TILE_DIM, size_y / TILE_DIM, TILE_DIM, TILE_DIM, - TILE_DIM, BLOCK_ROWS); - - // initialize events - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // - // loop over different kernels - // - - bool success = true; - - for (int k = 0; k < 8; k++) { - // set kernel pointer - switch (k) { - case 0: - kernel = © - kernelName = "simple copy "; - break; - - case 1: - kernel = ©SharedMem; - kernelName = "shared memory copy"; - break; - - case 2: - kernel = &transposeNaive; - kernelName = "naive "; - break; - - case 3: - kernel = &transposeCoalesced; - kernelName = "coalesced "; - break; - - case 4: - kernel = &transposeNoBankConflicts; - kernelName = "optimized "; - break; - - case 5: - kernel = &transposeCoarseGrained; - kernelName = "coarse-grained "; - break; - - case 6: - kernel = &transposeFineGrained; - kernelName = "fine-grained "; - break; - - case 7: - kernel = &transposeDiagonal; - kernelName = "diagonal "; - break; + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + showHelp(); + return 0; } - // set reference solution - if (kernel == © || kernel == ©SharedMem) { - gold = h_idata; - } else if (kernel == &transposeCoarseGrained || - kernel == &transposeFineGrained) { - gold = h_odata; // fine- and coarse-grained kernels are not full - // transposes, so bypass check - } else { - gold = transposeGold; + int devID = findCudaDevice(argc, (const char **)argv); + cudaDeviceProp deviceProp; + + // get number of SMs on this GPU + checkCudaErrors(cudaGetDevice(&devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + + // compute the scaling factor (for GPUs with fewer MPs) + float scale_factor, total_tiles; + scale_factor = max( + (192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), + 1.0f); + + printf("> Device %d: \"%s\"\n", devID, deviceProp.name); + printf("> SM Capability %d.%d detected:\n", deviceProp.major, deviceProp.minor); + + // Calculate number of tiles we will run for the Matrix Transpose performance + // tests + int size_x, size_y, max_matrix_dim, matrix_size_test; + + matrix_size_test = 512; // we round down max_matrix_dim for this perf test + total_tiles = (float)MAX_TILES / scale_factor; + + max_matrix_dim = FLOOR((int)(floor(sqrt(total_tiles)) * TILE_DIM), matrix_size_test); + + // This is the minimum size allowed + if (max_matrix_dim == 0) { + max_matrix_dim = matrix_size_test; } - // Clear error status - checkCudaErrors(cudaGetLastError()); + printf("> [%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", + deviceProp.name, + deviceProp.multiProcessorCount, + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), + _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); - // warmup to avoid timing startup - kernel<<>>(d_odata, d_idata, size_x, size_y); + printf("> Compute performance scaling factor 
= %4.2f\n", scale_factor); - // take measurements for loop over kernel launches - checkCudaErrors(cudaEventRecord(start, 0)); + // Extract parameters if there are any, command line -dimx and -dimy can + // override any of these settings + getParams(argc, argv, deviceProp, size_x, size_y, max_matrix_dim); - for (int i = 0; i < NUM_REPS; i++) { - kernel<<>>(d_odata, d_idata, size_x, size_y); - // Ensure no launch failure - checkCudaErrors(cudaGetLastError()); + if (size_x != size_y) { + printf("\n[%s] does not support non-square matrices (row_dim_size(%d) != " + "col_dim_size(%d))\nExiting...\n\n", + sSDKsample, + size_x, + size_y); + exit(EXIT_FAILURE); } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(stop)); - float kernelTime; - checkCudaErrors(cudaEventElapsedTime(&kernelTime, start, stop)); - - checkCudaErrors( - cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost)); - bool res = compareData(gold, h_odata, size_x * size_y, 0.01f, 0.0f); - - if (res == false) { - printf("*** %s kernel FAILED ***\n", kernelName); - success = false; + if (size_x % TILE_DIM != 0 || size_y % TILE_DIM != 0) { + printf("[%s] Matrix size must be integral multiple of tile " + "size\nExiting...\n\n", + sSDKsample); + exit(EXIT_FAILURE); } - // take measurements for loop inside kernel - checkCudaErrors( - cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost)); - res = compareData(gold, h_odata, size_x * size_y, 0.01f, 0.0f); + // kernel pointer and descriptor + void (*kernel)(float *, float *, int, int); + const char *kernelName; - if (res == false) { - printf("*** %s kernel FAILED ***\n", kernelName); - success = false; + // execution configuration parameters + dim3 grid(size_x / TILE_DIM, size_y / TILE_DIM), threads(TILE_DIM, BLOCK_ROWS); + + if (grid.x < 1 || grid.y < 1) { + printf("[%s] grid size computation incorrect in test \nExiting...\n\n", sSDKsample); + exit(EXIT_FAILURE); } - // report effective bandwidths - float kernelBandwidth = 2.0f * 1000.0f * mem_size / (1024 * 1024 * 1024) / - (kernelTime / NUM_REPS); - printf( - "transpose %s, Throughput = %.4f GB/s, Time = %.5f ms, Size = %u fp32 " - "elements, NumDevsUsed = %u, Workgroup = %u\n", - kernelName, kernelBandwidth, kernelTime / NUM_REPS, (size_x * size_y), - 1, TILE_DIM * BLOCK_ROWS); - } + // CUDA events + cudaEvent_t start, stop; - // cleanup - free(h_idata); - free(h_odata); - free(transposeGold); - cudaFree(d_idata); - cudaFree(d_odata); + // size of memory required to store the matrix + size_t mem_size = static_cast(sizeof(float) * size_x * size_y); - checkCudaErrors(cudaEventDestroy(start)); - checkCudaErrors(cudaEventDestroy(stop)); + if (2 * mem_size > deviceProp.totalGlobalMem) { + printf("Input matrix size is larger than the available device memory!\n"); + printf("Please choose a smaller size matrix\n"); + exit(EXIT_FAILURE); + } - if (!success) { - printf("Test failed!\n"); - exit(EXIT_FAILURE); - } + // allocate host memory + float *h_idata = (float *)malloc(mem_size); + float *h_odata = (float *)malloc(mem_size); + float *transposeGold = (float *)malloc(mem_size); + float *gold; - printf("Test passed\n"); - exit(EXIT_SUCCESS); + // allocate device memory + float *d_idata, *d_odata; + checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); + checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); + + // initialize host data + for (int i = 0; i < (size_x * size_y); ++i) { + h_idata[i] = (float)i; + } + + // copy host data to device + checkCudaErrors(cudaMemcpy(d_idata, h_idata, 
mem_size, cudaMemcpyHostToDevice)); + + // Compute reference transpose solution + computeTransposeGold(transposeGold, h_idata, size_x, size_y); + + // print out common data for all kernels + printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: " + "%dx%d\n\n", + size_x, + size_y, + size_x / TILE_DIM, + size_y / TILE_DIM, + TILE_DIM, + TILE_DIM, + TILE_DIM, + BLOCK_ROWS); + + // initialize events + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + // + // loop over different kernels + // + + bool success = true; + + for (int k = 0; k < 8; k++) { + // set kernel pointer + switch (k) { + case 0: + kernel = © + kernelName = "simple copy "; + break; + + case 1: + kernel = ©SharedMem; + kernelName = "shared memory copy"; + break; + + case 2: + kernel = &transposeNaive; + kernelName = "naive "; + break; + + case 3: + kernel = &transposeCoalesced; + kernelName = "coalesced "; + break; + + case 4: + kernel = &transposeNoBankConflicts; + kernelName = "optimized "; + break; + + case 5: + kernel = &transposeCoarseGrained; + kernelName = "coarse-grained "; + break; + + case 6: + kernel = &transposeFineGrained; + kernelName = "fine-grained "; + break; + + case 7: + kernel = &transposeDiagonal; + kernelName = "diagonal "; + break; + } + + // set reference solution + if (kernel == © || kernel == ©SharedMem) { + gold = h_idata; + } + else if (kernel == &transposeCoarseGrained || kernel == &transposeFineGrained) { + gold = h_odata; // fine- and coarse-grained kernels are not full + // transposes, so bypass check + } + else { + gold = transposeGold; + } + + // Clear error status + checkCudaErrors(cudaGetLastError()); + + // warmup to avoid timing startup + kernel<<>>(d_odata, d_idata, size_x, size_y); + + // take measurements for loop over kernel launches + checkCudaErrors(cudaEventRecord(start, 0)); + + for (int i = 0; i < NUM_REPS; i++) { + kernel<<>>(d_odata, d_idata, size_x, size_y); + // Ensure no launch failure + checkCudaErrors(cudaGetLastError()); + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(stop)); + float kernelTime; + checkCudaErrors(cudaEventElapsedTime(&kernelTime, start, stop)); + + checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost)); + bool res = compareData(gold, h_odata, size_x * size_y, 0.01f, 0.0f); + + if (res == false) { + printf("*** %s kernel FAILED ***\n", kernelName); + success = false; + } + + // take measurements for loop inside kernel + checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost)); + res = compareData(gold, h_odata, size_x * size_y, 0.01f, 0.0f); + + if (res == false) { + printf("*** %s kernel FAILED ***\n", kernelName); + success = false; + } + + // report effective bandwidths + float kernelBandwidth = 2.0f * 1000.0f * mem_size / (1024 * 1024 * 1024) / (kernelTime / NUM_REPS); + printf("transpose %s, Throughput = %.4f GB/s, Time = %.5f ms, Size = %u fp32 " + "elements, NumDevsUsed = %u, Workgroup = %u\n", + kernelName, + kernelBandwidth, + kernelTime / NUM_REPS, + (size_x * size_y), + 1, + TILE_DIM * BLOCK_ROWS); + } + + // cleanup + free(h_idata); + free(h_odata); + free(transposeGold); + cudaFree(d_idata); + cudaFree(d_odata); + + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + + if (!success) { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("Test passed\n"); + exit(EXIT_SUCCESS); } diff --git a/Samples/7_libNVVM/common/include/DDSWriter.h 
b/Samples/7_libNVVM/common/include/DDSWriter.h index a3a07378..3d99e63a 100644 --- a/Samples/7_libNVVM/common/include/DDSWriter.h +++ b/Samples/7_libNVVM/common/include/DDSWriter.h @@ -32,40 +32,42 @@ typedef int DWORD; /// DDS File Structures -struct DDSPixelFormat { - DWORD Size; - DWORD Flags; - DWORD FourCC; - DWORD RGBBitCount; - DWORD RBitMask; - DWORD GBitMask; - DWORD BBitMask; - DWORD ABitMask; +struct DDSPixelFormat +{ + DWORD Size; + DWORD Flags; + DWORD FourCC; + DWORD RGBBitCount; + DWORD RBitMask; + DWORD GBitMask; + DWORD BBitMask; + DWORD ABitMask; }; -struct DDSHeader { - DWORD Size; - DWORD Flags; - DWORD Height; - DWORD Width; - DWORD PitchOrLinearSize; - DWORD Depth; - DWORD MipMapCount; - DWORD Reserved1[11]; - DDSPixelFormat PixelFormat; - DWORD Caps; - DWORD Caps2; - DWORD Caps3; - DWORD Caps4; - DWORD Reserved2; +struct DDSHeader +{ + DWORD Size; + DWORD Flags; + DWORD Height; + DWORD Width; + DWORD PitchOrLinearSize; + DWORD Depth; + DWORD MipMapCount; + DWORD Reserved1[11]; + DDSPixelFormat PixelFormat; + DWORD Caps; + DWORD Caps2; + DWORD Caps3; + DWORD Caps4; + DWORD Reserved2; }; #define DDPF_ALPHAPIXELS 0x1 -#define DDPF_RGB 0x40 +#define DDPF_RGB 0x40 -#define DDSD_CAPS 0x1 -#define DDSD_HEIGHT 0x2 -#define DDSD_WIDTH 0x4 +#define DDSD_CAPS 0x1 +#define DDSD_HEIGHT 0x2 +#define DDSD_WIDTH 0x4 #define DDSD_PIXELFORMAT 0x1000 #define DDSCAPS_TEXTURE 0x1000 @@ -75,46 +77,46 @@ struct DDSHeader { /// (R, G, B, A) tuples of 32-bit floating-point data of dimensions /// width X height. The floating-point data should be normalized to [0,1] and /// will be scaled to a [0, 255] 8-bit value. -void writeDDS(const char *filename, const float *data, unsigned width, - unsigned height) { - // Write out the result as a .dds file - // This is a quick and dirty DDS writer - DDSHeader header; - memset(&header, 0, sizeof(DDSHeader)); +void writeDDS(const char *filename, const float *data, unsigned width, unsigned height) +{ + // Write out the result as a .dds file + // This is a quick and dirty DDS writer + DDSHeader header; + memset(&header, 0, sizeof(DDSHeader)); - header.Size = sizeof(DDSHeader); - header.Flags = DDSD_CAPS | DDSD_HEIGHT | DDSD_WIDTH | DDSD_PIXELFORMAT; - header.Height = height; - header.Width = width; - header.PitchOrLinearSize = width*4; - header.Caps = DDSCAPS_TEXTURE; + header.Size = sizeof(DDSHeader); + header.Flags = DDSD_CAPS | DDSD_HEIGHT | DDSD_WIDTH | DDSD_PIXELFORMAT; + header.Height = height; + header.Width = width; + header.PitchOrLinearSize = width * 4; + header.Caps = DDSCAPS_TEXTURE; - header.PixelFormat.Size = sizeof(DDSPixelFormat); - header.PixelFormat.Flags = DDPF_RGB | DDPF_ALPHAPIXELS; - header.PixelFormat.RGBBitCount = 32; - header.PixelFormat.ABitMask = 0xFF000000; - header.PixelFormat.RBitMask = 0x00FF0000; - header.PixelFormat.GBitMask = 0x0000FF00; - header.PixelFormat.BBitMask = 0x000000FF; + header.PixelFormat.Size = sizeof(DDSPixelFormat); + header.PixelFormat.Flags = DDPF_RGB | DDPF_ALPHAPIXELS; + header.PixelFormat.RGBBitCount = 32; + header.PixelFormat.ABitMask = 0xFF000000; + header.PixelFormat.RBitMask = 0x00FF0000; + header.PixelFormat.GBitMask = 0x0000FF00; + header.PixelFormat.BBitMask = 0x000000FF; - std::ofstream str(filename, std::ios::binary); - int magic = 0x20534444; - str.write((const char*)&magic, 4); - str.write((const char*)&header, sizeof(header)); - for(unsigned j = 0; j < height; ++j) { - for(unsigned i = 0; i < width; ++i) { - unsigned char r, g, b, a; - r = (unsigned char)(data[j*width*4+i*4+0] * 255.0); - g 
= (unsigned char)(data[j*width*4+i*4+1] * 255.0); - b = (unsigned char)(data[j*width*4+i*4+2] * 255.0); - a = (unsigned char)(data[j*width*4+i*4+3] * 255.0); - str.write((const char*)&b, 1); - str.write((const char*)&g, 1); - str.write((const char*)&r, 1); - str.write((const char*)&a, 1); + std::ofstream str(filename, std::ios::binary); + int magic = 0x20534444; + str.write((const char *)&magic, 4); + str.write((const char *)&header, sizeof(header)); + for (unsigned j = 0; j < height; ++j) { + for (unsigned i = 0; i < width; ++i) { + unsigned char r, g, b, a; + r = (unsigned char)(data[j * width * 4 + i * 4 + 0] * 255.0); + g = (unsigned char)(data[j * width * 4 + i * 4 + 1] * 255.0); + b = (unsigned char)(data[j * width * 4 + i * 4 + 2] * 255.0); + a = (unsigned char)(data[j * width * 4 + i * 4 + 3] * 255.0); + str.write((const char *)&b, 1); + str.write((const char *)&g, 1); + str.write((const char *)&r, 1); + str.write((const char *)&a, 1); + } } - } - str.close(); + str.close(); } #endif diff --git a/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp b/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp index 1fd62c38..06e8d1ca 100644 --- a/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp +++ b/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp @@ -24,6 +24,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#include #include #include #include @@ -34,10 +35,8 @@ #include #include #include -#include - -#include #include +#include #include #include "DDSWriter.h" @@ -45,289 +44,282 @@ static_assert(sizeof(void *) == 8, "Only 64bit targets are supported."); using namespace llvm; -static cl::opt<bool> SaveCubin("save-cubin", - cl::desc("Write linker cubin to disk"), - cl::init(false)); -static cl::opt<bool> SaveIR("save-ir", cl::desc("Write LLVM IR to disk"), - cl::init(false)); -static cl::opt<bool> SavePTX("save-ptx", cl::desc("Write PTX to disk"), - cl::init(false)); +static cl::opt<bool> SaveCubin("save-cubin", cl::desc("Write linker cubin to disk"), cl::init(false)); +static cl::opt<bool> SaveIR("save-ir", cl::desc("Write LLVM IR to disk"), cl::init(false)); +static cl::opt<bool> SavePTX("save-ptx", cl::desc("Write PTX to disk"), cl::init(false)); // Width and height of the output image. -const unsigned width = 1024; +const unsigned width = 1024; const unsigned height = 512; // If 'err' is non-zero, emit an error message and exit. #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) -static void __checkCudaErrors(CUresult err, const char *filename, int line) { - assert(filename); - if (CUDA_SUCCESS != err) { - const char *ename = NULL; - const CUresult res = cuGetErrorName(err, &ename); - fprintf(stderr, - "CUDA API Error %04d: \"%s\" from file <%s>, " - "line %i.\n", - err, ((CUDA_SUCCESS == res) ? ename : "Unknown"), filename, line); - exit(err); - } +static void __checkCudaErrors(CUresult err, const char *filename, int line) +{ + assert(filename); + if (CUDA_SUCCESS != err) { + const char *ename = NULL; + const CUresult res = cuGetErrorName(err, &ename); + fprintf(stderr, + "CUDA API Error %04d: \"%s\" from file <%s>, " + "line %i.\n", + err, + ((CUDA_SUCCESS == res) ? ename : "Unknown"), + filename, + line); + exit(err); + } } // Verify that the NVVM result code is success, or terminate otherwise.
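// Usage sketch for the checkCudaErrors() macro defined above (a hypothetical
// snippet, not part of the patch): the macro captures __FILE__ and __LINE__
// at the call site, so a failing Driver API call is reported with the exact
// file and line before the process exits.
static void exampleCheckCudaErrorsUsage(void)
{
    CUdevice dev;
    checkCudaErrors(cuInit(0));            // exits with a message on failure
    checkCudaErrors(cuDeviceGet(&dev, 0)); // error text would point at this line
}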
-void checkNVVMCall(nvvmResult res) { - if (res != NVVM_SUCCESS) { - errs() << "libnvvm call failed\n"; - exit(res); - } +void checkNVVMCall(nvvmResult res) +{ + if (res != NVVM_SUCCESS) { + errs() << "libnvvm call failed\n"; + exit(res); + } } /// generateModule - Generate an LLVM IR module that calls an /// externally-defined function -std::unique_ptr<Module> generateModule(LLVMContext &context) { - // Create the module and setup the layout and triple. - auto mod = std::make_unique<Module>("nvvm-module", context); - mod->setDataLayout( - "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" - "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:" - "64"); - mod->setTargetTriple("nvptx64-nvidia-cuda"); +std::unique_ptr<Module> generateModule(LLVMContext &context) +{ + // Create the module and setup the layout and triple. + auto mod = std::make_unique<Module>("nvvm-module", context); + mod->setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" + "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:" + "64"); + mod->setTargetTriple("nvptx64-nvidia-cuda"); - // Get pointers to some commonly-used types. - Type *voidTy = Type::getVoidTy(context); - Type *floatTy = Type::getFloatTy(context); - Type *i32Ty = Type::getInt32Ty(context); - Type *floatGenericPtrTy = PointerType::get(floatTy, /* address space */ 0); + // Get pointers to some commonly-used types. + Type *voidTy = Type::getVoidTy(context); + Type *floatTy = Type::getFloatTy(context); + Type *i32Ty = Type::getInt32Ty(context); + Type *floatGenericPtrTy = PointerType::get(floatTy, /* address space */ 0); - // void @mandelbrot(float*) - Type *mandelbrotParamTys[] = {floatGenericPtrTy}; - FunctionType *mandelbrotTy = - FunctionType::get(voidTy, mandelbrotParamTys, false); - FunctionCallee mandelbrotFunc = - mod->getOrInsertFunction("mandelbrot", mandelbrotTy); + // void @mandelbrot(float*) + Type *mandelbrotParamTys[] = {floatGenericPtrTy}; + FunctionType *mandelbrotTy = FunctionType::get(voidTy, mandelbrotParamTys, false); + FunctionCallee mandelbrotFunc = mod->getOrInsertFunction("mandelbrot", mandelbrotTy); - // Kernel argument types. - Type *paramTys[] = {floatGenericPtrTy}; + // Kernel argument types. + Type *paramTys[] = {floatGenericPtrTy}; - // Kernel function type. - FunctionType *funcTy = FunctionType::get(voidTy, paramTys, false); + // Kernel function type. + FunctionType *funcTy = FunctionType::get(voidTy, paramTys, false); - // Kernel function. - Function *func = - Function::Create(funcTy, GlobalValue::ExternalLinkage, "kernel", *mod); - func->arg_begin()->setName("ptr"); + // Kernel function. + Function *func = Function::Create(funcTy, GlobalValue::ExternalLinkage, "kernel", *mod); + func->arg_begin()->setName("ptr"); - // 'entry' basic block in kernel function. - BasicBlock *entry = BasicBlock::Create(context, "entry", func); + // 'entry' basic block in kernel function. + BasicBlock *entry = BasicBlock::Create(context, "entry", func); - // Build the entry block. - IRBuilder<> builder(entry); - builder.CreateCall(mandelbrotFunc, func->arg_begin()); - builder.CreateRetVoid(); + // Build the entry block. + IRBuilder<> builder(entry); + builder.CreateCall(mandelbrotFunc, func->arg_begin()); + builder.CreateRetVoid(); - // Create kernel metadata.
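// For reference, the module assembled above with the IRBuilder corresponds
// roughly to the following textual NVVM IR (a sketch only; the datalayout
// string is elided and the real module's printed output may differ):
static const char *kernelModuleSketch = "target triple = \"nvptx64-nvidia-cuda\"\n"
                                        "declare void @mandelbrot(float*)\n"
                                        "define void @kernel(float* %ptr) {\n"
                                        "entry:\n"
                                        "  call void @mandelbrot(float* %ptr)\n"
                                        "  ret void\n"
                                        "}\n";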
- Metadata *mdVals[] = {ValueAsMetadata::get(func), - MDString::get(context, "kernel"), - ConstantAsMetadata::get(ConstantInt::getTrue(context))}; - MDNode *kernelMD = MDNode::get(context, mdVals); - NamedMDNode *nvvmAnnot = mod->getOrInsertNamedMetadata("nvvm.annotations"); - nvvmAnnot->addOperand(kernelMD); + // Create kernel metadata. + Metadata *mdVals[] = {ValueAsMetadata::get(func), + MDString::get(context, "kernel"), + ConstantAsMetadata::get(ConstantInt::getTrue(context))}; + MDNode *kernelMD = MDNode::get(context, mdVals); + NamedMDNode *nvvmAnnot = mod->getOrInsertNamedMetadata("nvvm.annotations"); + nvvmAnnot->addOperand(kernelMD); - // Set the NVVM IR version to 2.0. - auto *two = ConstantInt::get(Type::getInt32Ty(context), 2); - auto *zero = ConstantInt::get(Type::getInt32Ty(context), 0); - auto *versionMD = MDNode::get( - context, {ConstantAsMetadata::get(two), ConstantAsMetadata::get(zero)}); - NamedMDNode *nvvmIRVersion = mod->getOrInsertNamedMetadata("nvvmir.version"); - nvvmIRVersion->addOperand(versionMD); + // Set the NVVM IR version to 2.0. + auto *two = ConstantInt::get(Type::getInt32Ty(context), 2); + auto *zero = ConstantInt::get(Type::getInt32Ty(context), 0); + auto *versionMD = MDNode::get(context, {ConstantAsMetadata::get(two), ConstantAsMetadata::get(zero)}); + NamedMDNode *nvvmIRVersion = mod->getOrInsertNamedMetadata("nvvmir.version"); + nvvmIRVersion->addOperand(versionMD); - return mod; + return mod; } // Use libNVVM to compile an NVVM IR module to PTX. -std::string generatePtx(const std::string &module, int devMajor, int devMinor, - const char *moduleName) { - assert(moduleName); +std::string generatePtx(const std::string &module, int devMajor, int devMinor, const char *moduleName) +{ + assert(moduleName); - // libNVVM initialization. - nvvmProgram compileUnit; - checkNVVMCall(nvvmCreateProgram(&compileUnit)); + // libNVVM initialization. + nvvmProgram compileUnit; + checkNVVMCall(nvvmCreateProgram(&compileUnit)); - // Create a libNVVM compilation unit from the NVVM IR. - checkNVVMCall(nvvmAddModuleToProgram(compileUnit, module.c_str(), - module.size(), moduleName)); - std::string computeArg = "-arch=compute_"; - computeArg += utostr(devMajor); - computeArg += utostr(devMinor); + // Create a libNVVM compilation unit from the NVVM IR. + checkNVVMCall(nvvmAddModuleToProgram(compileUnit, module.c_str(), module.size(), moduleName)); + std::string computeArg = "-arch=compute_"; + computeArg += utostr(devMajor); + computeArg += utostr(devMinor); - // Compile the NVVM IR into PTX. - const char *options[] = {computeArg.c_str()}; - nvvmResult res = nvvmCompileProgram(compileUnit, 1, options); - if (res != NVVM_SUCCESS) { - errs() << "nvvmCompileProgram failed!\n"; - size_t logSize; - nvvmGetProgramLogSize(compileUnit, &logSize); - char *msg = new char[logSize]; - nvvmGetProgramLog(compileUnit, msg); - errs() << msg << "\n"; - delete[] msg; - exit(EXIT_FAILURE); - } + // Compile the NVVM IR into PTX. + const char *options[] = {computeArg.c_str()}; + nvvmResult res = nvvmCompileProgram(compileUnit, 1, options); + if (res != NVVM_SUCCESS) { + errs() << "nvvmCompileProgram failed!\n"; + size_t logSize; + nvvmGetProgramLogSize(compileUnit, &logSize); + char *msg = new char[logSize]; + nvvmGetProgramLog(compileUnit, msg); + errs() << msg << "\n"; + delete[] msg; + exit(EXIT_FAILURE); + } - // Get the result PTX size and source. 
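// In textual form, the metadata created above looks roughly like (sketch):
//   !nvvm.annotations = !{!0}
//   !0 = !{void (float*)* @kernel, !"kernel", i1 true}
//   !nvvmir.version = !{!1}
//   !1 = !{i32 2, i32 0}
// i.e. @kernel is marked as a kernel entry point and the module declares
// NVVM IR version 2.0.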
- size_t ptxSize = 0; - checkNVVMCall(nvvmGetCompiledResultSize(compileUnit, &ptxSize)); - char *ptx = new char[ptxSize]; - checkNVVMCall(nvvmGetCompiledResult(compileUnit, ptx)); + // Get the result PTX size and source. + size_t ptxSize = 0; + checkNVVMCall(nvvmGetCompiledResultSize(compileUnit, &ptxSize)); + char *ptx = new char[ptxSize]; + checkNVVMCall(nvvmGetCompiledResult(compileUnit, ptx)); - // Clean-up libNVVM. - checkNVVMCall(nvvmDestroyProgram(&compileUnit)); + // Clean-up libNVVM. + checkNVVMCall(nvvmDestroyProgram(&compileUnit)); - return std::string(ptx); + return std::string(ptx); } -int main(int argc, char **argv) { - cl::ParseCommandLineOptions(argc, argv, "cuda-c-linking"); +int main(int argc, char **argv) +{ + cl::ParseCommandLineOptions(argc, argv, "cuda-c-linking"); - // Locate the pre-built library. - std::string libpath0 = sys::fs::getMainExecutable(argv[0], (void *)main); - SmallString<256> libpath(libpath0); - const char *mathlibFile = "libmathfuncs64.a"; - sys::path::remove_filename(libpath); - sys::path::append(libpath, mathlibFile); + // Locate the pre-built library. + std::string libpath0 = sys::fs::getMainExecutable(argv[0], (void *)main); + SmallString<256> libpath(libpath0); + const char *mathlibFile = "libmathfuncs64.a"; + sys::path::remove_filename(libpath); + sys::path::append(libpath, mathlibFile); - if (!sys::fs::exists(libpath.c_str())) { - errs() << "Unable to locate math library, expected at " << libpath << '\n'; - return EXIT_FAILURE; - } + if (!sys::fs::exists(libpath.c_str())) { + errs() << "Unable to locate math library, expected at " << libpath << '\n'; + return EXIT_FAILURE; + } - outs() << "Using math library: " << libpath.str() << "\n"; + outs() << "Using math library: " << libpath.str() << "\n"; - // Initialize CUDA and obtain device 0. - checkCudaErrors(cuInit(0)); - int nDevices; - checkCudaErrors(cuDeviceGetCount(&nDevices)); - if (nDevices == 0) { - errs() << "Failed to locate any CUDA compute devices.\n"; - exit(EXIT_FAILURE); - } - CUdevice device; - checkCudaErrors(cuDeviceGet(&device, 0)); + // Initialize CUDA and obtain device 0. + checkCudaErrors(cuInit(0)); + int nDevices; + checkCudaErrors(cuDeviceGetCount(&nDevices)); + if (nDevices == 0) { + errs() << "Failed to locate any CUDA compute devices.\n"; + exit(EXIT_FAILURE); + } + CUdevice device; + checkCudaErrors(cuDeviceGet(&device, 0)); - char name[128]; - checkCudaErrors(cuDeviceGetName(name, 128, device)); - outs() << "Using CUDA Device [0]: " << name << "\n"; + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + outs() << "Using CUDA Device [0]: " << name << "\n"; - int devMajor = 0, devMinor = 0; - checkCudaErrors(cuDeviceGetAttribute( - &devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); - checkCudaErrors(cuDeviceGetAttribute( - &devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); - outs() << "Device Compute Capability: " << devMajor << "." << devMinor - << "\n"; - if (devMajor < 5) { - errs() << "ERROR: Device 0 is not sm_50 or later.\n"; - return 1; - } + int devMajor = 0, devMinor = 0; + checkCudaErrors(cuDeviceGetAttribute(&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + checkCudaErrors(cuDeviceGetAttribute(&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); + outs() << "Device Compute Capability: " << devMajor << "." 
<< devMinor << "\n"; + if (devMajor < 5) { + errs() << "ERROR: Device 0 is not sm_50 or later.\n"; + return 1; + } - // Generate the IR module - LLVMContext ctx; - std::string moduleStr; - auto module = generateModule(ctx); + // Generate the IR module + LLVMContext ctx; + std::string moduleStr; + auto module = generateModule(ctx); - if (SaveIR) { - std::error_code err; - raw_fd_ostream out("cuda-c-linking.kernel.ll", err); - out << *(module.get()); - } + if (SaveIR) { + std::error_code err; + raw_fd_ostream out("cuda-c-linking.kernel.ll", err); + out << *(module.get()); + } - // Write the module to a string. - { - llvm::raw_string_ostream str(moduleStr); - str << *module.get(); - } + // Write the module to a string. + { + llvm::raw_string_ostream str(moduleStr); + str << *module.get(); + } - // Generate PTX. - std::string ptx = generatePtx(moduleStr, devMajor, devMinor, - module->getModuleIdentifier().c_str()); - if (SavePTX) { - std::error_code err; - raw_fd_ostream out("cuda-c-linking.kernel.ptx", err); - out << ptx; - } + // Generate PTX. + std::string ptx = generatePtx(moduleStr, devMajor, devMinor, module->getModuleIdentifier().c_str()); + if (SavePTX) { + std::error_code err; + raw_fd_ostream out("cuda-c-linking.kernel.ptx", err); + out << ptx; + } - // Create the CUDA context. - CUcontext context; - checkCudaErrors(cuCtxCreate(&context, 0, device)); + // Create the CUDA context. + CUcontext context; + checkCudaErrors(cuCtxCreate(&context, 0, device)); - // Create a JIT linker and generate the result CUBIN. - CUlinkState linker; - char linkerInfo[1024]{}; - char linkerErrors[1024]{}; - CUjit_option linkerOptions[] = { - CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - CU_JIT_LOG_VERBOSE}; - void *linkerOptionValues[] = {linkerInfo, reinterpret_cast<void *>(1024), - linkerErrors, reinterpret_cast<void *>(1024), - reinterpret_cast<void *>(1)}; - checkCudaErrors(cuLinkCreate(5, linkerOptions, linkerOptionValues, &linker)); - checkCudaErrors(cuLinkAddData(linker, CU_JIT_INPUT_PTX, (void *)ptx.c_str(), - ptx.size(), "", 0, NULL, NULL)); - checkCudaErrors(cuLinkAddFile(linker, CU_JIT_INPUT_LIBRARY, libpath.c_str(), - 0, NULL, NULL)); - void *cubin; - size_t cubinSize; - checkCudaErrors(cuLinkComplete(linker, &cubin, &cubinSize)); - outs() << "Linker Log:\n" << linkerInfo << "\n" << linkerErrors << "\n"; - if (SaveCubin) { - std::error_code err; - raw_fd_ostream out("cuda-c-linking.linked.cubin", err, sys::fs::OF_None); - out.write(reinterpret_cast<const char *>(cubin), cubinSize); - } + // Create a JIT linker and generate the result CUBIN.
+ CUlinkState linker; + char linkerInfo[1024]{}; + char linkerErrors[1024]{}; + CUjit_option linkerOptions[] = {CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_LOG_VERBOSE}; + void *linkerOptionValues[] = {linkerInfo, + reinterpret_cast<void *>(1024), + linkerErrors, + reinterpret_cast<void *>(1024), + reinterpret_cast<void *>(1)}; + checkCudaErrors(cuLinkCreate(5, linkerOptions, linkerOptionValues, &linker)); + checkCudaErrors( + cuLinkAddData(linker, CU_JIT_INPUT_PTX, (void *)ptx.c_str(), ptx.size(), "", 0, NULL, NULL)); + checkCudaErrors(cuLinkAddFile(linker, CU_JIT_INPUT_LIBRARY, libpath.c_str(), 0, NULL, NULL)); + void *cubin; + size_t cubinSize; + checkCudaErrors(cuLinkComplete(linker, &cubin, &cubinSize)); + outs() << "Linker Log:\n" << linkerInfo << "\n" << linkerErrors << "\n"; + if (SaveCubin) { + std::error_code err; + raw_fd_ostream out("cuda-c-linking.linked.cubin", err, sys::fs::OF_None); + out.write(reinterpret_cast<const char *>(cubin), cubinSize); + } - // Create a module and load the cubin into it. - CUmodule cudaModule; - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, cubin, 0, 0, 0)); + // Create a module and load the cubin into it. + CUmodule cudaModule; + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, cubin, 0, 0, 0)); - // Now that the CUBIN is loaded, we can release the linker. - checkCudaErrors(cuLinkDestroy(linker)); + // Now that the CUBIN is loaded, we can release the linker. + checkCudaErrors(cuLinkDestroy(linker)); - // Get kernel function. - CUfunction function; - checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); + // Get kernel function. + CUfunction function; + checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); - // Device data. - CUdeviceptr devBuffer; - checkCudaErrors(cuMemAlloc(&devBuffer, sizeof(float) * width * height * 4)); - float *data = new float[width * height * 4]; + // Device data. + CUdeviceptr devBuffer; + checkCudaErrors(cuMemAlloc(&devBuffer, sizeof(float) * width * height * 4)); + float *data = new float[width * height * 4]; - // Each thread will generate one pixel, and we'll subdivide the problem into - // 16x16 chunks. - const unsigned blockSizeX = 16; - const unsigned blockSizeY = 16; - const unsigned blockSizeZ = 1; - const unsigned gridSizeX = width / 16; - const unsigned gridSizeY = height / 16; - const unsigned gridSizeZ = 1; + // Each thread will generate one pixel, and we'll subdivide the problem into + // 16x16 chunks. + const unsigned blockSizeX = 16; + const unsigned blockSizeY = 16; + const unsigned blockSizeZ = 1; + const unsigned gridSizeX = width / 16; + const unsigned gridSizeY = height / 16; + const unsigned gridSizeZ = 1; - // Execute the kernel. - outs() << "Launching kernel\n"; - void *params[] = {&devBuffer}; - checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ, - blockSizeX, blockSizeY, blockSizeZ, 0, NULL, - params, NULL)); + // Execute the kernel. + outs() << "Launching kernel\n"; + void *params[] = {&devBuffer}; + checkCudaErrors(cuLaunchKernel( + function, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ, 0, NULL, params, NULL)); - // Retrieve the result data from the device. - checkCudaErrors( - cuMemcpyDtoH(&data[0], devBuffer, sizeof(float) * width * height * 4)); + // Retrieve the result data from the device.
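// Launch geometry note for the code above: with width = 1024 and
// height = 512, gridSizeX = 1024 / 16 = 64 and gridSizeY = 512 / 16 = 32,
// so 64 x 32 blocks of 16 x 16 threads cover exactly one thread per pixel
// (the integer division assumes width and height are multiples of 16).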
+ checkCudaErrors(cuMemcpyDtoH(&data[0], devBuffer, sizeof(float) * width * height * 4)); - writeDDS("mandelbrot.dds", data, width, height); - outs() << "Output saved to mandelbrot.dds\n"; + writeDDS("mandelbrot.dds", data, width, height); + outs() << "Output saved to mandelbrot.dds\n"; - // Cleanup. - delete[] data; - checkCudaErrors(cuMemFree(devBuffer)); - checkCudaErrors(cuModuleUnload(cudaModule)); - checkCudaErrors(cuCtxDestroy(context)); + // Cleanup. + delete[] data; + checkCudaErrors(cuMemFree(devBuffer)); + checkCudaErrors(cuModuleUnload(cudaModule)); + checkCudaErrors(cuCtxDestroy(context)); - return 0; + return 0; } diff --git a/Samples/7_libNVVM/cuda-c-linking/math-funcs.cu b/Samples/7_libNVVM/cuda-c-linking/math-funcs.cu index 8eba3d14..43d8485a 100644 --- a/Samples/7_libNVVM/cuda-c-linking/math-funcs.cu +++ b/Samples/7_libNVVM/cuda-c-linking/math-funcs.cu @@ -29,57 +29,56 @@ // Note that this kernel is meant to be a simple, straight-forward // implementation, and so may not represent optimized GPU code. -extern "C" -__device__ -void mandelbrot(float* Data) { +extern "C" __device__ void mandelbrot(float *Data) +{ - // Which pixel am I? - unsigned DataX = blockIdx.x * blockDim.x + threadIdx.x; - unsigned DataY = blockIdx.y * blockDim.y + threadIdx.y; - unsigned Width = gridDim.x * blockDim.x; - unsigned Height = gridDim.y * blockDim.y; + // Which pixel am I? + unsigned DataX = blockIdx.x * blockDim.x + threadIdx.x; + unsigned DataY = blockIdx.y * blockDim.y + threadIdx.y; + unsigned Width = gridDim.x * blockDim.x; + unsigned Height = gridDim.y * blockDim.y; - float R, G, B, A; + float R, G, B, A; - // Scale coordinates to (-2.5, 1) and (-1, 1) + // Scale coordinates to (-2.5, 1) and (-1, 1) - float NormX = (float)DataX / (float)Width; - NormX *= 3.5f; - NormX -= 2.5f; + float NormX = (float)DataX / (float)Width; + NormX *= 3.5f; + NormX -= 2.5f; - float NormY = (float)DataY / (float)Height; - NormY *= 2.0f; - NormY -= 1.0f; + float NormY = (float)DataY / (float)Height; + NormY *= 2.0f; + NormY -= 1.0f; - float X0 = NormX; - float Y0 = NormY; + float X0 = NormX; + float Y0 = NormY; - float X = 0.0f; - float Y = 0.0f; + float X = 0.0f; + float Y = 0.0f; - unsigned Iter = 0; - unsigned MaxIter = 1000; + unsigned Iter = 0; + unsigned MaxIter = 1000; - // Iterate - while(X*X + Y*Y < 4.0f && Iter < MaxIter) { - float XTemp = X*X - Y*Y + X0; - Y = 2.0f*X*Y + Y0; + // Iterate + while (X * X + Y * Y < 4.0f && Iter < MaxIter) { + float XTemp = X * X - Y * Y + X0; + Y = 2.0f * X * Y + Y0; - X = XTemp; + X = XTemp; - Iter++; - } + Iter++; + } - unsigned ColorG = Iter % 50; - unsigned ColorB = Iter % 25; + unsigned ColorG = Iter % 50; + unsigned ColorB = Iter % 25; - R = 0.0f; - G = (float)ColorG / 50.0f; - B = (float)ColorB / 25.0f; - A = 1.0f; + R = 0.0f; + G = (float)ColorG / 50.0f; + B = (float)ColorB / 25.0f; + A = 1.0f; - Data[DataY*Width*4+DataX*4+0] = R; - Data[DataY*Width*4+DataX*4+1] = G; - Data[DataY*Width*4+DataX*4+2] = B; - Data[DataY*Width*4+DataX*4+3] = A; + Data[DataY * Width * 4 + DataX * 4 + 0] = R; + Data[DataY * Width * 4 + DataX * 4 + 1] = G; + Data[DataY * Width * 4 + DataX * 4 + 2] = B; + Data[DataY * Width * 4 + DataX * 4 + 3] = A; } diff --git a/Samples/7_libNVVM/cuda-shared-memory/CMakeLists.txt b/Samples/7_libNVVM/cuda-shared-memory/CMakeLists.txt index 57ea0ced..576b6f2d 100644 --- a/Samples/7_libNVVM/cuda-shared-memory/CMakeLists.txt +++ b/Samples/7_libNVVM/cuda-shared-memory/CMakeLists.txt @@ -38,6 +38,6 @@ add_test(NAME 
test-cuda-shared-memory-extern_shared_memory set_tests_properties(test-cuda-shared-memory-shared_memory test-cuda-shared-memory-extern_shared_memory PROPERTIES FIXTURES_REQUIRED PTXGENTEST) - + file(COPY shared_memory.ll DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") file(COPY extern_shared_memory.ll DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/Samples/7_libNVVM/device-side-launch/README.md b/Samples/7_libNVVM/device-side-launch/README.md index 9b3a35ee..e8949677 100644 --- a/Samples/7_libNVVM/device-side-launch/README.md +++ b/Samples/7_libNVVM/device-side-launch/README.md @@ -16,7 +16,7 @@ section from CUDA C Programming Guide Kernel Launch APIs ------------------ -Device-side kernel launches can be implemented using the following two APIs +Device-side kernel launches can be implemented using the following two APIs in an NVVM IR program: cudaLaunchDevice() and cudaGetParameterBuffer(). cudaLaunchDevice() launches the specified kernel with the parameter buffer that is obtained by calling cudaGetParameterBuffer() and filled with the @@ -34,8 +34,8 @@ form shown below before it is used. declare i32 @cudaLaunchDeviceV2(i8*, %struct.CUstream_st*) The CUDA-level declaration below is mapped to one of the aforementioned NVVM -IR level declarations and is found in the system header file -cuda_device_runtime_api.h. The function is defined in the cudadevrt system +IR level declarations and is found in the system header file +cuda_device_runtime_api.h. The function is defined in the cudadevrt system library, which must be linked with a program in order to use device-side kernel launch functionality. @@ -60,21 +60,21 @@ given below: The following CUDA-level declaration of cudaGetParameterBufferV2() is mapped to the aforementioned NVVM IR level declaration: - extern __device__ __cudart_builtin__ void * CUDARTAPI - cudaGetParameterBufferV2(void *func, dim3 gridDimension, + extern __device__ __cudart_builtin__ void * CUDARTAPI + cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize); The first parameter is a pointer to the kernel to be launched, and the other parameters specify the launch configuration, i.e., as grid -dimension, block dimension, and shared memory size. +dimension, block dimension, and shared memory size. Parameter Buffer Layout ----------------------- -Parameter reordering in the parameter buffer is prohibited, and each individual -parameter placed in the parameter buffer is required to be aligned. That is, +Parameter reordering in the parameter buffer is prohibited, and each individual +parameter placed in the parameter buffer is required to be aligned. That is, each parameter must be placed at the n-th byte in the parameter buffer, where n -is the smallest multiple of the parameter size that is greater than the offset -of the last byte taken by the preceding parameter. The maximum size of the +is the smallest multiple of the parameter size that is greater than the offset +of the last byte taken by the preceding parameter. The maximum size of the parameter buffer is 4KB. diff --git a/Samples/7_libNVVM/device-side-launch/dsl.c b/Samples/7_libNVVM/device-side-launch/dsl.c index b3cce986..cc493743 100644 --- a/Samples/7_libNVVM/device-side-launch/dsl.c +++ b/Samples/7_libNVVM/device-side-launch/dsl.c @@ -37,221 +37,224 @@ // The full path to the libcudadevrt.a is determined by the build environment. const char *_libCudaDevRt = LIBCUDADEVRT; -static const char *getLibCudaDevRtName(void) { - // Check that the library exists.
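// Worked example of the parameter-buffer alignment rule from the README
// above, for a hypothetical kernel taking (char a, double b, int c):
//   a -> offset 0
//   b -> offset 8    (smallest multiple of 8 past a's last byte, offset 0)
//   c -> offset 16   (smallest multiple of 4 past b's last byte, offset 15)
// so the three parameters occupy the first 20 bytes of the parameter
// buffer, whose maximum size is 4KB.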
- FILE *fh = fopen(_libCudaDevRt, "rb"); - if (fh == NULL) { - fprintf(stderr, "Error locating the libcudadevrt runtime: %s\n", - _libCudaDevRt); - exit(EXIT_FAILURE); - } - fclose(fh); - return _libCudaDevRt; +static const char *getLibCudaDevRtName(void) +{ + // Check that the library exists. + FILE *fh = fopen(_libCudaDevRt, "rb"); + if (fh == NULL) { + fprintf(stderr, "Error locating the libcudadevrt runtime: %s\n", _libCudaDevRt); + exit(EXIT_FAILURE); + } + fclose(fh); + return _libCudaDevRt; } // If 'err' is non-zero, emit an error message and exit. #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) -static void __checkCudaErrors(CUresult err, const char *filename, int line) { - assert(filename); - if (CUDA_SUCCESS != err) { - const char *ename = NULL; - const CUresult res = cuGetErrorName(err, &ename); - fprintf(stderr, - "CUDA API Error %04d: \"%s\" from file <%s>, " - "line %i.\n", - err, ((CUDA_SUCCESS == res) ? ename : "Unknown"), filename, line); - exit(err); - } +static void __checkCudaErrors(CUresult err, const char *filename, int line) +{ + assert(filename); + if (CUDA_SUCCESS != err) { + const char *ename = NULL; + const CUresult res = cuGetErrorName(err, &ename); + fprintf(stderr, + "CUDA API Error %04d: \"%s\" from file <%s>, " + "line %i.\n", + err, + ((CUDA_SUCCESS == res) ? ename : "Unknown"), + filename, + line); + exit(err); + } } -static char *loadProgramSource(const char *filename, size_t *size) { - assert(filename && size); - char *source = NULL; - *size = 0; - FILE *fh = fopen(filename, "rb"); - if (fh) { - struct stat statbuf; - stat(filename, &statbuf); - source = (char *)malloc(statbuf.st_size + 1); - if (source) { - fread(source, statbuf.st_size, 1, fh); - source[statbuf.st_size] = 0; - *size = statbuf.st_size + 1; +static char *loadProgramSource(const char *filename, size_t *size) +{ + assert(filename && size); + char *source = NULL; + *size = 0; + FILE *fh = fopen(filename, "rb"); + if (fh) { + struct stat statbuf; + stat(filename, &statbuf); + source = (char *)malloc(statbuf.st_size + 1); + if (source) { + fread(source, statbuf.st_size, 1, fh); + source[statbuf.st_size] = 0; + *size = statbuf.st_size + 1; + } } - } else { - fprintf(stderr, "Error reading file %s\n", filename); - exit(EXIT_FAILURE); - } - return source; + else { + fprintf(stderr, "Error reading file %s\n", filename); + exit(EXIT_FAILURE); + } + return source; } // Compile the NVVM IR into PTX. -static char *generatePTX(const char *ll, size_t size, const char *filename, - int devMajor, int devMinor) { - assert(ll && filename); +static char *generatePTX(const char *ll, size_t size, const char *filename, int devMajor, int devMinor) +{ + assert(ll && filename); - // Create a program instance for libNVVM. - nvvmProgram program; - nvvmResult result = nvvmCreateProgram(&program); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmCreateProgram: Failed\n"); - exit(EXIT_FAILURE); - } + // Create a program instance for libNVVM. + nvvmProgram program; + nvvmResult result = nvvmCreateProgram(&program); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmCreateProgram: Failed\n"); + exit(EXIT_FAILURE); + } - // Add the NVVM IR as a module to our libNVVM program. - result = nvvmAddModuleToProgram(program, ll, size, filename); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmAddModuleToProgram: Failed\n"); - exit(EXIT_FAILURE); - } + // Add the NVVM IR as a module to our libNVVM program. 
+ result = nvvmAddModuleToProgram(program, ll, size, filename); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmAddModuleToProgram: Failed\n"); + exit(EXIT_FAILURE); + } - // Dynamically construct the compute capability. - char arch[32] = {0}; - snprintf(arch, sizeof(arch) - 1, "-arch=compute_%d%d", devMajor, devMinor); + // Dynamically construct the compute capability. + char arch[32] = {0}; + snprintf(arch, sizeof(arch) - 1, "-arch=compute_%d%d", devMajor, devMinor); - // Compile the IR into PTX. - const char *options[] = {arch}; - result = nvvmCompileProgram(program, 1, options); - if (result != NVVM_SUCCESS) { - char *Msg = NULL; - size_t LogSize; - fprintf(stderr, "nvvmCompileProgram: Failed\n"); - nvvmGetProgramLogSize(program, &LogSize); - Msg = (char *)malloc(LogSize); - nvvmGetProgramLog(program, Msg); - fprintf(stderr, "%s\n", Msg); - free(Msg); - exit(EXIT_FAILURE); - } + // Compile the IR into PTX. + const char *options[] = {arch}; + result = nvvmCompileProgram(program, 1, options); + if (result != NVVM_SUCCESS) { + char *Msg = NULL; + size_t LogSize; + fprintf(stderr, "nvvmCompileProgram: Failed\n"); + nvvmGetProgramLogSize(program, &LogSize); + Msg = (char *)malloc(LogSize); + nvvmGetProgramLog(program, Msg); + fprintf(stderr, "%s\n", Msg); + free(Msg); + exit(EXIT_FAILURE); + } - size_t ptxSize = 0; - result = nvvmGetCompiledResultSize(program, &ptxSize); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmGetCompiledResultSize: Failed\n"); - exit(EXIT_FAILURE); - } + size_t ptxSize = 0; + result = nvvmGetCompiledResultSize(program, &ptxSize); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmGetCompiledResultSize: Failed\n"); + exit(EXIT_FAILURE); + } - char *ptx = malloc(ptxSize); - assert(ptx); - result = nvvmGetCompiledResult(program, ptx); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmGetCompiledResult: Failed\n"); - free(ptx); - exit(EXIT_FAILURE); - } + char *ptx = malloc(ptxSize); + assert(ptx); + result = nvvmGetCompiledResult(program, ptx); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmGetCompiledResult: Failed\n"); + free(ptx); + exit(EXIT_FAILURE); + } - result = nvvmDestroyProgram(&program); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmDestroyProgram: Failed\n"); - free(ptx); - exit(EXIT_FAILURE); - } + result = nvvmDestroyProgram(&program); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmDestroyProgram: Failed\n"); + free(ptx); + exit(EXIT_FAILURE); + } - return ptx; + return ptx; } // Return the device compute capability in major and minor. -static CUdevice cudaDeviceInit(int *major, int *minor) { - assert(major && minor); - // Count the number of CUDA compute capable devices.. - CUresult err = cuInit(0); - int deviceCount = 0; - if (CUDA_SUCCESS == err) - checkCudaErrors(cuDeviceGetCount(&deviceCount)); - if (deviceCount == 0) { - fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); - exit(EXIT_FAILURE); - } +static CUdevice cudaDeviceInit(int *major, int *minor) +{ + assert(major && minor); + // Count the number of CUDA compute capable devices. + CUresult err = cuInit(0); + int deviceCount = 0; + if (CUDA_SUCCESS == err) + checkCudaErrors(cuDeviceGetCount(&deviceCount)); + if (deviceCount == 0) { + fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } - // Get the first device discovered (device 0) and print its name.
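// Example of the option string built above: a device reporting major = 8
// and minor = 6 yields "-arch=compute_86", which is then handed to
// nvvmCompileProgram() through the options array.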
- CUdevice cuDevice = 0; - checkCudaErrors(cuDeviceGet(&cuDevice, 0)); - char name[128] = {0}; - checkCudaErrors(cuDeviceGetName(name, sizeof(name), cuDevice)); - printf("Using CUDA Device [0]: %s\n", name); + // Get the first device discovered (device 0) and print its name. + CUdevice cuDevice = 0; + checkCudaErrors(cuDeviceGet(&cuDevice, 0)); + char name[128] = {0}; + checkCudaErrors(cuDeviceGetName(name, sizeof(name), cuDevice)); + printf("Using CUDA Device [0]: %s\n", name); - // Get and test the compute capability. - checkCudaErrors(cuDeviceGetAttribute( - major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - checkCudaErrors(cuDeviceGetAttribute( - minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - printf("compute capability = %d.%d\n", *major, *minor); - if (*major < 5) { - fprintf(stderr, "Device 0 is not sm_50 or later\n"); - exit(EXIT_FAILURE); - } - return cuDevice; + // Get and test the compute capability. + checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + printf("compute capability = %d.%d\n", *major, *minor); + if (*major < 5) { + fprintf(stderr, "Device 0 is not sm_50 or later\n"); + exit(EXIT_FAILURE); + } + return cuDevice; } -static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, - CUmodule *phModule, CUfunction *phKernel) { - assert(phContext && phDevice && phModule && phKernel); +static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUfunction *phKernel) +{ + assert(phContext && phDevice && phModule && phKernel); - // Initialize CUDA and obtain the device's compute capability. - int major = 0, minor = 0; - *phDevice = cudaDeviceInit(&major, &minor); - checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); + // Initialize CUDA and obtain the device's compute capability. + int major = 0, minor = 0; + *phDevice = cudaDeviceInit(&major, &minor); + checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); - // Get the NVVM IR from file. - size_t size = 0; - const char *filename = "dsl-gpu64.ll"; - char *ll = loadProgramSource(filename, &size); - fprintf(stdout, "NVVM IR ll file loaded\n"); + // Get the NVVM IR from file. + size_t size = 0; + const char *filename = "dsl-gpu64.ll"; + char *ll = loadProgramSource(filename, &size); + fprintf(stdout, "NVVM IR ll file loaded\n"); - // Use libNVVM to generate PTX. - char *ptx = generatePTX(ll, size, filename, major, minor); - fprintf(stdout, "PTX generated:\n"); - fprintf(stdout, "%s\n", ptx); + // Use libNVVM to generate PTX. + char *ptx = generatePTX(ll, size, filename, major, minor); + fprintf(stdout, "PTX generated:\n"); + fprintf(stdout, "%s\n", ptx); - // Create a context and link the PTX and device library. - const char *libCudaDevRtName = getLibCudaDevRtName(); - void *cubin = NULL; - size_t cubinSize = 0; - CUlinkState linkState; - checkCudaErrors(cuLinkCreate(0, NULL, NULL, &linkState)); - checkCudaErrors(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, - strlen(ptx) + 1, 0, 0, 0, 0)); - checkCudaErrors(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, - libCudaDevRtName, 0, NULL, NULL)); - checkCudaErrors(cuLinkComplete(linkState, &cubin, &cubinSize)); + // Create a context and link the PTX and device library. 
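// Design note: the device-side launch entry points
// (cudaGetParameterBufferV2/cudaLaunchDeviceV2) are defined in the static
// cudadevrt library, so the linking step below must add libcudadevrt.a
// with CU_JIT_INPUT_LIBRARY before cuLinkComplete() is called.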
+ const char *libCudaDevRtName = getLibCudaDevRtName(); + void *cubin = NULL; + size_t cubinSize = 0; + CUlinkState linkState; + checkCudaErrors(cuLinkCreate(0, NULL, NULL, &linkState)); + checkCudaErrors(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, strlen(ptx) + 1, 0, 0, 0, 0)); + checkCudaErrors(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, libCudaDevRtName, 0, NULL, NULL)); + checkCudaErrors(cuLinkComplete(linkState, &cubin, &cubinSize)); - // Load the linked binary. - checkCudaErrors(cuModuleLoadData(phModule, cubin)); + // Load the linked binary. + checkCudaErrors(cuModuleLoadData(phModule, cubin)); - // Locate the kernel entry point. - checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "kernel")); + // Locate the kernel entry point. + checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "kernel")); - checkCudaErrors(cuLinkDestroy(linkState)); - free(ll); - free(ptx); - return CUDA_SUCCESS; + checkCudaErrors(cuLinkDestroy(linkState)); + free(ll); + free(ptx); + return CUDA_SUCCESS; } -int main(int argc, char **argv) { - const unsigned int nThreads = 1; - const unsigned int nBlocks = 1; +int main(int argc, char **argv) +{ + const unsigned int nThreads = 1; + const unsigned int nBlocks = 1; - // Initialize the device and get a handle to the kernel. - CUcontext hContext = 0; - CUdevice hDevice = 0; - CUmodule hModule = 0; - CUfunction hKernel = 0; - checkCudaErrors(buildKernel(&hContext, &hDevice, &hModule, &hKernel)); + // Initialize the device and get a handle to the kernel. + CUcontext hContext = 0; + CUdevice hDevice = 0; + CUmodule hModule = 0; + CUfunction hKernel = 0; + checkCudaErrors(buildKernel(&hContext, &hDevice, &hModule, &hKernel)); - // Launch the kernel. - int depth = 0; - void *params[] = {&depth}; - checkCudaErrors(cuLaunchKernel(hKernel, nBlocks, 1, 1, nThreads, 1, 1, 0, - NULL, params, NULL)); + // Launch the kernel. + int depth = 0; + void *params[] = {&depth}; + checkCudaErrors(cuLaunchKernel(hKernel, nBlocks, 1, 1, nThreads, 1, 1, 0, NULL, params, NULL)); - if (hModule) { - checkCudaErrors(cuModuleUnload(hModule)); - hModule = 0; - } - if (hContext) { - checkCudaErrors(cuCtxDestroy(hContext)); - hContext = 0; - } + if (hModule) { + checkCudaErrors(cuModuleUnload(hModule)); + hModule = 0; + } + if (hContext) { + checkCudaErrors(cuCtxDestroy(hContext)); + hContext = 0; + } - return 0; + return 0; } diff --git a/Samples/7_libNVVM/ptxgen/ptxgen.c b/Samples/7_libNVVM/ptxgen/ptxgen.c index d8058389..62f95214 100644 --- a/Samples/7_libNVVM/ptxgen/ptxgen.c +++ b/Samples/7_libNVVM/ptxgen/ptxgen.c @@ -33,235 +33,245 @@ /* Two levels of indirection to stringify LIBDEVICE_MAJOR_VERSION and * LIBDEVICE_MINOR_VERSION correctly. */ -#define getLibDeviceName() \ - _getLibDeviceName(LIBDEVICE_MAJOR_VERSION, LIBDEVICE_MINOR_VERSION) -#define _getLibDeviceName(MAJOR, MINOR) __getLibDeviceName(MAJOR, MINOR) -#define __getLibDeviceName(MAJOR, MINOR) \ - ("/libdevice/libdevice." #MAJOR #MINOR ".bc") +#define getLibDeviceName() _getLibDeviceName(LIBDEVICE_MAJOR_VERSION, LIBDEVICE_MINOR_VERSION) +#define _getLibDeviceName(MAJOR, MINOR) __getLibDeviceName(MAJOR, MINOR) +#define __getLibDeviceName(MAJOR, MINOR) ("/libdevice/libdevice." 
#MAJOR #MINOR ".bc") -#define getLibnvvmHome() _getLibnvvmHome(LIBNVVM_HOME) -#define _getLibnvvmHome(NVVM_HOME) __getLibnvvmHome(NVVM_HOME) +#define getLibnvvmHome() _getLibnvvmHome(LIBNVVM_HOME) +#define _getLibnvvmHome(NVVM_HOME) __getLibnvvmHome(NVVM_HOME) #define __getLibnvvmHome(NVVM_HOME) (#NVVM_HOME) typedef enum { - PTXGEN_SUCCESS = 0x0000, - PTXGEN_FILE_IO_ERROR = 0x0001, - PTXGEN_BAD_ALLOC_ERROR = 0x0002, - PTXGEN_LIBNVVM_COMPILATION_ERROR = 0x0004, - PTXGEN_LIBNVVM_ERROR = 0x0008, - PTXGEN_INVALID_USAGE = 0x0010, - PTXGEN_LIBNVVM_HOME_UNDEFINED = 0x0020, - PTXGEN_LIBNVVM_VERIFICATION_ERROR = 0x0040 + PTXGEN_SUCCESS = 0x0000, + PTXGEN_FILE_IO_ERROR = 0x0001, + PTXGEN_BAD_ALLOC_ERROR = 0x0002, + PTXGEN_LIBNVVM_COMPILATION_ERROR = 0x0004, + PTXGEN_LIBNVVM_ERROR = 0x0008, + PTXGEN_INVALID_USAGE = 0x0010, + PTXGEN_LIBNVVM_HOME_UNDEFINED = 0x0020, + PTXGEN_LIBNVVM_VERIFICATION_ERROR = 0x0040 } PTXGenStatus; typedef enum { PTXGEN_INPUT_PROGRAM, PTXGEN_INPUT_LIBDEVICE } PTXGENInput; -static PTXGenStatus getLibDevicePath(char **buffer) { - assert(buffer); +static PTXGenStatus getLibDevicePath(char **buffer) +{ + assert(buffer); - const char *libnvvmPath = getLibnvvmHome(); - if (libnvvmPath == NULL) { - fprintf(stderr, "The environment variable LIBNVVM_HOME undefined\n"); - return PTXGEN_LIBNVVM_HOME_UNDEFINED; - } + const char *libnvvmPath = getLibnvvmHome(); + if (libnvvmPath == NULL) { + fprintf(stderr, "The environment variable LIBNVVM_HOME undefined\n"); + return PTXGEN_LIBNVVM_HOME_UNDEFINED; + } - const char *libdevice = getLibDeviceName(); - *buffer = malloc(strlen(libnvvmPath) + strlen(libdevice) + 1); - if (*buffer == NULL) { - fprintf(stderr, "Failed to allocate memory\n"); - return PTXGEN_BAD_ALLOC_ERROR; - } + const char *libdevice = getLibDeviceName(); + *buffer = malloc(strlen(libnvvmPath) + strlen(libdevice) + 1); + if (*buffer == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + return PTXGEN_BAD_ALLOC_ERROR; + } - // Concatenate libnvvmPath with libdevice. - *buffer = strcat(strcpy(*buffer, libnvvmPath), libdevice); + // Concatenate libnvvmPath with libdevice. + *buffer = strcat(strcpy(*buffer, libnvvmPath), libdevice); - return PTXGEN_SUCCESS; + return PTXGEN_SUCCESS; } -static PTXGenStatus addFileToProgram(const char *filename, nvvmProgram prog, - PTXGENInput inputType) { - assert(filename); +static PTXGenStatus addFileToProgram(const char *filename, nvvmProgram prog, PTXGENInput inputType) +{ + assert(filename); - // Open the input file. - FILE *f = fopen(filename, "rb"); - if (f == NULL) { - fprintf(stderr, "Failed to open %s\n", filename); - return PTXGEN_FILE_IO_ERROR; - } + // Open the input file. + FILE *f = fopen(filename, "rb"); + if (f == NULL) { + fprintf(stderr, "Failed to open %s\n", filename); + return PTXGEN_FILE_IO_ERROR; + } - // Allocate a buffer for the input. - struct stat fileStat; - fstat(fileno(f), &fileStat); - char *buffer = malloc(fileStat.st_size); - if (buffer == NULL) { - fprintf(stderr, "Failed to allocate memory\n"); - return PTXGEN_BAD_ALLOC_ERROR; - } - const size_t size = fread(buffer, 1, fileStat.st_size, f); - if (ferror(f)) { - fprintf(stderr, "Failed to read %s\n", filename); + // Allocate a buffer for the input. 
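// Example expansion of the macros above: with LIBDEVICE_MAJOR_VERSION = 1
// and LIBDEVICE_MINOR_VERSION = 0 (the actual values come from the build
// system, so they may differ), getLibDeviceName() stringifies to
// "/libdevice/libdevice.10.bc", which getLibDevicePath() appends to the
// stringified LIBNVVM_HOME.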
+ struct stat fileStat; + fstat(fileno(f), &fileStat); + char *buffer = malloc(fileStat.st_size); + if (buffer == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + return PTXGEN_BAD_ALLOC_ERROR; + } + const size_t size = fread(buffer, 1, fileStat.st_size, f); + if (ferror(f)) { + fprintf(stderr, "Failed to read %s\n", filename); + fclose(f); + free(buffer); + return PTXGEN_FILE_IO_ERROR; + } fclose(f); - free(buffer); - return PTXGEN_FILE_IO_ERROR; - } - fclose(f); - // Add the input to the libNVVM program instance. - nvvmResult result; - if (inputType == PTXGEN_INPUT_LIBDEVICE) - result = nvvmLazyAddModuleToProgram(prog, buffer, size, filename); - else - result = nvvmAddModuleToProgram(prog, buffer, size, filename); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "Failed to add the module %s to the compilation unit\n", - filename); - free(buffer); - return PTXGEN_LIBNVVM_ERROR; - } + // Add the input to the libNVVM program instance. + nvvmResult result; + if (inputType == PTXGEN_INPUT_LIBDEVICE) + result = nvvmLazyAddModuleToProgram(prog, buffer, size, filename); + else + result = nvvmAddModuleToProgram(prog, buffer, size, filename); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "Failed to add the module %s to the compilation unit\n", filename); + free(buffer); + return PTXGEN_LIBNVVM_ERROR; + } - free(buffer); - return PTXGEN_SUCCESS; + free(buffer); + return PTXGEN_SUCCESS; } // Read the nvvmProgram compilation log. -static PTXGenStatus dumpCompilationLog(nvvmProgram prog) { - size_t logSize; - PTXGenStatus status = PTXGEN_SUCCESS; - if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to get the compilation log size\n"); - status = PTXGEN_LIBNVVM_ERROR; - } else { - char *log = malloc(logSize); - if (log == NULL) { - fprintf(stderr, "Failed to allocate memory\n"); - status = PTXGEN_BAD_ALLOC_ERROR; - } else if (nvvmGetProgramLog(prog, log) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to get the compilation log\n"); - status = PTXGEN_LIBNVVM_ERROR; - } else { - fprintf(stderr, "%s\n", log); +static PTXGenStatus dumpCompilationLog(nvvmProgram prog) +{ + size_t logSize; + PTXGenStatus status = PTXGEN_SUCCESS; + if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to get the compilation log size\n"); + status = PTXGEN_LIBNVVM_ERROR; } - free(log); - } - return status; + else { + char *log = malloc(logSize); + if (log == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + status = PTXGEN_BAD_ALLOC_ERROR; + } + else if (nvvmGetProgramLog(prog, log) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to get the compilation log\n"); + status = PTXGEN_LIBNVVM_ERROR; + } + else { + fprintf(stderr, "%s\n", log); + } + free(log); + } + return status; } -static PTXGenStatus generatePTX(unsigned numOptions, const char **options, - unsigned numFilenames, const char **filenames) { - // Create the compilation unit (the libNVVM program instance). - nvvmProgram prog; - if (nvvmCreateProgram(&prog) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to create the compilation unit\n"); - return PTXGEN_LIBNVVM_ERROR; - } +static PTXGenStatus +generatePTX(unsigned numOptions, const char **options, unsigned numFilenames, const char **filenames) +{ + // Create the compilation unit (the libNVVM program instance). + nvvmProgram prog; + if (nvvmCreateProgram(&prog) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to create the compilation unit\n"); + return PTXGEN_LIBNVVM_ERROR; + } - // Add the module to the compilation unit. 
- for (unsigned i = 0; i < numFilenames; ++i) { - PTXGenStatus status = - addFileToProgram(filenames[i], prog, PTXGEN_INPUT_PROGRAM); + // Add the module to the compilation unit. + for (unsigned i = 0; i < numFilenames; ++i) { + PTXGenStatus status = addFileToProgram(filenames[i], prog, PTXGEN_INPUT_PROGRAM); + if (status != PTXGEN_SUCCESS) { + nvvmDestroyProgram(&prog); + return status; + } + } + + // Verify the compilation unit. + if (nvvmVerifyProgram(prog, numOptions, options) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to verify the compilation unit\n"); + return PTXGEN_LIBNVVM_VERIFICATION_ERROR; + } + + // Add libdevice to the libNVVM program instance. + char *libDeviceName; + PTXGenStatus status = getLibDevicePath(&libDeviceName); if (status != PTXGEN_SUCCESS) { - nvvmDestroyProgram(&prog); - return status; + nvvmDestroyProgram(&prog); + return status; } - } - - // Verify the compilation unit. - if (nvvmVerifyProgram(prog, numOptions, options) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to verify the compilation unit\n"); - return PTXGEN_LIBNVVM_VERIFICATION_ERROR; - } - - // Add libdevice to the libNVVM program instance. - char *libDeviceName; - PTXGenStatus status = getLibDevicePath(&libDeviceName); - if (status != PTXGEN_SUCCESS) { - nvvmDestroyProgram(&prog); - return status; - } - status = addFileToProgram(libDeviceName, prog, PTXGEN_INPUT_LIBDEVICE); - free(libDeviceName); - if (status != PTXGEN_SUCCESS) { - nvvmDestroyProgram(&prog); - return status; - } - - // Display the compilation log. - status |= dumpCompilationLog(prog); - if (status & PTXGEN_LIBNVVM_VERIFICATION_ERROR) { - nvvmDestroyProgram(&prog); - return status; - } - - // Compile the compilation unit. - if (nvvmCompileProgram(prog, numOptions, options) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to generate PTX from the compilation unit\n"); - status |= PTXGEN_LIBNVVM_COMPILATION_ERROR; - } else { - size_t ptxSize; - if (nvvmGetCompiledResultSize(prog, &ptxSize) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to get the PTX output size\n"); - status |= PTXGEN_LIBNVVM_ERROR; - } else { - char *ptx = malloc(ptxSize); - if (ptx == NULL) { - fprintf(stderr, "Failed to allocate memory\n"); - status |= PTXGEN_BAD_ALLOC_ERROR; - } else if (nvvmGetCompiledResult(prog, ptx) != NVVM_SUCCESS) { - fprintf(stderr, "Failed to get the PTX output\n"); - status |= PTXGEN_LIBNVVM_ERROR; - } else { - fprintf(stdout, "%s\n", ptx); - } - free(ptx); + status = addFileToProgram(libDeviceName, prog, PTXGEN_INPUT_LIBDEVICE); + free(libDeviceName); + if (status != PTXGEN_SUCCESS) { + nvvmDestroyProgram(&prog); + return status; } - } - status |= dumpCompilationLog(prog); + // Display the compilation log. + status |= dumpCompilationLog(prog); + if (status & PTXGEN_LIBNVVM_VERIFICATION_ERROR) { + nvvmDestroyProgram(&prog); + return status; + } - // Release the resources. - nvvmDestroyProgram(&prog); + // Compile the compilation unit. 
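// The PTXGenStatus values are distinct bits, so failures accumulate with
// |= as in this function; e.g. a compilation error on top of an allocation
// failure yields 0x0004 | 0x0002 == 0x0006, and a caller can test a single
// condition with a mask such as (status & PTXGEN_LIBNVVM_COMPILATION_ERROR).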
+ if (nvvmCompileProgram(prog, numOptions, options) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to generate PTX from the compilation unit\n"); + status |= PTXGEN_LIBNVVM_COMPILATION_ERROR; + } + else { + size_t ptxSize; + if (nvvmGetCompiledResultSize(prog, &ptxSize) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to get the PTX output size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else { + char *ptx = malloc(ptxSize); + if (ptx == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + status |= PTXGEN_BAD_ALLOC_ERROR; + } + else if (nvvmGetCompiledResult(prog, ptx) != NVVM_SUCCESS) { + fprintf(stderr, "Failed to get the PTX output\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else { + fprintf(stdout, "%s\n", ptx); + } + free(ptx); + } + } - return status; + status |= dumpCompilationLog(prog); + + // Release the resources. + nvvmDestroyProgram(&prog); + + return status; } -static void showUsage(void) { - fprintf(stderr, - "Usage: ptxgen [OPTION]... [FILE]...\n" - " [FILE] could be a .bc file or a .ll file\n"); +static void showUsage(void) +{ + fprintf(stderr, + "Usage: ptxgen [OPTION]... [FILE]...\n" + " [FILE] could be a .bc file or a .ll file\n"); } -int main(int argc, char *argv[]) { - PTXGenStatus status = PTXGEN_SUCCESS; +int main(int argc, char *argv[]) +{ + PTXGenStatus status = PTXGEN_SUCCESS; - if (argc == 1) { - showUsage(); - return PTXGEN_INVALID_USAGE; - } - - // Extract libNVVM options and the input file names. - unsigned numOptions = 0, numFilenames = 0; - const char **options = malloc((argc - 1) * sizeof(char *)); - const char **filenames = malloc((argc - 1) * sizeof(char *)); - assert(options && filenames); - for (int i = 1; i < argc; ++i) { - if (argv[i][0] == '-') { - options[numOptions] = argv[i]; - ++numOptions; - } else { - filenames[numFilenames] = argv[i]; - ++numFilenames; + if (argc == 1) { + showUsage(); + return PTXGEN_INVALID_USAGE; } - } - if (numFilenames == 0) { - showUsage(); - status = PTXGEN_INVALID_USAGE; - } else { - // Use libNVVM to generate PTX. - status = generatePTX(numOptions, options, numFilenames, filenames); - } + // Extract libNVVM options and the input file names. + unsigned numOptions = 0, numFilenames = 0; + const char **options = malloc((argc - 1) * sizeof(char *)); + const char **filenames = malloc((argc - 1) * sizeof(char *)); + assert(options && filenames); + for (int i = 1; i < argc; ++i) { + if (argv[i][0] == '-') { + options[numOptions] = argv[i]; + ++numOptions; + } + else { + filenames[numFilenames] = argv[i]; + ++numFilenames; + } + } - free(options); - free(filenames); - return status; + if (numFilenames == 0) { + showUsage(); + status = PTXGEN_INVALID_USAGE; + } + else { + // Use libNVVM to generate PTX. + status = generatePTX(numOptions, options, numFilenames, filenames); + } + + free(options); + free(filenames); + return status; } diff --git a/Samples/7_libNVVM/simple/simple.c b/Samples/7_libNVVM/simple/simple.c index 0cb4f143..94035e30 100644 --- a/Samples/7_libNVVM/simple/simple.c +++ b/Samples/7_libNVVM/simple/simple.c @@ -35,212 +35,217 @@ // If 'err' is non-zero, emit an error message and exit. #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) -static void __checkCudaErrors(CUresult err, const char *filename, int line) { - assert(filename); - if (CUDA_SUCCESS != err) { - const char *ename = NULL; - const CUresult res = cuGetErrorName(err, &ename); - fprintf(stderr, - "CUDA API Error %04d: \"%s\" from file <%s>, " - "line %i.\n", - err, ((CUDA_SUCCESS == res) ? 
ename : "Unknown"), filename, line); - exit(err); - } +static void __checkCudaErrors(CUresult err, const char *filename, int line) +{ + assert(filename); + if (CUDA_SUCCESS != err) { + const char *ename = NULL; + const CUresult res = cuGetErrorName(err, &ename); + fprintf(stderr, + "CUDA API Error %04d: \"%s\" from file <%s>, " + "line %i.\n", + err, + ((CUDA_SUCCESS == res) ? ename : "Unknown"), + filename, + line); + exit(err); + } } // Return a CUDA capable device or exit if one cannot be found. -static CUdevice cudaDeviceInit(int *devMajor, int *devMinor) { - assert(devMajor && devMinor); - CUresult err = cuInit(0); - int deviceCount = 0; - if (CUDA_SUCCESS == err) - checkCudaErrors(cuDeviceGetCount(&deviceCount)); - if (deviceCount == 0) { - fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); - exit(EXIT_FAILURE); - } +static CUdevice cudaDeviceInit(int *devMajor, int *devMinor) +{ + assert(devMajor && devMinor); + CUresult err = cuInit(0); + int deviceCount = 0; + if (CUDA_SUCCESS == err) + checkCudaErrors(cuDeviceGetCount(&deviceCount)); + if (deviceCount == 0) { + fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } - // Locate a CUDA supporting device and its name. - CUdevice cuDevice = 0; - checkCudaErrors(cuDeviceGet(&cuDevice, 0)); - char name[128]; - cuDeviceGetName(name, sizeof(name), cuDevice); - printf("Using CUDA Device [0]: %s\n", name); + // Locate a CUDA supporting device and its name. + CUdevice cuDevice = 0; + checkCudaErrors(cuDeviceGet(&cuDevice, 0)); + char name[128]; + cuDeviceGetName(name, sizeof(name), cuDevice); + printf("Using CUDA Device [0]: %s\n", name); - // Obtain the device's compute capability. - checkCudaErrors(cuDeviceGetAttribute( - devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - if (*devMajor < 5) { - fprintf(stderr, "Device 0 is not sm_50 or later\n"); - exit(EXIT_FAILURE); - } - checkCudaErrors(cuDeviceGetAttribute( - devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + // Obtain the device's compute capability. + checkCudaErrors(cuDeviceGetAttribute(devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + if (*devMajor < 5) { + fprintf(stderr, "Device 0 is not sm_50 or later\n"); + exit(EXIT_FAILURE); + } + checkCudaErrors(cuDeviceGetAttribute(devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - return cuDevice; + return cuDevice; } -static CUresult initCUDA(CUcontext *phContext, CUdevice *phDevice, - CUmodule *phModule, CUfunction *phKernel, - const char *ptx) { - assert(phContext && phDevice && phModule && phKernel && ptx); +static CUresult +initCUDA(CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUfunction *phKernel, const char *ptx) +{ + assert(phContext && phDevice && phModule && phKernel && ptx); - // Create a CUDA context on the device. - checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); + // Create a CUDA context on the device. + checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); - // Load the PTX. - checkCudaErrors(cuModuleLoadDataEx(phModule, ptx, 0, 0, 0)); + // Load the PTX. + checkCudaErrors(cuModuleLoadDataEx(phModule, ptx, 0, 0, 0)); - // Locate the kernel entry point. - checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "simple")); + // Locate the kernel entry point. 
+ checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "simple")); - return CUDA_SUCCESS; + return CUDA_SUCCESS; } -static char *loadProgramSource(const char *filename, size_t *size) { - assert(filename && size); - char *source = NULL; - *size = 0; - FILE *fh = fopen(filename, "rb"); - if (fh) { - struct stat statbuf; - stat(filename, &statbuf); - source = malloc(statbuf.st_size + 1); - assert(source); - fread(source, statbuf.st_size, 1, fh); - source[statbuf.st_size] = 0; - *size = statbuf.st_size + 1; - } else { - fprintf(stderr, "Error reading file %s\n", filename); - exit(EXIT_FAILURE); - } - return source; +static char *loadProgramSource(const char *filename, size_t *size) +{ + assert(filename && size); + char *source = NULL; + *size = 0; + FILE *fh = fopen(filename, "rb"); + if (fh) { + struct stat statbuf; + stat(filename, &statbuf); + source = malloc(statbuf.st_size + 1); + assert(source); + fread(source, statbuf.st_size, 1, fh); + source[statbuf.st_size] = 0; + *size = statbuf.st_size + 1; + } + else { + fprintf(stderr, "Error reading file %s\n", filename); + exit(EXIT_FAILURE); + } + return source; } -static char *generatePTX(const char *ir, size_t size, const char *filename, - int devMajor, int devMinor) { - assert(ir && filename); +static char *generatePTX(const char *ir, size_t size, const char *filename, int devMajor, int devMinor) +{ + assert(ir && filename); - // Create a program instance for use with libNVVM. - nvvmProgram program; - nvvmResult result = nvvmCreateProgram(&program); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmCreateProgram: Failed\n"); - exit(EXIT_FAILURE); - } + // Create a program instance for use with libNVVM. + nvvmProgram program; + nvvmResult result = nvvmCreateProgram(&program); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmCreateProgram: Failed\n"); + exit(EXIT_FAILURE); + } - // Add the NVVM IR to the program instance. - result = nvvmAddModuleToProgram(program, ir, size, filename); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmAddModuleToProgram: Failed\n"); - exit(EXIT_FAILURE); - } + // Add the NVVM IR to the program instance. + result = nvvmAddModuleToProgram(program, ir, size, filename); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmAddModuleToProgram: Failed\n"); + exit(EXIT_FAILURE); + } - // Dynamically construct the compute capability. - char arch[32] = {0}; - snprintf(arch, sizeof(arch) - 1, "-arch=compute_%d%d", devMajor, devMinor); + // Dynamically construct the compute capability. + char arch[32] = {0}; + snprintf(arch, sizeof(arch) - 1, "-arch=compute_%d%d", devMajor, devMinor); - // Compile the IR into PTX. - const char *options[] = {arch}; - result = nvvmCompileProgram(program, 1, options); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmCompileProgram: Failed\n"); - size_t logSize; - nvvmGetProgramLogSize(program, &logSize); - char *msg = malloc(logSize); - assert(msg); - nvvmGetProgramLog(program, msg); - fprintf(stderr, "%s\n", msg); - free(msg); - exit(EXIT_FAILURE); - } + // Compile the IR into PTX. + const char *options[] = {arch}; + result = nvvmCompileProgram(program, 1, options); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmCompileProgram: Failed\n"); + size_t logSize; + nvvmGetProgramLogSize(program, &logSize); + char *msg = malloc(logSize); + assert(msg); + nvvmGetProgramLog(program, msg); + fprintf(stderr, "%s\n", msg); + free(msg); + exit(EXIT_FAILURE); + } - // Obrain the resulting PTX. 
- size_t ptxSize; - result = nvvmGetCompiledResultSize(program, &ptxSize); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmGetCompiledResultSize: Failed\n"); - exit(EXIT_FAILURE); - } - char *ptx = malloc(ptxSize); - assert(ptx); - result = nvvmGetCompiledResult(program, ptx); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmGetCompiledResult: Failed\n"); + // Obtain the resulting PTX. + size_t ptxSize; + result = nvvmGetCompiledResultSize(program, &ptxSize); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmGetCompiledResultSize: Failed\n"); + exit(EXIT_FAILURE); + } + char *ptx = malloc(ptxSize); + assert(ptx); + result = nvvmGetCompiledResult(program, ptx); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmGetCompiledResult: Failed\n"); + free(ptx); + exit(EXIT_FAILURE); + } + + // Clean up the libNVVM program instance. + result = nvvmDestroyProgram(&program); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmDestroyProgram: Failed\n"); + free(ptx); + exit(EXIT_FAILURE); + } + + return ptx; +} + +int main(int argc, char **argv) +{ + const unsigned int nThreads = 32; + const unsigned int nBlocks = 1; + const size_t memSize = nThreads * nBlocks * sizeof(int); + const char *filename = "simple-gpu64.ll"; + + // Retrieve the NVVM IR from filename and create the kernel parameters. + size_t size = 0; + char *ir = loadProgramSource(filename, &size); + fprintf(stdout, "NVVM IR (.ll) file loaded\n"); + + // Initialize the device and obtain the compute capability. + int devMajor = 0, devMinor = 0; + CUdevice hDevice = cudaDeviceInit(&devMajor, &devMinor); + + // Use libNVVM to generate PTX from the NVVM IR. + char *ptx = generatePTX(ir, size, filename, devMajor, devMinor); + fprintf(stdout, "PTX generated:\n"); + fprintf(stdout, "%s\n", ptx); + + // Initialize the device and get a handle to the kernel. + CUcontext hContext = 0; + CUmodule hModule = 0; + CUfunction hKernel = 0; + checkCudaErrors(initCUDA(&hContext, &hDevice, &hModule, &hKernel, ptx)); + + // Allocate memory on the host and device. + int *hData = malloc(memSize); + if (!hData) { + fprintf(stderr, "Could not allocate host memory\n"); + exit(EXIT_FAILURE); + } + CUdeviceptr dData = 0; + checkCudaErrors(cuMemAlloc(&dData, memSize)); + + // Launch the kernel. + void *params[] = {&dData}; + checkCudaErrors(cuLaunchKernel(hKernel, nBlocks, 1, 1, nThreads, 1, 1, 0, NULL, params, NULL)); + fprintf(stdout, "CUDA kernel launched\n"); + + // Copy the result back to the host. + checkCudaErrors(cuMemcpyDtoH(hData, dData, memSize)); + + // Print the result. + for (unsigned i = 0; i < nBlocks * nThreads; i++) + fprintf(stdout, "%d ", hData[i]); + fprintf(stdout, "\n"); + + // Cleanup. + if (dData) + checkCudaErrors(cuMemFree(dData)); + if (hModule) + checkCudaErrors(cuModuleUnload(hModule)); + if (hContext) + checkCudaErrors(cuCtxDestroy(hContext)); + free(hData); + free(ir); free(ptx); - exit(EXIT_FAILURE); - } - // Cleanup the libNVVM program instance. - result = nvvmDestroyProgram(&program); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmDestroyProgram: Failed\n"); - free(ptx); - exit(EXIT_FAILURE); - } - - return ptx; -} - -int main(int argc, char **argv) { - const unsigned int nThreads = 32; - const unsigned int nBlocks = 1; - const size_t memSize = nThreads * nBlocks * sizeof(int); - const char *filename = "simple-gpu64.ll"; - - // Retrieve the NVVM IR from filename and create the kernel parameters.
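generatePTX() above follows the canonical libNVVM sequence: create a program object, attach NVVM IR modules, compile with an `-arch=compute_XY` option derived from the device, then fetch the PTX text. A condensed sketch of that call order, with the per-call error handling deliberately elided (the sample itself checks every result, as shown above) and a hard-coded `compute_50` standing in for the dynamically built option:

```c
#include <stdlib.h>
#include <nvvm.h>

// Condensed libNVVM flow; 'ir' holds NVVM IR text of length 'size'.
// Error checking is omitted here purely to expose the call order.
static char *compile_ir_to_ptx(const char *ir, size_t size)
{
    nvvmProgram prog;
    nvvmCreateProgram(&prog);
    nvvmAddModuleToProgram(prog, ir, size, "module");

    const char *opts[] = {"-arch=compute_50"};
    nvvmCompileProgram(prog, 1, opts);

    size_t ptxSize = 0;
    nvvmGetCompiledResultSize(prog, &ptxSize); // size includes the trailing NUL
    char *ptx = malloc(ptxSize);
    nvvmGetCompiledResult(prog, ptx);

    nvvmDestroyProgram(&prog);
    return ptx; // caller frees
}
```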
- size_t size = 0; - char *ir = loadProgramSource(filename, &size); - fprintf(stdout, "NVVM IR (.ll) file loaded\n"); - - // Initialize the device and obtain the compute capability. - int devMajor = 0, devMinor = 0; - CUdevice hDevice = cudaDeviceInit(&devMajor, &devMinor); - - // Use libNVVM to generate PTX from the NVVM IR. - char *ptx = generatePTX(ir, size, filename, devMajor, devMinor); - fprintf(stdout, "PTX generated:\n"); - fprintf(stdout, "%s\n", ptx); - - // Initialize the device and get a handle to the kernel. - CUcontext hContext = 0; - CUmodule hModule = 0; - CUfunction hKernel = 0; - checkCudaErrors(initCUDA(&hContext, &hDevice, &hModule, &hKernel, ptx)); - - // Allocate memory on the host and device. - int *hData = malloc(memSize); - if (!hData) { - fprintf(stderr, "Could not allocate host memory\n"); - exit(EXIT_FAILURE); - } - CUdeviceptr dData = 0; - checkCudaErrors(cuMemAlloc(&dData, memSize)); - - // Launch the kernel. - void *params[] = {&dData}; - checkCudaErrors(cuLaunchKernel(hKernel, nBlocks, 1, 1, nThreads, 1, 1, 0, - NULL, params, NULL)); - fprintf(stdout, "CUDA kernel launched\n"); - - // Copy the result back to the host. - checkCudaErrors(cuMemcpyDtoH(hData, dData, memSize)); - - // Print the result. - for (unsigned i = 0; i < nBlocks * nThreads; i++) - fprintf(stdout, "%d ", hData[i]); - fprintf(stdout, "\n"); - - // Cleanup. - if (dData) - checkCudaErrors(cuMemFree(dData)); - if (hModule) - checkCudaErrors(cuModuleUnload(hModule)); - if (hContext) - checkCudaErrors(cuCtxDestroy(hContext)); - free(hData); - free(ir); - free(ptx); - - return 0; + return 0; } diff --git a/Samples/7_libNVVM/syscalls/CMakeLists.txt b/Samples/7_libNVVM/syscalls/CMakeLists.txt index 48368041..3ea3ced7 100644 --- a/Samples/7_libNVVM/syscalls/CMakeLists.txt +++ b/Samples/7_libNVVM/syscalls/CMakeLists.txt @@ -35,6 +35,6 @@ add_test(NAME test-syscalls-vprintf set_tests_properties(test-syscalls-vprintf test-syscalls-malloc-free PROPERTIES FIXTURES_REQUIRED PTXGENTEST) - + file(COPY malloc-free.ll DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") file(COPY vprintf.ll DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/Samples/7_libNVVM/uvmlite/README.md b/Samples/7_libNVVM/uvmlite/README.md index 643212af..6ab7349a 100644 --- a/Samples/7_libNVVM/uvmlite/README.md +++ b/Samples/7_libNVVM/uvmlite/README.md @@ -31,7 +31,7 @@ that the attribute can only be used with variables in the global address space.) Accessing a managed variable in the host ----------------------------------------- +---------------------------------------- To access a managed variable defined in the NVVM IR code, we should retrieve a device pointer first, which can be done using cuModuleGetGlobal(). diff --git a/Samples/7_libNVVM/uvmlite/uvmlite.c b/Samples/7_libNVVM/uvmlite/uvmlite.c index 3f8c54b9..9cfead84 100644 --- a/Samples/7_libNVVM/uvmlite/uvmlite.c +++ b/Samples/7_libNVVM/uvmlite/uvmlite.c @@ -34,290 +34,287 @@ #include #include -#define ERROR_IF(expr) \ - if (expr) { \ - fprintf(stderr, "Failed check at %s:%d\n", __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } +#define ERROR_IF(expr) \ + if (expr) { \ + fprintf(stderr, "Failed check at %s:%d\n", __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } // If 'err' is non-zero, emit an error message and exit. 
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) -static void __checkCudaErrors(CUresult err, const char *filename, int line) { - assert(filename); - if (CUDA_SUCCESS != err) { - const char *ename = NULL; - const CUresult res = cuGetErrorName(err, &ename); - fprintf(stderr, - "CUDA API Error %04d: \"%s\" from file <%s>, " - "line %i.\n", - err, ((CUDA_SUCCESS == res) ? ename : "Unknown"), filename, line); - exit(err); - } +static void __checkCudaErrors(CUresult err, const char *filename, int line) +{ + assert(filename); + if (CUDA_SUCCESS != err) { + const char *ename = NULL; + const CUresult res = cuGetErrorName(err, &ename); + fprintf(stderr, + "CUDA API Error %04d: \"%s\" from file <%s>, " + "line %i.\n", + err, + ((CUDA_SUCCESS == res) ? ename : "Unknown"), + filename, + line); + exit(err); + } } // Compile the NVVM IR into PTX. -static char *generatePTX(const char *ll, size_t size, const char *filename, - int devMajor, int devMinor) { - assert(ll && filename); +static char *generatePTX(const char *ll, size_t size, const char *filename, int devMajor, int devMinor) +{ + assert(ll && filename); - // Create a program instance for libNVVM. - nvvmProgram program; - nvvmResult result = nvvmCreateProgram(&program); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmCreateProgram: Failed\n"); - exit(EXIT_FAILURE); - } + // Create a program instance for libNVVM. + nvvmProgram program; + nvvmResult result = nvvmCreateProgram(&program); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmCreateProgram: Failed\n"); + exit(EXIT_FAILURE); + } - // Add the NVVM IR as a module to our libNVVM program. - result = nvvmAddModuleToProgram(program, ll, size, filename); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmAddModuleToProgram: Failed\n"); - exit(EXIT_FAILURE); - } + // Add the NVVM IR as a module to our libNVVM program. + result = nvvmAddModuleToProgram(program, ll, size, filename); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmAddModuleToProgram: Failed\n"); + exit(EXIT_FAILURE); + } - // Dynamically construct the compute capability. - char arch[32] = {0}; - snprintf(arch, sizeof(arch) - 1, "-arch=compute_%d%d", devMajor, devMinor); + // Dynamically construct the compute capability. + char arch[32] = {0}; + snprintf(arch, sizeof(arch) - 1, "-arch=compute_%d%d", devMajor, devMinor); - // Compile the IR into PTX. - const char *options[] = {arch}; - result = nvvmCompileProgram(program, 1, options); - if (result != NVVM_SUCCESS) { - char *Msg = NULL; - size_t LogSize; - fprintf(stderr, "nvvmCompileProgram: Failed\n"); - nvvmGetProgramLogSize(program, &LogSize); - Msg = (char *)malloc(LogSize); - nvvmGetProgramLog(program, Msg); - fprintf(stderr, "%s\n", Msg); - free(Msg); - exit(EXIT_FAILURE); - } + // Compile the IR into PTX. 
+ const char *options[] = {arch}; + result = nvvmCompileProgram(program, 1, options); + if (result != NVVM_SUCCESS) { + char *Msg = NULL; + size_t LogSize; + fprintf(stderr, "nvvmCompileProgram: Failed\n"); + nvvmGetProgramLogSize(program, &LogSize); + Msg = (char *)malloc(LogSize); + nvvmGetProgramLog(program, Msg); + fprintf(stderr, "%s\n", Msg); + free(Msg); + exit(EXIT_FAILURE); + } - size_t ptxSize = 0; - result = nvvmGetCompiledResultSize(program, &ptxSize); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmGetCompiledResultSize: Failed\n"); - exit(EXIT_FAILURE); - } + size_t ptxSize = 0; + result = nvvmGetCompiledResultSize(program, &ptxSize); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmGetCompiledResultSize: Failed\n"); + exit(EXIT_FAILURE); + } - char *ptx = malloc(ptxSize); - assert(ptx); - result = nvvmGetCompiledResult(program, ptx); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmGetCompiledResult: Failed\n"); - free(ptx); - exit(EXIT_FAILURE); - } + char *ptx = malloc(ptxSize); + assert(ptx); + result = nvvmGetCompiledResult(program, ptx); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmGetCompiledResult: Failed\n"); + free(ptx); + exit(EXIT_FAILURE); + } - result = nvvmDestroyProgram(&program); - if (result != NVVM_SUCCESS) { - fprintf(stderr, "nvvmDestroyProgram: Failed\n"); - free(ptx); - exit(EXIT_FAILURE); - } + result = nvvmDestroyProgram(&program); + if (result != NVVM_SUCCESS) { + fprintf(stderr, "nvvmDestroyProgram: Failed\n"); + free(ptx); + exit(EXIT_FAILURE); + } - return ptx; + return ptx; } -static char *loadProgramSource(const char *filename, size_t *size) { - assert(filename && size); - *size = 0; - char *source = NULL; - FILE *fh = fopen(filename, "rb"); - if (fh) { - struct stat statbuf; - stat(filename, &statbuf); - source = (char *)malloc(statbuf.st_size + 1); - if (source) { - fread(source, statbuf.st_size, 1, fh); - source[statbuf.st_size] = 0; - *size = statbuf.st_size + 1; +static char *loadProgramSource(const char *filename, size_t *size) +{ + assert(filename && size); + *size = 0; + char *source = NULL; + FILE *fh = fopen(filename, "rb"); + if (fh) { + struct stat statbuf; + stat(filename, &statbuf); + source = (char *)malloc(statbuf.st_size + 1); + if (source) { + fread(source, statbuf.st_size, 1, fh); + source[statbuf.st_size] = 0; + *size = statbuf.st_size + 1; + } } - } else { - fprintf(stderr, "Error reading file %s\n", filename); - exit(EXIT_FAILURE); - } - return source; + else { + fprintf(stderr, "Error reading file %s\n", filename); + exit(EXIT_FAILURE); + } + return source; } // Return the device compute capability in major and minor. -static CUdevice cudaDeviceInit(int *major, int *minor) { - assert(major && minor); - // Count the number of CUDA compute capable devices.. - CUresult err = cuInit(0); - int deviceCount = 0; - if (CUDA_SUCCESS == err) - checkCudaErrors(cuDeviceGetCount(&deviceCount)); - if (deviceCount == 0) { - fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); - exit(EXIT_FAILURE); - } +static CUdevice cudaDeviceInit(int *major, int *minor) +{ + assert(major && minor); + // Count the number of CUDA compute capable devices. + CUresult err = cuInit(0); + int deviceCount = 0; + if (CUDA_SUCCESS == err) + checkCudaErrors(cuDeviceGetCount(&deviceCount)); + if (deviceCount == 0) { + fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } - // Get the first device discovered (device 0) and print its name.
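When `nvvmCompileProgram` fails, the samples retrieve the program log before exiting; that log is the only diagnostic libNVVM provides for malformed IR or bad options. The same retrieval in isolation, assuming `prog` is a program whose compile just failed (the helper name is illustrative):

```c
#include <stdio.h>
#include <stdlib.h>
#include <nvvm.h>

// Fetch and print the libNVVM program log after a failed compile.
static void print_nvvm_log(nvvmProgram prog)
{
    size_t logSize = 0;
    if (nvvmGetProgramLogSize(prog, &logSize) == NVVM_SUCCESS && logSize > 1) {
        char *log = malloc(logSize);
        if (log && nvvmGetProgramLog(prog, log) == NVVM_SUCCESS)
            fprintf(stderr, "%s\n", log);
        free(log);
    }
}
```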
- CUdevice cuDevice = 0; - checkCudaErrors(cuDeviceGet(&cuDevice, 0)); - char name[128] = {0}; - checkCudaErrors(cuDeviceGetName(name, sizeof(name), cuDevice)); - printf("Using CUDA Device [0]: %s\n", name); + // Get the first device discovered (device 0) and print its name. + CUdevice cuDevice = 0; + checkCudaErrors(cuDeviceGet(&cuDevice, 0)); + char name[128] = {0}; + checkCudaErrors(cuDeviceGetName(name, sizeof(name), cuDevice)); + printf("Using CUDA Device [0]: %s\n", name); - // Get and test the compute capability. - checkCudaErrors(cuDeviceGetAttribute( - major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - checkCudaErrors(cuDeviceGetAttribute( - minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - printf("compute capability = %d.%d\n", *major, *minor); - if (*major < 5) { - fprintf(stderr, "Device 0 is not sm_50 or later\n"); - exit(EXIT_FAILURE); - } + // Get and test the compute capability. + checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); + checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); + printf("compute capability = %d.%d\n", *major, *minor); + if (*major < 5) { + fprintf(stderr, "Device 0 is not sm_50 or later\n"); + exit(EXIT_FAILURE); + } - // Check if managed memory is supported. - int supportsUvm = 0; - checkCudaErrors(cuDeviceGetAttribute( - &supportsUvm, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, cuDevice)); - if (!supportsUvm) { - printf("This device does not support managed memory."); - exit(EXIT_SUCCESS); - } + // Check if managed memory is supported. + int supportsUvm = 0; + checkCudaErrors(cuDeviceGetAttribute(&supportsUvm, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, cuDevice)); + if (!supportsUvm) { + printf("This device does not support managed memory.\n"); + exit(EXIT_SUCCESS); + } - // Check if unified addressing is supported (host and device share same - // the address space). - int supportsUva = 0; - checkCudaErrors(cuDeviceGetAttribute( - &supportsUva, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)); - if (!supportsUva) { - printf("This device does not support a unified address space."); - exit(EXIT_SUCCESS); - } + // Check if unified addressing is supported (host and device share + // the same address space). + int supportsUva = 0; + checkCudaErrors(cuDeviceGetAttribute(&supportsUva, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)); + if (!supportsUva) { + printf("This device does not support a unified address space.\n"); + exit(EXIT_SUCCESS); + } - return cuDevice; + return cuDevice; } -static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, - CUmodule *phModule, CUfunction *phKernel) { - assert(phContext && phDevice && phModule && phKernel); +static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUfunction *phKernel) +{ + assert(phContext && phDevice && phModule && phKernel); - // Initialize CUDA and obtain the device's compute capability. - int major = 0, minor = 0; - *phDevice = cudaDeviceInit(&major, &minor); + // Initialize CUDA and obtain the device's compute capability. + int major = 0, minor = 0; + *phDevice = cudaDeviceInit(&major, &minor); - // Create a context on the device. - checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); + // Create a context on the device. + checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); - // Get the NVVM IR from file.
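cudaDeviceInit() above gates the uvmlite demo on two device attributes; managed memory and unified addressing are independent capabilities, so both are queried. The same gate as a small predicate, a sketch assuming `cuInit()` has already succeeded:

```c
#include <cuda.h>

// Returns nonzero if 'dev' can run the managed-memory demo: it must support
// managed allocations and share one address space with the host.
static int device_supports_uvm_demo(CUdevice dev)
{
    int managed = 0, uva = 0;
    cuDeviceGetAttribute(&managed, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
    cuDeviceGetAttribute(&uva, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    return managed && uva;
}
```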
- size_t size = 0; - const char *filename = "uvmlite64.ll"; - char *ll = loadProgramSource(filename, &size); - fprintf(stdout, "NVVM IR ll file loaded\n"); + // Get the NVVM IR from file. + size_t size = 0; + const char *filename = "uvmlite64.ll"; + char *ll = loadProgramSource(filename, &size); + fprintf(stdout, "NVVM IR ll file loaded\n"); - // Use libNVVM to generate PTX. - char *ptx = generatePTX(ll, size, filename, major, minor); - fprintf(stdout, "PTX generated:\n"); - fprintf(stdout, "%s\n", ptx); + // Use libNVVM to generate PTX. + char *ptx = generatePTX(ll, size, filename, major, minor); + fprintf(stdout, "PTX generated:\n"); + fprintf(stdout, "%s\n", ptx); - // Load module from PTX. - checkCudaErrors(cuModuleLoadDataEx(phModule, ptx, 0, NULL, NULL)); + // Load module from PTX. + checkCudaErrors(cuModuleLoadDataEx(phModule, ptx, 0, NULL, NULL)); - // Locate the kernel entry point. - checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "test_kernel")); + // Locate the kernel entry point. + checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "test_kernel")); - free(ll); - free(ptx); - return CUDA_SUCCESS; + free(ll); + free(ptx); + return CUDA_SUCCESS; } -int main(void) { - const unsigned int nThreads = 1; - const unsigned int nBlocks = 1; +int main(void) +{ + const unsigned int nThreads = 1; + const unsigned int nBlocks = 1; - // Pointers to the variables in the managed memory. - // See uvmlite64.ll for their definition. - CUdeviceptr devp_xxx, devp_yyy; - size_t size_xxx, size_yyy; - int *p_xxx, *p_yyy; + // Pointers to the variables in the managed memory. + // See uvmlite64.ll for their definition. + CUdeviceptr devp_xxx, devp_yyy; + size_t size_xxx, size_yyy; + int *p_xxx, *p_yyy; - // Initialize the device and get a handle to the kernel - CUcontext hContext = 0; - CUdevice hDevice = 0; - CUmodule hModule = 0; - CUfunction hKernel = 0; - checkCudaErrors(buildKernel(&hContext, &hDevice, &hModule, &hKernel)); + // Initialize the device and get a handle to the kernel + CUcontext hContext = 0; + CUdevice hDevice = 0; + CUmodule hModule = 0; + CUfunction hKernel = 0; + checkCudaErrors(buildKernel(&hContext, &hDevice, &hModule, &hKernel)); - // Whether or not a device supports unified addressing may be queried by - // calling cuDeviceGetAttribute() with the deivce attribute - // CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. - { - int attrVal; - checkCudaErrors(cuDeviceGetAttribute( - &attrVal, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, hDevice)); - ERROR_IF(attrVal != 1); - } + // Whether or not a device supports unified addressing may be queried by + // calling cuDeviceGetAttribute() with the device attribute + // CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + { + int attrVal; + checkCudaErrors(cuDeviceGetAttribute(&attrVal, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, hDevice)); + ERROR_IF(attrVal != 1); + } - // Get the address of the variable xxx, yyy in the managed memory. - checkCudaErrors(cuModuleGetGlobal(&devp_xxx, &size_xxx, hModule, "xxx")); - checkCudaErrors(cuModuleGetGlobal(&devp_yyy, &size_yyy, hModule, "yyy")); + // Get the address of the variable xxx, yyy in the managed memory. + checkCudaErrors(cuModuleGetGlobal(&devp_xxx, &size_xxx, hModule, "xxx")); + checkCudaErrors(cuModuleGetGlobal(&devp_yyy, &size_yyy, hModule, "yyy")); - // Whether or not the pointer points to managed memory may be queried by - // calling cuPointerGetAttribute() with the pointer attribute - // CU_POINTER_ATTRIBUTE_IS_MANAGED.
- { - unsigned int attrVal; + // Whether or not the pointer points to managed memory may be queried by + // calling cuPointerGetAttribute() with the pointer attribute + // CU_POINTER_ATTRIBUTE_IS_MANAGED. + { + unsigned int attrVal; - checkCudaErrors(cuPointerGetAttribute( - &attrVal, CU_POINTER_ATTRIBUTE_IS_MANAGED, devp_xxx)); - ERROR_IF(attrVal != 1); - checkCudaErrors(cuPointerGetAttribute( - &attrVal, CU_POINTER_ATTRIBUTE_IS_MANAGED, devp_yyy)); - ERROR_IF(attrVal != 1); - } + checkCudaErrors(cuPointerGetAttribute(&attrVal, CU_POINTER_ATTRIBUTE_IS_MANAGED, devp_xxx)); + ERROR_IF(attrVal != 1); + checkCudaErrors(cuPointerGetAttribute(&attrVal, CU_POINTER_ATTRIBUTE_IS_MANAGED, devp_yyy)); + ERROR_IF(attrVal != 1); + } - // Since CUdeviceptr is opaque, it is safe to use cuPointerGetAttribute to get - // the host pointers. - { - void *host_ptr_xxx, *host_ptr_yyy; + // Since CUdeviceptr is opaque, it is safe to use cuPointerGetAttribute to get + // the host pointers. + { + void *host_ptr_xxx, *host_ptr_yyy; - checkCudaErrors(cuPointerGetAttribute( - &host_ptr_xxx, CU_POINTER_ATTRIBUTE_HOST_POINTER, devp_xxx)); - checkCudaErrors(cuPointerGetAttribute( - &host_ptr_yyy, CU_POINTER_ATTRIBUTE_HOST_POINTER, devp_yyy)); + checkCudaErrors(cuPointerGetAttribute(&host_ptr_xxx, CU_POINTER_ATTRIBUTE_HOST_POINTER, devp_xxx)); + checkCudaErrors(cuPointerGetAttribute(&host_ptr_yyy, CU_POINTER_ATTRIBUTE_HOST_POINTER, devp_yyy)); - p_xxx = (int *)host_ptr_xxx; - p_yyy = (int *)host_ptr_yyy; - } + p_xxx = (int *)host_ptr_xxx; + p_yyy = (int *)host_ptr_yyy; + } - printf("The initial value of xxx initialized by the device = %d\n", *p_xxx); - printf("The initial value of yyy initialized by the device = %d\n", *p_yyy); + printf("The initial value of xxx initialized by the device = %d\n", *p_xxx); + printf("The initial value of yyy initialized by the device = %d\n", *p_yyy); - ERROR_IF(*p_xxx != 10); - ERROR_IF(*p_yyy != 100); + ERROR_IF(*p_xxx != 10); + ERROR_IF(*p_yyy != 100); - // The host adds 1 and 11 to xxx and yyy. - *p_xxx += 1; - *p_yyy += 11; + // The host adds 1 and 11 to xxx and yyy. + *p_xxx += 1; + *p_yyy += 11; - printf("The host added 1 and 11 to xxx and yyy.\n"); + printf("The host added 1 and 11 to xxx and yyy.\n"); - // Launch the kernel with the following parameters. - { - void *params[] = {(void *)&devp_xxx}; - checkCudaErrors(cuLaunchKernel(hKernel, nBlocks, 1, 1, nThreads, 1, 1, 0, - NULL, params, NULL)); - } - checkCudaErrors(cuCtxSynchronize()); + // Launch the kernel with the following parameters. 
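main() above demonstrates the access pattern the uvmlite README describes: `cuModuleGetGlobal()` resolves a managed variable defined in the NVVM IR to a `CUdeviceptr`, and because `CUdeviceptr` is opaque, `cuPointerGetAttribute()` with `CU_POINTER_ATTRIBUTE_HOST_POINTER` is the way to obtain a pointer the host can dereference. Folded into one helper, this is a sketch of that flow (the helper name is invented; `"xxx"` and `"yyy"` are the sample's globals):

```c
#include <cuda.h>

// Resolve a module-scope managed variable and return a host-dereferenceable
// pointer to it, or NULL if the variable is not managed memory.
static int *map_managed_global(CUmodule mod, const char *name)
{
    CUdeviceptr dptr;
    size_t bytes;
    if (cuModuleGetGlobal(&dptr, &bytes, mod, name) != CUDA_SUCCESS)
        return NULL;

    unsigned int isManaged = 0;
    cuPointerGetAttribute(&isManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, dptr);
    if (!isManaged)
        return NULL;

    void *hptr = NULL;
    cuPointerGetAttribute(&hptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, dptr);
    return (int *)hptr;
}
```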
+ { + void *params[] = {(void *)&devp_xxx}; + checkCudaErrors(cuLaunchKernel(hKernel, nBlocks, 1, 1, nThreads, 1, 1, 0, NULL, params, NULL)); + } + checkCudaErrors(cuCtxSynchronize()); - printf("kernel added 20 and 30 to xxx and yyy, respectively.\n"); - printf("The final value checked in the host: xxx = %d, yyy = %d\n", *p_xxx, - *p_yyy); + printf("kernel added 20 and 30 to xxx and yyy, respectively.\n"); + printf("The final value checked in the host: xxx = %d, yyy = %d\n", *p_xxx, *p_yyy); - if (hModule) { - checkCudaErrors(cuModuleUnload(hModule)); - hModule = 0; - } - if (hContext) { - checkCudaErrors(cuCtxDestroy(hContext)); - hContext = 0; - } + if (hModule) { + checkCudaErrors(cuModuleUnload(hModule)); + hModule = 0; + } + if (hContext) { + checkCudaErrors(cuCtxDestroy(hContext)); + hContext = 0; + } - return 0; + return 0; } diff --git a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu index e48de741..1c459c5c 100644 --- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu +++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu @@ -31,96 +31,99 @@ // synchronization. This app requires GLES 3.2 or higher //---------------------------INCLUDES---------------------------------// -#include -#include -#include -#include -#include "graphics_interface.h" -#include -#include -#include #include #include #include +#include +#include +#include +#include +#include +#include +#include + #include "egl_common.h" +#include "graphics_interface.h" //---------------------------DEFINES---------------------------------// #define MAX_ITR 100 #define FAILURE 0 #define SUCCESS 1 -#define WAIVED 2 +#define WAIVED 2 #define BLOCK_SIZE 16 -#define GL_READ 0 +#define GL_READ 0 #define GL_WRITE 1 //---------------------------MACROS---------------------------------// // Error-checking wrapper around GL calls -#define GL_SAFE_CALL(call) \ - { \ - GLenum err; \ - call; \ - err = glGetError(); \ - if (err != GL_NO_ERROR) { \ - fprintf(stderr, "%s:%d GL error: %d\n", __FILE__, __LINE__, err); \ - cleanup(FAILURE); \ - } \ - } +#define GL_SAFE_CALL(call) \ + { \ + GLenum err; \ + call; \ + err = glGetError(); \ + if (err != GL_NO_ERROR) { \ + fprintf(stderr, "%s:%d GL error: %d\n", __FILE__, __LINE__, err); \ + cleanup(FAILURE); \ + } \ + } -#define GL_SAFE_CALL_NO_CLEANUP(call, err) \ - { \ - GLenum status; \ - call; \ - status = glGetError(); \ - if (status != GL_NO_ERROR) { \ - fprintf(stderr, "%s:%d GL error: %d\n", __FILE__, __LINE__, status); \ - err = status; \ - } \ - } +#define GL_SAFE_CALL_NO_CLEANUP(call, err) \ + { \ + GLenum status; \ + call; \ + status = glGetError(); \ + if (status != GL_NO_ERROR) { \ + fprintf(stderr, "%s:%d GL error: %d\n", __FILE__, __LINE__, status); \ + err = status; \ + } \ + } // Error-checking wrapper around CUDA calls (taken from cutil.h) -#define CUDA_SAFE_CALL(call) \ - do { \ - cudaError err = call; \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ - __LINE__, cudaGetErrorString(err)); \ - cleanup(FAILURE); \ - } \ - } while (0) +#define CUDA_SAFE_CALL(call) \ + do { \ + cudaError err = call; \ + if (cudaSuccess != err) { \ + fprintf( \ + stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + cleanup(FAILURE); \ + } \ + } while (0) -#define 
CUDA_SAFE_CALL_NO_CLEANUP(call, err) \ - do { \ - cudaError status = call; \ - if (cudaSuccess != status) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \ - __LINE__, cudaGetErrorString(status)); \ - err = status; \ - } \ - } while (0) +#define CUDA_SAFE_CALL_NO_CLEANUP(call, err) \ + do { \ + cudaError status = call; \ + if (cudaSuccess != status) { \ + fprintf( \ + stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + err = status; \ + } \ + } while (0) #if defined(EXTENSION_LIST) EXTENSION_LIST(EXTLST_DECL) typedef void (*extlst_fnptr_t)(void); -static struct { - extlst_fnptr_t *fnptr; - char const *name; +static struct +{ + extlst_fnptr_t *fnptr; + char const *name; } extensionList[] = {EXTENSION_LIST(EXTLST_ENTRY)}; -int eglSetupExtensions(void) { - unsigned int i; +int eglSetupExtensions(void) +{ + unsigned int i; - for (i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) { - *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name); - if (*extensionList[i].fnptr == NULL) { - printf("Couldn't get address of %s()\n", extensionList[i].name); - return 0; + for (i = 0; i < (sizeof(extensionList) / sizeof(*extensionList)); i++) { + *extensionList[i].fnptr = eglGetProcAddress(extensionList[i].name); + if (*extensionList[i].fnptr == NULL) { + printf("Couldn't get address of %s()\n", extensionList[i].name); + return 0; + } } - } - return 1; + return 1; } #endif @@ -141,18 +144,18 @@ int useGpu = 1; // CUDA Resource CUgraphicsResource writeResource = NULL; -CUgraphicsResource readResource = NULL; -CUarray writeArray, readArray; -CUdevice device; -CUcontext context; +CUgraphicsResource readResource = NULL; +CUarray writeArray, readArray; +CUdevice device; +CUcontext context; // Which device to run on unsigned int dev = 0; // Default width, height, and iterations value -int width = 2048; +int width = 2048; int height = 2048; -int itr = MAX_ITR; +int itr = MAX_ITR; // Error check variable __device__ static unsigned int numErrors = 0; @@ -160,7 +163,7 @@ __device__ static unsigned int numErrors = 0; //-----------------------FUNCTION PROTOTYPES------------------------// void checkSync(int argc, char **argv); -int parseCmdLine(int argc, char **argv); +int parseCmdLine(int argc, char **argv); void printUsage(void); void cleanup(int status); void exitHandler(void); @@ -168,339 +171,330 @@ void printStatus(int status); void checkSyncOnCPU(void); void checkSyncOnGPU(EGLDisplay dpy); -__global__ void verify_and_update_kernel(CUsurfObject write, CUsurfObject read, - char expected, char newval, int width, - int height); +__global__ void +verify_and_update_kernel(CUsurfObject write, CUsurfObject read, char expected, char newval, int width, int height); extern "C" cudaError_t cudaGetValueMismatch(); //-----------------------FUNCTION DEFINITIONS------------------------// -int main(int argc, char *argv[]) { +int main(int argc, char *argv[]) +{ #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - parseCmdLine(argc, argv); - atexit(exitHandler); + parseCmdLine(argc, argv); + atexit(exitHandler); - checkSync(argc, argv); - return 0; + checkSync(argc, argv); + return 0; } -int parseCmdLine(int argc, char **argv) { - int i; - for (i = 1; i < argc; i++) { - if (strcmp(argv[i], "-cpu") == 0) { - useGpu = 0; +int parseCmdLine(int argc, char **argv) +{ + int i; + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-cpu") == 0) { + useGpu = 0; + } + + if (strcmp(argv[i], 
"-h") == 0) { + printUsage(); + cleanup(SUCCESS); + } + + if (strcmp(argv[i], "-width") == 0) { + ++i; + if (i == argc) { + printf("width option must be followed by value\n"); + return FAILURE; + } + if (sscanf(argv[i], "%d", &width) != 1) { + printf("Error: invalid width value\n"); + return FAILURE; + } + } + + if (strcmp(argv[i], "-height") == 0) { + ++i; + if (i == argc) { + printf("height option must be followed by value\n"); + return FAILURE; + } + if (sscanf(argv[i], "%d", &height) != 1) { + printf("Error: invalid height value\n"); + return FAILURE; + } + } + if (strcmp(argv[i], "-itr") == 0) { + ++i; + if (i == argc) { + printf("itr option must be followed by iteration value\n"); + return FAILURE; + } + if (sscanf(argv[i], "%d", &itr) != 1) { + printf("Error: invalid iteration value\n"); + return FAILURE; + } + } } - if (strcmp(argv[i], "-h") == 0) { - printUsage(); - cleanup(SUCCESS); - } - - if (strcmp(argv[i], "-width") == 0) { - ++i; - if (i == argc) { - printf("width option must be followed by value\n"); - return FAILURE; - } - if (sscanf(argv[i], "%d", &width) != 1) { - printf("Error: invalid width value\n"); - return FAILURE; - } - } - - if (strcmp(argv[i], "-height") == 0) { - ++i; - if (i == argc) { - printf("height option must be followed by value\n"); - return FAILURE; - } - if (sscanf(argv[i], "%d", &height) != 1) { - printf("Error: invalid height value\n"); - return FAILURE; - } - } - if (strcmp(argv[i], "-itr") == 0) { - ++i; - if (i == argc) { - printf("itr option must be followed by iteration value\n"); - return FAILURE; - } - if (sscanf(argv[i], "%d", &itr) != 1) { - printf("Error: invalid iteration value\n"); - return FAILURE; - } - } - } - - return SUCCESS; + return SUCCESS; } -void printUsage(void) { - printf("Usage:\n"); - printf("\t-h\tPrint command line options\n"); - printf("\t-cpu\tSync on the CPU instead of the GPU\n"); - printf("\t-width w\tSet the width to w\n"); - printf("\t-height h\tSet the height to h\n"); - printf("\t-itr i\tSet number of iterations to i\n"); +void printUsage(void) +{ + printf("Usage:\n"); + printf("\t-h\tPrint command line options\n"); + printf("\t-cpu\tSync on the CPU instead of the GPU\n"); + printf("\t-width w\tSet the width to w\n"); + printf("\t-height h\tSet the height to h\n"); + printf("\t-itr i\tSet number of iterations to i\n"); } -void checkSync(int argc, char **argv) { - int x, y; - int bufferSize = width * height * 4; - unsigned char *pSurf_read = NULL, *pSurf_write = NULL; - int integrated; +void checkSync(int argc, char **argv) +{ + int x, y; + int bufferSize = width * height * 4; + unsigned char *pSurf_read = NULL, *pSurf_write = NULL; + int integrated; - CUresult status = CUDA_SUCCESS; + CUresult status = CUDA_SUCCESS; - // Init values for variables - x = y = 0; + // Init values for variables + x = y = 0; - if (CUDA_SUCCESS != (status = cuInit(0))) { - printf("Failed to initialize CUDA\n"); - } - device = findCudaDeviceDRV(argc, (const char **)argv); + if (CUDA_SUCCESS != (status = cuInit(0))) { + printf("Failed to initialize CUDA\n"); + } + device = findCudaDeviceDRV(argc, (const char **)argv); - if (CUDA_SUCCESS != (status = cuCtxCreate(&context, 0, device))) { - printf("failed to create CUDA context\n"); - } - cuCtxPushCurrent(context); + if (CUDA_SUCCESS != (status = cuCtxCreate(&context, 0, device))) { + printf("failed to create CUDA context\n"); + } + cuCtxPushCurrent(context); - status = - cuDeviceGetAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, device); - if (status != CUDA_SUCCESS) { - 
printf("Failed to get device attribute CU_DEVICE_ATTRIBUTE_INTEGRATED\n"); - cleanup(FAILURE); - } + status = cuDeviceGetAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, device); + if (status != CUDA_SUCCESS) { + printf("Failed to get device attribute CU_DEVICE_ATTRIBUTE_INTEGRATED\n"); + cleanup(FAILURE); + } - if (integrated != 1) { - printf( - "EGLSync_CUDAEvent_Interop does not support dGPU. Waiving sample.\n"); - cleanup(WAIVED); - } + if (integrated != 1) { + printf("EGLSync_CUDAEvent_Interop does not support dGPU. Waiving sample.\n"); + cleanup(WAIVED); + } #if (defined(__arm__) || defined(__aarch64__)) && defined(__linux__) - graphics_setup_window(0, 0, width, height, "EGLSync_CUDA_Interop"); + graphics_setup_window(0, 0, width, height, "EGLSync_CUDA_Interop"); #endif - pSurf_read = (unsigned char *)malloc(bufferSize); - pSurf_write = (unsigned char *)malloc(bufferSize); - if (pSurf_read == NULL || pSurf_write == NULL) { - printf("malloc failed\n"); - cleanup(FAILURE); - } - - for (x = 0; x < width; x++) { - for (y = 0; y < height; y++) { - pSurf_read[(y * width + x) * 4] = 1; - pSurf_read[(y * width + x) * 4 + 1] = 1; - pSurf_read[(y * width + x) * 4 + 2] = 1; - pSurf_read[(y * width + x) * 4 + 3] = 1; - pSurf_write[(y * width + x) * 4] = 0; - pSurf_write[(y * width + x) * 4 + 1] = 0; - pSurf_write[(y * width + x) * 4 + 2] = 0; - pSurf_write[(y * width + x) * 4 + 3] = 0; + pSurf_read = (unsigned char *)malloc(bufferSize); + pSurf_write = (unsigned char *)malloc(bufferSize); + if (pSurf_read == NULL || pSurf_write == NULL) { + printf("malloc failed\n"); + cleanup(FAILURE); } - } - // NOP call to error-check the above glut calls - GL_SAFE_CALL({}); + for (x = 0; x < width; x++) { + for (y = 0; y < height; y++) { + pSurf_read[(y * width + x) * 4] = 1; + pSurf_read[(y * width + x) * 4 + 1] = 1; + pSurf_read[(y * width + x) * 4 + 2] = 1; + pSurf_read[(y * width + x) * 4 + 3] = 1; + pSurf_write[(y * width + x) * 4] = 0; + pSurf_write[(y * width + x) * 4 + 1] = 0; + pSurf_write[(y * width + x) * 4 + 2] = 0; + pSurf_write[(y * width + x) * 4 + 3] = 0; + } + } - // Init texture - GL_SAFE_CALL(glGenTextures(2, tex)); + // NOP call to error-check the above glut calls + GL_SAFE_CALL({}); - GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, tex[GL_READ])); - GL_SAFE_CALL( - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); - GL_SAFE_CALL( - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); - GL_SAFE_CALL(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, - GL_RGBA, GL_UNSIGNED_BYTE, pSurf_read)); - GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, 0)); + // Init texture + GL_SAFE_CALL(glGenTextures(2, tex)); - GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, tex[GL_WRITE])); - GL_SAFE_CALL( - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); - GL_SAFE_CALL( - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); - GL_SAFE_CALL(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, - GL_RGBA, GL_UNSIGNED_BYTE, pSurf_write)); - GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, 0)); + GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, tex[GL_READ])); + GL_SAFE_CALL(glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + GL_SAFE_CALL(glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + GL_SAFE_CALL(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, pSurf_read)); + GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, 0)); - glFinish(); - - EGLDisplay eglDisplayHandle = eglGetCurrentDisplay(); - if 
(eglDisplayHandle == EGL_NO_DISPLAY) { - printf("eglDisplayHandle failed \n"); - cleanup(FAILURE); - } else { - printf("eglDisplay Handle created \n"); - } - - if (!eglSetupExtensions()) { - printf("SetupExtentions failed \n"); - cleanup(FAILURE); - } - - EGLContext eglCtx = eglGetCurrentContext(); - if (eglCtx == EGL_NO_CONTEXT) { - printf("Context1 create failed with error %d\n", eglGetError()); - cleanup(FAILURE); - } - - // Create the EGL_Image - EGLint eglImgAttrs[] = {EGL_IMAGE_PRESERVED_KHR, EGL_TRUE, EGL_NONE, - EGL_NONE}; - - EGLImageKHR eglImage1 = - eglCreateImageKHR(eglDisplayHandle, eglCtx, EGL_GL_TEXTURE_2D_KHR, - (EGLClientBuffer)(intptr_t)tex[GL_READ], eglImgAttrs); - if (eglImage1 == EGL_NO_IMAGE_KHR) { - printf("EGLImage create failed for read texture with error %d\n", - eglGetError()); - cleanup(FAILURE); - } else { - printf("EGLImage1 created \n"); - } - - EGLImageKHR eglImage2 = - eglCreateImageKHR(eglDisplayHandle, eglCtx, EGL_GL_TEXTURE_2D_KHR, - (EGLClientBuffer)(intptr_t)tex[GL_WRITE], eglImgAttrs); - if (eglImage2 == EGL_NO_IMAGE_KHR) { - printf("EGLImage create failed for write texture with error %d\n", - eglGetError()); - cleanup(FAILURE); - } else { - printf("EGLImage2 created \n"); - } - - glFinish(); - - status = cuGraphicsEGLRegisterImage(&writeResource, eglImage1, - CU_GRAPHICS_REGISTER_FLAGS_NONE); - if (status != CUDA_SUCCESS) { - printf("cuGraphicsEGLRegisterImage failed with Texture 1\n"); - cleanup(FAILURE); - } else { - printf( - "cuGraphicsEGLRegisterImage Passed, writeResource created with texture " - "1\n"); - } - - status = - cuGraphicsSubResourceGetMappedArray(&writeArray, writeResource, 0, 0); - if (status != CUDA_SUCCESS) { - printf( - "cuGraphicsSubResourceGetMappedArray failed for writeResource with " - "texture 1\n"); - cleanup(FAILURE); - } - - status = cuGraphicsEGLRegisterImage(&readResource, eglImage2, - CU_GRAPHICS_REGISTER_FLAGS_NONE); - if (status != CUDA_SUCCESS) { - printf( - "cuGraphicsEGLRegisterImage failed for readResource with Texture 2\n"); - cleanup(FAILURE); - } else { - printf( - "cuGraphicsEGLRegisterImage Passed, readResource created with texture " - "2\n"); - } - - status = cuGraphicsSubResourceGetMappedArray(&readArray, readResource, 0, 0); - if (status != CUDA_SUCCESS) { - printf("cuGraphicsSubResourceGetMappedArray failed for texture 2\n"); - cleanup(FAILURE); - } - - if (useGpu) { - printf("Using GPU Sync path\n"); - checkSyncOnGPU(eglDisplayHandle); - } else { - printf("Using CPU Sync path\n"); - checkSyncOnCPU(); - } - - free(pSurf_read); - free(pSurf_write); - cleanup(SUCCESS); -} - -void checkSyncOnCPU(void) { - int z = 0; - unsigned char expectedData, newData; - CUresult status = CUDA_SUCCESS; - CUDA_RESOURCE_DESC wdsc, rdsc; - memset(&wdsc, 0, sizeof(wdsc)); - memset(&rdsc, 0, sizeof(rdsc)); - - expectedData = 0; - newData = 1; - - wdsc.resType = CU_RESOURCE_TYPE_ARRAY; - wdsc.res.array.hArray = writeArray; - CUsurfObject writeSurface; - rdsc.resType = CU_RESOURCE_TYPE_ARRAY; - rdsc.res.array.hArray = readArray; - CUsurfObject readSurface; - - status = cuSurfObjectCreate(&writeSurface, &wdsc); - if (status != CUDA_SUCCESS) { - printf("Surface bounding failed with status %d\n", status); - cleanup(FAILURE); - } - status = cuSurfObjectCreate(&readSurface, &rdsc); - if (status != CUDA_SUCCESS) { - printf("Surface bounding failed\n"); - cleanup(FAILURE); - } - - for (z = 0; z < itr; z++) { - // GL call to copy from read texture to write texture - GL_SAFE_CALL(glCopyImageSubData(tex[GL_READ], GL_TEXTURE_2D, 0, 0, 0, 
0, - tex[GL_WRITE], GL_TEXTURE_2D, 0, 0, 0, 0, - width, height, 1)); + GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, tex[GL_WRITE])); + GL_SAFE_CALL(glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + GL_SAFE_CALL(glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + GL_SAFE_CALL(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, pSurf_write)); + GL_SAFE_CALL(glBindTexture(GL_TEXTURE_2D, 0)); glFinish(); - newData++; - expectedData++; - - verify_and_update_kernel<<<(width * height) / 256, 256>>>( - writeSurface, readSurface, expectedData, newData, width, height); - - status = cuCtxSynchronize(); - if (status != CUDA_SUCCESS) { - printf("cuCtxSynchronize failed \n"); + EGLDisplay eglDisplayHandle = eglGetCurrentDisplay(); + if (eglDisplayHandle == EGL_NO_DISPLAY) { + printf("eglDisplayHandle failed \n"); + cleanup(FAILURE); + } + else { + printf("eglDisplay Handle created \n"); } - } - cudaError_t err = cudaGetValueMismatch(); - if (err != cudaSuccess) { - printf("Value mismatch seen when using CPU sync\n"); - cleanup(FAILURE); - } + if (!eglSetupExtensions()) { + printf("SetupExtensions failed \n"); + cleanup(FAILURE); + } - // Clean up CUDA writeResource - status = cuGraphicsUnregisterResource(writeResource); - if (status != CUDA_SUCCESS) { - printf("Failed to unregister %d", status); - cleanup(FAILURE); - } else { - printf("Unregistered writeResource. \n"); - } + EGLContext eglCtx = eglGetCurrentContext(); + if (eglCtx == EGL_NO_CONTEXT) { + printf("Context1 create failed with error %d\n", eglGetError()); + cleanup(FAILURE); + } - // Clean up CUDA readResource - status = cuGraphicsUnregisterResource(readResource); - if (status != CUDA_SUCCESS) { - printf("Failed to unregister %d", status); - cleanup(FAILURE); - } else { - printf("Unregistered readResource. 
\n"); - } + // Create the EGL_Image + EGLint eglImgAttrs[] = {EGL_IMAGE_PRESERVED_KHR, EGL_TRUE, EGL_NONE, EGL_NONE}; + + EGLImageKHR eglImage1 = eglCreateImageKHR( + eglDisplayHandle, eglCtx, EGL_GL_TEXTURE_2D_KHR, (EGLClientBuffer)(intptr_t)tex[GL_READ], eglImgAttrs); + if (eglImage1 == EGL_NO_IMAGE_KHR) { + printf("EGLImage create failed for read texture with error %d\n", eglGetError()); + cleanup(FAILURE); + } + else { + printf("EGLImage1 created \n"); + } + + EGLImageKHR eglImage2 = eglCreateImageKHR( + eglDisplayHandle, eglCtx, EGL_GL_TEXTURE_2D_KHR, (EGLClientBuffer)(intptr_t)tex[GL_WRITE], eglImgAttrs); + if (eglImage2 == EGL_NO_IMAGE_KHR) { + printf("EGLImage create failed for write texture with error %d\n", eglGetError()); + cleanup(FAILURE); + } + else { + printf("EGLImage2 created \n"); + } + + glFinish(); + + status = cuGraphicsEGLRegisterImage(&writeResource, eglImage1, CU_GRAPHICS_REGISTER_FLAGS_NONE); + if (status != CUDA_SUCCESS) { + printf("cuGraphicsEGLRegisterImage failed with Texture 1\n"); + cleanup(FAILURE); + } + else { + printf("cuGraphicsEGLRegisterImage Passed, writeResource created with texture " + "1\n"); + } + + status = cuGraphicsSubResourceGetMappedArray(&writeArray, writeResource, 0, 0); + if (status != CUDA_SUCCESS) { + printf("cuGraphicsSubResourceGetMappedArray failed for writeResource with " + "texture 1\n"); + cleanup(FAILURE); + } + + status = cuGraphicsEGLRegisterImage(&readResource, eglImage2, CU_GRAPHICS_REGISTER_FLAGS_NONE); + if (status != CUDA_SUCCESS) { + printf("cuGraphicsEGLRegisterImage failed for readResource with Texture 2\n"); + cleanup(FAILURE); + } + else { + printf("cuGraphicsEGLRegisterImage Passed, readResource created with texture " + "2\n"); + } + + status = cuGraphicsSubResourceGetMappedArray(&readArray, readResource, 0, 0); + if (status != CUDA_SUCCESS) { + printf("cuGraphicsSubResourceGetMappedArray failed for texture 2\n"); + cleanup(FAILURE); + } + + if (useGpu) { + printf("Using GPU Sync path\n"); + checkSyncOnGPU(eglDisplayHandle); + } + else { + printf("Using CPU Sync path\n"); + checkSyncOnCPU(); + } + + free(pSurf_read); + free(pSurf_write); + cleanup(SUCCESS); +} + +void checkSyncOnCPU(void) +{ + int z = 0; + unsigned char expectedData, newData; + CUresult status = CUDA_SUCCESS; + CUDA_RESOURCE_DESC wdsc, rdsc; + memset(&wdsc, 0, sizeof(wdsc)); + memset(&rdsc, 0, sizeof(rdsc)); + + expectedData = 0; + newData = 1; + + wdsc.resType = CU_RESOURCE_TYPE_ARRAY; + wdsc.res.array.hArray = writeArray; + CUsurfObject writeSurface; + rdsc.resType = CU_RESOURCE_TYPE_ARRAY; + rdsc.res.array.hArray = readArray; + CUsurfObject readSurface; + + status = cuSurfObjectCreate(&writeSurface, &wdsc); + if (status != CUDA_SUCCESS) { + printf("Surface bounding failed with status %d\n", status); + cleanup(FAILURE); + } + status = cuSurfObjectCreate(&readSurface, &rdsc); + if (status != CUDA_SUCCESS) { + printf("Surface bounding failed\n"); + cleanup(FAILURE); + } + + for (z = 0; z < itr; z++) { + // GL call to copy from read texture to write texture + GL_SAFE_CALL(glCopyImageSubData( + tex[GL_READ], GL_TEXTURE_2D, 0, 0, 0, 0, tex[GL_WRITE], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1)); + + glFinish(); + + newData++; + expectedData++; + + verify_and_update_kernel<<<(width * height) / 256, 256>>>( + writeSurface, readSurface, expectedData, newData, width, height); + + status = cuCtxSynchronize(); + if (status != CUDA_SUCCESS) { + printf("cuCtxSynchronize failed \n"); + } + } + + cudaError_t err = cudaGetValueMismatch(); + if (err != cudaSuccess) 
{ + printf("Value mismatch seen when using CPU sync\n"); + cleanup(FAILURE); + } + + // Clean up CUDA writeResource + status = cuGraphicsUnregisterResource(writeResource); + if (status != CUDA_SUCCESS) { + printf("Failed to unregister %d", status); + cleanup(FAILURE); + } + else { + printf("Unregistered writeResource. \n"); + } + + // Clean up CUDA readResource + status = cuGraphicsUnregisterResource(readResource); + if (status != CUDA_SUCCESS) { + printf("Failed to unregister %d", status); + cleanup(FAILURE); + } + else { + printf("Unregistered readResource. \n"); + } } /* @@ -511,273 +505,281 @@ void checkSyncOnCPU(void) { synchronization needed between GL-EGL & CUDA operations all synchronizations happens on the GPU only. */ -void checkSyncOnGPU(EGLDisplay dpy) { - int z = 0; - unsigned char expectedData, newData; - cudaError_t err; - CUresult status = CUDA_SUCCESS; - CUstream stream; - CUevent timingDisabledEvent; - CUDA_RESOURCE_DESC wdsc, rdsc; - memset(&wdsc, 0, sizeof(wdsc)); - memset(&rdsc, 0, sizeof(rdsc)); +void checkSyncOnGPU(EGLDisplay dpy) +{ + int z = 0; + unsigned char expectedData, newData; + cudaError_t err; + CUresult status = CUDA_SUCCESS; + CUstream stream; + CUevent timingDisabledEvent; + CUDA_RESOURCE_DESC wdsc, rdsc; + memset(&wdsc, 0, sizeof(wdsc)); + memset(&rdsc, 0, sizeof(rdsc)); - expectedData = 0; - newData = 1; + expectedData = 0; + newData = 1; - wdsc.resType = CU_RESOURCE_TYPE_ARRAY; - wdsc.res.array.hArray = writeArray; - CUsurfObject writeSurface; - rdsc.resType = CU_RESOURCE_TYPE_ARRAY; - rdsc.res.array.hArray = readArray; - CUsurfObject readSurface; + wdsc.resType = CU_RESOURCE_TYPE_ARRAY; + wdsc.res.array.hArray = writeArray; + CUsurfObject writeSurface; + rdsc.resType = CU_RESOURCE_TYPE_ARRAY; + rdsc.res.array.hArray = readArray; + CUsurfObject readSurface; - status = cuSurfObjectCreate(&writeSurface, &wdsc); - if (status != CUDA_SUCCESS) { - printf("Surface bounding failed with status %d\n", status); - cleanup(FAILURE); - } - status = cuSurfObjectCreate(&readSurface, &rdsc); - if (status != CUDA_SUCCESS) { - printf("Surface bounding failed\n"); - cleanup(FAILURE); - } - - status = cuStreamCreate(&stream, CU_STREAM_DEFAULT); - if (status != CUDA_SUCCESS) { - printf("Stream creation failed\n"); - cleanup(FAILURE); - } - - // Creates timing disabled event which uses non-blocking synchronization - status = cuEventCreate(&timingDisabledEvent, CU_EVENT_DISABLE_TIMING); - if (status != CUDA_SUCCESS) { - printf("Default event creation failed\n"); - cleanup(FAILURE); - } - - /* - 1. We perform texture-to-texture copy in GLES which is async function - 2. Followed by creating EGLSync and a CUDA Event from that EGLSync object - 3. Using cuStreamWaitEvent() we wait in GPU for the GLES to finish texture - copy. - 4. CUDA kernel verfiy_and_update_kernel verifies if the copied data by - GLES is correct, and it updates the buffer with new values. - 5. 
This is followed by eglWaitSyncKHR() which waits for the cuda kernel to - finish, so that in the next iteration GLES can perform the copying of the - updated buffer to write texture, - */ - for (z = 0; z < itr; z++) { - // GL call to copy from read texture to write texture - GL_SAFE_CALL(glCopyImageSubData(tex[GL_READ], GL_TEXTURE_2D, 0, 0, 0, 0, - tex[GL_WRITE], GL_TEXTURE_2D, 0, 0, 0, 0, - width, height, 1)); - - EGLSyncKHR eglSyncForGL, eglSyncForCuda; - EGLBoolean egl_status = EGL_TRUE; - EGLAttribKHR eglattrib[] = {EGL_CUDA_EVENT_HANDLE_NV, - (EGLAttrib)timingDisabledEvent, EGL_NONE}; - - CUevent cudaEGLSyncEvent; - - eglSyncForGL = eglCreateSyncKHR(dpy, EGL_SYNC_FENCE_KHR, NULL); - - if (eglSyncForGL == EGL_NO_SYNC_KHR) { - printf(" EGL Sync creation failed\n"); - cleanup(FAILURE); - } - - status = cuEventCreateFromEGLSync(&cudaEGLSyncEvent, eglSyncForGL, - CU_EVENT_DEFAULT); + status = cuSurfObjectCreate(&writeSurface, &wdsc); if (status != CUDA_SUCCESS) { - printf("CUDA event creation from EGLSync failed\n"); - cleanup(FAILURE); + printf("Surface bounding failed with status %d\n", status); + cleanup(FAILURE); } - - // We wait from CUDA in GPU for GL-EGL operation completion - status = cuStreamWaitEvent(stream, cudaEGLSyncEvent, 0); + status = cuSurfObjectCreate(&readSurface, &rdsc); if (status != CUDA_SUCCESS) { - printf("Stream wait for event created from EGLSync failed\n"); - cleanup(FAILURE); + printf("Surface bounding failed\n"); + cleanup(FAILURE); } - egl_status = eglDestroySyncKHR(dpy, eglSyncForGL); - if (egl_status != EGL_TRUE) { - printf("EGL sync object destruction failed\n"); - cleanup(FAILURE); - } - - newData++; - expectedData++; - - // Verifies the values in readSurface which is copied by - // glCopyImageSubData() And writes value of newData into writeSurface - verify_and_update_kernel<<<(width * height) / 256, 256, 0, stream>>>( - writeSurface, readSurface, expectedData, newData, width, height); - - status = cuEventDestroy(cudaEGLSyncEvent); + status = cuStreamCreate(&stream, CU_STREAM_DEFAULT); if (status != CUDA_SUCCESS) { - printf("Event Destroy failed\n"); - cleanup(FAILURE); + printf("Stream creation failed\n"); + cleanup(FAILURE); } - status = cuEventRecord(timingDisabledEvent, stream); + // Creates timing disabled event which uses non-blocking synchronization + status = cuEventCreate(&timingDisabledEvent, CU_EVENT_DISABLE_TIMING); if (status != CUDA_SUCCESS) { - printf("Event Record failed\n"); - cleanup(FAILURE); + printf("Default event creation failed\n"); + cleanup(FAILURE); } - // creating an EGL sync object linked to a CUDA event object - eglSyncForCuda = eglCreateSync64KHR(dpy, EGL_SYNC_CUDA_EVENT_NV, eglattrib); + /* + 1. We perform texture-to-texture copy in GLES which is an async function + 2. Followed by creating EGLSync and a CUDA Event from that EGLSync object + 3. Using cuStreamWaitEvent() we wait in GPU for the GLES to finish texture + copy. + 4. CUDA kernel verify_and_update_kernel verifies if the copied data by + GLES is correct, and it updates the buffer with new values. + 5. 
This is followed by eglWaitSyncKHR() which waits for the cuda kernel to + finish, so that in the next iteration GLES can perform the copying of the + updated buffer to write texture, + */ + for (z = 0; z < itr; z++) { + // GL call to copy from read texture to write texture + GL_SAFE_CALL(glCopyImageSubData( + tex[GL_READ], GL_TEXTURE_2D, 0, 0, 0, 0, tex[GL_WRITE], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1)); - // We wait from EGL for CUDA operation completion - egl_status = eglWaitSyncKHR(dpy, eglSyncForCuda, 0); - if (egl_status != EGL_TRUE) { - printf("eglWaitSyncKHR failed\n"); - cleanup(FAILURE); + EGLSyncKHR eglSyncForGL, eglSyncForCuda; + EGLBoolean egl_status = EGL_TRUE; + EGLAttribKHR eglattrib[] = {EGL_CUDA_EVENT_HANDLE_NV, (EGLAttrib)timingDisabledEvent, EGL_NONE}; + + CUevent cudaEGLSyncEvent; + + eglSyncForGL = eglCreateSyncKHR(dpy, EGL_SYNC_FENCE_KHR, NULL); + + if (eglSyncForGL == EGL_NO_SYNC_KHR) { + printf(" EGL Sync creation failed\n"); + cleanup(FAILURE); + } + + status = cuEventCreateFromEGLSync(&cudaEGLSyncEvent, eglSyncForGL, CU_EVENT_DEFAULT); + if (status != CUDA_SUCCESS) { + printf("CUDA event creation from EGLSync failed\n"); + cleanup(FAILURE); + } + + // We wait from CUDA in GPU for GL-EGL operation completion + status = cuStreamWaitEvent(stream, cudaEGLSyncEvent, 0); + if (status != CUDA_SUCCESS) { + printf("Stream wait for event created from EGLSync failed\n"); + cleanup(FAILURE); + } + + egl_status = eglDestroySyncKHR(dpy, eglSyncForGL); + if (egl_status != EGL_TRUE) { + printf("EGL sync object destruction failed\n"); + cleanup(FAILURE); + } + + newData++; + expectedData++; + + // Verifies the values in readSurface which is copied by + // glCopyImageSubData() And writes value of newData into writeSurface + verify_and_update_kernel<<<(width * height) / 256, 256, 0, stream>>>( + writeSurface, readSurface, expectedData, newData, width, height); + + status = cuEventDestroy(cudaEGLSyncEvent); + if (status != CUDA_SUCCESS) { + printf("Event Destroy failed\n"); + cleanup(FAILURE); + } + + status = cuEventRecord(timingDisabledEvent, stream); + if (status != CUDA_SUCCESS) { + printf("Event Record failed\n"); + cleanup(FAILURE); + } + + // creating an EGL sync object linked to a CUDA event object + eglSyncForCuda = eglCreateSync64KHR(dpy, EGL_SYNC_CUDA_EVENT_NV, eglattrib); + + // We wait from EGL for CUDA operation completion + egl_status = eglWaitSyncKHR(dpy, eglSyncForCuda, 0); + if (egl_status != EGL_TRUE) { + printf("eglWaitSyncKHR failed\n"); + cleanup(FAILURE); + } + egl_status = eglDestroySyncKHR(dpy, eglSyncForCuda); + if (egl_status != EGL_TRUE) { + printf("EGL sync object destruction failed\n"); + cleanup(FAILURE); + } } - egl_status = eglDestroySyncKHR(dpy, eglSyncForCuda); - if (egl_status != EGL_TRUE) { - printf("EGL sync object destruction failed\n"); - cleanup(FAILURE); + + err = cudaGetValueMismatch(); + if (err != cudaSuccess) { + printf("Value mismatch seen when using GPU sync\n"); + cleanup(FAILURE); } - } - err = cudaGetValueMismatch(); - if (err != cudaSuccess) { - printf("Value mismatch seen when using GPU sync\n"); - cleanup(FAILURE); - } + // Clean up CUDA writeResource + status = cuGraphicsUnregisterResource(writeResource); + if (status != CUDA_SUCCESS) { + printf("Failed to unregister %d", status); + cleanup(FAILURE); + } + else { + printf("Unregistered writeResource. 
\n"); + } - // Clean up CUDA writeResource - status = cuGraphicsUnregisterResource(writeResource); - if (status != CUDA_SUCCESS) { - printf("Failed to unregister %d", status); - cleanup(FAILURE); - } else { - printf("Unregistered writeResource. \n"); - } - - // Clean up CUDA readResource - status = cuGraphicsUnregisterResource(readResource); - if (status != CUDA_SUCCESS) { - printf("Failed to unregister %d", status); - cleanup(FAILURE); - } else { - printf("Unregistered readResource. \n"); - } + // Clean up CUDA readResource + status = cuGraphicsUnregisterResource(readResource); + if (status != CUDA_SUCCESS) { + printf("Failed to unregister %d", status); + cleanup(FAILURE); + } + else { + printf("Unregistered readResource. \n"); + } } // Verifies the values in readSurface whether they are expected ones // And writes value of newData into writeSurface -__global__ void verify_and_update_kernel(CUsurfObject write, CUsurfObject read, - char expected, char newval, int width, - int height) { - unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; +__global__ void +verify_and_update_kernel(CUsurfObject write, CUsurfObject read, char expected, char newval, int width, int height) +{ + unsigned int x = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int y = blockDim.y * blockIdx.y + threadIdx.y; - if (x < width && y < height) { - uchar4 check; - surf2Dread(&check, read, x * 4, y); - if (check.x != expected || check.y != expected || check.z != expected || - check.w != expected) { - printf( - "Mismatch found in values read[0]= %u read[1]= %u read[2]= %u " - "read[3]= %u expected is %u\n", - check.x, check.y, check.z, check.w, expected); - numErrors++; - return; + if (x < width && y < height) { + uchar4 check; + surf2Dread(&check, read, x * 4, y); + if (check.x != expected || check.y != expected || check.z != expected || check.w != expected) { + printf("Mismatch found in values read[0]= %u read[1]= %u read[2]= %u " + "read[3]= %u expected is %u\n", + check.x, + check.y, + check.z, + check.w, + expected); + numErrors++; + return; + } + uchar4 data = make_uchar4(newval, newval, newval, newval); + surf2Dwrite(data, write, x * 4, y); } - uchar4 data = make_uchar4(newval, newval, newval, newval); - surf2Dwrite(data, write, x * 4, y); - } } __global__ void getNumErrors(int *numErr) { *numErr = numErrors; } -extern "C" cudaError_t cudaGetValueMismatch() { - int numErr_h; - int *numErr_d = NULL; - cudaError_t err = cudaSuccess; +extern "C" cudaError_t cudaGetValueMismatch() +{ + int numErr_h; + int *numErr_d = NULL; + cudaError_t err = cudaSuccess; - err = cudaMalloc(&numErr_d, sizeof(int)); - if (err != cudaSuccess) { - printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err)); - cudaFree(numErr_d); - return err; - } + err = cudaMalloc(&numErr_d, sizeof(int)); + if (err != cudaSuccess) { + printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err)); + cudaFree(numErr_d); + return err; + } - getNumErrors<<<1, 1>>>(numErr_d); - err = cudaDeviceSynchronize(); - if (err != cudaSuccess) { - printf("Cuda Main: cudaDeviceSynchronize failed with %s\n", - cudaGetErrorString(err)); - } - err = cudaMemcpy(&numErr_h, numErr_d, sizeof(int), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err)); - cudaFree(numErr_d); - return err; - } - err = cudaFree(numErr_d); - if (err != cudaSuccess) { - printf("Cuda Main: cudaFree failed with %s\n", 
cudaGetErrorString(err)); - return err; - } - if (numErr_h > 0) { - return cudaErrorUnknown; - } - return cudaSuccess; + getNumErrors<<<1, 1>>>(numErr_d); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("Cuda Main: cudaDeviceSynchronize failed with %s\n", cudaGetErrorString(err)); + } + err = cudaMemcpy(&numErr_h, numErr_d, sizeof(int), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err)); + cudaFree(numErr_d); + return err; + } + err = cudaFree(numErr_d); + if (err != cudaSuccess) { + printf("Cuda Main: cudaFree failed with %s\n", cudaGetErrorString(err)); + return err; + } + if (numErr_h > 0) { + return cudaErrorUnknown; + } + return cudaSuccess; } // Clean up state and exit. If status is SUCCESS, regression success is printed // to stdout. This will happen if the glut timer is triggered. If status is // anything else, the regression failure message is printed. -void cleanup(int status) { - GLenum glErr = GL_NO_ERROR; - cudaError cudaErr = cudaSuccess; - int exitStatus = status; +void cleanup(int status) +{ + GLenum glErr = GL_NO_ERROR; + cudaError cudaErr = cudaSuccess; + int exitStatus = status; - // Clean up GL - if (*tex) { - GL_SAFE_CALL_NO_CLEANUP(glDeleteTextures(2, tex), glErr); - } + // Clean up GL + if (*tex) { + GL_SAFE_CALL_NO_CLEANUP(glDeleteTextures(2, tex), glErr); + } - // Print test status and exit - if (glErr != GL_NO_ERROR || cudaErr != cudaSuccess) exitStatus = FAILURE; + // Print test status and exit + if (glErr != GL_NO_ERROR || cudaErr != cudaSuccess) + exitStatus = FAILURE; - printStatus(exitStatus); + printStatus(exitStatus); - cleanExit = 1; + cleanExit = 1; - graphics_close_window(); + graphics_close_window(); - if (exitStatus == FAILURE) exit(EXIT_FAILURE); + if (exitStatus == FAILURE) + exit(EXIT_FAILURE); - if (exitStatus == WAIVED) exit(EXIT_WAIVED); + if (exitStatus == WAIVED) + exit(EXIT_WAIVED); - exit(0); + exit(0); } -void exitHandler(void) { - if (!cleanExit) { - printf("&&&& EGLSync_CUDAEvent_Interop unexpected failure \n"); - printStatus(FAILURE); - } +void exitHandler(void) +{ + if (!cleanExit) { + printf("&&&& EGLSync_CUDAEvent_Interop unexpected failure \n"); + printStatus(FAILURE); + } } // Print test success or fail for regression testing -void printStatus(int status) { - switch (status) { +void printStatus(int status) +{ + switch (status) { case SUCCESS: - printf("&&&& EGLSync_CUDAEvent_Interop PASSED\n"); - break; + printf("&&&& EGLSync_CUDAEvent_Interop PASSED\n"); + break; case WAIVED: - printf("&&&& EGLSync_CUDAEvent_Interop WAIVED\n"); - break; + printf("&&&& EGLSync_CUDAEvent_Interop WAIVED\n"); + break; default: - printf("&&&& EGLSync_CUDAEvent_Interop FAILED\n"); - break; - } - fflush(stdout); + printf("&&&& EGLSync_CUDAEvent_Interop FAILED\n"); + break; + } + fflush(stdout); } diff --git a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/README.md b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/README.md index d659b8f8..d412501a 100644 --- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/README.md +++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/README.md @@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/egl_common.h b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/egl_common.h index 46e2dbd8..66551e35 100644 --- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/egl_common.h +++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/egl_common.h @@ -32,13 +32,14 @@ #ifndef _EGL_COMMON_H_ #define _EGL_COMMON_H_ +#include #include #include #include -#include #include #include -#include +#include + #include "cuda.h" #include "cudaEGL.h" @@ -54,18 +55,18 @@ EGLImageKHR eglImage; T(PFNEGLCREATESYNC64KHRPROC, eglCreateSync64KHR) \ T(PFNEGLWAITSYNCKHRPROC, eglWaitSyncKHR) -#define eglCreateImageKHR my_eglCreateImageKHR -#define eglDestroyImageKHR my_eglDestroyImageKHR -#define eglCreateSyncKHR my_eglCreateSyncKHR -#define eglDestroySyncKHR my_eglDestroySyncKHR +#define eglCreateImageKHR my_eglCreateImageKHR +#define eglDestroyImageKHR my_eglDestroyImageKHR +#define eglCreateSyncKHR my_eglCreateSyncKHR +#define eglDestroySyncKHR my_eglDestroySyncKHR #define eglClientWaitSyncKHR my_eglClientWaitSyncKHR -#define eglGetSyncAttribKHR my_eglGetSyncAttribKHR -#define eglCreateSync64KHR my_eglCreateSync64KHR -#define eglWaitSyncKHR my_eglWaitSyncKHR +#define eglGetSyncAttribKHR my_eglGetSyncAttribKHR +#define eglCreateSync64KHR my_eglCreateSync64KHR +#define eglWaitSyncKHR my_eglWaitSyncKHR -#define EXTLST_DECL(tx, x) tx my_##x = NULL; +#define EXTLST_DECL(tx, x) tx my_##x = NULL; #define EXTLST_EXTERN(tx, x) extern tx my_##x; -#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&my_##x, #x}, +#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&my_##x, #x}, int eglSetupExtensions(void); #endif diff --git a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/graphics_interface.h b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/graphics_interface.h index cce540bc..81925b79 100644 --- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/graphics_interface.h +++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/graphics_interface.h @@ -27,178 +27,197 @@ #include #include +#include #include #include -#include Display *display; -int screen; -Window win = 0; +int screen; +Window win = 0; -void error_exit(const char *format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(1); +void error_exit(const char *format, ...) 
+{ + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + exit(1); } #include #include -#define GET_GLERROR(ret) \ - { \ - GLenum err = glGetError(); \ - if (err != GL_NO_ERROR) { \ - fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, \ - err); \ - fflush(stderr); \ - \ - switch (err) { \ - case GL_INVALID_ENUM: \ - printf("GL_INVALID_ENUM\n"); \ - break; \ - case GL_INVALID_VALUE: \ - printf("GL_INVALID_VALUE\n"); \ - break; \ - case GL_INVALID_OPERATION: \ - printf("GL_INVALID_OPERATION\n"); \ - break; \ - case GL_OUT_OF_MEMORY: \ - printf("GL_OUT_OF_MEMORY\n"); \ - break; \ - case GL_INVALID_FRAMEBUFFER_OPERATION: \ - printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \ - break; \ - default: \ - printf("UKNOWN OPENGL ERROR CODE 0x%x\n", err); \ - }; \ - } \ - } +#define GET_GLERROR(ret) \ + { \ + GLenum err = glGetError(); \ + if (err != GL_NO_ERROR) { \ + fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, err); \ + fflush(stderr); \ + \ + switch (err) { \ + case GL_INVALID_ENUM: \ + printf("GL_INVALID_ENUM\n"); \ + break; \ + case GL_INVALID_VALUE: \ + printf("GL_INVALID_VALUE\n"); \ + break; \ + case GL_INVALID_OPERATION: \ + printf("GL_INVALID_OPERATION\n"); \ + break; \ + case GL_OUT_OF_MEMORY: \ + printf("GL_OUT_OF_MEMORY\n"); \ + break; \ + case GL_INVALID_FRAMEBUFFER_OPERATION: \ + printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \ + break; \ + default: \ + printf("UNKNOWN OPENGL ERROR CODE 0x%x\n", err); \ + }; \ + } \ + } EGLDisplay eglDisplay = EGL_NO_DISPLAY; EGLSurface eglSurface = EGL_NO_SURFACE; EGLContext eglContext = EGL_NO_CONTEXT; -int graphics_setup_window(int xpos, int ypos, int width, int height, - const char *windowname) { - EGLint configAttrs[] = { - EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, EGL_DEPTH_SIZE, 16, - EGL_SAMPLE_BUFFERS, 0, EGL_SAMPLES, 0, - // EGL_CONFORMANT, EGL_OPENGL_BIT, - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // 3_BIT_KHR, - EGL_NONE}; - EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; +int graphics_setup_window(int xpos, int ypos, int width, int height, const char *windowname) +{ + EGLint configAttrs[] = {EGL_RED_SIZE, + 1, + EGL_GREEN_SIZE, + 1, + EGL_BLUE_SIZE, + 1, + EGL_DEPTH_SIZE, + 16, + EGL_SAMPLE_BUFFERS, + 0, + EGL_SAMPLES, + 0, + // EGL_CONFORMANT, EGL_OPENGL_BIT, + EGL_RENDERABLE_TYPE, + EGL_OPENGL_ES2_BIT, // 3_BIT_KHR, + EGL_NONE}; + EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; - EGLConfig *configList = NULL; - EGLint configCount; + EGLConfig *configList = NULL; + EGLint configCount; - display = XOpenDisplay(NULL); - if (!display) error_exit("Error opening X display.\n"); + display = XOpenDisplay(NULL); + if (!display) + error_exit("Error opening X display.\n"); - screen = DefaultScreen(display); + screen = DefaultScreen(display); - eglDisplay = eglGetDisplay(0); + eglDisplay = eglGetDisplay(0); - if (eglDisplay == EGL_NO_DISPLAY) - error_exit("EGL failed to obtain display\n"); + if (eglDisplay == EGL_NO_DISPLAY) + error_exit("EGL failed to obtain display\n"); - if (!eglInitialize(eglDisplay, 0, 0)) - error_exit("EGL failed to initialize\n"); + if (!eglInitialize(eglDisplay, 0, 0)) + error_exit("EGL failed to initialize\n"); - if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || - !configCount) - error_exit("EGL failed to return any matching configurations\n"); + if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || !configCount) + error_exit("EGL failed to return any
matching configurations\n"); - configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); + configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); - if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, - &configCount) || - !configCount) - error_exit("EGL failed to populate configuration list\n"); + if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, &configCount) || !configCount) + error_exit("EGL failed to populate configuration list\n"); - Window xRootWindow = DefaultRootWindow(display); - XSetWindowAttributes xCreateWindowAttributes; - xCreateWindowAttributes.event_mask = ExposureMask; - win = XCreateWindow(display, xRootWindow, 0, 0, width, height, 0, - CopyFromParent, InputOutput, CopyFromParent, CWEventMask, - &xCreateWindowAttributes); - XMapWindow(display, win); + Window xRootWindow = DefaultRootWindow(display); + XSetWindowAttributes xCreateWindowAttributes; + xCreateWindowAttributes.event_mask = ExposureMask; + win = XCreateWindow(display, + xRootWindow, + 0, + 0, + width, + height, + 0, + CopyFromParent, + InputOutput, + CopyFromParent, + CWEventMask, + &xCreateWindowAttributes); + XMapWindow(display, win); - Atom netWmStateAtom = XInternAtom(display, "_NET_WM_STATE", false); - XEvent xEvent; - memset(&xEvent, 0, sizeof(xEvent)); - xEvent.type = ClientMessage; - xEvent.xclient.window = win; - xEvent.xclient.message_type = netWmStateAtom; - xEvent.xclient.format = 32; - xEvent.xclient.data.l[0] = 1; - xEvent.xclient.data.l[1] = false; - XSendEvent(display, xRootWindow, false, SubstructureNotifyMask, &xEvent); + Atom netWmStateAtom = XInternAtom(display, "_NET_WM_STATE", false); + XEvent xEvent; + memset(&xEvent, 0, sizeof(xEvent)); + xEvent.type = ClientMessage; + xEvent.xclient.window = win; + xEvent.xclient.message_type = netWmStateAtom; + xEvent.xclient.format = 32; + xEvent.xclient.data.l[0] = 1; + xEvent.xclient.data.l[1] = false; + XSendEvent(display, xRootWindow, false, SubstructureNotifyMask, &xEvent); - XStoreName(display, win, windowname); + XStoreName(display, win, windowname); - XSelectInput(display, win, - ExposureMask | KeyPressMask | ButtonPressMask | - ButtonReleaseMask | KeyReleaseMask | VisibilityChangeMask | - PointerMotionMask); + XSelectInput(display, + win, + ExposureMask | KeyPressMask | ButtonPressMask | ButtonReleaseMask | KeyReleaseMask + | VisibilityChangeMask | PointerMotionMask); - EGLint windowAttrs[] = {EGL_NONE}; + EGLint windowAttrs[] = {EGL_NONE}; - eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], - (EGLNativeWindowType)win, windowAttrs); + eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], (EGLNativeWindowType)win, windowAttrs); - if (!eglSurface) error_exit("EGL couldn't create window\n"); + if (!eglSurface) + error_exit("EGL couldn't create window\n"); - eglBindAPI(EGL_OPENGL_ES_API); + eglBindAPI(EGL_OPENGL_ES_API); - eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); - if (!eglContext) error_exit("EGL couldn't create context\n"); + eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); + if (!eglContext) + error_exit("EGL couldn't create context\n"); - if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) - error_exit("EGL couldn't make context/surface current\n"); + if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) + error_exit("EGL couldn't make context/surface current\n"); - EGLint Context_RendererType; - eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, - 
&Context_RendererType); + EGLint Context_RendererType; + eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, &Context_RendererType); - switch (Context_RendererType) { + switch (Context_RendererType) { case EGL_OPENGL_API: - printf("Using OpenGL API is not supported\n"); - exit(EXIT_FAILURE); - break; + printf("Using OpenGL API is not supported\n"); + exit(EXIT_FAILURE); + break; case EGL_OPENGL_ES_API: - printf("Using OpenGL ES API\n"); - break; + printf("Using OpenGL ES API\n"); + break; case EGL_OPENVG_API: - error_exit("Context Query Returned OpenVG. This is Unsupported\n"); + error_exit("Context Query Returned OpenVG. This is Unsupported\n"); default: - error_exit("Unknown Context Type. %04X\n", Context_RendererType); - } + error_exit("Unknown Context Type. %04X\n", Context_RendererType); + } - return 1; + return 1; } -void graphics_set_windowtitle(const char *windowname) { - XStoreName(display, win, windowname); -} +void graphics_set_windowtitle(const char *windowname) { XStoreName(display, win, windowname); } void graphics_swap_buffers() { eglSwapBuffers(eglDisplay, eglSurface); } -void graphics_close_window() { - if (eglDisplay != EGL_NO_DISPLAY) { - eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); +void graphics_close_window() +{ + if (eglDisplay != EGL_NO_DISPLAY) { + eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); - if (eglContext != EGL_NO_CONTEXT) eglDestroyContext(eglDisplay, eglContext); + if (eglContext != EGL_NO_CONTEXT) + eglDestroyContext(eglDisplay, eglContext); - if (eglSurface != EGL_NO_SURFACE) eglDestroySurface(eglDisplay, eglSurface); + if (eglSurface != EGL_NO_SURFACE) + eglDestroySurface(eglDisplay, eglSurface); - eglTerminate(eglDisplay); - } + eglTerminate(eglDisplay); + } - if (display) { - if (win) XDestroyWindow(display, win); + if (display) { + if (win) + XDestroyWindow(display, win); - XCloseDisplay(display); - } + XCloseDisplay(display); + } } diff --git a/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/README.md b/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/README.md index 156c6d60..8bbb668f 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/README.md +++ b/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/README.md @@ -30,4 +30,3 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSe Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/main.cu b/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/main.cu index 7a802293..b39cd010 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/main.cu +++ b/Samples/8_Platform_Specific/Tegra/cuDLAErrorReporting/main.cu @@ -25,407 +25,399 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cudla.h" -#include "cuda_runtime.h" - #include #include #include -#include #include #include +#include + +#include "cuda_runtime.h" +#include "cudla.h" #define DPRINTF(...) 
printf(__VA_ARGS__) -static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { - DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); - DPRINTF("\tsize: %lu\n", tensorDesc->size); +static void printTensorDesc(cudlaModuleTensorDescriptor *tensorDesc) +{ + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); - DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, - tensorDesc->h, tensorDesc->w); + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, tensorDesc->h, tensorDesc->w); - DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); - DPRINTF("\tdata type: %d\n", tensorDesc->dataType); - DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); - DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); - DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); - DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); - DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); - DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); - DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); } -typedef struct { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - unsigned char* loadableData; - cudaStream_t stream; - unsigned char* inputBuffer; - unsigned char* outputBuffer; - void* inputBufferGPU; - void* outputBufferGPU; - cudlaModuleTensorDescriptor* inputTensorDesc; - cudlaModuleTensorDescriptor* outputTensorDesc; +typedef struct +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char *loadableData; + cudaStream_t stream; + unsigned char *inputBuffer; + unsigned char *outputBuffer; + void *inputBufferGPU; + void *outputBufferGPU; + cudlaModuleTensorDescriptor *inputTensorDesc; + cudlaModuleTensorDescriptor *outputTensorDesc; } ResourceList; -void cleanUp(ResourceList* resourceList); +void cleanUp(ResourceList *resourceList); -void cleanUp(ResourceList* resourceList) { - if (resourceList->inputTensorDesc != NULL) { - free(resourceList->inputTensorDesc); - resourceList->inputTensorDesc = NULL; - } - if (resourceList->outputTensorDesc != NULL) { - free(resourceList->outputTensorDesc); - resourceList->outputTensorDesc = NULL; - } +void cleanUp(ResourceList *resourceList) +{ + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } - if (resourceList->loadableData != NULL) { - free(resourceList->loadableData); - resourceList->loadableData = NULL; - } + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } - if (resourceList->moduleHandle != NULL) { - cudlaModuleUnload(resourceList->moduleHandle, 0); - resourceList->moduleHandle = NULL; - } + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } - if (resourceList->devHandle != NULL) { - 
cudlaDestroyDevice(resourceList->devHandle); - resourceList->devHandle = NULL; - } + if (resourceList->devHandle != NULL) { + cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } - if (resourceList->inputBufferGPU != 0) { - cudaFree(resourceList->inputBufferGPU); - resourceList->inputBufferGPU = 0; - } - if (resourceList->outputBufferGPU != 0) { - cudaFree(resourceList->outputBufferGPU); - resourceList->outputBufferGPU = 0; - } + if (resourceList->inputBufferGPU != 0) { + cudaFree(resourceList->inputBufferGPU); + resourceList->inputBufferGPU = 0; + } + if (resourceList->outputBufferGPU != 0) { + cudaFree(resourceList->outputBufferGPU); + resourceList->outputBufferGPU = 0; + } - if (resourceList->inputBuffer != NULL) { - free(resourceList->inputBuffer); - resourceList->inputBuffer = NULL; - } - if (resourceList->outputBuffer != NULL) { - free(resourceList->outputBuffer); - resourceList->outputBuffer = NULL; - } + if (resourceList->inputBuffer != NULL) { + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + if (resourceList->outputBuffer != NULL) { + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } - if (resourceList->stream != NULL) { - cudaStreamDestroy(resourceList->stream); - resourceList->stream = NULL; - } + if (resourceList->stream != NULL) { + cudaStreamDestroy(resourceList->stream); + resourceList->stream = NULL; + } } -int main(int argc, char** argv) { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - cudlaStatus err; - FILE* fp = NULL; - struct stat st; - size_t file_size; - size_t actually_read = 0; - unsigned char* loadableData = NULL; +int main(int argc, char **argv) +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + cudlaStatus err; + FILE *fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; - cudaStream_t stream; - cudaError_t result; - const char* errPtr = NULL; + cudaStream_t stream; + cudaError_t result; + const char *errPtr = NULL; - ResourceList resourceList; + ResourceList resourceList; - memset(&resourceList, 0x00, sizeof(ResourceList)); + memset(&resourceList, 0x00, sizeof(ResourceList)); - if (argc != 2) { - DPRINTF("Usage : ./cuDLAErrorReporting \n"); - return 1; - } - - // Read loadable into buffer. - fp = fopen(argv[1], "rb"); - if (fp == NULL) { - DPRINTF("Cannot open file %s\n", argv[1]); - return 1; - } - - if (stat(argv[1], &st) != 0) { - DPRINTF("Cannot stat file\n"); - return 1; - } - - file_size = st.st_size; - DPRINTF("The file size = %ld\n", file_size); - - loadableData = (unsigned char*)malloc(file_size); - if (loadableData == NULL) { - DPRINTF("Cannot Allocate memory for loadable\n"); - return 1; - } - - actually_read = fread(loadableData, 1, file_size, fp); - if (actually_read != file_size) { - free(loadableData); - DPRINTF("Read wrong size\n"); - return 1; - } - fclose(fp); - - resourceList.loadableData = loadableData; - - // Initialize CUDA. 
- result = cudaFree(0); - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in creating cudaFree = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - result = cudaSetDevice(0); - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - - err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA); - if (err != cudlaSuccess) { - DPRINTF("Error in cuDLA create device = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - DPRINTF("Device created successfully\n"); - resourceList.devHandle = devHandle; - - err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, - &moduleHandle, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); - cleanUp(&resourceList); - return 1; - } else { - DPRINTF("Successfully loaded module\n"); - } - - resourceList.moduleHandle = moduleHandle; - - // Create CUDA stream. - result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in creating cuda stream = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - - resourceList.stream = stream; - - // Get tensor attributes. - uint32_t numInputTensors = 0; - uint32_t numOutputTensors = 0; - cudlaModuleAttribute attribute; - - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting numInputTensors = %d\n", err); - cleanUp(&resourceList); - return 1; - } - numInputTensors = attribute.numInputTensors; - DPRINTF("numInputTensors = %d\n", numInputTensors); - - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting numOutputTensors = %d\n", err); - cleanUp(&resourceList); - return 1; - } - numOutputTensors = attribute.numOutputTensors; - DPRINTF("numOutputTensors = %d\n", numOutputTensors); - - cudlaModuleTensorDescriptor* inputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * - numInputTensors); - cudlaModuleTensorDescriptor* outputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * - numOutputTensors); - - if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { - if (inputTensorDesc != NULL) { - free(inputTensorDesc); - inputTensorDesc = NULL; + if (argc != 2) { + DPRINTF("Usage : ./cuDLAErrorReporting \n"); + return 1; } - if (outputTensorDesc != NULL) { - free(outputTensorDesc); - outputTensorDesc = NULL; + // Read loadable into buffer. + fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } + + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } + + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); + + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } + + actually_read = fread(loadableData, 1, file_size, fp); + if (actually_read != file_size) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + // Initialize CUDA. 
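/*
 * A note on the "Initialize CUDA" step below: cudaFree(0) performs no
 * deallocation, but it forces creation of the CUDA primary context, so a
 * context is guaranteed to exist before cudlaCreateDevice() is called in
 * hybrid mode. A minimal sketch of the idiom as a helper; the function name
 * initCudaForDla is hypothetical and not part of this sample:
 */
static cudaError_t initCudaForDla(int device)
{
    // Any runtime API call would trigger context creation; cudaFree(0) is
    // the conventional no-op choice.
    cudaError_t e = cudaFree(0);
    if (e != cudaSuccess) {
        return e;
    }
    // Select the GPU that will front the DLA engine.
    return cudaSetDevice(device);
}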
+ result = cudaFree(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaFree = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + result = cudaSetDevice(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + else { + DPRINTF("Successfully loaded module\n"); + } + + resourceList.moduleHandle = moduleHandle; + + // Create CUDA stream. + result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + resourceList.stream = stream; + + // Get tensor attributes. + uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + cudlaModuleTensorDescriptor *inputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numInputTensors); + cudlaModuleTensorDescriptor *outputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if (inputTensorDesc != NULL) { + free(inputTensorDesc); + inputTensorDesc = NULL; + } + + if (outputTensorDesc != NULL) { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + // Setup the input and output buffers which will be used as an input 
to CUDA. + unsigned char *inputBuffer = (unsigned char *)malloc(inputTensorDesc[0].size); + if (inputBuffer == NULL) { + DPRINTF("Error in allocating input memory\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBuffer = inputBuffer; + + unsigned char *outputBuffer = (unsigned char *)malloc(outputTensorDesc[0].size); + if (outputBuffer == NULL) { + DPRINTF("Error in allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.outputBuffer = outputBuffer; + + memset(inputBuffer, 0x01, inputTensorDesc[0].size); + memset(outputBuffer, 0x00, outputTensorDesc[0].size); + + // Allocate memory on GPU. + void *inputBufferGPU; + void *outputBufferGPU; + result = cudaMalloc(&inputBufferGPU, inputTensorDesc[0].size); + if (result != cudaSuccess) { + DPRINTF("Error in allocating input memory on GPU\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBufferGPU = inputBufferGPU; + + result = cudaMalloc(&outputBufferGPU, outputTensorDesc[0].size); + if (result != cudaSuccess) { + DPRINTF("Error in allocating output memory on GPU\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.outputBufferGPU = outputBufferGPU; + + // Register the CUDA-allocated buffers. + uint64_t *inputBufferRegisteredPtr = NULL; + uint64_t *outputBufferRegisteredPtr = NULL; + + err = + cudlaMemRegister(devHandle, (uint64_t *)inputBufferGPU, inputTensorDesc[0].size, &inputBufferRegisteredPtr, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + err = cudlaMemRegister( + devHandle, (uint64_t *)outputBufferGPU, outputTensorDesc[0].size, &outputBufferRegisteredPtr, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Copy data from CPU buffers to GPU buffers. + result = cudaMemcpyAsync(inputBufferGPU, inputBuffer, inputTensorDesc[0].size, cudaMemcpyHostToDevice, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memcpy for input\n"); + cleanUp(&resourceList); + return 1; + } + result = cudaMemsetAsync(outputBufferGPU, 0, outputTensorDesc[0].size, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memset for output\n"); + cleanUp(&resourceList); + return 1; + } + + // Enqueue a cuDLA task. + cudlaTask task; + task.moduleHandle = moduleHandle; + task.outputTensor = &outputBufferRegisteredPtr; + task.numOutputTensors = 1; + task.numInputTensors = 1; + task.inputTensor = &inputBufferRegisteredPtr; + task.waitEvents = NULL; + task.signalEvents = NULL; + err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in submitting task\n"); + cleanUp(&resourceList); + return 1; + } + DPRINTF("SUBMIT IS DONE !!!\n"); + + // Wait for stream operations to finish and bring output buffer to CPU. 
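/*
 * The copy-back below is where this sample's error-reporting path shows up:
 * when a cuDLA task fails asynchronously, CUDA operations that depend on it
 * return cudaErrorExternalDevice, and cudlaGetLastError() then reports the
 * device-side status. A minimal sketch of that check as a reusable helper;
 * the name checkDlaAsyncError is hypothetical:
 */
static int checkDlaAsyncError(cudaError_t result, cudlaDevHandle dev)
{
    if (result == cudaSuccess) {
        return 0; // Nothing to report.
    }
    if (result == cudaErrorExternalDevice) {
        // The failure originated on the DLA; query its last recorded error.
        cudlaStatus hwStatus = cudlaGetLastError(dev);
        if (hwStatus != cudlaSuccess) {
            DPRINTF("Asynchronous error in HW = %u\n", hwStatus);
        }
        return 1;
    }
    return 2; // Ordinary CUDA failure unrelated to the DLA task.
}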
+ result = cudaMemcpyAsync(outputBuffer, outputBufferGPU, outputTensorDesc[0].size, cudaMemcpyDeviceToHost, stream); + if (result != cudaSuccess) { + if (result != cudaErrorExternalDevice) { + DPRINTF("Error in bringing result back to CPU\n"); + cleanUp(&resourceList); + return 1; + } + else { + cudlaStatus hwStatus = cudlaGetLastError(devHandle); + if (hwStatus != cudlaSuccess) { + DPRINTF("Asynchronous error in HW = %u\n", hwStatus); + } + } + } + + result = cudaStreamSynchronize(stream); + if (result != cudaSuccess) { + DPRINTF("Error in synchronizing stream = %s\n", cudaGetErrorName(result)); + + if (result == cudaErrorExternalDevice) { + cudlaStatus hwStatus = cudlaGetLastError(devHandle); + if (hwStatus != cudlaSuccess) { + DPRINTF("Asynchronous error in HW = %u\n", hwStatus); + } + } } cleanUp(&resourceList); - return 1; - } - resourceList.inputTensorDesc = inputTensorDesc; - resourceList.outputTensorDesc = outputTensorDesc; + DPRINTF("cuDLAErrorReporting DONE !!!\n"); - attribute.inputTensorDesc = inputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting input tensor descriptor = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("Printing input tensor descriptor\n"); - printTensorDesc(inputTensorDesc); - - attribute.outputTensorDesc = outputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting output tensor descriptor = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("Printing output tensor descriptor\n"); - printTensorDesc(outputTensorDesc); - - // Setup the input and output buffers which will be used as an input to CUDA. - unsigned char* inputBuffer = (unsigned char*)malloc(inputTensorDesc[0].size); - if (inputBuffer == NULL) { - DPRINTF("Error in allocating input memory\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.inputBuffer = inputBuffer; - - unsigned char* outputBuffer = - (unsigned char*)malloc(outputTensorDesc[0].size); - if (outputBuffer == NULL) { - DPRINTF("Error in allocating output memory\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.outputBuffer = outputBuffer; - - memset(inputBuffer, 0x01, inputTensorDesc[0].size); - memset(outputBuffer, 0x00, outputTensorDesc[0].size); - - // Allocate memory on GPU. - void* inputBufferGPU; - void* outputBufferGPU; - result = cudaMalloc(&inputBufferGPU, inputTensorDesc[0].size); - if (result != cudaSuccess) { - DPRINTF("Error in allocating input memory on GPU\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.inputBufferGPU = inputBufferGPU; - - result = cudaMalloc(&outputBufferGPU, outputTensorDesc[0].size); - if (result != cudaSuccess) { - DPRINTF("Error in allocating output memory on GPU\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.outputBufferGPU = outputBufferGPU; - - // Register the CUDA-allocated buffers. 
- uint64_t* inputBufferRegisteredPtr = NULL; - uint64_t* outputBufferRegisteredPtr = NULL; - - err = cudlaMemRegister(devHandle, (uint64_t*)inputBufferGPU, - inputTensorDesc[0].size, &inputBufferRegisteredPtr, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in registering input memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - err = - cudlaMemRegister(devHandle, (uint64_t*)outputBufferGPU, - outputTensorDesc[0].size, &outputBufferRegisteredPtr, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in registering output memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); - - // Copy data from CPU buffers to GPU buffers. - result = cudaMemcpyAsync(inputBufferGPU, inputBuffer, inputTensorDesc[0].size, - cudaMemcpyHostToDevice, stream); - if (result != cudaSuccess) { - DPRINTF("Error in enqueueing memcpy for input\n"); - cleanUp(&resourceList); - return 1; - } - result = - cudaMemsetAsync(outputBufferGPU, 0, outputTensorDesc[0].size, stream); - if (result != cudaSuccess) { - DPRINTF("Error in enqueueing memset for output\n"); - cleanUp(&resourceList); - return 1; - } - - // Enqueue a cuDLA task. - cudlaTask task; - task.moduleHandle = moduleHandle; - task.outputTensor = &outputBufferRegisteredPtr; - task.numOutputTensors = 1; - task.numInputTensors = 1; - task.inputTensor = &inputBufferRegisteredPtr; - task.waitEvents = NULL; - task.signalEvents = NULL; - err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in submitting task\n"); - cleanUp(&resourceList); - return 1; - } - DPRINTF("SUBMIT IS DONE !!!\n"); - - // Wait for stream operations to finish and bring output buffer to CPU. - result = - cudaMemcpyAsync(outputBuffer, outputBufferGPU, outputTensorDesc[0].size, - cudaMemcpyDeviceToHost, stream); - if (result != cudaSuccess) { - if (result != cudaErrorExternalDevice) { - DPRINTF("Error in bringing result back to CPU\n"); - cleanUp(&resourceList); - return 1; - } else { - cudlaStatus hwStatus = cudlaGetLastError(devHandle); - if (hwStatus != cudlaSuccess) { - DPRINTF("Asynchronous error in HW = %u\n", hwStatus); - } - } - } - - result = cudaStreamSynchronize(stream); - if (result != cudaSuccess) { - DPRINTF("Error in synchronizing stream = %s\n", cudaGetErrorName(result)); - - if (result == cudaErrorExternalDevice) { - cudlaStatus hwStatus = cudlaGetLastError(devHandle); - if (hwStatus != cudlaSuccess) { - DPRINTF("Asynchronous error in HW = %u\n", hwStatus); - } - } - } - - cleanUp(&resourceList); - - DPRINTF("cuDLAErrorReporting DONE !!!\n"); - - return 0; + return 0; } diff --git a/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/README.md b/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/README.md index 286a52e8..d2a3778b 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/README.md +++ b/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/README.md @@ -30,4 +30,3 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSe Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/main.cu b/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/main.cu index b96a98d9..204032a3 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/main.cu +++ b/Samples/8_Platform_Specific/Tegra/cuDLAHybridMode/main.cu @@ -25,472 +25,463 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cudla.h" -#include "cuda_runtime.h" - #include #include #include -#include #include #include +#include + +#include "cuda_runtime.h" +#include "cudla.h" #define DPRINTF(...) printf(__VA_ARGS__) -static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { - DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); - DPRINTF("\tsize: %lu\n", tensorDesc->size); +static void printTensorDesc(cudlaModuleTensorDescriptor *tensorDesc) +{ + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); - DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, - tensorDesc->h, tensorDesc->w); + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, tensorDesc->h, tensorDesc->w); - DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); - DPRINTF("\tdata type: %d\n", tensorDesc->dataType); - DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); - DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); - DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); - DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); - DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); - DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); - DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); } -static int initializeInputBuffers(char* filePath, - cudlaModuleTensorDescriptor* tensorDesc, - unsigned char* buf) { - // Read the file in filePath and fill up 'buf' according to format - // specified by the user. +static int initializeInputBuffers(char *filePath, cudlaModuleTensorDescriptor *tensorDesc, unsigned char *buf) +{ + // Read the file in filePath and fill up 'buf' according to format + // specified by the user. 
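    /*
     * The body of initializeInputBuffers() is deliberately left empty in this
     * sample. A minimal sketch of one possible implementation, assuming the
     * input file holds exactly tensorDesc->size raw bytes in the tensor's
     * native layout; it is kept commented out because the real format is
     * user-defined:
     */
    // FILE *inFile = fopen(filePath, "rb");
    // if (inFile == NULL) {
    //     DPRINTF("Cannot open input file %s\n", filePath);
    //     return 1;
    // }
    // size_t bytesRead = fread(buf, 1, tensorDesc->size, inFile);
    // fclose(inFile);
    // if (bytesRead != tensorDesc->size) {
    //     DPRINTF("Short read of input tensor data\n");
    //     return 1;
    // }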
- return 0; + return 0; } -typedef struct { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - unsigned char* loadableData; - cudaStream_t stream; - unsigned char* inputBuffer; - unsigned char* outputBuffer; - void* inputBufferGPU; - void* outputBufferGPU; - cudlaModuleTensorDescriptor* inputTensorDesc; - cudlaModuleTensorDescriptor* outputTensorDesc; +typedef struct +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char *loadableData; + cudaStream_t stream; + unsigned char *inputBuffer; + unsigned char *outputBuffer; + void *inputBufferGPU; + void *outputBufferGPU; + cudlaModuleTensorDescriptor *inputTensorDesc; + cudlaModuleTensorDescriptor *outputTensorDesc; } ResourceList; -void cleanUp(ResourceList* resourceList); +void cleanUp(ResourceList *resourceList); -void cleanUp(ResourceList* resourceList) { - if (resourceList->inputTensorDesc != NULL) { - free(resourceList->inputTensorDesc); - resourceList->inputTensorDesc = NULL; - } - if (resourceList->outputTensorDesc != NULL) { - free(resourceList->outputTensorDesc); - resourceList->outputTensorDesc = NULL; - } +void cleanUp(ResourceList *resourceList) +{ + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } - if (resourceList->loadableData != NULL) { - free(resourceList->loadableData); - resourceList->loadableData = NULL; - } + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } - if (resourceList->moduleHandle != NULL) { - cudlaModuleUnload(resourceList->moduleHandle, 0); - resourceList->moduleHandle = NULL; - } + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } - if (resourceList->devHandle != NULL) { - cudlaDestroyDevice(resourceList->devHandle); - resourceList->devHandle = NULL; - } + if (resourceList->devHandle != NULL) { + cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } - if (resourceList->inputBufferGPU != 0) { - cudaFree(resourceList->inputBufferGPU); - resourceList->inputBufferGPU = 0; - } - if (resourceList->outputBufferGPU != 0) { - cudaFree(resourceList->outputBufferGPU); - resourceList->outputBufferGPU = 0; - } + if (resourceList->inputBufferGPU != 0) { + cudaFree(resourceList->inputBufferGPU); + resourceList->inputBufferGPU = 0; + } + if (resourceList->outputBufferGPU != 0) { + cudaFree(resourceList->outputBufferGPU); + resourceList->outputBufferGPU = 0; + } - if (resourceList->inputBuffer != NULL) { - free(resourceList->inputBuffer); - resourceList->inputBuffer = NULL; - } - if (resourceList->outputBuffer != NULL) { - free(resourceList->outputBuffer); - resourceList->outputBuffer = NULL; - } + if (resourceList->inputBuffer != NULL) { + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + if (resourceList->outputBuffer != NULL) { + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } - if (resourceList->stream != NULL) { - cudaStreamDestroy(resourceList->stream); - resourceList->stream = NULL; - } + if (resourceList->stream != NULL) { + cudaStreamDestroy(resourceList->stream); + resourceList->stream = NULL; + } } -int main(int argc, char** argv) { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - cudlaStatus err; - FILE* fp = NULL; - struct stat st; - 
size_t file_size; - size_t actually_read = 0; - unsigned char* loadableData = NULL; +int main(int argc, char **argv) +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + cudlaStatus err; + FILE *fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; - cudaStream_t stream; - cudaError_t result; - const char* errPtr = NULL; + cudaStream_t stream; + cudaError_t result; + const char *errPtr = NULL; - ResourceList resourceList; + ResourceList resourceList; - memset(&resourceList, 0x00, sizeof(ResourceList)); + memset(&resourceList, 0x00, sizeof(ResourceList)); - if (argc != 3) { - DPRINTF("Usage : ./cuDLAHybridMode \n"); - return 1; - } + if (argc != 3) { + DPRINTF("Usage : ./cuDLAHybridMode \n"); + return 1; + } - // Read loadable into buffer. - fp = fopen(argv[1], "rb"); - if (fp == NULL) { - DPRINTF("Cannot open file %s\n", argv[1]); - return 1; - } + // Read loadable into buffer. + fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } - if (stat(argv[1], &st) != 0) { - DPRINTF("Cannot stat file\n"); - return 1; - } + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } - file_size = st.st_size; - DPRINTF("The file size = %ld\n", file_size); + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); - loadableData = (unsigned char*)malloc(file_size); - if (loadableData == NULL) { - DPRINTF("Cannot Allocate memory for loadable\n"); - return 1; - } + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } - actually_read = fread(loadableData, 1, file_size, fp); - if (actually_read != file_size) { + actually_read = fread(loadableData, 1, file_size, fp); + if (actually_read != file_size) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + // Initialize CUDA. + result = cudaFree(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaFree = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + result = cudaSetDevice(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + else { + DPRINTF("Successfully loaded module\n"); + } + + resourceList.moduleHandle = moduleHandle; + + // Create CUDA stream. + result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + resourceList.stream = stream; + + // Get tensor attributes. 
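/*
 * Every attribute query below goes through cudlaModuleGetAttributes(), which
 * fills the cudlaModuleAttribute union; the member that is valid afterwards
 * depends on the attribute type requested. A minimal sketch of a count query
 * factored into a helper (valid only for the two count attributes); the name
 * getTensorCount is hypothetical:
 */
static cudlaStatus getTensorCount(cudlaModule module, cudlaModuleAttributeType type, uint32_t *count)
{
    cudlaModuleAttribute attr;
    cudlaStatus s = cudlaModuleGetAttributes(module, type, &attr);
    if (s != cudlaSuccess) {
        return s;
    }
    // Read the union member matching the requested attribute type.
    *count = (type == CUDLA_NUM_INPUT_TENSORS) ? attr.numInputTensors : attr.numOutputTensors;
    return cudlaSuccess;
}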
+ uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + cudlaModuleTensorDescriptor *inputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numInputTensors); + cudlaModuleTensorDescriptor *outputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if (inputTensorDesc != NULL) { + free(inputTensorDesc); + inputTensorDesc = NULL; + } + + if (outputTensorDesc != NULL) { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + // Setup the input and output buffers which will be used as an input to CUDA. + unsigned char *inputBuffer = (unsigned char *)malloc(inputTensorDesc[0].size); + if (inputBuffer == NULL) { + DPRINTF("Error in allocating input memory\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBuffer = inputBuffer; + + unsigned char *outputBuffer = (unsigned char *)malloc(outputTensorDesc[0].size); + if (outputBuffer == NULL) { + DPRINTF("Error in allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.outputBuffer = outputBuffer; + + memset(inputBuffer, 0x00, inputTensorDesc[0].size); + memset(outputBuffer, 0x00, outputTensorDesc[0].size); + + // Fill up the buffers with data. + if (initializeInputBuffers(argv[2], inputTensorDesc, inputBuffer) != 0) { + DPRINTF("Error in initializing input buffer\n"); + cleanUp(&resourceList); + return 1; + } + + // Allocate memory on GPU. 
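/*
 * In hybrid mode a CUDA allocation is not visible to the DLA until it has
 * been registered; cudlaMemRegister() returns a second pointer, and it is
 * that registered pointer which cudlaSubmitTask() consumes. A minimal sketch
 * of the allocate-and-register pairing used below; the helper name
 * allocAndRegister is hypothetical:
 */
static int allocAndRegister(cudlaDevHandle dev, size_t size, void **gpuPtr, uint64_t **dlaPtr)
{
    if (cudaMalloc(gpuPtr, size) != cudaSuccess) {
        return 1; // GPU allocation failed.
    }
    // Map the freshly allocated GPU buffer into the DLA's address space.
    if (cudlaMemRegister(dev, (uint64_t *)*gpuPtr, size, dlaPtr, 0) != cudlaSuccess) {
        cudaFree(*gpuPtr);
        return 1;
    }
    return 0;
}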
+ void *inputBufferGPU; + void *outputBufferGPU; + result = cudaMalloc(&inputBufferGPU, inputTensorDesc[0].size); + if (result != cudaSuccess) { + DPRINTF("Error in allocating input memory on GPU\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBufferGPU = inputBufferGPU; + + result = cudaMalloc(&outputBufferGPU, outputTensorDesc[0].size); + if (result != cudaSuccess) { + DPRINTF("Error in allocating output memory on GPU\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.outputBufferGPU = outputBufferGPU; + + // Register the CUDA-allocated buffers. + uint64_t *inputBufferRegisteredPtr = NULL; + uint64_t *outputBufferRegisteredPtr = NULL; + + err = + cudlaMemRegister(devHandle, (uint64_t *)inputBufferGPU, inputTensorDesc[0].size, &inputBufferRegisteredPtr, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + err = cudlaMemRegister( + devHandle, (uint64_t *)outputBufferGPU, outputTensorDesc[0].size, &outputBufferRegisteredPtr, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Copy data from CPU buffers to GPU buffers. + result = cudaMemcpyAsync(inputBufferGPU, inputBuffer, inputTensorDesc[0].size, cudaMemcpyHostToDevice, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memcpy for input\n"); + cleanUp(&resourceList); + return 1; + } + result = cudaMemsetAsync(outputBufferGPU, 0, outputTensorDesc[0].size, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memset for output\n"); + cleanUp(&resourceList); + return 1; + } + + // Enqueue a cuDLA task. + cudlaTask task; + task.moduleHandle = moduleHandle; + task.outputTensor = &outputBufferRegisteredPtr; + task.numOutputTensors = 1; + task.numInputTensors = 1; + task.inputTensor = &inputBufferRegisteredPtr; + task.waitEvents = NULL; + task.signalEvents = NULL; + err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in submitting task\n"); + cleanUp(&resourceList); + return 1; + } + DPRINTF("SUBMIT IS DONE !!!\n"); + + // Wait for stream operations to finish and bring output buffer to CPU. + result = cudaMemcpyAsync(outputBuffer, outputBufferGPU, outputTensorDesc[0].size, cudaMemcpyDeviceToHost, stream); + if (result != cudaSuccess) { + DPRINTF("Error in bringing result back to CPU\n"); + cleanUp(&resourceList); + return 1; + } + result = cudaStreamSynchronize(stream); + if (result != cudaSuccess) { + DPRINTF("Error in synchronizing stream\n"); + cleanUp(&resourceList); + return 1; + } + + // Output is available in outputBuffer. + + // Teardown. 
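/*
 * cudlaSubmitTask() above enqueues the DLA task on the same CUDA stream as
 * the surrounding memcpy/memset calls, so plain stream ordering provides all
 * of the synchronization. A minimal sketch of a submit-and-drain wrapper
 * built from the calls this sample already uses; the name submitAndDrain is
 * hypothetical:
 */
static int submitAndDrain(cudlaDevHandle dev, cudlaTask *task, cudaStream_t s)
{
    if (cudlaSubmitTask(dev, task, 1, s, 0) != cudlaSuccess) {
        return 1; // Submission failed synchronously.
    }
    // Block until the DLA task and any queued copies have completed.
    return (cudaStreamSynchronize(s) == cudaSuccess) ? 0 : 1;
}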
+ err = cudlaMemUnregister(devHandle, inputBufferRegisteredPtr); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + err = cudlaMemUnregister(devHandle, outputBufferRegisteredPtr); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); + + free(inputTensorDesc); + free(outputTensorDesc); free(loadableData); - DPRINTF("Read wrong size\n"); - return 1; - } - fclose(fp); + free(inputBuffer); + free(outputBuffer); + cudaFree(inputBufferGPU); + cudaFree(outputBufferGPU); - resourceList.loadableData = loadableData; + resourceList.inputTensorDesc = NULL; + resourceList.outputTensorDesc = NULL; + resourceList.loadableData = NULL; + resourceList.inputBuffer = NULL; + resourceList.outputBuffer = NULL; + resourceList.inputBufferGPU = 0; + resourceList.outputBufferGPU = 0; - // Initialize CUDA. - result = cudaFree(0); - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in creating cudaFree = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - result = cudaSetDevice(0); - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - - err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA); - if (err != cudlaSuccess) { - DPRINTF("Error in cuDLA create device = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - DPRINTF("Device created successfully\n"); - resourceList.devHandle = devHandle; - - err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, - &moduleHandle, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); - cleanUp(&resourceList); - return 1; - } else { - DPRINTF("Successfully loaded module\n"); - } - - resourceList.moduleHandle = moduleHandle; - - // Create CUDA stream. - result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in creating cuda stream = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - - resourceList.stream = stream; - - // Get tensor attributes.
- uint32_t numInputTensors = 0; - uint32_t numOutputTensors = 0; - cudlaModuleAttribute attribute; - - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting numInputTensors = %d\n", err); - cleanUp(&resourceList); - return 1; - } - numInputTensors = attribute.numInputTensors; - DPRINTF("numInputTensors = %d\n", numInputTensors); - - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting numOutputTensors = %d\n", err); - cleanUp(&resourceList); - return 1; - } - numOutputTensors = attribute.numOutputTensors; - DPRINTF("numOutputTensors = %d\n", numOutputTensors); - - cudlaModuleTensorDescriptor* inputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * - numInputTensors); - cudlaModuleTensorDescriptor* outputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * - numOutputTensors); - - if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { - if (inputTensorDesc != NULL) { - free(inputTensorDesc); - inputTensorDesc = NULL; + result = cudaStreamDestroy(stream); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in destroying cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; } - if (outputTensorDesc != NULL) { - free(outputTensorDesc); - outputTensorDesc = NULL; + resourceList.stream = NULL; + + err = cudlaModuleUnload(moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleUnload = %d\n", err); + cleanUp(&resourceList); + return 1; + } + else { + DPRINTF("Successfully unloaded module\n"); } - cleanUp(&resourceList); - return 1; - } + resourceList.moduleHandle = NULL; - resourceList.inputTensorDesc = inputTensorDesc; - resourceList.outputTensorDesc = outputTensorDesc; + err = cudlaDestroyDevice(devHandle); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA destroy device = %d\n", err); + return 1; + } + DPRINTF("Device destroyed successfully\n"); - attribute.inputTensorDesc = inputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting input tensor descriptor = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("Printing input tensor descriptor\n"); - printTensorDesc(inputTensorDesc); + resourceList.devHandle = NULL; - attribute.outputTensorDesc = outputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting output tensor descriptor = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("Printing output tensor descriptor\n"); - printTensorDesc(outputTensorDesc); + DPRINTF("cuDLAHybridMode DONE !!!\n"); - // Setup the input and output buffers which will be used as an input to CUDA. 
- unsigned char* inputBuffer = (unsigned char*)malloc(inputTensorDesc[0].size); - if (inputBuffer == NULL) { - DPRINTF("Error in allocating input memory\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.inputBuffer = inputBuffer; - - unsigned char* outputBuffer = - (unsigned char*)malloc(outputTensorDesc[0].size); - if (outputBuffer == NULL) { - DPRINTF("Error in allocating output memory\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.outputBuffer = outputBuffer; - - memset(inputBuffer, 0x00, inputTensorDesc[0].size); - memset(outputBuffer, 0x00, outputTensorDesc[0].size); - - // Fill up the buffers with data. - if (initializeInputBuffers(argv[2], inputTensorDesc, inputBuffer) != 0) { - DPRINTF("Error in initializing input buffer\n"); - cleanUp(&resourceList); - return 1; - } - - // Allocate memory on GPU. - void* inputBufferGPU; - void* outputBufferGPU; - result = cudaMalloc(&inputBufferGPU, inputTensorDesc[0].size); - if (result != cudaSuccess) { - DPRINTF("Error in allocating input memory on GPU\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.inputBufferGPU = inputBufferGPU; - - result = cudaMalloc(&outputBufferGPU, outputTensorDesc[0].size); - if (result != cudaSuccess) { - DPRINTF("Error in allocating output memory on GPU\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.outputBufferGPU = outputBufferGPU; - - // Register the CUDA-allocated buffers. - uint64_t* inputBufferRegisteredPtr = NULL; - uint64_t* outputBufferRegisteredPtr = NULL; - - err = cudlaMemRegister(devHandle, (uint64_t*)inputBufferGPU, - inputTensorDesc[0].size, &inputBufferRegisteredPtr, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in registering input memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - err = - cudlaMemRegister(devHandle, (uint64_t*)outputBufferGPU, - outputTensorDesc[0].size, &outputBufferRegisteredPtr, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in registering output memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); - - // Copy data from CPU buffers to GPU buffers. - result = cudaMemcpyAsync(inputBufferGPU, inputBuffer, inputTensorDesc[0].size, - cudaMemcpyHostToDevice, stream); - if (result != cudaSuccess) { - DPRINTF("Error in enqueueing memcpy for input\n"); - cleanUp(&resourceList); - return 1; - } - result = - cudaMemsetAsync(outputBufferGPU, 0, outputTensorDesc[0].size, stream); - if (result != cudaSuccess) { - DPRINTF("Error in enqueueing memset for output\n"); - cleanUp(&resourceList); - return 1; - } - - // Enqueue a cuDLA task. - cudlaTask task; - task.moduleHandle = moduleHandle; - task.outputTensor = &outputBufferRegisteredPtr; - task.numOutputTensors = 1; - task.numInputTensors = 1; - task.inputTensor = &inputBufferRegisteredPtr; - task.waitEvents = NULL; - task.signalEvents = NULL; - err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in submitting task\n"); - cleanUp(&resourceList); - return 1; - } - DPRINTF("SUBMIT IS DONE !!!\n"); - - // Wait for stream operations to finish and bring output buffer to CPU. 
- result = - cudaMemcpyAsync(outputBuffer, outputBufferGPU, outputTensorDesc[0].size, - cudaMemcpyDeviceToHost, stream); - if (result != cudaSuccess) { - DPRINTF("Error in bringing result back to CPU\n"); - cleanUp(&resourceList); - return 1; - } - result = cudaStreamSynchronize(stream); - if (result != cudaSuccess) { - DPRINTF("Error in synchronizing stream\n"); - cleanUp(&resourceList); - return 1; - } - - // Output is available in outputBuffer. - - // Teardown. - err = cudlaMemUnregister(devHandle, inputBufferRegisteredPtr); - if (err != cudlaSuccess) { - DPRINTF("Error in unregistering input memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - err = cudlaMemUnregister(devHandle, outputBufferRegisteredPtr); - if (err != cudlaSuccess) { - DPRINTF("Error in registering output memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); - - free(inputTensorDesc); - free(outputTensorDesc); - free(loadableData); - free(inputBuffer); - free(outputBuffer); - cudaFree(inputBufferGPU); - cudaFree(outputBufferGPU); - - resourceList.inputTensorDesc = NULL; - resourceList.outputTensorDesc = NULL; - resourceList.loadableData = NULL; - resourceList.inputBuffer = NULL; - resourceList.outputBuffer = NULL; - resourceList.inputBufferGPU = 0; - resourceList.outputBufferGPU = 0; - - result = cudaStreamDestroy(stream); - if (result != cudaSuccess) { - errPtr = cudaGetErrorName(result); - DPRINTF("Error in destroying cuda stream = %s\n", errPtr); - cleanUp(&resourceList); - return 1; - } - - resourceList.stream = NULL; - - err = cudlaModuleUnload(moduleHandle, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in cudlaModuleUnload = %d\n", err); - cleanUp(&resourceList); - return 1; - } else { - DPRINTF("Successfully unloaded module\n"); - } - - resourceList.moduleHandle = NULL; - - err = cudlaDestroyDevice(devHandle); - if (err != cudlaSuccess) { - DPRINTF("Error in cuDLA destroy device = %d\n", err); - return 1; - } - DPRINTF("Device destroyed successfully\n"); - - resourceList.devHandle = NULL; - - DPRINTF("cuDLAHybridMode DONE !!!\n"); - - return 0; + return 0; } diff --git a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/README.md b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/README.md index 330a8513..9ad11c43 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/README.md +++ b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/README.md @@ -30,4 +30,3 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSe Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/main.cu b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/main.cu index 81e575c6..9ced4b7b 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/main.cu +++ b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsHybrid/main.cu @@ -25,28 +25,28 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cudla.h" -#include "cuda_runtime.h" -#include "cudlaExternalEtbl.hpp" - #include #include #include -#include #include #include +#include -#define MAX_FILENAME_LEN 200 +#include "cuda_runtime.h" +#include "cudla.h" +#include "cudlaExternalEtbl.hpp" + +#define MAX_FILENAME_LEN 200 #define RESERVED_SUFFIX_LEN 10 #define DPRINTF(...) 
printf(__VA_ARGS__) -static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { +static void printTensorDesc(cudlaModuleTensorDescriptor *tensorDesc) +{ DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); DPRINTF("\tsize: %lu\n", tensorDesc->size); - DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, - tensorDesc->h, tensorDesc->w); + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, tensorDesc->h, tensorDesc->w); DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); DPRINTF("\tdata type: %d\n", tensorDesc->dataType); @@ -59,33 +59,35 @@ static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); } -typedef struct { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - unsigned char* loadableData; - cudaStream_t stream; - uint32_t numInputTensors; - uint32_t numOutputTensors; - uint32_t numOutputTaskStatistics; - unsigned char** inputBuffer; - unsigned char** outputBuffer; - unsigned char** statisticsOutputBuffer; - void** inputBufferGPU; - void** outputBufferGPU; - void** outputTaskStatisticsGPU; - void **csv; - cudlaModuleTensorDescriptor* inputTensorDesc; - cudlaModuleTensorDescriptor* outputTensorDesc; - cudlaModuleTensorDescriptor* outputTaskStatisticsDesc; - uint64_t** inputBufferRegisteredPtr; - uint64_t** outputBufferRegisteredPtr; - uint64_t** outputTaskStatisticsRegisteredPtr; - uint64_t** outputStatisticsBufferRegisteredPtr; +typedef struct +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char *loadableData; + cudaStream_t stream; + uint32_t numInputTensors; + uint32_t numOutputTensors; + uint32_t numOutputTaskStatistics; + unsigned char **inputBuffer; + unsigned char **outputBuffer; + unsigned char **statisticsOutputBuffer; + void **inputBufferGPU; + void **outputBufferGPU; + void **outputTaskStatisticsGPU; + void **csv; + cudlaModuleTensorDescriptor *inputTensorDesc; + cudlaModuleTensorDescriptor *outputTensorDesc; + cudlaModuleTensorDescriptor *outputTaskStatisticsDesc; + uint64_t **inputBufferRegisteredPtr; + uint64_t **outputBufferRegisteredPtr; + uint64_t **outputTaskStatisticsRegisteredPtr; + uint64_t **outputStatisticsBufferRegisteredPtr; } ResourceList; -void cleanUp(ResourceList* resourceList); +void cleanUp(ResourceList *resourceList); -void cleanUp(ResourceList* resourceList) { +void cleanUp(ResourceList *resourceList) +{ uint32_t ii = 0; if (resourceList->inputTensorDesc != NULL) { free(resourceList->inputTensorDesc); @@ -152,8 +154,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->csv != NULL) { for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { - if ((resourceList->csv)[ii] != NULL) - { + if ((resourceList->csv)[ii] != NULL) { free((resourceList->csv)[ii]); (resourceList->csv)[ii] = NULL; } @@ -175,8 +176,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->outputBuffer != NULL) { for (ii = 0; ii < resourceList->numOutputTensors; ii++) { - if ((resourceList->outputBuffer)[ii] != NULL) - { + if ((resourceList->outputBuffer)[ii] != NULL) { free((resourceList->outputBuffer)[ii]); (resourceList->outputBuffer)[ii] = NULL; } @@ -221,42 +221,44 @@ void cleanUp(ResourceList* resourceList) { resourceList->outputStatisticsBufferRegisteredPtr = NULL; } - resourceList->numInputTensors = 0; - resourceList->numOutputTensors = 0; + resourceList->numInputTensors = 0; + resourceList->numOutputTensors = 0; resourceList->numOutputTaskStatistics = 0; } -int main(int argc, char** argv) { +int main(int 
argc, char **argv) +{ cudlaDevHandle devHandle; - cudlaModule moduleHandle; - cudlaStatus err; - uint32_t statSupport = 0; - uint32_t dlaFreqInMHz = 0; - FILE* fp = NULL; - struct stat st; - size_t file_size; - size_t actually_read = 0; - unsigned char *loadableData = NULL; - char filename[MAX_FILENAME_LEN]; - const char* suffix = ".csv"; + cudlaModule moduleHandle; + cudlaStatus err; + uint32_t statSupport = 0; + uint32_t dlaFreqInMHz = 0; + FILE *fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; + char filename[MAX_FILENAME_LEN]; + const char *suffix = ".csv"; cudaStream_t stream; - cudaError_t result; - const char* errPtr = NULL; + cudaError_t result; + const char *errPtr = NULL; ResourceList resourceList; memset(&resourceList, 0x00, sizeof(ResourceList)); if ((argc != 4) && (argc != 5)) { - DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_hybrid_test1 \n"); + DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_hybrid_test1 \n"); return 1; } if (argc == 5) { - if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) - { - DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); + if ((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) { + DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n", + (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); return 1; } } @@ -277,7 +279,7 @@ int main(int argc, char** argv) { DPRINTF("The file size = %ld\n", file_size); dlaFreqInMHz = atoi(argv[2]); - statSupport = atoi(argv[3]); + statSupport = atoi(argv[3]); loadableData = (unsigned char *)malloc(file_size); if (loadableData == NULL) { @@ -286,7 +288,7 @@ int main(int argc, char** argv) { } actually_read = fread(loadableData, 1, file_size, fp); - if ( actually_read != file_size ) { + if (actually_read != file_size) { free(loadableData); DPRINTF("Read wrong size\n"); return 1; @@ -327,8 +329,9 @@ int main(int argc, char** argv) { DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); cleanUp(&resourceList); return 1; - } else { - DPRINTF("Successfully loaded module\n"); + } + else { + DPRINTF("Successfully loaded module\n"); } resourceList.moduleHandle = moduleHandle; @@ -346,8 +349,8 @@ int main(int argc, char** argv) { resourceList.stream = stream; // Get tensor attributes. 
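The hunks below only restyle the module-attribute queries; no behavior changes. Condensed, the pattern they reformat is: query a count, size a descriptor array from it, then point the same cudlaModuleAttribute at that array and query again. A sketch using only the calls and types that appear in this sample (the helper name getInputDescriptors is illustrative, not from the source):

    #include <stdint.h>
    #include <stdlib.h>

    #include "cudla.h"

    /* Query the input-tensor count, then fetch the matching descriptor array.
     * The same cudlaModuleAttribute object carries both results. */
    static int getInputDescriptors(cudlaModule moduleHandle, cudlaModuleTensorDescriptor **descOut, uint32_t *countOut)
    {
        cudlaModuleAttribute attribute;

        if (cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute) != cudlaSuccess) {
            return 1;
        }
        *countOut = attribute.numInputTensors;

        *descOut = (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * (*countOut));
        if (*descOut == NULL) {
            return 1;
        }

        /* Point the attribute at the array and query again for the descriptors. */
        attribute.inputTensorDesc = *descOut;
        if (cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attribute) != cudlaSuccess) {
            free(*descOut);
            *descOut = NULL;
            return 1;
        }
        return 0;
    }

The output-tensor and task-statistics descriptors follow the identical shape with CUDLA_NUM_OUTPUT_TENSORS, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, and CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS.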
- uint32_t numInputTensors = 0; - uint32_t numOutputTensors = 0; + uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; uint32_t numOutputTaskStatistics = 0; cudlaModuleAttribute attribute; @@ -382,20 +385,20 @@ int main(int argc, char** argv) { numOutputTaskStatistics = attribute.numOutputTensors; DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics); - if(numOutputTaskStatistics == 0) { + if (numOutputTaskStatistics == 0) { DPRINTF("Layerwise stats is not supported for this Loadable \n"); cleanUp(&resourceList); return 1; } - resourceList.numInputTensors = numInputTensors; - resourceList.numOutputTensors = numOutputTensors; + resourceList.numInputTensors = numInputTensors; + resourceList.numOutputTensors = numOutputTensors; resourceList.numOutputTaskStatistics = numOutputTaskStatistics; - cudlaModuleTensorDescriptor* inputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors); - cudlaModuleTensorDescriptor* outputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors); + cudlaModuleTensorDescriptor *inputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numInputTensors); + cudlaModuleTensorDescriptor *outputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTensors); if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { if (inputTensorDesc != NULL) { @@ -412,11 +415,11 @@ int main(int argc, char** argv) { return 1; } - resourceList.inputTensorDesc = inputTensorDesc; + resourceList.inputTensorDesc = inputTensorDesc; resourceList.outputTensorDesc = outputTensorDesc; - cudlaModuleTensorDescriptor* outputTaskStatisticsDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics); + cudlaModuleTensorDescriptor *outputTaskStatisticsDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTaskStatistics); if (outputTaskStatisticsDesc == NULL) { free(outputTaskStatisticsDesc); outputTaskStatisticsDesc = NULL; @@ -427,9 +430,7 @@ int main(int argc, char** argv) { resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc; attribute.inputTensorDesc = inputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, - CUDLA_INPUT_TENSOR_DESCRIPTORS, - &attribute); + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attribute); if (err != cudlaSuccess) { DPRINTF("Error in getting input tensor descriptor = %d\n", err); cleanUp(&resourceList); @@ -439,9 +440,7 @@ int main(int argc, char** argv) { printTensorDesc(inputTensorDesc); attribute.outputTensorDesc = outputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, - CUDLA_OUTPUT_TENSOR_DESCRIPTORS, - &attribute); + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, &attribute); if (err != cudlaSuccess) { DPRINTF("Error in getting output tensor descriptor = %d\n", err); cleanUp(&resourceList); @@ -451,9 +450,7 @@ int main(int argc, char** argv) { printTensorDesc(outputTensorDesc); attribute.outputTensorDesc = outputTaskStatisticsDesc; - err = cudlaModuleGetAttributes(moduleHandle, - CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, - &attribute); + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, &attribute); if (err != cudlaSuccess) { DPRINTF("Error in getting task statistics descriptor = %d\n", err); cleanUp(&resourceList); @@ -462,20 +459,20 @@ int 
main(int argc, char** argv) { DPRINTF("Printing output task statistics descriptor size\n"); for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size); + DPRINTF("The size of %u descriptor is %lu\n", ii, outputTaskStatisticsDesc[ii].size); } // Setup the input and output buffers which will be used as an input to CUDA. - unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors); + unsigned char **inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *) * numInputTensors); if (inputBuffer == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors); + memset(inputBuffer, 0x00, sizeof(unsigned char *) * numInputTensors); resourceList.inputBuffer = inputBuffer; for (uint32_t ii = 0; ii < numInputTensors; ii++) { - inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size); + inputBuffer[ii] = (unsigned char *)malloc(inputTensorDesc[ii].size); if (inputBuffer[ii] == NULL) { DPRINTF("Error in allocating input memory\n"); cleanUp(&resourceList); @@ -484,17 +481,17 @@ int main(int argc, char** argv) { memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size); } - unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors); + unsigned char **outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *) * numOutputTensors); if (outputBuffer == NULL) { DPRINTF("Error in allocating memory for output buffer array\n"); cleanUp(&resourceList); return 1; } - memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors); + memset(outputBuffer, 0x00, sizeof(unsigned char *) * numOutputTensors); resourceList.outputBuffer = outputBuffer; for (uint32_t ii = 0; ii < numOutputTensors; ii++) { - outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size); + outputBuffer[ii] = (unsigned char *)malloc(outputTensorDesc[ii].size); if (outputBuffer[ii] == NULL) { DPRINTF("Error in allocating output memory\n"); cleanUp(&resourceList); @@ -503,17 +500,18 @@ int main(int argc, char** argv) { memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size); } - unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics); + unsigned char **statisticsOutputBuffer = + (unsigned char **)malloc(sizeof(unsigned char *) * numOutputTaskStatistics); if (statisticsOutputBuffer == NULL) { DPRINTF("Error in allocating memory for output buffer array\n"); cleanUp(&resourceList); return 1; } - memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics); + memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *) * numOutputTaskStatistics); resourceList.statisticsOutputBuffer = statisticsOutputBuffer; for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size); + statisticsOutputBuffer[ii] = (unsigned char *)malloc(outputTaskStatisticsDesc[ii].size); if (outputBuffer[ii] == NULL) { DPRINTF("Error in allocating output memory\n"); cleanUp(&resourceList); @@ -523,32 +521,31 @@ int main(int argc, char** argv) { } // Allocate memory on GPU. 
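The staging-buffer hunks above and the cudaMalloc loops below share one shape: a zero-initialized array of per-tensor pointers, published to the resource list before the loop so cleanUp() can free partial allocations, then one allocation per tensor descriptor. A condensed sketch of that pattern (allocPerTensorBuffers is an illustrative name; calloc stands in for the sample's malloc-plus-memset pair):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #include "cuda_runtime.h"
    #include "cudla.h"

    /* Allocate host staging buffers and matching GPU buffers, one per tensor. */
    static int allocPerTensorBuffers(uint32_t                           numTensors,
                                     const cudlaModuleTensorDescriptor *desc,
                                     unsigned char                   ***hostOut,
                                     void                            ***gpuOut)
    {
        /* Zeroed pointer arrays first, so a failure mid-loop leaves NULLs
         * that the cleanup path can free safely. */
        unsigned char **host = (unsigned char **)calloc(numTensors, sizeof(unsigned char *));
        void          **gpu  = (void **)calloc(numTensors, sizeof(void *));
        if ((host == NULL) || (gpu == NULL)) {
            free(host);
            free(gpu);
            return 1;
        }

        /* Publish before filling, mirroring how the sample records each array
         * in resourceList before its per-tensor loop. */
        *hostOut = host;
        *gpuOut  = gpu;

        for (uint32_t ii = 0; ii < numTensors; ii++) {
            host[ii] = (unsigned char *)malloc(desc[ii].size);
            if ((host[ii] == NULL) || (cudaMalloc(&gpu[ii], desc[ii].size) != cudaSuccess)) {
                return 1; /* caller's cleanup frees whatever was allocated */
            }
            memset(host[ii], 0x00, desc[ii].size);
        }
        return 0;
    }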
- void** inputBufferGPU = (void **)malloc(sizeof(void *)*numInputTensors); + void **inputBufferGPU = (void **)malloc(sizeof(void *) * numInputTensors); if (inputBufferGPU == NULL) { DPRINTF("Error in allocating memory for input buffer GPU array\n"); cleanUp(&resourceList); return 1; } - memset(inputBufferGPU, 0x00, sizeof(void *)*numInputTensors); + memset(inputBufferGPU, 0x00, sizeof(void *) * numInputTensors); resourceList.inputBufferGPU = inputBufferGPU; for (uint32_t ii = 0; ii < numInputTensors; ii++) { result = cudaMalloc(&(inputBufferGPU[ii]), inputTensorDesc[ii].size); - if (result != cudaSuccess) - { + if (result != cudaSuccess) { DPRINTF("Error in allocating input memory on GPU\n"); cleanUp(&resourceList); return 1; } } - void** outputBufferGPU = (void **)malloc(sizeof(void *)*numOutputTensors); + void **outputBufferGPU = (void **)malloc(sizeof(void *) * numOutputTensors); if (outputBufferGPU == NULL) { DPRINTF("Error in allocating memory for output buffer GPU array\n"); cleanUp(&resourceList); return 1; } - memset(outputBufferGPU, 0x00, sizeof(void *)*numOutputTensors); + memset(outputBufferGPU, 0x00, sizeof(void *) * numOutputTensors); resourceList.outputBufferGPU = outputBufferGPU; for (uint32_t ii = 0; ii < numOutputTensors; ii++) { @@ -560,13 +557,13 @@ int main(int argc, char** argv) { } } - void** outputTaskStatisticsGPU = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + void **outputTaskStatisticsGPU = (void **)malloc(sizeof(void *) * numOutputTaskStatistics); if (outputTaskStatisticsGPU == NULL) { DPRINTF("Error in allocating memory for output task statistics GPU array\n"); cleanUp(&resourceList); return 1; } - memset(outputTaskStatisticsGPU, 0x00, sizeof(void *)*numOutputTaskStatistics); + memset(outputTaskStatisticsGPU, 0x00, sizeof(void *) * numOutputTaskStatistics); resourceList.outputTaskStatisticsGPU = outputTaskStatisticsGPU; for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { @@ -578,11 +575,12 @@ int main(int argc, char** argv) { } } - uint64_t** inputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors); - uint64_t** outputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors); - uint64_t** outputTaskStatisticsRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics); + uint64_t **inputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t *) * numInputTensors); + uint64_t **outputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t *) * numOutputTensors); + uint64_t **outputTaskStatisticsRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t *) * numOutputTaskStatistics); - if ((inputBufferRegisteredPtr == NULL) || (outputBufferRegisteredPtr == NULL) || (outputTaskStatisticsRegisteredPtr == NULL)) { + if ((inputBufferRegisteredPtr == NULL) || (outputBufferRegisteredPtr == NULL) + || (outputTaskStatisticsRegisteredPtr == NULL)) { if (inputBufferRegisteredPtr != NULL) { free(inputBufferRegisteredPtr); inputBufferRegisteredPtr = NULL; @@ -602,17 +600,14 @@ int main(int argc, char** argv) { return 1; } - resourceList.inputBufferRegisteredPtr = inputBufferRegisteredPtr; - resourceList.outputBufferRegisteredPtr = outputBufferRegisteredPtr; + resourceList.inputBufferRegisteredPtr = inputBufferRegisteredPtr; + resourceList.outputBufferRegisteredPtr = outputBufferRegisteredPtr; resourceList.outputTaskStatisticsRegisteredPtr = outputTaskStatisticsRegisteredPtr; // Register the CUDA-allocated buffers. 
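The three registration loops below differ only in the flags argument: ordinary tensors pass 0, task-statistics buffers pass CUDLA_TASK_STATISTICS so the DLA may write profiling data into them. One helper covers all three (a sketch; the sample inlines these loops, and registerAll is an illustrative name):

    #include <stdint.h>

    #include "cudla.h"

    /* Register each GPU allocation with cuDLA; registeredPtr[ii] receives the
     * pointer that cudlaTask understands. */
    static cudlaStatus registerAll(cudlaDevHandle                     devHandle,
                                   void                             **bufGPU,
                                   const cudlaModuleTensorDescriptor *desc,
                                   uint64_t                         **registeredPtr,
                                   uint32_t                           count,
                                   uint32_t                           flags)
    {
        for (uint32_t ii = 0; ii < count; ii++) {
            cudlaStatus err =
                cudlaMemRegister(devHandle, (uint64_t *)(bufGPU[ii]), desc[ii].size, &(registeredPtr[ii]), flags);
            if (err != cudlaSuccess) {
                return err;
            }
        }
        return cudlaSuccess;
    }

    /* e.g. registerAll(devHandle, outputTaskStatisticsGPU, outputTaskStatisticsDesc,
     *                  outputTaskStatisticsRegisteredPtr, numOutputTaskStatistics,
     *                  CUDLA_TASK_STATISTICS); */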
for (uint32_t ii = 0; ii < numInputTensors; ii++) { - err = cudlaMemRegister(devHandle, - (uint64_t* )(inputBufferGPU[ii]), - inputTensorDesc[ii].size, - &(inputBufferRegisteredPtr[ii]), - 0); + err = cudlaMemRegister( + devHandle, (uint64_t *)(inputBufferGPU[ii]), inputTensorDesc[ii].size, &(inputBufferRegisteredPtr[ii]), 0); if (err != cudlaSuccess) { DPRINTF("Error in registering input memory = %d\n", err); cleanUp(&resourceList); @@ -622,7 +617,7 @@ int main(int argc, char** argv) { for (uint32_t ii = 0; ii < numOutputTensors; ii++) { err = cudlaMemRegister(devHandle, - (uint64_t* )(outputBufferGPU[ii]), + (uint64_t *)(outputBufferGPU[ii]), outputTensorDesc[ii].size, &(outputBufferRegisteredPtr[ii]), 0); @@ -635,7 +630,7 @@ int main(int argc, char** argv) { for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { err = cudlaMemRegister(devHandle, - (uint64_t* )(outputTaskStatisticsGPU[ii]), + (uint64_t *)(outputTaskStatisticsGPU[ii]), outputTaskStatisticsDesc[ii].size, &(outputTaskStatisticsRegisteredPtr[ii]), CUDLA_TASK_STATISTICS); @@ -650,7 +645,8 @@ int main(int argc, char** argv) { // Copy data from CPU buffers to GPU buffers. for (uint32_t ii = 0; ii < numInputTensors; ii++) { - result = cudaMemcpyAsync(inputBufferGPU[ii], inputBuffer[ii], inputTensorDesc[ii].size, cudaMemcpyHostToDevice, stream); + result = cudaMemcpyAsync( + inputBufferGPU[ii], inputBuffer[ii], inputTensorDesc[ii].size, cudaMemcpyHostToDevice, stream); if (result != cudaSuccess) { DPRINTF("Error in enqueueing memcpy for input\n"); cleanUp(&resourceList); @@ -677,36 +673,37 @@ int main(int argc, char** argv) { } uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0}; - uint32_t index = 0; - for (; index < numOutputTensors ; index++) { + uint32_t index = 0; + for (; index < numOutputTensors; index++) { outputStatisticsBufferRegisteredPtr[index] = ((outputBufferRegisteredPtr[index])); } - for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) { + for (uint32_t jj = 0; jj < numOutputTaskStatistics; jj++) { outputStatisticsBufferRegisteredPtr[index++] = ((outputTaskStatisticsRegisteredPtr[jj])); } // Enqueue a cuDLA task. cudlaTask task; task.moduleHandle = moduleHandle; - task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr; + task.outputTensor = (uint64_t *const *)&outputStatisticsBufferRegisteredPtr; - if(statSupport == 1) { + if (statSupport == 1) { task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics); DPRINTF("Layerwise profiling is requested \n"); - } else { - task.numOutputTensors = numOutputTensors; - DPRINTF("Layerwise profiling is not requested \n"); + } + else { + task.numOutputTensors = numOutputTensors; + DPRINTF("Layerwise profiling is not requested \n"); } task.numInputTensors = numInputTensors; - task.inputTensor = inputBufferRegisteredPtr; - task.waitEvents = NULL; - task.signalEvents = NULL; + task.inputTensor = inputBufferRegisteredPtr; + task.waitEvents = NULL; + task.signalEvents = NULL; err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); if (err != cudlaSuccess) { - DPRINTF("no of output tensor %u \n",(task.numOutputTensors)); + DPRINTF("no of output tensor %u \n", (task.numOutputTensors)); DPRINTF("Error in submitting task\n"); cleanUp(&resourceList); return 1; @@ -722,8 +719,8 @@ int main(int argc, char** argv) { // Wait for stream operations to finish and bring output buffer to CPU. 
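The submission hunk above carries the one behavioral subtlety of this sample: when layerwise profiling is requested, the registered statistics pointers are appended after the ordinary outputs in a single array, and task.numOutputTensors is enlarged to cover both. Condensed as a sketch over the same cudlaTask fields the sample sets (submitWithOptionalStats is an illustrative name):

    #include <stdint.h>

    #include "cuda_runtime.h"
    #include "cudla.h"

    static cudlaStatus submitWithOptionalStats(cudlaDevHandle devHandle,
                                               cudlaModule    moduleHandle,
                                               uint64_t     **inputPtrs,
                                               uint32_t       numInputs,
                                               uint64_t     **outputsThenStats, /* outputs first, stats appended */
                                               uint32_t       numOutputs,
                                               uint32_t       numStats,
                                               uint32_t       statSupport,
                                               cudaStream_t   stream)
    {
        cudlaTask task;
        task.moduleHandle     = moduleHandle;
        task.inputTensor      = inputPtrs;
        task.numInputTensors  = numInputs;
        task.outputTensor     = outputsThenStats;
        /* With profiling on, the DLA writes one statistics blob per extra entry. */
        task.numOutputTensors = (statSupport == 1) ? (numOutputs + numStats) : numOutputs;
        task.waitEvents       = NULL;
        task.signalEvents     = NULL;

        return cudlaSubmitTask(devHandle, &task, 1, stream, 0);
    }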
for (uint32_t ii = 0; ii < numOutputTensors; ii++) { - result = cudaMemcpyAsync(outputBuffer[ii], outputBufferGPU[ii], - outputTensorDesc[ii].size, cudaMemcpyDeviceToHost, stream); + result = cudaMemcpyAsync( + outputBuffer[ii], outputBufferGPU[ii], outputTensorDesc[ii].size, cudaMemcpyDeviceToHost, stream); if (result != cudaSuccess) { DPRINTF("Error in bringing result back to CPU\n"); cleanUp(&resourceList); @@ -738,11 +735,14 @@ int main(int argc, char** argv) { return 1; } - if(statSupport == 1) { + if (statSupport == 1) { // copy statistics data to cpu for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - result = cudaMemcpyAsync(statisticsOutputBuffer[ii], outputTaskStatisticsGPU[ii], - outputTaskStatisticsDesc[ii].size, cudaMemcpyDeviceToHost, stream); + result = cudaMemcpyAsync(statisticsOutputBuffer[ii], + outputTaskStatisticsGPU[ii], + outputTaskStatisticsDesc[ii].size, + cudaMemcpyDeviceToHost, + stream); if (result != cudaSuccess) { DPRINTF("Error in bringing result back to CPU\n"); cleanUp(&resourceList); @@ -760,35 +760,36 @@ int main(int argc, char** argv) { // To get the last index of the filename prefix in which statistics will be dumped uint32_t index = 0; if (argc == 5) { - while(argv[4][index]!='\0') { + while (argv[4][index] != '\0') { index++; } } - const cudlaExternalEtbl* etbl = NULL; - if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) { + const cudlaExternalEtbl *etbl = NULL; + if (cudlaGetExternalExportTable(&etbl, 0) != cudlaSuccess) { DPRINTF("Error in getting export table\n"); cleanUp(&resourceList); return 1; } - void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + void **csv = (void **)malloc(sizeof(void *) * numOutputTaskStatistics); if (csv == NULL) { DPRINTF("Error in allocating memory for csv stream\n"); cleanUp(&resourceList); return 1; } - memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics); + memset(csv, 0x00, sizeof(void *) * numOutputTaskStatistics); resourceList.csv = csv; for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { cudlaTranslateCsvAttribute csvAttribute; - uint64_t csvStreamLength = 0; + uint64_t csvStreamLength = 0; - err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute); - csv[ii] = (void* )malloc(csvAttribute.csvStreamLength); + err = etbl->etiTranslateStats( + devHandle, statisticsOutputBuffer[ii], dlaFreqInMHz, ii, CUDLA_GET_CSV_LENGTH, &csvAttribute); + csv[ii] = (void *)malloc(csvAttribute.csvStreamLength); csvStreamLength = csvAttribute.csvStreamLength; - DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength); + DPRINTF("size for statistics buffer %u is %lu \n", ii, csvStreamLength); if (csv[ii] == NULL) { DPRINTF("Error in allocating memory for csv stream\n"); @@ -798,7 +799,8 @@ int main(int argc, char** argv) { memset(csv[ii], 0x00, csvAttribute.csvStreamLength); csvAttribute.csvStreamStats = csv[ii]; - err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute); + err = etbl->etiTranslateStats( + devHandle, statisticsOutputBuffer[ii], dlaFreqInMHz, ii, CUDLA_GET_CSV_STATS, &csvAttribute); if (err != cudlaSuccess) { DPRINTF("Error in translating stats\n"); cleanUp(&resourceList); @@ -806,7 +808,7 @@ int main(int argc, char** argv) { } if (argc == 5) { - sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix); + sprintf(filename, "%s%u%s", argv[4], (ii + 1), suffix); fp = fopen(filename, "w+"); if (fp == NULL) { DPRINTF("Cannot open file %s\n", 
filename); @@ -814,23 +816,23 @@ int main(int argc, char** argv) { return 1; } - uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp); - if(ret_val != csvStreamLength) { + uint32_t ret_val = fwrite(csv[ii], sizeof(char), csvStreamLength, fp); + if (ret_val != csvStreamLength) { DPRINTF("number of elements written to file is %u \n", ret_val); cleanUp(&resourceList); return 1; } fclose(fp); - } else { - DPRINTF("%s \n",(char *)csv[ii]); + } + else { + DPRINTF("%s \n", (char *)csv[ii]); } } } // unregister the CUDA-allocated buffers. for (uint32_t ii = 0; ii < numInputTensors; ii++) { - err = cudlaMemUnregister(devHandle, - (inputBufferRegisteredPtr[ii])); + err = cudlaMemUnregister(devHandle, (inputBufferRegisteredPtr[ii])); if (err != cudlaSuccess) { DPRINTF("Error in registering input memory = %d\n", err); cleanUp(&resourceList); @@ -839,8 +841,7 @@ int main(int argc, char** argv) { } for (uint32_t ii = 0; ii < numOutputTensors; ii++) { - err = cudlaMemUnregister(devHandle, - (outputBufferRegisteredPtr[ii])); + err = cudlaMemUnregister(devHandle, (outputBufferRegisteredPtr[ii])); if (err != cudlaSuccess) { DPRINTF("Error in registering output memory = %d\n", err); cleanUp(&resourceList); @@ -849,8 +850,7 @@ int main(int argc, char** argv) { } for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - err = cudlaMemUnregister(devHandle, - (outputTaskStatisticsRegisteredPtr[ii])); + err = cudlaMemUnregister(devHandle, (outputTaskStatisticsRegisteredPtr[ii])); if (err != cudlaSuccess) { DPRINTF("Error in registering output memory = %d\n", err); cleanUp(&resourceList); @@ -875,7 +875,8 @@ int main(int argc, char** argv) { DPRINTF("Error in cudlaModuleUnload = %d\n", err); cleanUp(&resourceList); return 1; - } else { + } + else { DPRINTF("Successfully unloaded module\n"); } diff --git a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/README.md b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/README.md index 403afbfe..8348e83e 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/README.md +++ b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/README.md @@ -31,4 +31,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/main.cpp b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/main.cpp index 639e7889..4973801b 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/main.cpp +++ b/Samples/8_Platform_Specific/Tegra/cuDLALayerwiseStatsStandalone/main.cpp @@ -25,34 +25,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cudla.h" -#include "nvscierror.h" -#include "nvscibuf.h" -#include "nvscisync.h" -#include "cudlaExternalEtbl.hpp" - -#include #include #include #include -#include #include #include +#include +#include -#define MAX_FILENAME_LEN 200 +#include "cudla.h" +#include "cudlaExternalEtbl.hpp" +#include "nvscibuf.h" +#include "nvscierror.h" +#include "nvscisync.h" + +#define MAX_FILENAME_LEN 200 #define RESERVED_SUFFIX_LEN 10 #define DPRINTF(...) 
printf(__VA_ARGS__) -static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { +static void printTensorDesc(cudlaModuleTensorDescriptor *tensorDesc) +{ DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); DPRINTF("\tsize: %lu\n", tensorDesc->size); - DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", - tensorDesc->n, - tensorDesc->c, - tensorDesc->h, - tensorDesc->w); + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, tensorDesc->h, tensorDesc->w); DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); DPRINTF("\tdata type: %d\n", tensorDesc->dataType); @@ -65,51 +62,53 @@ static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); } -typedef struct { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - unsigned char* loadableData; - uint32_t numInputTensors; - uint32_t numOutputTensors; - uint32_t numOutputTaskStatistics; - unsigned char** inputBuffer; - unsigned char** outputBuffer; - unsigned char** statisticsOutputBuffer; - cudlaModuleTensorDescriptor* inputTensorDesc; - cudlaModuleTensorDescriptor* outputTensorDesc; - cudlaModuleTensorDescriptor* outputTaskStatisticsDesc; - NvSciBufObj* inputBufObj; - NvSciBufObj* outputBufObj; - NvSciBufObj* statisticsBufObj; - NvSciBufModule bufModule; - NvSciBufAttrList* inputAttrList; - NvSciBufAttrList* reconciledInputAttrList; - NvSciBufAttrList* inputConflictList; - NvSciBufAttrList* outputAttrList; - NvSciBufAttrList* reconciledOutputAttrList; - NvSciBufAttrList* outputConflictList; - NvSciSyncObj syncObj; - NvSciSyncModule syncModule; - NvSciSyncCpuWaitContext nvSciCtx; - NvSciSyncAttrList waiterAttrListObj; - NvSciSyncAttrList signalerAttrListObj; - NvSciSyncAttrList nvSciSyncConflictListObj; - NvSciSyncAttrList nvSciSyncReconciledListObj; - NvSciBufAttrList* statisticsOutputAttrList; - NvSciBufAttrList* reconciledStatisticsOutputAttrList; - NvSciBufAttrList* statisticsOutputConflictList; - uint64_t** inputBufObjRegPtr; - uint64_t** outputBufObjRegPtr; - uint64_t** statisticsBufObjRegPtr; - uint64_t** devPtrs; - cudlaSignalEvents* signalEvents; - NvSciSyncFence eofFence; - void **csv; +typedef struct +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char *loadableData; + uint32_t numInputTensors; + uint32_t numOutputTensors; + uint32_t numOutputTaskStatistics; + unsigned char **inputBuffer; + unsigned char **outputBuffer; + unsigned char **statisticsOutputBuffer; + cudlaModuleTensorDescriptor *inputTensorDesc; + cudlaModuleTensorDescriptor *outputTensorDesc; + cudlaModuleTensorDescriptor *outputTaskStatisticsDesc; + NvSciBufObj *inputBufObj; + NvSciBufObj *outputBufObj; + NvSciBufObj *statisticsBufObj; + NvSciBufModule bufModule; + NvSciBufAttrList *inputAttrList; + NvSciBufAttrList *reconciledInputAttrList; + NvSciBufAttrList *inputConflictList; + NvSciBufAttrList *outputAttrList; + NvSciBufAttrList *reconciledOutputAttrList; + NvSciBufAttrList *outputConflictList; + NvSciSyncObj syncObj; + NvSciSyncModule syncModule; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj; + NvSciSyncAttrList signalerAttrListObj; + NvSciSyncAttrList nvSciSyncConflictListObj; + NvSciSyncAttrList nvSciSyncReconciledListObj; + NvSciBufAttrList *statisticsOutputAttrList; + NvSciBufAttrList *reconciledStatisticsOutputAttrList; + NvSciBufAttrList *statisticsOutputConflictList; + uint64_t **inputBufObjRegPtr; + uint64_t **outputBufObjRegPtr; + uint64_t **statisticsBufObjRegPtr; + uint64_t **devPtrs; + cudlaSignalEvents *signalEvents; + 
NvSciSyncFence eofFence; + void **csv; } ResourceList; -void cleanUp(ResourceList* resourceList); +void cleanUp(ResourceList *resourceList); -void cleanUp(ResourceList* resourceList) { +void cleanUp(ResourceList *resourceList) +{ uint32_t ii = 0; if (resourceList->inputTensorDesc != NULL) { @@ -143,7 +142,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->inputBufObj != NULL) { for (ii = 0; ii < resourceList->numInputTensors; ii++) { - if((resourceList->inputBufObj)[ii] != NULL) { + if ((resourceList->inputBufObj)[ii] != NULL) { NvSciBufObjFree((resourceList->inputBufObj)[ii]); (resourceList->inputBufObj)[ii] = NULL; } @@ -152,7 +151,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->outputBufObj != NULL) { for (ii = 0; ii < resourceList->numOutputTensors; ii++) { - if((resourceList->outputBufObj)[ii] != NULL) { + if ((resourceList->outputBufObj)[ii] != NULL) { NvSciBufObjFree((resourceList->outputBufObj)[ii]); (resourceList->outputBufObj)[ii] = NULL; } @@ -161,7 +160,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->statisticsBufObj != NULL) { for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { - if((resourceList->statisticsBufObj)[ii] != NULL) { + if ((resourceList->statisticsBufObj)[ii] != NULL) { NvSciBufObjFree((resourceList->statisticsBufObj)[ii]); (resourceList->statisticsBufObj)[ii] = NULL; } @@ -214,7 +213,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->reconciledInputAttrList != NULL) { for (ii = 0; ii < resourceList->numInputTensors; ii++) { - if((resourceList->reconciledInputAttrList)[ii] != NULL) { + if ((resourceList->reconciledInputAttrList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->reconciledInputAttrList)[ii]); (resourceList->reconciledInputAttrList)[ii] = NULL; } @@ -225,7 +224,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->inputConflictList != NULL) { for (ii = 0; ii < resourceList->numInputTensors; ii++) { - if((resourceList->inputConflictList)[ii] != NULL) { + if ((resourceList->inputConflictList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->inputConflictList)[ii]); (resourceList->inputConflictList)[ii] = NULL; } @@ -236,7 +235,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->inputAttrList != NULL) { for (ii = 0; ii < resourceList->numInputTensors; ii++) { - if((resourceList->inputAttrList)[ii] != NULL) { + if ((resourceList->inputAttrList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->inputAttrList)[ii]); (resourceList->inputAttrList)[ii] = NULL; } @@ -247,7 +246,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->reconciledOutputAttrList != NULL) { for (ii = 0; ii < resourceList->numOutputTensors; ii++) { - if((resourceList->reconciledOutputAttrList)[ii] != NULL) { + if ((resourceList->reconciledOutputAttrList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->reconciledOutputAttrList)[ii]); (resourceList->reconciledOutputAttrList)[ii] = NULL; } @@ -258,7 +257,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->outputConflictList != NULL) { for (ii = 0; ii < resourceList->numOutputTensors; ii++) { - if((resourceList->outputConflictList)[ii] != NULL) { + if ((resourceList->outputConflictList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->outputConflictList)[ii]); (resourceList->outputConflictList)[ii] = NULL; } @@ -269,7 +268,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->outputAttrList != NULL) { for (ii = 0; ii < resourceList->numOutputTensors; ii++) { - 
if((resourceList->outputAttrList)[ii] != NULL) { + if ((resourceList->outputAttrList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->outputAttrList)[ii]); (resourceList->outputAttrList)[ii] = NULL; } @@ -280,7 +279,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->reconciledStatisticsOutputAttrList != NULL) { for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { - if((resourceList->reconciledStatisticsOutputAttrList)[ii] != NULL) { + if ((resourceList->reconciledStatisticsOutputAttrList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->reconciledStatisticsOutputAttrList)[ii]); (resourceList->reconciledStatisticsOutputAttrList)[ii] = NULL; } @@ -291,7 +290,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->statisticsOutputConflictList != NULL) { for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { - if((resourceList->statisticsOutputConflictList)[ii] != NULL) { + if ((resourceList->statisticsOutputConflictList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->statisticsOutputConflictList)[ii]); (resourceList->statisticsOutputConflictList)[ii] = NULL; } @@ -302,7 +301,7 @@ void cleanUp(ResourceList* resourceList) { if (resourceList->statisticsOutputAttrList != NULL) { for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { - if((resourceList->statisticsOutputAttrList)[ii] != NULL) { + if ((resourceList->statisticsOutputAttrList)[ii] != NULL) { NvSciBufAttrListFree((resourceList->statisticsOutputAttrList)[ii]); (resourceList->statisticsOutputAttrList)[ii] = NULL; } @@ -381,21 +380,18 @@ void cleanUp(ResourceList* resourceList) { resourceList->devPtrs = NULL; } - resourceList->numInputTensors = 0; - resourceList->numOutputTensors = 0; + resourceList->numInputTensors = 0; + resourceList->numOutputTensors = 0; resourceList->numOutputTaskStatistics = 0; } -cudlaStatus createAndSetAttrList(NvSciBufModule module, - uint64_t bufSize, - NvSciBufAttrList *attrList); +cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, NvSciBufAttrList *attrList); -cudlaStatus createAndSetAttrList(NvSciBufModule module, - uint64_t bufSize, - NvSciBufAttrList *attrList) { - cudlaStatus status = cudlaSuccess; - NvSciError sciStatus = NvSciError_Success; +cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, NvSciBufAttrList *attrList) +{ + cudlaStatus status = cudlaSuccess; + NvSciError sciStatus = NvSciError_Success; sciStatus = NvSciBufAttrListCreate(module, attrList); if (sciStatus != NvSciError_Success) { @@ -405,45 +401,28 @@ cudlaStatus createAndSetAttrList(NvSciBufModule module, } // TODO: Refactor into multiple dimensions - bool needCpuAccess = true; - NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - uint32_t dimcount = 1; - uint64_t sizes[] = {bufSize}; - uint32_t alignment[] = {1}; - uint32_t dataType = NvSciDataType_Int8; - NvSciBufType type = NvSciBufType_Tensor; - uint64_t baseAddrAlign = 512; + bool needCpuAccess = true; + NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; + uint32_t dimcount = 1; + uint64_t sizes[] = {bufSize}; + uint32_t alignment[] = {1}; + uint32_t dataType = NvSciDataType_Int8; + NvSciBufType type = NvSciBufType_Tensor; + uint64_t baseAddrAlign = 512; NvSciBufAttrKeyValuePair setAttrs[] = { - {.key = NvSciBufGeneralAttrKey_Types, - .value = &type, - .len = sizeof(type)}, - {.key = NvSciBufTensorAttrKey_DataType, - .value = &dataType, - .len = sizeof(dataType)}, - {.key = NvSciBufTensorAttrKey_NumDims, - .value = &dimcount, - .len = 
sizeof(dimcount)}, - {.key = NvSciBufTensorAttrKey_SizePerDim, - .value = &sizes, - .len = sizeof(sizes)}, - {.key = NvSciBufTensorAttrKey_AlignmentPerDim, - .value = &alignment, - .len = sizeof(alignment)}, - {.key = NvSciBufTensorAttrKey_BaseAddrAlign, - .value = &baseAddrAlign, - .len = sizeof(baseAddrAlign)}, - {.key = NvSciBufGeneralAttrKey_RequiredPerm, - .value = &perm, - .len = sizeof(perm)}, - {.key = NvSciBufGeneralAttrKey_NeedCpuAccess, - .value = &needCpuAccess, - .len = sizeof(needCpuAccess)}}; + {.key = NvSciBufGeneralAttrKey_Types, .value = &type, .len = sizeof(type)}, + {.key = NvSciBufTensorAttrKey_DataType, .value = &dataType, .len = sizeof(dataType)}, + {.key = NvSciBufTensorAttrKey_NumDims, .value = &dimcount, .len = sizeof(dimcount)}, + {.key = NvSciBufTensorAttrKey_SizePerDim, .value = &sizes, .len = sizeof(sizes)}, + {.key = NvSciBufTensorAttrKey_AlignmentPerDim, .value = &alignment, .len = sizeof(alignment)}, + {.key = NvSciBufTensorAttrKey_BaseAddrAlign, .value = &baseAddrAlign, .len = sizeof(baseAddrAlign)}, + {.key = NvSciBufGeneralAttrKey_RequiredPerm, .value = &perm, .len = sizeof(perm)}, + {.key = NvSciBufGeneralAttrKey_NeedCpuAccess, .value = &needCpuAccess, .len = sizeof(needCpuAccess)}}; size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair); sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length); - if (sciStatus != NvSciError_Success) - { + if (sciStatus != NvSciError_Success) { status = cudlaErrorNvSci; DPRINTF("Error in setting NvSciBuf attribute list\n"); return status; @@ -454,33 +433,35 @@ cudlaStatus createAndSetAttrList(NvSciBufModule module, NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list); -NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) { - bool cpuWaiter = true; +NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) +{ + bool cpuWaiter = true; NvSciSyncAttrKeyValuePair keyValue[2]; memset(keyValue, 0, sizeof(keyValue)); - keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; - keyValue[0].value = (void*) &cpuWaiter; - keyValue[0].len = sizeof(cpuWaiter); + keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; + keyValue[0].value = (void *)&cpuWaiter; + keyValue[0].len = sizeof(cpuWaiter); NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; - keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; - keyValue[1].value = (void*) &cpuPerm; - keyValue[1].len = sizeof(cpuPerm); + keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; + keyValue[1].value = (void *)&cpuPerm; + keyValue[1].len = sizeof(cpuPerm); return NvSciSyncAttrListSetAttrs(list, keyValue, 2); } -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ cudlaDevHandle devHandle; - cudlaModule moduleHandle; - cudlaStatus err; - uint32_t statSupport = 0; - uint32_t dlaFreqInMHz = 0; - FILE* fp = NULL; - struct stat st; - size_t file_size; - size_t actually_read = 0; - unsigned char *loadableData = NULL; - char filename[MAX_FILENAME_LEN]; - const char* suffix = ".csv"; + cudlaModule moduleHandle; + cudlaStatus err; + uint32_t statSupport = 0; + uint32_t dlaFreqInMHz = 0; + FILE *fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; + char filename[MAX_FILENAME_LEN]; + const char *suffix = ".csv"; ResourceList resourceList; @@ -488,13 +469,15 @@ int main(int argc, char** argv) { memset(&resourceList, 0x00, sizeof(ResourceList)); if ((argc != 4) && (argc != 5)) { - DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_standalone_test1 \n"); + DPRINTF("Usage : 
./test_cudla_layerwise_stats_L0_standalone_test1 " + "\n"); return 1; } if (argc == 5) { - if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) { - DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); + if ((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) { + DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n", + (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); return 1; } } @@ -515,7 +498,7 @@ int main(int argc, char** argv) { DPRINTF("The file size = %ld\n", file_size); dlaFreqInMHz = atoi(argv[2]); - statSupport = atoi(argv[3]); + statSupport = atoi(argv[3]); loadableData = (unsigned char *)malloc(file_size); if (loadableData == NULL) { @@ -524,7 +507,7 @@ int main(int argc, char** argv) { } actually_read = fread(loadableData, 1, file_size, fp); - if ( actually_read != file_size ) { + if (actually_read != file_size) { free(loadableData); DPRINTF("Read wrong size\n"); return 1; @@ -548,15 +531,16 @@ int main(int argc, char** argv) { DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); cleanUp(&resourceList); return 1; - } else { + } + else { DPRINTF("Successfully loaded module\n"); } resourceList.moduleHandle = moduleHandle; // Get tensor attributes. - uint32_t numInputTensors = 0; - uint32_t numOutputTensors = 0; + uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; uint32_t numOutputTaskStatistics = 0; cudlaModuleAttribute attribute; @@ -591,30 +575,28 @@ int main(int argc, char** argv) { numOutputTaskStatistics = attribute.numOutputTensors; DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics); - if(numOutputTaskStatistics == 0) { + if (numOutputTaskStatistics == 0) { DPRINTF("Layerwise stats is not supported for this Loadable \n"); cleanUp(&resourceList); return 1; } - resourceList.numInputTensors = numInputTensors; - resourceList.numOutputTensors = numOutputTensors; + resourceList.numInputTensors = numInputTensors; + resourceList.numOutputTensors = numOutputTensors; resourceList.numOutputTaskStatistics = numOutputTaskStatistics; - cudlaModuleTensorDescriptor* inputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors); - cudlaModuleTensorDescriptor* outputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors); + cudlaModuleTensorDescriptor *inputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numInputTensors); + cudlaModuleTensorDescriptor *outputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTensors); if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { - if (inputTensorDesc != NULL) - { + if (inputTensorDesc != NULL) { free(inputTensorDesc); inputTensorDesc = NULL; } - if (outputTensorDesc != NULL) - { + if (outputTensorDesc != NULL) { free(outputTensorDesc); outputTensorDesc = NULL; } @@ -623,11 +605,11 @@ int main(int argc, char** argv) { return 1; } - resourceList.inputTensorDesc = inputTensorDesc; + resourceList.inputTensorDesc = inputTensorDesc; resourceList.outputTensorDesc = outputTensorDesc; - cudlaModuleTensorDescriptor* outputTaskStatisticsDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics); + cudlaModuleTensorDescriptor *outputTaskStatisticsDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * 
numOutputTaskStatistics); if (outputTaskStatisticsDesc == NULL) { free(outputTaskStatisticsDesc); outputTaskStatisticsDesc = NULL; @@ -638,9 +620,7 @@ int main(int argc, char** argv) { resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc; attribute.inputTensorDesc = inputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, - CUDLA_INPUT_TENSOR_DESCRIPTORS, - &attribute); + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attribute); if (err != cudlaSuccess) { DPRINTF("Error in getting input tensor descriptor = %d\n", err); cleanUp(&resourceList); @@ -650,9 +630,7 @@ int main(int argc, char** argv) { printTensorDesc(inputTensorDesc); attribute.outputTensorDesc = outputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, - CUDLA_OUTPUT_TENSOR_DESCRIPTORS, - &attribute); + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, &attribute); if (err != cudlaSuccess) { DPRINTF("Error in getting output tensor descriptor = %d\n", err); cleanUp(&resourceList); @@ -662,9 +640,7 @@ int main(int argc, char** argv) { printTensorDesc(outputTensorDesc); attribute.outputTensorDesc = outputTaskStatisticsDesc; - err = cudlaModuleGetAttributes(moduleHandle, - CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, - &attribute); + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, &attribute); if (err != cudlaSuccess) { DPRINTF("Error in getting task statistics descriptor = %d\n", err); cleanUp(&resourceList); @@ -672,21 +648,21 @@ int main(int argc, char** argv) { } DPRINTF("Printing output task statistics descriptor size\n"); for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size); + DPRINTF("The size of %u descriptor is %lu\n", ii, outputTaskStatisticsDesc[ii].size); } // Setup the input and output buffers. 
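From here the standalone sample diverges from the hybrid one: device-visible memory does not come from cudaMalloc but from NvSciBuf. Beyond the host staging arrays below, each tensor needs an attribute list (built by this sample's createAndSetAttrList), a reconcile step, and a buffer-object allocation. A condensed per-tensor sketch; allocTensorBufObj is an illustrative name, NvSciBufObjAlloc is the standard NvSciBuf allocator (its use here is inferred, as this section only shows the matching NvSciBufObjFree), and the NvSciError returned on helper failure is an arbitrary stand-in:

    #include <stdint.h>

    #include "cudla.h"
    #include "nvscibuf.h"

    /* Helper defined earlier in this sample. */
    cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, NvSciBufAttrList *attrList);

    /* Create + set attributes, reconcile, then allocate the buffer object
     * from the reconciled list; one call per tensor. */
    static NvSciError allocTensorBufObj(NvSciBufModule    bufModule,
                                        uint64_t          size,
                                        NvSciBufAttrList *attrList,
                                        NvSciBufAttrList *reconciled,
                                        NvSciBufAttrList *conflict,
                                        NvSciBufObj      *bufObj)
    {
        if (createAndSetAttrList(bufModule, size, attrList) != cudlaSuccess) {
            return NvSciError_BadParameter; /* stand-in mapping for the sketch */
        }

        NvSciError sciError = NvSciBufAttrListReconcile(attrList, 1, reconciled, conflict);
        if (sciError != NvSciError_Success) {
            return sciError;
        }

        return NvSciBufObjAlloc(*reconciled, bufObj);
    }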
- unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors); + unsigned char **inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *) * numInputTensors); if (inputBuffer == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors); + memset(inputBuffer, 0x00, sizeof(unsigned char *) * numInputTensors); resourceList.inputBuffer = inputBuffer; for (uint32_t ii = 0; ii < numInputTensors; ii++) { - inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size); + inputBuffer[ii] = (unsigned char *)malloc(inputTensorDesc[ii].size); if (inputBuffer[ii] == NULL) { DPRINTF("Error in allocating input memory\n"); cleanUp(&resourceList); @@ -695,17 +671,17 @@ int main(int argc, char** argv) { memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size); } - unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors); + unsigned char **outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *) * numOutputTensors); if (outputBuffer == NULL) { DPRINTF("Error in allocating memory for output buffer array\n"); cleanUp(&resourceList); return 1; } - memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors); + memset(outputBuffer, 0x00, sizeof(unsigned char *) * numOutputTensors); resourceList.outputBuffer = outputBuffer; for (uint32_t ii = 0; ii < numOutputTensors; ii++) { - outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size); + outputBuffer[ii] = (unsigned char *)malloc(outputTensorDesc[ii].size); if (outputBuffer[ii] == NULL) { DPRINTF("Error in allocating output memory\n"); cleanUp(&resourceList); @@ -714,17 +690,18 @@ int main(int argc, char** argv) { memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size); } - unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics); + unsigned char **statisticsOutputBuffer = + (unsigned char **)malloc(sizeof(unsigned char *) * numOutputTaskStatistics); if (statisticsOutputBuffer == NULL) { DPRINTF("Error in allocating memory for output buffer array\n"); cleanUp(&resourceList); return 1; } - memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics); + memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *) * numOutputTaskStatistics); resourceList.statisticsOutputBuffer = statisticsOutputBuffer; for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size); + statisticsOutputBuffer[ii] = (unsigned char *)malloc(outputTaskStatisticsDesc[ii].size); if (outputBuffer[ii] == NULL) { DPRINTF("Error in allocating output memory\n"); cleanUp(&resourceList); @@ -733,17 +710,17 @@ int main(int argc, char** argv) { memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size); } - NvSciBufModule bufModule = NULL; - NvSciBufAttrList *inputAttrList = {NULL}; - NvSciBufAttrList *outputAttrList = {NULL}; - NvSciBufAttrList *statisticsOutputAttrList = {NULL}; - NvSciBufAttrList *reconciledInputAttrList = {NULL}; - NvSciBufAttrList *reconciledOutputAttrList = {NULL}; + NvSciBufModule bufModule = NULL; + NvSciBufAttrList *inputAttrList = {NULL}; + NvSciBufAttrList *outputAttrList = {NULL}; + NvSciBufAttrList *statisticsOutputAttrList = {NULL}; + NvSciBufAttrList *reconciledInputAttrList = {NULL}; + NvSciBufAttrList *reconciledOutputAttrList = {NULL}; 
NvSciBufAttrList *reconciledStatisticsOutputAttrList = {NULL}; - NvSciBufAttrList *inputConflictList = {NULL}; - NvSciBufAttrList *outputConflictList = {NULL}; - NvSciBufAttrList *statisticsOutputConflictList = {NULL}; - NvSciError sciError = NvSciError_Success; + NvSciBufAttrList *inputConflictList = {NULL}; + NvSciBufAttrList *outputConflictList = {NULL}; + NvSciBufAttrList *statisticsOutputConflictList = {NULL}; + NvSciError sciError = NvSciError_Success; sciError = NvSciBufModuleOpen(&bufModule); if (sciError != NvSciError_Success) { @@ -755,98 +732,89 @@ int main(int argc, char** argv) { // creating and setting input attribute list - inputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + inputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numInputTensors); if (inputAttrList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(inputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + memset(inputAttrList, 0x00, sizeof(NvSciBufAttrList) * numInputTensors); resourceList.inputAttrList = inputAttrList; - reconciledInputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + reconciledInputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numInputTensors); if (reconciledInputAttrList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(reconciledInputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + memset(reconciledInputAttrList, 0x00, sizeof(NvSciBufAttrList) * numInputTensors); resourceList.reconciledInputAttrList = reconciledInputAttrList; - inputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + inputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numInputTensors); if (inputConflictList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(inputConflictList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + memset(inputConflictList, 0x00, sizeof(NvSciBufAttrList) * numInputTensors); resourceList.inputConflictList = inputConflictList; for (uint32_t ii = 0; ii < numInputTensors; ii++) { - err = createAndSetAttrList(bufModule, - inputTensorDesc[ii].size, - &inputAttrList[ii]); + err = createAndSetAttrList(bufModule, inputTensorDesc[ii].size, &inputAttrList[ii]); if (err != cudlaSuccess) { DPRINTF("Error in creating NvSciBuf attribute list for input attribute\n"); cleanUp(&resourceList); return 1; } - sciError = NvSciBufAttrListReconcile(&inputAttrList[ii], - 1, - &reconciledInputAttrList[ii], - &inputConflictList[ii]); + sciError = + NvSciBufAttrListReconcile(&inputAttrList[ii], 1, &reconciledInputAttrList[ii], &inputConflictList[ii]); if (sciError != NvSciError_Success) { DPRINTF("Error in reconciling NvSciBuf attribute list for input attribute\n"); cleanUp(&resourceList); return 1; } - } - outputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + outputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numOutputTensors); if (outputAttrList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(outputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + memset(outputAttrList, 0x00, sizeof(NvSciBufAttrList) * numOutputTensors); resourceList.outputAttrList = outputAttrList; - reconciledOutputAttrList = 
(NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + reconciledOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numOutputTensors); if (reconciledOutputAttrList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(reconciledOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + memset(reconciledOutputAttrList, 0x00, sizeof(NvSciBufAttrList) * numOutputTensors); resourceList.reconciledOutputAttrList = reconciledOutputAttrList; - outputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + outputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numOutputTensors); if (outputConflictList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(outputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + memset(outputConflictList, 0x00, sizeof(NvSciBufAttrList) * numOutputTensors); resourceList.outputConflictList = outputConflictList; // creating and setting output attribute list for (uint32_t ii = 0; ii < numOutputTensors; ii++) { - err = createAndSetAttrList(bufModule, - outputTensorDesc[ii].size, - &outputAttrList[ii]); + err = createAndSetAttrList(bufModule, outputTensorDesc[ii].size, &outputAttrList[ii]); if (err != cudlaSuccess) { DPRINTF("Error in creating NvSciBuf attribute list for output attribute\n"); cleanUp(&resourceList); return 1; } - sciError = NvSciBufAttrListReconcile(&outputAttrList[ii], - 1, - &reconciledOutputAttrList[ii], - &outputConflictList[ii]); + sciError = + NvSciBufAttrListReconcile(&outputAttrList[ii], 1, &reconciledOutputAttrList[ii], &outputConflictList[ii]); if (sciError != NvSciError_Success) { DPRINTF("Error in reconciling NvSciBuf attribute list for output attribute\n"); cleanUp(&resourceList); @@ -854,38 +822,36 @@ int main(int argc, char** argv) { } } - statisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + statisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numOutputTaskStatistics); if (statisticsOutputAttrList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(statisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + memset(statisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList) * numOutputTaskStatistics); resourceList.statisticsOutputAttrList = statisticsOutputAttrList; - reconciledStatisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + reconciledStatisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numOutputTaskStatistics); if (reconciledStatisticsOutputAttrList == NULL) { DPRINTF("Error in allocating memory for input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(reconciledStatisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + memset(reconciledStatisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList) * numOutputTaskStatistics); resourceList.reconciledStatisticsOutputAttrList = reconciledStatisticsOutputAttrList; - statisticsOutputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + statisticsOutputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList) * numOutputTaskStatistics); if (statisticsOutputConflictList == NULL) { DPRINTF("Error in allocating memory for
input buffer array\n"); cleanUp(&resourceList); return 1; } - memset(statisticsOutputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + memset(statisticsOutputConflictList, 0x00, sizeof(NvSciBufAttrList) * numOutputTaskStatistics); resourceList.statisticsOutputConflictList = statisticsOutputConflictList; // creating and setting statistics output attribute list for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - err = createAndSetAttrList(bufModule, - outputTaskStatisticsDesc[ii].size, - &statisticsOutputAttrList[ii]); + err = createAndSetAttrList(bufModule, outputTaskStatisticsDesc[ii].size, &statisticsOutputAttrList[ii]); if (err != cudlaSuccess) { DPRINTF("Error in creating NvSciBuf attribute list\n"); cleanUp(&resourceList); @@ -903,12 +869,12 @@ int main(int argc, char** argv) { } } - NvSciBufObj *inputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numInputTensors); - NvSciBufObj *outputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTensors); - NvSciBufObj *statisticsBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTaskStatistics); + NvSciBufObj *inputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj) * numInputTensors); + NvSciBufObj *outputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj) * numOutputTensors); + NvSciBufObj *statisticsBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj) * numOutputTaskStatistics); - resourceList.inputBufObj = inputBufObj; - resourceList.outputBufObj = outputBufObj; + resourceList.inputBufObj = inputBufObj; + resourceList.outputBufObj = outputBufObj; resourceList.statisticsBufObj = statisticsBufObj; for (uint32_t ii = 0; ii < numInputTensors; ii++) { @@ -938,9 +904,9 @@ int main(int argc, char** argv) { } } - uint64_t** inputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors); - uint64_t** outputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors); - uint64_t** statisticsBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics); + uint64_t **inputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t *) * numInputTensors); + uint64_t **outputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t *) * numOutputTensors); + uint64_t **statisticsBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t *) * numOutputTaskStatistics); if ((inputBufObjRegPtr == NULL) || (outputBufObjRegPtr == NULL) || (statisticsBufObjRegPtr == NULL)) { if (inputBufObjRegPtr != NULL) { @@ -962,21 +928,21 @@ int main(int argc, char** argv) { return 1; } - resourceList.inputBufObjRegPtr = inputBufObjRegPtr; - resourceList.outputBufObjRegPtr = outputBufObjRegPtr; + resourceList.inputBufObjRegPtr = inputBufObjRegPtr; + resourceList.outputBufObjRegPtr = outputBufObjRegPtr; resourceList.statisticsBufObjRegPtr = statisticsBufObjRegPtr; - void **inputBufObjBuffer = (void **)malloc(sizeof(void*)*numInputTensors); - void **outputBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTensors); - void **statisticsBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTaskStatistics); + void **inputBufObjBuffer = (void **)malloc(sizeof(void *) * numInputTensors); + void **outputBufObjBuffer = (void **)malloc(sizeof(void *) * numOutputTensors); + void **statisticsBufObjBuffer = (void **)malloc(sizeof(void *) * numOutputTaskStatistics); - cudlaExternalMemoryHandleDesc memDesc = { 0 }; + cudlaExternalMemoryHandleDesc memDesc = {0}; // importing external memory for (uint32_t ii = 0; ii < numInputTensors; ii++) { memset(&memDesc, 0, sizeof(memDesc)); memDesc.extBufObject = (void *)inputBufObj[ii]; 
- memDesc.size = inputTensorDesc[ii].size; - err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr[ii], 0); + memDesc.size = inputTensorDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr[ii], 0); if (err != cudlaSuccess) { DPRINTF("Error in importing external memory = %d\n", err); cleanUp(&resourceList); @@ -995,8 +961,8 @@ int main(int argc, char** argv) { for (uint32_t ii = 0; ii < numOutputTensors; ii++) { memset(&memDesc, 0, sizeof(memDesc)); memDesc.extBufObject = (void *)outputBufObj[ii]; - memDesc.size = outputTensorDesc[ii].size; - err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr[ii], 0); + memDesc.size = outputTensorDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr[ii], 0); if (err != cudlaSuccess) { DPRINTF("Error in importing external memory = %d\n", err); cleanUp(&resourceList); @@ -1015,7 +981,7 @@ int main(int argc, char** argv) { for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { memset(&memDesc, 0, sizeof(memDesc)); memDesc.extBufObject = (void *)statisticsBufObj[ii]; - memDesc.size = outputTaskStatisticsDesc[ii].size; + memDesc.size = outputTaskStatisticsDesc[ii].size; err = cudlaImportExternalMemory(devHandle, &memDesc, &statisticsBufObjRegPtr[ii], CUDLA_TASK_STATISTICS); if (err != cudlaSuccess) { DPRINTF("Error in importing external memory = %d\n", err); @@ -1032,26 +998,26 @@ int main(int argc, char** argv) { memset(statisticsBufObjBuffer[ii], 0, outputTaskStatisticsDesc[ii].size); } - uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0} ; + uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0}; uint32_t index = 0; - for (; index < numOutputTensors ; index++) { + for (; index < numOutputTensors; index++) { outputStatisticsBufferRegisteredPtr[index] = ((outputBufObjRegPtr[index])); } - for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) { + for (uint32_t jj = 0; jj < numOutputTaskStatistics; jj++) { outputStatisticsBufferRegisteredPtr[index++] = ((statisticsBufObjRegPtr[jj])); } - NvSciSyncObj syncObj; - NvSciSyncModule syncModule; - NvSciSyncAttrList syncAttrListObj[2]; + NvSciSyncObj syncObj; + NvSciSyncModule syncModule; + NvSciSyncAttrList syncAttrListObj[2]; NvSciSyncCpuWaitContext nvSciCtx; - NvSciSyncAttrList waiterAttrListObj = NULL; - NvSciSyncAttrList signalerAttrListObj = NULL; - NvSciSyncAttrList nvSciSyncConflictListObj; - NvSciSyncAttrList nvSciSyncReconciledListObj; - + NvSciSyncAttrList waiterAttrListObj = NULL; + NvSciSyncAttrList signalerAttrListObj = NULL; + NvSciSyncAttrList nvSciSyncConflictListObj; + NvSciSyncAttrList nvSciSyncReconciledListObj; + sciError = NvSciSyncModuleOpen(&syncModule); if (sciError != NvSciError_Success) { DPRINTF("Error in initializing NvSciSyncModuleOpen\n"); @@ -1059,7 +1025,7 @@ int main(int argc, char** argv) { return 1; } resourceList.syncModule = syncModule; - + sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx); if (sciError != NvSciError_Success) { DPRINTF("Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n"); @@ -1067,7 +1033,7 @@ int main(int argc, char** argv) { return 1; } resourceList.nvSciCtx = nvSciCtx; - + sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj); if (sciError != NvSciError_Success) { DPRINTF("Error in creating NvSciSync attribute list\n"); @@ -1075,7 +1041,7 @@ int main(int argc, char** argv) { return 1; } 
resourceList.signalerAttrListObj = signalerAttrListObj; - + sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj); if (sciError != NvSciError_Success) { DPRINTF("Error in creating NvSciSync attribute list\n"); @@ -1083,62 +1049,55 @@ int main(int argc, char** argv) { return 1; } resourceList.waiterAttrListObj = waiterAttrListObj; - - err = cudlaGetNvSciSyncAttributes(reinterpret_cast<uint64_t*>(signalerAttrListObj), - CUDLA_NVSCISYNC_ATTR_SIGNAL); + + err = cudlaGetNvSciSyncAttributes(reinterpret_cast<uint64_t *>(signalerAttrListObj), CUDLA_NVSCISYNC_ATTR_SIGNAL); if (err != cudlaSuccess) { DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); cleanUp(&resourceList); return 1; } - + sciError = fillCpuWaiterAttrList(waiterAttrListObj); if (sciError != NvSciError_Success) { DPRINTF("Error in setting NvSciSync attribute list\n"); cleanUp(&resourceList); return 1; } - + syncAttrListObj[0] = signalerAttrListObj; syncAttrListObj[1] = waiterAttrListObj; - sciError = NvSciSyncAttrListReconcile(syncAttrListObj, - 2, - &nvSciSyncReconciledListObj, - &nvSciSyncConflictListObj); + sciError = NvSciSyncAttrListReconcile(syncAttrListObj, 2, &nvSciSyncReconciledListObj, &nvSciSyncConflictListObj); if (sciError != NvSciError_Success) { DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); cleanUp(&resourceList); return 1; } - resourceList.nvSciSyncConflictListObj = nvSciSyncConflictListObj; + resourceList.nvSciSyncConflictListObj = nvSciSyncConflictListObj; resourceList.nvSciSyncReconciledListObj = nvSciSyncReconciledListObj; - - sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj, &syncObj); + + sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj, &syncObj); if (sciError != NvSciError_Success) { DPRINTF("Error in allocating NvSciSync object\n"); cleanUp(&resourceList); return 1; } resourceList.syncObj = syncObj; - + // importing external semaphore - uint64_t* nvSciSyncObjRegPtr = NULL; - cudlaExternalSemaphoreHandleDesc semaMemDesc = { 0 }; + uint64_t *nvSciSyncObjRegPtr = NULL; + cudlaExternalSemaphoreHandleDesc semaMemDesc = {0}; memset(&semaMemDesc, 0, sizeof(semaMemDesc)); semaMemDesc.extSyncObject = syncObj; - err = cudlaImportExternalSemaphore(devHandle, - &semaMemDesc, - &nvSciSyncObjRegPtr, - 0); + err = cudlaImportExternalSemaphore(devHandle, &semaMemDesc, &nvSciSyncObjRegPtr, 0); if (err != cudlaSuccess) { DPRINTF("Error in importing external semaphore = %d\n", err); cleanUp(&resourceList); return 1; } DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); - + // Signal Events - cudlaSignalEvents* signalEvents; + cudlaSignalEvents *signalEvents; signalEvents = (cudlaSignalEvents *)malloc(sizeof(cudlaSignalEvents)); if (signalEvents == NULL) { DPRINTF("Error in allocating signal events\n"); @@ -1147,48 +1106,47 @@ int main(int argc, char** argv) { } signalEvents->numEvents = 1; - uint64_t** devPtrs = (uint64_t **)malloc(signalEvents->numEvents * - sizeof(uint64_t *)); + uint64_t **devPtrs = (uint64_t **)malloc(signalEvents->numEvents * sizeof(uint64_t *)); if (devPtrs == NULL) { DPRINTF("Error in allocating output pointer's array of registered objects\n"); cleanUp(&resourceList); return 1; } - devPtrs[0] = nvSciSyncObjRegPtr; + devPtrs[0] = nvSciSyncObjRegPtr; signalEvents->devPtrs = devPtrs; - resourceList.devPtrs = devPtrs; - - signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents * - sizeof(CudlaFence)); + resourceList.devPtrs = devPtrs; + + signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents * sizeof(CudlaFence)); if (signalEvents->eofFences == NULL)
{ DPRINTF("Error in allocating eofFence array\n"); cleanUp(&resourceList); return 1; } - NvSciSyncFence eofFence = NvSciSyncFenceInitializer; + NvSciSyncFence eofFence = NvSciSyncFenceInitializer; signalEvents->eofFences[0].fence = &eofFence; - signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE; - resourceList.signalEvents = signalEvents; - resourceList.eofFence = eofFence; + signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE; + resourceList.signalEvents = signalEvents; + resourceList.eofFence = eofFence; // Enqueue a cuDLA task. cudlaTask task; task.moduleHandle = moduleHandle; - task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr; + task.outputTensor = (uint64_t *const *)&outputStatisticsBufferRegisteredPtr; - if(statSupport == 1) { + if (statSupport == 1) { task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics); DPRINTF("Layerwise profiling is requested \n"); - } else { + } + else { task.numOutputTensors = numOutputTensors; DPRINTF("Layerwise profiling is not requested \n"); } task.numInputTensors = numInputTensors; - task.inputTensor = inputBufObjRegPtr; - task.waitEvents = NULL; - task.signalEvents = signalEvents; + task.inputTensor = inputBufObjRegPtr; + task.waitEvents = NULL; + task.signalEvents = signalEvents; err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0); if (err != cudlaSuccess) { @@ -1199,8 +1157,7 @@ int main(int argc, char** argv) { DPRINTF("SUBMIT IS DONE !!!\n"); // Wait for operations to finish and bring output buffer to CPU. - sciError = NvSciSyncFenceWait(reinterpret_cast(signalEvents->eofFences[0].fence), - nvSciCtx, -1); + sciError = NvSciSyncFenceWait(reinterpret_cast(signalEvents->eofFences[0].fence), nvSciCtx, -1); if (sciError != NvSciError_Success) { DPRINTF("Error in waiting on NvSciSyncFence\n"); cleanUp(&resourceList); @@ -1212,34 +1169,35 @@ int main(int argc, char** argv) { memcpy(outputBuffer[ii], outputBufObjBuffer[ii], outputTensorDesc[ii].size); } - if(statSupport == 1) { + if (statSupport == 1) { for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { memcpy(statisticsOutputBuffer[ii], statisticsBufObjBuffer[ii], outputTaskStatisticsDesc[ii].size); } - const cudlaExternalEtbl* etbl = NULL; - if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) { + const cudlaExternalEtbl *etbl = NULL; + if (cudlaGetExternalExportTable(&etbl, 0) != cudlaSuccess) { DPRINTF("Error in getting export table\n"); cleanUp(&resourceList); return 1; } - void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + void **csv = (void **)malloc(sizeof(void *) * numOutputTaskStatistics); if (csv == NULL) { DPRINTF("Error in allocating memory for csv stream\n"); cleanUp(&resourceList); return 1; } - memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics); + memset(csv, 0x00, sizeof(void *) * numOutputTaskStatistics); resourceList.csv = csv; for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { cudlaTranslateCsvAttribute csvAttribute; - uint64_t csvStreamLength = 0; + uint64_t csvStreamLength = 0; - err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute); - csv[ii] = (void* )malloc(csvAttribute.csvStreamLength); + err = etbl->etiTranslateStats( + devHandle, statisticsOutputBuffer[ii], dlaFreqInMHz, ii, CUDLA_GET_CSV_LENGTH, &csvAttribute); + csv[ii] = (void *)malloc(csvAttribute.csvStreamLength); csvStreamLength = csvAttribute.csvStreamLength; - DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength); + DPRINTF("size for 
statistics buffer %u is %lu \n", ii, csvStreamLength); if (csv[ii] == NULL) { DPRINTF("Error in allocating memory for csv stream\n"); @@ -1249,7 +1207,8 @@ int main(int argc, char** argv) { memset(csv[ii], 0x00, csvAttribute.csvStreamLength); csvAttribute.csvStreamStats = csv[ii]; - err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute); + err = etbl->etiTranslateStats( + devHandle, statisticsOutputBuffer[ii], dlaFreqInMHz, ii, CUDLA_GET_CSV_STATS, &csvAttribute); if (err != cudlaSuccess) { DPRINTF("Error in translating stats\n"); cleanUp(&resourceList); @@ -1257,7 +1216,7 @@ int main(int argc, char** argv) { } if (argc == 5) { - sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix); + sprintf(filename, "%s%u%s", argv[4], (ii + 1), suffix); fp = fopen(filename, "w+"); if (fp == NULL) { DPRINTF("Cannot open file %s\n", filename); @@ -1265,24 +1224,24 @@ int main(int argc, char** argv) { return 1; } - uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp); - if(ret_val != csvStreamLength) { + uint32_t ret_val = fwrite(csv[ii], sizeof(char), csvStreamLength, fp); + if (ret_val != csvStreamLength) { DPRINTF("number of elements written to file is %u \n", ret_val); cleanUp(&resourceList); return 1; } fclose(fp); - } else { - DPRINTF("%s \n",(char *)csv[ii]); + } + else { + DPRINTF("%s \n", (char *)csv[ii]); } } } // unregister the CUDA-allocated buffers. for (uint32_t ii = 0; ii < numInputTensors; ii++) { - err = cudlaMemUnregister(devHandle, - (inputBufObjRegPtr[ii])); + err = cudlaMemUnregister(devHandle, (inputBufObjRegPtr[ii])); if (err != cudlaSuccess) { DPRINTF("Error in registering input memory = %d\n", err); cleanUp(&resourceList); @@ -1291,8 +1250,7 @@ int main(int argc, char** argv) { } for (uint32_t ii = 0; ii < numOutputTensors; ii++) { - err = cudlaMemUnregister(devHandle, - (outputBufObjRegPtr[ii])); + err = cudlaMemUnregister(devHandle, (outputBufObjRegPtr[ii])); if (err != cudlaSuccess) { DPRINTF("Error in registering output memory = %d\n", err); cleanUp(&resourceList); @@ -1301,8 +1259,7 @@ int main(int argc, char** argv) { } for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { - err = cudlaMemUnregister(devHandle, - (statisticsBufObjRegPtr[ii])); + err = cudlaMemUnregister(devHandle, (statisticsBufObjRegPtr[ii])); if (err != cudlaSuccess) { DPRINTF("Error in registering output memory = %d\n", err); cleanUp(&resourceList); @@ -1325,7 +1282,8 @@ int main(int argc, char** argv) { DPRINTF("Error in cudlaModuleUnload = %d\n", err); cleanUp(&resourceList); return 1; - } else { + } + else { DPRINTF("Successfully unloaded module\n"); } diff --git a/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/README.md b/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/README.md index 099ace0b..2c72ee04 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/README.md +++ b/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/README.md @@ -31,4 +31,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/main.cpp b/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/main.cpp index fe346bf6..c1380eca 100644 --- a/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/main.cpp +++ b/Samples/8_Platform_Specific/Tegra/cuDLAStandaloneMode/main.cpp @@ -25,1029 +25,991 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cudla.h" -#include "nvscierror.h" -#include "nvscibuf.h" -#include "nvscisync.h" - #include #include -#include #include #include +#include #include +#include "cudla.h" +#include "nvscibuf.h" +#include "nvscierror.h" +#include "nvscisync.h" + #define DPRINTF(...) printf(__VA_ARGS__) -static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { - DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); - DPRINTF("\tsize: %lu\n", tensorDesc->size); +static void printTensorDesc(cudlaModuleTensorDescriptor *tensorDesc) +{ + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); - DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, - tensorDesc->h, tensorDesc->w); + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, tensorDesc->h, tensorDesc->w); - DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); - DPRINTF("\tdata type: %d\n", tensorDesc->dataType); - DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); - DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); - DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); - DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); - DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); - DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); - DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); } -static int initializeInputBuffers(char* filePath, - cudlaModuleTensorDescriptor* tensorDesc, - unsigned char* buf) { - // Read the file in filePath and fill up 'buf' according to format - // specified by the user. +static int initializeInputBuffers(char *filePath, cudlaModuleTensorDescriptor *tensorDesc, unsigned char *buf) +{ + // Read the file in filePath and fill up 'buf' according to format + // specified by the user. 
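+ // NOTE: the sample ships this as a stub; it returns 0 without touching 'buf', so the network below runs on the zero-initialized input buffer unless a reader for the PGM image named on the command line is added here.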
- return 0; + return 0; } -typedef struct { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - unsigned char* loadableData; - unsigned char* inputBuffer; - unsigned char* outputBuffer; - NvSciBufObj inputBufObj; - NvSciBufObj outputBufObj; - NvSciBufModule bufModule; - NvSciBufAttrList inputAttrList; - NvSciBufAttrList reconciledInputAttrList; - NvSciBufAttrList inputConflictList; - NvSciBufAttrList outputAttrList; - NvSciBufAttrList reconciledOutputAttrList; - NvSciBufAttrList outputConflictList; - NvSciSyncObj syncObj1; - NvSciSyncObj syncObj2; - NvSciSyncModule syncModule; - NvSciSyncFence preFence; - NvSciSyncFence eofFence; - NvSciSyncCpuWaitContext nvSciCtx; - NvSciSyncAttrList waiterAttrListObj1; - NvSciSyncAttrList signalerAttrListObj1; - NvSciSyncAttrList waiterAttrListObj2; - NvSciSyncAttrList signalerAttrListObj2; - NvSciSyncAttrList nvSciSyncConflictListObj1; - NvSciSyncAttrList nvSciSyncReconciledListObj1; - NvSciSyncAttrList nvSciSyncConflictListObj2; - NvSciSyncAttrList nvSciSyncReconciledListObj2; - cudlaModuleTensorDescriptor* inputTensorDesc; - cudlaModuleTensorDescriptor* outputTensorDesc; - CudlaFence* preFences; - uint64_t** devPtrs; - cudlaWaitEvents* waitEvents; - cudlaSignalEvents* signalEvents; +typedef struct +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char *loadableData; + unsigned char *inputBuffer; + unsigned char *outputBuffer; + NvSciBufObj inputBufObj; + NvSciBufObj outputBufObj; + NvSciBufModule bufModule; + NvSciBufAttrList inputAttrList; + NvSciBufAttrList reconciledInputAttrList; + NvSciBufAttrList inputConflictList; + NvSciBufAttrList outputAttrList; + NvSciBufAttrList reconciledOutputAttrList; + NvSciBufAttrList outputConflictList; + NvSciSyncObj syncObj1; + NvSciSyncObj syncObj2; + NvSciSyncModule syncModule; + NvSciSyncFence preFence; + NvSciSyncFence eofFence; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj1; + NvSciSyncAttrList signalerAttrListObj1; + NvSciSyncAttrList waiterAttrListObj2; + NvSciSyncAttrList signalerAttrListObj2; + NvSciSyncAttrList nvSciSyncConflictListObj1; + NvSciSyncAttrList nvSciSyncReconciledListObj1; + NvSciSyncAttrList nvSciSyncConflictListObj2; + NvSciSyncAttrList nvSciSyncReconciledListObj2; + cudlaModuleTensorDescriptor *inputTensorDesc; + cudlaModuleTensorDescriptor *outputTensorDesc; + CudlaFence *preFences; + uint64_t **devPtrs; + cudlaWaitEvents *waitEvents; + cudlaSignalEvents *signalEvents; } ResourceList; -void cleanUp(ResourceList* resourceList); +void cleanUp(ResourceList *resourceList); -void cleanUp(ResourceList* resourceList) { - if (resourceList->inputBufObj != NULL) { - NvSciBufObjFree(resourceList->inputBufObj); - resourceList->inputBufObj = NULL; - } - - if (resourceList->outputBufObj != NULL) { - NvSciBufObjFree(resourceList->outputBufObj); - resourceList->outputBufObj = NULL; - } - - if (resourceList->reconciledInputAttrList != NULL) { - NvSciBufAttrListFree(resourceList->reconciledInputAttrList); - resourceList->reconciledInputAttrList = NULL; - } - - if (resourceList->inputConflictList != NULL) { - NvSciBufAttrListFree(resourceList->inputConflictList); - resourceList->inputConflictList = NULL; - } - - if (resourceList->inputAttrList != NULL) { - NvSciBufAttrListFree(resourceList->inputAttrList); - resourceList->inputAttrList = NULL; - } - - if (resourceList->reconciledOutputAttrList != NULL) { - NvSciBufAttrListFree(resourceList->reconciledOutputAttrList); - resourceList->reconciledOutputAttrList = NULL; - } - - if 
(resourceList->outputConflictList != NULL) { - NvSciBufAttrListFree(resourceList->outputConflictList); - resourceList->outputConflictList = NULL; - } - - if (resourceList->outputAttrList != NULL) { - NvSciBufAttrListFree(resourceList->outputAttrList); - resourceList->outputAttrList = NULL; - } - - if (resourceList->bufModule != NULL) { - NvSciBufModuleClose(resourceList->bufModule); - resourceList->bufModule = NULL; - } - - NvSciSyncFenceClear(&(resourceList->preFence)); - NvSciSyncFenceClear(&(resourceList->eofFence)); - - if (resourceList->syncObj1 != NULL) { - NvSciSyncObjFree(resourceList->syncObj1); - resourceList->syncObj1 = NULL; - } - - if (resourceList->syncObj2 != NULL) { - NvSciSyncObjFree(resourceList->syncObj2); - resourceList->syncObj2 = NULL; - } - - if (resourceList->nvSciSyncConflictListObj1 != NULL) { - NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj1); - resourceList->nvSciSyncConflictListObj1 = NULL; - } - - if (resourceList->nvSciSyncReconciledListObj1 != NULL) { - NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj1); - resourceList->nvSciSyncReconciledListObj1 = NULL; - } - - if (resourceList->nvSciSyncConflictListObj2 != NULL) { - NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj2); - resourceList->nvSciSyncConflictListObj2 = NULL; - } - - if (resourceList->nvSciSyncReconciledListObj2 != NULL) { - NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj2); - resourceList->nvSciSyncReconciledListObj2 = NULL; - } - - if (resourceList->signalerAttrListObj1 != NULL) { - NvSciSyncAttrListFree(resourceList->signalerAttrListObj1); - resourceList->signalerAttrListObj1 = NULL; - } - - if (resourceList->waiterAttrListObj1 != NULL) { - NvSciSyncAttrListFree(resourceList->waiterAttrListObj1); - resourceList->waiterAttrListObj1 = NULL; - } - - if (resourceList->signalerAttrListObj2 != NULL) { - NvSciSyncAttrListFree(resourceList->signalerAttrListObj2); - resourceList->signalerAttrListObj2 = NULL; - } - - if (resourceList->waiterAttrListObj2 != NULL) { - NvSciSyncAttrListFree(resourceList->waiterAttrListObj2); - resourceList->waiterAttrListObj2 = NULL; - } - - if (resourceList->nvSciCtx != NULL) { - NvSciSyncCpuWaitContextFree(resourceList->nvSciCtx); - resourceList->nvSciCtx = NULL; - } - - if (resourceList->syncModule != NULL) { - NvSciSyncModuleClose(resourceList->syncModule); - resourceList->syncModule = NULL; - } - - if (resourceList->waitEvents != NULL) { - free(resourceList->waitEvents); - resourceList->waitEvents = NULL; - } - - if (resourceList->preFences != NULL) { - free(resourceList->preFences); - resourceList->preFences = NULL; - } - - if (resourceList->signalEvents != NULL) { - if (resourceList->signalEvents->eofFences != NULL) { - free(resourceList->signalEvents->eofFences); - resourceList->signalEvents->eofFences = NULL; +void cleanUp(ResourceList *resourceList) +{ + if (resourceList->inputBufObj != NULL) { + NvSciBufObjFree(resourceList->inputBufObj); + resourceList->inputBufObj = NULL; } - free(resourceList->signalEvents); - resourceList->signalEvents = NULL; - } + if (resourceList->outputBufObj != NULL) { + NvSciBufObjFree(resourceList->outputBufObj); + resourceList->outputBufObj = NULL; + } - if (resourceList->devPtrs != NULL) { - free(resourceList->devPtrs); - resourceList->devPtrs = NULL; - } + if (resourceList->reconciledInputAttrList != NULL) { + NvSciBufAttrListFree(resourceList->reconciledInputAttrList); + resourceList->reconciledInputAttrList = NULL; + } - if (resourceList->inputTensorDesc != NULL) { - 
free(resourceList->inputTensorDesc); - resourceList->inputTensorDesc = NULL; - } - if (resourceList->outputTensorDesc != NULL) { - free(resourceList->outputTensorDesc); - resourceList->outputTensorDesc = NULL; - } + if (resourceList->inputConflictList != NULL) { + NvSciBufAttrListFree(resourceList->inputConflictList); + resourceList->inputConflictList = NULL; + } - if (resourceList->loadableData != NULL) { - free(resourceList->loadableData); - resourceList->loadableData = NULL; - } + if (resourceList->inputAttrList != NULL) { + NvSciBufAttrListFree(resourceList->inputAttrList); + resourceList->inputAttrList = NULL; + } - if (resourceList->moduleHandle != NULL) { - cudlaModuleUnload(resourceList->moduleHandle, 0); - resourceList->moduleHandle = NULL; - } + if (resourceList->reconciledOutputAttrList != NULL) { + NvSciBufAttrListFree(resourceList->reconciledOutputAttrList); + resourceList->reconciledOutputAttrList = NULL; + } - if (resourceList->devHandle != NULL) { - cudlaDestroyDevice(resourceList->devHandle); - resourceList->devHandle = NULL; - } + if (resourceList->outputConflictList != NULL) { + NvSciBufAttrListFree(resourceList->outputConflictList); + resourceList->outputConflictList = NULL; + } - if (resourceList->inputBuffer != NULL) { - free(resourceList->inputBuffer); - resourceList->inputBuffer = NULL; - } - if (resourceList->outputBuffer != NULL) { - free(resourceList->outputBuffer); - resourceList->outputBuffer = NULL; - } + if (resourceList->outputAttrList != NULL) { + NvSciBufAttrListFree(resourceList->outputAttrList); + resourceList->outputAttrList = NULL; + } + + if (resourceList->bufModule != NULL) { + NvSciBufModuleClose(resourceList->bufModule); + resourceList->bufModule = NULL; + } + + NvSciSyncFenceClear(&(resourceList->preFence)); + NvSciSyncFenceClear(&(resourceList->eofFence)); + + if (resourceList->syncObj1 != NULL) { + NvSciSyncObjFree(resourceList->syncObj1); + resourceList->syncObj1 = NULL; + } + + if (resourceList->syncObj2 != NULL) { + NvSciSyncObjFree(resourceList->syncObj2); + resourceList->syncObj2 = NULL; + } + + if (resourceList->nvSciSyncConflictListObj1 != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj1); + resourceList->nvSciSyncConflictListObj1 = NULL; + } + + if (resourceList->nvSciSyncReconciledListObj1 != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj1); + resourceList->nvSciSyncReconciledListObj1 = NULL; + } + + if (resourceList->nvSciSyncConflictListObj2 != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj2); + resourceList->nvSciSyncConflictListObj2 = NULL; + } + + if (resourceList->nvSciSyncReconciledListObj2 != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj2); + resourceList->nvSciSyncReconciledListObj2 = NULL; + } + + if (resourceList->signalerAttrListObj1 != NULL) { + NvSciSyncAttrListFree(resourceList->signalerAttrListObj1); + resourceList->signalerAttrListObj1 = NULL; + } + + if (resourceList->waiterAttrListObj1 != NULL) { + NvSciSyncAttrListFree(resourceList->waiterAttrListObj1); + resourceList->waiterAttrListObj1 = NULL; + } + + if (resourceList->signalerAttrListObj2 != NULL) { + NvSciSyncAttrListFree(resourceList->signalerAttrListObj2); + resourceList->signalerAttrListObj2 = NULL; + } + + if (resourceList->waiterAttrListObj2 != NULL) { + NvSciSyncAttrListFree(resourceList->waiterAttrListObj2); + resourceList->waiterAttrListObj2 = NULL; + } + + if (resourceList->nvSciCtx != NULL) { + NvSciSyncCpuWaitContextFree(resourceList->nvSciCtx); + 
resourceList->nvSciCtx = NULL; + } + + if (resourceList->syncModule != NULL) { + NvSciSyncModuleClose(resourceList->syncModule); + resourceList->syncModule = NULL; + } + + if (resourceList->waitEvents != NULL) { + free(resourceList->waitEvents); + resourceList->waitEvents = NULL; + } + + if (resourceList->preFences != NULL) { + free(resourceList->preFences); + resourceList->preFences = NULL; + } + + if (resourceList->signalEvents != NULL) { + if (resourceList->signalEvents->eofFences != NULL) { + free(resourceList->signalEvents->eofFences); + resourceList->signalEvents->eofFences = NULL; + } + + free(resourceList->signalEvents); + resourceList->signalEvents = NULL; + } + + if (resourceList->devPtrs != NULL) { + free(resourceList->devPtrs); + resourceList->devPtrs = NULL; + } + + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } + + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } + + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } + + if (resourceList->devHandle != NULL) { + cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } + + if (resourceList->inputBuffer != NULL) { + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + if (resourceList->outputBuffer != NULL) { + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } } -cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, - NvSciBufAttrList* attrList); +cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, NvSciBufAttrList *attrList); -cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, - NvSciBufAttrList* attrList) { - cudlaStatus status = cudlaSuccess; - NvSciError sciStatus = NvSciError_Success; +cudlaStatus createAndSetAttrList(NvSciBufModule module, uint64_t bufSize, NvSciBufAttrList *attrList) +{ + cudlaStatus status = cudlaSuccess; + NvSciError sciStatus = NvSciError_Success; + + sciStatus = NvSciBufAttrListCreate(module, attrList); + if (sciStatus != NvSciError_Success) { + status = cudlaErrorNvSci; + DPRINTF("Error in creating NvSciBuf attribute list\n"); + return status; + } + + bool needCpuAccess = true; + NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; + uint32_t dimcount = 1; + uint64_t sizes[] = {bufSize}; + uint32_t alignment[] = {1}; + uint32_t dataType = NvSciDataType_Int8; + NvSciBufType type = NvSciBufType_Tensor; + uint64_t baseAddrAlign = 512; + + NvSciBufAttrKeyValuePair setAttrs[] = { + {.key = NvSciBufGeneralAttrKey_Types, .value = &type, .len = sizeof(type)}, + {.key = NvSciBufTensorAttrKey_DataType, .value = &dataType, .len = sizeof(dataType)}, + {.key = NvSciBufTensorAttrKey_NumDims, .value = &dimcount, .len = sizeof(dimcount)}, + {.key = NvSciBufTensorAttrKey_SizePerDim, .value = &sizes, .len = sizeof(sizes)}, + {.key = NvSciBufTensorAttrKey_AlignmentPerDim, .value = &alignment, .len = sizeof(alignment)}, + {.key = NvSciBufTensorAttrKey_BaseAddrAlign, .value = &baseAddrAlign, .len = sizeof(baseAddrAlign)}, + {.key = NvSciBufGeneralAttrKey_RequiredPerm, .value = &perm, .len = sizeof(perm)}, + {.key = NvSciBufGeneralAttrKey_NeedCpuAccess, .value = &needCpuAccess, .len = sizeof(needCpuAccess)}}; + 
size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair); + + sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length); + if (sciStatus != NvSciError_Success) { + status = cudlaErrorNvSci; + DPRINTF("Error in setting NvSciBuf attribute list\n"); + return status; + } - sciStatus = NvSciBufAttrListCreate(module, attrList); - if (sciStatus != NvSciError_Success) { - status = cudlaErrorNvSci; - DPRINTF("Error in creating NvSciBuf attribute list\n"); return status; - } - - bool needCpuAccess = true; - NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - uint32_t dimcount = 1; - uint64_t sizes[] = {bufSize}; - uint32_t alignment[] = {1}; - uint32_t dataType = NvSciDataType_Int8; - NvSciBufType type = NvSciBufType_Tensor; - uint64_t baseAddrAlign = 512; - - NvSciBufAttrKeyValuePair setAttrs[] = { - {.key = NvSciBufGeneralAttrKey_Types, - .value = &type, - .len = sizeof(type)}, - {.key = NvSciBufTensorAttrKey_DataType, - .value = &dataType, - .len = sizeof(dataType)}, - {.key = NvSciBufTensorAttrKey_NumDims, - .value = &dimcount, - .len = sizeof(dimcount)}, - {.key = NvSciBufTensorAttrKey_SizePerDim, - .value = &sizes, - .len = sizeof(sizes)}, - {.key = NvSciBufTensorAttrKey_AlignmentPerDim, - .value = &alignment, - .len = sizeof(alignment)}, - {.key = NvSciBufTensorAttrKey_BaseAddrAlign, - .value = &baseAddrAlign, - .len = sizeof(baseAddrAlign)}, - {.key = NvSciBufGeneralAttrKey_RequiredPerm, - .value = &perm, - .len = sizeof(perm)}, - {.key = NvSciBufGeneralAttrKey_NeedCpuAccess, - .value = &needCpuAccess, - .len = sizeof(needCpuAccess)}}; - size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair); - - sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length); - if (sciStatus != NvSciError_Success) { - status = cudlaErrorNvSci; - DPRINTF("Error in setting NvSciBuf attribute list\n"); - return status; - } - - return status; } NvSciError fillCpuSignalerAttrList(NvSciSyncAttrList list); -NvSciError fillCpuSignalerAttrList(NvSciSyncAttrList list) { - bool cpuSignaler = true; - NvSciSyncAttrKeyValuePair keyValue[2]; - memset(keyValue, 0, sizeof(keyValue)); - keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; - keyValue[0].value = (void*)&cpuSignaler; - keyValue[0].len = sizeof(cpuSignaler); +NvSciError fillCpuSignalerAttrList(NvSciSyncAttrList list) +{ + bool cpuSignaler = true; + NvSciSyncAttrKeyValuePair keyValue[2]; + memset(keyValue, 0, sizeof(keyValue)); + keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; + keyValue[0].value = (void *)&cpuSignaler; + keyValue[0].len = sizeof(cpuSignaler); - NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_SignalOnly; - keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; - keyValue[1].value = (void*)&cpuPerm; - keyValue[1].len = sizeof(cpuPerm); + NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_SignalOnly; + keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; + keyValue[1].value = (void *)&cpuPerm; + keyValue[1].len = sizeof(cpuPerm); - return NvSciSyncAttrListSetAttrs(list, keyValue, 2); + return NvSciSyncAttrListSetAttrs(list, keyValue, 2); } NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list); -NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) { - bool cpuWaiter = true; - NvSciSyncAttrKeyValuePair keyValue[2]; - memset(keyValue, 0, sizeof(keyValue)); - keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; - keyValue[0].value = (void*)&cpuWaiter; - keyValue[0].len = sizeof(cpuWaiter); +NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) +{ + bool cpuWaiter = true; + 
NvSciSyncAttrKeyValuePair keyValue[2]; + memset(keyValue, 0, sizeof(keyValue)); + keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; + keyValue[0].value = (void *)&cpuWaiter; + keyValue[0].len = sizeof(cpuWaiter); - NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; - keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; - keyValue[1].value = (void*)&cpuPerm; - keyValue[1].len = sizeof(cpuPerm); + NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; + keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; + keyValue[1].value = (void *)&cpuPerm; + keyValue[1].len = sizeof(cpuPerm); - return NvSciSyncAttrListSetAttrs(list, keyValue, 2); + return NvSciSyncAttrListSetAttrs(list, keyValue, 2); } -int main(int argc, char** argv) { - cudlaDevHandle devHandle; - cudlaModule moduleHandle; - cudlaStatus err; - FILE* fp = NULL; - struct stat st; - size_t file_size; - size_t actually_read = 0; - unsigned char* loadableData = NULL; +int main(int argc, char **argv) +{ + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + cudlaStatus err; + FILE *fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; - ResourceList resourceList; + ResourceList resourceList; - memset(&resourceList, 0x00, sizeof(ResourceList)); + memset(&resourceList, 0x00, sizeof(ResourceList)); - if (argc != 3) { - DPRINTF("Usage : ./cuDLAStandaloneMode <loadable> <imageFile>\n"); - return 1; - } + if (argc != 3) { + DPRINTF("Usage : ./cuDLAStandaloneMode <loadable> <imageFile>\n"); + return 1; + } - // Read loadable into buffer. - fp = fopen(argv[1], "rb"); - if (fp == NULL) { - DPRINTF("Cannot open file %s\n", argv[1]); - return 1; - } + // Read loadable into buffer. + fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } - if (stat(argv[1], &st) != 0) { - DPRINTF("Cannot stat file\n"); - return 1; - } + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } - file_size = st.st_size; - DPRINTF("The file size = %ld\n", file_size); + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); - loadableData = (unsigned char*)malloc(file_size); - if (loadableData == NULL) { - DPRINTF("Cannot Allocate memory for loadable\n"); - return 1; - } + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } - actually_read = fread(loadableData, 1, file_size, fp); - if (actually_read != file_size) { + actually_read = fread(loadableData, 1, file_size, fp); + if (actually_read != file_size) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + err = cudlaCreateDevice(0, &devHandle, CUDLA_STANDALONE); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + else { + DPRINTF("Successfully loaded module\n"); + } + + resourceList.moduleHandle = moduleHandle; + // Get tensor attributes.
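+ // cudlaModuleGetAttributes() is queried in two passes: first for the tensor counts (CUDLA_NUM_INPUT_TENSORS, CUDLA_NUM_OUTPUT_TENSORS), then, once the descriptor arrays have been sized from those counts, for the descriptors themselves (CUDLA_INPUT_TENSOR_DESCRIPTORS, CUDLA_OUTPUT_TENSOR_DESCRIPTORS).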
+ uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + cudlaModuleTensorDescriptor *inputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numInputTensors); + cudlaModuleTensorDescriptor *outputTensorDesc = + (cudlaModuleTensorDescriptor *)malloc(sizeof(cudlaModuleTensorDescriptor) * numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if (inputTensorDesc != NULL) { + free(inputTensorDesc); + inputTensorDesc = NULL; + } + + if (outputTensorDesc != NULL) { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + // Setup the input and output buffers which will be used as an input to CUDA. + unsigned char *inputBuffer = (unsigned char *)malloc(inputTensorDesc[0].size); + if (inputBuffer == NULL) { + DPRINTF("Error in allocating input memory\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBuffer = inputBuffer; + + unsigned char *outputBuffer = (unsigned char *)malloc(outputTensorDesc[0].size); + if (outputBuffer == NULL) { + DPRINTF("Error in allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.outputBuffer = outputBuffer; + + memset(inputBuffer, 0x00, inputTensorDesc[0].size); + memset(outputBuffer, 0x00, outputTensorDesc[0].size); + + // Fill up the buffers with data. 
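+ // argv[2] names the PGM image that the initializeInputBuffers() stub above is expected to decode into inputBuffer.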
+ if (initializeInputBuffers(argv[2], inputTensorDesc, inputBuffer) != 0) { + DPRINTF("Error in initializing input buffer from PGM image\n"); + cleanUp(&resourceList); + return 1; + } + + NvSciBufModule bufModule = NULL; + NvSciBufAttrList inputAttrList = NULL; + NvSciBufAttrList outputAttrList = NULL; + NvSciBufAttrList reconciledInputAttrList = NULL; + NvSciBufAttrList reconciledOutputAttrList = NULL; + NvSciBufAttrList inputConflictList = NULL; + NvSciBufAttrList outputConflictList = NULL; + NvSciError sciError = NvSciError_Success; + + sciError = NvSciBufModuleOpen(&bufModule); + if (sciError != NvSciError_Success) { + DPRINTF("Error in initializing NvSciBufModule\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.bufModule = bufModule; + + // creating and setting input attribute list + err = createAndSetAttrList(bufModule, inputTensorDesc[0].size, &inputAttrList); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.inputAttrList = inputAttrList; + + sciError = NvSciBufAttrListReconcile(&inputAttrList, 1, &reconciledInputAttrList, &inputConflictList); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.reconciledInputAttrList = reconciledInputAttrList; + resourceList.inputConflictList = inputConflictList; + + // creating and setting output attribute list + err = createAndSetAttrList(bufModule, outputTensorDesc[0].size, &outputAttrList); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.outputAttrList = outputAttrList; + + sciError = NvSciBufAttrListReconcile(&outputAttrList, 1, &reconciledOutputAttrList, &outputConflictList); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.reconciledOutputAttrList = reconciledOutputAttrList; + resourceList.outputConflictList = outputConflictList; + + NvSciBufObj inputBufObj, outputBufObj; + sciError = NvSciBufObjAlloc(reconciledInputAttrList, &inputBufObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBufObj = inputBufObj; + + sciError = NvSciBufObjAlloc(reconciledOutputAttrList, &outputBufObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.outputBufObj = outputBufObj; + + uint64_t *inputBufObjRegPtr = NULL; + uint64_t *outputBufObjRegPtr = NULL; + void *inputBufObjBuffer; + void *outputBufObjBuffer; + + // importing external memory + cudlaExternalMemoryHandleDesc memDesc = {0}; + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)inputBufObj; + memDesc.size = inputTensorDesc[0].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(inputBufObj, &inputBufObjBuffer); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memcpy(inputBufObjBuffer, inputBuffer, inputTensorDesc[0].size); + + memset(&memDesc, 0, sizeof(memDesc)); + 
memDesc.extBufObject = (void *)outputBufObj; + memDesc.size = outputTensorDesc[0].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(outputBufObj, &outputBufObjBuffer); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBufObjBuffer, 0, outputTensorDesc[0].size); + + NvSciSyncObj syncObj1, syncObj2; + NvSciSyncModule syncModule; + NvSciSyncAttrList syncAttrListObj1[2]; + NvSciSyncAttrList syncAttrListObj2[2]; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj1 = NULL; + NvSciSyncAttrList signalerAttrListObj1 = NULL; + NvSciSyncAttrList waiterAttrListObj2 = NULL; + NvSciSyncAttrList signalerAttrListObj2 = NULL; + NvSciSyncAttrList nvSciSyncConflictListObj1; + NvSciSyncAttrList nvSciSyncReconciledListObj1; + NvSciSyncAttrList nvSciSyncConflictListObj2; + NvSciSyncAttrList nvSciSyncReconciledListObj2; + + sciError = NvSciSyncModuleOpen(&syncModule); + if (sciError != NvSciError_Success) { + DPRINTF("Error in initializing NvSciSyncModuleOpen\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncModule = syncModule; + + sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.signalerAttrListObj1 = signalerAttrListObj1; + + sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.waiterAttrListObj1 = waiterAttrListObj1; + + err = cudlaGetNvSciSyncAttributes(reinterpret_cast<uint64_t *>(waiterAttrListObj1), CUDLA_NVSCISYNC_ATTR_WAIT); + if (err != cudlaSuccess) { + DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = fillCpuSignalerAttrList(signalerAttrListObj1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in setting NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + syncAttrListObj1[0] = signalerAttrListObj1; + syncAttrListObj1[1] = waiterAttrListObj1; + sciError = + NvSciSyncAttrListReconcile(syncAttrListObj1, 2, &nvSciSyncReconciledListObj1, &nvSciSyncConflictListObj1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciSyncConflictListObj1 = nvSciSyncConflictListObj1; + resourceList.nvSciSyncReconciledListObj1 = nvSciSyncReconciledListObj1; + + sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj1, &syncObj1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciSync object\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncObj1 = syncObj1; + + sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciCtx = nvSciCtx; + + sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj2); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); +
cleanUp(&resourceList); + return 1; + } + resourceList.signalerAttrListObj2 = signalerAttrListObj2; + + sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj2); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.waiterAttrListObj2 = waiterAttrListObj2; + + err = cudlaGetNvSciSyncAttributes(reinterpret_cast(signalerAttrListObj2), CUDLA_NVSCISYNC_ATTR_SIGNAL); + if (err != cudlaSuccess) { + DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = fillCpuWaiterAttrList(waiterAttrListObj2); + if (sciError != NvSciError_Success) { + DPRINTF("Error in setting NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + syncAttrListObj2[0] = signalerAttrListObj2; + syncAttrListObj2[1] = waiterAttrListObj2; + sciError = + NvSciSyncAttrListReconcile(syncAttrListObj2, 2, &nvSciSyncReconciledListObj2, &nvSciSyncConflictListObj2); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciSyncConflictListObj2 = nvSciSyncConflictListObj2; + resourceList.nvSciSyncReconciledListObj2 = nvSciSyncReconciledListObj2; + + sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj2, &syncObj2); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciSync object\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncObj2 = syncObj2; + + // importing external semaphore + uint64_t *nvSciSyncObjRegPtr1 = NULL; + uint64_t *nvSciSyncObjRegPtr2 = NULL; + cudlaExternalSemaphoreHandleDesc semaMemDesc = {0}; + memset(&semaMemDesc, 0, sizeof(semaMemDesc)); + semaMemDesc.extSyncObject = syncObj1; + err = cudlaImportExternalSemaphore(devHandle, &semaMemDesc, &nvSciSyncObjRegPtr1, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + memset(&semaMemDesc, 0, sizeof(semaMemDesc)); + semaMemDesc.extSyncObject = syncObj2; + err = cudlaImportExternalSemaphore(devHandle, &semaMemDesc, &nvSciSyncObjRegPtr2, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Wait events + NvSciSyncFence preFence = NvSciSyncFenceInitializer; + sciError = NvSciSyncObjGenerateFence(syncObj1, &preFence); + if (sciError != NvSciError_Success) { + DPRINTF("Error in generating NvSciSyncObj fence %x\n", sciError); + cleanUp(&resourceList); + return 1; + } + resourceList.preFence = preFence; + + cudlaWaitEvents *waitEvents; + waitEvents = (cudlaWaitEvents *)malloc(sizeof(cudlaWaitEvents)); + if (waitEvents == NULL) { + DPRINTF("Error in allocating wait events\n"); + cleanUp(&resourceList); + return 1; + } + + waitEvents->numEvents = 1; + CudlaFence *preFences = (CudlaFence *)malloc(waitEvents->numEvents * sizeof(CudlaFence)); + if (preFences == NULL) { + DPRINTF("Error in allocating preFence array\n"); + cleanUp(&resourceList); + return 1; + } + + preFences[0].fence = &preFence; + preFences[0].type = CUDLA_NVSCISYNC_FENCE; + waitEvents->preFences = preFences; + resourceList.preFences = preFences; + resourceList.waitEvents = waitEvents; + + // Signal Events + cudlaSignalEvents *signalEvents; + signalEvents = (cudlaSignalEvents *)malloc(sizeof(cudlaSignalEvents)); + if (signalEvents == NULL) { + 
+        DPRINTF("Error in allocating signal events\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+
+    signalEvents->numEvents = 1;
+    uint64_t **devPtrs = (uint64_t **)malloc(signalEvents->numEvents * sizeof(uint64_t *));
+    if (devPtrs == NULL) {
+        DPRINTF("Error in allocating output pointer's array of registered objects\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    devPtrs[0] = nvSciSyncObjRegPtr2;
+    signalEvents->devPtrs = devPtrs;
+    resourceList.devPtrs = devPtrs;
+
+    signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents * sizeof(CudlaFence));
+    if (signalEvents->eofFences == NULL) {
+        DPRINTF("Error in allocating eofFence array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+
+    NvSciSyncFence eofFence = NvSciSyncFenceInitializer;
+    signalEvents->eofFences[0].fence = &eofFence;
+    signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE;
+    resourceList.signalEvents = signalEvents;
+    resourceList.eofFence = eofFence;
+
+    // Enqueue a cuDLA task.
+    cudlaTask task;
+    task.moduleHandle = moduleHandle;
+    task.outputTensor = &outputBufObjRegPtr;
+    task.numOutputTensors = 1;
+    task.numInputTensors = 1;
+    task.inputTensor = &inputBufObjRegPtr;
+    task.waitEvents = waitEvents;
+    task.signalEvents = signalEvents;
+    err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0);
+    if (err != cudlaSuccess) {
+        DPRINTF("Error in submitting task\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    DPRINTF("SUBMIT IS DONE !!!\n");
+
+    // Signal wait events
+    NvSciSyncObjSignal(syncObj1);
+    DPRINTF("SIGNALED WAIT EVENTS SUCCESSFULLY\n");
+
+    // Wait for operations to finish and bring output buffer to CPU.
+    sciError = NvSciSyncFenceWait(reinterpret_cast<NvSciSyncFence *>(signalEvents->eofFences[0].fence), nvSciCtx, -1);
+    if (sciError != NvSciError_Success) {
+        DPRINTF("Error in waiting on NvSciSyncFence\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+
+    memcpy(outputBuffer, outputBufObjBuffer, outputTensorDesc[0].size);
+
+    // Output is available in outputBuffer.
+
+    // Teardown.
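The fillCpuSignalerAttrList()/fillCpuWaiterAttrList() helpers called earlier in this hunk are defined elsewhere in the sample and are not part of this diff. For orientation, a minimal sketch of such a CPU-signaler fill, using only public NvSciSync attribute keys (assumption: the sample's real helper may set additional keys):

    // Hypothetical sketch of the CPU signaler fill referenced above; the
    // waiter variant is identical except it uses NvSciSyncAccessPerm_WaitOnly.
    static NvSciError fillCpuSignalerAttrList(NvSciSyncAttrList list)
    {
        bool cpuAccess = true; // the CPU will signal this sync object
        NvSciSyncAccessPerm perm = NvSciSyncAccessPerm_SignalOnly;
        NvSciSyncAttrKeyValuePair pairs[] = {
            {NvSciSyncAttrKey_NeedCpuAccess, (void *)&cpuAccess, sizeof(cpuAccess)},
            {NvSciSyncAttrKey_RequiredPerm, (void *)&perm, sizeof(perm)},
        };
        return NvSciSyncAttrListSetAttrs(list, pairs, sizeof(pairs) / sizeof(pairs[0]));
    }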
+ err = cudlaMemUnregister(devHandle, inputBufObjRegPtr); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + err = cudlaMemUnregister(devHandle, outputBufObjRegPtr); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr1); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr2); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); + + free(inputTensorDesc); + free(outputTensorDesc); free(loadableData); - DPRINTF("Read wrong size\n"); - return 1; - } - fclose(fp); + free(inputBuffer); + free(outputBuffer); + NvSciBufObjFree(inputBufObj); + NvSciBufObjFree(outputBufObj); + NvSciBufAttrListFree(reconciledInputAttrList); + NvSciBufAttrListFree(inputConflictList); + NvSciBufAttrListFree(inputAttrList); + NvSciBufAttrListFree(reconciledOutputAttrList); + NvSciBufAttrListFree(outputConflictList); + NvSciBufAttrListFree(outputAttrList); + NvSciBufModuleClose(bufModule); + NvSciSyncObjFree(syncObj1); + NvSciSyncObjFree(syncObj2); + NvSciSyncAttrListFree(signalerAttrListObj1); + NvSciSyncAttrListFree(waiterAttrListObj1); + NvSciSyncAttrListFree(signalerAttrListObj2); + NvSciSyncAttrListFree(waiterAttrListObj2); + NvSciSyncAttrListFree(nvSciSyncConflictListObj1); + NvSciSyncAttrListFree(nvSciSyncReconciledListObj1); + NvSciSyncAttrListFree(nvSciSyncConflictListObj2); + NvSciSyncAttrListFree(nvSciSyncReconciledListObj2); + NvSciSyncCpuWaitContextFree(nvSciCtx); + NvSciSyncModuleClose(syncModule); + free(waitEvents); + free(preFences); + free(signalEvents->eofFences); + free(signalEvents); + free(devPtrs); + NvSciSyncFenceClear(&preFence); + NvSciSyncFenceClear(&eofFence); - resourceList.loadableData = loadableData; + resourceList.inputTensorDesc = NULL; + resourceList.outputTensorDesc = NULL; + resourceList.loadableData = NULL; + resourceList.inputBuffer = NULL; + resourceList.outputBuffer = NULL; + resourceList.inputBufObj = NULL; + resourceList.outputBufObj = NULL; + resourceList.reconciledInputAttrList = NULL; + resourceList.inputConflictList = NULL; + resourceList.inputAttrList = NULL; + resourceList.reconciledOutputAttrList = NULL; + resourceList.outputConflictList = NULL; + resourceList.outputAttrList = NULL; + resourceList.bufModule = NULL; + resourceList.syncObj1 = NULL; + resourceList.syncObj2 = NULL; + resourceList.signalerAttrListObj1 = NULL; + resourceList.waiterAttrListObj1 = NULL; + resourceList.signalerAttrListObj2 = NULL; + resourceList.waiterAttrListObj2 = NULL; + resourceList.nvSciSyncConflictListObj1 = NULL; + resourceList.nvSciSyncReconciledListObj1 = NULL; + resourceList.nvSciSyncConflictListObj2 = NULL; + resourceList.nvSciSyncReconciledListObj2 = NULL; + resourceList.nvSciCtx = NULL; + resourceList.syncModule = NULL; + resourceList.waitEvents = NULL; + resourceList.signalEvents = NULL; + resourceList.preFences = NULL; + resourceList.devPtrs = NULL; - err = cudlaCreateDevice(0, &devHandle, CUDLA_STANDALONE); - if (err != cudlaSuccess) { - DPRINTF("Error in cuDLA create device = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - DPRINTF("Device created 
successfully\n"); - resourceList.devHandle = devHandle; - - err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, - &moduleHandle, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); - cleanUp(&resourceList); - return 1; - } else { - DPRINTF("Successfully loaded module\n"); - } - - resourceList.moduleHandle = moduleHandle; - // Get tensor attributes. - uint32_t numInputTensors = 0; - uint32_t numOutputTensors = 0; - cudlaModuleAttribute attribute; - - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting numInputTensors = %d\n", err); - cleanUp(&resourceList); - return 1; - } - numInputTensors = attribute.numInputTensors; - DPRINTF("numInputTensors = %d\n", numInputTensors); - - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting numOutputTensors = %d\n", err); - cleanUp(&resourceList); - return 1; - } - numOutputTensors = attribute.numOutputTensors; - DPRINTF("numOutputTensors = %d\n", numOutputTensors); - - cudlaModuleTensorDescriptor* inputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * - numInputTensors); - cudlaModuleTensorDescriptor* outputTensorDesc = - (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor) * - numOutputTensors); - - if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { - if (inputTensorDesc != NULL) { - free(inputTensorDesc); - inputTensorDesc = NULL; + err = cudlaModuleUnload(moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleUnload = %d\n", err); + cleanUp(&resourceList); + return 1; + } + else { + DPRINTF("Successfully unloaded module\n"); } - if (outputTensorDesc != NULL) { - free(outputTensorDesc); - outputTensorDesc = NULL; + resourceList.moduleHandle = NULL; + + err = cudlaDestroyDevice(devHandle); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA destroy device = %d\n", err); + return 1; } + DPRINTF("Device destroyed successfully\n"); - cleanUp(&resourceList); - return 1; - } + resourceList.devHandle = NULL; - resourceList.inputTensorDesc = inputTensorDesc; - resourceList.outputTensorDesc = outputTensorDesc; + DPRINTF("cuDLAStandaloneMode DONE !!!\n"); - attribute.inputTensorDesc = inputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_INPUT_TENSOR_DESCRIPTORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting input tensor descriptor = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("Printing input tensor descriptor\n"); - printTensorDesc(inputTensorDesc); - - attribute.outputTensorDesc = outputTensorDesc; - err = cudlaModuleGetAttributes(moduleHandle, CUDLA_OUTPUT_TENSOR_DESCRIPTORS, - &attribute); - if (err != cudlaSuccess) { - DPRINTF("Error in getting output tensor descriptor = %d\n", err); - cleanUp(&resourceList); - return 1; - } - DPRINTF("Printing output tensor descriptor\n"); - printTensorDesc(outputTensorDesc); - - // Setup the input and output buffers which will be used as an input to CUDA. 
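For reference, the two-step attribute query above (tensor counts first, then descriptor arrays sized from those counts) condenses into a small helper. This sketch reuses only the cuDLA calls already shown in this hunk, with error reporting reduced to NULL returns:

    // Sketch: query the input-tensor count, then the descriptors.
    static cudlaModuleTensorDescriptor *queryInputTensorDescs(cudlaModule hModule, uint32_t *count)
    {
        cudlaModuleAttribute attr;
        if (cudlaModuleGetAttributes(hModule, CUDLA_NUM_INPUT_TENSORS, &attr) != cudlaSuccess)
            return NULL;
        *count = attr.numInputTensors;

        cudlaModuleTensorDescriptor *desc =
            (cudlaModuleTensorDescriptor *)malloc(sizeof(*desc) * (*count));
        if (desc == NULL)
            return NULL;

        // Second call fills the caller-provided descriptor array.
        attr.inputTensorDesc = desc;
        if (cudlaModuleGetAttributes(hModule, CUDLA_INPUT_TENSOR_DESCRIPTORS, &attr) != cudlaSuccess) {
            free(desc);
            return NULL;
        }
        return desc;
    }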
- unsigned char* inputBuffer = (unsigned char*)malloc(inputTensorDesc[0].size); - if (inputBuffer == NULL) { - DPRINTF("Error in allocating input memory\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.inputBuffer = inputBuffer; - - unsigned char* outputBuffer = - (unsigned char*)malloc(outputTensorDesc[0].size); - if (outputBuffer == NULL) { - DPRINTF("Error in allocating output memory\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.outputBuffer = outputBuffer; - - memset(inputBuffer, 0x00, inputTensorDesc[0].size); - memset(outputBuffer, 0x00, outputTensorDesc[0].size); - - // Fill up the buffers with data. - if (initializeInputBuffers(argv[2], inputTensorDesc, inputBuffer) != 0) { - DPRINTF("Error in initializing input buffer from PGM image\n"); - cleanUp(&resourceList); - return 1; - } - - NvSciBufModule bufModule = NULL; - NvSciBufAttrList inputAttrList = NULL; - NvSciBufAttrList outputAttrList = NULL; - NvSciBufAttrList reconciledInputAttrList = NULL; - NvSciBufAttrList reconciledOutputAttrList = NULL; - NvSciBufAttrList inputConflictList = NULL; - NvSciBufAttrList outputConflictList = NULL; - NvSciError sciError = NvSciError_Success; - - sciError = NvSciBufModuleOpen(&bufModule); - if (sciError != NvSciError_Success) { - DPRINTF("Error in initializing NvSciBufModule\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.bufModule = bufModule; - - // creating and setting input attribute list - err = - createAndSetAttrList(bufModule, inputTensorDesc[0].size, &inputAttrList); - if (err != cudlaSuccess) { - DPRINTF("Error in creating NvSciBuf attribute list\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.inputAttrList = inputAttrList; - - sciError = NvSciBufAttrListReconcile( - &inputAttrList, 1, &reconciledInputAttrList, &inputConflictList); - if (sciError != NvSciError_Success) { - DPRINTF("Error in reconciling NvSciBuf attribute list\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.reconciledInputAttrList = reconciledInputAttrList; - resourceList.inputConflictList = inputConflictList; - - // creating and setting output attribute list - err = createAndSetAttrList(bufModule, outputTensorDesc[0].size, - &outputAttrList); - if (err != cudlaSuccess) { - DPRINTF("Error in creating NvSciBuf attribute list\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.outputAttrList = outputAttrList; - - sciError = NvSciBufAttrListReconcile( - &outputAttrList, 1, &reconciledOutputAttrList, &outputConflictList); - if (sciError != NvSciError_Success) { - DPRINTF("Error in reconciling NvSciBuf attribute list\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.reconciledOutputAttrList = reconciledOutputAttrList; - resourceList.outputConflictList = outputConflictList; - - NvSciBufObj inputBufObj, outputBufObj; - sciError = NvSciBufObjAlloc(reconciledInputAttrList, &inputBufObj); - if (sciError != NvSciError_Success) { - DPRINTF("Error in allocating NvSciBuf object\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.inputBufObj = inputBufObj; - - sciError = NvSciBufObjAlloc(reconciledOutputAttrList, &outputBufObj); - if (sciError != NvSciError_Success) { - DPRINTF("Error in allocating NvSciBuf object\n"); - cleanUp(&resourceList); - return 1; - } - - resourceList.outputBufObj = outputBufObj; - - uint64_t* inputBufObjRegPtr = NULL; - uint64_t* outputBufObjRegPtr = NULL; - void* inputBufObjBuffer; - void* outputBufObjBuffer; - - // importing external memory - cudlaExternalMemoryHandleDesc memDesc = {0}; - 
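The createAndSetAttrList() helper invoked above is likewise defined outside this diff. A hypothetical equivalent for a CPU-mappable raw buffer might look as follows (assumption: the sample's version likely sets additional keys such as the GPU id and alignment, and maps the result to a cudlaStatus):

    // Hypothetical helper: describe a raw, CPU-accessible NvSciBuf of `size` bytes.
    static NvSciError makeRawBufferAttrList(NvSciBufModule mod, uint64_t size, NvSciBufAttrList *list)
    {
        NvSciError e = NvSciBufAttrListCreate(mod, list);
        if (e != NvSciError_Success)
            return e;

        NvSciBufType bufType = NvSciBufType_RawBuffer;
        bool cpuAccess = true;
        NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
        NvSciBufAttrKeyValuePair attrs[] = {
            {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)},
            {NvSciBufRawBufferAttrKey_Size, &size, sizeof(size)},
            {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccess, sizeof(cpuAccess)},
            {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
        };
        return NvSciBufAttrListSetAttrs(*list, attrs, sizeof(attrs) / sizeof(attrs[0]));
    }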
memset(&memDesc, 0, sizeof(memDesc)); - memDesc.extBufObject = (void*)inputBufObj; - memDesc.size = inputTensorDesc[0].size; - err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in importing external memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - sciError = NvSciBufObjGetCpuPtr(inputBufObj, &inputBufObjBuffer); - if (sciError != NvSciError_Success) { - DPRINTF("Error in getting NvSciBuf CPU pointer\n"); - cleanUp(&resourceList); - return 1; - } - memcpy(inputBufObjBuffer, inputBuffer, inputTensorDesc[0].size); - - memset(&memDesc, 0, sizeof(memDesc)); - memDesc.extBufObject = (void*)outputBufObj; - memDesc.size = outputTensorDesc[0].size; - err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in importing external memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - sciError = NvSciBufObjGetCpuPtr(outputBufObj, &outputBufObjBuffer); - if (sciError != NvSciError_Success) { - DPRINTF("Error in getting NvSciBuf CPU pointer\n"); - cleanUp(&resourceList); - return 1; - } - memset(outputBufObjBuffer, 0, outputTensorDesc[0].size); - - NvSciSyncObj syncObj1, syncObj2; - NvSciSyncModule syncModule; - NvSciSyncAttrList syncAttrListObj1[2]; - NvSciSyncAttrList syncAttrListObj2[2]; - NvSciSyncCpuWaitContext nvSciCtx; - NvSciSyncAttrList waiterAttrListObj1 = NULL; - NvSciSyncAttrList signalerAttrListObj1 = NULL; - NvSciSyncAttrList waiterAttrListObj2 = NULL; - NvSciSyncAttrList signalerAttrListObj2 = NULL; - NvSciSyncAttrList nvSciSyncConflictListObj1; - NvSciSyncAttrList nvSciSyncReconciledListObj1; - NvSciSyncAttrList nvSciSyncConflictListObj2; - NvSciSyncAttrList nvSciSyncReconciledListObj2; - - sciError = NvSciSyncModuleOpen(&syncModule); - if (sciError != NvSciError_Success) { - DPRINTF("Error in initializing NvSciSyncModuleOpen\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.syncModule = syncModule; - - sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj1); - if (sciError != NvSciError_Success) { - DPRINTF("Error in creating NvSciSync attribute list\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.signalerAttrListObj1 = signalerAttrListObj1; - - sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj1); - if (sciError != NvSciError_Success) { - DPRINTF("Error in creating NvSciSync attribute list\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.waiterAttrListObj1 = waiterAttrListObj1; - - err = cudlaGetNvSciSyncAttributes( - reinterpret_cast(waiterAttrListObj1), - CUDLA_NVSCISYNC_ATTR_WAIT); - if (err != cudlaSuccess) { - DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); - cleanUp(&resourceList); - return 1; - } - - sciError = fillCpuSignalerAttrList(signalerAttrListObj1); - if (sciError != NvSciError_Success) { - DPRINTF("Error in setting NvSciSync attribute list\n"); - cleanUp(&resourceList); - return 1; - } - - syncAttrListObj1[0] = signalerAttrListObj1; - syncAttrListObj1[1] = waiterAttrListObj1; - sciError = NvSciSyncAttrListReconcile(syncAttrListObj1, 2, - &nvSciSyncReconciledListObj1, - &nvSciSyncConflictListObj1); - if (sciError != NvSciError_Success) { - DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.nvSciSyncConflictListObj1 = nvSciSyncConflictListObj1; - resourceList.nvSciSyncReconciledListObj1 = nvSciSyncReconciledListObj1; - - sciError = 
NvSciSyncObjAlloc(nvSciSyncReconciledListObj1, &syncObj1); - if (sciError != NvSciError_Success) { - DPRINTF("Error in allocating NvSciSync object\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.syncObj1 = syncObj1; - - sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx); - if (sciError != NvSciError_Success) { - DPRINTF( - "Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.nvSciCtx = nvSciCtx; - - sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj2); - if (sciError != NvSciError_Success) { - DPRINTF("Error in creating NvSciSync attribute list\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.signalerAttrListObj2 = signalerAttrListObj2; - - sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj2); - if (sciError != NvSciError_Success) { - DPRINTF("Error in creating NvSciSync attribute list\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.waiterAttrListObj2 = waiterAttrListObj2; - - err = cudlaGetNvSciSyncAttributes( - reinterpret_cast(signalerAttrListObj2), - CUDLA_NVSCISYNC_ATTR_SIGNAL); - if (err != cudlaSuccess) { - DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); - cleanUp(&resourceList); - return 1; - } - - sciError = fillCpuWaiterAttrList(waiterAttrListObj2); - if (sciError != NvSciError_Success) { - DPRINTF("Error in setting NvSciSync attribute list\n"); - cleanUp(&resourceList); - return 1; - } - - syncAttrListObj2[0] = signalerAttrListObj2; - syncAttrListObj2[1] = waiterAttrListObj2; - sciError = NvSciSyncAttrListReconcile(syncAttrListObj2, 2, - &nvSciSyncReconciledListObj2, - &nvSciSyncConflictListObj2); - if (sciError != NvSciError_Success) { - DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.nvSciSyncConflictListObj2 = nvSciSyncConflictListObj2; - resourceList.nvSciSyncReconciledListObj2 = nvSciSyncReconciledListObj2; - - sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj2, &syncObj2); - if (sciError != NvSciError_Success) { - DPRINTF("Error in allocating NvSciSync object\n"); - cleanUp(&resourceList); - return 1; - } - resourceList.syncObj2 = syncObj2; - - // importing external semaphore - uint64_t* nvSciSyncObjRegPtr1 = NULL; - uint64_t* nvSciSyncObjRegPtr2 = NULL; - cudlaExternalSemaphoreHandleDesc semaMemDesc = {0}; - memset(&semaMemDesc, 0, sizeof(semaMemDesc)); - semaMemDesc.extSyncObject = syncObj1; - err = cudlaImportExternalSemaphore(devHandle, &semaMemDesc, - &nvSciSyncObjRegPtr1, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in importing external semaphore = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - memset(&semaMemDesc, 0, sizeof(semaMemDesc)); - semaMemDesc.extSyncObject = syncObj2; - err = cudlaImportExternalSemaphore(devHandle, &semaMemDesc, - &nvSciSyncObjRegPtr2, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in importing external semaphore = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); - - // Wait events - NvSciSyncFence preFence = NvSciSyncFenceInitializer; - sciError = NvSciSyncObjGenerateFence(syncObj1, &preFence); - if (sciError != NvSciError_Success) { - DPRINTF("Error in generating NvSciSyncObj fence %x\n", sciError); - cleanUp(&resourceList); - return 1; - } - resourceList.preFence = preFence; - - cudlaWaitEvents* waitEvents; - waitEvents = (cudlaWaitEvents*)malloc(sizeof(cudlaWaitEvents)); - if (waitEvents == NULL) { - 
DPRINTF("Error in allocating wait events\n"); - cleanUp(&resourceList); - return 1; - } - - waitEvents->numEvents = 1; - CudlaFence* preFences = - (CudlaFence*)malloc(waitEvents->numEvents * sizeof(CudlaFence)); - if (preFences == NULL) { - DPRINTF("Error in allocating preFence array\n"); - cleanUp(&resourceList); - return 1; - } - - preFences[0].fence = &preFence; - preFences[0].type = CUDLA_NVSCISYNC_FENCE; - waitEvents->preFences = preFences; - resourceList.preFences = preFences; - resourceList.waitEvents = waitEvents; - - // Signal Events - cudlaSignalEvents* signalEvents; - signalEvents = (cudlaSignalEvents*)malloc(sizeof(cudlaSignalEvents)); - if (signalEvents == NULL) { - DPRINTF("Error in allocating signal events\n"); - cleanUp(&resourceList); - return 1; - } - - signalEvents->numEvents = 1; - uint64_t** devPtrs = - (uint64_t**)malloc(signalEvents->numEvents * sizeof(uint64_t*)); - if (devPtrs == NULL) { - DPRINTF( - "Error in allocating output pointer's array of registered objects\n"); - cleanUp(&resourceList); - return 1; - } - devPtrs[0] = nvSciSyncObjRegPtr2; - signalEvents->devPtrs = devPtrs; - resourceList.devPtrs = devPtrs; - - signalEvents->eofFences = - (CudlaFence*)malloc(signalEvents->numEvents * sizeof(CudlaFence)); - if (signalEvents->eofFences == NULL) { - DPRINTF("Error in allocating eofFence array\n"); - cleanUp(&resourceList); - return 1; - } - - NvSciSyncFence eofFence = NvSciSyncFenceInitializer; - signalEvents->eofFences[0].fence = &eofFence; - signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE; - resourceList.signalEvents = signalEvents; - resourceList.eofFence = eofFence; - - // Enqueue a cuDLA task. - cudlaTask task; - task.moduleHandle = moduleHandle; - task.outputTensor = &outputBufObjRegPtr; - task.numOutputTensors = 1; - task.numInputTensors = 1; - task.inputTensor = &inputBufObjRegPtr; - task.waitEvents = waitEvents; - task.signalEvents = signalEvents; - err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in submitting task\n"); - cleanUp(&resourceList); - return 1; - } - DPRINTF("SUBMIT IS DONE !!!\n"); - - // Signal wait events - NvSciSyncObjSignal(syncObj1); - DPRINTF("SIGNALED WAIT EVENTS SUCCESSFULLY\n"); - - // Wait for operations to finish and bring output buffer to CPU. - sciError = NvSciSyncFenceWait( - reinterpret_cast(signalEvents->eofFences[0].fence), - nvSciCtx, -1); - if (sciError != NvSciError_Success) { - DPRINTF("Error in waiting on NvSciSyncFence\n"); - cleanUp(&resourceList); - return 1; - } - - memcpy(outputBuffer, outputBufObjBuffer, outputTensorDesc[0].size); - - // Output is available in outputBuffer. - - // Teardown. 
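Note that NvSciSyncFenceWait() is invoked here with a timeout of -1, i.e. block until the DLA signals the end-of-frame fence. A bounded wait is a reasonable variant when a stalled task should be reported instead of hung on (sketch; the 5-second budget is an arbitrary example value):

    // Sketch: bounded wait on the EOF fence instead of waiting forever.
    NvSciError waitEofFenceBounded(NvSciSyncFence *fence, NvSciSyncCpuWaitContext ctx)
    {
        const int64_t timeoutUs = 5 * 1000 * 1000; // 5 s, example only
        NvSciError e = NvSciSyncFenceWait(fence, ctx, timeoutUs);
        if (e == NvSciError_Timeout) {
            // The task did not complete in time; report and clean up.
        }
        return e;
    }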
- err = cudlaMemUnregister(devHandle, inputBufObjRegPtr); - if (err != cudlaSuccess) { - DPRINTF("Error in unregistering external memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - err = cudlaMemUnregister(devHandle, outputBufObjRegPtr); - if (err != cudlaSuccess) { - DPRINTF("Error in unregistering external memory = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr1); - if (err != cudlaSuccess) { - DPRINTF("Error in unregistering external semaphore = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr2); - if (err != cudlaSuccess) { - DPRINTF("Error in unregistering external semaphore = %d\n", err); - cleanUp(&resourceList); - return 1; - } - - DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); - - free(inputTensorDesc); - free(outputTensorDesc); - free(loadableData); - free(inputBuffer); - free(outputBuffer); - NvSciBufObjFree(inputBufObj); - NvSciBufObjFree(outputBufObj); - NvSciBufAttrListFree(reconciledInputAttrList); - NvSciBufAttrListFree(inputConflictList); - NvSciBufAttrListFree(inputAttrList); - NvSciBufAttrListFree(reconciledOutputAttrList); - NvSciBufAttrListFree(outputConflictList); - NvSciBufAttrListFree(outputAttrList); - NvSciBufModuleClose(bufModule); - NvSciSyncObjFree(syncObj1); - NvSciSyncObjFree(syncObj2); - NvSciSyncAttrListFree(signalerAttrListObj1); - NvSciSyncAttrListFree(waiterAttrListObj1); - NvSciSyncAttrListFree(signalerAttrListObj2); - NvSciSyncAttrListFree(waiterAttrListObj2); - NvSciSyncAttrListFree(nvSciSyncConflictListObj1); - NvSciSyncAttrListFree(nvSciSyncReconciledListObj1); - NvSciSyncAttrListFree(nvSciSyncConflictListObj2); - NvSciSyncAttrListFree(nvSciSyncReconciledListObj2); - NvSciSyncCpuWaitContextFree(nvSciCtx); - NvSciSyncModuleClose(syncModule); - free(waitEvents); - free(preFences); - free(signalEvents->eofFences); - free(signalEvents); - free(devPtrs); - NvSciSyncFenceClear(&preFence); - NvSciSyncFenceClear(&eofFence); - - resourceList.inputTensorDesc = NULL; - resourceList.outputTensorDesc = NULL; - resourceList.loadableData = NULL; - resourceList.inputBuffer = NULL; - resourceList.outputBuffer = NULL; - resourceList.inputBufObj = NULL; - resourceList.outputBufObj = NULL; - resourceList.reconciledInputAttrList = NULL; - resourceList.inputConflictList = NULL; - resourceList.inputAttrList = NULL; - resourceList.reconciledOutputAttrList = NULL; - resourceList.outputConflictList = NULL; - resourceList.outputAttrList = NULL; - resourceList.bufModule = NULL; - resourceList.syncObj1 = NULL; - resourceList.syncObj2 = NULL; - resourceList.signalerAttrListObj1 = NULL; - resourceList.waiterAttrListObj1 = NULL; - resourceList.signalerAttrListObj2 = NULL; - resourceList.waiterAttrListObj2 = NULL; - resourceList.nvSciSyncConflictListObj1 = NULL; - resourceList.nvSciSyncReconciledListObj1 = NULL; - resourceList.nvSciSyncConflictListObj2 = NULL; - resourceList.nvSciSyncReconciledListObj2 = NULL; - resourceList.nvSciCtx = NULL; - resourceList.syncModule = NULL; - resourceList.waitEvents = NULL; - resourceList.signalEvents = NULL; - resourceList.preFences = NULL; - resourceList.devPtrs = NULL; - - err = cudlaModuleUnload(moduleHandle, 0); - if (err != cudlaSuccess) { - DPRINTF("Error in cudlaModuleUnload = %d\n", err); - cleanUp(&resourceList); - return 1; - } else { - DPRINTF("Successfully unloaded module\n"); - } - - resourceList.moduleHandle = NULL; - - err = cudlaDestroyDevice(devHandle); - if (err != cudlaSuccess) 
{ - DPRINTF("Error in cuDLA destroy device = %d\n", err); - return 1; - } - DPRINTF("Device destroyed successfully\n"); - - resourceList.devHandle = NULL; - - DPRINTF("cuDLAStandaloneMode DONE !!!\n"); - - return 0; + return 0; } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/README.md b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/README.md index 3e1f3597..02055f45 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/README.md +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.cpp index 57df5784..0b0fc34c 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.cpp @@ -27,162 +27,165 @@ #include "cudaNvSciBufMultiplanar.h" NvSciBufModule module; -NvSciBufObj buffObj; -CUuuid uuid; +NvSciBufObj buffObj; +CUuuid uuid; -void flipBits(uint8_t *pBuff, uint32_t size) { +void flipBits(uint8_t *pBuff, uint32_t size) +{ for (uint32_t i = 0; i < size; i++) { pBuff[i] = (~pBuff[i]); } } // Compare input and generated image files -void compareFiles(std::string &path1, std::string &path2) { - bool result = true; +void compareFiles(std::string &path1, std::string &path2) +{ + bool result = true; FILE *fp1, *fp2; - int ch1, ch2; - + int ch1, ch2; + fp1 = fopen(path1.c_str(), "rb"); fp2 = fopen(path2.c_str(), "rb"); if (!fp1) { result = false; printf("File %s open failed in %s line %d\n", path1.c_str(), __FILE__, __LINE__); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } if (!fp2) { result = false; printf("File %s open failed in %s line %d\n", path2.c_str(), __FILE__, __LINE__); exit(EXIT_FAILURE); } - - do { - ch1 = getc(fp1); + + do { + ch1 = getc(fp1); ch2 = getc(fp2); - if (ch1 != ch2) { - result = false; + if (ch1 != ch2) { + result = false; break; - } - } while(ch1 != EOF && ch2 != EOF); + } + } while (ch1 != EOF && ch2 != EOF); if (result) { printf("Input file : %s and output file : %s match SUCCESS\n", path1.c_str(), path2.c_str()); - } + } else { printf("Input file : %s and output file : %s match FAILURE\n", path1.c_str(), path2.c_str()); } - + if (fp1) { fclose(fp1); } if (fp2) { fclose(fp2); } -} +} -void Caller::init() { +void Caller::init() +{ checkNvSciErrors(NvSciBufAttrListCreate(module, &attrList)); attrListOut = NULL; } -void Caller::deinit() { +void Caller::deinit() +{ NvSciBufAttrListFree(attrList); checkCudaErrors(cudaDestroyExternalMemory(extMem)); } -// Set NvSciBufImage attribute values in the attribute list -void Caller::setAttrListImageMultiPlanes(int imageWidth, int imageHeight) { - NvSciBufType bufType = NvSciBufType_Image; - NvSciBufAttrValImageLayoutType layout = NvSciBufImage_BlockLinearType; - bool cpuAccessFlag = false; - NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; - NvSciRmGpuId gpuid; - bool vpr = false; - int32_t planeCount = PLANAR_NUM_PLANES; - int drvVersion; +// Set NvSciBufImage attribute values in the attribute list +void Caller::setAttrListImageMultiPlanes(int imageWidth, int imageHeight) +{ + NvSciBufType bufType = NvSciBufType_Image; + 
NvSciBufAttrValImageLayoutType layout = NvSciBufImage_BlockLinearType;
+    bool cpuAccessFlag = false;
+    NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
+    NvSciRmGpuId gpuid;
+    bool vpr = false;
+    int32_t planeCount = PLANAR_NUM_PLANES;
+    int drvVersion;

     // Dimensions of the imported image in the YUV 420 planar format
-    int32_t planeWidths[] = {imageWidth, imageWidth/2, imageWidth/2};
-    int32_t planeHeights[] = {imageHeight, imageHeight/2, imageHeight/2};
+    int32_t planeWidths[] = {imageWidth, imageWidth / 2, imageWidth / 2};
+    int32_t planeHeights[] = {imageHeight, imageHeight / 2, imageHeight / 2};
     NvSciBufAttrKeyValuePair keyPair;
     NvSciBufAttrKeyValuePair pairArray[ATTR_SIZE];
-
-    NvSciBufAttrValColorFmt planeColorFmts[] =
-        { NvSciColor_Y8, NvSciColor_V8, NvSciColor_U8 };
-    NvSciBufAttrValImageScanType planeScanType[] =
-        { NvSciBufScan_ProgressiveType };
-
+
+    NvSciBufAttrValColorFmt planeColorFmts[] = {NvSciColor_Y8, NvSciColor_V8, NvSciColor_U8};
+    NvSciBufAttrValImageScanType planeScanType[] = {NvSciBufScan_ProgressiveType};
+
     memcpy(&gpuid.bytes, &uuid.bytes, sizeof(uuid.bytes));
-
+
     NvSciBufAttrKeyValuePair imgBuffAttrsArr[] = {
-        { NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType) },
-        { NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccessFlag,
-          sizeof(cpuAccessFlag) },
-        { NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm) },
-        { NvSciBufGeneralAttrKey_GpuId, &gpuid, sizeof(gpuid) },
-        { NvSciBufImageAttrKey_Layout, &layout, sizeof(layout) },
-        { NvSciBufImageAttrKey_VprFlag, &vpr, sizeof(vpr) },
-        { NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount) },
-        { NvSciBufImageAttrKey_PlaneColorFormat, planeColorFmts,
-          sizeof(planeColorFmts) },
-        { NvSciBufImageAttrKey_PlaneWidth, planeWidths, sizeof(planeWidths) },
-        { NvSciBufImageAttrKey_PlaneHeight, planeHeights,
-          sizeof(planeHeights) },
-        { NvSciBufImageAttrKey_PlaneScanType, planeScanType,
-          sizeof(planeScanType) },
+        {NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType)},
+        {NvSciBufGeneralAttrKey_NeedCpuAccess, &cpuAccessFlag, sizeof(cpuAccessFlag)},
+        {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
+        {NvSciBufGeneralAttrKey_GpuId, &gpuid, sizeof(gpuid)},
+        {NvSciBufImageAttrKey_Layout, &layout, sizeof(layout)},
+        {NvSciBufImageAttrKey_VprFlag, &vpr, sizeof(vpr)},
+        {NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount)},
+        {NvSciBufImageAttrKey_PlaneColorFormat, planeColorFmts, sizeof(planeColorFmts)},
+        {NvSciBufImageAttrKey_PlaneWidth, planeWidths, sizeof(planeWidths)},
+        {NvSciBufImageAttrKey_PlaneHeight, planeHeights, sizeof(planeHeights)},
+        {NvSciBufImageAttrKey_PlaneScanType, planeScanType, sizeof(planeScanType)},
     };
-
-    std::vector<NvSciBufAttrKeyValuePair> imgBuffAttrsVec(imgBuffAttrsArr,
-        imgBuffAttrsArr+(sizeof(imgBuffAttrsArr)/sizeof(imgBuffAttrsArr[0])));
-
+
+    std::vector<NvSciBufAttrKeyValuePair> imgBuffAttrsVec(
+        imgBuffAttrsArr, imgBuffAttrsArr + (sizeof(imgBuffAttrsArr) / sizeof(imgBuffAttrsArr[0])));
+
     memset(pairArray, 0, sizeof(NvSciBufAttrKeyValuePair) * imgBuffAttrsVec.size());
     std::copy(imgBuffAttrsVec.begin(), imgBuffAttrsVec.end(), pairArray);
     checkNvSciErrors(NvSciBufAttrListSetAttrs(attrList, pairArray, imgBuffAttrsVec.size()));
 }

 cudaNvSciBufMultiplanar::cudaNvSciBufMultiplanar(size_t width, size_t height, std::vector<int> &deviceIds)
-    : imageWidth(width),
-      imageHeight(height) {
-    mCudaDeviceId = deviceIds[0];
-    attrListReconciled = NULL;
-    attrListConflict = NULL;
-    checkNvSciErrors(NvSciBufModuleOpen(&module));
-    initCuda(mCudaDeviceId);
-  }
+    : imageWidth(width)
+    ,
imageHeight(height) +{ + mCudaDeviceId = deviceIds[0]; + attrListReconciled = NULL; + attrListConflict = NULL; + checkNvSciErrors(NvSciBufModuleOpen(&module)); + initCuda(mCudaDeviceId); +} -void cudaNvSciBufMultiplanar::initCuda(int devId) { - int major = 0, minor = 0, drvVersion; +void cudaNvSciBufMultiplanar::initCuda(int devId) +{ + int major = 0, minor = 0, drvVersion; NvSciRmGpuId gpuid; checkCudaErrors(cudaSetDevice(mCudaDeviceId)); - checkCudaErrors(cudaDeviceGetAttribute( - &major, cudaDevAttrComputeCapabilityMajor, mCudaDeviceId)); - checkCudaErrors(cudaDeviceGetAttribute( - &minor, cudaDevAttrComputeCapabilityMinor, mCudaDeviceId)); - printf( - "[cudaNvSciBufMultiplanar] GPU Device %d: \"%s\" with compute capability " - "%d.%d\n\n", - mCudaDeviceId, _ConvertSMVer2ArchName(major, minor), major, minor); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, mCudaDeviceId)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, mCudaDeviceId)); + printf("[cudaNvSciBufMultiplanar] GPU Device %d: \"%s\" with compute capability " + "%d.%d\n\n", + mCudaDeviceId, + _ConvertSMVer2ArchName(major, minor), + major, + minor); checkCudaDrvErrors(cuDriverGetVersion(&drvVersion)); if (drvVersion <= 11030) { checkCudaDrvErrors(cuDeviceGetUuid(&uuid, devId)); - } else { + } + else { checkCudaDrvErrors(cuDeviceGetUuid_v2(&uuid, devId)); } } -/* +/* Caller1 flips a YUV image which is allocated to nvscibuf APIs and copied into CUDA Array. -It is mapped to CUDA surface and bit flip is done. Caller2 in the same thread copies +It is mapped to CUDA surface and bit flip is done. Caller2 in the same thread copies CUDA Array to a YUV image file. The original image is compared with the double bit flipped image. 
*/ -void cudaNvSciBufMultiplanar::runCudaNvSciBufPlanar(std::string &imageFilename, std::string &imageFilenameOut) { +void cudaNvSciBufMultiplanar::runCudaNvSciBufPlanar(std::string &imageFilename, std::string &imageFilenameOut) +{ cudaArray_t levelArray1[PLANAR_NUM_PLANES]; cudaArray_t levelArray2[PLANAR_NUM_PLANES]; - Caller caller1; - Caller caller2; + Caller caller1; + Caller caller2; int numPlanes = PLANAR_NUM_PLANES; caller1.init(); @@ -191,23 +194,23 @@ void cudaNvSciBufMultiplanar::runCudaNvSciBufPlanar(std::string &imageFilename, // Set NvSciBufImage attribute values in the attribute list caller1.setAttrListImageMultiPlanes(imageWidth, imageHeight); caller2.setAttrListImageMultiPlanes(imageWidth, imageHeight); - + // Reconcile attribute lists and allocate NvSciBuf object - reconcileAttrList(&caller1.attrList, &caller2.attrList); + reconcileAttrList(&caller1.attrList, &caller2.attrList); caller1.copyExtMemToMultiPlanarArrays(); for (int i = 0; i < numPlanes; i++) { - checkCudaErrors(cudaGetMipmappedArrayLevel(&levelArray1[i], caller1.multiPlanarArray[i], 0)); + checkCudaErrors(cudaGetMipmappedArrayLevel(&levelArray1[i], caller1.multiPlanarArray[i], 0)); } caller1.copyYUVToCudaArrayAndFlipBits(imageFilename, levelArray1); - + caller2.copyExtMemToMultiPlanarArrays(); for (int i = 0; i < numPlanes; i++) { - checkCudaErrors(cudaGetMipmappedArrayLevel(&levelArray2[i], caller2.multiPlanarArray[i], 0)); + checkCudaErrors(cudaGetMipmappedArrayLevel(&levelArray2[i], caller2.multiPlanarArray[i], 0)); } // Maps cudaArray to surface memory and launches a kernel to flip bits launchFlipSurfaceBitsKernel(levelArray2, caller2.multiPlanarWidth, caller2.multiPlanarHeight, numPlanes); - - // Synchronization can be done using nvSciSync when non CUDA callers and cross-process signaler-waiter + + // Synchronization can be done using nvSciSync when non CUDA callers and cross-process signaler-waiter // applications are involved. Please refer to the cudaNvSci sample library for more details. 
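The double-flip round trip validates the pipeline because bitwise NOT is an involution: ~(~b) == b for any byte, so data flipped once on upload and once more by the kernel compares equal to the original file. Before the cuCtxSynchronize() call that follows, the resource chain this function walks is worth spelling out; the recap below only reuses calls already present in this sample (error checking via the existing helpers, variable names illustrative):

    // NvSciBufObj -> imported external memory -> mipmapped array ->
    // level-0 cudaArray_t -> surface object a kernel can read/write.
    cudaArray_t level0;
    checkCudaErrors(cudaGetMipmappedArrayLevel(&level0, caller1.multiPlanarArray[0], 0));

    cudaResourceDesc rd;
    memset(&rd, 0, sizeof(rd));
    rd.resType = cudaResourceTypeArray;
    rd.res.array.array = level0;

    cudaSurfaceObject_t surf = 0;
    checkCudaErrors(cudaCreateSurfaceObject(&surf, &rd));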
checkCudaDrvErrors(cuCtxSynchronize()); printf("Bit flip of the surface memory done\n"); @@ -225,66 +228,69 @@ void cudaNvSciBufMultiplanar::runCudaNvSciBufPlanar(std::string &imageFilename, } // Map NvSciBufObj to cudaMipmappedArray -void Caller::copyExtMemToMultiPlanarArrays() { +void Caller::copyExtMemToMultiPlanarArrays() +{ checkNvSciErrors(NvSciBufObjGetAttrList(buffObj, &attrListOut)); memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * PLANE_ATTR_SIZE); - cudaExternalMemoryHandleDesc memHandleDesc; + cudaExternalMemoryHandleDesc memHandleDesc; cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0}; - cudaChannelFormatDesc desc = {0}; - cudaExtent extent = {0}; - - pairArrayOut[PLANE_SIZE].key = NvSciBufImageAttrKey_Size; // Datatype: @c uint64_t - pairArrayOut[PLANE_ALIGNED_SIZE].key = NvSciBufImageAttrKey_PlaneAlignedSize; // Datatype: @c uint64_t[] - pairArrayOut[PLANE_OFFSET].key = NvSciBufImageAttrKey_PlaneOffset; // Datatype: @c uint64_t[] - pairArrayOut[PLANE_HEIGHT].key = NvSciBufImageAttrKey_PlaneHeight; // Datatype: @c uint32_t[] - pairArrayOut[PLANE_WIDTH].key = NvSciBufImageAttrKey_PlaneWidth; // Datatype: @c int32_t[] - pairArrayOut[PLANE_CHANNEL_COUNT].key = NvSciBufImageAttrKey_PlaneChannelCount; // Datatype: @c uint8_t - pairArrayOut[PLANE_BITS_PER_PIXEL].key = NvSciBufImageAttrKey_PlaneBitsPerPixel;// Datatype: @c uint32_t[] - pairArrayOut[PLANE_COUNT].key = NvSciBufImageAttrKey_PlaneCount; // Datatype: @c uint32_t - checkNvSciErrors(NvSciBufAttrListGetAttrs(attrListOut, pairArrayOut, (PLANE_ATTR_SIZE))); - - uint64_t size = *(uint64_t*)pairArrayOut[PLANE_SIZE].value; - uint64_t *planeAlignedSize = (uint64_t*)pairArrayOut[PLANE_ALIGNED_SIZE].value; - int32_t *planeWidth = (int32_t*)pairArrayOut[PLANE_WIDTH].value; - int32_t *planeHeight = (int32_t*)pairArrayOut[PLANE_HEIGHT].value; - uint64_t *planeOffset = (uint64_t*)pairArrayOut[PLANE_OFFSET].value; - uint8_t planeChannelCount = *(uint8_t*)pairArrayOut[PLANE_CHANNEL_COUNT].value; - uint32_t *planeBitsPerPixel = (uint32_t*)pairArrayOut[PLANE_BITS_PER_PIXEL].value; - uint32_t planeCount = *(uint32_t*)pairArrayOut[PLANE_COUNT].value; - + cudaChannelFormatDesc desc = {0}; + cudaExtent extent = {0}; + + pairArrayOut[PLANE_SIZE].key = NvSciBufImageAttrKey_Size; // Datatype: @c uint64_t + pairArrayOut[PLANE_ALIGNED_SIZE].key = NvSciBufImageAttrKey_PlaneAlignedSize; // Datatype: @c uint64_t[] + pairArrayOut[PLANE_OFFSET].key = NvSciBufImageAttrKey_PlaneOffset; // Datatype: @c uint64_t[] + pairArrayOut[PLANE_HEIGHT].key = NvSciBufImageAttrKey_PlaneHeight; // Datatype: @c uint32_t[] + pairArrayOut[PLANE_WIDTH].key = NvSciBufImageAttrKey_PlaneWidth; // Datatype: @c int32_t[] + pairArrayOut[PLANE_CHANNEL_COUNT].key = NvSciBufImageAttrKey_PlaneChannelCount; // Datatype: @c uint8_t + pairArrayOut[PLANE_BITS_PER_PIXEL].key = NvSciBufImageAttrKey_PlaneBitsPerPixel; // Datatype: @c uint32_t[] + pairArrayOut[PLANE_COUNT].key = NvSciBufImageAttrKey_PlaneCount; // Datatype: @c uint32_t + checkNvSciErrors(NvSciBufAttrListGetAttrs(attrListOut, pairArrayOut, (PLANE_ATTR_SIZE))); + + uint64_t size = *(uint64_t *)pairArrayOut[PLANE_SIZE].value; + uint64_t *planeAlignedSize = (uint64_t *)pairArrayOut[PLANE_ALIGNED_SIZE].value; + int32_t *planeWidth = (int32_t *)pairArrayOut[PLANE_WIDTH].value; + int32_t *planeHeight = (int32_t *)pairArrayOut[PLANE_HEIGHT].value; + uint64_t *planeOffset = (uint64_t *)pairArrayOut[PLANE_OFFSET].value; + uint8_t planeChannelCount = *(uint8_t *)pairArrayOut[PLANE_CHANNEL_COUNT].value; + uint32_t 
*planeBitsPerPixel = (uint32_t *)pairArrayOut[PLANE_BITS_PER_PIXEL].value; + uint32_t planeCount = *(uint32_t *)pairArrayOut[PLANE_COUNT].value; + numPlanes = planeCount; - + for (int i = 0; i < numPlanes; i++) { - multiPlanarWidth[i] = planeWidth[i]; + multiPlanarWidth[i] = planeWidth[i]; multiPlanarHeight[i] = planeHeight[i]; } - + memset(&memHandleDesc, 0, sizeof(memHandleDesc)); - memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; + memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; memHandleDesc.handle.nvSciBufObject = buffObj; - memHandleDesc.size = size; + memHandleDesc.size = size; checkCudaErrors(cudaImportExternalMemory(&extMem, &memHandleDesc)); - + desc = cudaCreateChannelDesc(planeBitsPerPixel[0], 0, 0, 0, cudaChannelFormatKindUnsigned); memset(&mipmapDesc, 0, sizeof(mipmapDesc)); - mipmapDesc.numLevels = 1; + mipmapDesc.numLevels = 1; - for (int i = 0; i < numPlanes; i++) { + for (int i = 0; i < numPlanes; i++) { memset(&extent, 0, sizeof(extent)); - extent.width = planeWidth[i]; - extent.height = planeHeight[i]; - extent.depth = 0; - mipmapDesc.offset = planeOffset[i]; + extent.width = planeWidth[i]; + extent.height = planeHeight[i]; + extent.depth = 0; + mipmapDesc.offset = planeOffset[i]; mipmapDesc.formatDesc = desc; - mipmapDesc.extent = extent; - mipmapDesc.flags = cudaArraySurfaceLoadStore;; + mipmapDesc.extent = extent; + mipmapDesc.flags = cudaArraySurfaceLoadStore; + ; checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&multiPlanarArray[i], extMem, &mipmapDesc)); } } -void cudaNvSciBufMultiplanar::reconcileAttrList(NvSciBufAttrList *attrList1, NvSciBufAttrList *attrList2) { - attrList[0] = *attrList1; - attrList[1] = *attrList2; +void cudaNvSciBufMultiplanar::reconcileAttrList(NvSciBufAttrList *attrList1, NvSciBufAttrList *attrList2) +{ + attrList[0] = *attrList1; + attrList[1] = *attrList2; bool isReconciled = false; checkNvSciErrors(NvSciBufAttrListReconcile(attrList, 2, &attrListReconciled, &attrListConflict)); @@ -294,38 +300,40 @@ void cudaNvSciBufMultiplanar::reconcileAttrList(NvSciBufAttrList *attrList1, NvS } // YUV 420 image is flipped and copied to cuda Array which is mapped to nvsciBuf -void Caller::copyYUVToCudaArrayAndFlipBits(std::string &path, cudaArray_t *cudaArr) { - FILE *fp = NULL; +void Caller::copyYUVToCudaArrayAndFlipBits(std::string &path, cudaArray_t *cudaArr) +{ + FILE *fp = NULL; uint8_t *pYBuff, *pUBuff, *pVBuff, *pChroma; - uint8_t *pBuff = NULL; + uint8_t *pBuff = NULL; uint32_t uvOffset[numPlanes] = {0}, copyWidthInBytes[numPlanes] = {0}, copyHeight[numPlanes] = {0}; - uint32_t width = multiPlanarWidth[0]; + uint32_t width = multiPlanarWidth[0]; uint32_t height = multiPlanarHeight[0]; fp = fopen(path.c_str(), "rb"); if (!fp) { printf("CudaProducer: Error opening file: %s in %s line %d\n", path.c_str(), __FILE__, __LINE__); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } - pBuff = (uint8_t*)malloc((width * height * PLANAR_CHROMA_WIDTH_ORDER * PLANAR_CHROMA_HEIGHT_ORDER) * sizeof(unsigned char)); + pBuff = (uint8_t *)malloc((width * height * PLANAR_CHROMA_WIDTH_ORDER * PLANAR_CHROMA_HEIGHT_ORDER) + * sizeof(unsigned char)); if (!pBuff) { printf("CudaProducer: Failed to allocate image buffer in %s line %d\n", __FILE__, __LINE__); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } - // Y V U order in the buffer. Fully planar formats use + // Y V U order in the buffer. Fully planar formats use // three planes to store the Y, Cb and Cr components separately. 
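Concretely, for the sample's default 720x480 input, the Y/V/U layout described by the comment above works out to the following sizes and offsets (worked example; the numbers follow directly from a full-resolution luma plane plus half-resolution chroma planes):

    // Plane sizes/offsets for a 720x480 YUV 420 frame in Y, V, U order.
    const uint32_t w = 720, h = 480;
    const uint32_t ySize = w * h;                  // 345600 bytes at offset 0
    const uint32_t chromaSize = (w / 2) * (h / 2); //  86400 bytes per chroma plane
    const uint32_t vOffset = ySize;                // V plane starts at 345600
    const uint32_t uOffset = ySize + chromaSize;   // U plane starts at 432000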
- pYBuff = pBuff; + pYBuff = pBuff; pVBuff = pYBuff + width * height; - pUBuff = pVBuff + (width / PLANAR_CHROMA_WIDTH_ORDER) * (height / PLANAR_CHROMA_HEIGHT_ORDER); + pUBuff = pVBuff + (width / PLANAR_CHROMA_WIDTH_ORDER) * (height / PLANAR_CHROMA_HEIGHT_ORDER); for (uint32_t i = 0; i < height; i++) { if (fread(pYBuff, width, 1, fp) != 1) { printf("ReadYUVFrame: Error reading file: %s in %s line %d\n", path.c_str(), __FILE__, __LINE__); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } flipBits(pYBuff, width); pYBuff += width; } - + pChroma = pVBuff; for (uint32_t i = 0; i < height / PLANAR_CHROMA_HEIGHT_ORDER; i++) { if (fread(pChroma, width / PLANAR_CHROMA_WIDTH_ORDER, 1, fp) != 1) { @@ -335,7 +343,7 @@ void Caller::copyYUVToCudaArrayAndFlipBits(std::string &path, cudaArray_t *cudaA flipBits(pChroma, width); pChroma += width / PLANAR_CHROMA_WIDTH_ORDER; } - + pChroma = pUBuff; for (uint32_t i = 0; i < height / PLANAR_CHROMA_HEIGHT_ORDER; i++) { if (fread(pChroma, width / PLANAR_CHROMA_WIDTH_ORDER, 1, fp) != 1) { @@ -345,22 +353,26 @@ void Caller::copyYUVToCudaArrayAndFlipBits(std::string &path, cudaArray_t *cudaA flipBits(pChroma, width); pChroma += width / PLANAR_CHROMA_WIDTH_ORDER; } - uvOffset[0] = 0; - copyHeight[0] = height; - copyHeight[1] = height / PLANAR_CHROMA_HEIGHT_ORDER; - copyHeight[2] = height / PLANAR_CHROMA_HEIGHT_ORDER; + uvOffset[0] = 0; + copyHeight[0] = height; + copyHeight[1] = height / PLANAR_CHROMA_HEIGHT_ORDER; + copyHeight[2] = height / PLANAR_CHROMA_HEIGHT_ORDER; copyWidthInBytes[0] = width; - // Width of the second and third planes is half of the first plane. - copyWidthInBytes[1] = width / PLANAR_CHROMA_WIDTH_ORDER; - copyWidthInBytes[2] = width / PLANAR_CHROMA_WIDTH_ORDER; - uvOffset[1] = width * height; - uvOffset[2] = uvOffset[1] + (width / PLANAR_CHROMA_WIDTH_ORDER) * (height / PLANAR_CHROMA_HEIGHT_ORDER); + // Width of the second and third planes is half of the first plane. 
+ copyWidthInBytes[1] = width / PLANAR_CHROMA_WIDTH_ORDER; + copyWidthInBytes[2] = width / PLANAR_CHROMA_WIDTH_ORDER; + uvOffset[1] = width * height; + uvOffset[2] = uvOffset[1] + (width / PLANAR_CHROMA_WIDTH_ORDER) * (height / PLANAR_CHROMA_HEIGHT_ORDER); for (int i = 0; i < numPlanes; i++) { checkCudaDrvErrors(cuCtxSynchronize()); - checkCudaErrors(cudaMemcpy2DToArray( - cudaArr[i], 0, 0, (void *)(pBuff + uvOffset[i]), copyWidthInBytes[i], - copyWidthInBytes[i], copyHeight[i], - cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy2DToArray(cudaArr[i], + 0, + 0, + (void *)(pBuff + uvOffset[i]), + copyWidthInBytes[i], + copyWidthInBytes[i], + copyHeight[i], + cudaMemcpyHostToDevice)); } if (fp) { @@ -374,61 +386,67 @@ void Caller::copyYUVToCudaArrayAndFlipBits(std::string &path, cudaArray_t *cudaA printf("Image %s copied to CUDA Array and bit flip done\n", path.c_str()); } -// Copy Cuda Array in YUV 420 format to a file -void Caller::copyCudaArrayToYUV(std::string &path, cudaArray_t *cudaArr) { - FILE *fp = NULL; - int bufferSize; - uint32_t width = multiPlanarWidth[0]; - uint32_t height = multiPlanarHeight[0]; - uint32_t copyWidthInBytes=0, copyHeight=0; +// Copy Cuda Array in YUV 420 format to a file +void Caller::copyCudaArrayToYUV(std::string &path, cudaArray_t *cudaArr) +{ + FILE *fp = NULL; + int bufferSize; + uint32_t width = multiPlanarWidth[0]; + uint32_t height = multiPlanarHeight[0]; + uint32_t copyWidthInBytes = 0, copyHeight = 0; uint8_t *pCudaCopyMem = NULL; - + fp = fopen(path.c_str(), "wb+"); if (!fp) { printf("WriteFrame: file open failed %s in %s line %d\n", path.c_str(), __FILE__, __LINE__); exit(EXIT_FAILURE); } - + for (int i = 0; i < numPlanes; i++) { if (i == 0) { - bufferSize = width * height; + bufferSize = width * height; copyWidthInBytes = width; - copyHeight = height; - + copyHeight = height; + pCudaCopyMem = (uint8_t *)malloc(bufferSize); if (pCudaCopyMem == NULL) { printf("pCudaCopyMem malloc failed in %s line %d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } - } + } else { - bufferSize = ((height / PLANAR_CHROMA_HEIGHT_ORDER) * (width / PLANAR_CHROMA_WIDTH_ORDER)); + bufferSize = ((height / PLANAR_CHROMA_HEIGHT_ORDER) * (width / PLANAR_CHROMA_WIDTH_ORDER)); copyWidthInBytes = width / PLANAR_CHROMA_WIDTH_ORDER; - copyHeight = height / PLANAR_CHROMA_HEIGHT_ORDER; + copyHeight = height / PLANAR_CHROMA_HEIGHT_ORDER; } memset(pCudaCopyMem, 0, bufferSize); - - checkCudaErrors(cudaMemcpy2DFromArray( - (void *)pCudaCopyMem, copyWidthInBytes, cudaArr[i], 0, 0, - copyWidthInBytes, copyHeight, - cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaMemcpy2DFromArray((void *)pCudaCopyMem, + copyWidthInBytes, + cudaArr[i], + 0, + 0, + copyWidthInBytes, + copyHeight, + cudaMemcpyDeviceToHost)); checkCudaDrvErrors(cuCtxSynchronize()); - + if (fwrite(pCudaCopyMem, bufferSize, 1, fp) != 1) { printf("Cuda consumer: output file write failed in %s line %d\n", __FILE__, __LINE__); - exit(EXIT_FAILURE); - } + exit(EXIT_FAILURE); + } } printf("Output file : %s saved\n", path.c_str()); - + if (fp) { fclose(fp); fp = NULL; } -} +} -void cudaNvSciBufMultiplanar::tearDown(Caller *caller1, Caller *caller2) { +void cudaNvSciBufMultiplanar::tearDown(Caller *caller1, Caller *caller2) +{ caller1->deinit(); caller2->deinit(); NvSciBufObjFree(buffObj); diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.h index 25a1d756..4822cae3 100644 --- 
a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/cudaNvSciBufMultiplanar.h @@ -27,61 +27,69 @@ #ifndef CUDA_NVSCIBUF_MULTIPLANAR_H #define CUDA_NVSCIBUF_MULTIPLANAR_H +#include #include +#include #include #include -#include -#include -#define PLANAR_NUM_PLANES 3 -#define PLANAR_CHROMA_WIDTH_ORDER 2 +#define PLANAR_NUM_PLANES 3 +#define PLANAR_CHROMA_WIDTH_ORDER 2 #define PLANAR_CHROMA_HEIGHT_ORDER 2 - -#define ATTR_SIZE 20 + +#define ATTR_SIZE 20 #define DEFAULT_GPU 0 -#define checkNvSciErrors(call) \ - do { \ - NvSciError _status = call; \ - if (NvSciError_Success != _status) { \ - printf( \ - "NVSCI call in file '%s' in line %i returned" \ - " %d, expected %d\n", \ - __FILE__, __LINE__, _status, NvSciError_Success); \ - fflush(stdout); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define checkNvSciErrors(call) \ + do { \ + NvSciError _status = call; \ + if (NvSciError_Success != _status) { \ + printf("NVSCI call in file '%s' in line %i returned" \ + " %d, expected %d\n", \ + __FILE__, \ + __LINE__, \ + _status, \ + NvSciError_Success); \ + fflush(stdout); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) #define checkCudaDrvErrors(call) \ - do { \ - CUresult err = call; \ - if (CUDA_SUCCESS != err) { \ - const char *errorStr = NULL; \ - cuGetErrorString(err, &errorStr); \ - printf( \ - "checkCudaDrvErrors() Driver API error" \ - " = %04d \"%s\" from file <%s>, " \ - "line %i.\n", \ - err, errorStr, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) + do { \ + CUresult err = call; \ + if (CUDA_SUCCESS != err) { \ + const char *errorStr = NULL; \ + cuGetErrorString(err, &errorStr); \ + printf("checkCudaDrvErrors() Driver API error" \ + " = %04d \"%s\" from file <%s>, " \ + "line %i.\n", \ + err, \ + errorStr, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -extern void launchFlipSurfaceBitsKernel(cudaArray_t *levelArray, int32_t *multiPlanarWidth, - int32_t *multiPlanarHeight, int numPlanes); +extern void launchFlipSurfaceBitsKernel(cudaArray_t *levelArray, + int32_t *multiPlanarWidth, + int32_t *multiPlanarHeight, + int numPlanes); -class Caller { +class Caller +{ private: - NvSciBufAttrList attrListOut; + NvSciBufAttrList attrListOut; NvSciBufAttrKeyValuePair pairArrayOut[ATTR_SIZE]; - cudaExternalMemory_t extMem; - int32_t numPlanes; + cudaExternalMemory_t extMem; + int32_t numPlanes; + public: - NvSciBufAttrList attrList; + NvSciBufAttrList attrList; cudaMipmappedArray_t multiPlanarArray[PLANAR_NUM_PLANES]; - int32_t multiPlanarWidth[PLANAR_NUM_PLANES]; - int32_t multiPlanarHeight[PLANAR_NUM_PLANES]; + int32_t multiPlanarWidth[PLANAR_NUM_PLANES]; + int32_t multiPlanarHeight[PLANAR_NUM_PLANES]; void init(); void deinit(); @@ -92,15 +100,17 @@ public: }; -class cudaNvSciBufMultiplanar { +class cudaNvSciBufMultiplanar +{ private: - size_t imageWidth; - size_t imageHeight; - int mCudaDeviceId; - int deviceCnt; + size_t imageWidth; + size_t imageHeight; + int mCudaDeviceId; + int deviceCnt; NvSciBufAttrList attrList[2]; NvSciBufAttrList attrListReconciled; NvSciBufAttrList attrListConflict; + public: cudaNvSciBufMultiplanar(size_t imageWidth, size_t imageHeight, std::vector &deviceIds); void initCuda(int devId); @@ -110,15 +120,15 @@ public: }; enum NvSciBufImageAttributes { - PLANE_SIZE, + PLANE_SIZE, PLANE_ALIGNED_SIZE, PLANE_OFFSET, - PLANE_HEIGHT, - PLANE_WIDTH, - PLANE_CHANNEL_COUNT, + PLANE_HEIGHT, + PLANE_WIDTH, + PLANE_CHANNEL_COUNT, 
PLANE_BITS_PER_PIXEL,
     PLANE_COUNT,
     PLANE_ATTR_SIZE
 };

-#endif  // CUDA_NVSCIBUF_MULTIPLANAR_H
+#endif // CUDA_NVSCIBUF_MULTIPLANAR_H
diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/imageKernels.cu b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/imageKernels.cu
index eaaed39b..89129170 100644
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/imageKernels.cu
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/imageKernels.cu
@@ -27,13 +27,14 @@
 #include
 #include

-static __global__ void flipSurfaceBits(cudaSurfaceObject_t surfObj, int width, int height) {
-  char data;
+static __global__ void flipSurfaceBits(cudaSurfaceObject_t surfObj, int width, int height)
+{
+    char data;
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
     if (x < width && y < height) {
         // Read from input surface
-        surf2Dread(&data, surfObj, x, y);
+        surf2Dread(&data, surfObj, x, y);
         // Write to output surface
         data = ~data;
         surf2Dwrite(data, surfObj, x, y);
@@ -41,24 +42,23 @@
 }

 // Copy cudaArray to surface memory and launch the CUDA kernel
-void launchFlipSurfaceBitsKernel(
-    cudaArray_t *levelArray,
-    int32_t *multiPlanarWidth,
-    int32_t *multiPlanarHeight,
-    int numPlanes) {
+void launchFlipSurfaceBitsKernel(cudaArray_t *levelArray,
+                                 int32_t *multiPlanarWidth,
+                                 int32_t *multiPlanarHeight,
+                                 int numPlanes)
+{
     cudaSurfaceObject_t surfObject[numPlanes] = {0};
-    cudaResourceDesc resDesc;
-
-    for (int i = 0; i < numPlanes; i++) {
+    cudaResourceDesc resDesc;
+
+    for (int i = 0; i < numPlanes; i++) {
         memset(&resDesc, 0, sizeof(resDesc));
-        resDesc.resType = cudaResourceTypeArray;
+        resDesc.resType = cudaResourceTypeArray;
         resDesc.res.array.array = levelArray[i];
         checkCudaErrors(cudaCreateSurfaceObject(&surfObject[i], &resDesc));
         dim3 threadsperBlock(16, 16);
         dim3 numBlocks((multiPlanarWidth[i] + threadsperBlock.x - 1) / threadsperBlock.x,
-                       (multiPlanarHeight[i] + threadsperBlock.y - 1) / threadsperBlock.y);
+                       (multiPlanarHeight[i] + threadsperBlock.y - 1) / threadsperBlock.y);
         flipSurfaceBits<<<numBlocks, threadsperBlock>>>(surfObject[i], multiPlanarWidth[i], multiPlanarHeight[i]);
     }
 }
-
diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/main.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/main.cpp
index d6ce1c2e..a5abf20b 100644
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/main.cpp
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/main.cpp
@@ -25,48 +25,48 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
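The launch configuration in the imageKernels.cu hunk above uses the standard ceil division so that partial edge tiles are still covered; the kernel's x/y bounds check discards the overhang. Worked out for the sample's default planes:

    dim3 threadsperBlock(16, 16); // 256 threads per block
    dim3 numBlocks((720 + threadsperBlock.x - 1) / threadsperBlock.x,  // 45
                   (480 + threadsperBlock.y - 1) / threadsperBlock.y); // 30
    // 45 x 30 = 1350 blocks for the 720x480 luma plane; the 360x240
    // chroma planes get 23 x 15 blocks by the same formula.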
 #include
-#include
-#include "cudaNvSciBufMultiplanar.h"
 #include
+#include
+
+#include "cudaNvSciBufMultiplanar.h"

 #define MAX_FILE_SIZE 100

-int main(int argc, const char **argv) {
-  int numOfGPUs = 0;
-  std::vector<int> deviceIds;
-  (cudaGetDeviceCount(&numOfGPUs));
+int main(int argc, const char **argv)
+{
+    int numOfGPUs = 0;
+    std::vector<int> deviceIds;
+    (cudaGetDeviceCount(&numOfGPUs));

-  printf("%d GPUs found\n", numOfGPUs);
-  if (!numOfGPUs) {
-    exit(EXIT_WAIVED);
-  } else {
-    for (int devID = 0; devID < numOfGPUs; devID++) {
-      int major = 0, minor = 0;
-      (cudaDeviceGetAttribute(
-          &major, cudaDevAttrComputeCapabilityMajor, devID));
-      (cudaDeviceGetAttribute(
-          &minor, cudaDevAttrComputeCapabilityMinor, devID));
-      if (major >= 6) {
-        deviceIds.push_back(devID);
-      }
+    printf("%d GPUs found\n", numOfGPUs);
+    if (!numOfGPUs) {
+        exit(EXIT_WAIVED);
     }
-    if (deviceIds.size() == 0) {
-      printf(
-          "cudaNvSciBufMultiplanar requires one or more GPUs of Pascal(SM 6.0) or higher "
-          "archs\nWaiving..\n");
-      exit(EXIT_WAIVED);
+    else {
+        for (int devID = 0; devID < numOfGPUs; devID++) {
+            int major = 0, minor = 0;
+            (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+            (cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+            if (major >= 6) {
+                deviceIds.push_back(devID);
+            }
+        }
+        if (deviceIds.size() == 0) {
+            printf("cudaNvSciBufMultiplanar requires one or more GPUs of Pascal(SM 6.0) or higher "
+                   "archs\nWaiving..\n");
+            exit(EXIT_WAIVED);
+        }
     }
-  }

-  std::string image_filename = sdkFindFilePath("yuv_planar_img1.yuv", argv[0]);
-  std::string image_filename_out = "image_out.yuv";
-  uint32_t imageWidth = 720;
-  uint32_t imageHeight = 480;
+    std::string image_filename = sdkFindFilePath("yuv_planar_img1.yuv", argv[0]);
+    std::string image_filename_out = "image_out.yuv";
+    uint32_t imageWidth = 720;
+    uint32_t imageHeight = 480;

-  printf("input image %s , width = %d, height = %d\n", image_filename.c_str(), imageWidth, imageHeight);
+    printf("input image %s , width = %d, height = %d\n", image_filename.c_str(), imageWidth, imageHeight);

-  cudaNvSciBufMultiplanar cudaNvSciBufMultiplanarApp(imageWidth, imageHeight, deviceIds);
-  cudaNvSciBufMultiplanarApp.runCudaNvSciBufPlanar(image_filename, image_filename_out);
+    cudaNvSciBufMultiplanar cudaNvSciBufMultiplanarApp(imageWidth, imageHeight, deviceIds);
+    cudaNvSciBufMultiplanarApp.runCudaNvSciBufPlanar(image_filename, image_filename_out);

-  return EXIT_SUCCESS;
-}
\ No newline at end of file
+    return EXIT_SUCCESS;
+}
diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/README.md b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/README.md
index 40ca89ed..9c25fd3e 100644
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/README.md
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/README.md
@@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.cu b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.cu
index 1a9279bc..d640a06b 100644
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.cu
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.cu
@@ -25,404 +25,382 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
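main() in the multiplanar sample above waives the run unless at least one Pascal-class (SM 6.0 or newer) device is present. The same selection loop, condensed into a standalone sketch that mirrors the calls used above:

    // Collect every device with compute capability >= 6.0; waive (do not
    // fail) when the list comes back empty, matching the sample's policy.
    int devCount = 0;
    cudaGetDeviceCount(&devCount);
    std::vector<int> usable;
    for (int dev = 0; dev < devCount; ++dev) {
        int major = 0;
        cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev);
        if (major >= 6) { // Pascal (SM 6.0) or newer
            usable.push_back(dev);
        }
    }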
*/ +#include +#include #include -#include #include "cuda_consumer.h" -#include #include "nvmedia_image_nvscibuf.h" #include "nvmedia_utils/cmdline.h" // Set this to 1 to write the CUDA-processed output to a PPM file. #define WRITE_OUTPUT_IMAGE 0 -#define checkNvSciErrors(call) \ - do { \ - NvSciError _status = call; \ - if (NvSciError_Success != _status) { \ - printf( \ - "NVSCI call in file '%s' in line %i returned" \ - " %d, expected %d\n", \ - __FILE__, __LINE__, _status, NvSciError_Success); \ - fflush(stdout); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define checkNvSciErrors(call) \ + do { \ + NvSciError _status = call; \ + if (NvSciError_Success != _status) { \ + printf("NVSCI call in file '%s' in line %i returned" \ + " %d, expected %d\n", \ + __FILE__, \ + __LINE__, \ + _status, \ + NvSciError_Success); \ + fflush(stdout); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -__global__ static void yuvToGrayscale(cudaSurfaceObject_t surfaceObject, - unsigned int *dstImage, - int32_t imageWidth, int32_t imageHeight) { - size_t x = blockIdx.x * blockDim.x + threadIdx.x; - size_t y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ static void +yuvToGrayscale(cudaSurfaceObject_t surfaceObject, unsigned int *dstImage, int32_t imageWidth, int32_t imageHeight) +{ + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; - uchar4 *dstImageUchar4 = (uchar4 *)dstImage; - for (; x < imageWidth && y < imageHeight; - x += gridDim.x * blockDim.x, y += gridDim.y * blockDim.y) { - int colInBytes = x * sizeof(unsigned char); - unsigned char luma = - surf2Dread<unsigned char>(surfaceObject, colInBytes, y); - uchar4 grayscalePix = make_uchar4(luma, luma, luma, 0); + uchar4 *dstImageUchar4 = (uchar4 *)dstImage; + for (; x < imageWidth && y < imageHeight; x += gridDim.x * blockDim.x, y += gridDim.y * blockDim.y) { + int colInBytes = x * sizeof(unsigned char); + unsigned char luma = surf2Dread<unsigned char>(surfaceObject, colInBytes, y); + uchar4 grayscalePix = make_uchar4(luma, luma, luma, 0); - dstImageUchar4[y * imageWidth + x] = grayscalePix; - } + dstImageUchar4[y * imageWidth + x] = grayscalePix; + } } -static void cudaImportNvSciSync(cudaExternalSemaphore_t &extSem, - NvSciSyncObj &syncObj) { - cudaExternalSemaphoreHandleDesc extSemDesc; - memset(&extSemDesc, 0, sizeof(extSemDesc)); - extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; - extSemDesc.handle.nvSciSyncObj = (void *)syncObj; +static void cudaImportNvSciSync(cudaExternalSemaphore_t &extSem, NvSciSyncObj &syncObj) +{ + cudaExternalSemaphoreHandleDesc extSemDesc; + memset(&extSemDesc, 0, sizeof(extSemDesc)); + extSemDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync; + extSemDesc.handle.nvSciSyncObj = (void *)syncObj; - checkCudaErrors(cudaImportExternalSemaphore(&extSem, &extSemDesc)); + checkCudaErrors(cudaImportExternalSemaphore(&extSem, &extSemDesc)); } -static void waitExternalSemaphore(cudaExternalSemaphore_t &waitSem, - NvSciSyncFence *fence, cudaStream_t stream) { - cudaExternalSemaphoreWaitParams waitParams; - memset(&waitParams, 0, sizeof(waitParams)); - // For cross-process signaler-waiter applications need to use NvSciIpc - // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence - // across process. This step is optional in single-process.
- waitParams.params.nvSciSync.fence = (void *)fence; - waitParams.flags = 0; +static void waitExternalSemaphore(cudaExternalSemaphore_t &waitSem, NvSciSyncFence *fence, cudaStream_t stream) +{ + cudaExternalSemaphoreWaitParams waitParams; + memset(&waitParams, 0, sizeof(waitParams)); + // Cross-process signaler-waiter applications need to use the NvSciIpc + // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence + // across processes. This step is optional in the single-process case. + waitParams.params.nvSciSync.fence = (void *)fence; + waitParams.flags = 0; - checkCudaErrors( - cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, stream)); + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, stream)); } -static void signalExternalSemaphore(cudaExternalSemaphore_t &signalSem, - NvSciSyncFence *fence, - cudaStream_t stream) { - cudaExternalSemaphoreSignalParams signalParams; - memset(&signalParams, 0, sizeof(signalParams)); - // For cross-process signaler-waiter applications need to use NvSciIpc - // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence - // across process. This step is optional in single-process. - signalParams.params.nvSciSync.fence = (void *)fence; - signalParams.flags = 0; +static void signalExternalSemaphore(cudaExternalSemaphore_t &signalSem, NvSciSyncFence *fence, cudaStream_t stream) +{ + cudaExternalSemaphoreSignalParams signalParams; + memset(&signalParams, 0, sizeof(signalParams)); + // Cross-process signaler-waiter applications need to use the NvSciIpc + // and NvSciSync[Export|Import] utilities to share the NvSciSyncFence + // across processes. This step is optional in the single-process case. + signalParams.params.nvSciSync.fence = (void *)fence; + signalParams.flags = 0; - checkCudaErrors( - cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams, 1, stream)); + checkCudaErrors(cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams, 1, stream)); } -static void yuvToGrayscaleCudaKernel(cudaExternalResInterop &cudaExtResObj, - int32_t imageWidth, int32_t imageHeight) { +static void yuvToGrayscaleCudaKernel(cudaExternalResInterop &cudaExtResObj, int32_t imageWidth, int32_t imageHeight) +{ #if WRITE_OUTPUT_IMAGE - unsigned int *h_dstImage; - checkCudaErrors(cudaMallocHost( - &h_dstImage, sizeof(unsigned int) * imageHeight * imageWidth)); + unsigned int *h_dstImage; + checkCudaErrors(cudaMallocHost(&h_dstImage, sizeof(unsigned int) * imageHeight * imageWidth)); #endif - dim3 block(16, 16, 1); - dim3 grid((imageWidth / block.x) + 1, (imageHeight / block.y) + 1, 1); + dim3 block(16, 16, 1); + dim3 grid((imageWidth / block.x) + 1, (imageHeight / block.y) + 1, 1); - yuvToGrayscale<<<grid, block, 0, cudaExtResObj.stream>>>( - cudaExtResObj.cudaSurfaceNvmediaBuf[0], cudaExtResObj.d_outputImage, - imageWidth, imageHeight); + yuvToGrayscale<<<grid, block, 0, cudaExtResObj.stream>>>( + cudaExtResObj.cudaSurfaceNvmediaBuf[0], cudaExtResObj.d_outputImage, imageWidth, imageHeight); #if WRITE_OUTPUT_IMAGE - checkCudaErrors( - cudaMemcpyAsync(h_dstImage, cudaExtResObj.d_outputImage, - sizeof(unsigned int) * imageHeight * imageWidth, - cudaMemcpyDeviceToHost, cudaExtResObj.stream)); - checkCudaErrors(cudaStreamSynchronize(cudaExtResObj.stream)); - char outputFilename[1024]; - std::string image_filename = "Grayscale"; - strcpy(outputFilename, image_filename.c_str()); - strcpy(outputFilename + image_filename.length(), "_nvsci_out.ppm"); - sdkSavePPM4ub(outputFilename, (unsigned char *)h_dstImage, imageWidth, - imageHeight); - printf("Wrote '%s'\n", outputFilename); - checkCudaErrors(cudaFreeHost(h_dstImage)); +
checkCudaErrors(cudaMemcpyAsync(h_dstImage, + cudaExtResObj.d_outputImage, + sizeof(unsigned int) * imageHeight * imageWidth, + cudaMemcpyDeviceToHost, + cudaExtResObj.stream)); + checkCudaErrors(cudaStreamSynchronize(cudaExtResObj.stream)); + char outputFilename[1024]; + std::string image_filename = "Grayscale"; + strcpy(outputFilename, image_filename.c_str()); + strcpy(outputFilename + image_filename.length(), "_nvsci_out.ppm"); + sdkSavePPM4ub(outputFilename, (unsigned char *)h_dstImage, imageWidth, imageHeight); + printf("Wrote '%s'\n", outputFilename); + checkCudaErrors(cudaFreeHost(h_dstImage)); #endif } -static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj, - NvSciBufObj &inputBufObj) { - NvSciBufModule module = NULL; - NvSciBufAttrList attrlist = NULL; - NvSciBufAttrKeyValuePair pairArrayOut[10]; +static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj, NvSciBufObj &inputBufObj) +{ + NvSciBufModule module = NULL; + NvSciBufAttrList attrlist = NULL; + NvSciBufAttrKeyValuePair pairArrayOut[10]; - checkNvSciErrors(NvSciBufModuleOpen(&module)); - checkNvSciErrors(NvSciBufAttrListCreate(module, &attrlist)); - checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &attrlist)); + checkNvSciErrors(NvSciBufModuleOpen(&module)); + checkNvSciErrors(NvSciBufAttrListCreate(module, &attrlist)); + checkNvSciErrors(NvSciBufObjGetAttrList(inputBufObj, &attrlist)); - memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10); + memset(pairArrayOut, 0, sizeof(NvSciBufAttrKeyValuePair) * 10); - int numAttrs = 0; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Size; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneChannelCount; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneCount; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneWidth; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneHeight; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Layout; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneBitsPerPixel; - pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneOffset; + int numAttrs = 0; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Size; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneChannelCount; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneCount; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneWidth; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneHeight; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Layout; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneBitsPerPixel; + pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneOffset; - checkNvSciErrors(NvSciBufAttrListGetAttrs(attrlist, pairArrayOut, numAttrs)); + checkNvSciErrors(NvSciBufAttrListGetAttrs(attrlist, pairArrayOut, numAttrs)); - uint64_t size = *(uint64_t *)pairArrayOut[0].value; - uint8_t channelCount = *(uint8_t *)pairArrayOut[1].value; - cudaExtResObj.planeCount = *(int32_t *)pairArrayOut[2].value; - cudaExtResObj.imageWidth = - (int32_t *)malloc(sizeof(int32_t) * cudaExtResObj.planeCount); - cudaExtResObj.imageHeight = - (int32_t *)malloc(sizeof(int32_t) * cudaExtResObj.planeCount); - cudaExtResObj.planeOffset = - (uint64_t *)malloc(sizeof(uint64_t) * cudaExtResObj.planeCount); + uint64_t size = *(uint64_t *)pairArrayOut[0].value; + uint8_t channelCount = *(uint8_t *)pairArrayOut[1].value; + cudaExtResObj.planeCount = *(int32_t *)pairArrayOut[2].value; + cudaExtResObj.imageWidth = (int32_t *)malloc(sizeof(int32_t) * cudaExtResObj.planeCount); + 
cudaExtResObj.imageHeight = (int32_t *)malloc(sizeof(int32_t) * cudaExtResObj.planeCount); + cudaExtResObj.planeOffset = (uint64_t *)malloc(sizeof(uint64_t) * cudaExtResObj.planeCount); - memcpy(cudaExtResObj.imageWidth, (int32_t *)pairArrayOut[3].value, - cudaExtResObj.planeCount * sizeof(int32_t)); - memcpy(cudaExtResObj.imageHeight, (int32_t *)pairArrayOut[4].value, - cudaExtResObj.planeCount * sizeof(int32_t)); - memcpy(cudaExtResObj.planeOffset, (uint64_t *)pairArrayOut[7].value, - cudaExtResObj.planeCount * sizeof(uint64_t)); + memcpy(cudaExtResObj.imageWidth, (int32_t *)pairArrayOut[3].value, cudaExtResObj.planeCount * sizeof(int32_t)); + memcpy(cudaExtResObj.imageHeight, (int32_t *)pairArrayOut[4].value, cudaExtResObj.planeCount * sizeof(int32_t)); + memcpy(cudaExtResObj.planeOffset, (uint64_t *)pairArrayOut[7].value, cudaExtResObj.planeCount * sizeof(uint64_t)); - NvSciBufAttrValImageLayoutType layout = - *(NvSciBufAttrValImageLayoutType *)pairArrayOut[5].value; - uint32_t bitsPerPixel = *(uint32_t *)pairArrayOut[6].value; + NvSciBufAttrValImageLayoutType layout = *(NvSciBufAttrValImageLayoutType *)pairArrayOut[5].value; + uint32_t bitsPerPixel = *(uint32_t *)pairArrayOut[6].value; - if (layout != NvSciBufImage_BlockLinearType) { - printf("Image layout is not block linear.. waiving execution\n"); - exit(EXIT_WAIVED); - } - - cudaExternalMemoryHandleDesc memHandleDesc; - memset(&memHandleDesc, 0, sizeof(memHandleDesc)); - memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; - memHandleDesc.handle.nvSciBufObject = inputBufObj; - memHandleDesc.size = size; - checkCudaErrors( - cudaImportExternalMemory(&cudaExtResObj.extMemImageBuf, &memHandleDesc)); - - cudaExtResObj.d_mipmapArray = (cudaMipmappedArray_t *)malloc( - sizeof(cudaMipmappedArray_t) * cudaExtResObj.planeCount); - - for (int i = 0; i < cudaExtResObj.planeCount; i++) { - cudaExtent extent = {}; - memset(&extent, 0, sizeof(extent)); - extent.width = cudaExtResObj.imageWidth[i]; - extent.height = cudaExtResObj.imageHeight[i]; - extent.depth = 0; - cudaChannelFormatDesc desc; - switch (channelCount) { - case 1: - default: - desc = cudaCreateChannelDesc(bitsPerPixel, 0, 0, 0, - cudaChannelFormatKindUnsigned); - break; - case 2: - desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, 0, 0, - cudaChannelFormatKindUnsigned); - break; - case 3: - desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, bitsPerPixel, - 0, cudaChannelFormatKindUnsigned); - break; - case 4: - desc = - cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, bitsPerPixel, - bitsPerPixel, cudaChannelFormatKindUnsigned); - break; + if (layout != NvSciBufImage_BlockLinearType) { + printf("Image layout is not block linear.. 
waiving execution\n"); + exit(EXIT_WAIVED); } - cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0}; - mipmapDesc.offset = cudaExtResObj.planeOffset[i]; - mipmapDesc.formatDesc = desc; - mipmapDesc.extent = extent; - mipmapDesc.flags = 0; - mipmapDesc.numLevels = 1; - checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray( - &cudaExtResObj.d_mipmapArray[i], cudaExtResObj.extMemImageBuf, - &mipmapDesc)); - } + cudaExternalMemoryHandleDesc memHandleDesc; + memset(&memHandleDesc, 0, sizeof(memHandleDesc)); + memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf; + memHandleDesc.handle.nvSciBufObject = inputBufObj; + memHandleDesc.size = size; + checkCudaErrors(cudaImportExternalMemory(&cudaExtResObj.extMemImageBuf, &memHandleDesc)); + + cudaExtResObj.d_mipmapArray = + (cudaMipmappedArray_t *)malloc(sizeof(cudaMipmappedArray_t) * cudaExtResObj.planeCount); + + for (int i = 0; i < cudaExtResObj.planeCount; i++) { + cudaExtent extent = {}; + memset(&extent, 0, sizeof(extent)); + extent.width = cudaExtResObj.imageWidth[i]; + extent.height = cudaExtResObj.imageHeight[i]; + extent.depth = 0; + cudaChannelFormatDesc desc; + switch (channelCount) { + case 1: + default: + desc = cudaCreateChannelDesc(bitsPerPixel, 0, 0, 0, cudaChannelFormatKindUnsigned); + break; + case 2: + desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, 0, 0, cudaChannelFormatKindUnsigned); + break; + case 3: + desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, bitsPerPixel, 0, cudaChannelFormatKindUnsigned); + break; + case 4: + desc = cudaCreateChannelDesc( + bitsPerPixel, bitsPerPixel, bitsPerPixel, bitsPerPixel, cudaChannelFormatKindUnsigned); + break; + } + + cudaExternalMemoryMipmappedArrayDesc mipmapDesc = {0}; + mipmapDesc.offset = cudaExtResObj.planeOffset[i]; + mipmapDesc.formatDesc = desc; + mipmapDesc.extent = extent; + mipmapDesc.flags = 0; + mipmapDesc.numLevels = 1; + checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray( + &cudaExtResObj.d_mipmapArray[i], cudaExtResObj.extMemImageBuf, &mipmapDesc)); + } } -static cudaSurfaceObject_t createCudaSurface(cudaArray_t &d_mipLevelArray) { - cudaResourceDesc resourceDesc; - memset(&resourceDesc, 0, sizeof(resourceDesc)); - resourceDesc.resType = cudaResourceTypeArray; - resourceDesc.res.array.array = d_mipLevelArray; +static cudaSurfaceObject_t createCudaSurface(cudaArray_t &d_mipLevelArray) +{ + cudaResourceDesc resourceDesc; + memset(&resourceDesc, 0, sizeof(resourceDesc)); + resourceDesc.resType = cudaResourceTypeArray; + resourceDesc.res.array.array = d_mipLevelArray; - cudaSurfaceObject_t surfaceObject; - checkCudaErrors(cudaCreateSurfaceObject(&surfaceObject, &resourceDesc)); - return surfaceObject; + cudaSurfaceObject_t surfaceObject; + checkCudaErrors(cudaCreateSurfaceObject(&surfaceObject, &resourceDesc)); + return surfaceObject; } -static cudaStream_t createCudaStream(int deviceId) { - checkCudaErrors(cudaSetDevice(deviceId)); - cudaStream_t stream; - checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - return stream; +static cudaStream_t createCudaStream(int deviceId) +{ + checkCudaErrors(cudaSetDevice(deviceId)); + cudaStream_t stream; + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + return stream; } // CUDA setup buffers/synchronization objects for interop via NvSci API. 
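For orientation, here is a minimal sketch of how the consumer-side helpers in this file are driven; consumeFrames and its parameter names are illustrative stand-ins, and the NvSciBuf/NvSciSync objects and fences are assumed to have been created and reconciled by the sample's NvSci setup code:

#include "cuda_consumer.h"

// Hypothetical driver for the helpers defined in cuda_consumer.cu. The NvSci
// objects come from the sample's setup code; iterations mirrors the
// command-line option consumed by main().
static void consumeFrames(NvSciBufObj    &bufObj,
                          NvSciSyncObj   &nvMediaSignalerObj,
                          NvSciSyncObj   &cudaSignalerObj,
                          NvSciSyncFence *waitFence,
                          NvSciSyncFence *signalFence,
                          int             deviceId,
                          int             iterations)
{
    cudaExternalResInterop interop;
    setupCuda(interop, bufObj, nvMediaSignalerObj, cudaSignalerObj, deviceId); // import buffer + semaphores
    for (int i = 0; i < iterations; i++) {
        // NvMedia signals *waitFence after its blit; CUDA waits on it, runs the
        // grayscale kernel, then signals *signalFence for the next blit.
        runCudaOperation(interop, waitFence, signalFence, deviceId, iterations);
    }
    cleanupCuda(interop); // destroy surfaces and semaphores, free arrays, destroy the stream
}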
-void setupCuda(cudaExternalResInterop &cudaExtResObj, NvSciBufObj &inputBufObj, - NvSciSyncObj &syncObj, NvSciSyncObj &cudaSignalerSyncObj, - int deviceId) { - checkCudaErrors(cudaSetDevice(deviceId)); - cudaImportNvSciSync(cudaExtResObj.waitSem, syncObj); - cudaImportNvSciSync(cudaExtResObj.signalSem, cudaSignalerSyncObj); +void setupCuda(cudaExternalResInterop &cudaExtResObj, + NvSciBufObj &inputBufObj, + NvSciSyncObj &syncObj, + NvSciSyncObj &cudaSignalerSyncObj, + int deviceId) +{ + checkCudaErrors(cudaSetDevice(deviceId)); + cudaImportNvSciSync(cudaExtResObj.waitSem, syncObj); + cudaImportNvSciSync(cudaExtResObj.signalSem, cudaSignalerSyncObj); - cudaImportNvSciImage(cudaExtResObj, inputBufObj); - cudaExtResObj.d_mipLevelArray = - (cudaArray_t *)malloc(sizeof(cudaArray_t) * cudaExtResObj.planeCount); - cudaExtResObj.cudaSurfaceNvmediaBuf = (cudaSurfaceObject_t *)malloc( - sizeof(cudaSurfaceObject_t) * cudaExtResObj.planeCount); + cudaImportNvSciImage(cudaExtResObj, inputBufObj); + cudaExtResObj.d_mipLevelArray = (cudaArray_t *)malloc(sizeof(cudaArray_t) * cudaExtResObj.planeCount); + cudaExtResObj.cudaSurfaceNvmediaBuf = + (cudaSurfaceObject_t *)malloc(sizeof(cudaSurfaceObject_t) * cudaExtResObj.planeCount); - for (int i = 0; i < cudaExtResObj.planeCount; ++i) { - uint32_t mipLevelId = 0; - checkCudaErrors( - cudaGetMipmappedArrayLevel(&cudaExtResObj.d_mipLevelArray[i], - cudaExtResObj.d_mipmapArray[i], mipLevelId)); - cudaExtResObj.cudaSurfaceNvmediaBuf[i] = - createCudaSurface(cudaExtResObj.d_mipLevelArray[i]); - } + for (int i = 0; i < cudaExtResObj.planeCount; ++i) { + uint32_t mipLevelId = 0; + checkCudaErrors( + cudaGetMipmappedArrayLevel(&cudaExtResObj.d_mipLevelArray[i], cudaExtResObj.d_mipmapArray[i], mipLevelId)); + cudaExtResObj.cudaSurfaceNvmediaBuf[i] = createCudaSurface(cudaExtResObj.d_mipLevelArray[i]); + } - cudaExtResObj.stream = createCudaStream(deviceId); - checkCudaErrors(cudaMalloc(&cudaExtResObj.d_outputImage, - sizeof(unsigned int) * - cudaExtResObj.imageWidth[0] * - cudaExtResObj.imageHeight[0])); + cudaExtResObj.stream = createCudaStream(deviceId); + checkCudaErrors(cudaMalloc(&cudaExtResObj.d_outputImage, + sizeof(unsigned int) * cudaExtResObj.imageWidth[0] * cudaExtResObj.imageHeight[0])); } // CUDA clean up buffers used **with** NvSci API. 
-void cleanupCuda(cudaExternalResInterop &cudaExtResObj) { - for (int i = 0; i < cudaExtResObj.planeCount; i++) { - checkCudaErrors( - cudaDestroySurfaceObject(cudaExtResObj.cudaSurfaceNvmediaBuf[i])); - checkCudaErrors(cudaFreeMipmappedArray(cudaExtResObj.d_mipmapArray[i])); - } - free(cudaExtResObj.d_mipmapArray); - free(cudaExtResObj.d_mipLevelArray); - free(cudaExtResObj.cudaSurfaceNvmediaBuf); - free(cudaExtResObj.imageWidth); - free(cudaExtResObj.imageHeight); - checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtResObj.waitSem)); - checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtResObj.signalSem)); - checkCudaErrors(cudaDestroyExternalMemory(cudaExtResObj.extMemImageBuf)); - checkCudaErrors(cudaStreamDestroy(cudaExtResObj.stream)); - checkCudaErrors(cudaFree(cudaExtResObj.d_outputImage)); +void cleanupCuda(cudaExternalResInterop &cudaExtResObj) +{ + for (int i = 0; i < cudaExtResObj.planeCount; i++) { + checkCudaErrors(cudaDestroySurfaceObject(cudaExtResObj.cudaSurfaceNvmediaBuf[i])); + checkCudaErrors(cudaFreeMipmappedArray(cudaExtResObj.d_mipmapArray[i])); + } + free(cudaExtResObj.d_mipmapArray); + free(cudaExtResObj.d_mipLevelArray); + free(cudaExtResObj.cudaSurfaceNvmediaBuf); + free(cudaExtResObj.imageWidth); + free(cudaExtResObj.imageHeight); + checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtResObj.waitSem)); + checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtResObj.signalSem)); + checkCudaErrors(cudaDestroyExternalMemory(cudaExtResObj.extMemImageBuf)); + checkCudaErrors(cudaStreamDestroy(cudaExtResObj.stream)); + checkCudaErrors(cudaFree(cudaExtResObj.d_outputImage)); } void runCudaOperation(cudaExternalResInterop &cudaExtResObj, - NvSciSyncFence *cudaWaitFence, - NvSciSyncFence *cudaSignalFence, int deviceId, - int iterations) { - checkCudaErrors(cudaSetDevice(deviceId)); - static int64_t launch = 0; + NvSciSyncFence *cudaWaitFence, + NvSciSyncFence *cudaSignalFence, + int deviceId, + int iterations) +{ + checkCudaErrors(cudaSetDevice(deviceId)); + static int64_t launch = 0; - waitExternalSemaphore(cudaExtResObj.waitSem, cudaWaitFence, - cudaExtResObj.stream); + waitExternalSemaphore(cudaExtResObj.waitSem, cudaWaitFence, cudaExtResObj.stream); - // run cuda kernel over surface object of the LUMA surface part to extract - // grayscale. - yuvToGrayscaleCudaKernel(cudaExtResObj, cudaExtResObj.imageWidth[0], - cudaExtResObj.imageHeight[0]); + // run cuda kernel over surface object of the LUMA surface part to extract + // grayscale. 
+ yuvToGrayscaleCudaKernel(cudaExtResObj, cudaExtResObj.imageWidth[0], cudaExtResObj.imageHeight[0]); - // signal fence till the second last iterations for NvMedia2DBlit to wait for - // cuda signal and for final iteration as there is no corresponding NvMedia - // operation pending therefore we end with cudaStreamSynchronize() - if (launch < iterations - 1) { - signalExternalSemaphore(cudaExtResObj.signalSem, cudaSignalFence, - cudaExtResObj.stream); - } else { - checkCudaErrors(cudaStreamSynchronize(cudaExtResObj.stream)); - } - launch++; + // Signal the fence up to the second-to-last iteration so that NvMedia2DBlit + // can wait on the CUDA signal; the final iteration has no corresponding + // NvMedia operation pending, so we end with cudaStreamSynchronize() instead. + if (launch < iterations - 1) { + signalExternalSemaphore(cudaExtResObj.signalSem, cudaSignalFence, cudaExtResObj.stream); + } + else { + checkCudaErrors(cudaStreamSynchronize(cudaExtResObj.stream)); + } + launch++; } // CUDA imports and operates on NvSci buffer/synchronization objects -void setupCuda(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId) { - checkCudaErrors(cudaSetDevice(deviceId)); - cudaResObj.d_yuvArray = - (cudaArray_t *)malloc(sizeof(cudaArray_t) * ctx->numSurfaces); - cudaResObj.cudaSurfaceNvmediaBuf = (cudaSurfaceObject_t *)malloc( - sizeof(cudaSurfaceObject_t) * ctx->numSurfaces); - cudaChannelFormatDesc channelDesc; - switch (ctx->bytesPerPixel) { +void setupCuda(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId) +{ + checkCudaErrors(cudaSetDevice(deviceId)); + cudaResObj.d_yuvArray = (cudaArray_t *)malloc(sizeof(cudaArray_t) * ctx->numSurfaces); + cudaResObj.cudaSurfaceNvmediaBuf = (cudaSurfaceObject_t *)malloc(sizeof(cudaSurfaceObject_t) * ctx->numSurfaces); + cudaChannelFormatDesc channelDesc; + switch (ctx->bytesPerPixel) { case 1: default: - channelDesc = - cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned); - break; - } + channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned); + break; + } - for (int k = 0; k < ctx->numSurfaces; k++) { - checkCudaErrors(cudaMallocArray( - &cudaResObj.d_yuvArray[k], &channelDesc, - ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel, - ctx->heightSurface * ctx->yScalePtr[k])); - cudaResObj.cudaSurfaceNvmediaBuf[k] = - createCudaSurface(cudaResObj.d_yuvArray[k]); - } - checkCudaErrors(cudaMalloc( - &cudaResObj.d_outputImage, - sizeof(unsigned int) * ctx->widthSurface * ctx->heightSurface)); + for (int k = 0; k < ctx->numSurfaces; k++) { + checkCudaErrors(cudaMallocArray(&cudaResObj.d_yuvArray[k], + &channelDesc, + ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel, + ctx->heightSurface * ctx->yScalePtr[k])); + cudaResObj.cudaSurfaceNvmediaBuf[k] = createCudaSurface(cudaResObj.d_yuvArray[k]); + } + checkCudaErrors( + cudaMalloc(&cudaResObj.d_outputImage, sizeof(unsigned int) * ctx->widthSurface * ctx->heightSurface)); - cudaResObj.stream = createCudaStream(deviceId); + cudaResObj.stream = createCudaStream(deviceId); } // CUDA clean up buffers used **without** NvSci API.
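Taken together, the wait and signal helpers above implement one CUDA-side fence round trip per iteration. A condensed sketch of that pattern for the single-process case; waitThenSignal and its parameter names are illustrative, not part of the sample:

// Sketch: wait on the fence NvMedia signaled, do stream-ordered work,
// then signal the fence NvMedia will wait on next.
static void waitThenSignal(cudaExternalSemaphore_t &waitSem,
                           cudaExternalSemaphore_t &signalSem,
                           NvSciSyncFence          *nvMediaFence,
                           NvSciSyncFence          *cudaFence,
                           cudaStream_t             stream)
{
    cudaExternalSemaphoreWaitParams waitParams;
    memset(&waitParams, 0, sizeof(waitParams));
    waitParams.params.nvSciSync.fence = (void *)nvMediaFence; // signaled by NvMedia
    checkCudaErrors(cudaWaitExternalSemaphoresAsync(&waitSem, &waitParams, 1, stream));

    // ... kernel launches on the imported surface go here, on the same stream ...

    cudaExternalSemaphoreSignalParams signalParams;
    memset(&signalParams, 0, sizeof(signalParams));
    signalParams.params.nvSciSync.fence = (void *)cudaFence; // for NvMedia to wait on
    checkCudaErrors(cudaSignalExternalSemaphoresAsync(&signalSem, &signalParams, 1, stream));
}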
-void cleanupCuda(Blit2DTest *ctx, cudaResources &cudaResObj) { - for (int k = 0; k < ctx->numSurfaces; k++) { - checkCudaErrors( - cudaDestroySurfaceObject(cudaResObj.cudaSurfaceNvmediaBuf[k])); - checkCudaErrors(cudaFreeArray(cudaResObj.d_yuvArray[k])); - } +void cleanupCuda(Blit2DTest *ctx, cudaResources &cudaResObj) +{ + for (int k = 0; k < ctx->numSurfaces; k++) { + checkCudaErrors(cudaDestroySurfaceObject(cudaResObj.cudaSurfaceNvmediaBuf[k])); + checkCudaErrors(cudaFreeArray(cudaResObj.d_yuvArray[k])); + } - free(cudaResObj.cudaSurfaceNvmediaBuf); + free(cudaResObj.cudaSurfaceNvmediaBuf); - checkCudaErrors(cudaStreamDestroy(cudaResObj.stream)); - checkCudaErrors(cudaFree(cudaResObj.d_outputImage)); + checkCudaErrors(cudaStreamDestroy(cudaResObj.stream)); + checkCudaErrors(cudaFree(cudaResObj.d_outputImage)); } -static void yuvToGrayscaleCudaKernelNonNvSci(cudaResources &cudaResObj, - int deviceId, int32_t imageWidth, - int32_t imageHeight) { +static void +yuvToGrayscaleCudaKernelNonNvSci(cudaResources &cudaResObj, int deviceId, int32_t imageWidth, int32_t imageHeight) +{ #if WRITE_OUTPUT_IMAGE - unsigned int *h_dstImage; - checkCudaErrors(cudaMallocHost( - &h_dstImage, sizeof(unsigned int) * imageHeight * imageWidth)); + unsigned int *h_dstImage; + checkCudaErrors(cudaMallocHost(&h_dstImage, sizeof(unsigned int) * imageHeight * imageWidth)); #endif - dim3 block(16, 16, 1); - dim3 grid((imageWidth / block.x) + 1, (imageHeight / block.y) + 1, 1); + dim3 block(16, 16, 1); + dim3 grid((imageWidth / block.x) + 1, (imageHeight / block.y) + 1, 1); - yuvToGrayscale<<<grid, block, 0, cudaResObj.stream>>>( - cudaResObj.cudaSurfaceNvmediaBuf[0], cudaResObj.d_outputImage, imageWidth, - imageHeight); + yuvToGrayscale<<<grid, block, 0, cudaResObj.stream>>>( + cudaResObj.cudaSurfaceNvmediaBuf[0], cudaResObj.d_outputImage, imageWidth, imageHeight); #if WRITE_OUTPUT_IMAGE - checkCudaErrors( - cudaMemcpyAsync(h_dstImage, cudaResObj.d_outputImage, - sizeof(unsigned int) * imageHeight * imageWidth, - cudaMemcpyDeviceToHost, cudaResObj.stream)); - checkCudaErrors(cudaStreamSynchronize(cudaResObj.stream)); - char outputFilename[1024]; - std::string image_filename = "Grayscale"; - strcpy(outputFilename, image_filename.c_str()); - strcpy(outputFilename + image_filename.length(), "_non-nvsci_out.ppm"); - sdkSavePPM4ub(outputFilename, (unsigned char *)h_dstImage, imageWidth, - imageHeight); - printf("Wrote '%s'\n", outputFilename); - checkCudaErrors(cudaFreeHost(h_dstImage)); + checkCudaErrors(cudaMemcpyAsync(h_dstImage, + cudaResObj.d_outputImage, + sizeof(unsigned int) * imageHeight * imageWidth, + cudaMemcpyDeviceToHost, + cudaResObj.stream)); + checkCudaErrors(cudaStreamSynchronize(cudaResObj.stream)); + char outputFilename[1024]; + std::string image_filename = "Grayscale"; + strcpy(outputFilename, image_filename.c_str()); + strcpy(outputFilename + image_filename.length(), "_non-nvsci_out.ppm"); + sdkSavePPM4ub(outputFilename, (unsigned char *)h_dstImage, imageWidth, imageHeight); + printf("Wrote '%s'\n", outputFilename); + checkCudaErrors(cudaFreeHost(h_dstImage)); #else - checkCudaErrors(cudaStreamSynchronize(cudaResObj.stream)); + checkCudaErrors(cudaStreamSynchronize(cudaResObj.stream)); #endif } // CUDA operates **without** NvSci APIs buffer/synchronization objects.
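One aside on the grid sizing used by both grayscale launch helpers: (imageWidth / block.x) + 1 allocates one extra block per axis whenever the dimension is an exact multiple of the block size, which the guarded loop inside yuvToGrayscale tolerates. The exact ceiling-division form, as used by launchFlipSurfaceBitsKernel earlier in this patch, would be:

// Sketch: exact grid sizing via ceiling division, with the same names as above.
dim3 block(16, 16, 1);
dim3 grid((imageWidth + block.x - 1) / block.x,
          (imageHeight + block.y - 1) / block.y,
          1);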
-void runCudaOperation(Blit2DTest *ctx, cudaResources &cudaResObj, - int deviceId) { - for (int k = 0; k < ctx->numSurfaces; k++) { - checkCudaErrors(cudaMemcpy2DToArray( - cudaResObj.d_yuvArray[k], 0, 0, ctx->dstBuff[k], - ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel, - ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel, - ctx->heightSurface * ctx->yScalePtr[k], cudaMemcpyHostToDevice)); - } - // run cuda kernel over surface object of the LUMA surface part to extract - // grayscale. - yuvToGrayscaleCudaKernelNonNvSci(cudaResObj, deviceId, ctx->widthSurface, - ctx->heightSurface); +void runCudaOperation(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId) +{ + for (int k = 0; k < ctx->numSurfaces; k++) { + checkCudaErrors(cudaMemcpy2DToArray(cudaResObj.d_yuvArray[k], + 0, + 0, + ctx->dstBuff[k], + ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel, + ctx->widthSurface * ctx->xScalePtr[k] * ctx->bytesPerPixel, + ctx->heightSurface * ctx->yScalePtr[k], + cudaMemcpyHostToDevice)); + } + // run cuda kernel over surface object of the LUMA surface part to extract + // grayscale. + yuvToGrayscaleCudaKernelNonNvSci(cudaResObj, deviceId, ctx->widthSurface, ctx->heightSurface); } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.h index 067e2484..74b9b22a 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/cuda_consumer.h @@ -29,42 +29,49 @@ #define __CUDA_BUFIMPORT_KERNEL_H__ #include + #include "helper_cuda.h" #include "nvmedia_image_nvscibuf.h" -#include "nvscisync.h" #include "nvmedia_utils/cmdline.h" +#include "nvscisync.h" -struct cudaExternalResInterop { - cudaMipmappedArray_t *d_mipmapArray; - cudaArray_t *d_mipLevelArray; - cudaSurfaceObject_t *cudaSurfaceNvmediaBuf; - cudaStream_t stream; - cudaExternalMemory_t extMemImageBuf; - cudaExternalSemaphore_t waitSem; - cudaExternalSemaphore_t signalSem; +struct cudaExternalResInterop +{ + cudaMipmappedArray_t *d_mipmapArray; + cudaArray_t *d_mipLevelArray; + cudaSurfaceObject_t *cudaSurfaceNvmediaBuf; + cudaStream_t stream; + cudaExternalMemory_t extMemImageBuf; + cudaExternalSemaphore_t waitSem; + cudaExternalSemaphore_t signalSem; - int32_t planeCount; - uint64_t *planeOffset; - int32_t *imageWidth; - int32_t *imageHeight; - unsigned int *d_outputImage; + int32_t planeCount; + uint64_t *planeOffset; + int32_t *imageWidth; + int32_t *imageHeight; + unsigned int *d_outputImage; }; -struct cudaResources { - cudaArray_t *d_yuvArray; - cudaStream_t stream; - cudaSurfaceObject_t *cudaSurfaceNvmediaBuf; - unsigned int *d_outputImage; +struct cudaResources +{ + cudaArray_t *d_yuvArray; + cudaStream_t stream; + cudaSurfaceObject_t *cudaSurfaceNvmediaBuf; + unsigned int *d_outputImage; }; void runCudaOperation(cudaExternalResInterop &cudaExtResObj, - NvSciSyncFence *fence, NvSciSyncFence *cudaSignalfence, - int deviceId, int iterations); + NvSciSyncFence *fence, + NvSciSyncFence *cudaSignalfence, + int deviceId, + int iterations); void runCudaOperation(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId); -void setupCuda(cudaExternalResInterop &cudaExtResObj, NvSciBufObj &inputBufObj, - NvSciSyncObj &syncObj, NvSciSyncObj &cudaSignalerSyncObj, - int deviceId); +void setupCuda(cudaExternalResInterop &cudaExtResObj, + NvSciBufObj &inputBufObj, + NvSciSyncObj &syncObj, + NvSciSyncObj &cudaSignalerSyncObj, + int deviceId); void 
setupCuda(Blit2DTest *ctx, cudaResources &cudaResObj, int deviceId); void cleanupCuda(cudaExternalResInterop &cudaObjs); void cleanupCuda(Blit2DTest *ctx, cudaResources &cudaResObj); diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/main.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/main.cpp index 6bcbdabb..1287e908 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/main.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/main.cpp @@ -26,185 +26,186 @@ */ /* standard headers */ -#include #include #include +#include #include /* Nvidia headers */ #include -#include "nvmedia_utils/cmdline.h" -#include "nvmedia_image.h" + +#include "cuda_consumer.h" #include "nvmedia_2d.h" #include "nvmedia_2d_nvscisync.h" -#include "nvmedia_surface.h" -#include "nvmedia_utils/image_utils.h" +#include "nvmedia_image.h" #include "nvmedia_image_nvscibuf.h" -#include "cuda_consumer.h" #include "nvmedia_producer.h" +#include "nvmedia_surface.h" +#include "nvmedia_utils/cmdline.h" +#include "nvmedia_utils/image_utils.h" #include "nvsci_setup.h" -#define checkNvSciErrors(call) \ - do { \ - NvSciError _status = call; \ - if (NvSciError_Success != _status) { \ - printf( \ - "NVSCI call in file '%s' in line %i returned" \ - " %d, expected %d\n", \ - __FILE__, __LINE__, _status, NvSciError_Success); \ - fflush(stdout); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define checkNvSciErrors(call) \ + do { \ + NvSciError _status = call; \ + if (NvSciError_Success != _status) { \ + printf("NVSCI call in file '%s' in line %i returned" \ + " %d, expected %d\n", \ + __FILE__, \ + __LINE__, \ + _status, \ + NvSciError_Success); \ + fflush(stdout); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -static void cleanup(Blit2DTest* ctx, NvMediaStatus status) { - if (ctx->i2d != NULL) { - NvMedia2DDestroy(ctx->i2d); - } - - if (ctx->device != NULL) { - NvMediaDeviceDestroy(ctx->device); - } - if (status != NVMEDIA_STATUS_OK) { - exit(EXIT_FAILURE); - } -} - -int main(int argc, char* argv[]) { - TestArgs args; - Blit2DTest ctx; - NvMediaStatus status = NVMEDIA_STATUS_ERROR; - NvSciSyncFence nvMediaSignalerFence = NvSciSyncFenceInitializer; - NvSciSyncFence cudaSignalerFence = NvSciSyncFenceInitializer; - - int cudaDeviceId; - uint64_t startTime, endTime; - uint64_t operationStartTime, operationEndTime; - double processingTime; - - /* Read configuration from command line and config file */ - memset(&args, 0, sizeof(TestArgs)); - memset(&ctx, 0, sizeof(Blit2DTest)); - - /* ParseArgs parses the command line and the 2D configuration file and - * populates all initParams and run time configuration in to appropriate - * structures within args - */ - if (ParseArgs(argc, argv, &args)) { - PrintUsage(); - return -1; - } - /* Check version */ - NvMediaVersion version; - status = NvMedia2DGetVersion(&version); - if (status == NVMEDIA_STATUS_OK) { - printf("Library version: %u.%u\n", version.major, version.minor); - printf("Header version: %u.%u\n", NVMEDIA_2D_VERSION_MAJOR, - NVMEDIA_2D_VERSION_MINOR); - if ((version.major != NVMEDIA_2D_VERSION_MAJOR) || - (version.minor != NVMEDIA_2D_VERSION_MINOR)) { - printf("Library and Header mismatch!\n"); - cleanup(&ctx, status); +static void cleanup(Blit2DTest *ctx, NvMediaStatus status) +{ + if (ctx->i2d != NULL) { + NvMedia2DDestroy(ctx->i2d); } - } - // Create NvMedia device - ctx.device = NvMediaDeviceCreate(); - if (!ctx.device) { - printf("%s: Failed to create NvMedia device\n", __func__); - cleanup(&ctx, status); - } - - // Create 2D blitter - ctx.i2d 
= NvMedia2DCreate(ctx.device); - if (!ctx.i2d) { - printf("%s: Failed to create NvMedia 2D i2d\n", __func__); - cleanup(&ctx, status); - } - - cudaDeviceId = findCudaDevice(argc, (const char**)argv); - - // NvMedia-CUDA operations without NvSCI APIs starts - cudaResources cudaResObj; - GetTimeMicroSec(&startTime); - setupNvMedia(&args, &ctx); - setupCuda(&ctx, cudaResObj, cudaDeviceId); - - GetTimeMicroSec(&operationStartTime); - for (int i = 0; i < args.iterations; i++) { - runNvMediaBlit2D(&args, &ctx); - runCudaOperation(&ctx, cudaResObj, cudaDeviceId); - } - GetTimeMicroSec(&operationEndTime); - - cleanupNvMedia(&ctx); - cleanupCuda(&ctx, cudaResObj); - GetTimeMicroSec(&endTime); - // NvMedia-CUDA operations without NvSCI APIs ends - - processingTime = (double)(operationEndTime - operationStartTime) / 1000.0; - printf( - "Overall Processing time of NvMedia-CUDA Operations without NvSCI APIs " - "%.4f ms with %zu iterations\n", - processingTime, args.iterations); - processingTime = (double)(endTime - startTime) / 1000.0; - printf( - "Overall Processing time of NvMedia-CUDA Operations + allocation/cleanup " - "without NvSCI APIs %.4f ms with %zu iterations\n", - processingTime, args.iterations); - - NvSciBufObj dstNvSciBufobj, srcNvSciBufobj; - NvSciSyncObj nvMediaSignalerSyncObj, cudaSignalerSyncObj; - cudaExternalResInterop cudaExtResObj; - // NvMedia-CUDA operations via interop with NvSCI APIs starts - GetTimeMicroSec(&startTime); - setupNvMediaSignalerNvSciSync(&ctx, nvMediaSignalerSyncObj, cudaDeviceId); - setupCudaSignalerNvSciSync(&ctx, cudaSignalerSyncObj, cudaDeviceId); - setupNvMedia(&args, &ctx, srcNvSciBufobj, dstNvSciBufobj, - nvMediaSignalerSyncObj, cudaSignalerSyncObj, cudaDeviceId); - setupCuda(cudaExtResObj, dstNvSciBufobj, nvMediaSignalerSyncObj, - cudaSignalerSyncObj, cudaDeviceId); - - GetTimeMicroSec(&operationStartTime); - for (int i = 0; i < args.iterations; i++) { - runNvMediaBlit2D(&args, &ctx, nvMediaSignalerSyncObj, &cudaSignalerFence, - &nvMediaSignalerFence); - runCudaOperation(cudaExtResObj, &nvMediaSignalerFence, &cudaSignalerFence, - cudaDeviceId, args.iterations); - } - GetTimeMicroSec(&operationEndTime); - - cleanupNvMedia(&ctx, nvMediaSignalerSyncObj, cudaSignalerSyncObj); - cleanupCuda(cudaExtResObj); - cleanupNvSciSync(nvMediaSignalerSyncObj); - cleanupNvSciSync(cudaSignalerSyncObj); - cleanupNvSciBuf(srcNvSciBufobj); - cleanupNvSciBuf(dstNvSciBufobj); - GetTimeMicroSec(&endTime); - // NvMedia-CUDA operations via interop with NvSCI APIs ends - - processingTime = (double)(operationEndTime - operationStartTime) / 1000.0; - printf( - "Overall Processing time of NvMedia-CUDA Operations with NvSCI APIs %.4f " - "ms with %zu iterations\n", - processingTime, args.iterations); - processingTime = (double)(endTime - startTime) / 1000.0; - printf( - "Overall Processing time of NvMedia-CUDA Operations + allocation/cleanup " - "with NvSCI APIs %.4f ms with %zu iterations\n", - processingTime, args.iterations); - - if (ctx.i2d != NULL) { - NvMedia2DDestroy(ctx.i2d); - } - - if (ctx.device != NULL) { - NvMediaDeviceDestroy(ctx.device); - } - - if (status == NVMEDIA_STATUS_OK) { - return 0; - } else { - return 1; - } + if (ctx->device != NULL) { + NvMediaDeviceDestroy(ctx->device); + } + if (status != NVMEDIA_STATUS_OK) { + exit(EXIT_FAILURE); + } +} + +int main(int argc, char *argv[]) +{ + TestArgs args; + Blit2DTest ctx; + NvMediaStatus status = NVMEDIA_STATUS_ERROR; + NvSciSyncFence nvMediaSignalerFence = NvSciSyncFenceInitializer; + NvSciSyncFence 
cudaSignalerFence = NvSciSyncFenceInitializer; + + int cudaDeviceId; + uint64_t startTime, endTime; + uint64_t operationStartTime, operationEndTime; + double processingTime; + + /* Read configuration from command line and config file */ + memset(&args, 0, sizeof(TestArgs)); + memset(&ctx, 0, sizeof(Blit2DTest)); + + /* ParseArgs parses the command line and the 2D configuration file and + * populates all initParams and run time configuration in to appropriate + * structures within args + */ + if (ParseArgs(argc, argv, &args)) { + PrintUsage(); + return -1; + } + /* Check version */ + NvMediaVersion version; + status = NvMedia2DGetVersion(&version); + if (status == NVMEDIA_STATUS_OK) { + printf("Library version: %u.%u\n", version.major, version.minor); + printf("Header version: %u.%u\n", NVMEDIA_2D_VERSION_MAJOR, NVMEDIA_2D_VERSION_MINOR); + if ((version.major != NVMEDIA_2D_VERSION_MAJOR) || (version.minor != NVMEDIA_2D_VERSION_MINOR)) { + printf("Library and Header mismatch!\n"); + cleanup(&ctx, status); + } + } + + // Create NvMedia device + ctx.device = NvMediaDeviceCreate(); + if (!ctx.device) { + printf("%s: Failed to create NvMedia device\n", __func__); + cleanup(&ctx, status); + } + + // Create 2D blitter + ctx.i2d = NvMedia2DCreate(ctx.device); + if (!ctx.i2d) { + printf("%s: Failed to create NvMedia 2D i2d\n", __func__); + cleanup(&ctx, status); + } + + cudaDeviceId = findCudaDevice(argc, (const char **)argv); + + // NvMedia-CUDA operations without NvSCI APIs starts + cudaResources cudaResObj; + GetTimeMicroSec(&startTime); + setupNvMedia(&args, &ctx); + setupCuda(&ctx, cudaResObj, cudaDeviceId); + + GetTimeMicroSec(&operationStartTime); + for (int i = 0; i < args.iterations; i++) { + runNvMediaBlit2D(&args, &ctx); + runCudaOperation(&ctx, cudaResObj, cudaDeviceId); + } + GetTimeMicroSec(&operationEndTime); + + cleanupNvMedia(&ctx); + cleanupCuda(&ctx, cudaResObj); + GetTimeMicroSec(&endTime); + // NvMedia-CUDA operations without NvSCI APIs ends + + processingTime = (double)(operationEndTime - operationStartTime) / 1000.0; + printf("Overall Processing time of NvMedia-CUDA Operations without NvSCI APIs " + "%.4f ms with %zu iterations\n", + processingTime, + args.iterations); + processingTime = (double)(endTime - startTime) / 1000.0; + printf("Overall Processing time of NvMedia-CUDA Operations + allocation/cleanup " + "without NvSCI APIs %.4f ms with %zu iterations\n", + processingTime, + args.iterations); + + NvSciBufObj dstNvSciBufobj, srcNvSciBufobj; + NvSciSyncObj nvMediaSignalerSyncObj, cudaSignalerSyncObj; + cudaExternalResInterop cudaExtResObj; + // NvMedia-CUDA operations via interop with NvSCI APIs starts + GetTimeMicroSec(&startTime); + setupNvMediaSignalerNvSciSync(&ctx, nvMediaSignalerSyncObj, cudaDeviceId); + setupCudaSignalerNvSciSync(&ctx, cudaSignalerSyncObj, cudaDeviceId); + setupNvMedia( + &args, &ctx, srcNvSciBufobj, dstNvSciBufobj, nvMediaSignalerSyncObj, cudaSignalerSyncObj, cudaDeviceId); + setupCuda(cudaExtResObj, dstNvSciBufobj, nvMediaSignalerSyncObj, cudaSignalerSyncObj, cudaDeviceId); + + GetTimeMicroSec(&operationStartTime); + for (int i = 0; i < args.iterations; i++) { + runNvMediaBlit2D(&args, &ctx, nvMediaSignalerSyncObj, &cudaSignalerFence, &nvMediaSignalerFence); + runCudaOperation(cudaExtResObj, &nvMediaSignalerFence, &cudaSignalerFence, cudaDeviceId, args.iterations); + } + GetTimeMicroSec(&operationEndTime); + + cleanupNvMedia(&ctx, nvMediaSignalerSyncObj, cudaSignalerSyncObj); + cleanupCuda(cudaExtResObj); + 
cleanupNvSciSync(nvMediaSignalerSyncObj); + cleanupNvSciSync(cudaSignalerSyncObj); + cleanupNvSciBuf(srcNvSciBufobj); + cleanupNvSciBuf(dstNvSciBufobj); + GetTimeMicroSec(&endTime); + // NvMedia-CUDA operations via interop with NvSCI APIs ends + + processingTime = (double)(operationEndTime - operationStartTime) / 1000.0; + printf("Overall Processing time of NvMedia-CUDA Operations with NvSCI APIs %.4f " + "ms with %zu iterations\n", + processingTime, + args.iterations); + processingTime = (double)(endTime - startTime) / 1000.0; + printf("Overall Processing time of NvMedia-CUDA Operations + allocation/cleanup " + "with NvSCI APIs %.4f ms with %zu iterations\n", + processingTime, + args.iterations); + + if (ctx.i2d != NULL) { + NvMedia2DDestroy(ctx.i2d); + } + + if (ctx.device != NULL) { + NvMediaDeviceDestroy(ctx.device); + } + + if (status == NVMEDIA_STATUS_OK) { + return 0; + } + else { + return 1; + } } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.cpp index b6d9dafb..db60fd76 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.cpp @@ -25,447 +25,464 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include +#include /* Nvidia headers */ -#include "nvmedia_utils/cmdline.h" -#include "nvmedia_image.h" #include "nvmedia_2d.h" -#include "nvmedia_surface.h" -#include "nvmedia_utils/image_utils.h" +#include "nvmedia_2d_nvscisync.h" +#include "nvmedia_image.h" #include "nvmedia_image_nvscibuf.h" #include "nvmedia_producer.h" -#include "nvmedia_2d_nvscisync.h" +#include "nvmedia_surface.h" +#include "nvmedia_utils/cmdline.h" +#include "nvmedia_utils/image_utils.h" #include "nvsci_setup.h" -NvMediaImage *NvMediaImageCreateUsingNvScibuf(NvMediaDevice *device, - NvMediaSurfaceType type, +NvMediaImage *NvMediaImageCreateUsingNvScibuf(NvMediaDevice *device, + NvMediaSurfaceType type, const NvMediaSurfAllocAttr *attrs, - uint32_t numAttrs, uint32_t flags, - NvSciBufObj &bufobj, - int cudaDeviceId) { - NvSciBufModule module = NULL; - NvSciError err = NvSciError_Success; - NvMediaStatus status = NVMEDIA_STATUS_OK; - NvSciBufAttrList attrlist = NULL; - NvSciBufAttrList conflictlist = NULL; - NvSciBufAttrValAccessPerm access_perm = NvSciBufAccessPerm_ReadWrite; - NvSciBufAttrKeyValuePair attr_kvp = {NvSciBufGeneralAttrKey_RequiredPerm, - &access_perm, sizeof(access_perm)}; - NvSciBufAttrKeyValuePair pairArrayOut[10]; + uint32_t numAttrs, + uint32_t flags, + NvSciBufObj &bufobj, + int cudaDeviceId) +{ + NvSciBufModule module = NULL; + NvSciError err = NvSciError_Success; + NvMediaStatus status = NVMEDIA_STATUS_OK; + NvSciBufAttrList attrlist = NULL; + NvSciBufAttrList conflictlist = NULL; + NvSciBufAttrValAccessPerm access_perm = NvSciBufAccessPerm_ReadWrite; + NvSciBufAttrKeyValuePair attr_kvp = {NvSciBufGeneralAttrKey_RequiredPerm, &access_perm, sizeof(access_perm)}; + NvSciBufAttrKeyValuePair pairArrayOut[10]; - NvMediaImage *image = NULL; + NvMediaImage *image = NULL; - err = NvSciBufModuleOpen(&module); - if (err != NvSciError_Success) { - printf("%s: NvSciBuffModuleOpen failed. Error: %d \n", __func__, err); - goto fail_cleanup; - } + err = NvSciBufModuleOpen(&module); + if (err != NvSciError_Success) { + printf("%s: NvSciBuffModuleOpen failed. 
Error: %d \n", __func__, err); + goto fail_cleanup; + } - err = NvSciBufAttrListCreate(module, &attrlist); - if (err != NvSciError_Success) { - printf("%s: SciBufAttrListCreate failed. Error: %d \n", __func__, err); - goto fail_cleanup; - } + err = NvSciBufAttrListCreate(module, &attrlist); + if (err != NvSciError_Success) { + printf("%s: SciBufAttrListCreate failed. Error: %d \n", __func__, err); + goto fail_cleanup; + } - err = NvSciBufAttrListSetAttrs(attrlist, &attr_kvp, 1); - if (err != NvSciError_Success) { - printf("%s: AccessPermSetAttr failed. Error: %d \n", __func__, err); - goto fail_cleanup; - } + err = NvSciBufAttrListSetAttrs(attrlist, &attr_kvp, 1); + if (err != NvSciError_Success) { + printf("%s: AccessPermSetAttr failed. Error: %d \n", __func__, err); + goto fail_cleanup; + } - status = - NvMediaImageFillNvSciBufAttrs(device, type, attrs, numAttrs, 0, attrlist); + status = NvMediaImageFillNvSciBufAttrs(device, type, attrs, numAttrs, 0, attrlist); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: ImageFillSciBufAttrs failed. Error: %d \n", __func__, err); - goto fail_cleanup; - } + if (status != NVMEDIA_STATUS_OK) { + printf("%s: ImageFillSciBufAttrs failed. Error: %d \n", __func__, err); + goto fail_cleanup; + } - setupNvSciBuf(bufobj, attrlist, cudaDeviceId); + setupNvSciBuf(bufobj, attrlist, cudaDeviceId); - status = NvMediaImageCreateFromNvSciBuf(device, bufobj, &image); + status = NvMediaImageCreateFromNvSciBuf(device, bufobj, &image); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: ImageCreatefromSciBuf failed. Error: %d \n", __func__, err); - goto fail_cleanup; - } + if (status != NVMEDIA_STATUS_OK) { + printf("%s: ImageCreatefromSciBuf failed. Error: %d \n", __func__, err); + goto fail_cleanup; + } - NvSciBufAttrListFree(attrlist); + NvSciBufAttrListFree(attrlist); - if (module != NULL) { - NvSciBufModuleClose(module); - } + if (module != NULL) { + NvSciBufModuleClose(module); + } - return image; + return image; fail_cleanup: - if (attrlist != NULL) { - NvSciBufAttrListFree(attrlist); - } - if (bufobj != NULL) { - NvSciBufObjFree(bufobj); - bufobj = NULL; - } + if (attrlist != NULL) { + NvSciBufAttrListFree(attrlist); + } + if (bufobj != NULL) { + NvSciBufObjFree(bufobj); + bufobj = NULL; + } - if (module != NULL) { - NvSciBufModuleClose(module); - } - NvMediaImageDestroy(image); - return NULL; + if (module != NULL) { + NvSciBufModuleClose(module); + } + NvMediaImageDestroy(image); + return NULL; } /* Create NvMediaImage surface based on the input attributes. 
* Returns NVMEDIA_STATUS_OK on success */ -static NvMediaStatus createSurface(Blit2DTest *ctx, +static NvMediaStatus createSurface(Blit2DTest *ctx, NvMediaSurfFormatAttr *surfFormatAttrs, - NvMediaSurfAllocAttr *surfAllocAttrs, - uint32_t numSurfAllocAttrs, - NvMediaImage **image, NvSciBufObj &bufObj, - int cudaDeviceId) { - NvMediaSurfaceType surfType; + NvMediaSurfAllocAttr *surfAllocAttrs, + uint32_t numSurfAllocAttrs, + NvMediaImage **image, + NvSciBufObj &bufObj, + int cudaDeviceId) +{ + NvMediaSurfaceType surfType; - /* create source image */ - surfType = - NvMediaSurfaceFormatGetType(surfFormatAttrs, NVM_SURF_FMT_ATTR_MAX); - *image = NvMediaImageCreateUsingNvScibuf(ctx->device, /* device */ - surfType, /* surface type */ - surfAllocAttrs, numSurfAllocAttrs, 0, - bufObj, cudaDeviceId); + /* create source image */ + surfType = NvMediaSurfaceFormatGetType(surfFormatAttrs, NVM_SURF_FMT_ATTR_MAX); + *image = NvMediaImageCreateUsingNvScibuf(ctx->device, /* device */ + surfType, /* surface type */ + surfAllocAttrs, + numSurfAllocAttrs, + 0, + bufObj, + cudaDeviceId); - if (*image == NULL) { - printf("Unable to create image\n"); - return NVMEDIA_STATUS_ERROR; - } - InitImage(*image, surfAllocAttrs[0].value, surfAllocAttrs[1].value); + if (*image == NULL) { + printf("Unable to create image\n"); + return NVMEDIA_STATUS_ERROR; + } + InitImage(*image, surfAllocAttrs[0].value, surfAllocAttrs[1].value); - /* printf("%s: NvMediaImageCreate:: Image size: %ux%u Image type: %d\n", - __func__, surfAllocAttrs[0].value, surfAllocAttrs[1].value, - surfType);*/ + /* printf("%s: NvMediaImageCreate:: Image size: %ux%u Image type: %d\n", + __func__, surfAllocAttrs[0].value, surfAllocAttrs[1].value, + surfType);*/ - return NVMEDIA_STATUS_OK; + return NVMEDIA_STATUS_OK; } /* Create NvMediaImage surface based on the input attributes. 
* Returns NVMEDIA_STATUS_OK on success */ -static NvMediaStatus createSurfaceNonNvSCI( - Blit2DTest *ctx, NvMediaSurfFormatAttr *surfFormatAttrs, - NvMediaSurfAllocAttr *surfAllocAttrs, uint32_t numSurfAllocAttrs, - NvMediaImage **image) { - NvMediaSurfaceType surfType; +static NvMediaStatus createSurfaceNonNvSCI(Blit2DTest *ctx, + NvMediaSurfFormatAttr *surfFormatAttrs, + NvMediaSurfAllocAttr *surfAllocAttrs, + uint32_t numSurfAllocAttrs, + NvMediaImage **image) +{ + NvMediaSurfaceType surfType; - /* create source image */ - surfType = - NvMediaSurfaceFormatGetType(surfFormatAttrs, NVM_SURF_FMT_ATTR_MAX); + /* create source image */ + surfType = NvMediaSurfaceFormatGetType(surfFormatAttrs, NVM_SURF_FMT_ATTR_MAX); - *image = NvMediaImageCreateNew(ctx->device, surfType, surfAllocAttrs, - numSurfAllocAttrs, 0); + *image = NvMediaImageCreateNew(ctx->device, surfType, surfAllocAttrs, numSurfAllocAttrs, 0); - if (*image == NULL) { - printf("Unable to create image\n"); - return NVMEDIA_STATUS_ERROR; - } - InitImage(*image, surfAllocAttrs[0].value, surfAllocAttrs[1].value); + if (*image == NULL) { + printf("Unable to create image\n"); + return NVMEDIA_STATUS_ERROR; + } + InitImage(*image, surfAllocAttrs[0].value, surfAllocAttrs[1].value); - /* printf("%s: NvMediaImageCreate:: Image size: %ux%u Image type: %d\n", - __func__, surfAllocAttrs[0].value, surfAllocAttrs[1].value, - surfType);*/ + /* printf("%s: NvMediaImageCreate:: Image size: %ux%u Image type: %d\n", + __func__, surfAllocAttrs[0].value, surfAllocAttrs[1].value, + surfType);*/ - return NVMEDIA_STATUS_OK; + return NVMEDIA_STATUS_OK; } static void destroySurface(NvMediaImage *image) { NvMediaImageDestroy(image); } -static NvMediaStatus blit2DImage(Blit2DTest *ctx, TestArgs *args, - NvSciSyncObj &nvMediaSignalerSyncObj, +static NvMediaStatus blit2DImage(Blit2DTest *ctx, + TestArgs *args, + NvSciSyncObj &nvMediaSignalerSyncObj, NvSciSyncFence *preSyncFence, - NvSciSyncFence *fence) { - NvMediaStatus status; - NvMediaImageSurfaceMap surfaceMap; + NvSciSyncFence *fence) +{ + NvMediaStatus status; + NvMediaImageSurfaceMap surfaceMap; - status = ReadImage(args->inputFileName, /* fileName */ - 0, /* frameNum */ - args->srcSurfAllocAttrs[0].value, /* source image width */ - args->srcSurfAllocAttrs[1].value, /* source image height */ - ctx->srcImage, /* srcImage */ - NVMEDIA_TRUE, /* uvOrderFlag */ - 1, /* bytesPerPixel */ - MSB_ALIGNED); /* pixelAlignment */ + status = ReadImage(args->inputFileName, /* fileName */ + 0, /* frameNum */ + args->srcSurfAllocAttrs[0].value, /* source image width */ + args->srcSurfAllocAttrs[1].value, /* source image height */ + ctx->srcImage, /* srcImage */ + NVMEDIA_TRUE, /* uvOrderFlag */ + 1, /* bytesPerPixel */ + MSB_ALIGNED); /* pixelAlignment */ - if (status != NVMEDIA_STATUS_OK) { - printf("%s: ReadImage failed for input buffer: %d\n", __func__, status); - return status; - } - - if ((args->srcRect.x1 <= args->srcRect.x0) || - (args->srcRect.y1 <= args->srcRect.y0)) { - ctx->srcRect = NULL; - } else { - ctx->srcRect = &(args->srcRect); - } - - if ((args->dstRect.x1 <= args->dstRect.x0) || - (args->dstRect.y1 <= args->dstRect.y0)) { - ctx->dstRect = NULL; - } else { - ctx->dstRect = &(args->dstRect); - } - - static int64_t launch = 0; - // Start inserting pre-fence from second launch inorder to for NvMedia2Blit to - // wait - // for cuda signal on fence. 
- if (launch) { - status = NvMedia2DInsertPreNvSciSyncFence(ctx->i2d, preSyncFence); if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DSetNvSciSyncObjforEOF failed: %d\n", __func__, - status); - return status; + printf("%s: ReadImage failed for input buffer: %d\n", __func__, status); + return status; } - NvSciSyncFenceClear(preSyncFence); - } - launch++; - status = NvMedia2DSetNvSciSyncObjforEOF(ctx->i2d, nvMediaSignalerSyncObj); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DSetNvSciSyncObjforEOF failed: %d\n", __func__, - status); - return status; - } + if ((args->srcRect.x1 <= args->srcRect.x0) || (args->srcRect.y1 <= args->srcRect.y0)) { + ctx->srcRect = NULL; + } + else { + ctx->srcRect = &(args->srcRect); + } - /* 2DBlit processing on input image */ - status = NvMedia2DBlitEx(ctx->i2d, /* i2d */ - ctx->dstImage, /* dstSurface */ - ctx->dstRect, /* dstRect */ - ctx->srcImage, /* srcSurface */ - ctx->srcRect, /* srcRect */ - &args->blitParams, /* params */ - NULL); /* paramsOut */ + if ((args->dstRect.x1 <= args->dstRect.x0) || (args->dstRect.y1 <= args->dstRect.y0)) { + ctx->dstRect = NULL; + } + else { + ctx->dstRect = &(args->dstRect); + } - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DBlitEx failed: %d\n", __func__, status); - return status; - } + static int64_t launch = 0; + // Insert a pre-fence from the second launch onward so that the NvMedia 2D + // blit waits for the CUDA signal on the fence. + if (launch) { + status = NvMedia2DInsertPreNvSciSyncFence(ctx->i2d, preSyncFence); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DInsertPreNvSciSyncFence failed: %d\n", __func__, status); + return status; + } + NvSciSyncFenceClear(preSyncFence); + } + launch++; - status = - NvMedia2DGetEOFNvSciSyncFence(ctx->i2d, nvMediaSignalerSyncObj, fence); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DGetEOFNvSciSyncFence failed: %d\n", __func__, status); - return status; - } + status = NvMedia2DSetNvSciSyncObjforEOF(ctx->i2d, nvMediaSignalerSyncObj); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DSetNvSciSyncObjforEOF failed: %d\n", __func__, status); + return status; + } - return NVMEDIA_STATUS_OK; + /* 2DBlit processing on input image */ + status = NvMedia2DBlitEx(ctx->i2d, /* i2d */ + ctx->dstImage, /* dstSurface */ + ctx->dstRect, /* dstRect */ + ctx->srcImage, /* srcSurface */ + ctx->srcRect, /* srcRect */ + &args->blitParams, /* params */ + NULL); /* paramsOut */ + + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DBlitEx failed: %d\n", __func__, status); + return status; + } + + status = NvMedia2DGetEOFNvSciSyncFence(ctx->i2d, nvMediaSignalerSyncObj, fence); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DGetEOFNvSciSyncFence failed: %d\n", __func__, status); + return status; + } + + return NVMEDIA_STATUS_OK; } -static NvMediaStatus blit2DImageNonNvSCI(Blit2DTest *ctx, TestArgs *args) { - NvMediaStatus status; - NvMediaImageSurfaceMap surfaceMap; +static NvMediaStatus blit2DImageNonNvSCI(Blit2DTest *ctx, TestArgs *args) +{ + NvMediaStatus status; + NvMediaImageSurfaceMap surfaceMap; - status = ReadImage(args->inputFileName, /* fileName */ - 0, /* frameNum */ - args->srcSurfAllocAttrs[0].value, /* source image width */ - args->srcSurfAllocAttrs[1].value, /* source image height */ - ctx->srcImage, /* srcImage */ - NVMEDIA_TRUE, /* uvOrderFlag */ - 1, /* bytesPerPixel */ - MSB_ALIGNED); /* pixelAlignment */ + status = ReadImage(args->inputFileName, /* fileName */ + 0, /* frameNum */
args->srcSurfAllocAttrs[0].value, /* source image width */ + args->srcSurfAllocAttrs[1].value, /* source image height */ + ctx->srcImage, /* srcImage */ + NVMEDIA_TRUE, /* uvOrderFlag */ + 1, /* bytesPerPixel */ + MSB_ALIGNED); /* pixelAlignment */ - if (status != NVMEDIA_STATUS_OK) { - printf("%s: ReadImage failed for input buffer: %d\n", __func__, status); - return status; - } + if (status != NVMEDIA_STATUS_OK) { + printf("%s: ReadImage failed for input buffer: %d\n", __func__, status); + return status; + } - if ((args->srcRect.x1 <= args->srcRect.x0) || - (args->srcRect.y1 <= args->srcRect.y0)) { - ctx->srcRect = NULL; - } else { - ctx->srcRect = &(args->srcRect); - } + if ((args->srcRect.x1 <= args->srcRect.x0) || (args->srcRect.y1 <= args->srcRect.y0)) { + ctx->srcRect = NULL; + } + else { + ctx->srcRect = &(args->srcRect); + } - if ((args->dstRect.x1 <= args->dstRect.x0) || - (args->dstRect.y1 <= args->dstRect.y0)) { - ctx->dstRect = NULL; - } else { - ctx->dstRect = &(args->dstRect); - } + if ((args->dstRect.x1 <= args->dstRect.x0) || (args->dstRect.y1 <= args->dstRect.y0)) { + ctx->dstRect = NULL; + } + else { + ctx->dstRect = &(args->dstRect); + } - /* 2DBlit processing on input image */ - status = NvMedia2DBlitEx(ctx->i2d, /* i2d */ - ctx->dstImage, /* dstSurface */ - ctx->dstRect, /* dstRect */ - ctx->srcImage, /* srcSurface */ - ctx->srcRect, /* srcRect */ - &args->blitParams, /* params */ - NULL); /* paramsOut */ - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DBlitEx failed: %d\n", __func__, status); - return status; - } + /* 2DBlit processing on input image */ + status = NvMedia2DBlitEx(ctx->i2d, /* i2d */ + ctx->dstImage, /* dstSurface */ + ctx->dstRect, /* dstRect */ + ctx->srcImage, /* srcSurface */ + ctx->srcRect, /* srcRect */ + &args->blitParams, /* params */ + NULL); /* paramsOut */ + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DBlitEx failed: %d\n", __func__, status); + return status; + } - /* Write output image into buffer */ - ctx->bytesPerPixel = 1; - WriteImageToAllocatedBuffer(ctx, ctx->dstImage, NVMEDIA_TRUE, NVMEDIA_FALSE, - ctx->bytesPerPixel); + /* Write output image into buffer */ + ctx->bytesPerPixel = 1; + WriteImageToAllocatedBuffer(ctx, ctx->dstImage, NVMEDIA_TRUE, NVMEDIA_FALSE, ctx->bytesPerPixel); - return NVMEDIA_STATUS_OK; + return NVMEDIA_STATUS_OK; } -static void cleanup(Blit2DTest *ctx, NvMediaStatus status = NVMEDIA_STATUS_OK) { - if (ctx->srcImage != NULL) { - NvMedia2DImageUnRegister(ctx->i2d, ctx->srcImage); - destroySurface(ctx->srcImage); - } - if (ctx->dstImage != NULL) { - NvMedia2DImageUnRegister(ctx->i2d, ctx->dstImage); - destroySurface(ctx->dstImage); - } - if (status != NVMEDIA_STATUS_OK) { - exit(EXIT_FAILURE); - } +static void cleanup(Blit2DTest *ctx, NvMediaStatus status = NVMEDIA_STATUS_OK) +{ + if (ctx->srcImage != NULL) { + NvMedia2DImageUnRegister(ctx->i2d, ctx->srcImage); + destroySurface(ctx->srcImage); + } + if (ctx->dstImage != NULL) { + NvMedia2DImageUnRegister(ctx->i2d, ctx->dstImage); + destroySurface(ctx->dstImage); + } + if (status != NVMEDIA_STATUS_OK) { + exit(EXIT_FAILURE); + } } -void cleanupNvMedia(Blit2DTest *ctx, NvSciSyncObj &syncObj, - NvSciSyncObj &preSyncObj) { - NvMediaStatus status; - cleanup(ctx); - status = NvMedia2DUnregisterNvSciSyncObj(ctx->i2d, syncObj); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMediaImageSciBufInit failed\n", __func__); - exit(EXIT_FAILURE); - } - status = NvMedia2DUnregisterNvSciSyncObj(ctx->i2d, preSyncObj); - if (status != NVMEDIA_STATUS_OK) { 
- printf("%s: NvMediaImageSciBufInit failed\n", __func__); - exit(EXIT_FAILURE); - } - NvMediaImageNvSciBufDeinit(); +void cleanupNvMedia(Blit2DTest *ctx, NvSciSyncObj &syncObj, NvSciSyncObj &preSyncObj) +{ + NvMediaStatus status; + cleanup(ctx); + status = NvMedia2DUnregisterNvSciSyncObj(ctx->i2d, syncObj); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMediaImageSciBufInit failed\n", __func__); + exit(EXIT_FAILURE); + } + status = NvMedia2DUnregisterNvSciSyncObj(ctx->i2d, preSyncObj); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMediaImageSciBufInit failed\n", __func__); + exit(EXIT_FAILURE); + } + NvMediaImageNvSciBufDeinit(); } -void cleanupNvMedia(Blit2DTest *ctx) { - cleanup(ctx); - free(ctx->dstBuffPitches); - free(ctx->dstBuffer); - free(ctx->dstBuff); +void cleanupNvMedia(Blit2DTest *ctx) +{ + cleanup(ctx); + free(ctx->dstBuffPitches); + free(ctx->dstBuffer); + free(ctx->dstBuff); } -void setupNvMedia(TestArgs *args, Blit2DTest *ctx, NvSciBufObj &srcNvSciBufobj, - NvSciBufObj &dstNvSciBufobj, NvSciSyncObj &syncObj, - NvSciSyncObj &preSyncObj, int cudaDeviceId) { - NvMediaStatus status; - status = NvMediaImageNvSciBufInit(); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMediaImageSciBufInit failed\n", __func__); - cleanup(ctx, status); - } +void setupNvMedia(TestArgs *args, + Blit2DTest *ctx, + NvSciBufObj &srcNvSciBufobj, + NvSciBufObj &dstNvSciBufobj, + NvSciSyncObj &syncObj, + NvSciSyncObj &preSyncObj, + int cudaDeviceId) +{ + NvMediaStatus status; + status = NvMediaImageNvSciBufInit(); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMediaImageSciBufInit failed\n", __func__); + cleanup(ctx, status); + } - // Create source surface - status = createSurface(ctx, args->srcSurfFormatAttrs, args->srcSurfAllocAttrs, - args->numSurfAllocAttrs, &ctx->srcImage, - srcNvSciBufobj, cudaDeviceId); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to create buffer pools\n", __func__); - cleanup(ctx, status); - } + // Create source surface + status = createSurface(ctx, + args->srcSurfFormatAttrs, + args->srcSurfAllocAttrs, + args->numSurfAllocAttrs, + &ctx->srcImage, + srcNvSciBufobj, + cudaDeviceId); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to create buffer pools\n", __func__); + cleanup(ctx, status); + } - // Create destination surface - status = createSurface(ctx, args->dstSurfFormatAttrs, args->dstSurfAllocAttrs, - args->numSurfAllocAttrs, &ctx->dstImage, - dstNvSciBufobj, cudaDeviceId); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to create buffer pools\n", __func__); - cleanup(ctx, status); - } + // Create destination surface + status = createSurface(ctx, + args->dstSurfFormatAttrs, + args->dstSurfAllocAttrs, + args->numSurfAllocAttrs, + &ctx->dstImage, + dstNvSciBufobj, + cudaDeviceId); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to create buffer pools\n", __func__); + cleanup(ctx, status); + } - // Register source Surface - status = - NvMedia2DImageRegister(ctx->i2d, ctx->srcImage, NVMEDIA_ACCESS_MODE_READ); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to register source surface\n", __func__); - cleanup(ctx, status); - } - // Register destination Surface - status = NvMedia2DImageRegister(ctx->i2d, ctx->dstImage, - NVMEDIA_ACCESS_MODE_READ_WRITE); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to register destination surface\n", __func__); - cleanup(ctx, status); - } + // Register source Surface + status = NvMedia2DImageRegister(ctx->i2d, ctx->srcImage, NVMEDIA_ACCESS_MODE_READ); + if 
(status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to register source surface\n", __func__); + cleanup(ctx, status); + } + // Register destination Surface + status = NvMedia2DImageRegister(ctx->i2d, ctx->dstImage, NVMEDIA_ACCESS_MODE_READ_WRITE); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to register destination surface\n", __func__); + cleanup(ctx, status); + } - status = NvMedia2DRegisterNvSciSyncObj(ctx->i2d, NVMEDIA_EOFSYNCOBJ, syncObj); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to NvMedia2DRegisterNvSciSyncObj\n", __func__); - } + status = NvMedia2DRegisterNvSciSyncObj(ctx->i2d, NVMEDIA_EOFSYNCOBJ, syncObj); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to NvMedia2DRegisterNvSciSyncObj\n", __func__); + } - status = - NvMedia2DRegisterNvSciSyncObj(ctx->i2d, NVMEDIA_PRESYNCOBJ, preSyncObj); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to NvMedia2DRegisterNvSciSyncObj\n", __func__); - } + status = NvMedia2DRegisterNvSciSyncObj(ctx->i2d, NVMEDIA_PRESYNCOBJ, preSyncObj); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to NvMedia2DRegisterNvSciSyncObj\n", __func__); + } } // Create NvMedia src & dst image without NvSciBuf -void setupNvMedia(TestArgs *args, Blit2DTest *ctx) { - NvMediaStatus status; +void setupNvMedia(TestArgs *args, Blit2DTest *ctx) +{ + NvMediaStatus status; - // Create source surface - status = createSurfaceNonNvSCI(ctx, args->srcSurfFormatAttrs, - args->srcSurfAllocAttrs, - args->numSurfAllocAttrs, &ctx->srcImage); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to create buffer pools\n", __func__); - cleanup(ctx, status); - } + // Create source surface + status = createSurfaceNonNvSCI( + ctx, args->srcSurfFormatAttrs, args->srcSurfAllocAttrs, args->numSurfAllocAttrs, &ctx->srcImage); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to create buffer pools\n", __func__); + cleanup(ctx, status); + } - // Create destination surface - status = createSurfaceNonNvSCI(ctx, args->dstSurfFormatAttrs, - args->dstSurfAllocAttrs, - args->numSurfAllocAttrs, &ctx->dstImage); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to create buffer pools\n", __func__); - cleanup(ctx, status); - } + // Create destination surface + status = createSurfaceNonNvSCI( + ctx, args->dstSurfFormatAttrs, args->dstSurfAllocAttrs, args->numSurfAllocAttrs, &ctx->dstImage); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to create buffer pools\n", __func__); + cleanup(ctx, status); + } - // Register source Surface - status = - NvMedia2DImageRegister(ctx->i2d, ctx->srcImage, NVMEDIA_ACCESS_MODE_READ); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to register source surface\n", __func__); - cleanup(ctx, status); - } + // Register source Surface + status = NvMedia2DImageRegister(ctx->i2d, ctx->srcImage, NVMEDIA_ACCESS_MODE_READ); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to register source surface\n", __func__); + cleanup(ctx, status); + } - // Register destination Surface - status = NvMedia2DImageRegister(ctx->i2d, ctx->dstImage, - NVMEDIA_ACCESS_MODE_READ_WRITE); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Unable to register destination surface\n", __func__); - cleanup(ctx, status); - } + // Register destination Surface + status = NvMedia2DImageRegister(ctx->i2d, ctx->dstImage, NVMEDIA_ACCESS_MODE_READ_WRITE); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Unable to register destination surface\n", __func__); + cleanup(ctx, status); + } - // Allocate buffer for 
writing image & set image parameters in Blit2DTest. - ctx->bytesPerPixel = 1; - AllocateBufferToWriteImage(ctx, ctx->dstImage, NVMEDIA_TRUE, /* uvOrderFlag */ - NVMEDIA_FALSE); /* appendFlag */ + // Allocate buffer for writing image & set image parameters in Blit2DTest. + ctx->bytesPerPixel = 1; + AllocateBufferToWriteImage(ctx, + ctx->dstImage, + NVMEDIA_TRUE, /* uvOrderFlag */ + NVMEDIA_FALSE); /* appendFlag */ } -void runNvMediaBlit2D(TestArgs *args, Blit2DTest *ctx) { - // Blit2D function - NvMediaStatus status = blit2DImageNonNvSCI(ctx, args); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Blit2D failed\n", __func__); - cleanup(ctx, status); - } +void runNvMediaBlit2D(TestArgs *args, Blit2DTest *ctx) +{ + // Blit2D function + NvMediaStatus status = blit2DImageNonNvSCI(ctx, args); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Blit2D failed\n", __func__); + cleanup(ctx, status); + } } -void runNvMediaBlit2D(TestArgs *args, Blit2DTest *ctx, - NvSciSyncObj &nvMediaSignalerSyncObj, - NvSciSyncFence *preSyncFence, NvSciSyncFence *fence) { - // Blit2D function - NvMediaStatus status = - blit2DImage(ctx, args, nvMediaSignalerSyncObj, preSyncFence, fence); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: Blit2D failed\n", __func__); - cleanup(ctx, status); - } +void runNvMediaBlit2D(TestArgs *args, + Blit2DTest *ctx, + NvSciSyncObj &nvMediaSignalerSyncObj, + NvSciSyncFence *preSyncFence, + NvSciSyncFence *fence) +{ + // Blit2D function + NvMediaStatus status = blit2DImage(ctx, args, nvMediaSignalerSyncObj, preSyncFence, fence); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: Blit2D failed\n", __func__); + cleanup(ctx, status); + } } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.h index adee6984..796009b5 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_producer.h @@ -27,22 +27,28 @@ #ifndef __NVMEDIA_PRODUCER_H__ #define __NVMEDIA_PRODUCER_H__ -#include "nvmedia_utils/cmdline.h" -#include "nvmedia_image.h" #include "nvmedia_2d.h" -#include "nvmedia_surface.h" -#include "nvmedia_utils/image_utils.h" +#include "nvmedia_image.h" #include "nvmedia_image_nvscibuf.h" +#include "nvmedia_surface.h" +#include "nvmedia_utils/cmdline.h" +#include "nvmedia_utils/image_utils.h" #include "nvscisync.h" -void runNvMediaBlit2D(TestArgs* args, Blit2DTest* ctx, NvSciSyncObj& syncObj, - NvSciSyncFence* preSyncFence, NvSciSyncFence* fence); -void runNvMediaBlit2D(TestArgs* args, Blit2DTest* ctx); -void setupNvMedia(TestArgs* args, Blit2DTest* ctx, NvSciBufObj& srcNvSciBufobj, - NvSciBufObj& dstNvSciBufobj, NvSciSyncObj& syncObj, - NvSciSyncObj& preSyncObj, int cudaDeviceId); -void setupNvMedia(TestArgs* args, Blit2DTest* ctx); -void cleanupNvMedia(Blit2DTest* ctx, NvSciSyncObj& syncObj, - NvSciSyncObj& preSyncObj); -void cleanupNvMedia(Blit2DTest* ctx); +void runNvMediaBlit2D(TestArgs *args, + Blit2DTest *ctx, + NvSciSyncObj &syncObj, + NvSciSyncFence *preSyncFence, + NvSciSyncFence *fence); +void runNvMediaBlit2D(TestArgs *args, Blit2DTest *ctx); +void setupNvMedia(TestArgs *args, + Blit2DTest *ctx, + NvSciBufObj &srcNvSciBufobj, + NvSciBufObj &dstNvSciBufobj, + NvSciSyncObj &syncObj, + NvSciSyncObj &preSyncObj, + int cudaDeviceId); +void setupNvMedia(TestArgs *args, Blit2DTest *ctx); +void cleanupNvMedia(Blit2DTest *ctx, NvSciSyncObj &syncObj, NvSciSyncObj &preSyncObj); +void 
cleanupNvMedia(Blit2DTest *ctx); #endif diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.cpp index e5341d50..8120dd72 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.cpp @@ -35,10 +35,10 @@ /* Nvidia headers */ #include "cmdline.h" -#include "log_utils.h" -#include "misc_utils.h" #include "config_parser.h" #include "helper_cuda.h" +#include "log_utils.h" +#include "misc_utils.h" /* see cmdline.h for details */ void PrintUsage() @@ -59,11 +59,11 @@ SectionMap sectionsMap[] = { /* see cmdline.h for details */ int ParseArgs(int argc, char *argv[], TestArgs *args) { - NvMediaBool bLastArg = NVMEDIA_FALSE; - NvMediaBool bDataAvailable = NVMEDIA_FALSE; - NvMediaStatus status = NVMEDIA_STATUS_OK; - const char* filename = NULL; - int i; + NvMediaBool bLastArg = NVMEDIA_FALSE; + NvMediaBool bDataAvailable = NVMEDIA_FALSE; + NvMediaStatus status = NVMEDIA_STATUS_OK; + const char *filename = NULL; + int i; args->srcSurfAllocAttrs[0].type = args->dstSurfAllocAttrs[0].type = NVM_SURF_ATTR_WIDTH; args->srcSurfAllocAttrs[1].type = args->dstSurfAllocAttrs[1].type = NVM_SURF_ATTR_HEIGHT; @@ -73,7 +73,7 @@ int ParseArgs(int argc, char *argv[], TestArgs *args) args->srcSurfAllocAttrs[5].type = args->dstSurfAllocAttrs[5].type = NVM_SURF_ATTR_ALLOC_TYPE; args->srcSurfAllocAttrs[6].type = args->dstSurfAllocAttrs[6].type = NVM_SURF_ATTR_SCAN_TYPE; args->srcSurfAllocAttrs[7].type = args->dstSurfAllocAttrs[7].type = NVM_SURF_ATTR_COLOR_STD_TYPE; - args->numSurfAllocAttrs = 8; + args->numSurfAllocAttrs = 8; args->srcSurfFormatAttrs[0].type = args->dstSurfFormatAttrs[0].type = NVM_SURF_ATTR_SURF_TYPE; args->srcSurfFormatAttrs[1].type = args->dstSurfFormatAttrs[1].type = NVM_SURF_ATTR_LAYOUT; @@ -87,56 +87,74 @@ int ParseArgs(int argc, char *argv[], TestArgs *args) * See nvmedia_2d.h and sample config file(s) for details. 
*/ ConfigParamsMap paramsMap[] = { - /*ParamName, &args->variableName, paramType, D, LimitType, Mn, Mx, CharSize, p2C, section */ - {"transformMode", &args->blitParams.dstTransform, TYPE_UINT, 0, LIMITS_BOTH, 0, 7, 0, 0, SECTION_NONE}, - {"filterMode", &args->blitParams.filter, TYPE_UINT, 1, LIMITS_BOTH, 1, 4, 0, 0, SECTION_NONE}, - {"colorStd", &args->blitParams.colorStandard, TYPE_UINT, 0, LIMITS_MIN, 0, 3, 0, 0, SECTION_NONE}, - {"validOperations", &args->blitParams.validFields, TYPE_UINT, 0, LIMITS_BOTH, 0, 15, 0, 0, SECTION_NONE}, - {"inputfile", &args->inputFileName, TYPE_CHAR_ARR, 0, LIMITS_NONE, 0, 0, FILE_NAME_SIZE, 0, SECTION_NONE}, + /*ParamName, &args->variableName, paramType, D, LimitType, Mn, + Mx, CharSize, p2C, section */ + {"transformMode", &args->blitParams.dstTransform, TYPE_UINT, 0, LIMITS_BOTH, 0, 7, 0, 0, SECTION_NONE}, + {"filterMode", &args->blitParams.filter, TYPE_UINT, 1, LIMITS_BOTH, 1, 4, 0, 0, SECTION_NONE}, + {"colorStd", &args->blitParams.colorStandard, TYPE_UINT, 0, LIMITS_MIN, 0, 3, 0, 0, SECTION_NONE}, + {"validOperations", &args->blitParams.validFields, TYPE_UINT, 0, LIMITS_BOTH, 0, 15, 0, 0, SECTION_NONE}, + {"inputfile", &args->inputFileName, TYPE_CHAR_ARR, 0, LIMITS_NONE, 0, 0, FILE_NAME_SIZE, 0, SECTION_NONE}, /*src surface alloc attributes*/ - {"srcWidth", &args->srcSurfAllocAttrs[0].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"srcHeight", &args->srcSurfAllocAttrs[1].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"srcCPUAccess", &args->srcSurfAllocAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, - {"srcAllocType", &args->srcSurfAllocAttrs[5].value, TYPE_UINT, 0, LIMITS_BOTH, 0, 1, 0, 0, SECTION_NONE}, - {"srcScanType", &args->srcSurfAllocAttrs[6].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 1, 0, 0, SECTION_NONE}, - {"srcColorStd", &args->srcSurfAllocAttrs[7].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 12, 0, 0, SECTION_NONE}, + {"srcWidth", &args->srcSurfAllocAttrs[0].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"srcHeight", &args->srcSurfAllocAttrs[1].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"srcCPUAccess", &args->srcSurfAllocAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, + {"srcAllocType", &args->srcSurfAllocAttrs[5].value, TYPE_UINT, 0, LIMITS_BOTH, 0, 1, 0, 0, SECTION_NONE}, + {"srcScanType", &args->srcSurfAllocAttrs[6].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 1, 0, 0, SECTION_NONE}, + {"srcColorStd", &args->srcSurfAllocAttrs[7].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 12, 0, 0, SECTION_NONE}, /*src surface format attributes*/ - {"srcSurfType", &args->srcSurfFormatAttrs[0].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, - {"srcLayout", &args->srcSurfFormatAttrs[1].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 2, 0, 0, SECTION_NONE}, - {"srcDataType", &args->srcSurfFormatAttrs[2].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 4, 0, 0, SECTION_NONE}, - {"srcMemory", &args->srcSurfFormatAttrs[3].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, - {"srcSubSamplingType", &args->srcSurfFormatAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 0, 4, 0, 0, SECTION_NONE}, - {"srcBitsPerComponent", &args->srcSurfFormatAttrs[5].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 10, 0, 0, SECTION_NONE}, - {"srcComponentOrder", &args->srcSurfFormatAttrs[6].value, TYPE_UINT, 2, LIMITS_BOTH, 1, 45, 0, 0, SECTION_NONE}, + {"srcSurfType", &args->srcSurfFormatAttrs[0].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, + {"srcLayout", &args->srcSurfFormatAttrs[1].value, 
TYPE_UINT, 1, LIMITS_BOTH, 1, 2, 0, 0, SECTION_NONE}, + {"srcDataType", &args->srcSurfFormatAttrs[2].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 4, 0, 0, SECTION_NONE}, + {"srcMemory", &args->srcSurfFormatAttrs[3].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, + {"srcSubSamplingType", &args->srcSurfFormatAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 0, 4, 0, 0, SECTION_NONE}, + {"srcBitsPerComponent", + &args->srcSurfFormatAttrs[5].value, + TYPE_UINT, + 1, + LIMITS_BOTH, + 1, + 10, + 0, + 0, + SECTION_NONE}, + {"srcComponentOrder", &args->srcSurfFormatAttrs[6].value, TYPE_UINT, 2, LIMITS_BOTH, 1, 45, 0, 0, SECTION_NONE}, /*srcRect*/ - {"srcRectx0", &args->srcRect.x0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"srcRecty0", &args->srcRect.y0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"srcRectx1", &args->srcRect.x1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"srcRecty1", &args->srcRect.y1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"srcRectx0", &args->srcRect.x0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"srcRecty0", &args->srcRect.y0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"srcRectx1", &args->srcRect.x1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"srcRecty1", &args->srcRect.y1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, /*dst surface alloc attributes*/ - {"dstWidth", &args->dstSurfAllocAttrs[0].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"dstHeight", &args->dstSurfAllocAttrs[1].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"dstCPUAccess", &args->dstSurfAllocAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, - {"dstAllocType", &args->dstSurfAllocAttrs[5].value, TYPE_UINT, 0, LIMITS_BOTH, 0, 1, 0, 0, SECTION_NONE}, - {"dstScanType", &args->dstSurfAllocAttrs[6].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 1, 0, 0, SECTION_NONE}, - {"dstColorStd", &args->dstSurfAllocAttrs[7].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 12, 0, 0, SECTION_NONE}, + {"dstWidth", &args->dstSurfAllocAttrs[0].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"dstHeight", &args->dstSurfAllocAttrs[1].value, TYPE_UINT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"dstCPUAccess", &args->dstSurfAllocAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, + {"dstAllocType", &args->dstSurfAllocAttrs[5].value, TYPE_UINT, 0, LIMITS_BOTH, 0, 1, 0, 0, SECTION_NONE}, + {"dstScanType", &args->dstSurfAllocAttrs[6].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 1, 0, 0, SECTION_NONE}, + {"dstColorStd", &args->dstSurfAllocAttrs[7].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 12, 0, 0, SECTION_NONE}, /*dst surface format attributes*/ - {"dstSurfType", &args->dstSurfFormatAttrs[0].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, - {"dstLayout", &args->dstSurfFormatAttrs[1].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 2, 0, 0, SECTION_NONE}, - {"dstDataType", &args->dstSurfFormatAttrs[2].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 4, 0, 0, SECTION_NONE}, - {"dstMemory", &args->dstSurfFormatAttrs[3].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, - {"dstSubSamplingType", &args->dstSurfFormatAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 0, 4, 0, 0, SECTION_NONE}, - {"dstBitsPerComponent", &args->dstSurfFormatAttrs[5].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 10, 0, 0, SECTION_NONE}, - {"dstComponentOrder", &args->dstSurfFormatAttrs[6].value, TYPE_UINT, 2, LIMITS_BOTH, 1, 45, 0, 0, SECTION_NONE}, + {"dstSurfType", &args->dstSurfFormatAttrs[0].value, TYPE_UINT, 1, 
LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, + {"dstLayout", &args->dstSurfFormatAttrs[1].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 2, 0, 0, SECTION_NONE}, + {"dstDataType", &args->dstSurfFormatAttrs[2].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 4, 0, 0, SECTION_NONE}, + {"dstMemory", &args->dstSurfFormatAttrs[3].value, TYPE_UINT, 1, LIMITS_BOTH, 1, 3, 0, 0, SECTION_NONE}, + {"dstSubSamplingType", &args->dstSurfFormatAttrs[4].value, TYPE_UINT, 1, LIMITS_BOTH, 0, 4, 0, 0, SECTION_NONE}, + {"dstBitsPerComponent", + &args->dstSurfFormatAttrs[5].value, + TYPE_UINT, + 1, + LIMITS_BOTH, + 1, + 10, + 0, + 0, + SECTION_NONE}, + {"dstComponentOrder", &args->dstSurfFormatAttrs[6].value, TYPE_UINT, 2, LIMITS_BOTH, 1, 45, 0, 0, SECTION_NONE}, /*dstRect*/ - {"dstRectx0", &args->dstRect.x0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"dstRecty0", &args->dstRect.y0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"dstRectx1", &args->dstRect.x1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, - {"dstRecty1", &args->dstRect.y1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"dstRectx0", &args->dstRect.x0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"dstRecty0", &args->dstRect.y0, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"dstRectx1", &args->dstRect.x1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, + {"dstRecty1", &args->dstRect.y1, TYPE_USHORT, 0, LIMITS_MIN, 0, 0, 0, 0, SECTION_NONE}, /*End of the array */ - {NULL, NULL, TYPE_UINT, 0, LIMITS_NONE, 0, 0, 0, 0, SECTION_NONE} - }; + {NULL, NULL, TYPE_UINT, 0, LIMITS_NONE, 0, 0, 0, 0, SECTION_NONE}}; args->iterations = 100; // Set default iterations value. @@ -146,7 +164,7 @@ int ParseArgs(int argc, char *argv[], TestArgs *args) if (checkCmdLineFlag(argc, (const char **)argv, "cf")) { char *inputFileName = NULL; - getCmdLineArgumentString(argc, (const char **)argv, "cf", (char**)&inputFileName); + getCmdLineArgumentString(argc, (const char **)argv, "cf", (char **)&inputFileName); if (!inputFileName) { printf("ERR: Invalid config file name\n"); return -1; @@ -162,8 +180,7 @@ int ParseArgs(int argc, char *argv[], TestArgs *args) filename = sdkFindFilePath("sample.cfg", "."); } - if (filename != NULL) - { + if (filename != NULL) { printf("Using config file %s\n", filename); /* Init Parser Map*/ @@ -173,7 +190,7 @@ int ParseArgs(int argc, char *argv[], TestArgs *args) return -1; } - status = ConfigParser_ParseFile(paramsMap, 1, sectionsMap, (char*)filename); + status = ConfigParser_ParseFile(paramsMap, 1, sectionsMap, (char *)filename); if (status != NVMEDIA_STATUS_OK) { printf("ERR: Failed to parse config file. 
status:%x\n", status); return -1; @@ -189,4 +206,3 @@ int ParseArgs(int argc, char *argv[], TestArgs *args) return 0; } - diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.h index e4dac813..15d7b8ee 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/cmdline.h @@ -36,55 +36,57 @@ #define FILE_NAME_SIZE 1024 /* TestArgs contains all arguments required to run the 2D test */ -typedef struct _TestArgs { +typedef struct _TestArgs +{ char inputFileName[FILE_NAME_SIZE]; NvMediaSurfAllocAttr srcSurfAllocAttrs[NVM_SURF_ALLOC_ATTR_MAX]; NvMediaSurfAllocAttr dstSurfAllocAttrs[NVM_SURF_ALLOC_ATTR_MAX]; - uint32_t numSurfAllocAttrs; + uint32_t numSurfAllocAttrs; NvMediaSurfFormatAttr srcSurfFormatAttrs[NVM_SURF_FMT_ATTR_MAX]; NvMediaSurfFormatAttr dstSurfFormatAttrs[NVM_SURF_FMT_ATTR_MAX]; - NvMediaRect srcRect; - NvMediaRect dstRect; - NvMedia2DBlitParameters blitParams; - size_t iterations; + NvMediaRect srcRect; + NvMediaRect dstRect; + NvMedia2DBlitParameters blitParams; + size_t iterations; } TestArgs; -typedef struct { - NvMediaDevice *device; +typedef struct +{ + NvMediaDevice *device; /* I2D for 2D blit processing */ - NvMedia2D *i2d; - NvMediaImage *srcImage; - NvMediaImage *dstImage; - NvMediaRect *srcRect; - NvMediaRect *dstRect; - uint8_t **dstBuff; - uint32_t *dstBuffPitches; - uint8_t *dstBuffer; - uint32_t numSurfaces; - uint32_t bytesPerPixel; - uint32_t heightSurface; - uint32_t widthSurface; - float *xScalePtr; - float *yScalePtr; + NvMedia2D *i2d; + NvMediaImage *srcImage; + NvMediaImage *dstImage; + NvMediaRect *srcRect; + NvMediaRect *dstRect; + uint8_t **dstBuff; + uint32_t *dstBuffPitches; + uint8_t *dstBuffer; + uint32_t numSurfaces; + uint32_t bytesPerPixel; + uint32_t heightSurface; + uint32_t widthSurface; + float *xScalePtr; + float *yScalePtr; } Blit2DTest; /* Prints application usage options */ -void PrintUsage (void); +void PrintUsage(void); /* Parses command line arguments. -* Also parses any configuration files supplied in the command line arguments. -* Arguments: -* argc -* (in) Number of tokens in the command line -* argv -* (in) Command line tokens -* args -* (out) Pointer to test arguments structure -*/ -int ParseArgs(int argc, char **argv, TestArgs *args); + * Also parses any configuration files supplied in the command line arguments. + * Arguments: + * argc + * (in) Number of tokens in the command line + * argv + * (in) Command line tokens + * args + * (out) Pointer to test arguments structure + */ +int ParseArgs(int argc, char **argv, TestArgs *args); #endif /* _NVMEDIA_2D_CMD_LINE_H_ */ diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.cpp index 639ef37b..3c9dff9a 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.cpp @@ -25,9 +25,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "config_parser.h" + #include -#include "config_parser.h" #include "log_utils.h" #if defined(__QNX__) #include @@ -37,11 +38,12 @@ static NvMediaStatus GetParamIndex(ConfigParamsMap *paramsMap, char *paramName, { int i = 0; - while(paramsMap[i].paramName != NULL) { + while (paramsMap[i].paramName != NULL) { if (strcasecmp(paramsMap[i].paramName, paramName) == 0) { *index = i; return NVMEDIA_STATUS_OK; - } else { + } + else { i++; } } @@ -53,11 +55,12 @@ NvMediaStatus ConfigParser_GetSectionIndexByName(SectionMap *sectionsMap, char * { unsigned int i = 0; - while(sectionsMap[i].secType != SECTION_NONE) { - if(strcmp(sectionsMap[i].name, sectionName) == 0) { + while (sectionsMap[i].secType != SECTION_NONE) { + if (strcmp(sectionsMap[i].name, sectionName) == 0) { *index = i; return NVMEDIA_STATUS_OK; - } else { + } + else { i++; } } @@ -69,11 +72,12 @@ NvMediaStatus ConfigParser_GetSectionIndexByType(SectionMap *sectionsMap, Sectio { unsigned int i = 0; - while(sectionsMap[i].secType != SECTION_NONE) { - if(sectionsMap[i].secType == sectionType) { + while (sectionsMap[i].secType != SECTION_NONE) { + if (sectionsMap[i].secType == sectionType) { *index = i; return NVMEDIA_STATUS_OK; - } else { + } + else { i++; } } @@ -86,10 +90,10 @@ static NvMediaStatus GetFileContent(char *filename, char **fileContentOut) { FILE *file; char *fileCotent; - long fileSize; + long fileSize; file = fopen(filename, "r"); - if(file == NULL) { + if (file == NULL) { printf("Parser_GetFileContent: Cannot open configuration file %s\n", filename); return NVMEDIA_STATUS_BAD_PARAMETER; } @@ -100,152 +104,162 @@ static NvMediaStatus GetFileContent(char *filename, char **fileContentOut) } fileSize = ftell(file); - if(fileSize < 0 || fileSize > 150000) { + if (fileSize < 0 || fileSize > 150000) { printf("Parser_GetFileContent: Unreasonable Filesize %ld encountered for file %s\n", fileSize, filename); return NVMEDIA_STATUS_ERROR; } - if(fseek (file, 0, SEEK_SET) != 0) { + if (fseek(file, 0, SEEK_SET) != 0) { printf("Parser_GetFileContent: Cannot fseek in configuration file %s\n", filename); return NVMEDIA_STATUS_ERROR; } - fileCotent = (char*)malloc(fileSize + 1); - if(fileCotent == NULL) { + fileCotent = (char *)malloc(fileSize + 1); + if (fileCotent == NULL) { printf("Parser_GetFileContent: Failed allocating buffer for file Content\n"); return NVMEDIA_STATUS_OUT_OF_MEMORY; } - fileSize = (long)fread(fileCotent, 1, fileSize, file); + fileSize = (long)fread(fileCotent, 1, fileSize, file); fileCotent[fileSize] = '\0'; - *fileContentOut = fileCotent; + *fileContentOut = fileCotent; fclose(file); return NVMEDIA_STATUS_OK; } -NvMediaStatus ConfigParser_ParseFile(ConfigParamsMap *paramsMap, unsigned int numParams, SectionMap *sectionsMap, char *fileName) +NvMediaStatus +ConfigParser_ParseFile(ConfigParamsMap *paramsMap, unsigned int numParams, SectionMap *sectionsMap, char *fileName) { - char *items[MAX_ITEMS_TO_PARSE] = {NULL}; - int intValue, itemsCount = 0, i = 0, sectionIndex = 0; - double doubleValue; - float floatValue; - unsigned int currItemIndex, uintValue, sectionId = 0, currSectionId = 0, charValue, paramDefaultLength; - unsigned short ushortValue; - short shortValue; + char *items[MAX_ITEMS_TO_PARSE] = {NULL}; + int intValue, itemsCount = 0, i = 0, sectionIndex = 0; + double doubleValue; + float floatValue; + unsigned int currItemIndex, uintValue, sectionId = 0, currSectionId = 0, charValue, paramDefaultLength; + unsigned short ushortValue; + short shortValue; unsigned long long ullValue; - NvMediaBool 
isInString = NVMEDIA_FALSE, isInItem = NVMEDIA_FALSE; - char *buffer, *bufferEnd, *param, *pParamLength; - char sectionName[100]; - char currDigit; - char *configContentBuf = NULL; - unsigned int numSetsInSection = 0; + NvMediaBool isInString = NVMEDIA_FALSE, isInItem = NVMEDIA_FALSE; + char *buffer, *bufferEnd, *param, *pParamLength; + char sectionName[100]; + char currDigit; + char *configContentBuf = NULL; + unsigned int numSetsInSection = 0; - if(GetFileContent(fileName, &configContentBuf) != NVMEDIA_STATUS_OK) { + if (GetFileContent(fileName, &configContentBuf) != NVMEDIA_STATUS_OK) { printf("ConfigParser_ParseFile: Failed reading file %s", fileName); return NVMEDIA_STATUS_ERROR; } - buffer = configContentBuf; + buffer = configContentBuf; bufferEnd = &configContentBuf[strlen(configContentBuf)]; - // Stage one: Create items mapping in the content using "items" pointers array. For each parameter we have 3 items: param name, '=' char and the param value. - while(buffer < bufferEnd) { - if(itemsCount >= MAX_ITEMS_TO_PARSE) { - LOG_WARN("ConfigParser_ParseFile: Number of items in configuration file exceeded the maximum allowed (%d). Only %d items will be parsed.\n", - MAX_ITEMS_TO_PARSE, MAX_ITEMS_TO_PARSE); + // Stage one: Create items mapping in the content using "items" pointers array. For each parameter we have 3 items: + // param name, '=' char and the param value. + while (buffer < bufferEnd) { + if (itemsCount >= MAX_ITEMS_TO_PARSE) { + LOG_WARN("ConfigParser_ParseFile: Number of items in configuration file exceeded the maximum allowed (%d). " + "Only %d items will be parsed.\n", + MAX_ITEMS_TO_PARSE, + MAX_ITEMS_TO_PARSE); itemsCount = MAX_ITEMS_TO_PARSE; break; } - switch(*buffer) { - // Carriage return - case 13: + switch (*buffer) { + // Carriage return + case 13: + ++buffer; + break; + case '#': + *buffer = '\0'; // Replace '#' with '\0' in case of comment immediately following integer or string + while (*buffer != '\n' && buffer < bufferEnd) { // Skip till EOL or EOF ++buffer; - break; - case '#': - *buffer = '\0'; // Replace '#' with '\0' in case of comment immediately following integer or string - while(*buffer != '\n' && buffer < bufferEnd) { // Skip till EOL or EOF - ++buffer; - } - isInString = NVMEDIA_FALSE; - isInItem = NVMEDIA_FALSE; - break; - case '\n': - isInItem = NVMEDIA_FALSE; - isInString = NVMEDIA_FALSE; - *buffer++='\0'; - break; - case ' ': - case '\t': // Skip whitespace, leave state unchanged - if(isInString) - buffer++; - else { // Terminate non-strings once whitespace is found - *buffer++ = '\0'; - isInItem = NVMEDIA_FALSE; - } - break; - case '"': // Begin/End of String - *buffer++ = '\0'; - if(!isInString) { - items[itemsCount++] = buffer; - isInItem = ~isInItem; - } else { - isInItem = NVMEDIA_FALSE; - } - isInString = ~isInString; // Toggle - break; - case '[': - *(buffer++) = '\0'; - items[itemsCount++] = buffer; - while(*buffer != ' ' && *buffer != '\n' && buffer < bufferEnd) { // Skip till whitespace (after which is located the parsed section number) or EOL or EOF - sectionName[i++] = *(buffer++); - } - sectionName[i] = '\0'; - i = 0; - while(*buffer == ' ') { - *(buffer++) = '\0'; - } - items[itemsCount++] = buffer; - while(*buffer != ']' && *buffer != '\n' && buffer < bufferEnd) { // Read the section number - currDigit = *buffer; - sectionIndex = sectionIndex * 10 + (currDigit - '0'); - buffer++; - } - *(buffer++) = '\0'; - sectionIndex--; - if(ConfigParser_GetSectionIndexByName(sectionsMap, sectionName, &sectionId) != NVMEDIA_STATUS_OK) { -
printf("ConfigParser_ParseFile: SectionName couldn't be found in section map: '%s'.\n", sectionName); - } - numSetsInSection++; - sectionsMap[sectionId].lastSectionIndex = sectionIndex; - sectionIndex = 0; - isInString = NVMEDIA_FALSE; - isInItem = NVMEDIA_FALSE; - break; - default: - if(!isInItem) { - items[itemsCount++] = buffer; - isInItem = ~isInItem; - } + } + isInString = NVMEDIA_FALSE; + isInItem = NVMEDIA_FALSE; + break; + case '\n': + isInItem = NVMEDIA_FALSE; + isInString = NVMEDIA_FALSE; + *buffer++ = '\0'; + break; + case ' ': + case '\t': // Skip whitespace, leave state unchanged + if (isInString) buffer++; + else { // Terminate non-strings once whitespace is found + *buffer++ = '\0'; + isInItem = NVMEDIA_FALSE; + } + break; + case '"': // Begin/End of String + *buffer++ = '\0'; + if (!isInString) { + items[itemsCount++] = buffer; + isInItem = ~isInItem; + } + else { + isInItem = NVMEDIA_FALSE; + } + isInString = ~isInString; // Toggle + break; + case '[': + *(buffer++) = '\0'; + items[itemsCount++] = buffer; + while (*buffer != ' ' && *buffer != '\n' + && buffer < bufferEnd) { // Skip till whitespace (after which is located the parsed section number) + // or EOL or EOF + sectionName[i++] = *(buffer++); + } + sectionName[i] = '\0'; + i = 0; + while (*buffer == ' ') { + *(buffer++) = '\0'; + } + items[itemsCount++] = buffer; + while (*buffer != ']' && *buffer != '\n' && buffer < bufferEnd) { // Read the section number + currDigit = *buffer; + sectionIndex = sectionIndex * 10 + (currDigit - '0'); + buffer++; + } + *(buffer++) = '\0'; + sectionIndex--; + if (ConfigParser_GetSectionIndexByName(sectionsMap, sectionName, &sectionId) != NVMEDIA_STATUS_OK) { + printf("ConfigParser_ParseFile: SectionName couldn't be found in section map: '%s'.\n", sectionName); + } + numSetsInSection++; + sectionsMap[sectionId].lastSectionIndex = sectionIndex; + sectionIndex = 0; + isInString = NVMEDIA_FALSE; + isInItem = NVMEDIA_FALSE; + break; + default: + if (!isInItem) { + items[itemsCount++] = buffer; + isInItem = ~isInItem; + } + buffer++; } } itemsCount--; - if(numSetsInSection > numParams) { - printf("%s: Not enough buffers allocated for parsing. Number of sets allocated: %d. Number of sets in config file: %d \n", - __func__, numParams, numSetsInSection); - if(configContentBuf) { + if (numSetsInSection > numParams) { + printf("%s: Not enough buffers allocated for parsing. Number of sets allocated: %d. Number of sets in config " + "file: %d \n", + __func__, + numParams, + numSetsInSection); + if (configContentBuf) { free(configContentBuf); } return NVMEDIA_STATUS_ERROR; } // Stage 2: Go through the list of items and save their values in parameters map - for(i = 0; i < itemsCount; i += 3) { - if(ConfigParser_GetSectionIndexByName(sectionsMap, items[i], &currItemIndex) == NVMEDIA_STATUS_OK) { + for (i = 0; i < itemsCount; i += 3) { + if (ConfigParser_GetSectionIndexByName(sectionsMap, items[i], &currItemIndex) == NVMEDIA_STATUS_OK) { currSectionId = atoi(items[i + 1]); currSectionId--; LOG_DBG("ConfigParser_ParseFile: Parsing section %s index %d\n", items[i], currSectionId); @@ -253,105 +267,130 @@ NvMediaStatus ConfigParser_ParseFile(ConfigParamsMap *paramsMap, unsigned int nu continue; } - if(GetParamIndex(paramsMap, items[i], &currItemIndex) != NVMEDIA_STATUS_OK) { - LOG_WARN("ConfigParser_ParseFile: Parameter Name '%s' is not recognized.
Dismissing this parameter.\n", items[i]); + if (GetParamIndex(paramsMap, items[i], &currItemIndex) != NVMEDIA_STATUS_OK) { + LOG_WARN("ConfigParser_ParseFile: Parameter Name '%s' is not recognized. Dismissing this parameter.\n", + items[i]); continue; } - if(strcmp("=", items[i + 1])) { - printf("ConfigParser_ParseFile: '=' expected as the second token in each line. Error caught while parsing parameter '%s'.\n", items[i]); + if (strcmp("=", items[i + 1])) { + printf("ConfigParser_ParseFile: '=' expected as the second token in each line. Error caught while parsing " + "parameter '%s'.\n", + items[i]); i -= 2; continue; } - if(ConfigParser_GetSectionIndexByType(sectionsMap, paramsMap[currItemIndex].sectionType, &sectionId) != NVMEDIA_STATUS_OK) { - printf("ConfigParser_ParseFile: Section index couldn't be found in section map by type. Param Name: '%s'.\n", paramsMap[currItemIndex].paramName); + if (ConfigParser_GetSectionIndexByType(sectionsMap, paramsMap[currItemIndex].sectionType, &sectionId) + != NVMEDIA_STATUS_OK) { + printf( + "ConfigParser_ParseFile: Section index couldn't be found in section map by type. Param Name: '%s'.\n", + paramsMap[currItemIndex].paramName); } - if(sectionsMap[sectionId].lastSectionIndex == 0) { + if (sectionsMap[sectionId].lastSectionIndex == 0) { // Param is not part of a collection or collection includes only one item currSectionId = 0; } param = (char *)paramsMap[currItemIndex].mappedLocation + currSectionId * sectionsMap[sectionId].sizeOfStruct; - pParamLength = (char *)paramsMap[currItemIndex].stringLengthAddr + currSectionId * sectionsMap[sectionId].sizeOfStruct; + pParamLength = + (char *)paramsMap[currItemIndex].stringLengthAddr + currSectionId * sectionsMap[sectionId].sizeOfStruct; paramDefaultLength = paramsMap[currItemIndex].stringLength; // Interpret the Value LOG_DBG("ConfigParser_ParseFile: Interpreting parameter %s\n", items[i]); - switch(paramsMap[currItemIndex].type) { - case TYPE_INT: - if(sscanf(items[i + 2], "%d", &intValue) != 1) { - printf("ConfigParser_ParseFile: Expected numerical value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(int *)(void *)param = intValue; - break; - case TYPE_UINT: - if(sscanf(items[i + 2], "%u", &uintValue) != 1) { - printf("ConfigParser_ParseFile: Expected numerical value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(unsigned int *)(void *)param = uintValue; - break; - case TYPE_UINT_HEX: - if(sscanf(items[i + 2], "%x", &uintValue) != 1) { - printf("ConfigParser_ParseFile: Expected unsigned char value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(unsigned int *)(void *)param = uintValue; - break; - case TYPE_CHAR_ARR: - if(items[i + 2] == NULL) - memset(param, 0, (pParamLength != NULL && *pParamLength != 0) ?
*pParamLength : paramDefaultLength); - else { - strncpy(param, items[i + 2], paramsMap[currItemIndex].stringLength); - param[strlen(items[i + 2])] = '\0'; - } - break; - case TYPE_DOUBLE: - if(sscanf(items[i + 2], "%lf", &doubleValue) != 1) { - printf("ConfigParser_ParseFile: Expected double value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(double *)(void *)param = doubleValue; - break; - case TYPE_FLOAT: - if(sscanf(items[i + 2], "%f", &floatValue) != 1) { - printf("ConfigParser_ParseFile: Expected double value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(float *)(void *)param = floatValue; - break; - case TYPE_UCHAR: - if(sscanf(items[i + 2], "%u", &charValue) != 1) { - printf("ConfigParser_ParseFile: Expected unsigned char value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(unsigned char *)(void *)param = charValue; - break; - case TYPE_USHORT: - if(sscanf(items[i + 2], "%hu", &ushortValue) != 1) { - printf("ConfigParser_ParseFile: Expected unsigned short value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(unsigned short *)(void *)param = ushortValue; - break; - case TYPE_SHORT: - if(sscanf(items[i + 2], "%hd", &shortValue) != 1) { - printf("ConfigParser_ParseFile: Expected short value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(short *)(void *)param = shortValue; - break; - case TYPE_UCHAR_ARR: - if(items[i + 2] == NULL) - memset(param, 0, (pParamLength != NULL && *pParamLength != 0) ? *pParamLength : paramDefaultLength); - else { - strncpy(param, items[i + 2], paramsMap[currItemIndex].stringLength); - param[strlen(items[i + 2])] = '\0'; - } - break; - case TYPE_ULLONG: - if(sscanf(items[i + 2], "%llu", &ullValue) != 1) { - printf("ConfigParser_ParseFile: Expected numerical value for Parameter %s, found value '%s'\n", items[i], items[i + 2]); - } - *(unsigned long long *)(void *)param = ullValue; - break; - default: - printf("ConfigParser_ParseFile: Encountered unknown value type in the map\n"); + switch (paramsMap[currItemIndex].type) { + case TYPE_INT: + if (sscanf(items[i + 2], "%d", &intValue) != 1) { + printf("ConfigParser_ParseFile: Expected numerical value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(int *)(void *)param = intValue; + break; + case TYPE_UINT: + if (sscanf(items[i + 2], "%u", &uintValue) != 1) { + printf("ConfigParser_ParseFile: Expected numerical value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(unsigned int *)(void *)param = uintValue; + break; + case TYPE_UINT_HEX: + if (sscanf(items[i + 2], "%x", &uintValue) != 1) { + printf("ConfigParser_ParseFile: Expected unsigned char value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(unsigned int *)(void *)param = uintValue; + break; + case TYPE_CHAR_ARR: + if (items[i + 2] == NULL) + memset(param, 0, (pParamLength != NULL && *pParamLength != 0) ? 
*pParamLength : paramDefaultLength); + else { + strncpy(param, items[i + 2], paramsMap[currItemIndex].stringLength); + param[strlen(items[i + 2])] = '\0'; + } + break; + case TYPE_DOUBLE: + if (sscanf(items[i + 2], "%lf", &doubleValue) != 1) { + printf("ConfigParser_ParseFile: Expected double value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(double *)(void *)param = doubleValue; + break; + case TYPE_FLOAT: + if (sscanf(items[i + 2], "%f", &floatValue) != 1) { + printf("ConfigParser_ParseFile: Expected double value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(float *)(void *)param = floatValue; + break; + case TYPE_UCHAR: + if (sscanf(items[i + 2], "%u", &charValue) != 1) { + printf("ConfigParser_ParseFile: Expected unsigned char value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(unsigned char *)(void *)param = charValue; + break; + case TYPE_USHORT: + if (sscanf(items[i + 2], "%hu", &ushortValue) != 1) { + printf("ConfigParser_ParseFile: Expected unsigned short value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(unsigned short *)(void *)param = ushortValue; + break; + case TYPE_SHORT: + if (sscanf(items[i + 2], "%hd", &shortValue) != 1) { + printf("ConfigParser_ParseFile: Expected short value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(short *)(void *)param = shortValue; + break; + case TYPE_UCHAR_ARR: + if (items[i + 2] == NULL) + memset(param, 0, (pParamLength != NULL && *pParamLength != 0) ? *pParamLength : paramDefaultLength); + else { + strncpy(param, items[i + 2], paramsMap[currItemIndex].stringLength); + param[strlen(items[i + 2])] = '\0'; + } + break; + case TYPE_ULLONG: + if (sscanf(items[i + 2], "%llu", &ullValue) != 1) { + printf("ConfigParser_ParseFile: Expected numerical value for Parameter %s, found value '%s'\n", + items[i], + items[i + 2]); + } + *(unsigned long long *)(void *)param = ullValue; + break; + default: + printf("ConfigParser_ParseFile: Encountered unknown value type in the map\n"); } } @@ -365,42 +404,42 @@ NvMediaStatus ConfigParser_InitParamsMap(ConfigParamsMap *paramsMap) { int i = 0; - while(paramsMap[i].paramName != NULL) { + while (paramsMap[i].paramName != NULL) { if (paramsMap[i].mappedLocation == NULL) { - i++; - continue; + i++; + continue; } - switch(paramsMap[i].type) { - case TYPE_UINT: - case TYPE_UINT_HEX: - *(unsigned int *)(paramsMap[i].mappedLocation) = (unsigned int)paramsMap[i].defaultValue; - break; - case TYPE_INT: - *(int *)(paramsMap[i].mappedLocation) = (int)paramsMap[i].defaultValue; - break; - case TYPE_DOUBLE: - *(double *)(paramsMap[i].mappedLocation) = (double)paramsMap[i].defaultValue; - break; - case TYPE_FLOAT: - *(float *)(paramsMap[i].mappedLocation) = (float)paramsMap[i].defaultValue; - break; - case TYPE_UCHAR: - *(unsigned char *)(paramsMap[i].mappedLocation) = (NvMediaBool)paramsMap[i].defaultValue; - break; - case TYPE_USHORT: - *(unsigned short *)(paramsMap[i].mappedLocation) = (unsigned short)paramsMap[i].defaultValue; - break; - case TYPE_SHORT: - *(short *)(paramsMap[i].mappedLocation) = (short)paramsMap[i].defaultValue; - break; - case TYPE_ULLONG: - *(unsigned long long *)(paramsMap[i].mappedLocation) = (unsigned long long)paramsMap[i].defaultValue; - break; - case TYPE_CHAR_ARR: - case TYPE_UCHAR_ARR: - default: - break; + switch (paramsMap[i].type) { + case TYPE_UINT: + case TYPE_UINT_HEX: + *(unsigned int *)(paramsMap[i].mappedLocation) = (unsigned 
int)paramsMap[i].defaultValue; + break; + case TYPE_INT: + *(int *)(paramsMap[i].mappedLocation) = (int)paramsMap[i].defaultValue; + break; + case TYPE_DOUBLE: + *(double *)(paramsMap[i].mappedLocation) = (double)paramsMap[i].defaultValue; + break; + case TYPE_FLOAT: + *(float *)(paramsMap[i].mappedLocation) = (float)paramsMap[i].defaultValue; + break; + case TYPE_UCHAR: + *(unsigned char *)(paramsMap[i].mappedLocation) = (NvMediaBool)paramsMap[i].defaultValue; + break; + case TYPE_USHORT: + *(unsigned short *)(paramsMap[i].mappedLocation) = (unsigned short)paramsMap[i].defaultValue; + break; + case TYPE_SHORT: + *(short *)(paramsMap[i].mappedLocation) = (short)paramsMap[i].defaultValue; + break; + case TYPE_ULLONG: + *(unsigned long long *)(paramsMap[i].mappedLocation) = (unsigned long long)paramsMap[i].defaultValue; + break; + case TYPE_CHAR_ARR: + case TYPE_UCHAR_ARR: + default: + break; } i++; } @@ -410,82 +449,88 @@ NvMediaStatus ConfigParser_InitParamsMap(ConfigParamsMap *paramsMap) NvMediaStatus ConfigParser_ValidateParams(ConfigParamsMap *paramsMap, SectionMap *sectionsMap) { - NvMediaStatus status = NVMEDIA_STATUS_OK; - unsigned int sectionId = 0, i = 0, j; - char *param; + NvMediaStatus status = NVMEDIA_STATUS_OK; + unsigned int sectionId = 0, i = 0, j; + char *param; - while(paramsMap[i].paramName != NULL) { - if(ConfigParser_GetSectionIndexByType(sectionsMap, paramsMap[i].sectionType, &sectionId) != NVMEDIA_STATUS_OK) { - printf("ConfigParser_ValidateParams: Section index couldn't be found in section map. Param Name: '%s'.\n", paramsMap[i].paramName); + while (paramsMap[i].paramName != NULL) { + if (ConfigParser_GetSectionIndexByType(sectionsMap, paramsMap[i].sectionType, &sectionId) + != NVMEDIA_STATUS_OK) { + printf("ConfigParser_ValidateParams: Section index couldn't be found in section map.
Param Name: '%s'.\n", + paramsMap[i].paramName); } - for(j = 0; j <= sectionsMap[sectionId].lastSectionIndex; j++) { - if(paramsMap[i].paramLimits == 1 || paramsMap[i].paramLimits == 2) { + for (j = 0; j <= sectionsMap[sectionId].lastSectionIndex; j++) { + if (paramsMap[i].paramLimits == 1 || paramsMap[i].paramLimits == 2) { param = (char *)paramsMap[i].mappedLocation + j * sectionsMap[sectionId].sizeOfStruct; if (param == NULL) { i++; continue; } switch (paramsMap[i].type) { - case TYPE_UINT: - case TYPE_UINT_HEX: - if(*(unsigned int *)(void *)param < (unsigned int)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(unsigned int *)(void *)param > (unsigned int)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - case TYPE_DOUBLE: - if(*(double *)(void *)param < (double)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(double *)(void *)param > (double)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - case TYPE_FLOAT: - if(*(float *)(void *)param < (float)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(float *)(void *)param > (float)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - case TYPE_INT: - if(*(int *)(void *)param < (int)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(int *)(void *)param > (int)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - case TYPE_USHORT: - if(*(unsigned short *)(void *)param < (unsigned short)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(unsigned short *)(void *)param > (unsigned short)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - case TYPE_SHORT: - if(*(short *)(void *)param < (short)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(short *)(void *)param > (short)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - case TYPE_ULLONG: - if(*(unsigned long long *)(void *)param < (unsigned long long)paramsMap[i].minLimit || - (paramsMap[i].paramLimits == 2 && *(unsigned long long *)(void *)param > (unsigned long long)paramsMap[i].maxLimit )) { - printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); - printf("Check configuration file for parameter limits\n"); - status = NVMEDIA_STATUS_BAD_PARAMETER; - } - break; - default: - break; + case TYPE_UINT: + case TYPE_UINT_HEX: + if (*(unsigned int *)(void *)param < (unsigned int)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 + && *(unsigned int *)(void *)param > 
(unsigned int)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + case TYPE_DOUBLE: + if (*(double *)(void *)param < (double)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 + && *(double *)(void *)param > (double)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + case TYPE_FLOAT: + if (*(float *)(void *)param < (float)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 && *(float *)(void *)param > (float)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + case TYPE_INT: + if (*(int *)(void *)param < (int)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 && *(int *)(void *)param > (int)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + case TYPE_USHORT: + if (*(unsigned short *)(void *)param < (unsigned short)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 + && *(unsigned short *)(void *)param > (unsigned short)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + case TYPE_SHORT: + if (*(short *)(void *)param < (short)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 && *(short *)(void *)param > (short)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + case TYPE_ULLONG: + if (*(unsigned long long *)(void *)param < (unsigned long long)paramsMap[i].minLimit + || (paramsMap[i].paramLimits == 2 + && *(unsigned long long *)(void *)param > (unsigned long long)paramsMap[i].maxLimit)) { + printf("ConfigParser_ValidateParams: Error in input parameter %s\n", paramsMap[i].paramName); + printf("Check configuration file for parameter limits\n"); + status = NVMEDIA_STATUS_BAD_PARAMETER; + } + break; + default: + break; } } } @@ -498,57 +543,70 @@ NvMediaStatus ConfigParser_ValidateParams(ConfigParamsMap *paramsMap, SectionMap NvMediaStatus ConfigParser_DisplayParams(ConfigParamsMap *pParamsMap, SectionMap *pSectionsMap) { unsigned int i = 0, j, sectionId = 0; - char *param; + char *param; - while(pParamsMap[i].paramName != NULL) { - if(ConfigParser_GetSectionIndexByType(pSectionsMap, pParamsMap[i].sectionType, &sectionId) != NVMEDIA_STATUS_OK) { - printf("ConfigParser_DisplayParams: Section index couldn't be found in section map by type. Param Name: '%s'.\n", pParamsMap[i].paramName); + while (pParamsMap[i].paramName != NULL) { + if (ConfigParser_GetSectionIndexByType(pSectionsMap, pParamsMap[i].sectionType, &sectionId) + != NVMEDIA_STATUS_OK) { + printf("ConfigParser_DisplayParams: Section index couldn't be found in section map by type.
Param Name: " + "'%s'.\n", + pParamsMap[i].paramName); } - for(j = 0; j <= pSectionsMap[sectionId].lastSectionIndex; j++) { + for (j = 0; j <= pSectionsMap[sectionId].lastSectionIndex; j++) { param = (char *)pParamsMap[i].mappedLocation + j * pSectionsMap[sectionId].sizeOfStruct; if (param == NULL) { i++; continue; } - switch(pParamsMap[i].type) { - case TYPE_UINT: - printf("(%d) %s = %u\n", j, pParamsMap[i].paramName, *(unsigned int *)(void *)param); - break; - case TYPE_DOUBLE: - printf("(%d) %s = %.2lf\n", j, pParamsMap[i].paramName, *(double *)(void *)param); - break; - case TYPE_FLOAT: - printf("(%d) %s = %.2f\n", j, pParamsMap[i].paramName, *(float *)(void *)param); - break; - case TYPE_UCHAR: - printf("(%d) %s = %d\n", j, pParamsMap[i].paramName, *(unsigned char *)(void *)param); - break; - case TYPE_USHORT: - printf("(%d) %s = %hu\n", j, pParamsMap[i].paramName, *(unsigned short *)(void *)param); - break; - case TYPE_SHORT: - printf("(%d) %s = %hd\n", j, pParamsMap[i].paramName, *(short *)(void *)param); - break; - case TYPE_ULLONG: - printf("(%d) %s = %llu\n", j, pParamsMap[i].paramName, *(unsigned long long *)(void *)param); - break; - case TYPE_CHAR_ARR: - printf("(%d) %s = ""%s""\n", j, pParamsMap[i].paramName, param); - break; - case TYPE_UCHAR_ARR: - printf("(%d) %s = ""%s""\n", j, pParamsMap[i].paramName, (unsigned char *)(void *)param); - break; - case TYPE_INT: - printf("(%d) %s = %d\n", j, pParamsMap[i].paramName, *(int *)(void *)param); - break; - case TYPE_UINT_HEX: - printf("(%d) %s = %x\n", j, pParamsMap[i].paramName, *(unsigned int *)(void *)param); - break; - default: - // Do nothing - break; + switch (pParamsMap[i].type) { + case TYPE_UINT: + printf("(%d) %s = %u\n", j, pParamsMap[i].paramName, *(unsigned int *)(void *)param); + break; + case TYPE_DOUBLE: + printf("(%d) %s = %.2lf\n", j, pParamsMap[i].paramName, *(double *)(void *)param); + break; + case TYPE_FLOAT: + printf("(%d) %s = %.2f\n", j, pParamsMap[i].paramName, *(float *)(void *)param); + break; + case TYPE_UCHAR: + printf("(%d) %s = %d\n", j, pParamsMap[i].paramName, *(unsigned char *)(void *)param); + break; + case TYPE_USHORT: + printf("(%d) %s = %hu\n", j, pParamsMap[i].paramName, *(unsigned short *)(void *)param); + break; + case TYPE_SHORT: + printf("(%d) %s = %hd\n", j, pParamsMap[i].paramName, *(short *)(void *)param); + break; + case TYPE_ULLONG: + printf("(%d) %s = %llu\n", j, pParamsMap[i].paramName, *(unsigned long long *)(void *)param); + break; + case TYPE_CHAR_ARR: + printf("(%d) %s = " + "%s" + "\n", + j, + pParamsMap[i].paramName, + param); + break; + case TYPE_UCHAR_ARR: + printf("(%d) %s = " + "%s" + "\n", + j, + pParamsMap[i].paramName, + (unsigned char *)(void *)param); + break; + case TYPE_INT: + printf("(%d) %s = %d\n", j, pParamsMap[i].paramName, *(int *)(void *)param); + break; + case TYPE_UINT_HEX: + printf("(%d) %s = %x\n", j, pParamsMap[i].paramName, *(unsigned int *)(void *)param); + break; + default: + // Do nothing + break; } } i++; diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.h index 48ea112f..16d28191 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/config_parser.h @@ -29,7 +29,8 @@ #define _NVMEDIA_TEST_CONFIG_PARSER_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif #include @@ -42,65 +43,65 @@ extern "C" { #define 
MAX_ITEMS_TO_PARSE 10000 -typedef enum _ParamType { - TYPE_UINT = 0, - TYPE_UINT_HEX, - TYPE_INT, - TYPE_DOUBLE, - TYPE_FLOAT, - TYPE_UCHAR, - TYPE_ULLONG, - TYPE_USHORT, - TYPE_CHAR_ARR, - TYPE_UCHAR_ARR, - TYPE_SHORT -} ParamType; + typedef enum _ParamType { + TYPE_UINT = 0, + TYPE_UINT_HEX, + TYPE_INT, + TYPE_DOUBLE, + TYPE_FLOAT, + TYPE_UCHAR, + TYPE_ULLONG, + TYPE_USHORT, + TYPE_CHAR_ARR, + TYPE_UCHAR_ARR, + TYPE_SHORT + } ParamType; -typedef enum { - LIMITS_NONE = 0, - LIMITS_MIN = 1, - LIMITS_BOTH = 2 -} LimitsType; + typedef enum { LIMITS_NONE = 0, LIMITS_MIN = 1, LIMITS_BOTH = 2 } LimitsType; -typedef enum { - SECTION_NONE, - SECTION_CAPTURE, - SECTION_QP, - SECTION_RC, - SECTION_ENCODE_PIC, - SECTION_ENCODE_PIC_H264, - SECTION_ENCODE_PIC_H265, - SECTION_MVC, - SECTION_PAYLOAD, - SECTION_2DPROCESSOR -} SectionType; + typedef enum { + SECTION_NONE, + SECTION_CAPTURE, + SECTION_QP, + SECTION_RC, + SECTION_ENCODE_PIC, + SECTION_ENCODE_PIC_H264, + SECTION_ENCODE_PIC_H265, + SECTION_MVC, + SECTION_PAYLOAD, + SECTION_2DPROCESSOR + } SectionType; -typedef struct { - SectionType secType; - const char *name; - unsigned int lastSectionIndex; - size_t sizeOfStruct; -} SectionMap; + typedef struct + { + SectionType secType; + const char *name; + unsigned int lastSectionIndex; + size_t sizeOfStruct; + } SectionMap; -typedef struct { - const char *paramName; - void *mappedLocation; - ParamType type; - double defaultValue; - LimitsType paramLimits; - double minLimit; - double maxLimit; - unsigned int stringLength; // string param size - unsigned int *stringLengthAddr; // address of string param size - SectionType sectionType; -} ConfigParamsMap; + typedef struct + { + const char *paramName; + void *mappedLocation; + ParamType type; + double defaultValue; + LimitsType paramLimits; + double minLimit; + double maxLimit; + unsigned int stringLength; // string param size + unsigned int *stringLengthAddr; // address of string param size + SectionType sectionType; + } ConfigParamsMap; -NvMediaStatus ConfigParser_InitParamsMap(ConfigParamsMap *paramsMap); -NvMediaStatus ConfigParser_ParseFile(ConfigParamsMap *paramsMap, unsigned int numParams, SectionMap *sectionsMap, char *file); -NvMediaStatus ConfigParser_ValidateParams(ConfigParamsMap *paramsMap, SectionMap *sectionsMap); -NvMediaStatus ConfigParser_DisplayParams(ConfigParamsMap *paramsMap, SectionMap *sectionsMap); -NvMediaStatus ConfigParser_GetSectionIndexByName(SectionMap *sectionsMap, char *sectionName, unsigned int *index); -NvMediaStatus ConfigParser_GetSectionIndexByType(SectionMap *sectionsMap, SectionType sectionType, unsigned int *index); + NvMediaStatus ConfigParser_InitParamsMap(ConfigParamsMap *paramsMap); + NvMediaStatus + ConfigParser_ParseFile(ConfigParamsMap *paramsMap, unsigned int numParams, SectionMap *sectionsMap, char *file); + NvMediaStatus ConfigParser_ValidateParams(ConfigParamsMap *paramsMap, SectionMap *sectionsMap); + NvMediaStatus ConfigParser_DisplayParams(ConfigParamsMap *paramsMap, SectionMap *sectionsMap); + NvMediaStatus ConfigParser_GetSectionIndexByName(SectionMap *sectionsMap, char *sectionName, unsigned int *index); + NvMediaStatus + ConfigParser_GetSectionIndexByType(SectionMap *sectionsMap, SectionType sectionType, unsigned int *index); #ifdef __cplusplus } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.cpp index 7c0e2ba5..34046209 100644 --- 
a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.cpp @@ -26,108 +26,125 @@ */ +#include "image_utils.h" + #include #include #include #include -#include "image_utils.h" #include "misc_utils.h" #include "nvmedia_surface.h" #define MAXM_NUM_SURFACES 6 -typedef struct { - float heightFactor[6]; - float widthFactor[6]; +typedef struct +{ + float heightFactor[6]; + float widthFactor[6]; unsigned int numSurfaces; } ImgUtilSurfParams; -ImgUtilSurfParams ImgSurfParamsTable_RGBA = { +ImgUtilSurfParams ImgSurfParamsTable_RGBA = { .heightFactor = {1, 0, 0, 0, 0, 0}, - .widthFactor = {1, 0, 0, 0, 0, 0}, - .numSurfaces = 1, + .widthFactor = {1, 0, 0, 0, 0, 0}, + .numSurfaces = 1, }; -ImgUtilSurfParams ImgSurfParamsTable_RAW = { +ImgUtilSurfParams ImgSurfParamsTable_RAW = { .heightFactor = {1, 0, 0, 0, 0, 0}, - .widthFactor = {1, 0, 0, 0, 0, 0}, - .numSurfaces = 1, + .widthFactor = {1, 0, 0, 0, 0, 0}, + .numSurfaces = 1, }; ImgUtilSurfParams ImgSurfParamsTable_YUV[][4] = { - { /* PLANAR */ - { /* 420 */ + { + /* PLANAR */ + { + /* 420 */ .heightFactor = {1, 0.5, 0.5, 0, 0, 0}, - .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, + .numSurfaces = 3, }, - { /* 422 */ + { + /* 422 */ .heightFactor = {1, 1, 1, 0, 0, 0}, - .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, + .numSurfaces = 3, }, - { /* 444 */ + { + /* 444 */ .heightFactor = {1, 1, 1, 0, 0, 0}, - .widthFactor = {1, 1, 1, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 1, 1, 0, 0, 0}, + .numSurfaces = 3, }, - { /* 422R */ + { + /* 422R */ .heightFactor = {1, 0.5, 0.5, 0, 0, 0}, - .widthFactor = {1, 1, 1, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 1, 1, 0, 0, 0}, + .numSurfaces = 3, }, }, - { /* SEMI_PLANAR */ - { /* 420 */ + { + /* SEMI_PLANAR */ + { + /* 420 */ .heightFactor = {1, 0.5, 0, 0, 0, 0}, - .widthFactor = {1, 0.5, 0, 0, 0, 0}, - .numSurfaces = 2, + .widthFactor = {1, 0.5, 0, 0, 0, 0}, + .numSurfaces = 2, }, - { /* 422 */ + { + /* 422 */ .heightFactor = {1, 1, 0, 0, 0, 0}, - .widthFactor = {1, 0.5, 0, 0, 0, 0}, - .numSurfaces = 2, + .widthFactor = {1, 0.5, 0, 0, 0, 0}, + .numSurfaces = 2, }, - { /* 444 */ + { + /* 444 */ .heightFactor = {1, 1, 0.5, 0, 0, 0}, - .widthFactor = {1, 1, 0.5, 0, 0, 0}, - .numSurfaces = 2, + .widthFactor = {1, 1, 0.5, 0, 0, 0}, + .numSurfaces = 2, }, - { /* 422R */ + { + /* 422R */ .heightFactor = {1, 0.5, 0.5, 0, 0, 0}, - .widthFactor = {1, 1, 0.5, 0, 0, 0}, - .numSurfaces = 2, + .widthFactor = {1, 1, 0.5, 0, 0, 0}, + .numSurfaces = 2, }, }, - { /* PACKED */ - { /* 420 */ + { + /* PACKED */ + { + /* 420 */ .heightFactor = {1, 0.5, 0.5, 0, 0, 0}, - .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, + .numSurfaces = 3, }, - { /* 422 */ + { + /* 422 */ .heightFactor = {1, 1, 1, 0, 0, 0}, - .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 0.5, 0.5, 0, 0, 0}, + .numSurfaces = 3, }, - { /* 444 */ + { + /* 444 */ .heightFactor = {1, 1, 1, 0, 0, 0}, - .widthFactor = {1, 1, 1, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 1, 1, 0, 0, 0}, + .numSurfaces = 3, }, - { /* 422R */ + { + /* 422R */ .heightFactor = {1, 0.5, 0.5, 0, 0, 0}, - .widthFactor = {1, 1, 1, 0, 0, 0}, - .numSurfaces = 3, + .widthFactor = {1, 1, 1, 0, 0, 0}, + .numSurfaces = 3, }, }, }; -ImgUtilSurfParams ImgSurfParamsTable_Packed = { 
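// Aside: the factor tables above exist to drive a per-plane size computation of the form
//     size[i] = width * widthFactor[i] * height * heightFactor[i] * bytesPerPixel[i]
// (see AllocateBufferToWriteImage() and ReadImageNew() later in this file). Below is a
// minimal, standalone sketch of that math for 8-bit planar YUV 4:2:0; the factor and
// bytes-per-pixel values are copied from the PLANAR/420 rows above, and the program
// itself is illustrative, not code from this sample:
#include <cstdio>

int main()
{
    // PLANAR / 420 rows from ImgSurfParamsTable_YUV, plus the 8-bit planar
    // row {1, 1, 1} from ImgBytesPerPixelTable_YUV.
    const float        heightFactor[3] = {1.0f, 0.5f, 0.5f};
    const float        widthFactor[3]  = {1.0f, 0.5f, 0.5f};
    const unsigned int bpp[3]          = {1, 1, 1};
    const unsigned int width = 1920, height = 1080;

    unsigned int total = 0;
    for (int i = 0; i < 3; i++) {
        unsigned int planeSize =
            (unsigned int)(width * widthFactor[i] * height * heightFactor[i]) * bpp[i];
        printf("plane %d: %u bytes\n", i, planeSize);
        total += planeSize;
    }
    printf("total: %u bytes\n", total); // 1.5 bytes/pixel: 1920 * 1080 * 1.5 = 3110400
    return 0;
}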
+ImgUtilSurfParams ImgSurfParamsTable_Packed = { .heightFactor = {1, 0, 0, 0, 0, 0}, - .widthFactor = {1, 0, 0, 0, 0, 0}, - .numSurfaces = 1, + .widthFactor = {1, 0, 0, 0, 0, 0}, + .numSurfaces = 1, }; @@ -139,8 +156,7 @@ unsigned int ImgBytesPerPixelTable_RGBA16[][6] = { {8, 0, 0, 0, 0, 0}, /* 16 */ }; -unsigned int ImgBytesPerPixelTable_RG16[6] = - {4, 0, 0, 0, 0, 0}; +unsigned int ImgBytesPerPixelTable_RG16[6] = {4, 0, 0, 0, 0, 0}; unsigned int ImgBytesPerPixelTable_Alpha[][6] = { {1, 0, 0, 0, 0, 0}, /* 8 */ @@ -164,37 +180,34 @@ unsigned int ImgBytesPerPixelTable_RAW[][6] = { {4, 0, 0, 0, 0, 0}, /* 20 */ }; -unsigned int ImgBytesPerPixelTable_YUV[][9][6] = { - { /* PLANAR */ - {1, 1, 1, 0, 0, 0}, /* 8 */ - {2, 2, 2, 0, 0, 0}, /* 10 */ - {2, 2, 2, 0, 0, 0}, /* 12 */ - {2, 2, 2, 0, 0, 0}, /* 14 */ - {2, 2, 2, 0, 0, 0}, /* 16 */ - {4, 4, 4, 0, 0, 0}, /* 32 */ - {2, 1, 1, 0, 0, 0}, /* 16_8_8 */ - {2, 1, 1, 0, 0, 0}, /* 10_8_8 */ - {4, 0, 0, 0, 0, 0}, /* 2_10_10_10 */ - }, - { /* SEMI_PLANAR */ - {1, 2, 0, 0, 0, 0}, /* 8 */ - {2, 4, 0, 0, 0, 0}, /* 10 */ - {2, 4, 0, 0, 0, 0}, /* 12 */ - {2, 4, 0, 0, 0, 0}, /* 14 */ - {2, 4, 0, 0, 0, 0}, /* 16 */ - {4, 8, 0, 0, 0, 0}, /* 32 */ - {2, 2, 0, 0, 0, 0}, /* 16_8_8 */ - {2, 2, 0, 0, 0, 0}, /* 10_8_8 */ - {4, 0, 0, 0, 0, 0}, /* 2_10_10_10 */ - } -}; +unsigned int ImgBytesPerPixelTable_YUV[][9][6] = {{ + /* PLANAR */ + {1, 1, 1, 0, 0, 0}, /* 8 */ + {2, 2, 2, 0, 0, 0}, /* 10 */ + {2, 2, 2, 0, 0, 0}, /* 12 */ + {2, 2, 2, 0, 0, 0}, /* 14 */ + {2, 2, 2, 0, 0, 0}, /* 16 */ + {4, 4, 4, 0, 0, 0}, /* 32 */ + {2, 1, 1, 0, 0, 0}, /* 16_8_8 */ + {2, 1, 1, 0, 0, 0}, /* 10_8_8 */ + {4, 0, 0, 0, 0, 0}, /* 2_10_10_10 */ + }, + { + /* SEMI_PLANAR */ + {1, 2, 0, 0, 0, 0}, /* 8 */ + {2, 4, 0, 0, 0, 0}, /* 10 */ + {2, 4, 0, 0, 0, 0}, /* 12 */ + {2, 4, 0, 0, 0, 0}, /* 14 */ + {2, 4, 0, 0, 0, 0}, /* 16 */ + {4, 8, 0, 0, 0, 0}, /* 32 */ + {2, 2, 0, 0, 0, 0}, /* 16_8_8 */ + {2, 2, 0, 0, 0, 0}, /* 10_8_8 */ + {4, 0, 0, 0, 0, 0}, /* 2_10_10_10 */ + }}; -static NvMediaStatus -GetBytesPerCompForPackedYUV(unsigned int surfBPCidx, - unsigned int *bytespercomp -) +static NvMediaStatus GetBytesPerCompForPackedYUV(unsigned int surfBPCidx, unsigned int *bytespercomp) { - switch(surfBPCidx) { + switch (surfBPCidx) { case NVM_SURF_ATTR_BITS_PER_COMPONENT_8: case NVM_SURF_ATTR_BITS_PER_COMPONENT_LAYOUT_2_10_10_10: *bytespercomp = 1; @@ -215,116 +228,120 @@ GetBytesPerCompForPackedYUV(unsigned int surfBPCidx, return NVMEDIA_STATUS_ERROR; } return NVMEDIA_STATUS_OK; - } -static NvMediaStatus -GetSurfParams(unsigned int surfaceType, - float **xScale, - float **yScale, - unsigned int **bytePerPixel, - uint32_t *numSurfacesVal) +static NvMediaStatus GetSurfParams(unsigned int surfaceType, + float **xScale, + float **yScale, + unsigned int **bytePerPixel, + uint32_t *numSurfacesVal) { NvMediaStatus status; - unsigned int surfType, surfMemoryType, surfSubSamplingType, surfBPC, surfCompOrder; - float *xScalePtr = NULL, *yScalePtr = NULL; + unsigned int surfType, surfMemoryType, surfSubSamplingType, surfBPC, surfCompOrder; + float *xScalePtr = NULL, *yScalePtr = NULL; unsigned int *bytePerPixelPtr = NULL; NVM_SURF_FMT_DEFINE_ATTR(srcAttr); - uint32_t numSurfaces = 1; + uint32_t numSurfaces = 1; static unsigned int yuvpackedtbl[6] = {1, 0, 0, 0, 0, 0}; - unsigned int numcomps = 1; + unsigned int numcomps = 1; - status = NvMediaSurfaceFormatGetAttrs(surfaceType, - srcAttr, - NVM_SURF_FMT_ATTR_MAX); + status = NvMediaSurfaceFormatGetAttrs(surfaceType, srcAttr, NVM_SURF_FMT_ATTR_MAX); if (status != 
NVMEDIA_STATUS_OK) { printf("%s:NvMediaSurfaceFormatGetAttrs failed\n", __func__); return NVMEDIA_STATUS_ERROR; } - surfType = srcAttr[NVM_SURF_ATTR_SURF_TYPE].value; - surfMemoryType = srcAttr[NVM_SURF_ATTR_MEMORY].value; + surfType = srcAttr[NVM_SURF_ATTR_SURF_TYPE].value; + surfMemoryType = srcAttr[NVM_SURF_ATTR_MEMORY].value; surfSubSamplingType = srcAttr[NVM_SURF_ATTR_SUB_SAMPLING_TYPE].value; - surfBPC = srcAttr[NVM_SURF_ATTR_BITS_PER_COMPONENT].value; - surfCompOrder = srcAttr[NVM_SURF_ATTR_COMPONENT_ORDER].value; + surfBPC = srcAttr[NVM_SURF_ATTR_BITS_PER_COMPONENT].value; + surfCompOrder = srcAttr[NVM_SURF_ATTR_COMPONENT_ORDER].value; - switch(surfType) { - case NVM_SURF_ATTR_SURF_TYPE_YUV: - if (surfSubSamplingType == NVM_SURF_ATTR_SUB_SAMPLING_TYPE_NONE && - surfMemoryType == NVM_SURF_ATTR_MEMORY_PACKED) { + switch (surfType) { + case NVM_SURF_ATTR_SURF_TYPE_YUV: + if (surfSubSamplingType == NVM_SURF_ATTR_SUB_SAMPLING_TYPE_NONE + && surfMemoryType == NVM_SURF_ATTR_MEMORY_PACKED) { - xScalePtr = &ImgSurfParamsTable_Packed.widthFactor[0]; - yScalePtr = &ImgSurfParamsTable_Packed.heightFactor[0]; - numSurfaces = ImgSurfParamsTable_Packed.numSurfaces; + xScalePtr = &ImgSurfParamsTable_Packed.widthFactor[0]; + yScalePtr = &ImgSurfParamsTable_Packed.heightFactor[0]; + numSurfaces = ImgSurfParamsTable_Packed.numSurfaces; - if (NVMEDIA_STATUS_OK != GetBytesPerCompForPackedYUV(surfBPC, &yuvpackedtbl[0])) { - printf("Invalid Bits per component and Packed YUV combination\n"); - return NVMEDIA_STATUS_ERROR; - } - - switch(surfCompOrder) { - case NVM_SURF_ATTR_COMPONENT_ORDER_VUYX: - case NVM_SURF_ATTR_COMPONENT_ORDER_XYUV: - case NVM_SURF_ATTR_COMPONENT_ORDER_XUYV: - numcomps = 4; - break; - case NVM_SURF_ATTR_COMPONENT_ORDER_UYVY: - case NVM_SURF_ATTR_COMPONENT_ORDER_VYUY: - case NVM_SURF_ATTR_COMPONENT_ORDER_YVYU: - case NVM_SURF_ATTR_COMPONENT_ORDER_YUYV: - numcomps = 2; - break; - case NVM_SURF_ATTR_COMPONENT_ORDER_LUMA: - numcomps = 1; - break; - default: - printf("Invalid component Order and Packed YUV combination\n"); - return NVMEDIA_STATUS_ERROR; - } - yuvpackedtbl[0] = yuvpackedtbl[0] * numcomps; - bytePerPixelPtr = &yuvpackedtbl[0]; - - } else { - xScalePtr = &ImgSurfParamsTable_YUV[0][surfSubSamplingType - NVM_SURF_ATTR_SUB_SAMPLING_TYPE_420].widthFactor[0]; - yScalePtr = &ImgSurfParamsTable_YUV[0][surfSubSamplingType - NVM_SURF_ATTR_SUB_SAMPLING_TYPE_420].heightFactor[0]; - numSurfaces = ImgSurfParamsTable_YUV[0][surfSubSamplingType - NVM_SURF_ATTR_SUB_SAMPLING_TYPE_420].numSurfaces; - bytePerPixelPtr = &ImgBytesPerPixelTable_YUV[0][surfBPC - NVM_SURF_ATTR_BITS_PER_COMPONENT_8][0]; + if (NVMEDIA_STATUS_OK != GetBytesPerCompForPackedYUV(surfBPC, &yuvpackedtbl[0])) { + printf("Invalid Bits per component and Packed YUV combination\n"); + return NVMEDIA_STATUS_ERROR; } - break; - case NVM_SURF_ATTR_SURF_TYPE_RGBA: - if (surfCompOrder == NVM_SURF_ATTR_COMPONENT_ORDER_ALPHA) { - bytePerPixelPtr = &ImgBytesPerPixelTable_Alpha[surfBPC - NVM_SURF_ATTR_BITS_PER_COMPONENT_8][0]; - } else if (surfCompOrder == NVM_SURF_ATTR_COMPONENT_ORDER_RG) { - if(surfBPC == NVM_SURF_ATTR_BITS_PER_COMPONENT_16) { - bytePerPixelPtr = &ImgBytesPerPixelTable_RG16[0]; - } else { - printf("Invalid RGorder & Bitspercomp combination.Only RG16 is supported\n"); - return NVMEDIA_STATUS_ERROR; - } - } else { /* RGBA, ARGB, BGRA */ - if (surfBPC == NVM_SURF_ATTR_BITS_PER_COMPONENT_16) { - bytePerPixelPtr = &ImgBytesPerPixelTable_RGBA16[0][0]; - } else if (surfBPC == NVM_SURF_ATTR_BITS_PER_COMPONENT_8) { - 
bytePerPixelPtr = &ImgBytesPerPixelTable_RGBA[0][0]; - } else { - printf("RGBA orders with 8 and 16bits only is supported \n"); - return NVMEDIA_STATUS_ERROR; - } + switch (surfCompOrder) { + case NVM_SURF_ATTR_COMPONENT_ORDER_VUYX: + case NVM_SURF_ATTR_COMPONENT_ORDER_XYUV: + case NVM_SURF_ATTR_COMPONENT_ORDER_XUYV: + numcomps = 4; + break; + case NVM_SURF_ATTR_COMPONENT_ORDER_UYVY: + case NVM_SURF_ATTR_COMPONENT_ORDER_VYUY: + case NVM_SURF_ATTR_COMPONENT_ORDER_YVYU: + case NVM_SURF_ATTR_COMPONENT_ORDER_YUYV: + numcomps = 2; + break; + case NVM_SURF_ATTR_COMPONENT_ORDER_LUMA: + numcomps = 1; + break; + default: + printf("Invalid component Order and Packed YUV combination\n"); + return NVMEDIA_STATUS_ERROR; } - xScalePtr = &ImgSurfParamsTable_RGBA.widthFactor[0]; - yScalePtr = &ImgSurfParamsTable_RGBA.heightFactor[0]; - numSurfaces = ImgSurfParamsTable_RGBA.numSurfaces; - break; - case NVM_SURF_ATTR_SURF_TYPE_RAW: - bytePerPixelPtr = &ImgBytesPerPixelTable_RAW[surfBPC - NVM_SURF_ATTR_BITS_PER_COMPONENT_8][0]; - xScalePtr = &ImgSurfParamsTable_RAW.widthFactor[0]; - yScalePtr = &ImgSurfParamsTable_RAW.heightFactor[0]; - numSurfaces = ImgSurfParamsTable_RAW.numSurfaces; - break; - default: - printf("%s: Unsupported Pixel Format %d", __func__, surfType); - return NVMEDIA_STATUS_ERROR; + yuvpackedtbl[0] = yuvpackedtbl[0] * numcomps; + bytePerPixelPtr = &yuvpackedtbl[0]; + } + else { + xScalePtr = + &ImgSurfParamsTable_YUV[0][surfSubSamplingType - NVM_SURF_ATTR_SUB_SAMPLING_TYPE_420].widthFactor[0]; + yScalePtr = + &ImgSurfParamsTable_YUV[0][surfSubSamplingType - NVM_SURF_ATTR_SUB_SAMPLING_TYPE_420].heightFactor[0]; + numSurfaces = + ImgSurfParamsTable_YUV[0][surfSubSamplingType - NVM_SURF_ATTR_SUB_SAMPLING_TYPE_420].numSurfaces; + bytePerPixelPtr = &ImgBytesPerPixelTable_YUV[0][surfBPC - NVM_SURF_ATTR_BITS_PER_COMPONENT_8][0]; + } + + break; + case NVM_SURF_ATTR_SURF_TYPE_RGBA: + if (surfCompOrder == NVM_SURF_ATTR_COMPONENT_ORDER_ALPHA) { + bytePerPixelPtr = &ImgBytesPerPixelTable_Alpha[surfBPC - NVM_SURF_ATTR_BITS_PER_COMPONENT_8][0]; + } + else if (surfCompOrder == NVM_SURF_ATTR_COMPONENT_ORDER_RG) { + if (surfBPC == NVM_SURF_ATTR_BITS_PER_COMPONENT_16) { + bytePerPixelPtr = &ImgBytesPerPixelTable_RG16[0]; + } + else { + printf("Invalid RGorder & Bitspercomp combination.Only RG16 is supported\n"); + return NVMEDIA_STATUS_ERROR; + } + } + else { /* RGBA, ARGB, BGRA */ + if (surfBPC == NVM_SURF_ATTR_BITS_PER_COMPONENT_16) { + bytePerPixelPtr = &ImgBytesPerPixelTable_RGBA16[0][0]; + } + else if (surfBPC == NVM_SURF_ATTR_BITS_PER_COMPONENT_8) { + bytePerPixelPtr = &ImgBytesPerPixelTable_RGBA[0][0]; + } + else { + printf("RGBA orders with 8 and 16bits only is supported \n"); + return NVMEDIA_STATUS_ERROR; + } + } + xScalePtr = &ImgSurfParamsTable_RGBA.widthFactor[0]; + yScalePtr = &ImgSurfParamsTable_RGBA.heightFactor[0]; + numSurfaces = ImgSurfParamsTable_RGBA.numSurfaces; + break; + case NVM_SURF_ATTR_SURF_TYPE_RAW: + bytePerPixelPtr = &ImgBytesPerPixelTable_RAW[surfBPC - NVM_SURF_ATTR_BITS_PER_COMPONENT_8][0]; + xScalePtr = &ImgSurfParamsTable_RAW.widthFactor[0]; + yScalePtr = &ImgSurfParamsTable_RAW.heightFactor[0]; + numSurfaces = ImgSurfParamsTable_RAW.numSurfaces; + break; + default: + printf("%s: Unsupported Pixel Format %d", __func__, surfType); + return NVMEDIA_STATUS_ERROR; } if (xScale) { @@ -344,43 +361,39 @@ GetSurfParams(unsigned int surfaceType, } NvMediaStatus -AllocateBufferToWriteImage( - Blit2DTest *ctx, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - NvMediaBool 
appendFlag) +AllocateBufferToWriteImage(Blit2DTest *ctx, NvMediaImage *image, NvMediaBool uvOrderFlag, NvMediaBool appendFlag) { - uint32_t imageSize = 0; - unsigned int size[3] ={0}; - uint8_t *buffer = NULL; - uint32_t i, k, newk = 0; + uint32_t imageSize = 0; + unsigned int size[3] = {0}; + uint8_t *buffer = NULL; + uint32_t i, k, newk = 0; unsigned int *bytePerPixelPtr = NULL; - ctx->numSurfaces = 1; + ctx->numSurfaces = 1; NvMediaImageSurfaceMap surfaceMap; - NvMediaStatus status = NVMEDIA_STATUS_ERROR; - uint32_t lineWidth, numRows, startOffset; + NvMediaStatus status = NVMEDIA_STATUS_ERROR; + uint32_t lineWidth, numRows, startOffset; - if(!image) { + if (!image) { printf("%s: Bad parameter\n", __func__); return NVMEDIA_STATUS_BAD_PARAMETER; } status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImageLock() failed\n", __func__); return status; } NvMediaImageUnlock(image); - ctx->dstBuff = (uint8_t**) malloc(sizeof(uint8_t*)*MAXM_NUM_SURFACES); - if(!ctx->dstBuff) { + ctx->dstBuff = (uint8_t **)malloc(sizeof(uint8_t *) * MAXM_NUM_SURFACES); + if (!ctx->dstBuff) { printf("%s: Out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } - ctx->dstBuffPitches = (uint32_t*) calloc(1,sizeof(uint32_t) * MAXM_NUM_SURFACES); - if(!ctx->dstBuffPitches) { + ctx->dstBuffPitches = (uint32_t *)calloc(1, sizeof(uint32_t) * MAXM_NUM_SURFACES); + if (!ctx->dstBuffPitches) { printf("%s: Out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; @@ -389,18 +402,14 @@ AllocateBufferToWriteImage( ctx->heightSurface = surfaceMap.height; ctx->widthSurface = surfaceMap.width; - status = GetSurfParams(image->type, - &ctx->xScalePtr, - &ctx->yScalePtr, - &bytePerPixelPtr, - &ctx->numSurfaces); - if(status != NVMEDIA_STATUS_OK) { + status = GetSurfParams(image->type, &ctx->xScalePtr, &ctx->yScalePtr, &bytePerPixelPtr, &ctx->numSurfaces); + if (status != NVMEDIA_STATUS_OK) { printf("%s: GetSurfParams() failed\n", __func__); goto done; } imageSize = 0; - for(i = 0; i < ctx->numSurfaces; i++) { + for (i = 0; i < ctx->numSurfaces; i++) { size[i] = (ctx->widthSurface * ctx->xScalePtr[i] * ctx->heightSurface * ctx->yScalePtr[i] * bytePerPixelPtr[i]); imageSize += size[i]; ctx->dstBuffPitches[i] = (uint32_t)((float)ctx->widthSurface * ctx->xScalePtr[i]) * bytePerPixelPtr[i]; @@ -412,8 +421,8 @@ AllocateBufferToWriteImage( imageSize += image->embeddedDataTopSize; imageSize += image->embeddedDataBottomSize; - buffer = (uint8_t *) calloc(1, imageSize); - if(!buffer) { + buffer = (uint8_t *)calloc(1, imageSize); + if (!buffer) { printf("%s: Out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; @@ -421,33 +430,31 @@ AllocateBufferToWriteImage( ctx->dstBuffer = buffer; memset(buffer, 0xFF, imageSize); - for(i = 0; i < ctx->numSurfaces; i++) { + for (i = 0; i < ctx->numSurfaces; i++) { ctx->dstBuff[i] = buffer; - buffer = buffer + (uint32_t)(ctx->heightSurface * ctx->yScalePtr[i] * ctx->dstBuffPitches[i]); + buffer = buffer + (uint32_t)(ctx->heightSurface * ctx->yScalePtr[i] * ctx->dstBuffPitches[i]); } done: return status; } -NvMediaStatus -WriteImageToAllocatedBuffer( - Blit2DTest *ctx, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - NvMediaBool appendFlag, - uint32_t bytesPerPixel) +NvMediaStatus WriteImageToAllocatedBuffer(Blit2DTest *ctx, + NvMediaImage *image, + NvMediaBool uvOrderFlag, + NvMediaBool appendFlag, + uint32_t bytesPerPixel) 
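// Aside: WriteImageToAllocatedBuffer(), whose body follows, wraps its pixel copy in the
// same lock -> copy -> unlock sequence used throughout this file (ReadImageNew() and
// InitImage() do the same around NvMediaImagePutBits). A condensed sketch of that
// sequence, using the same NvMedia calls as this file; the helper name
// copyImageToBuffers is hypothetical, and the NvMedia headers included at the top of
// this file are assumed:
static NvMediaStatus copyImageToBuffers(NvMediaImage *image, uint8_t **planes, uint32_t *pitches)
{
    NvMediaImageSurfaceMap surfaceMap;
    NvMediaStatus status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap);
    if (status != NVMEDIA_STATUS_OK) {
        printf("%s: NvMediaImageLock() failed\n", __func__);
        return status;
    }
    // Copy all planes into the caller's buffers, then drop the lock before
    // inspecting the result, mirroring the order used throughout this file.
    status = NvMediaImageGetBits(image, NULL, (void **)planes, pitches);
    NvMediaImageUnlock(image);
    return status;
}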
{ NvMediaImageSurfaceMap surfaceMap; NvMediaStatus status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImageLock() failed\n", __func__); goto done; } status = NvMediaImageGetBits(image, NULL, (void **)ctx->dstBuff, ctx->dstBuffPitches); NvMediaImageUnlock(image); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaVideoSurfaceGetBits() failed \n", __func__); goto done; } @@ -458,41 +465,39 @@ done: } -static NvMediaStatus -ReadImageNew( - char *fileName, - uint32_t frameNum, - uint32_t width, - uint32_t height, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - uint32_t bytesPerPixel, - uint32_t pixelAlignment) +static NvMediaStatus ReadImageNew(char *fileName, + uint32_t frameNum, + uint32_t width, + uint32_t height, + NvMediaImage *image, + NvMediaBool uvOrderFlag, + uint32_t bytesPerPixel, + uint32_t pixelAlignment) { - uint8_t **pBuff = NULL; - uint32_t *pBuffPitches = NULL; - uint32_t imageSize = 0,surfaceSize = 0; - uint8_t *buffer = NULL; - uint8_t *pBuffer = NULL; - uint32_t i, j, k, newk = 0; - float *xScalePtr = NULL, *yScalePtr = NULL; - unsigned int *bytePerPixelPtr = NULL; - uint32_t numSurfaces = 1; - unsigned int uHeightSurface, uWidthSurface; + uint8_t **pBuff = NULL; + uint32_t *pBuffPitches = NULL; + uint32_t imageSize = 0, surfaceSize = 0; + uint8_t *buffer = NULL; + uint8_t *pBuffer = NULL; + uint32_t i, j, k, newk = 0; + float *xScalePtr = NULL, *yScalePtr = NULL; + unsigned int *bytePerPixelPtr = NULL; + uint32_t numSurfaces = 1; + unsigned int uHeightSurface, uWidthSurface; NvMediaImageSurfaceMap surfaceMap; - NvMediaStatus status = NVMEDIA_STATUS_ERROR; - FILE *file = NULL; - unsigned int count, index; + NvMediaStatus status = NVMEDIA_STATUS_ERROR; + FILE *file = NULL; + unsigned int count, index; NVM_SURF_FMT_DEFINE_ATTR(srcAttr); unsigned int surfType, surfBPC; - if(!image || !fileName) { + if (!image || !fileName) { printf("%s: Bad parameter\n", __func__); return NVMEDIA_STATUS_BAD_PARAMETER; } status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImageLock() failed\n", __func__); return status; } @@ -501,63 +506,57 @@ ReadImageNew( uHeightSurface = surfaceMap.height; uWidthSurface = surfaceMap.width; - if(width > uWidthSurface || height > uHeightSurface) { + if (width > uWidthSurface || height > uHeightSurface) { printf("%s: Bad parameter\n", __func__); return NVMEDIA_STATUS_BAD_PARAMETER; } - pBuff = (uint8_t **) malloc(sizeof(uint8_t*)*MAXM_NUM_SURFACES); - if(!pBuff) { + pBuff = (uint8_t **)malloc(sizeof(uint8_t *) * MAXM_NUM_SURFACES); + if (!pBuff) { printf("%s: Out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } - pBuffPitches = (uint32_t *)calloc(1,sizeof(uint32_t) * MAXM_NUM_SURFACES); - if(!pBuffPitches) { + pBuffPitches = (uint32_t *)calloc(1, sizeof(uint32_t) * MAXM_NUM_SURFACES); + if (!pBuffPitches) { printf("%s: Out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } - status = GetSurfParams(image->type, - &xScalePtr, - &yScalePtr, - &bytePerPixelPtr, - &numSurfaces); - if(status != NVMEDIA_STATUS_OK) { + status = GetSurfParams(image->type, &xScalePtr, &yScalePtr, &bytePerPixelPtr, &numSurfaces); + if (status != NVMEDIA_STATUS_OK) { printf("%s: GetSurfParams() failed\n", __func__); goto done; } - status = 
NvMediaSurfaceFormatGetAttrs(image->type, - srcAttr, - NVM_SURF_FMT_ATTR_MAX); + status = NvMediaSurfaceFormatGetAttrs(image->type, srcAttr, NVM_SURF_FMT_ATTR_MAX); if (status != NVMEDIA_STATUS_OK) { printf("%s:NvMediaSurfaceFormatGetAttrs failed\n", __func__); goto done; } surfType = srcAttr[NVM_SURF_ATTR_SURF_TYPE].value; - surfBPC = srcAttr[NVM_SURF_ATTR_BITS_PER_COMPONENT].value; + surfBPC = srcAttr[NVM_SURF_ATTR_BITS_PER_COMPONENT].value; surfaceSize = 0; - imageSize = 0; - for(i = 0; i < numSurfaces; i++) { + imageSize = 0; + for (i = 0; i < numSurfaces; i++) { surfaceSize += (uWidthSurface * xScalePtr[i] * uHeightSurface * yScalePtr[i] * bytePerPixelPtr[i]); imageSize += (width * xScalePtr[i] * height * yScalePtr[i] * bytePerPixelPtr[i]); pBuffPitches[i] = (uint32_t)((float)uWidthSurface * xScalePtr[i]) * bytePerPixelPtr[i]; } buffer = (uint8_t *)calloc(1, surfaceSize); - if(!buffer) { + if (!buffer) { printf("%s: Out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } pBuffer = buffer; - memset(buffer,0x10,surfaceSize); - for(i = 0; i < numSurfaces; i++) { + memset(buffer, 0x10, surfaceSize); + for (i = 0; i < numSurfaces; i++) { pBuff[i] = buffer; if (i) { memset(pBuff[i], 0x80, (uHeightSurface * yScalePtr[i] * pBuffPitches[i])); @@ -566,28 +565,28 @@ ReadImageNew( } file = fopen(fileName, "rb"); - if(!file) { + if (!file) { printf("%s: Error opening file: %s\n", __func__, fileName); status = NVMEDIA_STATUS_ERROR; goto done; } - if(frameNum > 0) { - if(fseeko(file, frameNum * (off_t)imageSize, SEEK_SET)) { + if (frameNum > 0) { + if (fseeko(file, frameNum * (off_t)imageSize, SEEK_SET)) { printf("ReadImage: Error seeking file: %s\n", fileName); status = NVMEDIA_STATUS_ERROR; goto done; } } - if((surfType == NVM_SURF_ATTR_SURF_TYPE_RGBA ) && strstr(fileName, ".png")) { + if ((surfType == NVM_SURF_ATTR_SURF_TYPE_RGBA) && strstr(fileName, ".png")) { printf("ReadImage: Does not support png format\n"); status = NVMEDIA_STATUS_ERROR; goto done; } - for(k = 0; k < numSurfaces; k++) { - for(j = 0; j < height*yScalePtr[k]; j++) { - newk = (!uvOrderFlag && k ) ? (numSurfaces - k) : k; + for (k = 0; k < numSurfaces; k++) { + for (j = 0; j < height * yScalePtr[k]; j++) { + newk = (!uvOrderFlag && k) ? 
(numSurfaces - k) : k; index = j * pBuffPitches[newk]; count = width * xScalePtr[newk] * bytePerPixelPtr[newk]; if (fread(pBuff[newk] + index, count, 1, file) != 1) { @@ -595,44 +594,44 @@ ReadImageNew( printf("ReadImage: Error reading file: %s\n", fileName); goto done; } - if((surfType == NVM_SURF_ATTR_SURF_TYPE_YUV) && (pixelAlignment == LSB_ALIGNED)) { - uint16_t *psrc = (uint16_t*)(pBuff[newk] + index); - switch(surfBPC) { - case NVM_SURF_ATTR_BITS_PER_COMPONENT_10: - for(i = 0; i < count/2; i++) { - *(psrc + i) = (*(psrc + i)) << (16 - 10); - } - break; - case NVM_SURF_ATTR_BITS_PER_COMPONENT_12: - for(i = 0; i < count/2; i++) { - *(psrc + i) = (*(psrc + i)) << (16 - 12); - } - break; - case NVM_SURF_ATTR_BITS_PER_COMPONENT_14: - for(i = 0; i < count/2; i++) { - *(psrc + i) = (*(psrc + i)) << (16 - 14); - } - break; - default: - break; + if ((surfType == NVM_SURF_ATTR_SURF_TYPE_YUV) && (pixelAlignment == LSB_ALIGNED)) { + uint16_t *psrc = (uint16_t *)(pBuff[newk] + index); + switch (surfBPC) { + case NVM_SURF_ATTR_BITS_PER_COMPONENT_10: + for (i = 0; i < count / 2; i++) { + *(psrc + i) = (*(psrc + i)) << (16 - 10); + } + break; + case NVM_SURF_ATTR_BITS_PER_COMPONENT_12: + for (i = 0; i < count / 2; i++) { + *(psrc + i) = (*(psrc + i)) << (16 - 12); + } + break; + case NVM_SURF_ATTR_BITS_PER_COMPONENT_14: + for (i = 0; i < count / 2; i++) { + *(psrc + i) = (*(psrc + i)) << (16 - 14); + } + break; + default: + break; } } } } status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImageLock() failed\n", __func__); goto done; } status = NvMediaImagePutBits(image, NULL, (void **)pBuff, pBuffPitches); NvMediaImageUnlock(image); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: Failed to put bits\n", __func__); } done: - if(pBuff) { + if (pBuff) { free(pBuff); } @@ -644,73 +643,58 @@ done: free(pBuffer); } - if(file) { + if (file) { fclose(file); } return status; } -NvMediaStatus -ReadImage( - char *fileName, - uint32_t frameNum, - uint32_t width, - uint32_t height, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - uint32_t bytesPerPixel, - uint32_t pixelAlignment) +NvMediaStatus ReadImage(char *fileName, + uint32_t frameNum, + uint32_t width, + uint32_t height, + NvMediaImage *image, + NvMediaBool uvOrderFlag, + uint32_t bytesPerPixel, + uint32_t pixelAlignment) { NvMediaStatus status; NVM_SURF_FMT_DEFINE_ATTR(srcAttr); - status = NvMediaSurfaceFormatGetAttrs(image->type, - srcAttr, - NVM_SURF_FMT_ATTR_MAX); + status = NvMediaSurfaceFormatGetAttrs(image->type, srcAttr, NVM_SURF_FMT_ATTR_MAX); if (status == NVMEDIA_STATUS_OK) { - return ReadImageNew( - fileName, - frameNum, - width, - height, - image, - uvOrderFlag, - bytesPerPixel, - pixelAlignment); - } else { + return ReadImageNew(fileName, frameNum, width, height, image, uvOrderFlag, bytesPerPixel, pixelAlignment); + } + else { printf("%s:NvMediaSurfaceFormatGetAttrs failed\n", __func__); return status; } } -NvMediaStatus -InitImage( - NvMediaImage *image, - uint32_t width, - uint32_t height) +NvMediaStatus InitImage(NvMediaImage *image, uint32_t width, uint32_t height) { - uint8_t **pBuff = NULL; - uint32_t *pBuffPitches = NULL; - uint32_t imageSize = 0,surfaceSize = 0; - uint8_t *buffer = NULL; - uint8_t *pBuffer = NULL; - float *xScalePtr = NULL, *yScalePtr = NULL; - unsigned int *bytePerPixelPtr = NULL; - uint32_t numSurfaces = 1; - uint32_t i; - unsigned int uHeightSurface, 
uWidthSurface; + uint8_t **pBuff = NULL; + uint32_t *pBuffPitches = NULL; + uint32_t imageSize = 0, surfaceSize = 0; + uint8_t *buffer = NULL; + uint8_t *pBuffer = NULL; + float *xScalePtr = NULL, *yScalePtr = NULL; + unsigned int *bytePerPixelPtr = NULL; + uint32_t numSurfaces = 1; + uint32_t i; + unsigned int uHeightSurface, uWidthSurface; NvMediaImageSurfaceMap surfaceMap; - NvMediaStatus status = NVMEDIA_STATUS_ERROR; + NvMediaStatus status = NVMEDIA_STATUS_ERROR; NVM_SURF_FMT_DEFINE_ATTR(srcAttr); - if(!image) { + if (!image) { printf("%s: Bad parameter\n", __func__); return NVMEDIA_STATUS_BAD_PARAMETER; } status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImageLock failed\n", __func__); return status; } @@ -720,78 +704,72 @@ InitImage( uHeightSurface = surfaceMap.height; uWidthSurface = surfaceMap.width; - if(width > uWidthSurface || height > uHeightSurface) { + if (width > uWidthSurface || height > uHeightSurface) { printf("%s: Bad parameter\n", __func__); return NVMEDIA_STATUS_BAD_PARAMETER; } - pBuff = (uint8_t **) calloc(1,sizeof(uint8_t*)*MAXM_NUM_SURFACES); - if(!pBuff) { + pBuff = (uint8_t **)calloc(1, sizeof(uint8_t *) * MAXM_NUM_SURFACES); + if (!pBuff) { printf("%s: out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } - pBuffPitches = (uint32_t *) calloc(1,sizeof(uint32_t) * MAXM_NUM_SURFACES); - if(!pBuffPitches) { + pBuffPitches = (uint32_t *)calloc(1, sizeof(uint32_t) * MAXM_NUM_SURFACES); + if (!pBuffPitches) { printf("%s: out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } - status = GetSurfParams(image->type, - &xScalePtr, - &yScalePtr, - &bytePerPixelPtr, - &numSurfaces); - if(status != NVMEDIA_STATUS_OK) { + status = GetSurfParams(image->type, &xScalePtr, &yScalePtr, &bytePerPixelPtr, &numSurfaces); + if (status != NVMEDIA_STATUS_OK) { printf("%s: GetSurfParams failed\n", __func__); goto done; } - status = NvMediaSurfaceFormatGetAttrs(image->type, - srcAttr, - NVM_SURF_FMT_ATTR_MAX); + status = NvMediaSurfaceFormatGetAttrs(image->type, srcAttr, NVM_SURF_FMT_ATTR_MAX); if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaSurfaceFormatGetAttrs failed\n", __func__); goto done; } surfaceSize = 0; - imageSize = 0; - for(i = 0; i < numSurfaces; i++) { + imageSize = 0; + for (i = 0; i < numSurfaces; i++) { surfaceSize += (uWidthSurface * xScalePtr[i] * uHeightSurface * yScalePtr[i] * bytePerPixelPtr[i]); imageSize += (width * xScalePtr[i] * height * yScalePtr[i] * bytePerPixelPtr[i]); pBuffPitches[i] = (uint32_t)((float)uWidthSurface * xScalePtr[i]) * bytePerPixelPtr[i]; } buffer = (uint8_t *)calloc(1, surfaceSize); - if(!buffer) { + if (!buffer) { printf("%s: out of memory\n", __func__); status = NVMEDIA_STATUS_OUT_OF_MEMORY; goto done; } pBuffer = buffer; - memset(buffer,0x00,surfaceSize); - for(i = 0; i < numSurfaces; i++) { + memset(buffer, 0x00, surfaceSize); + for (i = 0; i < numSurfaces; i++) { pBuff[i] = buffer; - buffer = buffer + (uint32_t)(uHeightSurface * yScalePtr[i] * pBuffPitches[i]); + buffer = buffer + (uint32_t)(uHeightSurface * yScalePtr[i] * pBuffPitches[i]); } status = NvMediaImageLock(image, NVMEDIA_IMAGE_ACCESS_WRITE, &surfaceMap); - if(status != NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImageLock failed\n", __func__); goto done; } status = NvMediaImagePutBits(image, NULL, (void **)pBuff, pBuffPitches); NvMediaImageUnlock(image); - if(status != 
NVMEDIA_STATUS_OK) { + if (status != NVMEDIA_STATUS_OK) { printf("%s: NvMediaImagePutBits failed\n", __func__); } done: - if(pBuff) { + if (pBuff) { free(pBuff); } @@ -805,4 +783,3 @@ done: return status; } - diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.h index 0d960f33..75709d92 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/image_utils.h @@ -29,110 +29,96 @@ #define _NVMEDIA_TEST_IMAGE_UTILS_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif +#include "cmdline.h" #include "misc_utils.h" #include "nvmedia_core.h" -#include "nvmedia_surface.h" #include "nvmedia_image.h" -#include "cmdline.h" +#include "nvmedia_surface.h" #if (NV_IS_SAFETY == 1) #include "nvmedia_image_internal.h" #endif -#define PACK_RGBA(R, G, B, A) (((uint32_t)(A) << 24) | ((uint32_t)(B) << 16) | \ - ((uint32_t)(G) << 8) | (uint32_t)(R)) -#define DEFAULT_ALPHA 0x80 +#define PACK_RGBA(R, G, B, A) (((uint32_t)(A) << 24) | ((uint32_t)(B) << 16) | ((uint32_t)(G) << 8) | (uint32_t)(R)) +#define DEFAULT_ALPHA 0x80 + // ReadImage + // + // ReadImage() Read image from file + // + // Arguments: + // + // filename + // (in) Input file name + // + // frameNum + // (in) Frame number to read. Use for stream input files. + // + // width + // (in) Surface width + // + // height + // (in) Surface height + // + // image + // (out) Pointer to pre-allocated output surface + // + // uvOrderFlag + // (in) Flag for UV order. If true - UV; If false - VU; + // + // bytesPerPixel + // (in) Bytes per pixel. Needed for RAW image types handling. + // RAW8 - 1 byte per pixel + // RAW10, RAW12, RAW14 - 2 bytes per pixel + // + // pixelAlignment + // (in) Alignment of bits in pixel. + // 0 - LSB Aligned + // 1 - MSB Aligned + NvMediaStatus ReadImage(char *fileName, + uint32_t frameNum, + uint32_t width, + uint32_t height, + NvMediaImage *image, + NvMediaBool uvOrderFlag, + uint32_t bytesPerPixel, + uint32_t pixelAlignment); -// ReadImage -// -// ReadImage() Read image from file -// -// Arguments: -// -// filename -// (in) Input file name -// -// frameNum -// (in) Frame number to read. Use for stream input files. -// -// width -// (in) Surface width -// -// height -// (in) Surface height -// -// image -// (out) Pointer to pre-allocated output surface -// -// uvOrderFlag -// (in) Flag for UV order. If true - UV; If false - VU; -// -// bytesPerPixel -// (in) Bytes per pixel. Nedded for RAW image types handling. -// RAW8 - 1 byte per pixel -// RAW10, RAW12, RAW14 - 2 bytes per pixel -// -// pixelAlignment -// (in) Alignment of bits in pixel. 
-// 0 - LSB Aligned -// 1 - MSB Aligned + // InitImage + // + // InitImage() Init image data to zeros + // + // Arguments: + // + // image + // (in) image to initialize + // + // width + // (in) Surface width + // + // height + // (in) Surface height -NvMediaStatus -ReadImage( - char *fileName, - uint32_t frameNum, - uint32_t width, - uint32_t height, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - uint32_t bytesPerPixel, - uint32_t pixelAlignment); + NvMediaStatus InitImage(NvMediaImage *image, uint32_t width, uint32_t height); -// InitImage -// -// InitImage() Init image data to zeros -// -// Arguments: -// -// image -// (in) image to initialize -// -// width -// (in) Surface width -// -// height -// (in) Surface height + NvMediaStatus + AllocateBufferToWriteImage(Blit2DTest *ctx, NvMediaImage *image, NvMediaBool uvOrderFlag, NvMediaBool appendFlag); -NvMediaStatus -InitImage( - NvMediaImage *image, - uint32_t width, - uint32_t height); - -NvMediaStatus -AllocateBufferToWriteImage( - Blit2DTest *ctx, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - NvMediaBool appendFlag); - -// WriteImageToBuffer -// -// WriteImageToBuffer() Save RGB or YUV image -// -NvMediaStatus -WriteImageToAllocatedBuffer( - Blit2DTest *ctx, - NvMediaImage *image, - NvMediaBool uvOrderFlag, - NvMediaBool appendFlag, - uint32_t bytesPerPixel); + // WriteImageToBuffer + // + // WriteImageToBuffer() Save RGB or YUV image + // + NvMediaStatus WriteImageToAllocatedBuffer(Blit2DTest *ctx, + NvMediaImage *image, + NvMediaBool uvOrderFlag, + NvMediaBool appendFlag, + uint32_t bytesPerPixel); #ifdef __cplusplus } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.cpp index 0d3feaaa..9be97872 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.cpp @@ -29,7 +29,7 @@ #include #include #ifdef NVMEDIA_ANDROID -#define LOG_TAG "nvmedia_common" +#define LOG_TAG "nvmedia_common" #define LOG_NDEBUG 1 #include #endif @@ -48,99 +48,100 @@ static enum LogLevel msg_level = LEVEL_ERR; static enum LogStyle msg_style = LOG_STYLE_NORMAL; -static FILE *msg_file = NULL; +static FILE *msg_file = NULL; void SetLogLevel(enum LogLevel level) { - if (level > LEVEL_DBG) - return; + if (level > LEVEL_DBG) + return; - msg_level = level; + msg_level = level; } void SetLogStyle(enum LogStyle style) { - if (style > LOG_STYLE_FUNCTION_LINE) - return; + if (style > LOG_STYLE_FUNCTION_LINE) + return; - msg_style = style; + msg_style = style; } void SetLogFile(FILE *logFileHandle) { - if(!logFileHandle) + if (!logFileHandle) return; msg_file = logFileHandle; } -void LogLevelMessage(enum LogLevel level, const char *functionName, - int lineNumber, const char *format, ...) +void LogLevelMessage(enum LogLevel level, const char *functionName, int lineNumber, const char *format, ...) { va_list ap; - char str[LOG_BUFFER_BYTES] = {'\0',}; + char str[LOG_BUFFER_BYTES] = { + '\0', + }; FILE *logFile = msg_file ? msg_file : stdout; if (level > msg_level) return; #ifndef NVMEDIA_ANDROID -/** In the case of Android ADB log, if LOG_TAG is defined, - * before 'Log.h' is included in source file, - * LOG_TAG is automatically concatenated at the beginning of log message, - * so, we don't copy 'nvmedia: ' into 'str'. 
- */ + /** In the case of Android ADB log, if LOG_TAG is defined, + * before 'Log.h' is included in source file, + * LOG_TAG is automatically concatenated at the beginning of log message, + * so, we don't copy 'nvmedia: ' into 'str'. + */ strcpy(str, "nvmedia: "); -/** As LOG_TAG is concatednated, log level is also automatically concatenated, - * by calling different ADB log function such as ALOGE(for eror log message), - * ALOGW(for warning log message). - */ + /** As LOG_TAG is concatenated, log level is also automatically concatenated, + * by calling different ADB log functions such as ALOGE(for error log message), + * ALOGW(for warning log message). + */ switch (level) { - case LEVEL_ERR: - strcat(str, "ERROR: "); - break; - case LEVEL_WARN: - strcat(str, "WARNING: "); - break; - case LEVEL_INFO: - case LEVEL_DBG: - // Empty - break; + case LEVEL_ERR: + strcat(str, "ERROR: "); + break; + case LEVEL_WARN: + strcat(str, "WARNING: "); + break; + case LEVEL_INFO: + case LEVEL_DBG: + // Empty + break; } #endif va_start(ap, format); vsnprintf(str + strlen(str), sizeof(str) - strlen(str), format, ap); - if(msg_style == LOG_STYLE_NORMAL) { + if (msg_style == LOG_STYLE_NORMAL) { // Add trailing new line char - if(strlen(str) && str[strlen(str) - 1] != '\n') + if (strlen(str) && str[strlen(str) - 1] != '\n') strcat(str, "\n"); - - } else if(msg_style == LOG_STYLE_FUNCTION_LINE) { + } + else if (msg_style == LOG_STYLE_FUNCTION_LINE) { // Remove trailing new line char - if(strlen(str) && str[strlen(str) - 1] == '\n') + if (strlen(str) && str[strlen(str) - 1] == '\n') str[strlen(str) - 1] = 0; // Add function and line info - snprintf(str + + strlen(str), sizeof(str) - strlen(str), " at %s():%d\n", functionName, lineNumber); + snprintf(str + strlen(str), sizeof(str) - strlen(str), " at %s():%d\n", functionName, lineNumber); } #ifdef NVMEDIA_ANDROID switch (msg_level) { - case LEVEL_ERR: - ALOGE("%s", str); - break; - case LEVEL_WARN: - ALOGW("%s", str); - break; - case LEVEL_INFO: - ALOGI("%s", str); - break; - case LEVEL_DBG: - ALOGD("%s", str); - break; + case LEVEL_ERR: + ALOGE("%s", str); + break; + case LEVEL_WARN: + ALOGW("%s", str); + break; + case LEVEL_INFO: + ALOGI("%s", str); + break; + case LEVEL_DBG: + ALOGD("%s", str); + break; } #else fprintf(logFile, "%s", str); @@ -151,4 +152,3 @@ void LogLevelMessage(enum LogLevel level, const char *functionName, #endif va_end(ap); } - diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.h index 98cc5462..4375be91 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/log_utils.h @@ -29,90 +29,76 @@ #define _NVMEDIA_TEST_LOG_UTILS_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif #include #include -enum LogLevel { - LEVEL_ERR = 0, - LEVEL_WARN = 1, - LEVEL_INFO = 2, - LEVEL_DBG = 3, -}; + enum LogLevel { + LEVEL_ERR = 0, + LEVEL_WARN = 1, + LEVEL_INFO = 2, + LEVEL_DBG = 3, + }; -enum LogStyle { - LOG_STYLE_NORMAL = 0, - LOG_STYLE_FUNCTION_LINE -}; + enum LogStyle { LOG_STYLE_NORMAL = 0, LOG_STYLE_FUNCTION_LINE } ; -#define LINE_INFO __FUNCTION__, __LINE__ -#define LOG_DBG(...) LogLevelMessage(LEVEL_DBG, LINE_INFO, __VA_ARGS__) -#define LOG_INFO(...) LogLevelMessage(LEVEL_INFO, LINE_INFO, __VA_ARGS__) -#define LOG_WARN(...) 
LogLevelMessage(LEVEL_WARN, LINE_INFO, __VA_ARGS__) +#define LINE_INFO __FUNCTION__, __LINE__ +#define LOG_DBG(...) LogLevelMessage(LEVEL_DBG, LINE_INFO, __VA_ARGS__) +#define LOG_INFO(...) LogLevelMessage(LEVEL_INFO, LINE_INFO, __VA_ARGS__) +#define LOG_WARN(...) LogLevelMessage(LEVEL_WARN, LINE_INFO, __VA_ARGS__) -// SetLogLevel -// -// SetLogLevel() Set logging level -// -// Arguments: -// -// level -// (in) Logging level + // SetLogLevel + // + // SetLogLevel() Set logging level + // + // Arguments: + // + // level + // (in) Logging level -void -SetLogLevel( - enum LogLevel level); + void SetLogLevel(enum LogLevel level); -// SetLogStyle -// -// SetLogStyle() Set logging print slyle -// -// Arguments: -// -// level -// (in) Logging style + // SetLogStyle + // + // SetLogStyle() Set logging print style + // + // Arguments: + // + // style + // (in) Logging style -void -SetLogStyle( - enum LogStyle style); + void SetLogStyle(enum LogStyle style); -// SetLogFile -// -// SetLogFile() Set logging file handle -// -// Arguments: -// -// level -// (in) Logging file handle + // SetLogFile + // + // SetLogFile() Set logging file handle + // + // Arguments: + // + // logFileHandle + // (in) Logging file handle -void -SetLogFile( - FILE *logFileHandle); + void SetLogFile(FILE *logFileHandle); -// LogLevelMessage -// -// LogLevelMessage() Print message if logging level is higher than message level -// -// Arguments: -// -// LogLevel -// (in) Message level -// -// format -// (in) Message format -// -// ... -// (in) Parameters list + // LogLevelMessage + // + // LogLevelMessage() Print message if logging level is higher than message level + // + // Arguments: + // + // level + // (in) Message level + // + // format + // (in) Message format + // + // ... + // (in) Parameters list -void -LogLevelMessage( - enum LogLevel level, - const char *functionName, - int lineNumber, - const char *format, - ...); + void LogLevelMessage(enum LogLevel level, const char *functionName, int lineNumber, const char *format, ...); #ifdef __cplusplus } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.cpp index 47032e55..6b8f7c41 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.cpp @@ -28,35 +28,29 @@ #include #include #include -#include #include -#if defined (__QNX__) +#include +#if defined(__QNX__) #include #endif #include "misc_utils.h" -uint32_t -u32(const uint8_t* ptr) -{ - return ptr[0] | (ptr[1]<<8) | (ptr[2]<<16) | (ptr[3]<<24); -} +uint32_t u32(const uint8_t *ptr) { return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16) | (ptr[3] << 24); } -NvMediaStatus -GetTimeMicroSec( - uint64_t *uTime) +NvMediaStatus GetTimeMicroSec(uint64_t *uTime) { struct timespec t; #if !(defined(CLOCK_MONOTONIC) && defined(_POSIX_MONOTONIC_CLOCK) && _POSIX_MONOTONIC_CLOCK >= 0 && _POSIX_TIMERS > 0) struct timeval tv; #endif - if(!uTime) + if (!uTime) return NVMEDIA_STATUS_BAD_PARAMETER; #if !(defined(CLOCK_MONOTONIC) && defined(_POSIX_MONOTONIC_CLOCK) && _POSIX_MONOTONIC_CLOCK >= 0 && _POSIX_TIMERS > 0) gettimeofday(&tv, NULL); - t.tv_sec = tv.tv_sec; - t.tv_nsec = tv.tv_usec*1000L; + t.tv_sec = tv.tv_sec; + t.tv_nsec = tv.tv_usec * 1000L; #else clock_gettime(CLOCK_MONOTONIC, &t); #endif @@ -64,4 +58,3 @@ GetTimeMicroSec( *uTime = (uint64_t)t.tv_sec * 1000000LL + (uint64_t)t.tv_nsec / 1000LL; return 
NVMEDIA_STATUS_OK; } - diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.h index 1945bed2..74e83cd5 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvmedia_utils/misc_utils.h @@ -29,46 +29,42 @@ #define _NVMEDIA_TEST_MISC_UTILS_H_ #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -#include "nvmedia_core.h" #include "nvmedia_common.h" +#include "nvmedia_core.h" #ifndef __INTEGRITY -#define MIN(a,b) (((a) < (b)) ? (a) : (b)) -#define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) #endif -typedef enum { - LSB_ALIGNED, - MSB_ALIGNED -} PixelAlignment; + typedef enum { LSB_ALIGNED, MSB_ALIGNED } PixelAlignment; -// u32 -// -// u32() Reads 4 bytes from buffer and returns the read value -// -// Arguments: -// -// ptr -// (in) Input buffer + // u32 + // + // u32() Reads 4 bytes from buffer and returns the read value + // + // Arguments: + // + // ptr + // (in) Input buffer -uint32_t u32(const uint8_t* ptr); + uint32_t u32(const uint8_t *ptr); -// GetTimeMicroSec -// -// GetTimeMicroSec() Returns current time in microseconds -// -// Arguments: -// -// uTime -// (out) Pointer to current time in microseconds + // GetTimeMicroSec + // + // GetTimeMicroSec() Returns current time in microseconds + // + // Arguments: + // + // uTime + // (out) Pointer to current time in microseconds -NvMediaStatus -GetTimeMicroSec( - uint64_t *uTime); + NvMediaStatus GetTimeMicroSec(uint64_t *uTime); #ifdef __cplusplus } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.cpp b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.cpp index ccecf75e..8e303b2d 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.cpp +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.cpp @@ -25,133 +25,130 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "nvmedia_utils/cmdline.h" +#include "nvsci_setup.h" + #include #include + #include "helper_cuda.h" -#include "nvsci_setup.h" #include "nvmedia_2d_nvscisync.h" +#include "nvmedia_utils/cmdline.h" -#define checkNvSciErrors(call) \ - do { \ - NvSciError _status = call; \ - if (NvSciError_Success != _status) { \ - printf( \ - "NVSCI call in file '%s' in line %i returned" \ - " %d, expected %d\n", \ - __FILE__, __LINE__, _status, NvSciError_Success); \ - fflush(stdout); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define checkNvSciErrors(call) \ + do { \ + NvSciError _status = call; \ + if (NvSciError_Success != _status) { \ + printf("NVSCI call in file '%s' in line %i returned" \ + " %d, expected %d\n", \ + __FILE__, \ + __LINE__, \ + _status, \ + NvSciError_Success); \ + fflush(stdout); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) -void setupNvMediaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, - int cudaDeviceId) { - NvSciSyncModule sciSyncModule; - checkNvSciErrors(NvSciSyncModuleOpen(&sciSyncModule)); - NvSciSyncAttrList signalerAttrList, waiterAttrList; - NvSciSyncAttrList syncUnreconciledList[2]; - NvSciSyncAttrList syncReconciledList, syncConflictList; +void setupNvMediaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, int cudaDeviceId) +{ + NvSciSyncModule sciSyncModule; + checkNvSciErrors(NvSciSyncModuleOpen(&sciSyncModule)); + NvSciSyncAttrList signalerAttrList, waiterAttrList; + NvSciSyncAttrList syncUnreconciledList[2]; + NvSciSyncAttrList syncReconciledList, syncConflictList; - checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &signalerAttrList)); - checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &waiterAttrList)); + checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &signalerAttrList)); + checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &waiterAttrList)); - NvMediaStatus status = NvMedia2DFillNvSciSyncAttrList( - ctx->i2d, signalerAttrList, NVMEDIA_SIGNALER); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DFillNvSciSyncAttrList failed\n", __func__); - exit(EXIT_FAILURE); - } + NvMediaStatus status = NvMedia2DFillNvSciSyncAttrList(ctx->i2d, signalerAttrList, NVMEDIA_SIGNALER); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DFillNvSciSyncAttrList failed\n", __func__); + exit(EXIT_FAILURE); + } - checkCudaErrors(cudaSetDevice(cudaDeviceId)); - checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(waiterAttrList, cudaDeviceId, - cudaNvSciSyncAttrWait)); + checkCudaErrors(cudaSetDevice(cudaDeviceId)); + checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(waiterAttrList, cudaDeviceId, cudaNvSciSyncAttrWait)); - syncUnreconciledList[0] = signalerAttrList; - syncUnreconciledList[1] = waiterAttrList; - checkNvSciErrors(NvSciSyncAttrListReconcile( - syncUnreconciledList, 2, &syncReconciledList, &syncConflictList)); - checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj)); + syncUnreconciledList[0] = signalerAttrList; + syncUnreconciledList[1] = waiterAttrList; + checkNvSciErrors(NvSciSyncAttrListReconcile(syncUnreconciledList, 2, &syncReconciledList, &syncConflictList)); + checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj)); - NvSciSyncAttrListFree(signalerAttrList); - NvSciSyncAttrListFree(waiterAttrList); - if (syncConflictList != nullptr) { - NvSciSyncAttrListFree(syncConflictList); - } + NvSciSyncAttrListFree(signalerAttrList); + NvSciSyncAttrListFree(waiterAttrList); + if (syncConflictList != nullptr) { + NvSciSyncAttrListFree(syncConflictList); + } } -void 
setupCudaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, - int cudaDeviceId) { - NvSciSyncModule sciSyncModule; - checkNvSciErrors(NvSciSyncModuleOpen(&sciSyncModule)); - NvSciSyncAttrList signalerAttrList, waiterAttrList; - NvSciSyncAttrList syncUnreconciledList[2]; - NvSciSyncAttrList syncReconciledList, syncConflictList; +void setupCudaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, int cudaDeviceId) +{ + NvSciSyncModule sciSyncModule; + checkNvSciErrors(NvSciSyncModuleOpen(&sciSyncModule)); + NvSciSyncAttrList signalerAttrList, waiterAttrList; + NvSciSyncAttrList syncUnreconciledList[2]; + NvSciSyncAttrList syncReconciledList, syncConflictList; - checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &signalerAttrList)); - checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &waiterAttrList)); + checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &signalerAttrList)); + checkNvSciErrors(NvSciSyncAttrListCreate(sciSyncModule, &waiterAttrList)); - NvMediaStatus status = - NvMedia2DFillNvSciSyncAttrList(ctx->i2d, waiterAttrList, NVMEDIA_WAITER); - if (status != NVMEDIA_STATUS_OK) { - printf("%s: NvMedia2DFillNvSciSyncAttrList failed\n", __func__); - exit(EXIT_FAILURE); - } + NvMediaStatus status = NvMedia2DFillNvSciSyncAttrList(ctx->i2d, waiterAttrList, NVMEDIA_WAITER); + if (status != NVMEDIA_STATUS_OK) { + printf("%s: NvMedia2DFillNvSciSyncAttrList failed\n", __func__); + exit(EXIT_FAILURE); + } - checkCudaErrors(cudaSetDevice(cudaDeviceId)); - checkCudaErrors(cudaDeviceGetNvSciSyncAttributes( - signalerAttrList, cudaDeviceId, cudaNvSciSyncAttrSignal)); + checkCudaErrors(cudaSetDevice(cudaDeviceId)); + checkCudaErrors(cudaDeviceGetNvSciSyncAttributes(signalerAttrList, cudaDeviceId, cudaNvSciSyncAttrSignal)); - syncUnreconciledList[0] = signalerAttrList; - syncUnreconciledList[1] = waiterAttrList; - checkNvSciErrors(NvSciSyncAttrListReconcile( - syncUnreconciledList, 2, &syncReconciledList, &syncConflictList)); - checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj)); + syncUnreconciledList[0] = signalerAttrList; + syncUnreconciledList[1] = waiterAttrList; + checkNvSciErrors(NvSciSyncAttrListReconcile(syncUnreconciledList, 2, &syncReconciledList, &syncConflictList)); + checkNvSciErrors(NvSciSyncObjAlloc(syncReconciledList, &syncObj)); - NvSciSyncAttrListFree(signalerAttrList); - NvSciSyncAttrListFree(waiterAttrList); - if (syncConflictList != nullptr) { - NvSciSyncAttrListFree(syncConflictList); - } + NvSciSyncAttrListFree(signalerAttrList); + NvSciSyncAttrListFree(waiterAttrList); + if (syncConflictList != nullptr) { + NvSciSyncAttrListFree(syncConflictList); + } } -void setupNvSciBuf(NvSciBufObj &bufobj, NvSciBufAttrList &nvmediaAttrlist, - int cudaDeviceId) { - CUuuid devUUID; - NvSciBufAttrList conflictlist; - NvSciBufAttrList bufUnreconciledAttrlist[1]; +void setupNvSciBuf(NvSciBufObj &bufobj, NvSciBufAttrList &nvmediaAttrlist, int cudaDeviceId) +{ + CUuuid devUUID; + NvSciBufAttrList conflictlist; + NvSciBufAttrList bufUnreconciledAttrlist[1]; - CUresult res = cuDeviceGetUuid(&devUUID, cudaDeviceId); - if (res != CUDA_SUCCESS) { - fprintf(stderr, "Driver API error = %04d \n", res); - exit(EXIT_FAILURE); - } + CUresult res = cuDeviceGetUuid(&devUUID, cudaDeviceId); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "Driver API error = %04d \n", res); + exit(EXIT_FAILURE); + } - NvSciBufAttrKeyValuePair attr_gpuid[] = {NvSciBufGeneralAttrKey_GpuId, - &devUUID, sizeof(devUUID)}; + NvSciBufAttrKeyValuePair attr_gpuid[] = {NvSciBufGeneralAttrKey_GpuId, 
&devUUID, sizeof(devUUID)}; - // set CUDA GPU ID to attribute list - checkNvSciErrors(NvSciBufAttrListSetAttrs( - nvmediaAttrlist, attr_gpuid, - sizeof(attr_gpuid) / sizeof(NvSciBufAttrKeyValuePair))); + // set CUDA GPU ID to attribute list + checkNvSciErrors( + NvSciBufAttrListSetAttrs(nvmediaAttrlist, attr_gpuid, sizeof(attr_gpuid) / sizeof(NvSciBufAttrKeyValuePair))); - bufUnreconciledAttrlist[0] = nvmediaAttrlist; + bufUnreconciledAttrlist[0] = nvmediaAttrlist; - checkNvSciErrors(NvSciBufAttrListReconcileAndObjAlloc( - bufUnreconciledAttrlist, 1, &bufobj, &conflictlist)); - if (conflictlist != NULL) { - NvSciBufAttrListFree(conflictlist); - } + checkNvSciErrors(NvSciBufAttrListReconcileAndObjAlloc(bufUnreconciledAttrlist, 1, &bufobj, &conflictlist)); + if (conflictlist != NULL) { + NvSciBufAttrListFree(conflictlist); + } } -void cleanupNvSciBuf(NvSciBufObj &Bufobj) { - if (Bufobj != NULL) { - NvSciBufObjFree(Bufobj); - } +void cleanupNvSciBuf(NvSciBufObj &Bufobj) +{ + if (Bufobj != NULL) { + NvSciBufObjFree(Bufobj); + } } -void cleanupNvSciSync(NvSciSyncObj &syncObj) { - if (NvSciSyncObjFree != NULL) { - NvSciSyncObjFree(syncObj); - } +void cleanupNvSciSync(NvSciSyncObj &syncObj) +{ + if (NvSciSyncObjFree != NULL) { + NvSciSyncObjFree(syncObj); + } } diff --git a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.h b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.h index f2e1efaf..c62138e0 100644 --- a/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.h +++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciNvMedia/nvsci_setup.h @@ -27,16 +27,14 @@ #ifndef __NVSCI_SETUP_H__ #define __NVSCI_SETUP_H__ -#include "nvmedia_utils/cmdline.h" #include #include -void setupNvMediaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, - int cudaDeviceId); -void setupCudaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, - int cudaDeviceId); -void setupNvSciBuf(NvSciBufObj &bufobj, NvSciBufAttrList &nvmediaAttrlist, - int cudaDeviceId); +#include "nvmedia_utils/cmdline.h" + +void setupNvMediaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, int cudaDeviceId); +void setupCudaSignalerNvSciSync(Blit2DTest *ctx, NvSciSyncObj &syncObj, int cudaDeviceId); +void setupNvSciBuf(NvSciBufObj &bufobj, NvSciBufAttrList &nvmediaAttrlist, int cudaDeviceId); void cleanupNvSciBuf(NvSciBufObj &Bufobj); void cleanupNvSciSync(NvSciSyncObj &syncObj); -#endif \ No newline at end of file +#endif diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/README.md b/Samples/8_Platform_Specific/Tegra/fluidsGLES/README.md index 84a5ce8b..75ee3111 100644 --- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/README.md +++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
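A quick recap of the cudaNvSciNvMedia `nvsci_setup.cpp` hunks above: both setup helpers follow the same NvSciSync flow of creating two attribute lists, filling one side from NvMedia and the other from CUDA, reconciling, and then allocating the sync object. The fragment below is an illustrative condensation using only calls that appear in that file, with the engine-specific fill steps shown as comments; it is a sketch, not code from this sample:

    NvSciSyncModule   module;
    NvSciSyncAttrList signalerList, waiterList, reconciledList, conflictList;
    NvSciSyncObj      syncObj;

    checkNvSciErrors(NvSciSyncModuleOpen(&module));
    checkNvSciErrors(NvSciSyncAttrListCreate(module, &signalerList));
    checkNvSciErrors(NvSciSyncAttrListCreate(module, &waiterList));

    // One list is filled by the signaling engine, the other by the waiter, e.g.:
    //   NvMedia2DFillNvSciSyncAttrList(i2d, signalerList, NVMEDIA_SIGNALER);
    //   cudaDeviceGetNvSciSyncAttributes(waiterList, cudaDeviceId, cudaNvSciSyncAttrWait);

    NvSciSyncAttrList unreconciled[2] = {signalerList, waiterList};
    checkNvSciErrors(NvSciSyncAttrListReconcile(unreconciled, 2, &reconciledList, &conflictList));
    checkNvSciErrors(NvSciSyncObjAlloc(reconciledList, &syncObj));

    NvSciSyncAttrListFree(signalerList);
    NvSciSyncAttrListFree(waiterList);
    if (conflictList != nullptr) {
        NvSciSyncAttrListFree(conflictList);
    }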
 ## References (for more details)
-
diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/defines.h b/Samples/8_Platform_Specific/Tegra/fluidsGLES/defines.h
index af58c64b..e1f5e8a8 100644
--- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/defines.h
+++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/defines.h
@@ -28,21 +28,20 @@
 #ifndef DEFINES_H
 #define DEFINES_H

-#define DIM 512         // Square size of solver domain
-#define DS (DIM * DIM)  // Total domain size
-#define CPADW (DIM / 2 + 1)  // Padded width for real->complex in-place FFT
-#define RPADW \
-  (2 * (DIM / 2 + 1))  // Padded width for real->complex in-place FFT
-#define PDS (DIM * CPADW)  // Padded total domain size
+#define DIM   512                 // Square size of solver domain
+#define DS    (DIM * DIM)         // Total domain size
+#define CPADW (DIM / 2 + 1)       // Padded width for real->complex in-place FFT
+#define RPADW (2 * (DIM / 2 + 1)) // Padded width for real->complex in-place FFT
+#define PDS   (DIM * CPADW)       // Padded total domain size

-#define DT 0.09f            // Delta T for interative solver
-#define VIS 0.0025f         // Viscosity constant
-#define FORCE (5.8f * DIM)  // Force scale factor
-#define FR 4                // Force update radius
+#define DT    0.09f        // Delta T for iterative solver
+#define VIS   0.0025f      // Viscosity constant
+#define FORCE (5.8f * DIM) // Force scale factor
+#define FR    4            // Force update radius

-#define TILEX 64  // Tile width
-#define TILEY 64  // Tile height
-#define TIDSX 64  // Tids in X
-#define TIDSY 4   // Tids in Y
+#define TILEX 64 // Tile width
+#define TILEY 64 // Tile height
+#define TIDSX 64 // Tids in X
+#define TIDSY 4  // Tids in Y

 #endif
diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp
index 9166e597..e3279ea2 100644
--- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp
+++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp
@@ -10,22 +10,20 @@
  */

 // Includes
-#include
-#include
-#include
-#include
-
-#include
-
 #include
 #include
+#include
+#include
+#include
+#include
+#include

-void error_exit(const char* format, ... )
+void error_exit(const char *format, ...)
 {
     va_list args;
-    va_start( args, format );
-    vfprintf( stderr, format, args );
-    va_end( args );
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
     exit(1);
 }

@@ -33,19 +31,19 @@ void error_exit(const char* format, ... )
 #include "graphics_interface.h"

 // CUDA standard includes
-#include
 #include
+#include

 // CUDA FFT Libraries
 #include

 // CUDA helper functions
-#include "helper_functions.h"
-#include
 #include
+#include

 #include "defines.h"
 #include "fluidsGLES_kernels.h"
+#include "helper_functions.h"

 typedef float matrix4[4][4];
 typedef float vector3[3];

@@ -65,20 +63,20 @@ void cleanup(void);
 void reshape(int x, int y);

 // CUFFT plan handle
-cufftHandle planr2c;
-cufftHandle planc2r;
+cufftHandle planr2c;
+cufftHandle planc2r;
 static cData *vxfield = NULL;
 static cData *vyfield = NULL;
-cData *hvfield = NULL;
-cData *dvfield = NULL;
+cData *hvfield = NULL;
+cData *dvfield = NULL;
 static int wWidth = MAX(512, DIM);
 static int wHeight = MAX(512, DIM);

-static int clicked = 0;
-static int fpsCount = 0;
-static int fpsLimit = 1;
-StopWatchInterface *timer = NULL;
+static int clicked  = 0;
+static int fpsCount = 0;
+static int fpsLimit = 1;
+StopWatchInterface *timer = NULL;
 int gui_mode; // For X window

 // Rotate & translate variable temp., will remove and use shaders.
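As background for the padded widths in defines.h above: a real-to-complex FFT of a row of DIM real samples yields DIM/2 + 1 complex bins (Hermitian symmetry), so an in-place transform needs 2 * (DIM/2 + 1) floats of row storage. An illustrative compile-time restatement (the constant names are ours, not the sample's):

    constexpr int kDim   = 512;             // DIM
    constexpr int kCpadw = kDim / 2 + 1;    // CPADW: 257 complex bins per row
    constexpr int kRpadw = 2 * kCpadw;      // RPADW: 514 floats per padded row
    constexpr int kPds   = kDim * kCpadw;   // PDS: padded total domain size
    static_assert(kRpadw == kDim + 2, "in-place R2C needs two extra floats per row");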
@@ -87,19 +85,19 @@ float translate_z = -3.0; // Particle data -GLuint vbo = 0,vao = 0; // OpenGLES vertex buffer object -GLuint m_texture = 0; +GLuint vbo = 0, vao = 0; // OpenGLES vertex buffer object +GLuint m_texture = 0; struct cudaGraphicsResource *cuda_vbo_resource; // handles OpenGLES-CUDA exchange -static cData *particles = NULL; // particle positions in host memory -static int lastx = 0, lasty = 0; +static cData *particles = NULL; // particle positions in host memory +static int lastx = 0, lasty = 0; // Texture pitch size_t tPitch = 0; // Now this is compatible with gcc in 64-bit -char *ref_file = NULL; -bool g_bQAAddTestForce = true; -int g_iFrameToCompare = 100; -int g_TotalErrors = 0; +char *ref_file = NULL; +bool g_bQAAddTestForce = true; +int g_iFrameToCompare = 100; +int g_TotalErrors = 0; bool g_bExitESC = false; @@ -107,7 +105,7 @@ const unsigned int window_width = 512; const unsigned int window_height = 512; // CheckFBO/BackBuffer class objects -CheckRender *g_CheckRender = NULL; +CheckRender *g_CheckRender = NULL; void autoTest(char **); void displayFrame(); @@ -132,8 +130,8 @@ GLuint mesh_shader = 0; void mat_identity(matrix4 m) { - m[0][1] = m[0][2] = m[0][3] = m[1][0] = m[1][2] = m[1][3] = m[2][0] = - m[2][1] = m[2][3] = m[3][0] = m[3][1] = m[3][2] = 0.0f; + m[0][1] = m[0][2] = m[0][3] = m[1][0] = m[1][2] = m[1][3] = m[2][0] = m[2][1] = m[2][3] = m[3][0] = m[3][1] = + m[3][2] = 0.0f; m[0][0] = m[1][1] = m[2][2] = m[3][3] = 1.0f; } @@ -141,18 +139,14 @@ void mat_identity(matrix4 m) void mat_multiply(matrix4 m0, matrix4 m1) { float m[4]; - for(int r = 0; r < 4; r++) - { + for (int r = 0; r < 4; r++) { m[0] = m[1] = m[2] = m[3] = 0.0f; - for(int c = 0; c < 4; c++) - { - for(int i = 0; i < 4; i++) - { + for (int c = 0; c < 4; c++) { + for (int i = 0; i < 4; i++) { m[c] += m0[i][r] * m1[c][i]; } } - for(int c = 0; c < 4; c++) - { + for (int c = 0; c < 4; c++) { m0[c][r] = m[c]; } } @@ -163,52 +157,52 @@ void mat4f_Ortho(float left, float right, float bottom, float top, float near, f float r_l = right - left; float t_b = top - bottom; float f_n = far - near; - float tx = - (right + left) / (right - left); - float ty = - (top + bottom) / (top - bottom); - float tz = - (far + near) / (far - near); + float tx = -(right + left) / (right - left); + float ty = -(top + bottom) / (top - bottom); + float tz = -(far + near) / (far - near); matrix4 m2; - m2[0][0] = 2.0f/ r_l; + m2[0][0] = 2.0f / r_l; m2[0][1] = 0.0f; m2[0][2] = 0.0f; m2[0][3] = 0.0f; - m2[1][0] = 0.0f; + m2[1][0] = 0.0f; m2[1][1] = 2.0f / t_b; m2[1][2] = 0.0f; m2[1][3] = 0.0f; m2[2][0] = 0.0f; - m2[2][1] = 0.0f; + m2[2][1] = 0.0f; m2[2][2] = -2.0f / f_n; m2[2][3] = 0.0f; m2[3][0] = tx; - m2[3][1] = ty; + m2[3][1] = ty; m2[3][2] = tz; m2[3][3] = 1.0f; - mat_multiply(m, m2); + mat_multiply(m, m2); } void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, const char *filename, GLenum shaderType) { - FILE *file = fopen(filename,"rb"); // open shader text file - if (!file) + FILE *file = fopen(filename, "rb"); // open shader text file + if (!file) error_exit("Filename %s does not exist\n", filename); /* get the size of the file and read it */ - fseek(file,0,SEEK_END); + fseek(file, 0, SEEK_END); GLint size = ftell(file); - char *data = (char*)malloc(sizeof(char)*(size + 1)); - memset(data, 0, sizeof(char)*(size + 1)); - fseek(file,0,SEEK_SET); - size_t res = fread(data,1,size,file); + char *data = (char *)malloc(sizeof(char) * (size + 1)); + memset(data, 0, sizeof(char) * (size + 1)); + fseek(file, 0, SEEK_SET); + 
size_t res = fread(data, 1, size, file); fclose(file); GLuint shader = glCreateShader(shaderType); - glShaderSource(shader, 1, (const GLchar**)&data, &size); + glShaderSource(shader, 1, (const GLchar **)&data, &size); glCompileShader(shader); GET_GLERROR(0); @@ -216,19 +210,18 @@ void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, const char *file glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success); GET_GLERROR(0); - if (compile_success == GL_FALSE) - { + if (compile_success == GL_FALSE) { printf("Compilation of %s failed!\n Reason:\n", filename); GLint maxLength = 0; glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength); - + char errorLog[maxLength]; glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]); - + printf("%s", errorLog); - glDeleteShader(shader); + glDeleteShader(shader); exit(1); } @@ -259,8 +252,7 @@ GLuint ShaderCreate(const char *vshader_filename, const char *fshader_filename) GLint link_success; glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success); - if (link_success == GL_FALSE) - { + if (link_success == GL_FALSE) { printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename, fshader_filename); GLint maxLength = 0; @@ -282,17 +274,16 @@ void motion(int x, int y) // Convert motion coordinates to domain float fx = (lastx / (float)wWidth); float fy = (lasty / (float)wHeight); - int nx = (int)(fx * DIM); - int ny = (int)(fy * DIM); + int nx = (int)(fx * DIM); + int ny = (int)(fy * DIM); - if (clicked && nx < DIM-FR && nx > FR-1 && ny < DIM-FR && ny > FR-1) - { + if (clicked && nx < DIM - FR && nx > FR - 1 && ny < DIM - FR && ny > FR - 1) { int ddx = x - lastx; int ddy = y - lasty; - fx = ddx / (float)wWidth; - fy = ddy / (float)wHeight; - int spy = ny-FR; - int spx = nx-FR; + fx = ddx / (float)wWidth; + fy = ddy / (float)wHeight; + int spy = ny - FR; + int spx = nx - FR; addForces(dvfield, DIM, DIM, spx, spy, FORCE * DT * fx, FORCE * DT * fy, FR); lastx = x; lasty = y; @@ -302,12 +293,12 @@ void motion(int x, int y) //=========================================================================== // InitGraphicsState() - initialize OpenGLES //=========================================================================== -static void InitGraphicsState(int argc, char** argv) +static void InitGraphicsState(int argc, char **argv) { char *GL_version = (char *)glGetString(GL_VERSION); char *GL_vendor = (char *)glGetString(GL_VENDOR); char *GL_renderer = (char *)glGetString(GL_RENDERER); - + printf("Version: %s\n", GL_version); printf("Vendor: %s\n", GL_vendor); printf("Renderer: %s\n", GL_renderer); @@ -316,26 +307,24 @@ static void InitGraphicsState(int argc, char** argv) GLint bsize; // initialize buffer object - glGenBuffers(1, &vbo); + glGenBuffers(1, &vbo); glBindBuffer(GL_ARRAY_BUFFER, vbo); glBufferData(GL_ARRAY_BUFFER, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW); glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, &bsize); - if (bsize != (sizeof(cData) * DS)) - { + if (bsize != (sizeof(cData) * DS)) { printf("Failed to initialize GL extensions.\n"); exit(EXIT_FAILURE); } checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); - + // GLSL stuff - char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", argv[0]); + char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", argv[0]); char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", argv[0]); - if (vertex_shader_path == NULL || fragment_shader_path == NULL) - { + if (vertex_shader_path == NULL || fragment_shader_path 
== NULL) { printf("Error finding shader file\n"); exit(EXIT_FAILURE); } @@ -345,19 +334,18 @@ static void InitGraphicsState(int argc, char** argv) free(vertex_shader_path); free(fragment_shader_path); - + glUseProgram(mesh_shader); } void displayFrame(void) { - if (!ref_file) - { + if (!ref_file) { sdkStartTimer(&timer); simulateFluids(); } - GLint view_arr[4]; + GLint view_arr[4]; glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); @@ -366,20 +354,20 @@ void displayFrame(void) glDepthMask(GL_FALSE); glUseProgram(mesh_shader); - + // Set modelview and projection matricies - GLint h_ModelViewMatrix = glGetUniformLocation(mesh_shader, "modelview"); - GLint h_ProjectionMatrix = glGetUniformLocation(mesh_shader, "projection"); + GLint h_ModelViewMatrix = glGetUniformLocation(mesh_shader, "modelview"); + GLint h_ProjectionMatrix = glGetUniformLocation(mesh_shader, "projection"); matrix4 modelview; matrix4 projection; mat_identity(modelview); mat_identity(projection); -// (float left, float right, float bottom, float top, float near, float far, matrix4 m) + // (float left, float right, float bottom, float top, float near, float far, matrix4 m) mat4f_Ortho(0.0, 1.0, 1.0, 0.0, 0.0, 1.0, projection); - - glUniformMatrix4fv(h_ModelViewMatrix, 1, GL_FALSE, (GLfloat*)modelview); - glUniformMatrix4fv(h_ProjectionMatrix, 1, GL_FALSE, (GLfloat*)projection); + + glUniformMatrix4fv(h_ModelViewMatrix, 1, GL_FALSE, (GLfloat *)modelview); + glUniformMatrix4fv(h_ProjectionMatrix, 1, GL_FALSE, (GLfloat *)projection); // Set position coords GLint h_position = glGetAttribLocation(mesh_shader, "a_position"); @@ -388,7 +376,7 @@ void displayFrame(void) glBindBuffer(GL_ARRAY_BUFFER, vbo); - glDrawArrays(GL_POINTS, 0, DS*sizeof(cData)); + glDrawArrays(GL_POINTS, 0, DS * sizeof(cData)); glDisableVertexAttribArray(h_position); glDisable(GL_DEPTH_TEST); @@ -396,8 +384,7 @@ void displayFrame(void) glDisable(GL_BLEND); glDepthMask(GL_TRUE); - if (ref_file) - { + if (ref_file) { return; } @@ -408,9 +395,8 @@ void displayFrame(void) fpsCount++; - if (fpsCount == fpsLimit) - { - char fps[256]; + if (fpsCount == fpsLimit) { + char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "Cuda/GL Stable Fluids (%d x %d): %3.1f fps", DIM, DIM, ifps); graphics_set_windowtitle(fps); @@ -423,7 +409,7 @@ void displayFrame(void) void autoTest(char **argv) { CFrameBufferObject *fbo = new CFrameBufferObject(wWidth, wHeight, 4, false, GL_TEXTURE_2D); - g_CheckRender = new CheckFBO(wWidth, wHeight, 4, fbo); + g_CheckRender = new CheckFBO(wWidth, wHeight, 4, fbo); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); @@ -431,26 +417,24 @@ void autoTest(char **argv) fbo->bindRenderPath(); - for (int count=0; countreadback(wWidth, wHeight); g_CheckRender->savePPM("fluidsGLES.ppm", true, NULL); - if (!g_CheckRender->PPMvsPPM("fluidsGLES.ppm", ref_file, MAX_EPSILON_ERROR, 0.25f)) - { + if (!g_CheckRender->PPMvsPPM("fluidsGLES.ppm", ref_file, MAX_EPSILON_ERROR, 0.25f)) { g_TotalErrors++; } } @@ -482,18 +465,16 @@ bool runFluidsSimulation(int argc, char **argv, char *ref_file) sdkCreateTimer(&timer); - if (ref_file != NULL) - { - // command line mode only - auto test - graphics_setup_window(0,0, wWidth, wHeight, sSDKname); + if (ref_file != NULL) { + // command line mode only - auto test + graphics_setup_window(0, 0, wWidth, wHeight, sSDKname); InitGraphicsState(argc, argv); // set up GLES stuff autoTest(argv); cleanup(); } - else - { + else { // create X11 window and set up associated OpenGL 
ES context - graphics_setup_window(0,0, wWidth, wHeight, sSDKname); + graphics_setup_window(0, 0, wWidth, wHeight, sSDKname); InitGraphicsState(argc, argv); // set up GLES stuff @@ -501,60 +482,50 @@ bool runFluidsSimulation(int argc, char **argv, char *ref_file) graphics_swap_buffers(); XEvent event; KeySym key; - char text[255]; + char text[255]; - while (1) - { - while (XPending(display) > 0) - { + while (1) { + while (XPending(display) > 0) { XNextEvent(display, &event); - if (event.type==Expose && event.xexpose.count==0) - { + if (event.type == Expose && event.xexpose.count == 0) { printf("Redraw requested!\n"); - } - - if (event.type==KeyPress && XLookupString(&event.xkey,text,255,&key,0)==1) - { - if (text[0] == 27 || text[0] == 'q' || text[0] == 'Q') - { - keyboard(text[0], 0, 0, argc, argv); - return true; - } - - if (text[0] == 114) - { - keyboard(text[0], 0, 0, argc, argv); - } - - printf("You pressed the %c key!\n",text[0]); } - if (event.type==ButtonPress) - { - lastx = event.xbutton.x; - lasty = event.xbutton.y; + if (event.type == KeyPress && XLookupString(&event.xkey, text, 255, &key, 0) == 1) { + if (text[0] == 27 || text[0] == 'q' || text[0] == 'Q') { + keyboard(text[0], 0, 0, argc, argv); + return true; + } + + if (text[0] == 114) { + keyboard(text[0], 0, 0, argc, argv); + } + + printf("You pressed the %c key!\n", text[0]); + } + + if (event.type == ButtonPress) { + lastx = event.xbutton.x; + lasty = event.xbutton.y; clicked = !clicked; } - if (event.type==ButtonRelease) - { - lastx = event.xbutton.x; - lasty = event.xbutton.y; + if (event.type == ButtonRelease) { + lastx = event.xbutton.x; + lasty = event.xbutton.y; clicked = !clicked; } - if (event.type == MotionNotify) - { - motion(event.xmotion.x, event.xmotion.y); + if (event.type == MotionNotify) { + motion(event.xmotion.x, event.xmotion.y); } - else - { + else { XFlush(display); } } displayFrame(); - usleep(1000); // need not take full CPU and GPU + usleep(1000); // need not take full CPU and GPU } } @@ -567,70 +538,65 @@ bool runFluidsSimulation(int argc, char **argv, char *ref_file) float myrand(void) { static int seed = 72191; - char sq[22]; + char sq[22]; - if (ref_file) - { + if (ref_file) { seed *= seed; sprintf(sq, "%010d", seed); // pull the middle 5 digits out of sq sq[8] = 0; - seed = atoi(&sq[3]); + seed = atoi(&sq[3]); - return seed/99999.f; + return seed / 99999.f; } - else - { - return rand()/(float)RAND_MAX; + else { + return rand() / (float)RAND_MAX; } } void initParticles(cData *p, int dx, int dy) { int i, j; - for (i = 0; i < dy; i++) - { - for (j = 0; j < dx; j++) - { - p[i*dx+j].x = (j+0.5f+(myrand() - 0.5f))/dx; - p[i*dx+j].y = (i+0.5f+(myrand() - 0.5f))/dy; + for (i = 0; i < dy; i++) { + for (j = 0; j < dx; j++) { + p[i * dx + j].x = (j + 0.5f + (myrand() - 0.5f)) / dx; + p[i * dx + j].y = (i + 0.5f + (myrand() - 0.5f)) / dy; } } } void keyboard(unsigned char key, int x, int y, int argc, char **argv) { - switch (key) - { - case 'q': - case 'Q': - case 27: - g_bExitESC = true; - cleanup(); - graphics_close_window(); // close window and destroy OpenGL ES context - return; - break; - case 'r': - printf("\nResetting\n"); - memset(hvfield, 0, sizeof(cData) * DS); - cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); + switch (key) { + case 'q': + case 'Q': + case 27: + g_bExitESC = true; + cleanup(); + graphics_close_window(); // close window and destroy OpenGL ES context + return; + break; + case 'r': + printf("\nResetting\n"); + memset(hvfield, 0, sizeof(cData) * DS); + 
cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); - initParticles(particles, DIM, DIM); + initParticles(particles, DIM, DIM); - checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); - getLastCudaError("cudaGraphicsUnregisterBuffer failed"); + getLastCudaError("cudaGraphicsUnregisterBuffer failed"); - glBindBuffer(GL_ARRAY_BUFFER, 0); - glDeleteBuffers(1, &vbo); - InitGraphicsState(argc, argv); // set up GLES stuff - graphics_swap_buffers(); + glBindBuffer(GL_ARRAY_BUFFER, 0); + glDeleteBuffers(1, &vbo); + InitGraphicsState(argc, argv); // set up GLES stuff + graphics_swap_buffers(); - getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); - break; + getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); + break; - default: - break; + default: + break; } } @@ -657,18 +623,19 @@ void cleanup(void) int main(int argc, char **argv) { - int devID; + int devID; cudaDeviceProp deviceProps; #if defined(__linux__) - setenv ("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif printf("%s Starting...\n\n", sSDKname); - printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); + printf("NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is " + "enabled.\n\n"); -#if defined (__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) // find iGPU on the system which is compute capable which will perform GLES-CUDA interop devID = findIntegratedGPU(); #else @@ -678,12 +645,10 @@ int main(int argc, char **argv) // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); - printf("CUDA device [%s] has %d Multi-Processors\n", - deviceProps.name, deviceProps.multiProcessorCount); + printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness - if (checkCmdLineFlag(argc, (const char **)argv, "file")) - { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } @@ -694,7 +659,7 @@ int main(int argc, char **argv) memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data - checkCudaErrors(cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM)); + checkCudaErrors(cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData) * DIM, DIM)); checkCudaErrors(cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice)); // Temporary complex velocity field data @@ -715,16 +680,14 @@ int main(int argc, char **argv) runFluidsSimulation(argc, argv, ref_file); - if (ref_file) - { + if (ref_file) { printf("[fluidsGLES] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } sdkDeleteTimer(&timer); - if (!ref_file) - { + if (!ref_file) { exit(EXIT_SUCCESS); } diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cu b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cu index 1970e3dd..ec1bebb8 100644 --- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cu +++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cu @@ -25,13 +25,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include +#include // CUDA FFT Libraries +#include // Helper functions for CUDA Error handling #include #include -#include -#include // CUDA FFT Libraries -#include // Helper functions for CUDA Error handling - // OpenGL Graphics includes #include @@ -40,51 +39,52 @@ // Texture object for reading velocity field cudaTextureObject_t texObj; -static cudaArray *array = NULL; +static cudaArray *array = NULL; // Particle data -extern GLuint vbo; // OpenGL vertex buffer object -extern struct cudaGraphicsResource - *cuda_vbo_resource; // handles OpenGL-CUDA exchange +extern GLuint vbo; // OpenGL vertex buffer object +extern struct cudaGraphicsResource *cuda_vbo_resource; // handles OpenGL-CUDA exchange // Texture pitch -extern size_t tPitch; +extern size_t tPitch; extern cufftHandle planr2c; extern cufftHandle planc2r; -cData *vxfield = NULL; -cData *vyfield = NULL; +cData *vxfield = NULL; +cData *vyfield = NULL; -void setupTexture(int x, int y) { - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); +void setupTexture(int x, int y) +{ + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaMallocArray(&array, &desc, y, x); - getLastCudaError("cudaMalloc failed"); + cudaMallocArray(&array, &desc, y, x); + getLastCudaError("cudaMalloc failed"); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = array; + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = array; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeWrap; - texDescr.readMode = cudaReadModeElementType; + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeWrap; + texDescr.readMode = cudaReadModeElementType; - checkCudaErrors(cudaCreateTextureObject(&texObj, &texRes, &texDescr, NULL)); + checkCudaErrors(cudaCreateTextureObject(&texObj, &texRes, &texDescr, NULL)); } -void updateTexture(cData *data, size_t wib, size_t h, size_t pitch) { - checkCudaErrors(cudaMemcpy2DToArray(array, 0, 0, data, pitch, wib, h, - cudaMemcpyDeviceToDevice)); +void updateTexture(cData *data, size_t wib, size_t h, size_t pitch) +{ + checkCudaErrors(cudaMemcpy2DToArray(array, 0, 0, data, pitch, wib, h, cudaMemcpyDeviceToDevice)); } -void deleteTexture(void) { - checkCudaErrors(cudaDestroyTextureObject(texObj)); - checkCudaErrors(cudaFreeArray(array)); +void deleteTexture(void) +{ + checkCudaErrors(cudaDestroyTextureObject(texObj)); + checkCudaErrors(cudaFreeArray(array)); } // Note that these kernels are designed to work with arbitrary @@ -98,54 +98,61 @@ void deleteTexture(void) { // This method adds constant force vectors to the velocity field // stored in 'v' according to v(x,t+1) = v(x,t) + dt * f. 
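A plain C++ restatement of the splat that addForces_k (below) performs over its (2r+1) x (2r+1) thread block may help here. It is a sketch only: Vec2 stands in for the sample's cData, pitchElems is the row pitch in elements rather than bytes, and fx/fy are assumed to already carry the FORCE * DT scaling applied by the caller.

    struct Vec2 { float x, y; }; // stand-in for cData

    static void addForcesHost(Vec2 *v, int pitchElems, int spx, int spy, float fx, float fy, int r)
    {
        for (int ty = 0; ty <= 2 * r; ty++) {
            for (int tx = 0; tx <= 2 * r; tx++) {
                float dx = float(tx - r), dy = float(ty - r);
                float s  = 1.f / (1.f + dx * dx * dx * dx + dy * dy * dy * dy); // quartic falloff from the click
                Vec2 *fj = v + (spy + ty) * pitchElems + (spx + tx);
                fj->x += s * fx;
                fj->y += s * fy;
            }
        }
    }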
-__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, - float fx, float fy, int r, size_t pitch) { - int tx = threadIdx.x; - int ty = threadIdx.y; - cData *fj = (cData *)((char *)v + (ty + spy) * pitch) + tx + spx; +__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch) +{ + int tx = threadIdx.x; + int ty = threadIdx.y; + cData *fj = (cData *)((char *)v + (ty + spy) * pitch) + tx + spx; - cData vterm = *fj; - tx -= r; - ty -= r; - float s = 1.f / (1.f + tx * tx * tx * tx + ty * ty * ty * ty); - vterm.x += s * fx; - vterm.y += s * fy; - *fj = vterm; + cData vterm = *fj; + tx -= r; + ty -= r; + float s = 1.f / (1.f + tx * tx * tx * tx + ty * ty * ty * ty); + vterm.x += s * fx; + vterm.y += s * fy; + *fj = vterm; } // This method performs the velocity advection step, where we // trace velocity vectors back in time to update each grid cell. // That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear // interpolation in the velocity space. -__global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, float dt, int lb, - cudaTextureObject_t texObject) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void advectVelocity_k(cData *v, + float *vx, + float *vy, + int dx, + int pdx, + int dy, + float dt, + int lb, + cudaTextureObject_t texObject) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - cData vterm, ploc; - float vxterm, vyterm; + cData vterm, ploc; + float vxterm, vyterm; - // gtidx is the domain location in x for this thread - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + // gtidx is the domain location in x for this thread + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fj = fi * pdx + gtidx; - vterm = tex2D(texObject, (float)gtidx, (float)fi); - ploc.x = (gtidx + 0.5f) - (dt * vterm.x * dx); - ploc.y = (fi + 0.5f) - (dt * vterm.y * dy); - vterm = tex2D(texObject, ploc.x, ploc.y); - vxterm = vterm.x; - vyterm = vterm.y; - vx[fj] = vxterm; - vy[fj] = vyterm; - } + if (fi < dy) { + int fj = fi * pdx + gtidx; + vterm = tex2D(texObject, (float)gtidx, (float)fi); + ploc.x = (gtidx + 0.5f) - (dt * vterm.x * dx); + ploc.y = (fi + 0.5f) - (dt * vterm.y * dy); + vterm = tex2D(texObject, ploc.x, ploc.y); + vxterm = vterm.x; + vyterm = vterm.y; + vx[fj] = vxterm; + vy[fj] = vyterm; + } + } } - } } // This method performs velocity diffusion and forces mass conservation @@ -156,201 +163,194 @@ __global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, // and k is the wavenumber. The projection step forces the Fourier // velocity vectors to be orthogonal to the vectors for each // wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t) * k) / k^2. 
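For one wavenumber k = (kx, ky), the update that diffuseProject_k (below) applies can be restated on the host as follows; this is an illustrative sketch with the complex components passed as separate real/imaginary floats, mirroring the kernel's cData fields.

    static void diffuseProjectOne(float &vxr, float &vxi, float &vyr, float &vyi,
                                  float kx, float ky, float dt, float visc)
    {
        float kk   = kx * kx + ky * ky;            // |k|^2
        float diff = 1.f / (1.f + visc * dt * kk); // diffusion decays each Fourier mode
        vxr *= diff; vxi *= diff;
        vyr *= diff; vyi *= diff;
        if (kk > 0.f) {                            // projection: remove the component along k
            float rkp = (kx * vxr + ky * vyr) / kk; // Re((k . v) / |k|^2)
            float ikp = (kx * vxi + ky * vyi) / kk; // Im((k . v) / |k|^2)
            vxr -= rkp * kx; vxi -= ikp * kx;
            vyr -= rkp * ky; vyi -= ikp * ky;
        }
    }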
-__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, - float visc, int lb) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, float visc, int lb) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - cData xterm, yterm; + cData xterm, yterm; - // gtidx is the domain location in x for this thread - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + // gtidx is the domain location in x for this thread + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fj = fi * dx + gtidx; - xterm = vx[fj]; - yterm = vy[fj]; + if (fi < dy) { + int fj = fi * dx + gtidx; + xterm = vx[fj]; + yterm = vy[fj]; - // Compute the index of the wavenumber based on the - // data order produced by a standard NN FFT. - int iix = gtidx; - int iiy = (fi > dy / 2) ? (fi - (dy)) : fi; + // Compute the index of the wavenumber based on the + // data order produced by a standard NN FFT. + int iix = gtidx; + int iiy = (fi > dy / 2) ? (fi - (dy)) : fi; - // Velocity diffusion - float kk = (float)(iix * iix + iiy * iiy); // k^2 - float diff = 1.f / (1.f + visc * dt * kk); - xterm.x *= diff; - xterm.y *= diff; - yterm.x *= diff; - yterm.y *= diff; + // Velocity diffusion + float kk = (float)(iix * iix + iiy * iiy); // k^2 + float diff = 1.f / (1.f + visc * dt * kk); + xterm.x *= diff; + xterm.y *= diff; + yterm.x *= diff; + yterm.y *= diff; - // Velocity projection - if (kk > 0.f) { - float rkk = 1.f / kk; - // Real portion of velocity projection - float rkp = (iix * xterm.x + iiy * yterm.x); - // Imaginary portion of velocity projection - float ikp = (iix * xterm.y + iiy * yterm.y); - xterm.x -= rkk * rkp * iix; - xterm.y -= rkk * ikp * iix; - yterm.x -= rkk * rkp * iiy; - yterm.y -= rkk * ikp * iiy; + // Velocity projection + if (kk > 0.f) { + float rkk = 1.f / kk; + // Real portion of velocity projection + float rkp = (iix * xterm.x + iiy * yterm.x); + // Imaginary portion of velocity projection + float ikp = (iix * xterm.y + iiy * yterm.y); + xterm.x -= rkk * rkp * iix; + xterm.y -= rkk * ikp * iix; + yterm.x -= rkk * rkp * iiy; + yterm.y -= rkk * ikp * iiy; + } + + vx[fj] = xterm; + vy[fj] = yterm; + } } - - vx[fj] = xterm; - vy[fj] = yterm; - } } - } } // This method updates the velocity field 'v' using the two complex // arrays from the previous step: 'vx' and 'vy'. Here we scale the // real components by 1/(dx*dy) to account for an unnormalized FFT. 
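The 1/(dx*dy) factor mentioned above exists because cuFFT transforms are unnormalized: a forward R2C followed by an inverse C2R multiplies every sample by dx * dy. A one-line helper stating the invariant (illustrative only):

    // cuFFT is unnormalized: C2R(R2C(x)) == dx * dy * x, so each real component
    // coming out of the inverse transform is scaled back by 1/(dx*dy).
    static inline float normalizeInverseFft(float v, int dx, int dy)
    {
        return v * (1.f / (float)(dx * dy));
    }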
-__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, int lb, size_t pitch) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, int lb, size_t pitch) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - float vxterm, vyterm; - cData nvterm; + float vxterm, vyterm; + cData nvterm; - // gtidx is the domain location in x for this thread - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + // gtidx is the domain location in x for this thread + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fjr = fi * pdx + gtidx; - vxterm = vx[fjr]; - vyterm = vy[fjr]; + if (fi < dy) { + int fjr = fi * pdx + gtidx; + vxterm = vx[fjr]; + vyterm = vy[fjr]; - // Normalize the result of the inverse FFT - float scale = 1.f / (dx * dy); - nvterm.x = vxterm * scale; - nvterm.y = vyterm * scale; + // Normalize the result of the inverse FFT + float scale = 1.f / (dx * dy); + nvterm.x = vxterm * scale; + nvterm.y = vyterm * scale; - cData *fj = (cData *)((char *)v + fi * pitch) + gtidx; - *fj = nvterm; - } - } // If this thread is inside the domain in Y - } // If this thread is inside the domain in X + cData *fj = (cData *)((char *)v + fi * pitch) + gtidx; + *fj = nvterm; + } + } // If this thread is inside the domain in Y + } // If this thread is inside the domain in X } // This method updates the particles by moving particle positions // according to the velocity field and time step. That is, for each // particle: p(t+1) = p(t) + dt * v(p(t)). 
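The pair of integer truncations that advectParticles_k (below) applies to each coordinate is a branch-free wrap into [0, 1), which keeps particles on the periodic domain. An equivalent host-side helper, for illustration:

    static inline float wrap01(float x)
    {
        x = x - (int)x;    // now in (-1, 1)
        x += 1.f;          // now in (0, 2)
        return x - (int)x; // now in [0, 1)
    }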
-__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, - float dt, int lb, size_t pitch) { - int gtidx = blockIdx.x * blockDim.x + threadIdx.x; - int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; - int p; +__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, float dt, int lb, size_t pitch) +{ + int gtidx = blockIdx.x * blockDim.x + threadIdx.x; + int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb; + int p; - // gtidx is the domain location in x for this thread - cData pterm, vterm; + // gtidx is the domain location in x for this thread + cData pterm, vterm; - if (gtidx < dx) { - for (p = 0; p < lb; p++) { - // fi is the domain location in y for this thread - int fi = gtidy + p; + if (gtidx < dx) { + for (p = 0; p < lb; p++) { + // fi is the domain location in y for this thread + int fi = gtidy + p; - if (fi < dy) { - int fj = fi * dx + gtidx; - pterm = part[fj]; + if (fi < dy) { + int fj = fi * dx + gtidx; + pterm = part[fj]; - int xvi = ((int)(pterm.x * dx)); - int yvi = ((int)(pterm.y * dy)); - vterm = *((cData *)((char *)v + yvi * pitch) + xvi); + int xvi = ((int)(pterm.x * dx)); + int yvi = ((int)(pterm.y * dy)); + vterm = *((cData *)((char *)v + yvi * pitch) + xvi); - pterm.x += dt * vterm.x; - pterm.x = pterm.x - (int)pterm.x; - pterm.x += 1.f; - pterm.x = pterm.x - (int)pterm.x; - pterm.y += dt * vterm.y; - pterm.y = pterm.y - (int)pterm.y; - pterm.y += 1.f; - pterm.y = pterm.y - (int)pterm.y; + pterm.x += dt * vterm.x; + pterm.x = pterm.x - (int)pterm.x; + pterm.x += 1.f; + pterm.x = pterm.x - (int)pterm.x; + pterm.y += dt * vterm.y; + pterm.y = pterm.y - (int)pterm.y; + pterm.y += 1.f; + pterm.y = pterm.y - (int)pterm.y; - part[fj] = pterm; - } - } // If this thread is inside the domain in Y - } // If this thread is inside the domain in X + part[fj] = pterm; + } + } // If this thread is inside the domain in Y + } // If this thread is inside the domain in X } // These are the external function calls necessary for launching fluid simuation -extern "C" void addForces(cData *v, int dx, int dy, int spx, int spy, float fx, - float fy, int r) { - dim3 tids(2 * r + 1, 2 * r + 1); +extern "C" void addForces(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r) +{ + dim3 tids(2 * r + 1, 2 * r + 1); - addForces_k<<<1, tids>>>(v, dx, dy, spx, spy, fx, fy, r, tPitch); - getLastCudaError("addForces_k failed."); + addForces_k<<<1, tids>>>(v, dx, dy, spx, spy, fx, fy, r, tPitch); + getLastCudaError("addForces_k failed."); } -extern "C" void advectVelocity(cData *v, float *vx, float *vy, int dx, int pdx, - int dy, float dt) { - dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); +extern "C" void advectVelocity(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt) +{ + dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 
0 : 1)); - dim3 tids(TIDSX, TIDSY); + dim3 tids(TIDSX, TIDSY); - updateTexture(v, DIM * sizeof(cData), DIM, tPitch); - advectVelocity_k<<>>(v, vx, vy, dx, pdx, dy, dt, TILEY / TIDSY, - texObj); - getLastCudaError("advectVelocity_k failed."); + updateTexture(v, DIM * sizeof(cData), DIM, tPitch); + advectVelocity_k<<>>(v, vx, vy, dx, pdx, dy, dt, TILEY / TIDSY, texObj); + getLastCudaError("advectVelocity_k failed."); } -extern "C" void diffuseProject(cData *vx, cData *vy, int dx, int dy, float dt, - float visc) { - // Forward FFT - checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vx, (cufftComplex *)vx)); - checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vy, (cufftComplex *)vy)); +extern "C" void diffuseProject(cData *vx, cData *vy, int dx, int dy, float dt, float visc) +{ + // Forward FFT + checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vx, (cufftComplex *)vx)); + checkCudaErrors(cufftExecR2C(planr2c, (cufftReal *)vy, (cufftComplex *)vy)); - uint3 grid = make_uint3((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1), 1); - uint3 tids = make_uint3(TIDSX, TIDSY, 1); + uint3 grid = make_uint3((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 0 : 1), 1); + uint3 tids = make_uint3(TIDSX, TIDSY, 1); - diffuseProject_k<<>>(vx, vy, dx, dy, dt, visc, TILEY / TIDSY); - getLastCudaError("diffuseProject_k failed."); + diffuseProject_k<<>>(vx, vy, dx, dy, dt, visc, TILEY / TIDSY); + getLastCudaError("diffuseProject_k failed."); - // Inverse FFT - checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vx, (cufftReal *)vx)); - checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vy, (cufftReal *)vy)); + // Inverse FFT + checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vx, (cufftReal *)vx)); + checkCudaErrors(cufftExecC2R(planc2r, (cufftComplex *)vy, (cufftReal *)vy)); } -extern "C" void updateVelocity(cData *v, float *vx, float *vy, int dx, int pdx, - int dy) { - dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); - dim3 tids(TIDSX, TIDSY); +extern "C" void updateVelocity(cData *v, float *vx, float *vy, int dx, int pdx, int dy) +{ + dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); + dim3 tids(TIDSX, TIDSY); - updateVelocity_k<<>>(v, vx, vy, dx, pdx, dy, TILEY / TIDSY, - tPitch); - getLastCudaError("updateVelocity_k failed."); + updateVelocity_k<<>>(v, vx, vy, dx, pdx, dy, TILEY / TIDSY, tPitch); + getLastCudaError("updateVelocity_k failed."); } -extern "C" void advectParticles(GLuint vbo, cData *v, int dx, int dy, - float dt) { - dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), - (dy / TILEY) + (!(dy % TILEY) ? 0 : 1)); - dim3 tids(TIDSX, TIDSY); +extern "C" void advectParticles(GLuint vbo, cData *v, int dx, int dy, float dt) +{ + dim3 grid((dx / TILEX) + (!(dx % TILEX) ? 0 : 1), (dy / TILEY) + (!(dy % TILEY) ? 
0 : 1)); + dim3 tids(TIDSX, TIDSY); - cData *p; - checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); - getLastCudaError("cudaGraphicsMapResources failed"); + cData *p; + checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); + getLastCudaError("cudaGraphicsMapResources failed"); - size_t num_bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&p, &num_bytes, - cuda_vbo_resource)); - getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&p, &num_bytes, cuda_vbo_resource)); + getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); - advectParticles_k<<>>(p, v, dx, dy, dt, TILEY / TIDSY, tPitch); - getLastCudaError("advectParticles_k failed."); + advectParticles_k<<>>(p, v, dx, dy, dt, TILEY / TIDSY, tPitch); + getLastCudaError("advectParticles_k failed."); - checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); } diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cuh b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cuh index 6c677ec2..e7485fe3 100644 --- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cuh +++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.cuh @@ -39,16 +39,14 @@ void deleteTexture(void); // This method adds constant force vectors to the velocity field // stored in 'v' according to v(x,t+1) = v(x,t) + dt * f. -__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, - float fx, float fy, int r, size_t pitch); +__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch); // This method performs the velocity advection step, where we // trace velocity vectors back in time to update each grid cell. // That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear // interpolation in the velocity space. -__global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, float dt, int lb, - cudaTextureObject_t tex); +__global__ void +advectVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt, int lb, cudaTextureObject_t tex); // This method performs velocity diffusion and forces mass conservation // in the frequency domain. The inputs 'vx' and 'vy' are complex-valued @@ -58,19 +56,16 @@ __global__ void advectVelocity_k(cData *v, float *vx, float *vy, int dx, // and k is the wavenumber. The projection step forces the Fourier // velocity vectors to be orthogonal to the wave wave vectors for each // wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t) * k) / k^2. -__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, - float visc, int lb); +__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, float visc, int lb); // This method updates the velocity field 'v' using the two complex // arrays from the previous step: 'vx' and 'vy'. Here we scale the // real components by 1/(dx*dy) to account for an unnormalized FFT. -__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, - int pdx, int dy, int lb, size_t pitch); +__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, int lb, size_t pitch); // This method updates the particles by moving particle positions // according to the velocity field and time step. 
That is, for each // particle: p(t+1) = p(t) + dt * v(p(t)). -__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, - float dt, int lb, size_t pitch); +__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, float dt, int lb, size_t pitch); #endif diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.h b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.h index e40fb222..33df553e 100644 --- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.h +++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES_kernels.h @@ -20,16 +20,14 @@ void deleteTexture(void); // This method adds constant force vectors to the velocity field // stored in 'v' according to v(x,t+1) = v(x,t) + dt * f. -__global__ void -addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch); +__global__ void addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch); // This method performs the velocity advection step, where we // trace velocity vectors back in time to update each grid cell. // That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear // interpolation in the velocity space. __global__ void -advectVelocity_k(cData *v, float *vx, float *vy, - int dx, int pdx, int dy, float dt, int lb, cudaTextureObject_t tex); +advectVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, float dt, int lb, cudaTextureObject_t tex); // This method performs velocity diffusion and forces mass conservation // in the frequency domain. The inputs 'vx' and 'vy' are complex-valued @@ -39,23 +37,16 @@ advectVelocity_k(cData *v, float *vx, float *vy, // and k is the wavenumber. The projection step forces the Fourier // velocity vectors to be orthogonal to the wave wave vectors for each // wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t) * k) / k^2. -__global__ void -diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, - float visc, int lb); +__global__ void diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt, float visc, int lb); // This method updates the velocity field 'v' using the two complex // arrays from the previous step: 'vx' and 'vy'. Here we scale the // real components by 1/(dx*dy) to account for an unnormalized FFT. -__global__ void -updateVelocity_k(cData *v, float *vx, float *vy, - int dx, int pdx, int dy, int lb, size_t pitch); +__global__ void updateVelocity_k(cData *v, float *vx, float *vy, int dx, int pdx, int dy, int lb, size_t pitch); // This method updates the particles by moving particle positions // according to the velocity field and time step. That is, for each // particle: p(t+1) = p(t) + dt * v(p(t)). 
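As an aside on the launch wrappers shown earlier in fluidsGLES_kernels.cu: the grid-sizing expression (dx / TILEX) + (!(dx % TILEX) ? 0 : 1) is a ceiling division, and the more common idiom yields the same value. Illustrative helper (not part of the patch):

    static inline unsigned int ceilDiv(unsigned int n, unsigned int tile)
    {
        return (n + tile - 1) / tile; // equals n / tile + (n % tile ? 1 : 0) for tile > 0
    }
    // e.g. dim3 grid(ceilDiv(dx, TILEX), ceilDiv(dy, TILEY));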
-__global__ void -advectParticles_k(cData *part, cData *v, int dx, int dy, - float dt, int lb, size_t pitch); +__global__ void advectParticles_k(cData *part, cData *v, int dx, int dy, float dt, int lb, size_t pitch); #endif - diff --git a/Samples/8_Platform_Specific/Tegra/fluidsGLES/graphics_interface.h b/Samples/8_Platform_Specific/Tegra/fluidsGLES/graphics_interface.h index d91760c8..dec2c0a6 100644 --- a/Samples/8_Platform_Specific/Tegra/fluidsGLES/graphics_interface.h +++ b/Samples/8_Platform_Specific/Tegra/fluidsGLES/graphics_interface.h @@ -31,165 +31,183 @@ #include Display *display; -int screen; -Window win = 0; +int screen; +Window win = 0; -#include #include #include +#include -#define GET_GLERROR(ret) \ - \ -{ \ - GLenum err = glGetError(); \ - if (err != GL_NO_ERROR) { \ - fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, \ - err); \ - fflush(stderr); \ - \ - switch (err) { \ - case GL_INVALID_ENUM: \ - printf("GL_INVALID_ENUM\n"); \ - break; \ - case GL_INVALID_VALUE: \ - printf("GL_INVALID_VALUE\n"); \ - break; \ - case GL_INVALID_OPERATION: \ - printf("GL_INVALID_OPERATION\n"); \ - break; \ - case GL_OUT_OF_MEMORY: \ - printf("GL_OUT_OF_MEMORY\n"); \ - break; \ - case GL_INVALID_FRAMEBUFFER_OPERATION: \ - printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \ - break; \ - default: \ - printf("UKNOWN OPENGL ERROR CODE 0x%x\n", err); \ - }; \ - } \ - \ -} +#define GET_GLERROR(ret) \ + \ + { \ + GLenum err = glGetError(); \ + if (err != GL_NO_ERROR) { \ + fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, err); \ + fflush(stderr); \ + \ + switch (err) { \ + case GL_INVALID_ENUM: \ + printf("GL_INVALID_ENUM\n"); \ + break; \ + case GL_INVALID_VALUE: \ + printf("GL_INVALID_VALUE\n"); \ + break; \ + case GL_INVALID_OPERATION: \ + printf("GL_INVALID_OPERATION\n"); \ + break; \ + case GL_OUT_OF_MEMORY: \ + printf("GL_OUT_OF_MEMORY\n"); \ + break; \ + case GL_INVALID_FRAMEBUFFER_OPERATION: \ + printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \ + break; \ + default: \ + printf("UKNOWN OPENGL ERROR CODE 0x%x\n", err); \ + }; \ + } \ + } EGLDisplay eglDisplay = EGL_NO_DISPLAY; EGLSurface eglSurface = EGL_NO_SURFACE; EGLContext eglContext = EGL_NO_CONTEXT; -int graphics_setup_window(int xpos, int ypos, int width, int height, - const char *windowname) { - // OpenGL ES 3.1 - EGLint configAttrs[] = { - EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, EGL_DEPTH_SIZE, 16, - EGL_SAMPLE_BUFFERS, 0, EGL_SAMPLES, 0, - // EGL_CONFORMANT, EGL_OPENGL_BIT, - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // 3_BIT_KHR, - EGL_NONE}; - EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; +int graphics_setup_window(int xpos, int ypos, int width, int height, const char *windowname) +{ + // OpenGL ES 3.1 + EGLint configAttrs[] = {EGL_RED_SIZE, + 1, + EGL_GREEN_SIZE, + 1, + EGL_BLUE_SIZE, + 1, + EGL_DEPTH_SIZE, + 16, + EGL_SAMPLE_BUFFERS, + 0, + EGL_SAMPLES, + 0, + // EGL_CONFORMANT, EGL_OPENGL_BIT, + EGL_RENDERABLE_TYPE, + EGL_OPENGL_ES2_BIT, // 3_BIT_KHR, + EGL_NONE}; + EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; - EGLConfig *configList = NULL; - EGLint configCount; + EGLConfig *configList = NULL; + EGLint configCount; - display = XOpenDisplay(NULL); - if (!display) error_exit("Error opening X display.\n"); + display = XOpenDisplay(NULL); + if (!display) + error_exit("Error opening X display.\n"); - screen = DefaultScreen(display); + screen = DefaultScreen(display); - eglDisplay = eglGetDisplay(0); + eglDisplay = eglGetDisplay(0); - if 
(eglDisplay == EGL_NO_DISPLAY) - error_exit("EGL failed to obtain display\n"); + if (eglDisplay == EGL_NO_DISPLAY) + error_exit("EGL failed to obtain display\n"); - if (!eglInitialize(eglDisplay, 0, 0)) - error_exit("EGL failed to initialize\n"); + if (!eglInitialize(eglDisplay, 0, 0)) + error_exit("EGL failed to initialize\n"); - if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || - !configCount) - error_exit("EGL failed to return any matching configurations\n"); + if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || !configCount) + error_exit("EGL failed to return any matching configurations\n"); - configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); + configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); - if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, - &configCount) || - !configCount) - error_exit("EGL failed to populate configuration list\n"); + if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, &configCount) || !configCount) + error_exit("EGL failed to populate configuration list\n"); - Window xRootWindow = DefaultRootWindow(display); - XSetWindowAttributes xCreateWindowAttributes; - xCreateWindowAttributes.event_mask = ExposureMask; - win = XCreateWindow(display, xRootWindow, 0, 0, width, height, 0, - CopyFromParent, InputOutput, CopyFromParent, CWEventMask, - &xCreateWindowAttributes); - XMapWindow(display, win); - Atom netWmStateAtom = XInternAtom(display, "_NET_WM_STATE", false); - XEvent xEvent; - memset(&xEvent, 0, sizeof(xEvent)); - xEvent.type = ClientMessage; - xEvent.xclient.window = win; - xEvent.xclient.message_type = netWmStateAtom; - xEvent.xclient.format = 32; - xEvent.xclient.data.l[0] = 1; - xEvent.xclient.data.l[1] = false; - XSendEvent(display, xRootWindow, false, SubstructureNotifyMask, &xEvent); + Window xRootWindow = DefaultRootWindow(display); + XSetWindowAttributes xCreateWindowAttributes; + xCreateWindowAttributes.event_mask = ExposureMask; + win = XCreateWindow(display, + xRootWindow, + 0, + 0, + width, + height, + 0, + CopyFromParent, + InputOutput, + CopyFromParent, + CWEventMask, + &xCreateWindowAttributes); + XMapWindow(display, win); + Atom netWmStateAtom = XInternAtom(display, "_NET_WM_STATE", false); + XEvent xEvent; + memset(&xEvent, 0, sizeof(xEvent)); + xEvent.type = ClientMessage; + xEvent.xclient.window = win; + xEvent.xclient.message_type = netWmStateAtom; + xEvent.xclient.format = 32; + xEvent.xclient.data.l[0] = 1; + xEvent.xclient.data.l[1] = false; + XSendEvent(display, xRootWindow, false, SubstructureNotifyMask, &xEvent); - XStoreName(display, win, windowname); + XStoreName(display, win, windowname); - XSelectInput(display, win, ExposureMask | KeyPressMask | ButtonPressMask | - ButtonReleaseMask | KeyReleaseMask | - VisibilityChangeMask | PointerMotionMask); + XSelectInput(display, + win, + ExposureMask | KeyPressMask | ButtonPressMask | ButtonReleaseMask | KeyReleaseMask + | VisibilityChangeMask | PointerMotionMask); - EGLint windowAttrs[] = {EGL_NONE}; + EGLint windowAttrs[] = {EGL_NONE}; - eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], - (EGLNativeWindowType)win, windowAttrs); + eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], (EGLNativeWindowType)win, windowAttrs); - if (!eglSurface) error_exit("EGL couldn't create window\n"); + if (!eglSurface) + error_exit("EGL couldn't create window\n"); - eglBindAPI(EGL_OPENGL_ES_API); + eglBindAPI(EGL_OPENGL_ES_API); - eglContext = eglCreateContext(eglDisplay, 
configList[0], NULL, contextAttrs); - if (!eglContext) error_exit("EGL couldn't create context\n"); + eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); + if (!eglContext) + error_exit("EGL couldn't create context\n"); - if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) - error_exit("EGL couldn't make context/surface current\n"); + if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) + error_exit("EGL couldn't make context/surface current\n"); - EGLint Context_RendererType; - eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, - &Context_RendererType); + EGLint Context_RendererType; + eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, &Context_RendererType); - switch (Context_RendererType) { + switch (Context_RendererType) { case EGL_OPENGL_API: - printf("Using OpenGL API is not supported\n"); - exit(EXIT_FAILURE); - break; + printf("Using OpenGL API is not supported\n"); + exit(EXIT_FAILURE); + break; case EGL_OPENGL_ES_API: - printf("Using OpenGL ES API"); - break; + printf("Using OpenGL ES API"); + break; case EGL_OPENVG_API: - error_exit("Context Query Returned OpenVG. This is Unsupported\n"); + error_exit("Context Query Returned OpenVG. This is Unsupported\n"); default: - error_exit("Unknown Context Type. %04X\n", Context_RendererType); - } + error_exit("Unknown Context Type. %04X\n", Context_RendererType); + } - return 1; + return 1; } -void graphics_set_windowtitle(const char *windowname) { - XStoreName(display, win, windowname); -} +void graphics_set_windowtitle(const char *windowname) { XStoreName(display, win, windowname); } void graphics_swap_buffers() { eglSwapBuffers(eglDisplay, eglSurface); } -void graphics_close_window() { - if (eglDisplay != EGL_NO_DISPLAY) { - eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); +void graphics_close_window() +{ + if (eglDisplay != EGL_NO_DISPLAY) { + eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); - if (eglContext != EGL_NO_CONTEXT) eglDestroyContext(eglDisplay, eglContext); + if (eglContext != EGL_NO_CONTEXT) + eglDestroyContext(eglDisplay, eglContext); - if (eglSurface != EGL_NO_SURFACE) eglDestroySurface(eglDisplay, eglSurface); + if (eglSurface != EGL_NO_SURFACE) + eglDestroySurface(eglDisplay, eglSurface); - eglTerminate(eglDisplay); - } + eglTerminate(eglDisplay); + } - if (win) XDestroyWindow(display, win); + if (win) + XDestroyWindow(display, win); - XCloseDisplay(display); + XCloseDisplay(display); } diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/README.md b/Samples/8_Platform_Specific/Tegra/nbody_opengles/README.md index 2dda4e15..3a20edb5 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/README.md +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystem.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystem.h index 2d11c1ed..9455dc2b 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystem.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystem.h @@ -30,257 +30,262 @@ #include -enum NBodyConfig { - NBODY_CONFIG_RANDOM, - NBODY_CONFIG_SHELL, - NBODY_CONFIG_EXPAND, - NBODY_NUM_CONFIGS -}; +enum NBodyConfig { NBODY_CONFIG_RANDOM, NBODY_CONFIG_SHELL, NBODY_CONFIG_EXPAND, NBODY_NUM_CONFIGS }; enum BodyArray { - BODYSYSTEM_POSITION, - BODYSYSTEM_VELOCITY, + BODYSYSTEM_POSITION, + BODYSYSTEM_VELOCITY, }; -template -struct vec3 { - typedef float Type; -}; // dummy -template <> -struct vec3 { - typedef float3 Type; +template struct vec3 +{ + typedef float Type; +}; // dummy +template <> struct vec3 +{ + typedef float3 Type; }; -template <> -struct vec3 { - typedef double3 Type; +template <> struct vec3 +{ + typedef double3 Type; }; -template -struct vec4 { - typedef float Type; -}; // dummy -template <> -struct vec4 { - typedef float4 Type; +template struct vec4 +{ + typedef float Type; +}; // dummy +template <> struct vec4 +{ + typedef float4 Type; }; -template <> -struct vec4 { - typedef double4 Type; +template <> struct vec4 +{ + typedef double4 Type; }; class string; // BodySystem abstract base class -template -class BodySystem { - public: // methods - BodySystem(int numBodies) {} - virtual ~BodySystem() {} +template class BodySystem +{ +public: // methods + BodySystem(int numBodies) {} + virtual ~BodySystem() {} - virtual void loadTipsyFile(const std::string &filename) = 0; + virtual void loadTipsyFile(const std::string &filename) = 0; - virtual void update(T deltaTime) = 0; + virtual void update(T deltaTime) = 0; - virtual void setSoftening(T softening) = 0; - virtual void setDamping(T damping) = 0; + virtual void setSoftening(T softening) = 0; + virtual void setDamping(T damping) = 0; - virtual T *getArray(BodyArray array) = 0; - virtual void setArray(BodyArray array, const T *data) = 0; + virtual T *getArray(BodyArray array) = 0; + virtual void setArray(BodyArray array, const T *data) = 0; - virtual unsigned int getCurrentReadBuffer() const = 0; + virtual unsigned int getCurrentReadBuffer() const = 0; - virtual unsigned int getNumBodies() const = 0; + virtual unsigned int getNumBodies() const = 0; - virtual void synchronizeThreads() const {}; + virtual void synchronizeThreads() const {}; - protected: // methods - BodySystem() {} // default constructor +protected: // methods + BodySystem() {} // default constructor - virtual void _initialize(int numBodies) = 0; - virtual void _finalize() = 0; + virtual void _initialize(int numBodies) = 0; + virtual void _finalize() = 0; }; -inline float3 scalevec(float3 &vector, float scalar) { - float3 rt = vector; - rt.x *= scalar; - rt.y *= scalar; - rt.z *= scalar; - return rt; +inline float3 scalevec(float3 &vector, float scalar) +{ + float3 rt = vector; + rt.x *= scalar; + rt.y *= scalar; + rt.z *= scalar; + return rt; } -inline float normalize(float3 &vector) { - float dist = - sqrtf(vector.x * vector.x + vector.y * vector.y + vector.z * vector.z); +inline float normalize(float3 &vector) +{ + float dist = sqrtf(vector.x * vector.x + vector.y * vector.y + vector.z * vector.z); - if (dist > 1e-6) { - vector.x /= dist; - vector.y /= dist; - vector.z /= dist; - } + if (dist > 1e-6) { + vector.x /= dist; + vector.y /= dist; + vector.z /= dist; + } - return dist; 
+ return dist; } -inline float dot(float3 v0, float3 v1) { - return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; -} +inline float dot(float3 v0, float3 v1) { return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; } -inline float3 cross(float3 v0, float3 v1) { - float3 rt; - rt.x = v0.y * v1.z - v0.z * v1.y; - rt.y = v0.z * v1.x - v0.x * v1.z; - rt.z = v0.x * v1.y - v0.y * v1.x; - return rt; +inline float3 cross(float3 v0, float3 v1) +{ + float3 rt; + rt.x = v0.y * v1.z - v0.z * v1.y; + rt.y = v0.z * v1.x - v0.x * v1.z; + rt.z = v0.x * v1.y - v0.y * v1.x; + return rt; } // utility function template -void randomizeBodies(NBodyConfig config, T *pos, T *vel, float *color, - float clusterScale, float velocityScale, int numBodies, - bool vec4vel) { - switch (config) { +void randomizeBodies(NBodyConfig config, + T *pos, + T *vel, + float *color, + float clusterScale, + float velocityScale, + int numBodies, + bool vec4vel) +{ + switch (config) { default: case NBODY_CONFIG_RANDOM: { - float scale = clusterScale * std::max(1.0f, numBodies / (1024.0f)); - float vscale = velocityScale * scale; + float scale = clusterScale * std::max(1.0f, numBodies / (1024.0f)); + float vscale = velocityScale * scale; - int p = 0, v = 0; - int i = 0; + int p = 0, v = 0; + int i = 0; - while (i < numBodies) { - float3 point; - // const int scale = 16; - point.x = rand() / (float)RAND_MAX * 2 - 1; - point.y = rand() / (float)RAND_MAX * 2 - 1; - point.z = rand() / (float)RAND_MAX * 2 - 1; - float lenSqr = dot(point, point); + while (i < numBodies) { + float3 point; + // const int scale = 16; + point.x = rand() / (float)RAND_MAX * 2 - 1; + point.y = rand() / (float)RAND_MAX * 2 - 1; + point.z = rand() / (float)RAND_MAX * 2 - 1; + float lenSqr = dot(point, point); - if (lenSqr > 1) continue; + if (lenSqr > 1) + continue; - float3 velocity; - velocity.x = rand() / (float)RAND_MAX * 2 - 1; - velocity.y = rand() / (float)RAND_MAX * 2 - 1; - velocity.z = rand() / (float)RAND_MAX * 2 - 1; - lenSqr = dot(velocity, velocity); + float3 velocity; + velocity.x = rand() / (float)RAND_MAX * 2 - 1; + velocity.y = rand() / (float)RAND_MAX * 2 - 1; + velocity.z = rand() / (float)RAND_MAX * 2 - 1; + lenSqr = dot(velocity, velocity); - if (lenSqr > 1) continue; + if (lenSqr > 1) + continue; - pos[p++] = point.x * scale; // pos.x - pos[p++] = point.y * scale; // pos.y - pos[p++] = point.z * scale; // pos.z - pos[p++] = 1.0f; // mass + pos[p++] = point.x * scale; // pos.x + pos[p++] = point.y * scale; // pos.y + pos[p++] = point.z * scale; // pos.z + pos[p++] = 1.0f; // mass - vel[v++] = velocity.x * vscale; // pos.x - vel[v++] = velocity.y * vscale; // pos.x - vel[v++] = velocity.z * vscale; // pos.x + vel[v++] = velocity.x * vscale; // pos.x + vel[v++] = velocity.y * vscale; // pos.x + vel[v++] = velocity.z * vscale; // pos.x - if (vec4vel) vel[v++] = 1.0f; // inverse mass + if (vec4vel) + vel[v++] = 1.0f; // inverse mass - i++; - } + i++; + } } break; case NBODY_CONFIG_SHELL: { - float scale = clusterScale; - float vscale = scale * velocityScale; - float inner = 2.5f * scale; - float outer = 4.0f * scale; + float scale = clusterScale; + float vscale = scale * velocityScale; + float inner = 2.5f * scale; + float outer = 4.0f * scale; - int p = 0, v = 0; - int i = 0; + int p = 0, v = 0; + int i = 0; - while (i < numBodies) // for(int i=0; i < numBodies; i++) - { - float x, y, z; - x = rand() / (float)RAND_MAX * 2 - 1; - y = rand() / (float)RAND_MAX * 2 - 1; - z = rand() / (float)RAND_MAX * 2 - 1; + while (i < numBodies) // for(int i=0; i < numBodies; 
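
// ---------------------------------------------------------------------------
// NBODY_CONFIG_RANDOM above draws candidate points uniformly in the cube
// [-1,1]^3 and rejects any with squared length > 1, which yields a uniform
// distribution inside the unit ball (acceptance rate is the volume ratio,
// pi/6, about 52%). The same rejection loop in isolation, using C's rand()
// just as the sample does:

#include <cstdio>
#include <cstdlib>

static void samplePointInUnitBall(float &x, float &y, float &z)
{
    float lenSqr;
    do {
        x = rand() / (float)RAND_MAX * 2 - 1;   // uniform in [-1, 1]
        y = rand() / (float)RAND_MAX * 2 - 1;
        z = rand() / (float)RAND_MAX * 2 - 1;
        lenSqr = x * x + y * y + z * z;
    } while (lenSqr > 1.0f);                    // reject points outside the ball
}

int main()
{
    float x, y, z;
    samplePointInUnitBall(x, y, z);
    printf("%f %f %f\n", x, y, z);
    return 0;
}
// ---------------------------------------------------------------------------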
i++) + { + float x, y, z; + x = rand() / (float)RAND_MAX * 2 - 1; + y = rand() / (float)RAND_MAX * 2 - 1; + z = rand() / (float)RAND_MAX * 2 - 1; - float3 point = {x, y, z}; - float len = normalize(point); + float3 point = {x, y, z}; + float len = normalize(point); - if (len > 1) continue; + if (len > 1) + continue; - pos[p++] = - point.x * (inner + (outer - inner) * rand() / (float)RAND_MAX); - pos[p++] = - point.y * (inner + (outer - inner) * rand() / (float)RAND_MAX); - pos[p++] = - point.z * (inner + (outer - inner) * rand() / (float)RAND_MAX); - pos[p++] = 1.0f; + pos[p++] = point.x * (inner + (outer - inner) * rand() / (float)RAND_MAX); + pos[p++] = point.y * (inner + (outer - inner) * rand() / (float)RAND_MAX); + pos[p++] = point.z * (inner + (outer - inner) * rand() / (float)RAND_MAX); + pos[p++] = 1.0f; - x = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); - y = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); - z = 1.0f; // * (rand() / (float) RAND_MAX * 2 - 1); - float3 axis = {x, y, z}; - normalize(axis); + x = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); + y = 0.0f; // * (rand() / (float) RAND_MAX * 2 - 1); + z = 1.0f; // * (rand() / (float) RAND_MAX * 2 - 1); + float3 axis = {x, y, z}; + normalize(axis); - if (1 - dot(point, axis) < 1e-6) { - axis.x = point.y; - axis.y = point.x; - normalize(axis); + if (1 - dot(point, axis) < 1e-6) { + axis.x = point.y; + axis.y = point.x; + normalize(axis); + } + + // if (point.y < 0) axis = scalevec(axis, -1); + float3 vv = {(float)pos[4 * i], (float)pos[4 * i + 1], (float)pos[4 * i + 2]}; + vv = cross(vv, axis); + vel[v++] = vv.x * vscale; + vel[v++] = vv.y * vscale; + vel[v++] = vv.z * vscale; + + if (vec4vel) + vel[v++] = 1.0f; + + i++; } - - // if (point.y < 0) axis = scalevec(axis, -1); - float3 vv = {(float)pos[4 * i], (float)pos[4 * i + 1], - (float)pos[4 * i + 2]}; - vv = cross(vv, axis); - vel[v++] = vv.x * vscale; - vel[v++] = vv.y * vscale; - vel[v++] = vv.z * vscale; - - if (vec4vel) vel[v++] = 1.0f; - - i++; - } } break; case NBODY_CONFIG_EXPAND: { - float scale = clusterScale * numBodies / (1024.f); + float scale = clusterScale * numBodies / (1024.f); - if (scale < 1.0f) scale = clusterScale; + if (scale < 1.0f) + scale = clusterScale; - float vscale = scale * velocityScale; + float vscale = scale * velocityScale; - int p = 0, v = 0; + int p = 0, v = 0; - for (int i = 0; i < numBodies;) { - float3 point; + for (int i = 0; i < numBodies;) { + float3 point; - point.x = rand() / (float)RAND_MAX * 2 - 1; - point.y = rand() / (float)RAND_MAX * 2 - 1; - point.z = rand() / (float)RAND_MAX * 2 - 1; + point.x = rand() / (float)RAND_MAX * 2 - 1; + point.y = rand() / (float)RAND_MAX * 2 - 1; + point.z = rand() / (float)RAND_MAX * 2 - 1; - float lenSqr = dot(point, point); + float lenSqr = dot(point, point); - if (lenSqr > 1) continue; + if (lenSqr > 1) + continue; - pos[p++] = point.x * scale; // pos.x - pos[p++] = point.y * scale; // pos.y - pos[p++] = point.z * scale; // pos.z - pos[p++] = 1.0f; // mass - vel[v++] = point.x * vscale; // pos.x - vel[v++] = point.y * vscale; // pos.x - vel[v++] = point.z * vscale; // pos.x + pos[p++] = point.x * scale; // pos.x + pos[p++] = point.y * scale; // pos.y + pos[p++] = point.z * scale; // pos.z + pos[p++] = 1.0f; // mass + vel[v++] = point.x * vscale; // pos.x + vel[v++] = point.y * vscale; // pos.x + vel[v++] = point.z * vscale; // pos.x - if (vec4vel) vel[v++] = 1.0f; // inverse mass + if (vec4vel) + vel[v++] = 1.0f; // inverse mass - i++; - } + i++; + } } break; - } - - if (color) { - int 
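
// ---------------------------------------------------------------------------
// In NBODY_CONFIG_SHELL above, each body's velocity is cross(position, axis):
// the cross product of the radial position with a fixed axis is perpendicular
// to both, so bodies start on roughly circular orbits around that axis. A
// quick standalone check of that perpendicularity (vec3f and the sample point
// are hypothetical; the formulas match dot() and cross() in bodysystem.h):

#include <cstdio>

struct vec3f { float x, y, z; };

static float dot3(vec3f a, vec3f b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
static vec3f cross3(vec3f a, vec3f b)
{
    vec3f r = {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
    return r;
}

int main()
{
    vec3f pos  = {0.6f, 0.0f, 0.8f};   // a unit-length point on the shell
    vec3f axis = {0.0f, 0.0f, 1.0f};   // the sample's default rotation axis
    vec3f vel  = cross3(pos, axis);    // tangential direction

    // Both dot products are ~0: the velocity is tangent to the shell and
    // perpendicular to the rotation axis, giving a near-circular orbit.
    printf("v.pos = %g, v.axis = %g\n", dot3(vel, pos), dot3(vel, axis));
    return 0;
}
// ---------------------------------------------------------------------------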
v = 0; - - for (int i = 0; i < numBodies; i++) { - // const int scale = 16; - color[v++] = rand() / (float)RAND_MAX; - color[v++] = rand() / (float)RAND_MAX; - color[v++] = rand() / (float)RAND_MAX; - color[v++] = 1.0f; } - } + + if (color) { + int v = 0; + + for (int i = 0; i < numBodies; i++) { + // const int scale = 16; + color[v++] = rand() / (float)RAND_MAX; + color[v++] = rand() / (float)RAND_MAX; + color[v++] = rand() / (float)RAND_MAX; + color[v++] = 1.0f; + } + } } -#endif // __BODYSYSTEM_H__ +#endif // __BODYSYSTEM_H__ diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu.h index 700e385a..11553ffb 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu.h @@ -31,49 +31,47 @@ #include "bodysystem.h" // CPU Body System -template -class BodySystemCPU : public BodySystem { - public: - BodySystemCPU(int numBodies); - virtual ~BodySystemCPU(); +template class BodySystemCPU : public BodySystem +{ +public: + BodySystemCPU(int numBodies); + virtual ~BodySystemCPU(); - virtual void loadTipsyFile(const std::string &filename); + virtual void loadTipsyFile(const std::string &filename); - virtual void update(T deltaTime); + virtual void update(T deltaTime); - virtual void setSoftening(T softening) { - m_softeningSquared = softening * softening; - } - virtual void setDamping(T damping) { m_damping = damping; } + virtual void setSoftening(T softening) { m_softeningSquared = softening * softening; } + virtual void setDamping(T damping) { m_damping = damping; } - virtual T *getArray(BodyArray array); - virtual void setArray(BodyArray array, const T *data); + virtual T *getArray(BodyArray array); + virtual void setArray(BodyArray array, const T *data); - virtual unsigned int getCurrentReadBuffer() const { return 0; } + virtual unsigned int getCurrentReadBuffer() const { return 0; } - virtual unsigned int getNumBodies() const { return m_numBodies; } + virtual unsigned int getNumBodies() const { return m_numBodies; } - protected: // methods - BodySystemCPU() {} // default constructor +protected: // methods + BodySystemCPU() {} // default constructor - virtual void _initialize(int numBodies); - virtual void _finalize(); + virtual void _initialize(int numBodies); + virtual void _finalize(); - void _computeNBodyGravitation(); - void _integrateNBodySystem(T deltaTime); + void _computeNBodyGravitation(); + void _integrateNBodySystem(T deltaTime); - protected: // data - int m_numBodies; - bool m_bInitialized; +protected: // data + int m_numBodies; + bool m_bInitialized; - T *m_pos; - T *m_vel; - T *m_force; + T *m_pos; + T *m_vel; + T *m_force; - T m_softeningSquared; - T m_damping; + T m_softeningSquared; + T m_damping; }; #include "bodysystemcpu_impl.h" -#endif // __BODYSYSTEMCPU_H__ +#endif // __BODYSYSTEMCPU_H__ diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu_impl.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu_impl.h index 14130064..bf0c7437 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu_impl.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcpu_impl.h @@ -25,15 +25,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "bodysystemcpu.h" - -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include + +#include "bodysystemcpu.h" #include "tipsy.h" #ifdef OPENMP @@ -42,239 +42,229 @@ template BodySystemCPU::BodySystemCPU(int numBodies) - : m_numBodies(numBodies), - m_bInitialized(false), - m_force(0), - m_softeningSquared(.00125f), - m_damping(0.995f) { - m_pos = 0; - m_vel = 0; + : m_numBodies(numBodies) + , m_bInitialized(false) + , m_force(0) + , m_softeningSquared(.00125f) + , m_damping(0.995f) +{ + m_pos = 0; + m_vel = 0; - _initialize(numBodies); + _initialize(numBodies); } -template -BodySystemCPU::~BodySystemCPU() { - _finalize(); - m_numBodies = 0; +template BodySystemCPU::~BodySystemCPU() +{ + _finalize(); + m_numBodies = 0; } -template -void BodySystemCPU::_initialize(int numBodies) { - assert(!m_bInitialized); +template void BodySystemCPU::_initialize(int numBodies) +{ + assert(!m_bInitialized); - m_numBodies = numBodies; + m_numBodies = numBodies; - m_pos = new T[m_numBodies * 4]; - m_vel = new T[m_numBodies * 4]; - m_force = new T[m_numBodies * 3]; + m_pos = new T[m_numBodies * 4]; + m_vel = new T[m_numBodies * 4]; + m_force = new T[m_numBodies * 3]; - memset(m_pos, 0, m_numBodies * 4 * sizeof(T)); - memset(m_vel, 0, m_numBodies * 4 * sizeof(T)); - memset(m_force, 0, m_numBodies * 3 * sizeof(T)); + memset(m_pos, 0, m_numBodies * 4 * sizeof(T)); + memset(m_vel, 0, m_numBodies * 4 * sizeof(T)); + memset(m_force, 0, m_numBodies * 3 * sizeof(T)); - m_bInitialized = true; + m_bInitialized = true; } -template -void BodySystemCPU::_finalize() { - assert(m_bInitialized); +template void BodySystemCPU::_finalize() +{ + assert(m_bInitialized); - delete[] m_pos; - delete[] m_vel; - delete[] m_force; + delete[] m_pos; + delete[] m_vel; + delete[] m_force; - m_bInitialized = false; + m_bInitialized = false; } -template -void BodySystemCPU::loadTipsyFile(const std::string &filename) { - if (m_bInitialized) _finalize(); +template void BodySystemCPU::loadTipsyFile(const std::string &filename) +{ + if (m_bInitialized) + _finalize(); - vector::Type> positions; - vector::Type> velocities; - vector ids; + vector::Type> positions; + vector::Type> velocities; + vector ids; - int nBodies = 0; - int nFirst = 0, nSecond = 0, nThird = 0; + int nBodies = 0; + int nFirst = 0, nSecond = 0, nThird = 0; - read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, - nSecond, nThird); + read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, nSecond, nThird); - _initialize(nBodies); + _initialize(nBodies); - memcpy(m_pos, &positions[0], sizeof(vec4) * nBodies); - memcpy(m_vel, &velocities[0], sizeof(vec4) * nBodies); + memcpy(m_pos, &positions[0], sizeof(vec4) * nBodies); + memcpy(m_vel, &velocities[0], sizeof(vec4) * nBodies); } -template -void BodySystemCPU::update(T deltaTime) { - assert(m_bInitialized); +template void BodySystemCPU::update(T deltaTime) +{ + assert(m_bInitialized); - _integrateNBodySystem(deltaTime); + _integrateNBodySystem(deltaTime); - // std::swap(m_currentRead, m_currentWrite); + // std::swap(m_currentRead, m_currentWrite); } -template -T *BodySystemCPU::getArray(BodyArray array) { - assert(m_bInitialized); +template T *BodySystemCPU::getArray(BodyArray array) +{ + assert(m_bInitialized); - T *data = 0; + T *data = 0; - switch (array) { + switch (array) { default: case BODYSYSTEM_POSITION: - data = m_pos; - break; + data = m_pos; + break; case BODYSYSTEM_VELOCITY: - data = m_vel; - break; - } - - 
return data; -} - -template -void BodySystemCPU::setArray(BodyArray array, const T *data) { - assert(m_bInitialized); - - T *target = 0; - - switch (array) { - default: - case BODYSYSTEM_POSITION: - target = m_pos; - break; - - case BODYSYSTEM_VELOCITY: - target = m_vel; - break; - } - - memcpy(target, data, m_numBodies * 4 * sizeof(T)); -} - -template -T sqrt_T(T x) { - return sqrt(x); -} - -template <> -float sqrt_T(float x) { - return sqrtf(x); -} - -template -void bodyBodyInteraction(T accel[3], T posMass0[4], T posMass1[4], - T softeningSquared) { - T r[3]; - - // r_01 [3 FLOPS] - r[0] = posMass1[0] - posMass0[0]; - r[1] = posMass1[1] - posMass0[1]; - r[2] = posMass1[2] - posMass0[2]; - - // d^2 + e^2 [6 FLOPS] - T distSqr = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; - distSqr += softeningSquared; - - // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] - T invDist = (T)1.0 / (T)sqrt((double)distSqr); - T invDistCube = invDist * invDist * invDist; - - // s = m_j * invDistCube [1 FLOP] - T s = posMass1[3] * invDistCube; - - // (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS] - accel[0] += r[0] * s; - accel[1] += r[1] * s; - accel[2] += r[2] * s; -} - -template -void BodySystemCPU::_computeNBodyGravitation() { -#ifdef OPENMP -#pragma omp parallel for -#endif - - for (int i = 0; i < m_numBodies; i++) { - int indexForce = 3 * i; - - T acc[3] = {0, 0, 0}; - - // We unroll this loop 4X for a small performance boost. - int j = 0; - - while (j < m_numBodies) { - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; - bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], - m_softeningSquared); - j++; + data = m_vel; + break; } - m_force[indexForce] = acc[0]; - m_force[indexForce + 1] = acc[1]; - m_force[indexForce + 2] = acc[2]; - } + return data; } -template -void BodySystemCPU::_integrateNBodySystem(T deltaTime) { - _computeNBodyGravitation(); +template void BodySystemCPU::setArray(BodyArray array, const T *data) +{ + assert(m_bInitialized); + + T *target = 0; + + switch (array) { + default: + case BODYSYSTEM_POSITION: + target = m_pos; + break; + + case BODYSYSTEM_VELOCITY: + target = m_vel; + break; + } + + memcpy(target, data, m_numBodies * 4 * sizeof(T)); +} + +template T sqrt_T(T x) { return sqrt(x); } + +template <> float sqrt_T(float x) { return sqrtf(x); } + +template void bodyBodyInteraction(T accel[3], T posMass0[4], T posMass1[4], T softeningSquared) +{ + T r[3]; + + // r_01 [3 FLOPS] + r[0] = posMass1[0] - posMass0[0]; + r[1] = posMass1[1] - posMass0[1]; + r[2] = posMass1[2] - posMass0[2]; + + // d^2 + e^2 [6 FLOPS] + T distSqr = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; + distSqr += softeningSquared; + + // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] + T invDist = (T)1.0 / (T)sqrt((double)distSqr); + T invDistCube = invDist * invDist * invDist; + + // s = m_j * invDistCube [1 FLOP] + T s = posMass1[3] * invDistCube; + + // (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS] + accel[0] += r[0] * s; + accel[1] += r[1] * s; + accel[2] += r[2] * s; +} + +template void BodySystemCPU::_computeNBodyGravitation() +{ +#ifdef OPENMP +#pragma omp parallel for +#endif + + for (int i = 0; i < m_numBodies; i++) { + int indexForce = 3 * i; + + T acc[3] = {0, 0, 0}; + + // We unroll this loop 4X for a small performance boost. 
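
// ---------------------------------------------------------------------------
// bodyBodyInteraction above accumulates the softened acceleration
//     a_i += m_j * r_ij / (|r_ij|^2 + eps^2)^(3/2),
// where eps^2 (softeningSquared) keeps the force finite when two bodies
// nearly coincide. Note that the 4x-unrolled loop that follows strides four
// interactions per pass, so it relies on numBodies being a multiple of 4.
// A host-only version of the interaction showing what softening buys
// (the coincident-body values are made up for illustration):

#include <cmath>
#include <cstdio>

static void interactHost(float accel[3], const float posMass0[4],
                         const float posMass1[4], float softeningSquared)
{
    float rx = posMass1[0] - posMass0[0];
    float ry = posMass1[1] - posMass0[1];
    float rz = posMass1[2] - posMass0[2];

    float distSqr = rx * rx + ry * ry + rz * rz + softeningSquared;
    float invDist = 1.0f / sqrtf(distSqr);
    float s = posMass1[3] * invDist * invDist * invDist;   // m_j / dist^3

    accel[0] += rx * s;
    accel[1] += ry * s;
    accel[2] += rz * s;
}

int main()
{
    float a1[3] = {0, 0, 0}, a2[3] = {0, 0, 0};
    float b0[4] = {0, 0, 0, 1}, b1[4] = {0, 0, 0, 1};   // coincident bodies
    interactHost(a1, b0, b1, 0.00125f * 0.00125f);      // softened: finite (0)
    interactHost(a2, b0, b1, 0.0f);                     // unsoftened: 0 * inf = NaN
    printf("softened: %g, unsoftened: %g\n", a1[0], a2[0]);
    return 0;
}
// ---------------------------------------------------------------------------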
+ int j = 0; + + while (j < m_numBodies) { + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + bodyBodyInteraction(acc, &m_pos[4 * i], &m_pos[4 * j], m_softeningSquared); + j++; + } + + m_force[indexForce] = acc[0]; + m_force[indexForce + 1] = acc[1]; + m_force[indexForce + 2] = acc[2]; + } +} + +template void BodySystemCPU::_integrateNBodySystem(T deltaTime) +{ + _computeNBodyGravitation(); #ifdef OPENMP #pragma omp parallel for #endif - for (int i = 0; i < m_numBodies; ++i) { - int index = 4 * i; - int indexForce = 3 * i; + for (int i = 0; i < m_numBodies; ++i) { + int index = 4 * i; + int indexForce = 3 * i; - T pos[3], vel[3], force[3]; - pos[0] = m_pos[index + 0]; - pos[1] = m_pos[index + 1]; - pos[2] = m_pos[index + 2]; - T invMass = m_pos[index + 3]; + T pos[3], vel[3], force[3]; + pos[0] = m_pos[index + 0]; + pos[1] = m_pos[index + 1]; + pos[2] = m_pos[index + 2]; + T invMass = m_pos[index + 3]; - vel[0] = m_vel[index + 0]; - vel[1] = m_vel[index + 1]; - vel[2] = m_vel[index + 2]; + vel[0] = m_vel[index + 0]; + vel[1] = m_vel[index + 1]; + vel[2] = m_vel[index + 2]; - force[0] = m_force[indexForce + 0]; - force[1] = m_force[indexForce + 1]; - force[2] = m_force[indexForce + 2]; + force[0] = m_force[indexForce + 0]; + force[1] = m_force[indexForce + 1]; + force[2] = m_force[indexForce + 2]; - // acceleration = force / mass; - // new velocity = old velocity + acceleration * deltaTime - vel[0] += (force[0] * invMass) * deltaTime; - vel[1] += (force[1] * invMass) * deltaTime; - vel[2] += (force[2] * invMass) * deltaTime; + // acceleration = force / mass; + // new velocity = old velocity + acceleration * deltaTime + vel[0] += (force[0] * invMass) * deltaTime; + vel[1] += (force[1] * invMass) * deltaTime; + vel[2] += (force[2] * invMass) * deltaTime; - vel[0] *= m_damping; - vel[1] *= m_damping; - vel[2] *= m_damping; + vel[0] *= m_damping; + vel[1] *= m_damping; + vel[2] *= m_damping; - // new position = old position + velocity * deltaTime - pos[0] += vel[0] * deltaTime; - pos[1] += vel[1] * deltaTime; - pos[2] += vel[2] * deltaTime; + // new position = old position + velocity * deltaTime + pos[0] += vel[0] * deltaTime; + pos[1] += vel[1] * deltaTime; + pos[2] += vel[2] * deltaTime; - m_pos[index + 0] = pos[0]; - m_pos[index + 1] = pos[1]; - m_pos[index + 2] = pos[2]; + m_pos[index + 0] = pos[0]; + m_pos[index + 1] = pos[1]; + m_pos[index + 2] = pos[2]; - m_vel[index + 0] = vel[0]; - m_vel[index + 1] = vel[1]; - m_vel[index + 2] = vel[2]; - } + m_vel[index + 0] = vel[0]; + m_vel[index + 1] = vel[1]; + m_vel[index + 2] = vel[2]; + } } diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.cu b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.cu index 1c95980e..c5a6ee2b 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.cu +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.cu @@ -28,249 +28,246 @@ #include #include -//#include -//#include +// #include +// #include // CUDA standard includes #include -//#include +// #include #include "bodysystem.h" -__constant__ float softeningSquared; +__constant__ float softeningSquared; __constant__ double softeningSquared_fp64; -cudaError_t setSofteningSquared(float softeningSq) { - return cudaMemcpyToSymbol(softeningSquared, &softeningSq, sizeof(float), 0, - 
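
// ---------------------------------------------------------------------------
// setSofteningSquared() above writes a host value into __constant__ memory
// with cudaMemcpyToSymbol; device code then reads the symbol directly, and
// the fp32/fp64 overloads pick which symbol is written. The same pattern in
// a minimal form (kernel and values here are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

__constant__ float k_scale;   // device-side constant, set from the host

__global__ void scaleKernel(float *out) { out[threadIdx.x] *= k_scale; }

int main()
{
    float h = 3.0f, s = 2.0f, *d;
    cudaMalloc(&d, sizeof(float));
    cudaMemcpy(d, &h, sizeof(float), cudaMemcpyHostToDevice);

    // Upload the constant, then launch: every thread sees the same k_scale.
    cudaMemcpyToSymbol(k_scale, &s, sizeof(float), 0, cudaMemcpyHostToDevice);
    scaleKernel<<<1, 1>>>(d);

    cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%g\n", h);   // 6
    cudaFree(d);
    return 0;
}
// ---------------------------------------------------------------------------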
cudaMemcpyHostToDevice); +cudaError_t setSofteningSquared(float softeningSq) +{ + return cudaMemcpyToSymbol(softeningSquared, &softeningSq, sizeof(float), 0, cudaMemcpyHostToDevice); } -cudaError_t setSofteningSquared(double softeningSq) { - return cudaMemcpyToSymbol(softeningSquared_fp64, &softeningSq, sizeof(double), - 0, cudaMemcpyHostToDevice); +cudaError_t setSofteningSquared(double softeningSq) +{ + return cudaMemcpyToSymbol(softeningSquared_fp64, &softeningSq, sizeof(double), 0, cudaMemcpyHostToDevice); } -template -struct SharedMemory { - __device__ inline operator T *() { - extern __shared__ int __smem[]; - return (T *)__smem; - } +template struct SharedMemory +{ + __device__ inline operator T *() + { + extern __shared__ int __smem[]; + return (T *)__smem; + } - __device__ inline operator const T *() const { - extern __shared__ int __smem[]; - return (T *)__smem; - } + __device__ inline operator const T *() const + { + extern __shared__ int __smem[]; + return (T *)__smem; + } }; -template -__device__ T rsqrt_T(T x) { - return rsqrt(x); -} +template __device__ T rsqrt_T(T x) { return rsqrt(x); } -template <> -__device__ float rsqrt_T(float x) { - return rsqrtf(x); -} +template <> __device__ float rsqrt_T(float x) { return rsqrtf(x); } -template <> -__device__ double rsqrt_T(double x) { - return rsqrt(x); -} +template <> __device__ double rsqrt_T(double x) { return rsqrt(x); } // Macros to simplify shared memory addressing #define SX(i) sharedPos[i + blockDim.x * threadIdx.y] // This macro is only used when multithreadBodies is true (below) #define SX_SUM(i, j) sharedPos[i + blockDim.x * j] -template -__device__ T getSofteningSquared() { - return softeningSquared; -} -template <> -__device__ double getSofteningSquared() { - return softeningSquared_fp64; -} +template __device__ T getSofteningSquared() { return softeningSquared; } +template <> __device__ double getSofteningSquared() { return softeningSquared_fp64; } -template -struct DeviceData { - T *dPos[2]; // mapped host pointers - T *dVel; - cudaEvent_t event; - unsigned int offset; - unsigned int numBodies; +template struct DeviceData +{ + T *dPos[2]; // mapped host pointers + T *dVel; + cudaEvent_t event; + unsigned int offset; + unsigned int numBodies; }; template -__device__ typename vec3::Type bodyBodyInteraction( - typename vec3::Type ai, typename vec4::Type bi, - typename vec4::Type bj) { - typename vec3::Type r; +__device__ typename vec3::Type +bodyBodyInteraction(typename vec3::Type ai, typename vec4::Type bi, typename vec4::Type bj) +{ + typename vec3::Type r; - // r_ij [3 FLOPS] - r.x = bj.x - bi.x; - r.y = bj.y - bi.y; - r.z = bj.z - bi.z; + // r_ij [3 FLOPS] + r.x = bj.x - bi.x; + r.y = bj.y - bi.y; + r.z = bj.z - bi.z; - // distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS] - T distSqr = r.x * r.x + r.y * r.y + r.z * r.z; - distSqr += getSofteningSquared(); + // distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS] + T distSqr = r.x * r.x + r.y * r.y + r.z * r.z; + distSqr += getSofteningSquared(); - // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] - T invDist = rsqrt_T(distSqr); - T invDistCube = invDist * invDist * invDist; + // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] + T invDist = rsqrt_T(distSqr); + T invDistCube = invDist * invDist * invDist; - // s = m_j * invDistCube [1 FLOP] - T s = bj.w * invDistCube; + // s = m_j * invDistCube [1 FLOP] + T s = bj.w * invDistCube; - // a_i = a_i + s * r_ij [6 FLOPS] - ai.x += r.x * s; - ai.y += r.y * s; - ai.z += r.z * s; + // a_i = a_i + s * r_ij [6 
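
// ---------------------------------------------------------------------------
// The SharedMemory struct above exists because dynamically sized shared
// memory is declared as one unnamed extern __shared__ array; declaring it
// per template instantiation would produce conflicting definitions, so the
// struct reinterprets a single int array as the requested element type.
// A minimal kernel using the same trick (names and sizes are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

template <class T> struct DynSmem {
    __device__ inline operator T *()
    {
        extern __shared__ int __smem[];   // the one dynamic allocation
        return (T *)__smem;               // viewed as the requested type
    }
};

template <typename T> __global__ void reverseKernel(T *data, int n)
{
    T *s = DynSmem<T>();                  // typed view of dynamic shared memory
    s[threadIdx.x] = data[threadIdx.x];
    __syncthreads();
    data[threadIdx.x] = s[n - 1 - threadIdx.x];
}

int main()
{
    const int n = 4;
    float h[n] = {1, 2, 3, 4}, *d;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemcpy(d, h, n * sizeof(float), cudaMemcpyHostToDevice);
    reverseKernel<float><<<1, n, n * sizeof(float)>>>(d, n);  // 3rd arg: shared bytes
    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("%g %g %g %g\n", h[0], h[1], h[2], h[3]);          // 4 3 2 1
    cudaFree(d);
    return 0;
}
// ---------------------------------------------------------------------------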
FLOPS] + ai.x += r.x * s; + ai.y += r.y * s; + ai.z += r.z * s; - return ai; + return ai; } template -__device__ typename vec3::Type computeBodyAccel( - typename vec4::Type bodyPos, typename vec4::Type *positions, - int numTiles) { - typename vec4::Type *sharedPos = SharedMemory::Type>(); +__device__ typename vec3::Type +computeBodyAccel(typename vec4::Type bodyPos, typename vec4::Type *positions, int numTiles) +{ + typename vec4::Type *sharedPos = SharedMemory::Type>(); - typename vec3::Type acc = {0.0f, 0.0f, 0.0f}; + typename vec3::Type acc = {0.0f, 0.0f, 0.0f}; - for (int tile = 0; tile < numTiles; tile++) { - sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x]; + for (int tile = 0; tile < numTiles; tile++) { + sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x]; - __syncthreads(); + __syncthreads(); - // This is the "tile_calculation" from the GPUG3 article. + // This is the "tile_calculation" from the GPUG3 article. #pragma unroll 128 - for (unsigned int counter = 0; counter < blockDim.x; counter++) { - acc = bodyBodyInteraction(acc, bodyPos, sharedPos[counter]); + for (unsigned int counter = 0; counter < blockDim.x; counter++) { + acc = bodyBodyInteraction(acc, bodyPos, sharedPos[counter]); + } + + __syncthreads(); } - __syncthreads(); - } - - return acc; + return acc; } template __global__ void integrateBodies(typename vec4::Type *__restrict__ newPos, typename vec4::Type *__restrict__ oldPos, typename vec4::Type *vel, - unsigned int deviceOffset, - unsigned int deviceNumBodies, float deltaTime, - float damping, int numTiles) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int deviceOffset, + unsigned int deviceNumBodies, + float deltaTime, + float damping, + int numTiles) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= deviceNumBodies) { - return; - } + if (index >= deviceNumBodies) { + return; + } - typename vec4::Type position = oldPos[deviceOffset + index]; + typename vec4::Type position = oldPos[deviceOffset + index]; - typename vec3::Type accel = - computeBodyAccel(position, oldPos, numTiles); + typename vec3::Type accel = computeBodyAccel(position, oldPos, numTiles); - // acceleration = force / mass; - // new velocity = old velocity + acceleration * deltaTime - // note we factor out the body's mass from the equation, here and in - // bodyBodyInteraction (because they cancel out). Thus here force == - // acceleration - typename vec4::Type velocity = vel[deviceOffset + index]; + // acceleration = force / mass; + // new velocity = old velocity + acceleration * deltaTime + // note we factor out the body's mass from the equation, here and in + // bodyBodyInteraction (because they cancel out). 
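
// ---------------------------------------------------------------------------
// computeBodyAccel above walks the body list one blockDim.x-sized tile at a
// time: each thread loads one body into shared memory, the block syncs,
// every thread accumulates interactions against the whole tile, and the
// block syncs again before the tile is overwritten. The loop structure in a
// serial host sketch (tile size, data, and the "interaction" are stand-ins):

#include <cstdio>

int main()
{
    const int numBodies = 8, tileSize = 4;
    const int numTiles = (numBodies + tileSize - 1) / tileSize;
    float positions[numBodies] = {1, 2, 3, 4, 5, 6, 7, 8};
    float sharedTile[tileSize];
    float acc = 0.0f;                          // one thread's accumulator

    for (int tile = 0; tile < numTiles; tile++) {
        for (int t = 0; t < tileSize; t++)     // cooperative load of one tile
            sharedTile[t] = positions[tile * tileSize + t];
        // (__syncthreads() here on the GPU)
        for (int counter = 0; counter < tileSize; counter++)
            acc += sharedTile[counter];        // interact with every tile entry
        // (__syncthreads() again before the next tile load)
    }

    printf("acc = %g\n", acc);                 // 36: each body visited exactly once
    return 0;
}
// ---------------------------------------------------------------------------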
Thus here force == + // acceleration + typename vec4::Type velocity = vel[deviceOffset + index]; - velocity.x += accel.x * deltaTime; - velocity.y += accel.y * deltaTime; - velocity.z += accel.z * deltaTime; + velocity.x += accel.x * deltaTime; + velocity.y += accel.y * deltaTime; + velocity.z += accel.z * deltaTime; - velocity.x *= damping; - velocity.y *= damping; - velocity.z *= damping; + velocity.x *= damping; + velocity.y *= damping; + velocity.z *= damping; - // new position = old position + velocity * deltaTime - position.x += velocity.x * deltaTime; - position.y += velocity.y * deltaTime; - position.z += velocity.z * deltaTime; + // new position = old position + velocity * deltaTime + position.x += velocity.x * deltaTime; + position.y += velocity.y * deltaTime; + position.z += velocity.z * deltaTime; - // store new position and velocity - newPos[deviceOffset + index] = position; - vel[deviceOffset + index] = velocity; + // store new position and velocity + newPos[deviceOffset + index] = position; + vel[deviceOffset + index] = velocity; } template -void integrateNbodySystem(DeviceData *deviceData, +void integrateNbodySystem(DeviceData *deviceData, cudaGraphicsResource **pgres, - unsigned int currentRead, float deltaTime, - float damping, unsigned int numBodies, - unsigned int numDevices, int blockSize, - bool bUsePBO) { - if (bUsePBO) { - checkCudaErrors(cudaGraphicsResourceSetMapFlags( - pgres[currentRead], cudaGraphicsMapFlagsReadOnly)); - checkCudaErrors(cudaGraphicsResourceSetMapFlags( - pgres[1 - currentRead], cudaGraphicsMapFlagsWriteDiscard)); - checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0)); - size_t bytes; - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&(deviceData[0].dPos[currentRead]), &bytes, - pgres[currentRead])); - checkCudaErrors(cudaGraphicsResourceGetMappedPointer( - (void **)&(deviceData[0].dPos[1 - currentRead]), &bytes, - pgres[1 - currentRead])); - } - - for (unsigned int dev = 0; dev != numDevices; dev++) { - if (numDevices > 1) { - cudaSetDevice(dev); + unsigned int currentRead, + float deltaTime, + float damping, + unsigned int numBodies, + unsigned int numDevices, + int blockSize, + bool bUsePBO) +{ + if (bUsePBO) { + checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres[currentRead], cudaGraphicsMapFlagsReadOnly)); + checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres[1 - currentRead], cudaGraphicsMapFlagsWriteDiscard)); + checkCudaErrors(cudaGraphicsMapResources(2, pgres, 0)); + size_t bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer( + (void **)&(deviceData[0].dPos[currentRead]), &bytes, pgres[currentRead])); + checkCudaErrors(cudaGraphicsResourceGetMappedPointer( + (void **)&(deviceData[0].dPos[1 - currentRead]), &bytes, pgres[1 - currentRead])); } - int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize; - int numTiles = (numBodies + blockSize - 1) / blockSize; - int sharedMemSize = blockSize * 4 * sizeof(T); // 4 floats for pos + for (unsigned int dev = 0; dev != numDevices; dev++) { + if (numDevices > 1) { + cudaSetDevice(dev); + } - integrateBodies<<>>( - (typename vec4::Type *)deviceData[dev].dPos[1 - currentRead], - (typename vec4::Type *)deviceData[dev].dPos[currentRead], - (typename vec4::Type *)deviceData[dev].dVel, deviceData[dev].offset, - deviceData[dev].numBodies, deltaTime, damping, numTiles); + int numBlocks = (deviceData[dev].numBodies + blockSize - 1) / blockSize; + int numTiles = (numBodies + blockSize - 1) / blockSize; + int sharedMemSize = blockSize * 4 * sizeof(T); // 4 
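
// ---------------------------------------------------------------------------
// integrateNbodySystem derives the launch configuration from the block size:
// numBlocks and numTiles are both ceil(numBodies / blockSize), and the
// dynamic shared memory holds one 4-component position (pos + mass) per
// thread. The arithmetic for a hypothetical 16384-body run at the sample's
// default block size of 256:

#include <cstdio>

int main()
{
    const unsigned int numBodies = 16384;   // hypothetical body count
    const int blockSize = 256;              // the sample's default

    int numBlocks     = (numBodies + blockSize - 1) / blockSize;   // ceil division
    int numTiles      = (numBodies + blockSize - 1) / blockSize;
    int sharedMemSize = blockSize * 4 * sizeof(float);             // 4 floats per thread

    printf("blocks=%d tiles=%d sharedBytes=%d\n", numBlocks, numTiles, sharedMemSize);
    // -> blocks=64 tiles=64 sharedBytes=4096
    return 0;
}
// ---------------------------------------------------------------------------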
floats for pos + + integrateBodies + <<>>((typename vec4::Type *)deviceData[dev].dPos[1 - currentRead], + (typename vec4::Type *)deviceData[dev].dPos[currentRead], + (typename vec4::Type *)deviceData[dev].dVel, + deviceData[dev].offset, + deviceData[dev].numBodies, + deltaTime, + damping, + numTiles); + + if (numDevices > 1) { + checkCudaErrors(cudaEventRecord(deviceData[dev].event)); + // MJH: Hack on older driver versions to force kernel launches to flush! + cudaStreamQuery(0); + } + + // check if kernel invocation generated an error + getLastCudaError("Kernel execution failed"); + } if (numDevices > 1) { - checkCudaErrors(cudaEventRecord(deviceData[dev].event)); - // MJH: Hack on older driver versions to force kernel launches to flush! - cudaStreamQuery(0); + for (unsigned int dev = 0; dev < numDevices; dev++) { + checkCudaErrors(cudaEventSynchronize(deviceData[dev].event)); + } } - // check if kernel invocation generated an error - getLastCudaError("Kernel execution failed"); - } - - if (numDevices > 1) { - for (unsigned int dev = 0; dev < numDevices; dev++) { - checkCudaErrors(cudaEventSynchronize(deviceData[dev].event)); + if (bUsePBO) { + checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0)); } - } - - if (bUsePBO) { - checkCudaErrors(cudaGraphicsUnmapResources(2, pgres, 0)); - } } // Explicit specializations needed to generate code -template void integrateNbodySystem(DeviceData *deviceData, +template void integrateNbodySystem(DeviceData *deviceData, cudaGraphicsResource **pgres, - unsigned int currentRead, - float deltaTime, float damping, - unsigned int numBodies, - unsigned int numDevices, - int blockSize, bool bUsePBO); + unsigned int currentRead, + float deltaTime, + float damping, + unsigned int numBodies, + unsigned int numDevices, + int blockSize, + bool bUsePBO); -template void integrateNbodySystem(DeviceData *deviceData, +template void integrateNbodySystem(DeviceData *deviceData, cudaGraphicsResource **pgres, - unsigned int currentRead, - float deltaTime, float damping, - unsigned int numBodies, - unsigned int numDevices, - int blockSize, bool bUsePBO); + unsigned int currentRead, + float deltaTime, + float damping, + unsigned int numBodies, + unsigned int numDevices, + int blockSize, + bool bUsePBO); diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.h index 977d4856..68345c2d 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda.h @@ -30,70 +30,71 @@ #include "bodysystem.h" -template -struct DeviceData { - T *dPos[2]; // mapped host pointers - T *dVel; - cudaEvent_t event; - unsigned int offset; - unsigned int numBodies; +template struct DeviceData +{ + T *dPos[2]; // mapped host pointers + T *dVel; + cudaEvent_t event; + unsigned int offset; + unsigned int numBodies; }; // CUDA BodySystem: runs on the GPU -template -class BodySystemCUDA : public BodySystem { - public: - BodySystemCUDA(unsigned int numBodies, unsigned int numDevices, - unsigned int blockSize, bool usePBO, bool useSysMem = false); - virtual ~BodySystemCUDA(); +template class BodySystemCUDA : public BodySystem +{ +public: + BodySystemCUDA(unsigned int numBodies, + unsigned int numDevices, + unsigned int blockSize, + bool usePBO, + bool useSysMem = false); + virtual ~BodySystemCUDA(); - virtual void loadTipsyFile(const std::string &filename); + virtual void loadTipsyFile(const std::string &filename); - virtual void 
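
// ---------------------------------------------------------------------------
// The explicit "template void integrateNbodySystem<float>/<double>(...)"
// instantiations at the end of bodysystemcuda.cu force nvcc to emit device
// code for both precisions: the templated caller lives in a header compiled
// by the host compiler, which cannot instantiate CUDA kernels itself. The
// same split in miniature (file layout and names are hypothetical):

#include <cuda_runtime.h>

// -- would live in a .cu file compiled by nvcc --
template <typename T> __global__ void scaleAll(T *data, T s) { data[threadIdx.x] *= s; }

template <typename T> void launchScale(T *data, int n, T s)
{
    scaleAll<T><<<1, n>>>(data, s);
}

// Without these, a host-compiled caller that only sees a declaration of
// launchScale<T> would hit an unresolved symbol at link time.
template void launchScale<float>(float *, int, float);
template void launchScale<double>(double *, int, double);

int main()
{
    float h = 2.0f, *d;
    cudaMalloc(&d, sizeof(float));
    cudaMemcpy(d, &h, sizeof(float), cudaMemcpyHostToDevice);
    launchScale<float>(d, 1, 3.0f);
    cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d);
    return h == 6.0f ? 0 : 1;
}
// ---------------------------------------------------------------------------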
update(T deltaTime); + virtual void update(T deltaTime); - virtual void setSoftening(T softening); - virtual void setDamping(T damping); + virtual void setSoftening(T softening); + virtual void setDamping(T damping); - virtual T *getArray(BodyArray array); - virtual void setArray(BodyArray array, const T *data); + virtual T *getArray(BodyArray array); + virtual void setArray(BodyArray array, const T *data); - virtual unsigned int getCurrentReadBuffer() const { - return m_pbo[m_currentRead]; - } + virtual unsigned int getCurrentReadBuffer() const { return m_pbo[m_currentRead]; } - virtual unsigned int getNumBodies() const { return m_numBodies; } + virtual unsigned int getNumBodies() const { return m_numBodies; } - protected: // methods - BodySystemCUDA() {} +protected: // methods + BodySystemCUDA() {} - virtual void _initialize(int numBodies); - virtual void _finalize(); + virtual void _initialize(int numBodies); + virtual void _finalize(); - protected: // data - unsigned int m_numBodies; - unsigned int m_numDevices; - bool m_bInitialized; +protected: // data + unsigned int m_numBodies; + unsigned int m_numDevices; + bool m_bInitialized; - // Host data - T *m_hPos[2]; - T *m_hVel; + // Host data + T *m_hPos[2]; + T *m_hVel; - DeviceData *m_deviceData; + DeviceData *m_deviceData; - bool m_bUsePBO; - bool m_bUseSysMem; - unsigned int m_SMVersion; + bool m_bUsePBO; + bool m_bUseSysMem; + unsigned int m_SMVersion; - T m_damping; + T m_damping; - unsigned int m_pbo[2]; - cudaGraphicsResource *m_pGRes[2]; - unsigned int m_currentRead; - unsigned int m_currentWrite; + unsigned int m_pbo[2]; + cudaGraphicsResource *m_pGRes[2]; + unsigned int m_currentRead; + unsigned int m_currentWrite; - unsigned int m_blockSize; + unsigned int m_blockSize; }; #include "bodysystemcuda_impl.h" -#endif // __BODYSYSTEMCUDA_H__ +#endif // __BODYSYSTEMCUDA_H__ diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda_impl.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda_impl.h index 3e4c85d6..98cea629 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda_impl.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/bodysystemcuda_impl.h @@ -25,25 +25,28 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include - +#include #include -#include -#include #include #include +#include +#include +#include #include -#include -//#include +// #include #include template -void integrateNbodySystem(DeviceData *deviceData, +void integrateNbodySystem(DeviceData *deviceData, cudaGraphicsResource **pgres, - unsigned int currentRead, float deltaTime, - float damping, unsigned int numBodies, - unsigned int numDevices, int blockSize, bool bUsePBO); + unsigned int currentRead, + float deltaTime, + float damping, + unsigned int numBodies, + unsigned int numDevices, + int blockSize, + bool bUsePBO); cudaError_t setSofteningSquared(float softeningSq); cudaError_t setSofteningSquared(double softeningSq); @@ -51,323 +54,323 @@ cudaError_t setSofteningSquared(double softeningSq); template BodySystemCUDA::BodySystemCUDA(unsigned int numBodies, unsigned int numDevices, - unsigned int blockSize, bool usePBO, - bool useSysMem) - : m_numBodies(numBodies), - m_numDevices(numDevices), - m_bInitialized(false), - m_bUsePBO(usePBO), - m_bUseSysMem(useSysMem), - m_currentRead(0), - m_currentWrite(1), - m_blockSize(blockSize) { - m_hPos[0] = m_hPos[1] = 0; - m_hVel = 0; + unsigned int blockSize, + bool usePBO, + bool useSysMem) + : m_numBodies(numBodies) + , m_numDevices(numDevices) + , m_bInitialized(false) + , m_bUsePBO(usePBO) + , m_bUseSysMem(useSysMem) + , m_currentRead(0) + , m_currentWrite(1) + , m_blockSize(blockSize) +{ + m_hPos[0] = m_hPos[1] = 0; + m_hVel = 0; - m_deviceData = 0; + m_deviceData = 0; - _initialize(numBodies); - setSoftening(0.00125f); - setDamping(0.995f); + _initialize(numBodies); + setSoftening(0.00125f); + setDamping(0.995f); } -template -BodySystemCUDA::~BodySystemCUDA() { - _finalize(); - m_numBodies = 0; +template BodySystemCUDA::~BodySystemCUDA() +{ + _finalize(); + m_numBodies = 0; } -template -void BodySystemCUDA::_initialize(int numBodies) { - assert(!m_bInitialized); +template void BodySystemCUDA::_initialize(int numBodies) +{ + assert(!m_bInitialized); - m_numBodies = numBodies; + m_numBodies = numBodies; - unsigned int memSize = sizeof(T) * 4 * numBodies; + unsigned int memSize = sizeof(T) * 4 * numBodies; - m_deviceData = new DeviceData[m_numDevices]; + m_deviceData = new DeviceData[m_numDevices]; - // divide up the workload amongst Devices - float *weights = new float[m_numDevices]; - int *numSms = new int[m_numDevices]; - float total = 0; - - for (unsigned int i = 0; i < m_numDevices; i++) { - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, i)); - - // Choose the weight based on the Compute Capability - // We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for - // this application (since a 15-SM GF100 is about 2X faster than a 30-SM - // GT200). - numSms[i] = props.multiProcessorCount; - weights[i] = numSms[i] * (props.major >= 2 ? 
4.f : 1.f); - total += weights[i]; - } - - unsigned int offset = 0; - unsigned int remaining = m_numBodies; - - for (unsigned int i = 0; i < m_numDevices; i++) { - unsigned int count = (int)((weights[i] / total) * m_numBodies); - unsigned int round = numSms[i] * 256; - count = round * ((count + round - 1) / round); - - if (count > remaining) { - count = remaining; - } - - remaining -= count; - m_deviceData[i].offset = offset; - m_deviceData[i].numBodies = count; - offset += count; - - if ((i == m_numDevices - 1) && (offset < m_numBodies - 1)) { - m_deviceData[i].numBodies += m_numBodies - offset; - } - } - - delete[] weights; - delete[] numSms; - - if (m_bUseSysMem) { - checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize, - cudaHostAllocMapped | cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize, - cudaHostAllocMapped | cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc((void **)&m_hVel, memSize, - cudaHostAllocMapped | cudaHostAllocPortable)); - - memset(m_hPos[0], 0, memSize); - memset(m_hPos[1], 0, memSize); - memset(m_hVel, 0, memSize); + // divide up the workload amongst Devices + float *weights = new float[m_numDevices]; + int *numSms = new int[m_numDevices]; + float total = 0; for (unsigned int i = 0; i < m_numDevices; i++) { - if (m_numDevices > 1) { - checkCudaErrors(cudaSetDevice(i)); - } + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, i)); - checkCudaErrors(cudaEventCreate(&m_deviceData[i].event)); - checkCudaErrors(cudaHostGetDevicePointer( - (void **)&m_deviceData[i].dPos[0], (void *)m_hPos[0], 0)); - checkCudaErrors(cudaHostGetDevicePointer( - (void **)&m_deviceData[i].dPos[1], (void *)m_hPos[1], 0)); - checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel, - (void *)m_hVel, 0)); + // Choose the weight based on the Compute Capability + // We estimate that a CC2.0 SM is about 4.0x faster than a CC 1.x SM for + // this application (since a 15-SM GF100 is about 2X faster than a 30-SM + // GT200). + numSms[i] = props.multiProcessorCount; + weights[i] = numSms[i] * (props.major >= 2 ? 
4.f : 1.f); + total += weights[i]; } - } else { - m_hPos[0] = new T[m_numBodies * 4]; - m_hVel = new T[m_numBodies * 4]; - memset(m_hPos[0], 0, memSize); - memset(m_hVel, 0, memSize); + unsigned int offset = 0; + unsigned int remaining = m_numBodies; - checkCudaErrors(cudaEventCreate(&m_deviceData[0].event)); + for (unsigned int i = 0; i < m_numDevices; i++) { + unsigned int count = (int)((weights[i] / total) * m_numBodies); + unsigned int round = numSms[i] * 256; + count = round * ((count + round - 1) / round); - if (m_bUsePBO) { - // create the position pixel buffer objects for rendering - // we will actually compute directly from this memory in CUDA too - glGenBuffers(2, (GLuint *)m_pbo); - - for (int i = 0; i < 2; ++i) { - glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]); - glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW); - - int size = 0; - glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size); - - if ((unsigned)size != memSize) { - fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!n"); + if (count > remaining) { + count = remaining; } - glBindBuffer(GL_ARRAY_BUFFER, 0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i], m_pbo[i], - cudaGraphicsMapFlagsNone)); - } - } else { - checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize)); - checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize)); + remaining -= count; + m_deviceData[i].offset = offset; + m_deviceData[i].numBodies = count; + offset += count; + + if ((i == m_numDevices - 1) && (offset < m_numBodies - 1)) { + m_deviceData[i].numBodies += m_numBodies - offset; + } } - checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize)); - } + delete[] weights; + delete[] numSms; - m_bInitialized = true; + if (m_bUseSysMem) { + checkCudaErrors(cudaHostAlloc((void **)&m_hPos[0], memSize, cudaHostAllocMapped | cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void **)&m_hPos[1], memSize, cudaHostAllocMapped | cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void **)&m_hVel, memSize, cudaHostAllocMapped | cudaHostAllocPortable)); + + memset(m_hPos[0], 0, memSize); + memset(m_hPos[1], 0, memSize); + memset(m_hVel, 0, memSize); + + for (unsigned int i = 0; i < m_numDevices; i++) { + if (m_numDevices > 1) { + checkCudaErrors(cudaSetDevice(i)); + } + + checkCudaErrors(cudaEventCreate(&m_deviceData[i].event)); + checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dPos[0], (void *)m_hPos[0], 0)); + checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dPos[1], (void *)m_hPos[1], 0)); + checkCudaErrors(cudaHostGetDevicePointer((void **)&m_deviceData[i].dVel, (void *)m_hVel, 0)); + } + } + else { + m_hPos[0] = new T[m_numBodies * 4]; + m_hVel = new T[m_numBodies * 4]; + + memset(m_hPos[0], 0, memSize); + memset(m_hVel, 0, memSize); + + checkCudaErrors(cudaEventCreate(&m_deviceData[0].event)); + + if (m_bUsePBO) { + // create the position pixel buffer objects for rendering + // we will actually compute directly from this memory in CUDA too + glGenBuffers(2, (GLuint *)m_pbo); + + for (int i = 0; i < 2; ++i) { + glBindBuffer(GL_ARRAY_BUFFER, m_pbo[i]); + glBufferData(GL_ARRAY_BUFFER, memSize, m_hPos[0], GL_DYNAMIC_DRAW); + + int size = 0; + glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size); + + if ((unsigned)size != memSize) { + fprintf(stderr, "WARNING: Pixel Buffer Object allocation failed!n"); + } + + glBindBuffer(GL_ARRAY_BUFFER, 0); + 
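
// ---------------------------------------------------------------------------
// The multi-GPU split in _initialize() above weights each device by its SM
// count (times 4 for compute capability >= 2.0), turns the weight into a
// body count, and rounds that count up to a multiple of numSms * 256 so each
// SM receives whole 256-thread blocks; counts are clamped to what remains.
// The rounding and clamping for two hypothetical devices:

#include <cstdio>

int main()
{
    const unsigned int numBodies = 16384;       // hypothetical total
    int   numSms[2]  = {16, 8};                 // a 16-SM and an 8-SM device
    float weights[2] = {16 * 4.f, 8 * 4.f};     // both assumed CC >= 2.0
    float total = weights[0] + weights[1];

    unsigned int offset = 0, remaining = numBodies;
    for (int i = 0; i < 2; i++) {
        unsigned int count = (unsigned int)((weights[i] / total) * numBodies);
        unsigned int round = numSms[i] * 256;            // whole blocks per SM
        count = round * ((count + round - 1) / round);   // round up
        if (count > remaining)
            count = remaining;                           // clamp to what's left
        remaining -= count;
        printf("dev %d: offset=%u count=%u\n", i, offset, count);
        // -> dev 0: offset=0 count=12288, dev 1: offset=12288 count=4096
        offset += count;
    }
    return 0;
}
// ---------------------------------------------------------------------------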
checkCudaErrors(cudaGraphicsGLRegisterBuffer(&m_pGRes[i], m_pbo[i], cudaGraphicsMapFlagsNone)); + } + } + else { + checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[0], memSize)); + checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dPos[1], memSize)); + } + + checkCudaErrors(cudaMalloc((void **)&m_deviceData[0].dVel, memSize)); + } + + m_bInitialized = true; } -template -void BodySystemCUDA::_finalize() { - assert(m_bInitialized); +template void BodySystemCUDA::_finalize() +{ + assert(m_bInitialized); - if (m_bUseSysMem) { - checkCudaErrors(cudaFreeHost(m_hPos[0])); - checkCudaErrors(cudaFreeHost(m_hPos[1])); - checkCudaErrors(cudaFreeHost(m_hVel)); + if (m_bUseSysMem) { + checkCudaErrors(cudaFreeHost(m_hPos[0])); + checkCudaErrors(cudaFreeHost(m_hPos[1])); + checkCudaErrors(cudaFreeHost(m_hVel)); + + for (unsigned int i = 0; i < m_numDevices; i++) { + cudaEventDestroy(m_deviceData[i].event); + } + } + else { + delete[] m_hPos[0]; + delete[] m_hPos[1]; + delete[] m_hVel; + + checkCudaErrors(cudaFree((void **)m_deviceData[0].dVel)); + + if (m_bUsePBO) { + checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[0])); + checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[1])); + glDeleteBuffers(2, (const GLuint *)m_pbo); + } + else { + checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[0])); + checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[1])); + } + } + + delete[] m_deviceData; + + m_bInitialized = false; +} + +template void BodySystemCUDA::loadTipsyFile(const std::string &filename) +{ + if (m_bInitialized) + _finalize(); + + std::vector::Type> positions; + std::vector::Type> velocities; + std::vector ids; + + int nBodies = 0; + int nFirst = 0, nSecond = 0, nThird = 0; + + read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, nSecond, nThird); + + _initialize(nBodies); + + setArray(BODYSYSTEM_POSITION, (T *)&positions[0]); + setArray(BODYSYSTEM_VELOCITY, (T *)&velocities[0]); +} + +template void BodySystemCUDA::setSoftening(T softening) +{ + T softeningSq = softening * softening; for (unsigned int i = 0; i < m_numDevices; i++) { - cudaEventDestroy(m_deviceData[i].event); + if (m_numDevices > 1) { + checkCudaErrors(cudaSetDevice(i)); + } + + checkCudaErrors(setSofteningSquared(softeningSq)); } - } else { - delete[] m_hPos[0]; - delete[] m_hPos[1]; - delete[] m_hVel; - - checkCudaErrors(cudaFree((void **)m_deviceData[0].dVel)); - - if (m_bUsePBO) { - checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[0])); - checkCudaErrors(cudaGraphicsUnregisterResource(m_pGRes[1])); - glDeleteBuffers(2, (const GLuint *)m_pbo); - } else { - checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[0])); - checkCudaErrors(cudaFree((void **)m_deviceData[0].dPos[1])); - } - } - - delete[] m_deviceData; - - m_bInitialized = false; } -template -void BodySystemCUDA::loadTipsyFile(const std::string &filename) { - if (m_bInitialized) _finalize(); +template void BodySystemCUDA::setDamping(T damping) { m_damping = damping; } - std::vector::Type> positions; - std::vector::Type> velocities; - std::vector ids; +template void BodySystemCUDA::update(T deltaTime) +{ + assert(m_bInitialized); - int nBodies = 0; - int nFirst = 0, nSecond = 0, nThird = 0; + integrateNbodySystem(m_deviceData, + m_pGRes, + m_currentRead, + (float)deltaTime, + (float)m_damping, + m_numBodies, + m_numDevices, + m_blockSize, + m_bUsePBO); - read_tipsy_file(positions, velocities, ids, filename, nBodies, nFirst, - nSecond, nThird); - - _initialize(nBodies); - - setArray(BODYSYSTEM_POSITION, (T 
*)&positions[0]); - setArray(BODYSYSTEM_VELOCITY, (T *)&velocities[0]); + std::swap(m_currentRead, m_currentWrite); } -template -void BodySystemCUDA::setSoftening(T softening) { - T softeningSq = softening * softening; +template T *BodySystemCUDA::getArray(BodyArray array) +{ + assert(m_bInitialized); - for (unsigned int i = 0; i < m_numDevices; i++) { - if (m_numDevices > 1) { - checkCudaErrors(cudaSetDevice(i)); - } + T *hdata = 0; + T *ddata = 0; - checkCudaErrors(setSofteningSquared(softeningSq)); - } -} + cudaGraphicsResource *pgres = NULL; -template -void BodySystemCUDA::setDamping(T damping) { - m_damping = damping; -} + int currentReadHost = m_bUseSysMem ? m_currentRead : 0; -template -void BodySystemCUDA::update(T deltaTime) { - assert(m_bInitialized); - - integrateNbodySystem(m_deviceData, m_pGRes, m_currentRead, - (float)deltaTime, (float)m_damping, m_numBodies, - m_numDevices, m_blockSize, m_bUsePBO); - - std::swap(m_currentRead, m_currentWrite); -} - -template -T *BodySystemCUDA::getArray(BodyArray array) { - assert(m_bInitialized); - - T *hdata = 0; - T *ddata = 0; - - cudaGraphicsResource *pgres = NULL; - - int currentReadHost = m_bUseSysMem ? m_currentRead : 0; - - switch (array) { + switch (array) { default: case BODYSYSTEM_POSITION: - hdata = m_hPos[currentReadHost]; - ddata = m_deviceData[0].dPos[m_currentRead]; + hdata = m_hPos[currentReadHost]; + ddata = m_deviceData[0].dPos[m_currentRead]; - if (m_bUsePBO) { - pgres = m_pGRes[m_currentRead]; - } - - break; - - case BODYSYSTEM_VELOCITY: - hdata = m_hVel; - ddata = m_deviceData[0].dVel; - break; - } - - if (!m_bUseSysMem) { - if (pgres) { - checkCudaErrors( - cudaGraphicsResourceSetMapFlags(pgres, cudaGraphicsMapFlagsReadOnly)); - checkCudaErrors(cudaGraphicsMapResources(1, &pgres, 0)); - size_t bytes; - checkCudaErrors( - cudaGraphicsResourceGetMappedPointer((void **)&ddata, &bytes, pgres)); - } - - checkCudaErrors(cudaMemcpy(hdata, ddata, m_numBodies * 4 * sizeof(T), - cudaMemcpyDeviceToHost)); - - if (pgres) { - checkCudaErrors(cudaGraphicsUnmapResources(1, &pgres, 0)); - } - } - - return hdata; -} - -template -void BodySystemCUDA::setArray(BodyArray array, const T *data) { - assert(m_bInitialized); - - m_currentRead = 0; - m_currentWrite = 1; - - switch (array) { - default: - case BODYSYSTEM_POSITION: { - if (m_bUsePBO) { - glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]); - glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data); - - int size = 0; - glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size); - - if ((unsigned)size != 4 * (sizeof(T) * m_numBodies)) { - fprintf(stderr, "WARNING: Pixel Buffer Object download failed!n"); + if (m_bUsePBO) { + pgres = m_pGRes[m_currentRead]; } - glBindBuffer(GL_ARRAY_BUFFER, 0); - } else { - if (m_bUseSysMem) { - memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T)); - } else - checkCudaErrors(cudaMemcpy(m_deviceData[0].dPos[m_currentRead], data, - m_numBodies * 4 * sizeof(T), - cudaMemcpyHostToDevice)); - } + break; + + case BODYSYSTEM_VELOCITY: + hdata = m_hVel; + ddata = m_deviceData[0].dVel; + break; + } + + if (!m_bUseSysMem) { + if (pgres) { + checkCudaErrors(cudaGraphicsResourceSetMapFlags(pgres, cudaGraphicsMapFlagsReadOnly)); + checkCudaErrors(cudaGraphicsMapResources(1, &pgres, 0)); + size_t bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&ddata, &bytes, pgres)); + } + + checkCudaErrors(cudaMemcpy(hdata, ddata, m_numBodies * 4 * sizeof(T), cudaMemcpyDeviceToHost)); + + if (pgres) { + 
checkCudaErrors(cudaGraphicsUnmapResources(1, &pgres, 0));
+        }
+    }
+
+    return hdata;
+}
+
+template <typename T> void BodySystemCUDA<T>::setArray(BodyArray array, const T *data)
+{
+    assert(m_bInitialized);
+
+    m_currentRead  = 0;
+    m_currentWrite = 1;
+
+    switch (array) {
+        default:
+        case BODYSYSTEM_POSITION: {
+            if (m_bUsePBO) {
+                glBindBuffer(GL_ARRAY_BUFFER, m_pbo[m_currentRead]);
+                glBufferSubData(GL_ARRAY_BUFFER, 0, 4 * sizeof(T) * m_numBodies, data);
+
+                int size = 0;
+                glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, (GLint *)&size);
+
+                if ((unsigned)size != 4 * (sizeof(T) * m_numBodies)) {
+                    fprintf(stderr, "WARNING: Pixel Buffer Object download failed!\n");
+                }
+
+                glBindBuffer(GL_ARRAY_BUFFER, 0);
+            }
+            else {
+                if (m_bUseSysMem) {
+                    memcpy(m_hPos[m_currentRead], data, m_numBodies * 4 * sizeof(T));
+                }
+                else
+                    checkCudaErrors(cudaMemcpy(
+                        m_deviceData[0].dPos[m_currentRead], data, m_numBodies * 4 * sizeof(T), cudaMemcpyHostToDevice));
+            }
     } break;

     case BODYSYSTEM_VELOCITY:
-      if (m_bUseSysMem) {
-        memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T));
-      } else
-        checkCudaErrors(cudaMemcpy(m_deviceData[0].dVel, data,
-                                   m_numBodies * 4 * sizeof(T),
-                                   cudaMemcpyHostToDevice));
+            if (m_bUseSysMem) {
+                memcpy(m_hVel, data, m_numBodies * 4 * sizeof(T));
+            }
+            else
+                checkCudaErrors(
+                    cudaMemcpy(m_deviceData[0].dVel, data, m_numBodies * 4 * sizeof(T), cudaMemcpyHostToDevice));

-      break;
-  }
+            break;
+    }
 }
diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/nbody_opengles.cpp b/Samples/8_Platform_Specific/Tegra/nbody_opengles/nbody_opengles.cpp
index cabdec6d..ed96bc47 100644
--- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/nbody_opengles.cpp
+++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/nbody_opengles.cpp
@@ -25,59 +25,55 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ -#include -#include #include -#include - -#include "render_particles.h" - -#include -#include +#include +#include #include #include -#include -#include - +#include +#include #include #include - #include +#include +#include +#include -#include "bodysystemcuda.h" #include "bodysystemcpu.h" +#include "bodysystemcuda.h" #include "cuda_runtime.h" +#include "render_particles.h" EGLDisplay eglDisplay = EGL_NO_DISPLAY; EGLSurface eglSurface = EGL_NO_SURFACE; EGLContext eglContext = EGL_NO_CONTEXT; // view params -int ox = 0, oy = 0; -int buttonState = 0; -float camera_trans[] = {0, -2, -150}; -float camera_rot[] = {0, 0, 0}; -float camera_trans_lag[] = {0, -2, -150}; -float camera_rot_lag[] = {0, 0, 0}; -const float inertia = 0.1f; +int ox = 0, oy = 0; +int buttonState = 0; +float camera_trans[] = {0, -2, -150}; +float camera_rot[] = {0, 0, 0}; +float camera_trans_lag[] = {0, -2, -150}; +float camera_rot_lag[] = {0, 0, 0}; +const float inertia = 0.1f; -bool benchmark = false; -bool compareToCPU = false; -bool QATest = false; -int blockSize = 256; -bool useHostMem = false; -bool fp64 = false; -bool useCpu = false; -int numDevsRequested = 1; -bool displayEnabled = true; -unsigned int dispno = 0; -unsigned int window_width = 720; -unsigned int window_height = 480; -bool bPause = false; -bool bFullscreen = false; -bool bDispInteractions = false; -bool bSupportDouble = false; -int flopsPerInteraction = 20; +bool benchmark = false; +bool compareToCPU = false; +bool QATest = false; +int blockSize = 256; +bool useHostMem = false; +bool fp64 = false; +bool useCpu = false; +int numDevsRequested = 1; +bool displayEnabled = true; +unsigned int dispno = 0; +unsigned int window_width = 720; +unsigned int window_height = 480; +bool bPause = false; +bool bFullscreen = false; +bool bDispInteractions = false; +bool bSupportDouble = false; +int flopsPerInteraction = 20; char deviceName[100]; @@ -87,39 +83,48 @@ int numBodies = 16384; std::string tipsyFile = ""; -int numIterations = 0; // run until exit +int numIterations = 0; // run until exit -void computePerfStats(double &interactionsPerSecond, double &gflops, - float milliseconds, int iterations) { - // double precision uses intrinsic operation followed by refinement, - // resulting in higher operation count per interaction. - // (Note Astrophysicists use 38 flops per interaction no matter what, - // based on "historical precedent", but they are using FLOP/s as a - // measure of "science throughput". We are using it as a measure of - // hardware throughput. They should really use interactions/s... - // const int flopsPerInteraction = fp64 ? 30 : 20; - interactionsPerSecond = (float)numBodies * (float)numBodies; - interactionsPerSecond *= 1e-9 * iterations * 1000 / milliseconds; - gflops = interactionsPerSecond * (float)flopsPerInteraction; +void computePerfStats(double &interactionsPerSecond, double &gflops, float milliseconds, int iterations) +{ + // double precision uses intrinsic operation followed by refinement, + // resulting in higher operation count per interaction. + // (Note Astrophysicists use 38 flops per interaction no matter what, + // based on "historical precedent", but they are using FLOP/s as a + // measure of "science throughput". We are using it as a measure of + // hardware throughput. They should really use interactions/s... + // const int flopsPerInteraction = fp64 ? 
//////////////////////////////////////// // Demo Parameters //////////////////////////////////////// -struct NBodyParams { - float m_timestep; - float m_clusterScale; - float m_velocityScale; - float m_softening; - float m_damping; - float m_pointSize; - float m_x, m_y, m_z; +struct NBodyParams +{ + float m_timestep; + float m_clusterScale; + float m_velocityScale; + float m_softening; + float m_damping; + float m_pointSize; + float m_x, m_y, m_z; - void print() { - printf("{ %f, %f, %f, %f, %f, %f, %f, %f, %f },\n", m_timestep, - m_clusterScale, m_velocityScale, m_softening, m_damping, m_pointSize, - m_x, m_y, m_z); - } + void print() + { + printf("{ %f, %f, %f, %f, %f, %f, %f, %f, %f },\n", + m_timestep, + m_clusterScale, + m_velocityScale, + m_softening, + m_damping, + m_pointSize, + m_x, + m_y, + m_z); + } }; NBodyParams demoParams[] = { @@ -129,14 +134,13 @@ NBodyParams demoParams[] = { {0.0006f, 0.16f, 1000.0f, 1.0f, 1.0f, 0.07f, 0, 0, -1.5f}, {0.0019f, 0.32f, 276.0f, 1.0f, 1.0f, 0.07f, 0, 0, -5}, {0.0016f, 0.32f, 272.0f, 0.145f, 1.0f, 0.08f, 0, 0, -5}, - {0.016000f, 6.040000f, 0.000000f, 1.000000f, 1.000000f, 0.760000f, 0, 0, - -50}, + {0.016000f, 6.040000f, 0.000000f, 1.000000f, 1.000000f, 0.760000f, 0, 0, -50}, }; -int numDemos = sizeof(demoParams) / sizeof(NBodyParams); -bool cycleDemo = true; -int activeDemo = 0; -float demoTime = 10000.0f; // ms +int numDemos = sizeof(demoParams) / sizeof(NBodyParams); +bool cycleDemo = true; +int activeDemo = 0; +float demoTime = 10000.0f; // ms StopWatchInterface *demoTimer = NULL, *timer = NULL; // run multiple iterations to compute an average sort time @@ -147,990 +151,1016 @@ NBodyParams activeParams = demoParams[activeDemo]; bool bShowSliders = true; // fps -static int fpsCount = 0; -static int fpsLimit = 5; +static int fpsCount = 0; +static int fpsLimit = 5; cudaEvent_t startEvent, stopEvent; cudaEvent_t hostMemSyncEvent; -template <typename T> -class NBodyDemo { - public: - static void Create() { m_singleton = new NBodyDemo; } - static void Destroy() { delete m_singleton; } - - static void init(int numBodies, int numDevices, int blockSize, bool usePBO, - bool useHostMem, bool useCpu) { - m_singleton->_init(numBodies, numDevices, blockSize, usePBO, useHostMem, - useCpu); - } - - static void reset(int numBodies, NBodyConfig config) { - m_singleton->_reset(numBodies, config); - } - - static void selectDemo(int index) { m_singleton->_selectDemo(index); } - - static bool compareResults(int numBodies) { - return m_singleton->_compareResults(numBodies); - } - - static void runBenchmark(int iterations) { - m_singleton->_runBenchmark(iterations); - } - - static void updateParams() { - m_singleton->m_nbody->setSoftening(activeParams.m_softening); - m_singleton->m_nbody->setDamping(activeParams.m_damping); - } - - static void updateSimulation() { - m_singleton->m_nbody->update(activeParams.m_timestep); - } - - static void display() { - m_singleton->m_renderer->setSpriteSize(activeParams.m_pointSize); - - if (useHostMem) { - // This event sync is required because we are rendering from the host - // memory that CUDA is writing. If we don't wait until CUDA is done - // updating it, we will render partially updated data, resulting in a - // jerky frame rate. - if (!useCpu) { - cudaEventSynchronize(hostMemSyncEvent); - } - - m_singleton->m_renderer->setPositions( - m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION), - m_singleton->m_nbody->getNumBodies()); - } else { - m_singleton->m_renderer->setPBO( - m_singleton->m_nbody->getCurrentReadBuffer(), - m_singleton->m_nbody->getNumBodies(), (sizeof(T) > 4)); - } - - // display particles - m_singleton->m_renderer->display(); - }
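The comment in display() above is the heart of the -hostmem rendering path: the renderer reads host memory that CUDA writes, so a CUDA event has to fence the two. A compilable sketch of the same record/synchronize handshake with a stand-in kernel and buffer (nothing here is from the sample; mapped host allocations behave as sketched on unified-addressing systems such as Tegra):

    // nvcc sketch.cu
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void touch(float *p, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) p[i] = 2.0f * i;
    }

    int main()
    {
        const int n = 1024;
        float *h = nullptr;  // host memory the GPU writes directly, as in -hostmem mode
        cudaHostAlloc(&h, n * sizeof(float), cudaHostAllocMapped);

        float *d = nullptr;  // device-side alias of the same allocation
        cudaHostGetDevicePointer(&d, h, 0);

        cudaEvent_t ready;
        cudaEventCreate(&ready);

        touch<<<(n + 255) / 256, 256>>>(d, n);  // async "simulation step"
        cudaEventRecord(ready, 0);              // fence recorded after the step

        cudaEventSynchronize(ready);            // the render side waits here
        printf("h[100] = %f\n", h[100]);        // now safe to read on the CPU

        cudaFreeHost(h);
        cudaEventDestroy(ready);
        return 0;
    }

Skipping the cudaEventSynchronize would let the renderer observe a half-written frame, which is exactly the "jerky frame rate" the comment warns about.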
- static void getArrays(T *pos, T *vel) { - T *_pos = m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION); - T *_vel = m_singleton->m_nbody->getArray(BODYSYSTEM_VELOCITY); - memcpy(pos, _pos, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); - memcpy(vel, _vel, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); - } - - static void setArrays(const T *pos, const T *vel) { - if (pos != m_singleton->m_hPos) { - memcpy(m_singleton->m_hPos, pos, numBodies * 4 * sizeof(T)); - } - - if (vel != m_singleton->m_hVel) { - memcpy(m_singleton->m_hVel, vel, numBodies * 4 * sizeof(T)); - } - - m_singleton->m_nbody->setArray(BODYSYSTEM_POSITION, m_singleton->m_hPos); - m_singleton->m_nbody->setArray(BODYSYSTEM_VELOCITY, m_singleton->m_hVel); - - if (!benchmark && !useCpu && !compareToCPU) { - m_singleton->_resetRenderer(); - } - } - - private: - static NBodyDemo *m_singleton; - - BodySystem<T> *m_nbody; - BodySystemCUDA<T> *m_nbodyCuda; - BodySystemCPU<T> *m_nbodyCpu; - - ParticleRenderer *m_renderer; - - T *m_hPos; - T *m_hVel; - float *m_hColor; - - private: - NBodyDemo() - : m_nbody(0), - m_nbodyCuda(0), - m_nbodyCpu(0), - m_renderer(0), - m_hPos(0), - m_hVel(0), - m_hColor(0) {} - - ~NBodyDemo() { - if (m_nbodyCpu) { - delete m_nbodyCpu; - } - - if (m_nbodyCuda) { - delete m_nbodyCuda; - } - - if (m_hPos) { - delete[] m_hPos; - } - - if (m_hVel) { - delete[] m_hVel; - } - - if (m_hColor) { - delete[] m_hColor; - } - - sdkDeleteTimer(&demoTimer); - - if (!benchmark && !compareToCPU) delete m_renderer; - } - - void _init(int numBodies, int numDevices, int blockSize, bool bUsePBO, - bool useHostMem, bool useCpu) { - if (useCpu) { - m_nbodyCpu = new BodySystemCPU<T>(numBodies); - m_nbody = m_nbodyCpu; - m_nbodyCuda = 0; - } else { - m_nbodyCuda = new BodySystemCUDA<T>(numBodies, numDevices, blockSize, - bUsePBO, useHostMem); - m_nbody = m_nbodyCuda; - m_nbodyCpu = 0; - } - - // allocate host memory - m_hPos = new T[numBodies * 4]; - m_hVel = new T[numBodies * 4]; - m_hColor = new float[numBodies * 4]; - - m_nbody->setSoftening(activeParams.m_softening); - m_nbody->setDamping(activeParams.m_damping); - - if (useCpu) { - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - } else { - checkCudaErrors(cudaEventCreate(&startEvent)); - checkCudaErrors(cudaEventCreate(&stopEvent)); - checkCudaErrors(cudaEventCreate(&hostMemSyncEvent)); - } - - if (!benchmark && !compareToCPU) { - m_renderer = new ParticleRenderer(window_width, window_height); - _resetRenderer(); - } - - sdkCreateTimer(&demoTimer); - sdkStartTimer(&demoTimer); - } - - void _reset(int numBodies, NBodyConfig config) { - if (tipsyFile == "") { - randomizeBodies(config, m_hPos, m_hVel, m_hColor, - activeParams.m_clusterScale, activeParams.m_velocityScale, - numBodies, true); - setArrays(m_hPos, m_hVel); - } else { - m_nbody->loadTipsyFile(tipsyFile); - ::numBodies = m_nbody->getNumBodies(); - } - } - - void _resetRenderer() { - if (fp64) { - float color[4] = {0.4f, 0.8f, 0.1f, 1.0f}; - m_renderer->setBaseColor(color); - } else { - float color[4] = {1.0f, 0.6f, 0.3f, 1.0f}; - m_renderer->setBaseColor(color); - } - - 
m_renderer->setColors(m_hColor, m_nbody->getNumBodies()); - m_renderer->setSpriteSize(activeParams.m_pointSize); - m_renderer->setCameraPos(camera_trans); - } - - void _selectDemo(int index) { - assert(index < numDemos); - - activeParams = demoParams[index]; - camera_trans[0] = camera_trans_lag[0] = activeParams.m_x; - camera_trans[1] = camera_trans_lag[1] = activeParams.m_y; - camera_trans[2] = camera_trans_lag[2] = activeParams.m_z; - reset(numBodies, NBODY_CONFIG_SHELL); - sdkResetTimer(&demoTimer); - - m_singleton->m_renderer->setCameraPos(camera_trans); - } - - bool _compareResults(int numBodies) { - assert(m_nbodyCuda); - - bool passed = true; - - m_nbody->update(0.001f); +template <typename T> class NBodyDemo +{ +public: + static void Create() { m_singleton = new NBodyDemo; } + static void Destroy() { delete m_singleton; } + static void init(int numBodies, int numDevices, int blockSize, bool usePBO, bool useHostMem, bool useCpu) { - m_nbodyCpu = new BodySystemCPU<T>(numBodies); + m_singleton->_init(numBodies, numDevices, blockSize, usePBO, useHostMem, useCpu); + } - m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos); - m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel); + static void reset(int numBodies, NBodyConfig config) { m_singleton->_reset(numBodies, config); } - m_nbodyCpu->update(0.001f); + static void selectDemo(int index) { m_singleton->_selectDemo(index); } - T *cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION); - T *cpuPos = m_nbodyCpu->getArray(BODYSYSTEM_POSITION); + static bool compareResults(int numBodies) { return m_singleton->_compareResults(numBodies); } - T tolerance = 0.0005f; + static void runBenchmark(int iterations) { m_singleton->_runBenchmark(iterations); } - for (int i = 0; i < numBodies; i++) { - if (fabs(cpuPos[i] - cudaPos[i]) > tolerance) { - passed = false; - printf("Error: (host)%f != (device)%f\n", cpuPos[i], cudaPos[i]); + static void updateParams() + { + m_singleton->m_nbody->setSoftening(activeParams.m_softening); + m_singleton->m_nbody->setDamping(activeParams.m_damping); + } + + static void updateSimulation() { m_singleton->m_nbody->update(activeParams.m_timestep); } + + static void display() + { + m_singleton->m_renderer->setSpriteSize(activeParams.m_pointSize); + + if (useHostMem) { + // This event sync is required because we are rendering from the host + // memory that CUDA is writing. If we don't wait until CUDA is done + // updating it, we will render partially updated data, resulting in a + // jerky frame rate. 
+ if (!useCpu) { + cudaEventSynchronize(hostMemSyncEvent); + } + + m_singleton->m_renderer->setPositions(m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION), + m_singleton->m_nbody->getNumBodies()); + } + else { + m_singleton->m_renderer->setPBO( + m_singleton->m_nbody->getCurrentReadBuffer(), m_singleton->m_nbody->getNumBodies(), (sizeof(T) > 4)); } - } - } - return passed; - } - void _runBenchmark(int iterations) { - // once without timing to prime the device - if (!useCpu) { - m_nbody->update(activeParams.m_timestep); + // display particles + m_singleton->m_renderer->display(); } - if (useCpu) { - sdkCreateTimer(&timer); - sdkStartTimer(&timer); - } else { - checkCudaErrors(cudaEventRecord(startEvent, 0)); + static void getArrays(T *pos, T *vel) + { + T *_pos = m_singleton->m_nbody->getArray(BODYSYSTEM_POSITION); + T *_vel = m_singleton->m_nbody->getArray(BODYSYSTEM_VELOCITY); + memcpy(pos, _pos, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); + memcpy(vel, _vel, m_singleton->m_nbody->getNumBodies() * 4 * sizeof(T)); } - for (int i = 0; i < iterations; ++i) { - m_nbody->update(activeParams.m_timestep); + static void setArrays(const T *pos, const T *vel) + { + if (pos != m_singleton->m_hPos) { + memcpy(m_singleton->m_hPos, pos, numBodies * 4 * sizeof(T)); + } + + if (vel != m_singleton->m_hVel) { + memcpy(m_singleton->m_hVel, vel, numBodies * 4 * sizeof(T)); + } + + m_singleton->m_nbody->setArray(BODYSYSTEM_POSITION, m_singleton->m_hPos); + m_singleton->m_nbody->setArray(BODYSYSTEM_VELOCITY, m_singleton->m_hVel); + + if (!benchmark && !useCpu && !compareToCPU) { + m_singleton->_resetRenderer(); + } } - float milliseconds = 0; +private: + static NBodyDemo *m_singleton; - if (useCpu) { - sdkStopTimer(&timer); - milliseconds = sdkGetTimerValue(&timer); - sdkStartTimer(&timer); - } else { - checkCudaErrors(cudaEventRecord(stopEvent, 0)); - checkCudaErrors(cudaEventSynchronize(stopEvent)); - checkCudaErrors( - cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + BodySystem<T> *m_nbody; + BodySystemCUDA<T> *m_nbodyCuda; + BodySystemCPU<T> *m_nbodyCpu; + + ParticleRenderer *m_renderer; + + T *m_hPos; + T *m_hVel; + float *m_hColor; + +private: + NBodyDemo() + : m_nbody(0) + , m_nbodyCuda(0) + , m_nbodyCpu(0) + , m_renderer(0) + , m_hPos(0) + , m_hVel(0) + , m_hColor(0) + { } - double interactionsPerSecond = 0; - double gflops = 0; - computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); + ~NBodyDemo() + { + if (m_nbodyCpu) { + delete m_nbodyCpu; + } - printf("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, - iterations, milliseconds); - printf("= %.3f billion interactions per second\n", interactionsPerSecond); - printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, - (sizeof(T) > 4) ? 
"double" : "single", flopsPerInteraction); - } + if (m_nbodyCuda) { + delete m_nbodyCuda; + } + + if (m_hPos) { + delete[] m_hPos; + } + + if (m_hVel) { + delete[] m_hVel; + } + + if (m_hColor) { + delete[] m_hColor; + } + + sdkDeleteTimer(&demoTimer); + + if (!benchmark && !compareToCPU) + delete m_renderer; + } + + void _init(int numBodies, int numDevices, int blockSize, bool bUsePBO, bool useHostMem, bool useCpu) + { + if (useCpu) { + m_nbodyCpu = new BodySystemCPU(numBodies); + m_nbody = m_nbodyCpu; + m_nbodyCuda = 0; + } + else { + m_nbodyCuda = new BodySystemCUDA(numBodies, numDevices, blockSize, bUsePBO, useHostMem); + m_nbody = m_nbodyCuda; + m_nbodyCpu = 0; + } + + // allocate host memory + m_hPos = new T[numBodies * 4]; + m_hVel = new T[numBodies * 4]; + m_hColor = new float[numBodies * 4]; + + m_nbody->setSoftening(activeParams.m_softening); + m_nbody->setDamping(activeParams.m_damping); + + if (useCpu) { + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventCreate(&startEvent)); + checkCudaErrors(cudaEventCreate(&stopEvent)); + checkCudaErrors(cudaEventCreate(&hostMemSyncEvent)); + } + + if (!benchmark && !compareToCPU) { + m_renderer = new ParticleRenderer(window_width, window_height); + _resetRenderer(); + } + + sdkCreateTimer(&demoTimer); + sdkStartTimer(&demoTimer); + } + + void _reset(int numBodies, NBodyConfig config) + { + if (tipsyFile == "") { + randomizeBodies(config, + m_hPos, + m_hVel, + m_hColor, + activeParams.m_clusterScale, + activeParams.m_velocityScale, + numBodies, + true); + setArrays(m_hPos, m_hVel); + } + else { + m_nbody->loadTipsyFile(tipsyFile); + ::numBodies = m_nbody->getNumBodies(); + } + } + + void _resetRenderer() + { + if (fp64) { + float color[4] = {0.4f, 0.8f, 0.1f, 1.0f}; + m_renderer->setBaseColor(color); + } + else { + float color[4] = {1.0f, 0.6f, 0.3f, 1.0f}; + m_renderer->setBaseColor(color); + } + + m_renderer->setColors(m_hColor, m_nbody->getNumBodies()); + m_renderer->setSpriteSize(activeParams.m_pointSize); + m_renderer->setCameraPos(camera_trans); + } + + void _selectDemo(int index) + { + assert(index < numDemos); + + activeParams = demoParams[index]; + camera_trans[0] = camera_trans_lag[0] = activeParams.m_x; + camera_trans[1] = camera_trans_lag[1] = activeParams.m_y; + camera_trans[2] = camera_trans_lag[2] = activeParams.m_z; + reset(numBodies, NBODY_CONFIG_SHELL); + sdkResetTimer(&demoTimer); + + m_singleton->m_renderer->setCameraPos(camera_trans); + } + + bool _compareResults(int numBodies) + { + assert(m_nbodyCuda); + + bool passed = true; + + m_nbody->update(0.001f); + + { + m_nbodyCpu = new BodySystemCPU(numBodies); + + m_nbodyCpu->setArray(BODYSYSTEM_POSITION, m_hPos); + m_nbodyCpu->setArray(BODYSYSTEM_VELOCITY, m_hVel); + + m_nbodyCpu->update(0.001f); + + T *cudaPos = m_nbodyCuda->getArray(BODYSYSTEM_POSITION); + T *cpuPos = m_nbodyCpu->getArray(BODYSYSTEM_POSITION); + + T tolerance = 0.0005f; + + for (int i = 0; i < numBodies; i++) { + if (fabs(cpuPos[i] - cudaPos[i]) > tolerance) { + passed = false; + printf("Error: (host)%f != (device)%f\n", cpuPos[i], cudaPos[i]); + } + } + } + return passed; + } + + void _runBenchmark(int iterations) + { + // once without timing to prime the device + if (!useCpu) { + m_nbody->update(activeParams.m_timestep); + } + + if (useCpu) { + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(startEvent, 0)); + } + + for (int i = 0; i < iterations; ++i) { + m_nbody->update(activeParams.m_timestep); + } + + float 
+ + void _runBenchmark(int iterations) + { + // once without timing to prime the device + if (!useCpu) { + m_nbody->update(activeParams.m_timestep); + } + + if (useCpu) { + sdkCreateTimer(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(startEvent, 0)); + } + + for (int i = 0; i < iterations; ++i) { + m_nbody->update(activeParams.m_timestep); + } + + float milliseconds = 0; + + if (useCpu) { + sdkStopTimer(&timer); + milliseconds = sdkGetTimerValue(&timer); + sdkStartTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(stopEvent, 0)); + checkCudaErrors(cudaEventSynchronize(stopEvent)); + checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); + } + + double interactionsPerSecond = 0; + double gflops = 0; + computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); + + printf("%d bodies, total time for %d iterations: %.3f ms\n", numBodies, iterations, milliseconds); + printf("= %.3f billion interactions per second\n", interactionsPerSecond); + printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", + gflops, + (sizeof(T) > 4) ? "double" : "single", + flopsPerInteraction); + } }; -void finalize() { - if (!useCpu) { - checkCudaErrors(cudaEventDestroy(startEvent)); - checkCudaErrors(cudaEventDestroy(stopEvent)); - checkCudaErrors(cudaEventDestroy(hostMemSyncEvent)); - } +void finalize() +{ + if (!useCpu) { + checkCudaErrors(cudaEventDestroy(startEvent)); + checkCudaErrors(cudaEventDestroy(stopEvent)); + checkCudaErrors(cudaEventDestroy(hostMemSyncEvent)); + } - NBodyDemo<float>::Destroy(); + NBodyDemo<float>::Destroy(); - if (bSupportDouble) NBodyDemo<double>::Destroy(); + if (bSupportDouble) + NBodyDemo<double>::Destroy(); } -template <> -NBodyDemo<double> *NBodyDemo<double>::m_singleton = 0; -template <> -NBodyDemo<float> *NBodyDemo<float>::m_singleton = 0; +template <> NBodyDemo<double> *NBodyDemo<double>::m_singleton = 0; +template <> NBodyDemo<float> *NBodyDemo<float>::m_singleton = 0; -template <class T_new, class T_old> -void switchDemoPrecision() { - cudaDeviceSynchronize(); +template <class T_new, class T_old> void switchDemoPrecision() +{ + cudaDeviceSynchronize(); - fp64 = !fp64; - flopsPerInteraction = fp64 ? 30 : 20; + fp64 = !fp64; + flopsPerInteraction = fp64 ? 30 : 20; - T_old *oldPos = new T_old[numBodies * 4]; - T_old *oldVel = new T_old[numBodies * 4]; + T_old *oldPos = new T_old[numBodies * 4]; + T_old *oldVel = new T_old[numBodies * 4]; - NBodyDemo<T_old>::getArrays(oldPos, oldVel); + NBodyDemo<T_old>::getArrays(oldPos, oldVel); - // convert float to double - T_new *newPos = new T_new[numBodies * 4]; - T_new *newVel = new T_new[numBodies * 4]; + // convert float to double + T_new *newPos = new T_new[numBodies * 4]; + T_new *newVel = new T_new[numBodies * 4]; - for (int i = 0; i < numBodies * 4; i++) { - newPos[i] = (T_new)oldPos[i]; - newVel[i] = (T_new)oldVel[i]; - } + for (int i = 0; i < numBodies * 4; i++) { + newPos[i] = (T_new)oldPos[i]; + newVel[i] = (T_new)oldVel[i]; + } - NBodyDemo<T_new>::setArrays(newPos, newVel); + NBodyDemo<T_new>::setArrays(newPos, newVel); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); - delete[] oldPos; - delete[] oldVel; - delete[] newPos; - delete[] newVel; + delete[] oldPos; + delete[] oldVel; + delete[] newPos; + delete[] newVel; } -void initGL(int *argc, char **argv) { - EGLint configAttrs[] = {EGL_RED_SIZE, - 1, - EGL_GREEN_SIZE, - 1, - EGL_BLUE_SIZE, - 1, - EGL_DEPTH_SIZE, - 16, - EGL_SAMPLE_BUFFERS, - 0, - EGL_SAMPLES, - 0, - EGL_RENDERABLE_TYPE, - EGL_OPENGL_ES2_BIT, - EGL_NONE}; +void initGL(int *argc, char **argv) +{ + EGLint configAttrs[] = {EGL_RED_SIZE, + 1, + EGL_GREEN_SIZE, + 1, + EGL_BLUE_SIZE, + 1, + EGL_DEPTH_SIZE, + 16, + EGL_SAMPLE_BUFFERS, + 0, + EGL_SAMPLES, + 0, + EGL_RENDERABLE_TYPE, + EGL_OPENGL_ES2_BIT, + EGL_NONE}; - EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; + EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; - EGLint windowAttrs[] = {EGL_NONE}; - EGLConfig *configList = NULL; - EGLint configCount; + EGLint windowAttrs[] = 
{EGL_NONE}; + EGLConfig *configList = NULL; + EGLint configCount; - eglDisplay = eglGetDisplay(0); + eglDisplay = eglGetDisplay(0); - if (eglDisplay == EGL_NO_DISPLAY) { - printf("EGL failed to obtain display\n"); - exit(EXIT_FAILURE); - } + if (eglDisplay == EGL_NO_DISPLAY) { + printf("EGL failed to obtain display\n"); + exit(EXIT_FAILURE); + } - if (!eglInitialize(eglDisplay, 0, 0)) { - printf("EGL failed to initialize\n"); - exit(EXIT_FAILURE); - } + if (!eglInitialize(eglDisplay, 0, 0)) { + printf("EGL failed to initialize\n"); + exit(EXIT_FAILURE); + } - if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || - !configCount) { - printf("EGL failed to return matching configs\n"); - exit(EXIT_FAILURE); - } + if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || !configCount) { + printf("EGL failed to return matching configs\n"); + exit(EXIT_FAILURE); + } - configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); + configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); - if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, - &configCount) || - !configCount) { - printf("EGL failed to populate config list\n"); - exit(EXIT_FAILURE); - } + if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, &configCount) || !configCount) { + printf("EGL failed to populate config list\n"); + exit(EXIT_FAILURE); + } - Display *xDisplay = XOpenDisplay(NULL); - if (!xDisplay) { - printf("X server failed to open a window\n"); - exit(EXIT_FAILURE); - } + Display *xDisplay = XOpenDisplay(NULL); + if (!xDisplay) { + printf("X server failed to open a window\n"); + exit(EXIT_FAILURE); + } - Window xRootWindow = DefaultRootWindow(xDisplay); - XSetWindowAttributes xCreateWindowAttributes; - xCreateWindowAttributes.event_mask = ExposureMask; - Window xWindow = - XCreateWindow(xDisplay, xRootWindow, 0, 0, window_width, window_height, 0, - CopyFromParent, InputOutput, CopyFromParent, CWEventMask, - &xCreateWindowAttributes); - XMapWindow(xDisplay, xWindow); - Atom netWmStateAtom = XInternAtom(xDisplay, "_NET_WM_STATE", false); - XEvent xEvent; - memset(&xEvent, 0, sizeof(xEvent)); - xEvent.type = ClientMessage; - xEvent.xclient.window = xWindow; - xEvent.xclient.message_type = netWmStateAtom; - xEvent.xclient.format = 32; - xEvent.xclient.data.l[0] = 1; - xEvent.xclient.data.l[1] = false; - XSendEvent(xDisplay, xRootWindow, false, SubstructureNotifyMask, &xEvent); + Window xRootWindow = DefaultRootWindow(xDisplay); + XSetWindowAttributes xCreateWindowAttributes; + xCreateWindowAttributes.event_mask = ExposureMask; + Window xWindow = XCreateWindow(xDisplay, + xRootWindow, + 0, + 0, + window_width, + window_height, + 0, + CopyFromParent, + InputOutput, + CopyFromParent, + CWEventMask, + &xCreateWindowAttributes); + XMapWindow(xDisplay, xWindow); + Atom netWmStateAtom = XInternAtom(xDisplay, "_NET_WM_STATE", false); + XEvent xEvent; + memset(&xEvent, 0, sizeof(xEvent)); + xEvent.type = ClientMessage; + xEvent.xclient.window = xWindow; + xEvent.xclient.message_type = netWmStateAtom; + xEvent.xclient.format = 32; + xEvent.xclient.data.l[0] = 1; + xEvent.xclient.data.l[1] = false; + XSendEvent(xDisplay, xRootWindow, false, SubstructureNotifyMask, &xEvent); - eglSurface = eglCreateWindowSurface( - eglDisplay, configList[0], (EGLNativeWindowType)xWindow, windowAttrs); - if (!eglSurface) { - printf("EGL couldn't create window\n"); - exit(EXIT_FAILURE); - } + eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], 
(EGLNativeWindowType)xWindow, windowAttrs); + if (!eglSurface) { + printf("EGL couldn't create window\n"); + exit(EXIT_FAILURE); + } - eglBindAPI(EGL_OPENGL_ES_API); + eglBindAPI(EGL_OPENGL_ES_API); - eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); - if (!eglContext) { - printf("EGL couldn't create context\n"); - exit(EXIT_FAILURE); - } + eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); + if (!eglContext) { + printf("EGL couldn't create context\n"); + exit(EXIT_FAILURE); + } - if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) { - printf("EGL couldn't make context/surface current\n"); - exit(EXIT_FAILURE); - } + if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) { + printf("EGL couldn't make context/surface current\n"); + exit(EXIT_FAILURE); + } - EGLint contextRendererType; - eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, - &contextRendererType); + EGLint contextRendererType; + eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, &contextRendererType); - switch (contextRendererType) { + switch (contextRendererType) { case EGL_OPENGL_ES_API: - printf("Using OpenGL ES API\n"); - break; + printf("Using OpenGL ES API\n"); + break; case EGL_OPENGL_API: - printf("Using OpenGL API - this is unsupported\n"); - exit(EXIT_FAILURE); + printf("Using OpenGL API - this is unsupported\n"); + exit(EXIT_FAILURE); case EGL_OPENVG_API: - printf("Using OpenVG API - this is unsupported\n"); - exit(EXIT_FAILURE); + printf("Using OpenVG API - this is unsupported\n"); + exit(EXIT_FAILURE); default: - printf("Unknown context type\n"); - exit(EXIT_FAILURE); - } + printf("Unknown context type\n"); + exit(EXIT_FAILURE); + } } -void selectDemo(int activeDemo) { - if (fp64) { - NBodyDemo<double>::selectDemo(activeDemo); - } else { - NBodyDemo<float>::selectDemo(activeDemo); - } +void selectDemo(int activeDemo) +{ + if (fp64) { + NBodyDemo<double>::selectDemo(activeDemo); + } + else { + NBodyDemo<float>::selectDemo(activeDemo); + } } -void updateSimulation() { - if (fp64) { - NBodyDemo<double>::updateSimulation(); - } else { - NBodyDemo<float>::updateSimulation(); - } +void updateSimulation() +{ + if (fp64) { + NBodyDemo<double>::updateSimulation(); + } + else { + NBodyDemo<float>::updateSimulation(); + } } -void displayNBodySystem() { - if (fp64) { - NBodyDemo<double>::display(); - } else { - NBodyDemo<float>::display(); - } +void displayNBodySystem() +{ + if (fp64) { + NBodyDemo<double>::display(); + } + else { + NBodyDemo<float>::display(); + } }
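selectDemo, updateSimulation and displayNBodySystem all follow the same idiom: a runtime bool chooses between two compile-time instantiations of NBodyDemo. A generic sketch of that dispatch shape (Demo and step are illustrative names, not from the sample):

    #include <cstdio>

    template <typename T> struct Demo {
        static void step() { printf("step with %zu-byte reals\n", sizeof(T)); }
    };

    // Runtime flag -> one of two fully instantiated code paths.
    void step(bool fp64)
    {
        if (fp64)
            Demo<double>::step();  // double-precision instantiation
        else
            Demo<float>::step();   // single-precision instantiation
    }

    int main()
    {
        step(false);  // prints: step with 4-byte reals
        step(true);   // prints: step with 8-byte reals
        return 0;
    }

The branch costs one predictable comparison per call, while each instantiation stays free of per-element precision checks.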
-void display() { - static double gflops = 0; - static double ifps = 0; - static double interactionsPerSecond = 0; +void display() +{ + static double gflops = 0; + static double ifps = 0; + static double interactionsPerSecond = 0; - // update the simulation - if (!bPause) { - if (cycleDemo && (sdkGetTimerValue(&demoTimer) > demoTime)) { - activeDemo = (activeDemo + 1) % numDemos; - selectDemo(activeDemo); + // update the simulation + if (!bPause) { + if (cycleDemo && (sdkGetTimerValue(&demoTimer) > demoTime)) { + activeDemo = (activeDemo + 1) % numDemos; + selectDemo(activeDemo); + } + + updateSimulation(); + + if (!useCpu) { + cudaEventRecord(hostMemSyncEvent, + 0); // insert an event to wait on before rendering + } } - updateSimulation(); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - if (!useCpu) { - cudaEventRecord(hostMemSyncEvent, - 0); // insert an event to wait on before rendering - } - } + if (displayEnabled) { + // view transform + for (int c = 0; c < 3; ++c) { + camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; + camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; + } - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - - if (displayEnabled) { - // view transform - for (int c = 0; c < 3; ++c) { - camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; - camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; + displayNBodySystem(); } - displayNBodySystem(); - } + fpsCount++; - fpsCount++; + // this displays the frame rate updated every second (independent of frame + // rate) + if (fpsCount >= fpsLimit) { + char fps[256]; - // this displays the frame rate updated every second (independent of frame - // rate) - if (fpsCount >= fpsLimit) { - char fps[256]; + float milliseconds = 1; - float milliseconds = 1; + // stop timer + if (useCpu) { + milliseconds = sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + } + else { + checkCudaErrors(cudaEventRecord(stopEvent, 0)); + checkCudaErrors(cudaEventSynchronize(stopEvent)); + } - // stop timer - if (useCpu) { - milliseconds = sdkGetTimerValue(&timer); - sdkResetTimer(&timer); - } else { - checkCudaErrors(cudaEventRecord(stopEvent, 0)); - checkCudaErrors(cudaEventSynchronize(stopEvent)); + milliseconds /= (float)fpsCount; + computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); + + ifps = 1.f / (milliseconds / 1000.f); + sprintf(fps, + "CUDA N-Body (%d bodies): " + "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", + numBodies, + ifps, + interactionsPerSecond, + gflops, + fp64 ? "double precision" : "single precision"); + + fpsCount = 0; + fpsLimit = (ifps > 1.f) ? (int)ifps : 1; + + if (bPause) { + fpsLimit = 0; + } + + // restart timer + if (!useCpu) { + checkCudaErrors(cudaEventRecord(startEvent, 0)); + } } - - milliseconds /= (float)fpsCount; - computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); - - ifps = 1.f / (milliseconds / 1000.f); - sprintf(fps, - "CUDA N-Body (%d bodies): " - "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", - numBodies, ifps, interactionsPerSecond, gflops, - fp64 ? "double precision" : "single precision"); - - fpsCount = 0; - fpsLimit = (ifps > 1.f) ? (int)ifps : 1; - - if (bPause) { - fpsLimit = 0; - } - - // restart timer - if (!useCpu) { - checkCudaErrors(cudaEventRecord(startEvent, 0)); - } - } }
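Worked numbers for the frame-rate bookkeeping in display() (all values hypothetical): the accumulated time is averaged over fpsCount frames, inverted into fps, and fpsLimit is then set so the readout recurs roughly once per second:

    #include <cstdio>

    int main()
    {
        float milliseconds = 85.0f;  // elapsed over the last fpsCount frames
        int   fpsCount     = 5;      // frames since the previous readout

        float perFrame = milliseconds / fpsCount;    // 17 ms per frame
        float ifps     = 1.f / (perFrame / 1000.f);  // ~58.8 fps
        int   fpsLimit = (ifps > 1.f) ? (int)ifps : 1;

        // next readout after ~58 frames, i.e. about one second at this rate
        printf("%.1f fps, next update after %d frames\n", ifps, fpsLimit);
        return 0;
    }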
"Double" : "Single"); - } - - break; + break; case '`': - bShowSliders = !bShowSliders; - break; + bShowSliders = !bShowSliders; + break; case 'g': case 'G': - bDispInteractions = !bDispInteractions; - break; + bDispInteractions = !bDispInteractions; + break; case 'c': case 'C': - cycleDemo = !cycleDemo; - printf("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF"); - break; + cycleDemo = !cycleDemo; + printf("Cycle Demo Parameters: %s\n", cycleDemo ? "ON" : "OFF"); + break; case '[': - activeDemo = - (activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos; - selectDemo(activeDemo); - break; + activeDemo = (activeDemo == 0) ? numDemos - 1 : (activeDemo - 1) % numDemos; + selectDemo(activeDemo); + break; case ']': - activeDemo = (activeDemo + 1) % numDemos; - selectDemo(activeDemo); - break; + activeDemo = (activeDemo + 1) % numDemos; + selectDemo(activeDemo); + break; case 'd': case 'D': - displayEnabled = !displayEnabled; - break; + displayEnabled = !displayEnabled; + break; case 'o': case 'O': - activeParams.print(); - break; + activeParams.print(); + break; case '1': - if (fp64) { - NBodyDemo::reset(numBodies, NBODY_CONFIG_SHELL); - } else { - NBodyDemo::reset(numBodies, NBODY_CONFIG_SHELL); - } + if (fp64) { + NBodyDemo::reset(numBodies, NBODY_CONFIG_SHELL); + } + else { + NBodyDemo::reset(numBodies, NBODY_CONFIG_SHELL); + } - break; + break; case '2': - if (fp64) { - NBodyDemo::reset(numBodies, NBODY_CONFIG_RANDOM); - } else { - NBodyDemo::reset(numBodies, NBODY_CONFIG_RANDOM); - } + if (fp64) { + NBodyDemo::reset(numBodies, NBODY_CONFIG_RANDOM); + } + else { + NBodyDemo::reset(numBodies, NBODY_CONFIG_RANDOM); + } - break; + break; case '3': - if (fp64) { - NBodyDemo::reset(numBodies, NBODY_CONFIG_EXPAND); - } else { - NBodyDemo::reset(numBodies, NBODY_CONFIG_EXPAND); - } + if (fp64) { + NBodyDemo::reset(numBodies, NBODY_CONFIG_EXPAND); + } + else { + NBodyDemo::reset(numBodies, NBODY_CONFIG_EXPAND); + } - break; - } + break; + } } -void showHelp() { - printf("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); - printf( - "\t-fp64 (use double precision floating point values for " - "simulation)\n"); - printf("\t-hostmem (stores simulation data in host memory)\n"); - printf("\t-benchmark (run benchmark to measure performance) \n"); - printf( - "\t-numbodies= (number of bodies (>= 1) to run in simulation) \n"); - printf( - "\t-device= (where d=0,1,2.... for the CUDA device to use)\n"); - printf("\t-dispno= (where n represents the display to use)\n"); - printf( - "\t-width= (where w represents the width of the window to " - "open)\n"); - printf( - "\t-width= (where h represents the height of the window to " - "open)\n"); - printf( - "\t-numdevices= (where i=(number of CUDA devices > 0) to use for " - "simulation)\n"); - printf( - "\t-compare (compares simulation results running once on the " - "default GPU and once on the CPU)\n"); - printf("\t-cpu (run n-body simulation on the CPU)\n"); - printf("\t-tipsy= (load a tipsy model file for simulation)\n\n"); +void showHelp() +{ + printf("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); + printf("\t-fp64 (use double precision floating point values for " + "simulation)\n"); + printf("\t-hostmem (stores simulation data in host memory)\n"); + printf("\t-benchmark (run benchmark to measure performance) \n"); + printf("\t-numbodies= (number of bodies (>= 1) to run in simulation) \n"); + printf("\t-device= (where d=0,1,2.... 
for the CUDA device to use)\n"); + printf("\t-dispno= (where n represents the display to use)\n"); + printf("\t-width= (where w represents the width of the window to " + "open)\n"); + printf("\t-width= (where h represents the height of the window to " + "open)\n"); + printf("\t-numdevices= (where i=(number of CUDA devices > 0) to use for " + "simulation)\n"); + printf("\t-compare (compares simulation results running once on the " + "default GPU and once on the CPU)\n"); + printf("\t-cpu (run n-body simulation on the CPU)\n"); + printf("\t-tipsy= (load a tipsy model file for simulation)\n\n"); } ////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - bool bTestResults = true; +int main(int argc, char **argv) +{ + bool bTestResults = true; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - if (checkCmdLineFlag(argc, (const char **)argv, "help")) { - printf("\n> Command line options\n"); + if (checkCmdLineFlag(argc, (const char **)argv, "help")) { + printf("\n> Command line options\n"); + showHelp(); + return 0; + } + + printf("Run \"nbody_opengles -benchmark [-numbodies=]\" to measure " + "performance.\n"); showHelp(); - return 0; - } - printf( - "Run \"nbody_opengles -benchmark [-numbodies=]\" to measure " - "performance.\n"); - showHelp(); + bFullscreen = (checkCmdLineFlag(argc, (const char **)argv, "fullscreen") != 0); - bFullscreen = - (checkCmdLineFlag(argc, (const char **)argv, "fullscreen") != 0); - - if (bFullscreen) { - bShowSliders = false; - } - - benchmark = (checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0); - - compareToCPU = - ((checkCmdLineFlag(argc, (const char **)argv, "compare") != 0) || - (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0)); - - QATest = (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0); - useHostMem = (checkCmdLineFlag(argc, (const char **)argv, "hostmem") != 0); - fp64 = (checkCmdLineFlag(argc, (const char **)argv, "fp64") != 0); - - flopsPerInteraction = fp64 ? 30 : 20; - - useCpu = (checkCmdLineFlag(argc, (const char **)argv, "cpu") != 0); - - if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) { - numDevsRequested = - getCmdLineArgumentInt(argc, (const char **)argv, "numdevices"); - - if (numDevsRequested < 1) { - printf( - "Error: \"number of CUDA devices\" specified %d is invalid. Value " - "should be >= 1\n", - numDevsRequested); - exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); - } else { - printf("number of CUDA devices = %d\n", numDevsRequested); - } - } - - if (checkCmdLineFlag(argc, (const char **)argv, "dispno")) { - dispno = getCmdLineArgumentInt(argc, (const char **)argv, "dispno"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "width")) { - window_width = getCmdLineArgumentInt(argc, (const char **)argv, "width"); - } - - if (checkCmdLineFlag(argc, (const char **)argv, "height")) { - window_height = getCmdLineArgumentInt(argc, (const char **)argv, "height"); - } - - // for multi-device we currently require using host memory -- the devices - // share data via the host - if (numDevsRequested > 1) { - useHostMem = true; - } - - int numDevsAvailable = 0; - bool customGPU = false; - cudaGetDeviceCount(&numDevsAvailable); - - if (numDevsAvailable < numDevsRequested) { - printf("Error: only %d Devices available, %d requested. 
Exiting.\n", - numDevsAvailable, numDevsRequested); - exit(EXIT_SUCCESS); - } - - printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); - printf("> Simulation data stored in %s memory\n", - useHostMem ? "system" : "video"); - printf("> %s precision floating point simulation\n", - fp64 ? "Double" : "Single"); - printf("> %d Devices used for simulation\n", numDevsRequested); - - int devID; - cudaDeviceProp props; - - if (useCpu) { - useHostMem = true; - compareToCPU = false; - bSupportDouble = true; - -#ifdef OPENMP - printf("> Simulation with CPU using OpenMP\n"); -#else - printf("> Simulation with CPU\n"); -#endif - } - - if (!benchmark && !compareToCPU) { - initGL(&argc, argv); - } - - if (!useCpu) { - if (checkCmdLineFlag(argc, (const char **)argv, "device")) { - customGPU = true; + if (bFullscreen) { + bShowSliders = false; } -#if defined(__aarch64__) || defined(__arm__) - // find iGPU on the system which is compute capable which will perform - // GLES-CUDA interop - devID = findIntegratedGPU(); -#else - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - devID = findCudaDevice(argc, (const char **)argv); -#endif + benchmark = (checkCmdLineFlag(argc, (const char **)argv, "benchmark") != 0); - checkCudaErrors(cudaGetDevice(&devID)); - checkCudaErrors(cudaGetDeviceProperties(&props, devID)); + compareToCPU = ((checkCmdLineFlag(argc, (const char **)argv, "compare") != 0) + || (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0)); - bSupportDouble = true; + QATest = (checkCmdLineFlag(argc, (const char **)argv, "qatest") != 0); + useHostMem = (checkCmdLineFlag(argc, (const char **)argv, "hostmem") != 0); + fp64 = (checkCmdLineFlag(argc, (const char **)argv, "fp64") != 0); - // Initialize devices - if (numDevsRequested > 1 && customGPU) { - printf("You can't use --numdevices and --device at the same time.\n"); - exit(EXIT_SUCCESS); - } + flopsPerInteraction = fp64 ? 30 : 20; - if (customGPU || numDevsRequested == 1) { - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, devID)); - printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, - props.name); - } else { - for (int i = 0; i < numDevsRequested; i++) { - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, i)); + useCpu = (checkCmdLineFlag(argc, (const char **)argv, "cpu") != 0); - printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, - props.name); + if (checkCmdLineFlag(argc, (const char **)argv, "numdevices")) { + numDevsRequested = getCmdLineArgumentInt(argc, (const char **)argv, "numdevices"); - if (useHostMem) { - if (!props.canMapHostMemory) { - fprintf(stderr, "Device %d cannot map host memory!\n", devID); - exit(EXIT_SUCCESS); - } - - if (numDevsRequested > 1) { - checkCudaErrors(cudaSetDevice(i)); - } - - checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); + if (numDevsRequested < 1) { + printf("Error: \"number of CUDA devices\" specified %d is invalid. Value " + "should be >= 1\n", + numDevsRequested); + exit(bTestResults ? 
EXIT_SUCCESS : EXIT_FAILURE); + } + else { + printf("number of CUDA devices = %d\n", numDevsRequested); } - } - - // CC 1.2 and earlier do not support double precision - if (props.major * 10 + props.minor <= 12) { - bSupportDouble = false; - } } - // if(numDevsRequested > 1) - // checkCudaErrors(cudaSetDevice(devID)); - - if (fp64 && !bSupportDouble) { - fprintf(stderr, - "One or more of the requested devices does not support double " - "precision floating-point\n"); - exit(EXIT_SUCCESS); + if (checkCmdLineFlag(argc, (const char **)argv, "dispno")) { + dispno = getCmdLineArgumentInt(argc, (const char **)argv, "dispno"); } - } - numIterations = 0; - blockSize = 0; + if (checkCmdLineFlag(argc, (const char **)argv, "width")) { + window_width = getCmdLineArgumentInt(argc, (const char **)argv, "width"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "i")) { - numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); - } + if (checkCmdLineFlag(argc, (const char **)argv, "height")) { + window_height = getCmdLineArgumentInt(argc, (const char **)argv, "height"); + } - if (checkCmdLineFlag(argc, (const char **)argv, "blockSize")) { - blockSize = getCmdLineArgumentInt(argc, (const char **)argv, "blockSize"); - } + // for multi-device we currently require using host memory -- the devices + // share data via the host + if (numDevsRequested > 1) { + useHostMem = true; + } - if (blockSize == 0) // blockSize not set on command line - blockSize = 256; + int numDevsAvailable = 0; + bool customGPU = false; + cudaGetDeviceCount(&numDevsAvailable); + + if (numDevsAvailable < numDevsRequested) { + printf("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); + exit(EXIT_SUCCESS); + } + + printf("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); + printf("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video"); + printf("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); + printf("> %d Devices used for simulation\n", numDevsRequested); + + int devID; + cudaDeviceProp props; + + if (useCpu) { + useHostMem = true; + compareToCPU = false; + bSupportDouble = true; - // default number of bodies is #SMs * 4 * CTA size - if (useCpu) #ifdef OPENMP - numBodies = 8192; - + printf("> Simulation with CPU using OpenMP\n"); #else - numBodies = 4096; + printf("> Simulation with CPU\n"); #endif - else if (numDevsRequested == 1) { - numBodies = compareToCPU ? 4096 : blockSize * 4 * props.multiProcessorCount; - } else { - numBodies = 0; - - for (int i = 0; i < numDevsRequested; i++) { - cudaDeviceProp props; - checkCudaErrors(cudaGetDeviceProperties(&props, i)); - numBodies += - blockSize * (props.major >= 2 ? 4 : 1) * props.multiProcessorCount; - } - } - - if (checkCmdLineFlag(argc, (const char **)argv, "numbodies")) { - numBodies = getCmdLineArgumentInt(argc, (const char **)argv, "numbodies"); - - if (numBodies < 1) { - printf( - "Error: \"number of bodies\" specified %d is invalid. Value should " - "be >= 1\n", - numBodies); - exit(bTestResults ? 
EXIT_SUCCESS : EXIT_FAILURE); - } else if (numBodies % blockSize) { - int newNumBodies = ((numBodies / blockSize) + 1) * blockSize; - printf( - "Warning: \"number of bodies\" specified %d is not a multiple of " - "%d.\n", - numBodies, blockSize); - printf("Rounding up to the nearest multiple: %d.\n", newNumBodies); - numBodies = newNumBodies; - } else { - printf("number of bodies = %d\n", numBodies); - } - } - - char *fname; - - if (getCmdLineArgumentString(argc, (const char **)argv, "tipsy", &fname)) { - tipsyFile.assign(fname, strlen(fname)); - cycleDemo = false; - bShowSliders = false; - } - - if (numBodies <= 1024) { - activeParams.m_clusterScale = 1.52f; - activeParams.m_velocityScale = 2.f; - } else if (numBodies <= 2048) { - activeParams.m_clusterScale = 1.56f; - activeParams.m_velocityScale = 2.64f; - } else if (numBodies <= 4096) { - activeParams.m_clusterScale = 1.68f; - activeParams.m_velocityScale = 2.98f; - } else if (numBodies <= 8192) { - activeParams.m_clusterScale = 1.98f; - activeParams.m_velocityScale = 2.9f; - } else if (numBodies <= 16384) { - activeParams.m_clusterScale = 1.54f; - activeParams.m_velocityScale = 8.f; - } else if (numBodies <= 32768) { - activeParams.m_clusterScale = 1.44f; - activeParams.m_velocityScale = 11.f; - } - - NBodyDemo<float>::Create(); - - NBodyDemo<float>::init(numBodies, numDevsRequested, blockSize, - !(benchmark || compareToCPU || useHostMem), useHostMem, - useCpu); - NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL); - - if (bSupportDouble) { - NBodyDemo<double>::Create(); - NBodyDemo<double>::init(numBodies, numDevsRequested, blockSize, - !(benchmark || compareToCPU || useHostMem), - useHostMem, useCpu); - NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL); - } - - if (benchmark) { - if (numIterations <= 0) { - numIterations = 10; } - NBodyDemo<float>::runBenchmark(numIterations); - } else if (compareToCPU) { - bTestResults = NBodyDemo<float>::compareResults(numBodies); - } else { - glClear(GL_COLOR_BUFFER_BIT); - - eglSwapBuffers(eglDisplay, eglSurface); - - while (1) { - display(); - usleep(1000); - eglSwapBuffers(eglDisplay, eglSurface); } - if (!useCpu) { - checkCudaErrors(cudaEventRecord(startEvent, 0)); - } - } - finalize(); - exit(bTestResults ? 
EXIT_SUCCESS : EXIT_FAILURE); +#if defined(__aarch64__) || defined(__arm__) + // find iGPU on the system which is compute capable which will perform + // GLES-CUDA interop + devID = findIntegratedGPU(); +#else + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + devID = findCudaDevice(argc, (const char **)argv); +#endif + + checkCudaErrors(cudaGetDevice(&devID)); + checkCudaErrors(cudaGetDeviceProperties(&props, devID)); + + bSupportDouble = true; + + // Initialize devices + if (numDevsRequested > 1 && customGPU) { + printf("You can't use --numdevices and --device at the same time.\n"); + exit(EXIT_SUCCESS); + } + + if (customGPU || numDevsRequested == 1) { + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, devID)); + printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); + } + else { + for (int i = 0; i < numDevsRequested; i++) { + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, i)); + + printf("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); + + if (useHostMem) { + if (!props.canMapHostMemory) { + fprintf(stderr, "Device %d cannot map host memory!\n", devID); + exit(EXIT_SUCCESS); + } + + if (numDevsRequested > 1) { + checkCudaErrors(cudaSetDevice(i)); + } + + checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); + } + } + + // CC 1.2 and earlier do not support double precision + if (props.major * 10 + props.minor <= 12) { + bSupportDouble = false; + } + } + + // if(numDevsRequested > 1) + // checkCudaErrors(cudaSetDevice(devID)); + + if (fp64 && !bSupportDouble) { + fprintf(stderr, + "One or more of the requested devices does not support double " + "precision floating-point\n"); + exit(EXIT_SUCCESS); + } + } + + numIterations = 0; + blockSize = 0; + + if (checkCmdLineFlag(argc, (const char **)argv, "i")) { + numIterations = getCmdLineArgumentInt(argc, (const char **)argv, "i"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "blockSize")) { + blockSize = getCmdLineArgumentInt(argc, (const char **)argv, "blockSize"); + } + + if (blockSize == 0) // blockSize not set on command line + blockSize = 256; + + // default number of bodies is #SMs * 4 * CTA size + if (useCpu) +#ifdef OPENMP + numBodies = 8192; + +#else + numBodies = 4096; +#endif + else if (numDevsRequested == 1) { + numBodies = compareToCPU ? 4096 : blockSize * 4 * props.multiProcessorCount; + } + else { + numBodies = 0; + + for (int i = 0; i < numDevsRequested; i++) { + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, i)); + numBodies += blockSize * (props.major >= 2 ? 4 : 1) * props.multiProcessorCount; + } + } + + if (checkCmdLineFlag(argc, (const char **)argv, "numbodies")) { + numBodies = getCmdLineArgumentInt(argc, (const char **)argv, "numbodies"); + + if (numBodies < 1) { + printf("Error: \"number of bodies\" specified %d is invalid. Value should " + "be >= 1\n", + numBodies); + exit(bTestResults ? 
EXIT_SUCCESS : EXIT_FAILURE); + } + else if (numBodies % blockSize) { + int newNumBodies = ((numBodies / blockSize) + 1) * blockSize; + printf("Warning: \"number of bodies\" specified %d is not a multiple of " "%d.\n", + numBodies, + blockSize); + printf("Rounding up to the nearest multiple: %d.\n", newNumBodies); + numBodies = newNumBodies; + } + else { + printf("number of bodies = %d\n", numBodies); + } + } + + char *fname; + + if (getCmdLineArgumentString(argc, (const char **)argv, "tipsy", &fname)) { + tipsyFile.assign(fname, strlen(fname)); + cycleDemo = false; + bShowSliders = false; + } + + if (numBodies <= 1024) { + activeParams.m_clusterScale = 1.52f; + activeParams.m_velocityScale = 2.f; + } + else if (numBodies <= 2048) { + activeParams.m_clusterScale = 1.56f; + activeParams.m_velocityScale = 2.64f; + } + else if (numBodies <= 4096) { + activeParams.m_clusterScale = 1.68f; + activeParams.m_velocityScale = 2.98f; + } + else if (numBodies <= 8192) { + activeParams.m_clusterScale = 1.98f; + activeParams.m_velocityScale = 2.9f; + } + else if (numBodies <= 16384) { + activeParams.m_clusterScale = 1.54f; + activeParams.m_velocityScale = 8.f; + } + else if (numBodies <= 32768) { + activeParams.m_clusterScale = 1.44f; + activeParams.m_velocityScale = 11.f; + } + + NBodyDemo<float>::Create(); + + NBodyDemo<float>::init( + numBodies, numDevsRequested, blockSize, !(benchmark || compareToCPU || useHostMem), useHostMem, useCpu); + NBodyDemo<float>::reset(numBodies, NBODY_CONFIG_SHELL); + + if (bSupportDouble) { + NBodyDemo<double>::Create(); + NBodyDemo<double>::init( + numBodies, numDevsRequested, blockSize, !(benchmark || compareToCPU || useHostMem), useHostMem, useCpu); + NBodyDemo<double>::reset(numBodies, NBODY_CONFIG_SHELL); + } + + if (benchmark) { + if (numIterations <= 0) { + numIterations = 10; + } + + NBodyDemo<float>::runBenchmark(numIterations); + } + else if (compareToCPU) { + bTestResults = NBodyDemo<float>::compareResults(numBodies); + } + else { + glClear(GL_COLOR_BUFFER_BIT); + + eglSwapBuffers(eglDisplay, eglSurface); + + while (1) { + display(); + usleep(1000); + eglSwapBuffers(eglDisplay, eglSurface); + } + + if (!useCpu) { + checkCudaErrors(cudaEventRecord(startEvent, 0)); + } + } + + finalize(); + exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE); }
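The body-count logic above has two numeric rules: by default the simulation sizes itself to four CTAs' worth of bodies per SM, and a user-supplied count is rounded up to the next blockSize multiple. A small check with an illustrative SM count (8 is not taken from any real device query):

    #include <cstdio>

    int main()
    {
        const int blockSize = 256;
        const int smCount   = 8;  // e.g. a small Tegra iGPU, hypothetical

        // default: blockSize * 4 * multiProcessorCount
        int numBodies = blockSize * 4 * smCount;  // 8192
        printf("default numBodies = %d\n", numBodies);

        // a -numbodies=10000 request is not a multiple of 256, so it rounds up
        int requested = 10000;
        int rounded   = ((requested / blockSize) + 1) * blockSize;  // 40 * 256
        printf("%d rounds up to %d\n", requested, rounded);         // 10240
        return 0;
    }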
diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.cpp b/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.cpp index 3af7c32a..8cc91d1a 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.cpp +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.cpp @@ -27,294 +27,294 @@ #include "render_particles.h" -#include <GLES3/gl31.h> -#include <math.h> - -#include <assert.h> - -#include <stdio.h> -#include <stdlib.h> +#include <GLES3/gl31.h> +#include <assert.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> -void mat_identity(matrix4 m) { - m[0][1] = m[0][2] = m[0][3] = m[1][0] = m[1][2] = m[1][3] = m[2][0] = - m[2][1] = m[2][3] = m[3][0] = m[3][1] = m[3][2] = 0.0f; - m[0][0] = m[1][1] = m[2][2] = m[3][3] = 1.0f; +void mat_identity(matrix4 m) +{ + m[0][1] = m[0][2] = m[0][3] = m[1][0] = m[1][2] = m[1][3] = m[2][0] = m[2][1] = m[2][3] = m[3][0] = m[3][1] = + m[3][2] = 0.0f; + m[0][0] = m[1][1] = m[2][2] = m[3][3] = 1.0f; } -void mat_multiply(matrix4 m0, matrix4 m1) { - float m[4]; +void mat_multiply(matrix4 m0, matrix4 m1) +{ + float m[4]; - for (int r = 0; r < 4; r++) { - m[0] = m[1] = m[2] = m[3] = 0.0f; + for (int r = 0; r < 4; r++) { + m[0] = m[1] = m[2] = m[3] = 0.0f; - for (int c = 0; c < 4; c++) { - for (int i = 0; i < 4; i++) { - m[c] += m0[i][r] * m1[c][i]; - } + for (int c = 0; c < 4; c++) { + for (int i = 0; i < 4; i++) { + m[c] += m0[i][r] * m1[c][i]; + } + } + + for (int c = 0; c < 4; c++) { + m0[c][r] = m[c]; + } } - - for (int c = 0; c < 4; c++) { - m0[c][r] = m[c]; - } - } } -void mat_translate(matrix4 m, vector3 v) { - matrix4 m2; - m2[0][0] = m2[1][1] = m2[2][2] = m2[3][3] = 1.0f; - m2[0][1] = m2[0][2] = m2[0][3] = m2[1][0] = m2[1][2] = m2[1][3] = m2[2][0] = - m2[2][1] = m2[2][3] = 0.0f; - m2[3][0] = v[0]; - m2[3][1] = v[1]; - m2[3][2] = v[2]; - mat_multiply(m, m2); +void mat_translate(matrix4 m, vector3 v) +{ + matrix4 m2; + m2[0][0] = m2[1][1] = m2[2][2] = m2[3][3] = 1.0f; + m2[0][1] = m2[0][2] = m2[0][3] = m2[1][0] = m2[1][2] = m2[1][3] = m2[2][0] = m2[2][1] = m2[2][3] = 0.0f; + m2[3][0] = v[0]; + m2[3][1] = v[1]; + m2[3][2] = v[2]; + mat_multiply(m, m2); } -void mat_perspective(matrix4 m, GLfloat fovy, GLfloat aspect, GLfloat znear, - GLfloat zfar) { - matrix4 m2; - m2[1][0] = m2[2][0] = m2[3][0] = m2[0][1] = m2[2][1] = m2[3][1] = m2[0][2] = - m2[1][2] = m2[0][3] = m2[1][3] = m2[3][3] = 0.0f; - m2[2][3] = -1.0f; +void mat_perspective(matrix4 m, GLfloat fovy, GLfloat aspect, GLfloat znear, GLfloat zfar) +{ + matrix4 m2; + m2[1][0] = m2[2][0] = m2[3][0] = m2[0][1] = m2[2][1] = m2[3][1] = m2[0][2] = m2[1][2] = m2[0][3] = m2[1][3] = + m2[3][3] = 0.0f; + m2[2][3] = -1.0f; - float f = 1 / tan((fovy * M_PI / 180) / 2); - m2[0][0] = f / aspect; - m2[1][1] = f; + float f = 1 / tan((fovy * M_PI / 180) / 2); + m2[0][0] = f / aspect; + m2[1][1] = f; - m2[2][2] = ((znear + zfar) / (znear - zfar)); - m2[3][2] = ((2 * znear * zfar) / (znear - zfar)); + m2[2][2] = ((znear + zfar) / (znear - zfar)); + m2[3][2] = ((2 * znear * zfar) / (znear - zfar)); - mat_multiply(m, m2); + mat_multiply(m, m2); } -ParticleRenderer::ParticleRenderer(unsigned int windowWidth, - unsigned int windowHeight) - : m_pos(0), - m_numParticles(0), - m_pointSize(1.0f), - m_spriteSize(2.0f), - m_vertexShader(0), - m_vertexShaderPoints(0), - m_fragmentShader(0), - m_programPoints(0), - m_programSprites(0), - m_texture(0), - m_pbo(0), - m_vboColor(0), - m_windowWidth(windowWidth), - m_windowHeight(windowHeight), - m_bFp64Positions(false) { - m_camera[0] = 0; - m_camera[1] = 0; - m_camera[2] = 0; - _initGL(); 
+ParticleRenderer::ParticleRenderer(unsigned int windowWidth, unsigned int windowHeight) + : m_pos(0) + , m_numParticles(0) + , m_pointSize(1.0f) + , m_spriteSize(2.0f) + , m_vertexShader(0) + , m_vertexShaderPoints(0) + , m_fragmentShader(0) + , m_programPoints(0) + , m_programSprites(0) + , m_texture(0) + , m_pbo(0) + , m_vboColor(0) + , m_windowWidth(windowWidth) + , m_windowHeight(windowHeight) + , m_bFp64Positions(false) +{ + m_camera[0] = 0; + m_camera[1] = 0; + m_camera[2] = 0; + _initGL(); } ParticleRenderer::~ParticleRenderer() { m_pos = 0; } void ParticleRenderer::resetPBO() { glDeleteBuffers(1, (GLuint *)&m_pbo); } -void ParticleRenderer::setPositions(float *pos, int numParticles) { - m_pos = pos; - m_numParticles = numParticles; +void ParticleRenderer::setPositions(float *pos, int numParticles) +{ + m_pos = pos; + m_numParticles = numParticles; - if (!m_pbo) { - glGenBuffers(1, (GLuint *)&m_pbo); - } - - glBindBuffer(GL_ARRAY_BUFFER, m_pbo); - glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), pos, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); - checkGLErrors("Setting particle float position"); -} - -void ParticleRenderer::setPositions(double *pos, int numParticles) { - m_bFp64Positions = true; - m_pos_fp64 = pos; - m_numParticles = numParticles; - - if (!m_pbo) { - glGenBuffers(1, (GLuint *)&m_pbo); - } - - glBindBuffer(GL_ARRAY_BUFFER, m_pbo); - glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(double), pos, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); - checkGLErrors("Setting particle double position"); -} - -void ParticleRenderer::setColors(float *color, int numParticles) { - glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); - glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), color, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); -} - -void ParticleRenderer::setBaseColor(float color[4]) { - for (int i = 0; i < 4; i++) m_baseColor[i] = color[i]; -} - -void ParticleRenderer::setPBO(unsigned int pbo, int numParticles, bool fp64) { - m_pbo = pbo; - m_numParticles = numParticles; - - if (fp64) m_bFp64Positions = true; -} - -void ParticleRenderer::display() { - glEnable(GL_BLEND); - glBlendFunc(GL_SRC_ALPHA, GL_ONE); - glDepthMask(GL_FALSE); - - glUseProgram(m_programSprites); - - // Set modelview and projection matrices - GLint h_ModelViewMatrix = glGetUniformLocation(m_programSprites, "modelview"); - GLint h_ProjectionMatrix = - glGetUniformLocation(m_programSprites, "projection"); - matrix4 modelview; - matrix4 projection; - mat_identity(modelview); - mat_identity(projection); - mat_translate(modelview, m_camera); - mat_perspective(projection, 60, (float)m_windowWidth / (float)m_windowHeight, - 0.1, 1000.0); - glUniformMatrix4fv(h_ModelViewMatrix, 1, GL_FALSE, (GLfloat *)modelview); - glUniformMatrix4fv(h_ProjectionMatrix, 1, GL_FALSE, (GLfloat *)projection); - - // Set point size - GLint h_PointSize = glGetUniformLocation(m_programSprites, "size"); - glUniform1f(h_PointSize, m_spriteSize); - - // Set base and secondary colors - GLint h_BaseColor = glGetUniformLocation(m_programSprites, "baseColor"); - GLint h_SecondaryColor = - glGetUniformLocation(m_programSprites, "secondaryColor"); - glUniform4f(h_BaseColor, 1.0, 1.0, 1.0, 1.0); - glUniform4f(h_SecondaryColor, m_baseColor[0], m_baseColor[1], m_baseColor[2], - m_baseColor[3]); - - // Set position coords - GLint h_position = glGetAttribLocation(m_programSprites, "a_position"); - glBindBuffer(GL_ARRAY_BUFFER, m_pbo); - glEnableVertexAttribArray(h_position); - 
glVertexAttribPointer(h_position, 4, GL_FLOAT, GL_FALSE, 0, 0); - - GLuint texLoc = glGetUniformLocation(m_programSprites, "splatTexture"); - glUniform1i(texLoc, 0); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, m_texture); - - glDrawArrays(GL_POINTS, 0, m_numParticles); - - glDisableVertexAttribArray(h_position); - - glUseProgram(0); - - glDisable(GL_BLEND); - glDepthMask(GL_TRUE); -} - -const char vertexShader[] = { - "attribute vec4 a_position;" - - "uniform mat4 projection;" - "uniform mat4 modelview;" - "uniform float size;" - - "void main()" - "{" - "float pointSize = 500.0 * size;" - "vec4 vert = a_position;" - "vert.w = 1.0;" - "vec3 pos_eye = vec3(modelview * vert);" - "gl_PointSize = max(1.0, pointSize / (1.0 - pos_eye.z));" - "gl_Position = projection * modelview * a_position;" - "}"}; - -const char fragmentShader[] = { - "uniform sampler2D splatTexture;" - "uniform lowp vec4 baseColor;" - "uniform lowp vec4 secondaryColor;" - - "void main()" - "{" - "lowp vec4 textureColor = (0.6 + 0.4 * baseColor) * " - "texture2D(splatTexture, gl_PointCoord);" - "gl_FragColor = textureColor * secondaryColor;" - "}"}; - -// Checks if the shader is compiled. -static int CheckCompiled(GLuint shader) { - GLint isCompiled = 0; - glGetShaderiv(shader, GL_COMPILE_STATUS, &isCompiled); - - if (!isCompiled) { - GLint infoLen = 0; - glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLen); - - if (infoLen > 1) { - char *infoLog = (char *)malloc(sizeof(char) * infoLen); - - glGetShaderInfoLog(shader, infoLen, NULL, infoLog); - printf("Error compiling program:\n%s\n", infoLog); - free(infoLog); + if (!m_pbo) { + glGenBuffers(1, (GLuint *)&m_pbo); } - return 0; - } - - return 1; + glBindBuffer(GL_ARRAY_BUFFER, m_pbo); + glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), pos, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); + checkGLErrors("Setting particle float position"); } -void ParticleRenderer::_initGL() { - m_vertexShader = glCreateShader(GL_VERTEX_SHADER); - m_fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); +void ParticleRenderer::setPositions(double *pos, int numParticles) +{ + m_bFp64Positions = true; + m_pos_fp64 = pos; + m_numParticles = numParticles; - const char *v = vertexShader; - const char *f = fragmentShader; - glShaderSource(m_vertexShader, 1, &v, 0); - glShaderSource(m_fragmentShader, 1, &f, 0); + if (!m_pbo) { + glGenBuffers(1, (GLuint *)&m_pbo); + } - checkGLErrors("Shader Source"); + glBindBuffer(GL_ARRAY_BUFFER, m_pbo); + glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(double), pos, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); + checkGLErrors("Setting particle double position"); +} - glCompileShader(m_vertexShader); - glCompileShader(m_fragmentShader); +void ParticleRenderer::setColors(float *color, int numParticles) +{ + glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); + glBufferData(GL_ARRAY_BUFFER, numParticles * 4 * sizeof(float), color, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); +} - if (!CheckCompiled(m_vertexShader) || !CheckCompiled(m_fragmentShader)) { - printf("A shader failed to compile.\n"); - exit(1); - } +void ParticleRenderer::setBaseColor(float color[4]) +{ + for (int i = 0; i < 4; i++) + m_baseColor[i] = color[i]; +} - m_programSprites = glCreateProgram(); +void ParticleRenderer::setPBO(unsigned int pbo, int numParticles, bool fp64) +{ + m_pbo = pbo; + m_numParticles = numParticles; - checkGLErrors("create program"); + if (fp64) + m_bFp64Positions = true; +} - glAttachShader(m_programSprites, m_vertexShader); - 
glAttachShader(m_programSprites, m_fragmentShader); +void ParticleRenderer::display() +{ + glEnable(GL_BLEND); + glBlendFunc(GL_SRC_ALPHA, GL_ONE); + glDepthMask(GL_FALSE); - checkGLErrors("attaching shaders"); + glUseProgram(m_programSprites); - glLinkProgram(m_programSprites); + // Set modelview and projection matrices + GLint h_ModelViewMatrix = glGetUniformLocation(m_programSprites, "modelview"); + GLint h_ProjectionMatrix = glGetUniformLocation(m_programSprites, "projection"); + matrix4 modelview; + matrix4 projection; + mat_identity(modelview); + mat_identity(projection); + mat_translate(modelview, m_camera); + mat_perspective(projection, 60, (float)m_windowWidth / (float)m_windowHeight, 0.1, 1000.0); + glUniformMatrix4fv(h_ModelViewMatrix, 1, GL_FALSE, (GLfloat *)modelview); + glUniformMatrix4fv(h_ProjectionMatrix, 1, GL_FALSE, (GLfloat *)projection); - checkGLErrors("linking program"); + // Set point size + GLint h_PointSize = glGetUniformLocation(m_programSprites, "size"); + glUniform1f(h_PointSize, m_spriteSize); - EGLint linked; - glGetProgramiv(m_programSprites, GL_LINK_STATUS, &linked); - if (!linked) { - printf("A shader failed to link.\n"); - exit(1); - } + // Set base and secondary colors + GLint h_BaseColor = glGetUniformLocation(m_programSprites, "baseColor"); + GLint h_SecondaryColor = glGetUniformLocation(m_programSprites, "secondaryColor"); + glUniform4f(h_BaseColor, 1.0, 1.0, 1.0, 1.0); + glUniform4f(h_SecondaryColor, m_baseColor[0], m_baseColor[1], m_baseColor[2], m_baseColor[3]); - _createTexture(32); + // Set position coords + GLint h_position = glGetAttribLocation(m_programSprites, "a_position"); + glBindBuffer(GL_ARRAY_BUFFER, m_pbo); + glEnableVertexAttribArray(h_position); + glVertexAttribPointer(h_position, 4, GL_FLOAT, GL_FALSE, 0, 0); - glGenBuffers(1, (GLuint *)&m_vboColor); - glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); - glBufferData(GL_ARRAY_BUFFER, m_numParticles * 4 * sizeof(float), 0, - GL_STATIC_DRAW); - glBindBuffer(GL_ARRAY_BUFFER, 0); + GLuint texLoc = glGetUniformLocation(m_programSprites, "splatTexture"); + glUniform1i(texLoc, 0); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, m_texture); + + glDrawArrays(GL_POINTS, 0, m_numParticles); + + glDisableVertexAttribArray(h_position); + + glUseProgram(0); + + glDisable(GL_BLEND); + glDepthMask(GL_TRUE); +} + +const char vertexShader[] = {"attribute vec4 a_position;" + + "uniform mat4 projection;" + "uniform mat4 modelview;" + "uniform float size;" + + "void main()" + "{" + "float pointSize = 500.0 * size;" + "vec4 vert = a_position;" + "vert.w = 1.0;" + "vec3 pos_eye = vec3(modelview * vert);" + "gl_PointSize = max(1.0, pointSize / (1.0 - pos_eye.z));" + "gl_Position = projection * modelview * a_position;" + "}"}; + +const char fragmentShader[] = {"uniform sampler2D splatTexture;" + "uniform lowp vec4 baseColor;" + "uniform lowp vec4 secondaryColor;" + + "void main()" + "{" + "lowp vec4 textureColor = (0.6 + 0.4 * baseColor) * " + "texture2D(splatTexture, gl_PointCoord);" + "gl_FragColor = textureColor * secondaryColor;" + "}"}; + +// Checks if the shader is compiled. 
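// Returns 1 if the shader compiled successfully and 0 otherwise; on failure
// the GL info log, when one is available, is printed before returning.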
+static int CheckCompiled(GLuint shader) +{ + GLint isCompiled = 0; + glGetShaderiv(shader, GL_COMPILE_STATUS, &isCompiled); + + if (!isCompiled) { + GLint infoLen = 0; + glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLen); + + if (infoLen > 1) { + char *infoLog = (char *)malloc(sizeof(char) * infoLen); + + glGetShaderInfoLog(shader, infoLen, NULL, infoLog); + printf("Error compiling program:\n%s\n", infoLog); + free(infoLog); + } + + return 0; + } + + return 1; +} + +void ParticleRenderer::_initGL() +{ + m_vertexShader = glCreateShader(GL_VERTEX_SHADER); + m_fragmentShader = glCreateShader(GL_FRAGMENT_SHADER); + + const char *v = vertexShader; + const char *f = fragmentShader; + glShaderSource(m_vertexShader, 1, &v, 0); + glShaderSource(m_fragmentShader, 1, &f, 0); + + checkGLErrors("Shader Source"); + + glCompileShader(m_vertexShader); + glCompileShader(m_fragmentShader); + + if (!CheckCompiled(m_vertexShader) || !CheckCompiled(m_fragmentShader)) { + printf("A shader failed to compile.\n"); + exit(1); + } + + m_programSprites = glCreateProgram(); + + checkGLErrors("create program"); + + glAttachShader(m_programSprites, m_vertexShader); + glAttachShader(m_programSprites, m_fragmentShader); + + checkGLErrors("attaching shaders"); + + glLinkProgram(m_programSprites); + + checkGLErrors("linking program"); + + EGLint linked; + glGetProgramiv(m_programSprites, GL_LINK_STATUS, &linked); + if (!linked) { + printf("A shader failed to link.\n"); + exit(1); + } + + _createTexture(32); + + glGenBuffers(1, (GLuint *)&m_vboColor); + glBindBuffer(GL_ARRAY_BUFFER, m_vboColor); + glBufferData(GL_ARRAY_BUFFER, m_numParticles * 4 * sizeof(float), 0, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); } //------------------------------------------------------------------------------ @@ -325,50 +325,53 @@ void ParticleRenderer::_initGL() { * EvalHermite(float pA, float pB, float vA, float vB, float u) * @brief Evaluates Hermite basis functions for the specified coefficients. 
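* The cubic Hermite basis evaluated below is
*   B0(u) = 2u^3 - 3u^2 + 1,   B1(u) = -2u^3 + 3u^2,
*   B2(u) = u^3 - 2u^2 + u,    B3(u) = u^3 - u,
* combined as B0*pA + B1*pB + B2*vA + B3*vB. createGaussianMap() calls
* evalHermite(1, 0, 0, 0, dist), which reduces to B0(dist): a smooth falloff
* from 1 at dist = 0 to 0 at dist = 1, used to shade the splat texture.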
*/ -inline float evalHermite(float pA, float pB, float vA, float vB, float u) { - float u2 = (u * u), u3 = u2 * u; - float B0 = 2 * u3 - 3 * u2 + 1; - float B1 = -2 * u3 + 3 * u2; - float B2 = u3 - 2 * u2 + u; - float B3 = u3 - u; - return (B0 * pA + B1 * pB + B2 * vA + B3 * vB); +inline float evalHermite(float pA, float pB, float vA, float vB, float u) +{ + float u2 = (u * u), u3 = u2 * u; + float B0 = 2 * u3 - 3 * u2 + 1; + float B1 = -2 * u3 + 3 * u2; + float B2 = u3 - 2 * u2 + u; + float B3 = u3 - u; + return (B0 * pA + B1 * pB + B2 * vA + B3 * vB); } -unsigned char *createGaussianMap(int N) { - float *M = new float[2 * N * N]; - unsigned char *B = new unsigned char[4 * N * N]; - float X, Y, Y2, Dist; - float Incr = 2.0f / N; - int i = 0; - int j = 0; - Y = -1.0f; +unsigned char *createGaussianMap(int N) +{ + float *M = new float[2 * N * N]; + unsigned char *B = new unsigned char[4 * N * N]; + float X, Y, Y2, Dist; + float Incr = 2.0f / N; + int i = 0; + int j = 0; + Y = -1.0f; - // float mmax = 0; - for (int y = 0; y < N; y++, Y += Incr) { - Y2 = Y * Y; - X = -1.0f; + // float mmax = 0; + for (int y = 0; y < N; y++, Y += Incr) { + Y2 = Y * Y; + X = -1.0f; - for (int x = 0; x < N; x++, X += Incr, i += 2, j += 4) { - Dist = (float)sqrtf(X * X + Y2); + for (int x = 0; x < N; x++, X += Incr, i += 2, j += 4) { + Dist = (float)sqrtf(X * X + Y2); - if (Dist > 1) Dist = 1; + if (Dist > 1) + Dist = 1; - M[i + 1] = M[i] = evalHermite(1.0f, 0, 0, 0, Dist); - B[j + 3] = B[j + 2] = B[j + 1] = B[j] = (unsigned char)(M[i] * 255); + M[i + 1] = M[i] = evalHermite(1.0f, 0, 0, 0, Dist); + B[j + 3] = B[j + 2] = B[j + 1] = B[j] = (unsigned char)(M[i] * 255); + } } - } - delete[] M; - return (B); + delete[] M; + return (B); } -void ParticleRenderer::_createTexture(int resolution) { - unsigned char *data = createGaussianMap(resolution); - glGenTextures(1, (GLuint *)&m_texture); - glBindTexture(GL_TEXTURE_2D, m_texture); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, - GL_LINEAR); //_MIPMAP_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, resolution, resolution, 0, GL_RGBA, - GL_UNSIGNED_BYTE, data); +void ParticleRenderer::_createTexture(int resolution) +{ + unsigned char *data = createGaussianMap(resolution); + glGenTextures(1, (GLuint *)&m_texture); + glBindTexture(GL_TEXTURE_2D, m_texture); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, + GL_LINEAR); //_MIPMAP_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, resolution, resolution, 0, GL_RGBA, GL_UNSIGNED_BYTE, data); } diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.h index e028c8e5..38d980cf 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/render_particles.h @@ -31,80 +31,76 @@ #include #include #include - #include typedef float matrix4[4][4]; typedef float vector3[3]; // check for OpenGL errors -inline void checkGLErrors(const char *s) { - EGLenum error; +inline void checkGLErrors(const char *s) +{ + EGLenum error; - while ((error = glGetError()) != GL_NO_ERROR) { - fprintf(stderr, "%s: error - %d\n", s, error); - } + while ((error = glGetError()) != GL_NO_ERROR) { + fprintf(stderr, "%s: error - %d\n", s, error); + } } -class ParticleRenderer { - public: - ParticleRenderer(unsigned int windowWidth = 720, - 
unsigned int windowHeight = 480); - ~ParticleRenderer(); +class ParticleRenderer +{ +public: + ParticleRenderer(unsigned int windowWidth = 720, unsigned int windowHeight = 480); + ~ParticleRenderer(); - void setPositions(float *pos, int numParticles); - void setPositions(double *pos, int numParticles); - void setBaseColor(float color[4]); - void setColors(float *color, int numParticles); - void setPBO(unsigned int pbo, int numParticles, bool fp64); + void setPositions(float *pos, int numParticles); + void setPositions(double *pos, int numParticles); + void setBaseColor(float color[4]); + void setColors(float *color, int numParticles); + void setPBO(unsigned int pbo, int numParticles, bool fp64); - enum DisplayMode { - PARTICLE_POINTS, - PARTICLE_SPRITES, - PARTICLE_SPRITES_COLOR, - PARTICLE_NUM_MODES - }; + enum DisplayMode { PARTICLE_POINTS, PARTICLE_SPRITES, PARTICLE_SPRITES_COLOR, PARTICLE_NUM_MODES }; - void display(); + void display(); - void setPointSize(float size) { m_pointSize = size; } - void setSpriteSize(float size) { m_spriteSize = size; } + void setPointSize(float size) { m_pointSize = size; } + void setSpriteSize(float size) { m_spriteSize = size; } - void setCameraPos(vector3 camera_pos) { - m_camera[0] = camera_pos[0]; - m_camera[1] = camera_pos[1]; - m_camera[2] = camera_pos[2]; - } + void setCameraPos(vector3 camera_pos) + { + m_camera[0] = camera_pos[0]; + m_camera[1] = camera_pos[1]; + m_camera[2] = camera_pos[2]; + } - void resetPBO(); + void resetPBO(); - protected: // methods - void _initGL(); - void _createTexture(int resolution); +protected: // methods + void _initGL(); + void _createTexture(int resolution); - protected: // data - float *m_pos; - double *m_pos_fp64; - int m_numParticles; +protected: // data + float *m_pos; + double *m_pos_fp64; + int m_numParticles; - float m_pointSize; - float m_spriteSize; - vector3 m_camera; + float m_pointSize; + float m_spriteSize; + vector3 m_camera; - unsigned int m_vertexShader; - unsigned int m_vertexShaderPoints; - unsigned int m_fragmentShader; - unsigned int m_programPoints; - unsigned int m_programSprites; - unsigned int m_texture; - unsigned int m_pbo; - unsigned int m_vboColor; - unsigned int m_windowWidth; - unsigned int m_windowHeight; + unsigned int m_vertexShader; + unsigned int m_vertexShaderPoints; + unsigned int m_fragmentShader; + unsigned int m_programPoints; + unsigned int m_programSprites; + unsigned int m_texture; + unsigned int m_pbo; + unsigned int m_vboColor; + unsigned int m_windowWidth; + unsigned int m_windowHeight; - float m_baseColor[4]; + float m_baseColor[4]; - bool m_bFp64Positions; + bool m_bFp64Positions; }; -#endif //__ RENDER_PARTICLES__ +#endif //__ RENDER_PARTICLES__ diff --git a/Samples/8_Platform_Specific/Tegra/nbody_opengles/tipsy.h b/Samples/8_Platform_Specific/Tegra/nbody_opengles/tipsy.h index 99692a9c..fc1faa90 100644 --- a/Samples/8_Platform_Specific/Tegra/nbody_opengles/tipsy.h +++ b/Samples/8_Platform_Specific/Tegra/nbody_opengles/tipsy.h @@ -17,11 +17,11 @@ struct gas_particle Real rho; Real temp; Real hsmooth; - Real metals ; - Real phi ; -} ; + Real metals; + Real phi; +}; -//struct gas_particle *gas_particles; +// struct gas_particle *gas_particles; struct dark_particle { @@ -29,45 +29,45 @@ struct dark_particle Real pos[MAXDIM]; Real vel[MAXDIM]; Real eps; - int phi ; -} ; + int phi; +}; -//struct dark_particle *dark_particles; +// struct dark_particle *dark_particles; struct star_particle { Real mass; Real pos[MAXDIM]; Real vel[MAXDIM]; - Real metals ; - Real tform ; + 
Real metals; + Real tform; Real eps; - int phi ; -} ; + int phi; +}; -//struct star_particle *star_particles; +// struct star_particle *star_particles; struct dump { - double time ; - int nbodies ; - int ndim ; - int nsph ; - int ndark ; - int nstar ; -} ; + double time; + int nbodies; + int ndim; + int nsph; + int ndark; + int nstar; +}; -typedef struct dump header ; +typedef struct dump header; template -void read_tipsy_file(vector &bodyPositions, - vector &bodyVelocities, - vector &bodiesIDs, +void read_tipsy_file(vector &bodyPositions, + vector &bodyVelocities, + vector &bodiesIDs, const std::string &fileName, - int &NTotal, - int &NFirst, - int &NSecond, - int &NThird) + int &NTotal, + int &NFirst, + int &NSecond, + int &NThird) { /* Read in our custom version of the tipsy file format written by @@ -82,59 +82,55 @@ void read_tipsy_file(vector &bodyPositions, ifstream inputFile(fullFileName, ios::in | ios::binary); - if (!inputFile.is_open()) - { + if (!inputFile.is_open()) { cout << "Can't open input file \n"; exit(EXIT_SUCCESS); } - dump h; + dump h; inputFile.read((char *)&h, sizeof(h)); - int idummy; + int idummy; real4 positions; real4 velocity; - //Read tipsy header - NTotal = h.nbodies; - NFirst = h.ndark; - NSecond = h.nstar; - NThird = h.nsph; + // Read tipsy header + NTotal = h.nbodies; + NFirst = h.ndark; + NSecond = h.nstar; + NThird = h.nsph; - //Start reading + // Start reading int particleCount = 0; dark_particle d; star_particle s; - for (int i=0; i < NTotal; i++) - { - if (i < NFirst) - { + for (int i = 0; i < NTotal; i++) { + if (i < NFirst) { inputFile.read((char *)&d, sizeof(d)); - velocity.w = d.eps; - positions.w = d.mass; - positions.x = d.pos[0]; - positions.y = d.pos[1]; - positions.z = d.pos[2]; - velocity.x = d.vel[0]; - velocity.y = d.vel[1]; - velocity.z = d.vel[2]; - idummy = d.phi; + velocity.w = d.eps; + positions.w = d.mass; + positions.x = d.pos[0]; + positions.y = d.pos[1]; + positions.z = d.pos[2]; + velocity.x = d.vel[0]; + velocity.y = d.vel[1]; + velocity.z = d.vel[2]; + idummy = d.phi; } - else - { + else { inputFile.read((char *)&s, sizeof(s)); - velocity.w = s.eps; - positions.w = s.mass; - positions.x = s.pos[0]; - positions.y = s.pos[1]; - positions.z = s.pos[2]; - velocity.x = s.vel[0]; - velocity.y = s.vel[1]; - velocity.z = s.vel[2]; - idummy = s.phi; + velocity.w = s.eps; + positions.w = s.mass; + positions.x = s.pos[0]; + positions.y = s.pos[1]; + positions.z = s.pos[2]; + velocity.x = s.vel[0]; + velocity.y = s.vel[1]; + velocity.z = s.vel[2]; + idummy = s.phi; } bodyPositions.push_back(positions); @@ -142,18 +138,16 @@ void read_tipsy_file(vector &bodyPositions, bodiesIDs.push_back(idummy); particleCount++; - }//end for + } // end for // round up to a multiple of 256 bodies since our kernel only supports that... 
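// For example, NTotal = 1000 yields newTotal = ((1000 / 256) + 1) * 256 = 1024;
// the modulo guard below leaves exact multiples of 256 unchanged.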
int newTotal = NTotal; - if (NTotal % 256) - { + if (NTotal % 256) { newTotal = ((NTotal / 256) + 1) * 256; } - for (int i = NTotal; i < newTotal; i++) - { + for (int i = NTotal; i < newTotal; i++) { positions.w = positions.x = positions.y = positions.z = 0; velocity.x = velocity.y = velocity.z = 0; bodyPositions.push_back(positions); diff --git a/Samples/8_Platform_Specific/Tegra/simpleGLES/README.md b/Samples/8_Platform_Specific/Tegra/simpleGLES/README.md index f90fbc58..39e61d85 100644 --- a/Samples/8_Platform_Specific/Tegra/simpleGLES/README.md +++ b/Samples/8_Platform_Specific/Tegra/simpleGLES/README.md @@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## References (for more details) - diff --git a/Samples/8_Platform_Specific/Tegra/simpleGLES/graphics_interface.c b/Samples/8_Platform_Specific/Tegra/simpleGLES/graphics_interface.c index a6905959..8dba0b44 100644 --- a/Samples/8_Platform_Specific/Tegra/simpleGLES/graphics_interface.c +++ b/Samples/8_Platform_Specific/Tegra/simpleGLES/graphics_interface.c @@ -26,45 +26,42 @@ */ Display *display; -int screen; -Window win = 0; +int screen; +Window win = 0; #include -//#include // not (yet) needed +// #include // not (yet) needed #include #include -#define GET_GLERROR(ret) \ - { \ - GLenum err = glGetError(); \ - if (err != GL_NO_ERROR) \ - { \ - fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, \ - err); \ - fflush(stderr); \ - \ - switch (err) \ - { \ - case GL_INVALID_ENUM: \ - printf("GL_INVALID_ENUM\n"); \ - break; \ - case GL_INVALID_VALUE: \ - printf("GL_INVALID_VALUE\n"); \ - break; \ - case GL_INVALID_OPERATION: \ - printf("GL_INVALID_OPERATION\n"); \ - break; \ - case GL_OUT_OF_MEMORY: \ - printf("GL_OUT_OF_MEMORY\n"); \ - break; \ - case GL_INVALID_FRAMEBUFFER_OPERATION: \ - printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \ - break; \ - default: \ - printf("UKNOWN OPENGL ERROR CODE 0x%x\n", err); \ - }; \ - } \ - } +#define GET_GLERROR(ret) \ + { \ + GLenum err = glGetError(); \ + if (err != GL_NO_ERROR) { \ + fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, err); \ + fflush(stderr); \ + \ + switch (err) { \ + case GL_INVALID_ENUM: \ + printf("GL_INVALID_ENUM\n"); \ + break; \ + case GL_INVALID_VALUE: \ + printf("GL_INVALID_VALUE\n"); \ + break; \ + case GL_INVALID_OPERATION: \ + printf("GL_INVALID_OPERATION\n"); \ + break; \ + case GL_OUT_OF_MEMORY: \ + printf("GL_OUT_OF_MEMORY\n"); \ + break; \ + case GL_INVALID_FRAMEBUFFER_OPERATION: \ + printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \ + break; \ + default: \ + printf("UKNOWN OPENGL ERROR CODE 0x%x\n", err); \ + }; \ + } \ + } EGLDisplay eglDisplay = EGL_NO_DISPLAY; EGLSurface eglSurface = EGL_NO_SURFACE; @@ -85,111 +82,128 @@ typedef GL_APICALL void (*GL_APIENTRY glGetProgramivTYPE) (GLuint program, GLenu glGetProgramivTYPE my_glGetProgramiv; #endif -int graphics_setup_window(int xpos, int ypos, int width, int height, - const char *windowname) +int graphics_setup_window(int xpos, int ypos, int width, int height, const char *windowname) { #ifdef USE_GL - // OpenGL 4.3 Core Profile creation through EGL - would be even available on - // desktop, but CUDA interop doesn't yet work for OpenGL context established - // through EGL - EGLint configAttrs[] = {EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, - // EGL_DEPTH_SIZE, 16, - EGL_SAMPLE_BUFFERS, 0, EGL_SAMPLES, 0, EGL_CONFORMANT, - EGL_OPENGL_BIT, 
- // EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT, - // EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR, 1, - EGL_NONE}; - EGLint contextAttrs[] = {// EGL_CONTEXT_MAJOR_VERSION_KHR, 4, - // EGL_CONTEXT_MINOR_VERSION_KHR, 3, - EGL_NONE}; + // OpenGL 4.3 Core Profile creation through EGL - would be even available on + // desktop, but CUDA interop doesn't yet work for OpenGL context established + // through EGL + EGLint configAttrs[] = {EGL_RED_SIZE, + 1, + EGL_GREEN_SIZE, + 1, + EGL_BLUE_SIZE, + 1, + // EGL_DEPTH_SIZE, 16, + EGL_SAMPLE_BUFFERS, + 0, + EGL_SAMPLES, + 0, + EGL_CONFORMANT, + EGL_OPENGL_BIT, + // EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT, + // EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR, 1, + EGL_NONE}; + EGLint contextAttrs[] = {// EGL_CONTEXT_MAJOR_VERSION_KHR, 4, + // EGL_CONTEXT_MINOR_VERSION_KHR, 3, + EGL_NONE}; #else // OpenGL ES 3.1 - EGLint configAttrs[] = { - EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, EGL_DEPTH_SIZE, 16, - EGL_SAMPLE_BUFFERS, 0, EGL_SAMPLES, 0, - // EGL_CONFORMANT, EGL_OPENGL_BIT, - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // 3_BIT_KHR, - EGL_NONE}; - EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; + EGLint configAttrs[] = {EGL_RED_SIZE, + 1, + EGL_GREEN_SIZE, + 1, + EGL_BLUE_SIZE, + 1, + EGL_DEPTH_SIZE, + 16, + EGL_SAMPLE_BUFFERS, + 0, + EGL_SAMPLES, + 0, + // EGL_CONFORMANT, EGL_OPENGL_BIT, + EGL_RENDERABLE_TYPE, + EGL_OPENGL_ES2_BIT, // 3_BIT_KHR, + EGL_NONE}; + EGLint contextAttrs[] = {EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE}; #endif - EGLint windowAttrs[] = {EGL_NONE}; - EGLConfig *configList = NULL; - EGLint configCount; + EGLint windowAttrs[] = {EGL_NONE}; + EGLConfig *configList = NULL; + EGLint configCount; - display = XOpenDisplay(NULL); - if (!display) - error_exit("Error opening X display.\n"); + display = XOpenDisplay(NULL); + if (!display) + error_exit("Error opening X display.\n"); - screen = DefaultScreen(display); + screen = DefaultScreen(display); - eglDisplay = eglGetDisplay(display); - if (eglDisplay == EGL_NO_DISPLAY) - error_exit("EGL failed to obtain display\n"); + eglDisplay = eglGetDisplay(display); + if (eglDisplay == EGL_NO_DISPLAY) + error_exit("EGL failed to obtain display\n"); - if (!eglInitialize(eglDisplay, 0, 0)) - error_exit("EGL failed to initialize\n"); + if (!eglInitialize(eglDisplay, 0, 0)) + error_exit("EGL failed to initialize\n"); - if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || - !configCount) - error_exit("EGL failed to return any matching configurations\n"); + if (!eglChooseConfig(eglDisplay, configAttrs, NULL, 0, &configCount) || !configCount) + error_exit("EGL failed to return any matching configurations\n"); - configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); + configList = (EGLConfig *)malloc(configCount * sizeof(EGLConfig)); - if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, - &configCount) || - !configCount) - error_exit("EGL failed to populate configuration list\n"); + if (!eglChooseConfig(eglDisplay, configAttrs, configList, configCount, &configCount) || !configCount) + error_exit("EGL failed to populate configuration list\n"); - win = XCreateSimpleWindow(display, RootWindow(display, screen), xpos, ypos, - width, height, 0, BlackPixel(display, screen), - WhitePixel(display, screen)); + win = XCreateSimpleWindow(display, + RootWindow(display, screen), + xpos, + ypos, + width, + height, + 0, + BlackPixel(display, screen), + WhitePixel(display, screen)); - XStoreName(display, win, windowname); + XStoreName(display, win, windowname); - 
XSelectInput(display, win, - ExposureMask | ButtonPressMask | KeyPressMask | - StructureNotifyMask | ButtonReleaseMask | KeyReleaseMask | - EnterWindowMask | LeaveWindowMask | PointerMotionMask | - Button1MotionMask | Button2MotionMask | - VisibilityChangeMask | ColormapChangeMask); + XSelectInput(display, + win, + ExposureMask | ButtonPressMask | KeyPressMask | StructureNotifyMask | ButtonReleaseMask + | KeyReleaseMask | EnterWindowMask | LeaveWindowMask | PointerMotionMask | Button1MotionMask + | Button2MotionMask | VisibilityChangeMask | ColormapChangeMask); - XMapWindow(display, win); + XMapWindow(display, win); - eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], - (EGLNativeWindowType)win, windowAttrs); - if (!eglSurface) - error_exit("EGL couldn't create window\n"); + eglSurface = eglCreateWindowSurface(eglDisplay, configList[0], (EGLNativeWindowType)win, windowAttrs); + if (!eglSurface) + error_exit("EGL couldn't create window\n"); #ifdef USE_GL - eglBindAPI(EGL_OPENGL_API); + eglBindAPI(EGL_OPENGL_API); #else - eglBindAPI(EGL_OPENGL_ES_API); + eglBindAPI(EGL_OPENGL_ES_API); #endif - eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); - if (!eglContext) - error_exit("EGL couldn't create context\n"); + eglContext = eglCreateContext(eglDisplay, configList[0], NULL, contextAttrs); + if (!eglContext) + error_exit("EGL couldn't create context\n"); - if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) - error_exit("EGL couldn't make context/surface current\n"); + if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) + error_exit("EGL couldn't make context/surface current\n"); - EGLint Context_RendererType; - eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, - &Context_RendererType); + EGLint Context_RendererType; + eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, &Context_RendererType); - switch (Context_RendererType) - { - case EGL_OPENGL_API: - printf("Using OpenGL API\n"); - break; - case EGL_OPENGL_ES_API: - printf("Using OpenGL ES API"); - break; - case EGL_OPENVG_API: - error_exit("Context Query Returned OpenVG. This is Unsupported\n"); - default: - error_exit("Unknown Context Type. %04X\n", Context_RendererType); - } + switch (Context_RendererType) { + case EGL_OPENGL_API: + printf("Using OpenGL API\n"); + break; + case EGL_OPENGL_ES_API: + printf("Using OpenGL ES API"); + break; + case EGL_OPENVG_API: + error_exit("Context Query Returned OpenVG. This is Unsupported\n"); + default: + error_exit("Unknown Context Type. 
%04X\n", Context_RendererType); + } #if 0 // obtain API function pointers _manually_ (see function pointer \ // declarations above) @@ -207,33 +221,29 @@ int graphics_setup_window(int xpos, int ypos, int width, int height, GL_APICALL void GL_APIENTRY glBindBuffer (GLenum target, GLuint buffer); #endif - return 1; + return 1; } -void graphics_set_windowtitle(const char *windowname) -{ - XStoreName(display, win, windowname); -} +void graphics_set_windowtitle(const char *windowname) { XStoreName(display, win, windowname); } void graphics_swap_buffers() { eglSwapBuffers(eglDisplay, eglSurface); } void graphics_close_window() { - if (eglDisplay != EGL_NO_DISPLAY) - { - eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); + if (eglDisplay != EGL_NO_DISPLAY) { + eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); - if (eglContext != EGL_NO_CONTEXT) - eglDestroyContext(eglDisplay, eglContext); + if (eglContext != EGL_NO_CONTEXT) + eglDestroyContext(eglDisplay, eglContext); - if (eglSurface != EGL_NO_SURFACE) - eglDestroySurface(eglDisplay, eglSurface); + if (eglSurface != EGL_NO_SURFACE) + eglDestroySurface(eglDisplay, eglSurface); - eglTerminate(eglDisplay); - } + eglTerminate(eglDisplay); + } - if (win) - XDestroyWindow(display, win); + if (win) + XDestroyWindow(display, win); - XCloseDisplay(display); + XCloseDisplay(display); } diff --git a/Samples/8_Platform_Specific/Tegra/simpleGLES/simpleGLES.cu b/Samples/8_Platform_Specific/Tegra/simpleGLES/simpleGLES.cu index efb479ff..108c88db 100644 --- a/Samples/8_Platform_Specific/Tegra/simpleGLES/simpleGLES.cu +++ b/Samples/8_Platform_Specific/Tegra/simpleGLES/simpleGLES.cu @@ -41,22 +41,22 @@ */ // includes, system -#include -#include -#include -#include - -#include -#include #include #include +#include +#include +#include +#include +#include +#include -void error_exit(const char *format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(1); +void error_exit(const char *format, ...) +{ + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + exit(1); } #include "graphics_interface.c" @@ -68,64 +68,64 @@ void error_exit(const char *format, ...) 
{ #endif // includes, cuda -#include #include +#include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check -//#include // helper functions for CUDA/GL interop +#include // helper functions for CUDA error check +// #include // helper functions for CUDA/GL interop #include #define MAX_EPSILON_ERROR 0.0f -#define THRESHOLD 0.0f -#define REFRESH_DELAY 1 // ms +#define THRESHOLD 0.0f +#define REFRESH_DELAY 1 // ms -#define GUI_IDLE 0x100 -#define GUI_ROTATE 0x101 +#define GUI_IDLE 0x100 +#define GUI_ROTATE 0x101 #define GUI_TRANSLATE 0x102 int gui_mode; //////////////////////////////////////////////////////////////////////////////// // constants -const unsigned int window_width = 512; +const unsigned int window_width = 512; const unsigned int window_height = 512; -const unsigned int mesh_width = 256; +const unsigned int mesh_width = 256; const unsigned int mesh_height = 256; // OpenGL ES variables and interop with CUDA C -GLuint mesh_vao, mesh_vbo; +GLuint mesh_vao, mesh_vbo; struct cudaGraphicsResource *cuda_vbo_resource; -void *d_vbo_buffer = NULL; +void *d_vbo_buffer = NULL; float g_fAnim = 0.0; // UI / mouse controls -int mouse_old_x, mouse_old_y; -int mouse_buttons = 0; +int mouse_old_x, mouse_old_y; +int mouse_buttons = 0; float rotate_x = 0.0, rotate_y = 0.0; float translate_z = -3.0; StopWatchInterface *timer = NULL; // Frame statistics -int frame; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -float avgFPS = 0.0f; -unsigned int frameCount = 0; +int frame; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +float avgFPS = 0.0f; +unsigned int frameCount = 0; unsigned int g_TotalErrors = 0; // Auto-Verification Code bool g_bQAReadback = false; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #define MAX(a, b) ((a > b) ? a : b) @@ -140,76 +140,76 @@ void checkResultCuda(int argc, char **argv, const GLuint &vbo); const char *sSDKsample = "simpleGLES (VBO)"; -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - fpsCount = 0; - fpsLimit = (int)MAX(avgFPS, 1.f); + if (fpsCount == fpsLimit) { + avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + fpsCount = 0; + fpsLimit = (int)MAX(avgFPS, 1.f); - sdkResetTimer(&timer); - } + sdkResetTimer(&timer); + } - char fps[256]; - sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", - avgFPS); - graphics_set_windowtitle(fps); + char fps[256]; + sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", avgFPS); + graphics_set_windowtitle(fps); } /////////////////////////////////////////////////////////////////////////////// //! Simple kernel to modify vertex positions in sine wave pattern //! 
@param data data in global memory /////////////////////////////////////////////////////////////////////////////// -__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, - unsigned int height, float time) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, unsigned int height, float time) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - // calculate uv coordinates - float u = x / (float)width; - float v = y / (float)height; - u = u * 2.0f - 1.0f; - v = v * 2.0f - 1.0f; + // calculate uv coordinates + float u = x / (float)width; + float v = y / (float)height; + u = u * 2.0f - 1.0f; + v = v * 2.0f - 1.0f; - // calculate simple sine wave pattern - float freq = 4.0f; - float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; - // write output vertex - pos[y * width + x] = make_float4(u, w, v, 1.0f); + // write output vertex + pos[y * width + x] = make_float4(u, w, v, 1.0f); } -void launch_kernel(float4 *pos, unsigned int mesh_width, - unsigned int mesh_height, float time) { - // execute the kernel - dim3 block(8, 8, 1); - dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); - simple_vbo_kernel<<>>(pos, mesh_width, mesh_height, time); +void launch_kernel(float4 *pos, unsigned int mesh_width, unsigned int mesh_height, float time) +{ + // execute the kernel + dim3 block(8, 8, 1); + dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + simple_vbo_kernel<<>>(pos, mesh_width, mesh_height, time); } //////////////////////////////////////////////////////////////////////////////// //! 
Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void runCuda(struct cudaGraphicsResource **vbo_resource) { - // map OpenGL buffer object for writing from CUDA - float4 *dptr; - cudaGraphicsMapResources(1, vbo_resource, 0); - size_t num_bytes; - cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, - *vbo_resource); - // printf("Sample CUDA mapped VBO: May access %ld bytes\n", num_bytes); +void runCuda(struct cudaGraphicsResource **vbo_resource) +{ + // map OpenGL buffer object for writing from CUDA + float4 *dptr; + cudaGraphicsMapResources(1, vbo_resource, 0); + size_t num_bytes; + cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, *vbo_resource); + // printf("Sample CUDA mapped VBO: May access %ld bytes\n", num_bytes); - // execute the kernel - // dim3 block(8, 8, 1); - // dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); - // kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); + // execute the kernel + // dim3 block(8, 8, 1); + // dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + // kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); - launch_kernel(dptr, mesh_width, mesh_height, g_fAnim); + launch_kernel(dptr, mesh_width, mesh_height, g_fAnim); - // unmap buffer object - cudaGraphicsUnmapResources(1, vbo_resource, 0); + // unmap buffer object + cudaGraphicsUnmapResources(1, vbo_resource, 0); } #ifdef _WIN32 @@ -222,406 +222,410 @@ void runCuda(struct cudaGraphicsResource **vbo_resource) { #endif #endif -void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) { - printf("sdkDumpBin: <%s>\n", filename); - FILE *fp; - FOPEN(fp, filename, "wb"); - fwrite(data, bytes, 1, fp); - fflush(fp); - fclose(fp); +void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) +{ + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); } //////////////////////////////////////////////////////////////////////////////// //! 
Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void runAutoTest(int devID, char **argv, char *ref_file) { - char *reference_file = NULL; - void *imageData = malloc(mesh_width * mesh_height * sizeof(float)); +void runAutoTest(int devID, char **argv, char *ref_file) +{ + char *reference_file = NULL; + void *imageData = malloc(mesh_width * mesh_height * sizeof(float)); - // execute the kernel - launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim); + // execute the kernel + launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim); - cudaDeviceSynchronize(); - getLastCudaError("launch_kernel failed"); + cudaDeviceSynchronize(); + getLastCudaError("launch_kernel failed"); - cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float), cudaMemcpyDeviceToHost); - sdkDumpBin2(imageData, mesh_width * mesh_height * sizeof(float), - "simpleGL.bin"); - reference_file = sdkFindFilePath(ref_file, argv[0]); + sdkDumpBin2(imageData, mesh_width * mesh_height * sizeof(float), "simpleGL.bin"); + reference_file = sdkFindFilePath(ref_file, argv[0]); - if (reference_file && - !sdkCompareBin2BinFloat("simpleGL.bin", reference_file, - mesh_width * mesh_height * sizeof(float), - MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) { - g_TotalErrors++; - } + if (reference_file + && !sdkCompareBin2BinFloat("simpleGL.bin", + reference_file, + mesh_width * mesh_height * sizeof(float), + MAX_EPSILON_ERROR, + THRESHOLD, + pArgv[0])) { + g_TotalErrors++; + } } //////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// -void display_thisframe(float time_delta) { - sdkStartTimer(&timer); +void display_thisframe(float time_delta) +{ + sdkStartTimer(&timer); - // run CUDA kernel to generate vertex positions - runCuda(&cuda_vbo_resource); + // run CUDA kernel to generate vertex positions + runCuda(&cuda_vbo_resource); - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - // GET_GLERROR(0); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + // GET_GLERROR(0); - // set view matrix: broken, it doesn't work in OpenGL ES! Must put into shader - // glMatrixMode(GL_MODELVIEW); - // glLoadIdentity(); - // glTranslatef(0.0, 0.0, translate_z); - // glRotatef(rotate_x, 1.0, 0.0, 0.0); - // glRotatef(rotate_y, 0.0, 1.0, 0.0); + // set view matrix: broken, it doesn't work in OpenGL ES! Must put into shader + // glMatrixMode(GL_MODELVIEW); + // glLoadIdentity(); + // glTranslatef(0.0, 0.0, translate_z); + // glRotatef(rotate_x, 1.0, 0.0, 0.0); + // glRotatef(rotate_y, 0.0, 1.0, 0.0); - glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); + glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); - // GET_GLERROR(0); - glFinish(); - // GET_GLERROR(0); + // GET_GLERROR(0); + glFinish(); + // GET_GLERROR(0); - g_fAnim += time_delta; + g_fAnim += time_delta; - sdkStopTimer(&timer); - computeFPS(); + sdkStopTimer(&timer); + computeFPS(); } //////////////////////////////////////////////////////////////////////////////// //! Check if the result is correct or write data to file for external //! 
regression testing //////////////////////////////////////////////////////////////////////////////// -void checkResultCuda(int argc, char **argv, const GLuint &vbo) { - if (!d_vbo_buffer) { - printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__); +void checkResultCuda(int argc, char **argv, const GLuint &vbo) +{ + if (!d_vbo_buffer) { + printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__); - cudaGraphicsUnregisterResource(cuda_vbo_resource); + cudaGraphicsUnregisterResource(cuda_vbo_resource); - // map buffer object - glBindBuffer(GL_ARRAY_BUFFER, vbo); - float *data = (float *)glMapBufferRange( - GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float), - GL_READ_ONLY); + // map buffer object + glBindBuffer(GL_ARRAY_BUFFER, vbo); + float *data = + (float *)glMapBufferRange(GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float), GL_READ_ONLY); - // check result - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // write file for regression test - sdkWriteFile("./data/regression.dat", data, - mesh_width * mesh_height * 3, 0.0, false); + // check result + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // write file for regression test + sdkWriteFile("./data/regression.dat", data, mesh_width * mesh_height * 3, 0.0, false); + } + + // unmap GL buffer object + if (!glUnmapBuffer(GL_ARRAY_BUFFER)) { + fprintf(stderr, "Unmap buffer failed.\n"); + fflush(stderr); + } + + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); + + GET_GLERROR(0); } - - // unmap GL buffer object - if (!glUnmapBuffer(GL_ARRAY_BUFFER)) { - fprintf(stderr, "Unmap buffer failed.\n"); - fflush(stderr); - } - - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); - - GET_GLERROR(0); - } } GLuint mesh_shader = 0; -void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, - const char *filename, GLenum shaderType) { - FILE *file = fopen(filename, "rb"); // open shader text file - if (!file) error_exit("Filename %s does not exist\n", filename); +void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, const char *filename, GLenum shaderType) +{ + FILE *file = fopen(filename, "rb"); // open shader text file + if (!file) + error_exit("Filename %s does not exist\n", filename); - /* get the size of the file and read it */ - fseek(file, 0, SEEK_END); - GLint size = ftell(file); - char *data = (char *)malloc(sizeof(char) * (size + 1)); - memset(data, 0, sizeof(char) * (size + 1)); - fseek(file, 0, SEEK_SET); - size_t res = fread(data, 1, size, file); - fclose(file); + /* get the size of the file and read it */ + fseek(file, 0, SEEK_END); + GLint size = ftell(file); + char *data = (char *)malloc(sizeof(char) * (size + 1)); + memset(data, 0, sizeof(char) * (size + 1)); + fseek(file, 0, SEEK_SET); + size_t res = fread(data, 1, size, file); + fclose(file); - GLuint shader = glCreateShader(shaderType); - glShaderSource(shader, 1, (const GLchar **)&data, &size); - glCompileShader(shader); + GLuint shader = glCreateShader(shaderType); + glShaderSource(shader, 1, (const GLchar **)&data, &size); + glCompileShader(shader); - GET_GLERROR(0); - GLint compile_success = 0; - glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success); - GET_GLERROR(0); + GET_GLERROR(0); + GLint compile_success = 0; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success); + GET_GLERROR(0); - if (compile_success == GL_FALSE) { - printf("Compilation of %s 
failed!\n Reason:\n", filename); + if (compile_success == GL_FALSE) { + printf("Compilation of %s failed!\n Reason:\n", filename); - GLint maxLength = 0; - glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength); + GLint maxLength = 0; + glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength); - char errorLog[maxLength]; - glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]); + char errorLog[maxLength]; + glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]); - printf("%s", errorLog); + printf("%s", errorLog); - glDeleteShader(shader); - exit(1); - } + glDeleteShader(shader); + exit(1); + } - glAttachShader(new_shaderprogram, shader); - glDeleteShader(shader); // good to do? + glAttachShader(new_shaderprogram, shader); + glDeleteShader(shader); // good to do? - free(data); + free(data); } -GLuint ShaderCreate(const char *vshader_filename, - const char *fshader_filename) { - printf("Loading GLSL shaders %s %s\n", vshader_filename, fshader_filename); +GLuint ShaderCreate(const char *vshader_filename, const char *fshader_filename) +{ + printf("Loading GLSL shaders %s %s\n", vshader_filename, fshader_filename); - GLuint new_shaderprogram = glCreateProgram(); + GLuint new_shaderprogram = glCreateProgram(); - GET_GLERROR(0); - if (vshader_filename) - readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename, - GL_VERTEX_SHADER); + GET_GLERROR(0); + if (vshader_filename) + readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename, GL_VERTEX_SHADER); - GET_GLERROR(0); - if (fshader_filename) - readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename, - GL_FRAGMENT_SHADER); + GET_GLERROR(0); + if (fshader_filename) + readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename, GL_FRAGMENT_SHADER); - GET_GLERROR(0); + GET_GLERROR(0); - glLinkProgram(new_shaderprogram); + glLinkProgram(new_shaderprogram); - GET_GLERROR(0); - GLint link_success; - glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success); + GET_GLERROR(0); + GLint link_success; + glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success); - if (link_success == GL_FALSE) { - printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename, - fshader_filename); + if (link_success == GL_FALSE) { + printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename, fshader_filename); - GLint maxLength = 0; - glGetShaderiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength); + GLint maxLength = 0; + glGetShaderiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength); - char errorLog[maxLength]; - glGetShaderInfoLog(new_shaderprogram, maxLength, &maxLength, &errorLog[0]); + char errorLog[maxLength]; + glGetShaderInfoLog(new_shaderprogram, maxLength, &maxLength, &errorLog[0]); - printf("%s", errorLog); + printf("%s", errorLog); - exit(EXIT_FAILURE); - } + exit(EXIT_FAILURE); + } - return new_shaderprogram; + return new_shaderprogram; } //=========================================================================== // InitGraphicsState() - initialize OpenGL //=========================================================================== -static void InitGraphicsState(void) { - char *GL_version = (char *)glGetString(GL_VERSION); - char *GL_vendor = (char *)glGetString(GL_VENDOR); - char *GL_renderer = (char *)glGetString(GL_RENDERER); +static void InitGraphicsState(void) +{ + char *GL_version = (char *)glGetString(GL_VERSION); + char *GL_vendor = (char *)glGetString(GL_VENDOR); + char *GL_renderer = (char *)glGetString(GL_RENDERER); - printf("Version: %s\n", GL_version); - 
printf("Vendor: %s\n", GL_vendor); - printf("Renderer: %s\n", GL_renderer); + printf("Version: %s\n", GL_version); + printf("Vendor: %s\n", GL_vendor); + printf("Renderer: %s\n", GL_renderer); - // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!) - glGenVertexArrays(1, &mesh_vao); // Features' Vertex Array Object allocation - glBindVertexArray(mesh_vao); // bind VAO + // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!) + glGenVertexArrays(1, &mesh_vao); // Features' Vertex Array Object allocation + glBindVertexArray(mesh_vao); // bind VAO - // initialize buffer object - glGenBuffers(1, &mesh_vbo); - glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo); + // initialize buffer object + glGenBuffers(1, &mesh_vbo); + glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo); - unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); - glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW); - glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0); - glEnableVertexAttribArray(0); + unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); + glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW); + glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0); + glEnableVertexAttribArray(0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo, - cudaGraphicsMapFlagsNone)); - // glBindVertexArray(0); // keep above Vertex Array Object bound (it's the - // only one throughout) + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo, cudaGraphicsMapFlagsNone)); + // glBindVertexArray(0); // keep above Vertex Array Object bound (it's the + // only one throughout) - // GLSL stuff - char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]); - char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]); + // GLSL stuff + char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]); + char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]); - if (vertex_shader_path == NULL || fragment_shader_path == NULL) { - printf("Error finding shader file\n"); - exit(EXIT_FAILURE); - } + if (vertex_shader_path == NULL || fragment_shader_path == NULL) { + printf("Error finding shader file\n"); + exit(EXIT_FAILURE); + } - mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path); - GET_GLERROR(0); + mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path); + GET_GLERROR(0); - free(vertex_shader_path); - free(fragment_shader_path); + free(vertex_shader_path); + free(fragment_shader_path); - glUseProgram(mesh_shader); + glUseProgram(mesh_shader); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -bool runTest(int argc, char **argv, char *ref_file) { - // Create the CUTIL timer - sdkCreateTimer(&timer); +bool runTest(int argc, char **argv, char *ref_file) +{ + // Create the CUTIL timer + sdkCreateTimer(&timer); - int devID = 0; + int devID = 0; #if defined(__aarch64__) || defined(__arm__) - // find iGPU on the system which is compute capable which will perform - // GLES-CUDA interop - devID = findIntegratedGPU(); + // find iGPU on the system which is compute capable which will perform + // GLES-CUDA interop + devID = findIntegratedGPU(); #else - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - devID = findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + devID = findCudaDevice(argc, (const char **)argv); #endif - // command line mode only - if (ref_file != NULL) { - // create VBO - checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, - mesh_width * mesh_height * 4 * sizeof(float))); + // command line mode only + if (ref_file != NULL) { + // create VBO + checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width * mesh_height * 4 * sizeof(float))); - // run the cuda part - runAutoTest(devID, argv, ref_file); + // run the cuda part + runAutoTest(devID, argv, ref_file); - // check result of Cuda step - checkResultCuda(argc, argv, mesh_vbo); + // check result of Cuda step + checkResultCuda(argc, argv, mesh_vbo); - cudaFree(d_vbo_buffer); - d_vbo_buffer = NULL; - } else { - // this would use command-line specified CUDA device, note that CUDA - // defaults to highest Gflops/s device - if (checkCmdLineFlag(argc, (const char **)argv, "device")) - error_exit("Device setting not yet implemented!\n"); + cudaFree(d_vbo_buffer); + d_vbo_buffer = NULL; + } + else { + // this would use command-line specified CUDA device, note that CUDA + // defaults to highest Gflops/s device + if (checkCmdLineFlag(argc, (const char **)argv, "device")) + error_exit("Device setting not yet implemented!\n"); - // create X11 window and set up associated OpenGL ES context - graphics_setup_window(0, 0, window_width, window_height, sSDKsample); + // create X11 window and set up associated OpenGL ES context + graphics_setup_window(0, 0, window_width, window_height, sSDKsample); - InitGraphicsState(); // set up GLES stuff + InitGraphicsState(); // set up GLES stuff - glClearColor(0, 0.5, 1, 1); // blue-ish background - glClear(GL_COLOR_BUFFER_BIT); + glClearColor(0, 0.5, 1, 1); // blue-ish background + glClear(GL_COLOR_BUFFER_BIT); - // printf("WP%d\n", __LINE__); - graphics_swap_buffers(); + // printf("WP%d\n", __LINE__); + graphics_swap_buffers(); - XEvent event; - KeySym key; - char text[255]; + XEvent event; + KeySym key; + char text[255]; - int frame = 0; + int frame = 0; - while (frame < 100000) { - if (XPending(display)) { - XNextEvent(display, &event); + while (frame < 100000) { + if (XPending(display)) { + XNextEvent(display, &event); - if (event.type == Expose && event.xexpose.count == 0) { - printf("Redraw requested!\n"); - } - if (event.type == KeyPress && - XLookupString(&event.xkey, text, 255, &key, 0) == 1) { - if (text[0] == 27) goto label_stop_x; + if (event.type == Expose && event.xexpose.count == 0) { + printf("Redraw requested!\n"); + } + if (event.type == KeyPress && XLookupString(&event.xkey, text, 255, &key, 0) == 1) { + if (text[0] == 27) + goto label_stop_x; - 
printf("You pressed the %c key!\n", text[0]); + printf("You pressed the %c key!\n", text[0]); + } + + if (event.type == ButtonPress) { + printf( + "Mouse button %d press at (%d,%d)\n", event.xbutton.button, event.xbutton.x, event.xbutton.y); + + if (event.xbutton.button == Button1) + gui_mode = GUI_TRANSLATE; + if (event.xbutton.button == Button3) + gui_mode = GUI_ROTATE; + mouse_old_x = event.xbutton.x; + mouse_old_y = event.xbutton.y; + } + + if (event.type == ButtonRelease) { + printf("Mouse button %d released at (%d,%d)\n", + event.xbutton.button, + event.xbutton.x, + event.xbutton.y); + + gui_mode = GUI_IDLE; + mouse_old_x = event.xbutton.x; + mouse_old_y = event.xbutton.y; + } + + if (event.type == MotionNotify) { + // printf("Mouse motion towards %d %d, GUI mode is 0x%x\n", + // event.xmotion.x, event.xmotion.y, gui_mode); + float dx, dy; + dx = (float)(event.xmotion.x - mouse_old_x); + dy = (float)(event.xmotion.y - mouse_old_y); + + if (gui_mode == GUI_ROTATE) { + rotate_x += dy * 0.2f; + rotate_y += dx * 0.2f; + printf("rot x %f y %f\n", rotate_x, rotate_y); + } + if (gui_mode == GUI_TRANSLATE) { + translate_z += dy * 0.01f; + printf("translate z %f\n", translate_z); + } + + mouse_old_x = event.xmotion.x; + mouse_old_y = event.xmotion.y; + } + } + + display_thisframe(0.010); + usleep(1000); // need not take full CPU and GPU + + graphics_swap_buffers(); + // printf("frame %d\n",frame++); } - if (event.type == ButtonPress) { - printf("Mouse button %d press at (%d,%d)\n", event.xbutton.button, - event.xbutton.x, event.xbutton.y); + label_stop_x: + // NOTE: Before destroying OpenGL ES context, must unregister all shared + // resources from CUDA ! + cudaGraphicsUnregisterResource(cuda_vbo_resource); - if (event.xbutton.button == Button1) gui_mode = GUI_TRANSLATE; - if (event.xbutton.button == Button3) gui_mode = GUI_ROTATE; - mouse_old_x = event.xbutton.x; - mouse_old_y = event.xbutton.y; - } + graphics_close_window(); // close window and destroy OpenGL ES context - if (event.type == ButtonRelease) { - printf("Mouse button %d released at (%d,%d)\n", event.xbutton.button, - event.xbutton.x, event.xbutton.y); - - gui_mode = GUI_IDLE; - mouse_old_x = event.xbutton.x; - mouse_old_y = event.xbutton.y; - } - - if (event.type == MotionNotify) { - // printf("Mouse motion towards %d %d, GUI mode is 0x%x\n", - // event.xmotion.x, event.xmotion.y, gui_mode); - float dx, dy; - dx = (float)(event.xmotion.x - mouse_old_x); - dy = (float)(event.xmotion.y - mouse_old_y); - - if (gui_mode == GUI_ROTATE) { - rotate_x += dy * 0.2f; - rotate_y += dx * 0.2f; - printf("rot x %f y %f\n", rotate_x, rotate_y); - } - if (gui_mode == GUI_TRANSLATE) { - translate_z += dy * 0.01f; - printf("translate z %f\n", translate_z); - } - - mouse_old_x = event.xmotion.x; - mouse_old_y = event.xmotion.y; - } - } - - display_thisframe(0.010); - usleep(1000); // need not take full CPU and GPU - - graphics_swap_buffers(); - // printf("frame %d\n",frame++); + sdkDeleteTimer(&timer); } - label_stop_x: - // NOTE: Before destroying OpenGL ES context, must unregister all shared - // resources from CUDA ! 
- cudaGraphicsUnregisterResource(cuda_vbo_resource); - - graphics_close_window(); // close window and destroy OpenGL ES context - - sdkDeleteTimer(&timer); - } - - return true; + return true; } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { - char *ref_file = NULL; +int main(int argc, char **argv) +{ + char *ref_file = NULL; - pArgc = &argc; - pArgv = argv; + pArgc = &argc; + pArgv = argv; #if defined(__linux__) - setenv("DISPLAY", ":0", 0); + setenv("DISPLAY", ":0", 0); #endif - printf("%s starting...\n", sSDKsample); + printf("%s starting...\n", sSDKsample); - if (argc > 1) { - if (checkCmdLineFlag(argc, (const char **)argv, "file")) { - // In this mode, we run without OpenGL and see if VBO is generated - // correctly - getCmdLineArgumentString(argc, (const char **)argv, "file", - (char **)&ref_file); + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + // In this mode, we run without OpenGL and see if VBO is generated + // correctly + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); + } } - } - printf("\n"); + printf("\n"); - runTest(argc, argv, ref_file); + runTest(argc, argv, ref_file); - printf("%s completed, returned %s\n", sSDKsample, - (g_TotalErrors == 0) ? "OK" : "ERROR!"); + printf("%s completed, returned %s\n", sSDKsample, (g_TotalErrors == 0) ? "OK" : "ERROR!"); - exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/README.md b/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/README.md index d79ce6e6..b2da41b5 100644 --- a/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/README.md +++ b/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/README.md @@ -39,4 +39,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## References (for more details)
-
diff --git a/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/graphics_interface_egloutput_via_egl.c b/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/graphics_interface_egloutput_via_egl.c
index c5bfd899..e22834e9 100644
--- a/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/graphics_interface_egloutput_via_egl.c
+++ b/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/graphics_interface_egloutput_via_egl.c
@@ -29,68 +29,64 @@ int screen;
 // Window win = 0;
+#include #include +#include #include #include -#include - -#include -//#include // not (yet) needed #include #include - -#include -#include #include +#include +#include
 
 #define MAX_DEVICES 16
 
-static PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT = NULL;
-static PFNEGLQUERYDEVICESTRINGEXTPROC eglQueryDeviceStringEXT = NULL;
-static PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT = NULL;
-static PFNEGLGETOUTPUTLAYERSEXTPROC eglGetOutputLayersEXT = NULL;
-static PFNEGLCREATESTREAMKHRPROC eglCreateStreamKHR = NULL;
-static PFNEGLDESTROYSTREAMKHRPROC eglDestroyStreamKHR = NULL;
-static PFNEGLSTREAMCONSUMEROUTPUTEXTPROC eglStreamConsumerOutputEXT = NULL;
-static PFNEGLCREATESTREAMPRODUCERSURFACEKHRPROC
-    eglCreateStreamProducerSurfaceKHR = NULL;
+static PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT = NULL;
+static PFNEGLQUERYDEVICESTRINGEXTPROC eglQueryDeviceStringEXT = NULL;
+static PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT = NULL;
+static PFNEGLGETOUTPUTLAYERSEXTPROC eglGetOutputLayersEXT = NULL;
+static PFNEGLCREATESTREAMKHRPROC eglCreateStreamKHR = NULL;
+static PFNEGLDESTROYSTREAMKHRPROC eglDestroyStreamKHR = NULL;
+static PFNEGLSTREAMCONSUMEROUTPUTEXTPROC eglStreamConsumerOutputEXT = NULL;
+static PFNEGLCREATESTREAMPRODUCERSURFACEKHRPROC eglCreateStreamProducerSurfaceKHR = NULL;
 
-#define GET_GLERROR(ret) \
-  { \
-    GLenum err = glGetError(); \
-    if (err != GL_NO_ERROR) { \
-      fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, \
-              err); \
-      fflush(stderr); \
- \
-      switch (err) { \
-        case GL_INVALID_ENUM: \
-          printf("GL_INVALID_ENUM\n"); \
-          break; \
-        case GL_INVALID_VALUE: \
-          printf("GL_INVALID_VALUE\n"); \
-          break; \
-        case GL_INVALID_OPERATION: \
-          printf("GL_INVALID_OPERATION\n"); \
-          break; \
-        case GL_OUT_OF_MEMORY: \
-          printf("GL_OUT_OF_MEMORY\n"); \
-          break; \
-        case GL_INVALID_FRAMEBUFFER_OPERATION: \
-          printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \
-          break; \
-        default: \
-          printf("UKNOWN OPENGL ERROR CODE 0x%x\n", err); \
-      }; \
-    } \
-  }
+#define GET_GLERROR(ret) \
+    { \
+        GLenum err = glGetError(); \
+        if (err != GL_NO_ERROR) { \
+            fprintf(stderr, "[%s line %d] OpenGL Error: 0x%x\n", __FILE__, __LINE__, err); \
+            fflush(stderr); \
+ \
+            switch (err) { \
+                case GL_INVALID_ENUM: \
+                    printf("GL_INVALID_ENUM\n"); \
+                    break; \
+                case GL_INVALID_VALUE: \
+                    printf("GL_INVALID_VALUE\n"); \
+                    break; \
+                case GL_INVALID_OPERATION: \
+                    printf("GL_INVALID_OPERATION\n"); \
+                    break; \
+                case GL_OUT_OF_MEMORY: \
+                    printf("GL_OUT_OF_MEMORY\n"); \
+                    break; \
+                case GL_INVALID_FRAMEBUFFER_OPERATION: \
+                    printf("GL_INVALID_FRAMEBUFFER_OPERATION\n"); \
+                    break; \
+                default: \
+                    printf("UNKNOWN OPENGL ERROR CODE 0x%x\n", err); \
+            }; \
+        } \
+    }
 
 EGLDisplay eglDisplay = EGL_NO_DISPLAY;
 EGLSurface eglSurface = EGL_NO_SURFACE;
 EGLContext eglContext = EGL_NO_CONTEXT;
 
-#if 0 // needed for optional API call retrieval (= if libGLESv2.so wouldn't be
-      // linked explicitly) - tedious! consider GLEW.
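As a usage sketch of the GET_GLERROR macro above (a hypothetical call site; glBufferData stands in for any GLES call that can fail, and size is an assumed valid buffer size):

    glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW); // any GLES call that may set an error
    GET_GLERROR(0); // on error, prints "[file line N] OpenGL Error: 0x..." plus a readable name such as GL_INVALID_VALUE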
+#if 0 // needed for optional API call retrieval (= if libGLESv2.so wouldn't be + // linked explicitly) - tedious! consider GLEW. typedef GLenum (* glGetErrorTYPE) (void); glGetErrorTYPE my_glGetError; @@ -105,458 +101,442 @@ glGetProgramivTYPE my_glGetProgramiv; #endif // Extension checking utility -static bool CheckExtension(const char *exts, const char *ext) { - int extLen = (int)strlen(ext); - const char *end = exts + strlen(exts); +static bool CheckExtension(const char *exts, const char *ext) +{ + int extLen = (int)strlen(ext); + const char *end = exts + strlen(exts); - while (exts < end) { - while (*exts == ' ') { - exts++; + while (exts < end) { + while (*exts == ' ') { + exts++; + } + int n = strcspn(exts, " "); + if ((extLen == n) && (strncmp(ext, exts, n) == 0)) { + return true; + } + exts += n; } - int n = strcspn(exts, " "); - if ((extLen == n) && (strncmp(ext, exts, n) == 0)) { - return true; - } - exts += n; - } - return false; + return false; } -int graphics_setup_window(int xpos, int ypos, int width, int height, - const char *windowname) { - int device = 0, crtc = -1, plane = -1; - int xsurfsize = 0, ysurfsize = 0; - int xoffset = 0, yoffset = 0; - int xmodesize = 0, ymodesize = 0; - // int color = 0, duration = 10; - int fifo = 0; - int bounce = 0; - uint32_t fb_id = -1; +int graphics_setup_window(int xpos, int ypos, int width, int height, const char *windowname) +{ + int device = 0, crtc = -1, plane = -1; + int xsurfsize = 0, ysurfsize = 0; + int xoffset = 0, yoffset = 0; + int xmodesize = 0, ymodesize = 0; + // int color = 0, duration = 10; + int fifo = 0; + int bounce = 0; + uint32_t fb_id = -1; - EGLDeviceEXT egl_devs[MAX_DEVICES], egl_dev; - EGLOutputLayerEXT egl_lyr; - EGLConfig egl_cfg; - EGLStreamKHR egl_str; - EGLint major, minor; + EGLDeviceEXT egl_devs[MAX_DEVICES], egl_dev; + EGLOutputLayerEXT egl_lyr; + EGLConfig egl_cfg; + EGLStreamKHR egl_str; + EGLint major, minor; - const char *drm_name; - int drm_fd; - uint32_t drm_conn_id, drm_enc_id, drm_crtc_id, drm_plane_id; - uint32_t crtc_mask; - drmModeRes *drm_res_info = NULL; - drmModePlaneRes *drm_plane_res_info = NULL; - drmModeCrtc *drm_crtc_info = NULL; - drmModeConnector *drm_conn_info = NULL; - drmModeEncoder *drm_enc_info = NULL; - drmModePlane *drm_plane_info = NULL; - int drm_mode_index = 0; + const char *drm_name; + int drm_fd; + uint32_t drm_conn_id, drm_enc_id, drm_crtc_id, drm_plane_id; + uint32_t crtc_mask; + drmModeRes *drm_res_info = NULL; + drmModePlaneRes *drm_plane_res_info = NULL; + drmModeCrtc *drm_crtc_info = NULL; + drmModeConnector *drm_conn_info = NULL; + drmModeEncoder *drm_enc_info = NULL; + drmModePlane *drm_plane_info = NULL; + int drm_mode_index = 0; - bool set_mode = false; - int i, n; + bool set_mode = false; + int i, n; - // Load extension function pointers. 
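A hedged usage sketch of CheckExtension() above: because it walks the space-separated token list, a name only matches a whole token and cannot false-positive on a prefix of a longer extension name (dpy is an assumed, already-initialized EGLDisplay):

    const char *exts = eglQueryString(dpy, EGL_EXTENSIONS);
    if (!exts || !CheckExtension(exts, "EGL_EXT_output_base")) {
        printf("Missing required extension: EGL_EXT_output_base\n"); // mirrors the checks later in this file
        exit(2);
    }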
- eglQueryDevicesEXT = - (PFNEGLQUERYDEVICESEXTPROC)eglGetProcAddress("eglQueryDevicesEXT"); - eglQueryDeviceStringEXT = (PFNEGLQUERYDEVICESTRINGEXTPROC)eglGetProcAddress( - "eglQueryDeviceStringEXT"); - eglGetPlatformDisplayEXT = (PFNEGLGETPLATFORMDISPLAYEXTPROC)eglGetProcAddress( - "eglGetPlatformDisplayEXT"); - eglGetOutputLayersEXT = - (PFNEGLGETOUTPUTLAYERSEXTPROC)eglGetProcAddress("eglGetOutputLayersEXT"); - eglCreateStreamKHR = - (PFNEGLCREATESTREAMKHRPROC)eglGetProcAddress("eglCreateStreamKHR"); - eglDestroyStreamKHR = - (PFNEGLDESTROYSTREAMKHRPROC)eglGetProcAddress("eglDestroyStreamKHR"); - eglStreamConsumerOutputEXT = - (PFNEGLSTREAMCONSUMEROUTPUTEXTPROC)eglGetProcAddress( - "eglStreamConsumerOutputEXT"); - eglCreateStreamProducerSurfaceKHR = - (PFNEGLCREATESTREAMPRODUCERSURFACEKHRPROC)eglGetProcAddress( - "eglCreateStreamProducerSurfaceKHR"); - if (!eglQueryDevicesEXT || !eglQueryDeviceStringEXT || - !eglGetPlatformDisplayEXT || !eglGetOutputLayersEXT || - !eglCreateStreamKHR || !eglDestroyStreamKHR || - !eglStreamConsumerOutputEXT || !eglCreateStreamProducerSurfaceKHR) { - printf("Missing required function(s)\n"); - exit(2); - } - printf("Loaded extension functions\n"); - - // Query device - if (!eglQueryDevicesEXT(device + 1, egl_devs, &n) || (n <= device)) { - printf("Requested device index (%d) not found\n", device); - exit(2); - } - egl_dev = egl_devs[device]; - - // Obtain and open DRM device file - drm_name = eglQueryDeviceStringEXT(egl_dev, EGL_DRM_DEVICE_FILE_EXT); - if (!drm_name) { - printf("Couldn't obtain device file from 0x%p\n", - (void *)(uintptr_t)egl_dev); - exit(3); - } - - if (!strcmp(drm_name, "drm-nvdc")) { - drm_fd = drmOpen(drm_name, NULL); - } else { - drm_fd = open(drm_name, O_RDWR, 0); - } - - if (drm_fd == -1) { - printf("Couldn't open device file '%s'\n", drm_name); - exit(3); - } - printf("Device file: %s\n", drm_name); - - // Obtain DRM-KMS resources - drm_res_info = drmModeGetResources(drm_fd); - if (!drm_res_info) { - printf("Couldn't obtain DRM-KMS resources\n"); - exit(3); - } - printf("Obtained device information\n"); - - // If a specific crtc was requested, make sure it exists - if (crtc >= drm_res_info->count_crtcs) { - printf("Requested crtc index (%d) exceeds count (%d)\n", crtc, - drm_res_info->count_crtcs); - exit(4); - } - crtc_mask = - (crtc >= 0) ? (1 << crtc) : ((1 << drm_res_info->count_crtcs) - 1); - - // If drawing to a plane is requested, obtain the plane info - if (plane >= 0) { - drm_plane_res_info = drmModeGetPlaneResources(drm_fd); - if (!drm_plane_res_info) { - printf("Unable to obtain plane resource list\n"); - exit(5); + // Load extension function pointers. 
+ eglQueryDevicesEXT = (PFNEGLQUERYDEVICESEXTPROC)eglGetProcAddress("eglQueryDevicesEXT"); + eglQueryDeviceStringEXT = (PFNEGLQUERYDEVICESTRINGEXTPROC)eglGetProcAddress("eglQueryDeviceStringEXT"); + eglGetPlatformDisplayEXT = (PFNEGLGETPLATFORMDISPLAYEXTPROC)eglGetProcAddress("eglGetPlatformDisplayEXT"); + eglGetOutputLayersEXT = (PFNEGLGETOUTPUTLAYERSEXTPROC)eglGetProcAddress("eglGetOutputLayersEXT"); + eglCreateStreamKHR = (PFNEGLCREATESTREAMKHRPROC)eglGetProcAddress("eglCreateStreamKHR"); + eglDestroyStreamKHR = (PFNEGLDESTROYSTREAMKHRPROC)eglGetProcAddress("eglDestroyStreamKHR"); + eglStreamConsumerOutputEXT = (PFNEGLSTREAMCONSUMEROUTPUTEXTPROC)eglGetProcAddress("eglStreamConsumerOutputEXT"); + eglCreateStreamProducerSurfaceKHR = + (PFNEGLCREATESTREAMPRODUCERSURFACEKHRPROC)eglGetProcAddress("eglCreateStreamProducerSurfaceKHR"); + if (!eglQueryDevicesEXT || !eglQueryDeviceStringEXT || !eglGetPlatformDisplayEXT || !eglGetOutputLayersEXT + || !eglCreateStreamKHR || !eglDestroyStreamKHR || !eglStreamConsumerOutputEXT + || !eglCreateStreamProducerSurfaceKHR) { + printf("Missing required function(s)\n"); + exit(2); } - if (plane >= drm_plane_res_info->count_planes) { - printf("Requested plane index (%d) exceeds count (%d)\n", plane, - drm_plane_res_info->count_planes); - exit(5); + printf("Loaded extension functions\n"); + + // Query device + if (!eglQueryDevicesEXT(device + 1, egl_devs, &n) || (n <= device)) { + printf("Requested device index (%d) not found\n", device); + exit(2); } - drm_plane_id = drm_plane_res_info->planes[plane]; - drm_plane_info = drmModeGetPlane(drm_fd, drm_plane_id); - if (!drm_plane_info) { - printf("Unable to obtain info for plane (%d)\n", drm_plane_id); - exit(5); - } - crtc_mask &= drm_plane_info->possible_crtcs; - if (!crtc_mask) { - printf("Requested crtc and plane not compatible\n"); - exit(5); - } - printf("Obtained plane information\n"); - } + egl_dev = egl_devs[device]; - // Query info for requested connector - int conn = 0; - for (conn = 0; conn < drm_res_info->count_connectors; ++conn) { - drm_conn_id = drm_res_info->connectors[conn]; - drm_conn_info = drmModeGetConnector(drm_fd, drm_conn_id); - if (drm_conn_info != NULL) { - printf("connector %d found\n", drm_conn_info->connector_id); - if (drm_conn_info->connection == DRM_MODE_CONNECTED) { - break; - } - drmModeFreeConnector(drm_conn_info); - } - } - - if (conn == drm_res_info->count_connectors) { - printf("No active connectors found\n"); - exit(6); - } - printf("Obtained connector information\n"); - - // If there is already an encoder attached to the connector, choose - // it unless not compatible with crtc/plane - drm_enc_id = drm_conn_info->encoder_id; - drm_enc_info = drmModeGetEncoder(drm_fd, drm_enc_id); - if (drm_enc_info) { - if (!(drm_enc_info->possible_crtcs & crtc_mask)) { - drmModeFreeEncoder(drm_enc_info); - drm_enc_info = NULL; - } - } - - // If we didn't have a suitable encoder, find one - if (!drm_enc_info) { - for (i = 0; i < drm_conn_info->count_encoders; ++i) { - drm_enc_id = drm_conn_info->encoders[i]; - drm_enc_info = drmModeGetEncoder(drm_fd, drm_enc_id); - if (drm_enc_info) { - if (crtc_mask & drm_enc_info->possible_crtcs) { - crtc_mask &= drm_enc_info->possible_crtcs; - break; - } - drmModeFreeEncoder(drm_enc_info); - drm_enc_info = NULL; - } - } - if (i == drm_conn_info->count_encoders) { - printf("Unable to find suitable encoder\n"); - exit(7); - } - } - printf("Obtained encoder information\n"); - - // Select a suitable crtc. 
Give preference to any that's already - // attached to the encoder. (Could make this more sophisticated - // by finding one not already bound to any other encoders. But - // this is just a basic test, so we don't really care that much.) - assert(crtc_mask); - for (i = 0; i < drm_res_info->count_crtcs; ++i) { - if (crtc_mask & (1 << i)) { - drm_crtc_id = drm_res_info->crtcs[i]; - if (drm_res_info->crtcs[i] == drm_enc_info->crtc_id) { - break; - } - } - } - - // Query info for crtc - drm_crtc_info = drmModeGetCrtc(drm_fd, drm_crtc_id); - if (!drm_crtc_info) { - printf("Unable to obtain info for crtc (%d)\n", drm_crtc_id); - exit(4); - } - printf("Obtained crtc information\n"); - - // If dimensions are specified and not using a plane, find closest mode - if ((xmodesize || ymodesize) && (plane < 0)) { - // Find best fit among available modes - int best_index = 0; - int best_fit = 0x7fffffff; - for (i = 0; i < drm_conn_info->count_modes; ++i) { - drmModeModeInfoPtr mode = drm_conn_info->modes + i; - int fit = 0; - - if (xmodesize) { - fit += abs((int)mode->hdisplay - xmodesize) * (int)mode->vdisplay; - } - if (ymodesize) { - fit += abs((int)mode->vdisplay - ymodesize) * (int)mode->hdisplay; - } - - if (fit < best_fit) { - best_index = i; - best_fit = fit; - } + // Obtain and open DRM device file + drm_name = eglQueryDeviceStringEXT(egl_dev, EGL_DRM_DEVICE_FILE_EXT); + if (!drm_name) { + printf("Couldn't obtain device file from 0x%p\n", (void *)(uintptr_t)egl_dev); + exit(3); } - // Choose this size/mode - drm_mode_index = best_index; - xmodesize = (int)drm_conn_info->modes[best_index].hdisplay; - ymodesize = (int)drm_conn_info->modes[best_index].vdisplay; - } - - // We'll only set the mode if we have to. This hopefully allows - // multiple instances of this application to run, writing to - // separate planes of the same display, as long as they don't - // specifiy incompatible settings. - if ((drm_conn_info->encoder_id != drm_enc_id) || - (drm_enc_info->crtc_id != drm_crtc_id) || !drm_crtc_info->mode_valid || - ((plane < 0) && xmodesize && - (xmodesize != (int)drm_crtc_info->mode.hdisplay)) || - ((plane < 0) && ymodesize && - (ymodesize != (int)drm_crtc_info->mode.vdisplay))) { - set_mode = true; - } - - // If dimensions haven't been specified, figure out good values to use - if (!xmodesize || !ymodesize) { - // If mode requires reset, just pick the first one available - // from the connector - if (set_mode) { - xmodesize = (int)drm_conn_info->modes[0].hdisplay; - ymodesize = (int)drm_conn_info->modes[0].vdisplay; + if (!strcmp(drm_name, "drm-nvdc")) { + drm_fd = drmOpen(drm_name, NULL); } - - // Otherwise get it from the current crtc settings else { - xmodesize = (int)drm_crtc_info->mode.hdisplay; - ymodesize = (int)drm_crtc_info->mode.vdisplay; - } - } - printf("Determine mode settings\n"); - - // If surf size is unspecified, default to fullscreen normally - // or to 1/4 fullscreen if in animated bounce mode. 
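To make the best-fit mode metric above concrete, a short worked example with illustrative numbers:

    /*
     * Requesting 1280x720 against a 1920x1080 mode:
     *   fit = |1920 - 1280| * 1080 + |1080 - 720| * 1920
     *       = 640 * 1080 + 360 * 1920 = 1382400
     * An exact 1280x720 mode scores fit = 0 and is therefore chosen.
     */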
- if (!xsurfsize || !ysurfsize) { - if (bounce) { - xsurfsize = xmodesize / 2; - ysurfsize = ymodesize / 2; - } else { - xsurfsize = xmodesize; - ysurfsize = ymodesize; - } - } - printf("Determine surface size\n"); - - // create framebuffer (required for nvidia-drm) - drmVersionPtr version = drmGetVersion(drm_fd); - if (!version) { - printf("drmGetVersion() failed..\n"); - exit(1); - } - - if (!strcmp(version->name, "nvidia-drm")) { - drm_mode_create_dumb prop; - memset(&prop, 0, sizeof(drm_mode_create_dumb)); - prop.width = xmodesize; - prop.height = ymodesize; - prop.bpp = 32; - - int res = drmIoctl(drm_fd, DRM_IOCTL_MODE_CREATE_DUMB, &prop); - if (res) { - printf("drmIoctl() failed..(%d)\n", res); - exit(1); + drm_fd = open(drm_name, O_RDWR, 0); } - uint32_t offset = 0; - res = drmModeAddFB2(drm_fd, xmodesize, ymodesize, DRM_FORMAT_ARGB8888, - &(prop.handle), &(prop.pitch), &offset, &fb_id, 0); - if (res) { - printf("drmModeAddFB() failed..(%d)\n", res); - exit(1); + if (drm_fd == -1) { + printf("Couldn't open device file '%s'\n", drm_name); + exit(3); } - } + printf("Device file: %s\n", drm_name); - if (version) { - drmFreeVersion(version); - version = NULL; - } + // Obtain DRM-KMS resources + drm_res_info = drmModeGetResources(drm_fd); + if (!drm_res_info) { + printf("Couldn't obtain DRM-KMS resources\n"); + exit(3); + } + printf("Obtained device information\n"); - // If necessary, set the mode - if (set_mode) { - drmModeSetCrtc(drm_fd, drm_crtc_id, fb_id, 0, 0, &drm_conn_id, 1, - drm_conn_info->modes + drm_mode_index); - printf("Set mode\n"); - } + // If a specific crtc was requested, make sure it exists + if (crtc >= drm_res_info->count_crtcs) { + printf("Requested crtc index (%d) exceeds count (%d)\n", crtc, drm_res_info->count_crtcs); + exit(4); + } + crtc_mask = (crtc >= 0) ? 
(1 << crtc) : ((1 << drm_res_info->count_crtcs) - 1); - // If plane is in use, set it - if (plane >= 0) { - drmModeSetPlane(drm_fd, drm_plane_id, drm_crtc_id, fb_id, 0, xoffset, - yoffset, xsurfsize, ysurfsize, 0, 0, xsurfsize << 16, - ysurfsize << 16); - printf("Set plane configuration\n"); - } + // If drawing to a plane is requested, obtain the plane info + if (plane >= 0) { + drm_plane_res_info = drmModeGetPlaneResources(drm_fd); + if (!drm_plane_res_info) { + printf("Unable to obtain plane resource list\n"); + exit(5); + } + if (plane >= drm_plane_res_info->count_planes) { + printf("Requested plane index (%d) exceeds count (%d)\n", plane, drm_plane_res_info->count_planes); + exit(5); + } + drm_plane_id = drm_plane_res_info->planes[plane]; + drm_plane_info = drmModeGetPlane(drm_fd, drm_plane_id); + if (!drm_plane_info) { + printf("Unable to obtain info for plane (%d)\n", drm_plane_id); + exit(5); + } + crtc_mask &= drm_plane_info->possible_crtcs; + if (!crtc_mask) { + printf("Requested crtc and plane not compatible\n"); + exit(5); + } + printf("Obtained plane information\n"); + } - // Obtain and initialize EGLDisplay - int attr[] = {EGL_DRM_MASTER_FD_EXT, drm_fd, EGL_NONE}; - eglDisplay = - eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, (void *)egl_dev, attr); - if (eglDisplay == EGL_NO_DISPLAY) { - printf("Couldn't obtain EGLDisplay for device\n"); - exit(8); - } - if (!eglInitialize(eglDisplay, &major, &minor)) { - printf("Couldn't initialize EGLDisplay (error 0x%x)\n", eglGetError()); - exit(8); - } - printf("Obtained EGLDisplay\n"); + // Query info for requested connector + int conn = 0; + for (conn = 0; conn < drm_res_info->count_connectors; ++conn) { + drm_conn_id = drm_res_info->connectors[conn]; + drm_conn_info = drmModeGetConnector(drm_fd, drm_conn_id); + if (drm_conn_info != NULL) { + printf("connector %d found\n", drm_conn_info->connector_id); + if (drm_conn_info->connection == DRM_MODE_CONNECTED) { + break; + } + drmModeFreeConnector(drm_conn_info); + } + } - // Check for stream_consumer_egloutput + output_drm support - const char *dpy_exts = eglQueryString(eglDisplay, EGL_EXTENSIONS); - const char *dev_exts = eglQueryDeviceStringEXT(egl_dev, EGL_EXTENSIONS); + if (conn == drm_res_info->count_connectors) { + printf("No active connectors found\n"); + exit(6); + } + printf("Obtained connector information\n"); - if (!CheckExtension(dpy_exts, "EGL_EXT_output_base")) { - printf("Missing required extension: EGL_EXT_output_base\n"); - exit(2); - } + // If there is already an encoder attached to the connector, choose + // it unless not compatible with crtc/plane + drm_enc_id = drm_conn_info->encoder_id; + drm_enc_info = drmModeGetEncoder(drm_fd, drm_enc_id); + if (drm_enc_info) { + if (!(drm_enc_info->possible_crtcs & crtc_mask)) { + drmModeFreeEncoder(drm_enc_info); + drm_enc_info = NULL; + } + } - if (!CheckExtension(dev_exts, "EGL_EXT_device_drm")) { - printf("Missing required extension: EGL_EXT_device_drm\n"); - exit(2); - } + // If we didn't have a suitable encoder, find one + if (!drm_enc_info) { + for (i = 0; i < drm_conn_info->count_encoders; ++i) { + drm_enc_id = drm_conn_info->encoders[i]; + drm_enc_info = drmModeGetEncoder(drm_fd, drm_enc_id); + if (drm_enc_info) { + if (crtc_mask & drm_enc_info->possible_crtcs) { + crtc_mask &= drm_enc_info->possible_crtcs; + break; + } + drmModeFreeEncoder(drm_enc_info); + drm_enc_info = NULL; + } + } + if (i == drm_conn_info->count_encoders) { + printf("Unable to find suitable encoder\n"); + exit(7); + } + } + printf("Obtained 
encoder information\n"); - if (!CheckExtension(dpy_exts, "EGL_EXT_output_drm")) { - printf("Missing required extension: EGL_EXT_output_drm\n"); - exit(2); - } + // Select a suitable crtc. Give preference to any that's already + // attached to the encoder. (Could make this more sophisticated + // by finding one not already bound to any other encoders. But + // this is just a basic test, so we don't really care that much.) + assert(crtc_mask); + for (i = 0; i < drm_res_info->count_crtcs; ++i) { + if (crtc_mask & (1 << i)) { + drm_crtc_id = drm_res_info->crtcs[i]; + if (drm_res_info->crtcs[i] == drm_enc_info->crtc_id) { + break; + } + } + } - if (!CheckExtension(dpy_exts, "EGL_EXT_stream_consumer_egloutput")) { - printf("Missing required extension: EGL_EXT_stream_consumer_egloutput\n"); - exit(2); - } + // Query info for crtc + drm_crtc_info = drmModeGetCrtc(drm_fd, drm_crtc_id); + if (!drm_crtc_info) { + printf("Unable to obtain info for crtc (%d)\n", drm_crtc_id); + exit(4); + } + printf("Obtained crtc information\n"); - // Choose a config and create a context - EGLint cfg_attr[] = {EGL_SURFACE_TYPE, - EGL_STREAM_BIT_KHR, - EGL_RENDERABLE_TYPE, - EGL_OPENGL_ES2_BIT, - EGL_ALPHA_SIZE, - 1, - EGL_NONE}; - if (!eglChooseConfig(eglDisplay, cfg_attr, &egl_cfg, 1, &n) || !n) { - printf( - "Unable to obtain config that supports stream rendering (error 0x%x)\n", - eglGetError()); - exit(9); - } - EGLint ctx_attr[] = {EGL_CONTEXT_CLIENT_VERSION, 2, EGL_NONE}; + // If dimensions are specified and not using a plane, find closest mode + if ((xmodesize || ymodesize) && (plane < 0)) { + // Find best fit among available modes + int best_index = 0; + int best_fit = 0x7fffffff; + for (i = 0; i < drm_conn_info->count_modes; ++i) { + drmModeModeInfoPtr mode = drm_conn_info->modes + i; + int fit = 0; - eglBindAPI(EGL_OPENGL_ES_API); + if (xmodesize) { + fit += abs((int)mode->hdisplay - xmodesize) * (int)mode->vdisplay; + } + if (ymodesize) { + fit += abs((int)mode->vdisplay - ymodesize) * (int)mode->hdisplay; + } - eglContext = eglCreateContext(eglDisplay, egl_cfg, EGL_NO_CONTEXT, ctx_attr); - if (eglContext == EGL_NO_CONTEXT) { - printf("Unable to create context (error 0x%x)\n", eglGetError()); - exit(9); - } - printf("Obtained EGLConfig and EGLContext\n"); + if (fit < best_fit) { + best_index = i; + best_fit = fit; + } + } - // Get the layer for this crtc/plane - EGLAttrib layer_attr[] = {EGL_NONE, EGL_NONE, EGL_NONE}; - if (plane >= 0) { - layer_attr[0] = EGL_DRM_PLANE_EXT; - layer_attr[1] = (EGLAttrib)drm_plane_id; - } else { - layer_attr[0] = EGL_DRM_CRTC_EXT; - layer_attr[1] = (EGLAttrib)drm_crtc_id; - } - if (!eglGetOutputLayersEXT(eglDisplay, layer_attr, &egl_lyr, 1, &n) || !n) { - printf("Unable to obtain EGLOutputLayer for %s 0x%x\n", - (plane >= 0) ? 
"plane" : "crtc", (int)layer_attr[1]); - exit(10); - } - printf("Obtained EGLOutputLayer\n"); + // Choose this size/mode + drm_mode_index = best_index; + xmodesize = (int)drm_conn_info->modes[best_index].hdisplay; + ymodesize = (int)drm_conn_info->modes[best_index].vdisplay; + } - // Create a stream and connect to the output - EGLint stream_attr[] = {EGL_STREAM_FIFO_LENGTH_KHR, fifo, EGL_NONE}; - egl_str = eglCreateStreamKHR(eglDisplay, stream_attr); - if (egl_str == EGL_NO_STREAM_KHR) { - printf("Unable to create stream (error 0x%x)\n", eglGetError()); - exit(11); - } - if (!eglStreamConsumerOutputEXT(eglDisplay, egl_str, egl_lyr)) { - printf("Unable to connect stream (error 0x%x)\n", eglGetError()); - exit(11); - } + // We'll only set the mode if we have to. This hopefully allows + // multiple instances of this application to run, writing to + // separate planes of the same display, as long as they don't + // specifiy incompatible settings. + if ((drm_conn_info->encoder_id != drm_enc_id) || (drm_enc_info->crtc_id != drm_crtc_id) + || !drm_crtc_info->mode_valid || ((plane < 0) && xmodesize && (xmodesize != (int)drm_crtc_info->mode.hdisplay)) + || ((plane < 0) && ymodesize && (ymodesize != (int)drm_crtc_info->mode.vdisplay))) { + set_mode = true; + } - // Create a surface to feed the stream - EGLint srf_attr[] = {EGL_WIDTH, xsurfsize, EGL_HEIGHT, ysurfsize, EGL_NONE}; - eglSurface = - eglCreateStreamProducerSurfaceKHR(eglDisplay, egl_cfg, egl_str, srf_attr); - if (eglSurface == EGL_NO_SURFACE) { - printf("Unable to create rendering surface (error 0x%x)\n", eglGetError()); - exit(12); - } - printf("Bound layer to rendering surface\n"); + // If dimensions haven't been specified, figure out good values to use + if (!xmodesize || !ymodesize) { + // If mode requires reset, just pick the first one available + // from the connector + if (set_mode) { + xmodesize = (int)drm_conn_info->modes[0].hdisplay; + ymodesize = (int)drm_conn_info->modes[0].vdisplay; + } - // Make current - if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) { - printf("Unable to make context/surface current (error 0x%x)\n", - eglGetError()); - exit(13); - } + // Otherwise get it from the current crtc settings + else { + xmodesize = (int)drm_crtc_info->mode.hdisplay; + ymodesize = (int)drm_crtc_info->mode.vdisplay; + } + } + printf("Determine mode settings\n"); - EGLint Context_RendererType; - eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, - &Context_RendererType); + // If surf size is unspecified, default to fullscreen normally + // or to 1/4 fullscreen if in animated bounce mode. 
+ if (!xsurfsize || !ysurfsize) { + if (bounce) { + xsurfsize = xmodesize / 2; + ysurfsize = ymodesize / 2; + } + else { + xsurfsize = xmodesize; + ysurfsize = ymodesize; + } + } + printf("Determine surface size\n"); + + // create framebuffer (required for nvidia-drm) + drmVersionPtr version = drmGetVersion(drm_fd); + if (!version) { + printf("drmGetVersion() failed..\n"); + exit(1); + } + + if (!strcmp(version->name, "nvidia-drm")) { + drm_mode_create_dumb prop; + memset(&prop, 0, sizeof(drm_mode_create_dumb)); + prop.width = xmodesize; + prop.height = ymodesize; + prop.bpp = 32; + + int res = drmIoctl(drm_fd, DRM_IOCTL_MODE_CREATE_DUMB, &prop); + if (res) { + printf("drmIoctl() failed..(%d)\n", res); + exit(1); + } + + uint32_t offset = 0; + res = drmModeAddFB2( + drm_fd, xmodesize, ymodesize, DRM_FORMAT_ARGB8888, &(prop.handle), &(prop.pitch), &offset, &fb_id, 0); + if (res) { + printf("drmModeAddFB() failed..(%d)\n", res); + exit(1); + } + } + + if (version) { + drmFreeVersion(version); + version = NULL; + } + + // If necessary, set the mode + if (set_mode) { + drmModeSetCrtc(drm_fd, drm_crtc_id, fb_id, 0, 0, &drm_conn_id, 1, drm_conn_info->modes + drm_mode_index); + printf("Set mode\n"); + } + + // If plane is in use, set it + if (plane >= 0) { + drmModeSetPlane(drm_fd, + drm_plane_id, + drm_crtc_id, + fb_id, + 0, + xoffset, + yoffset, + xsurfsize, + ysurfsize, + 0, + 0, + xsurfsize << 16, + ysurfsize << 16); + printf("Set plane configuration\n"); + } + + // Obtain and initialize EGLDisplay + int attr[] = {EGL_DRM_MASTER_FD_EXT, drm_fd, EGL_NONE}; + eglDisplay = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, (void *)egl_dev, attr); + if (eglDisplay == EGL_NO_DISPLAY) { + printf("Couldn't obtain EGLDisplay for device\n"); + exit(8); + } + if (!eglInitialize(eglDisplay, &major, &minor)) { + printf("Couldn't initialize EGLDisplay (error 0x%x)\n", eglGetError()); + exit(8); + } + printf("Obtained EGLDisplay\n"); + + // Check for stream_consumer_egloutput + output_drm support + const char *dpy_exts = eglQueryString(eglDisplay, EGL_EXTENSIONS); + const char *dev_exts = eglQueryDeviceStringEXT(egl_dev, EGL_EXTENSIONS); + + if (!CheckExtension(dpy_exts, "EGL_EXT_output_base")) { + printf("Missing required extension: EGL_EXT_output_base\n"); + exit(2); + } + + if (!CheckExtension(dev_exts, "EGL_EXT_device_drm")) { + printf("Missing required extension: EGL_EXT_device_drm\n"); + exit(2); + } + + if (!CheckExtension(dpy_exts, "EGL_EXT_output_drm")) { + printf("Missing required extension: EGL_EXT_output_drm\n"); + exit(2); + } + + if (!CheckExtension(dpy_exts, "EGL_EXT_stream_consumer_egloutput")) { + printf("Missing required extension: EGL_EXT_stream_consumer_egloutput\n"); + exit(2); + } + + // Choose a config and create a context + EGLint cfg_attr[] = { + EGL_SURFACE_TYPE, EGL_STREAM_BIT_KHR, EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, EGL_ALPHA_SIZE, 1, EGL_NONE}; + if (!eglChooseConfig(eglDisplay, cfg_attr, &egl_cfg, 1, &n) || !n) { + printf("Unable to obtain config that supports stream rendering (error 0x%x)\n", eglGetError()); + exit(9); + } + EGLint ctx_attr[] = {EGL_CONTEXT_CLIENT_VERSION, 2, EGL_NONE}; + + eglBindAPI(EGL_OPENGL_ES_API); + + eglContext = eglCreateContext(eglDisplay, egl_cfg, EGL_NO_CONTEXT, ctx_attr); + if (eglContext == EGL_NO_CONTEXT) { + printf("Unable to create context (error 0x%x)\n", eglGetError()); + exit(9); + } + printf("Obtained EGLConfig and EGLContext\n"); + + // Get the layer for this crtc/plane + EGLAttrib layer_attr[] = {EGL_NONE, EGL_NONE, EGL_NONE}; + if 
(plane >= 0) { + layer_attr[0] = EGL_DRM_PLANE_EXT; + layer_attr[1] = (EGLAttrib)drm_plane_id; + } + else { + layer_attr[0] = EGL_DRM_CRTC_EXT; + layer_attr[1] = (EGLAttrib)drm_crtc_id; + } + if (!eglGetOutputLayersEXT(eglDisplay, layer_attr, &egl_lyr, 1, &n) || !n) { + printf("Unable to obtain EGLOutputLayer for %s 0x%x\n", (plane >= 0) ? "plane" : "crtc", (int)layer_attr[1]); + exit(10); + } + printf("Obtained EGLOutputLayer\n"); + + // Create a stream and connect to the output + EGLint stream_attr[] = {EGL_STREAM_FIFO_LENGTH_KHR, fifo, EGL_NONE}; + egl_str = eglCreateStreamKHR(eglDisplay, stream_attr); + if (egl_str == EGL_NO_STREAM_KHR) { + printf("Unable to create stream (error 0x%x)\n", eglGetError()); + exit(11); + } + if (!eglStreamConsumerOutputEXT(eglDisplay, egl_str, egl_lyr)) { + printf("Unable to connect stream (error 0x%x)\n", eglGetError()); + exit(11); + } + + // Create a surface to feed the stream + EGLint srf_attr[] = {EGL_WIDTH, xsurfsize, EGL_HEIGHT, ysurfsize, EGL_NONE}; + eglSurface = eglCreateStreamProducerSurfaceKHR(eglDisplay, egl_cfg, egl_str, srf_attr); + if (eglSurface == EGL_NO_SURFACE) { + printf("Unable to create rendering surface (error 0x%x)\n", eglGetError()); + exit(12); + } + printf("Bound layer to rendering surface\n"); + + // Make current + if (!eglMakeCurrent(eglDisplay, eglSurface, eglSurface, eglContext)) { + printf("Unable to make context/surface current (error 0x%x)\n", eglGetError()); + exit(13); + } + + EGLint Context_RendererType; + eglQueryContext(eglDisplay, eglContext, EGL_CONTEXT_CLIENT_TYPE, &Context_RendererType); #if 0 switch (Context_RendererType) @@ -577,8 +557,8 @@ int graphics_setup_window(int xpos, int ypos, int width, int height, } #endif -#if 0 // obtain API function pointers _manually_ (see function pointer - // declarations above) +#if 0 // obtain API function pointers _manually_ (see function pointer + // declarations above) my_glGetError = (glGetErrorTYPE) eglGetProcAddress("glGetError"); my_glGetString = (glGetStringTYPE) eglGetProcAddress("glGetString"); my_glGetProgramiv = (glGetProgramivTYPE) eglGetProcAddress("glGetProgramiv"); @@ -593,25 +573,26 @@ int graphics_setup_window(int xpos, int ypos, int width, int height, GL_APICALL void GL_APIENTRY glBindBuffer (GLenum target, GLuint buffer); #endif - return 1; + return 1; } -void graphics_set_windowtitle(const char *windowname) { - printf(" Window title would have been: %s\n", windowname); -} +void graphics_set_windowtitle(const char *windowname) { printf(" Window title would have been: %s\n", windowname); } void graphics_swap_buffers() { eglSwapBuffers(eglDisplay, eglSurface); } -void graphics_close_window() { - if (eglDisplay != EGL_NO_DISPLAY) { - eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); +void graphics_close_window() +{ + if (eglDisplay != EGL_NO_DISPLAY) { + eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); - if (eglContext != EGL_NO_CONTEXT) eglDestroyContext(eglDisplay, eglContext); + if (eglContext != EGL_NO_CONTEXT) + eglDestroyContext(eglDisplay, eglContext); - if (eglSurface != EGL_NO_SURFACE) eglDestroySurface(eglDisplay, eglSurface); + if (eglSurface != EGL_NO_SURFACE) + eglDestroySurface(eglDisplay, eglSurface); - eglTerminate(eglDisplay); - } + eglTerminate(eglDisplay); + } #if 0 if (plane >= 0) @@ -623,5 +604,5 @@ void graphics_close_window() { NULL); #endif - printf("Released display resources\n"); + printf("Released display resources\n"); } diff --git 
a/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/simpleGLES_EGLOutput.cu b/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/simpleGLES_EGLOutput.cu index 328d16cb..a526ce46 100644 --- a/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/simpleGLES_EGLOutput.cu +++ b/Samples/8_Platform_Specific/Tegra/simpleGLES_EGLOutput/simpleGLES_EGLOutput.cu @@ -42,19 +42,19 @@ // includes, system #include +#include #include #include #include - -#include #include -void error_exit(const char *format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - exit(1); +void error_exit(const char *format, ...) +{ + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + exit(1); } #if 0 @@ -74,60 +74,60 @@ void error_exit(const char *format, ...) { #include // Utilities and timing functions -#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda.h and cuda_runtime_api.h // CUDA helper functions -#include // helper functions for CUDA error check - //#include // helper functions for CUDA/GL interop +#include // helper functions for CUDA error check + // #include // helper functions for CUDA/GL interop #include #define MAX_EPSILON_ERROR 0.0f -#define THRESHOLD 0.0f -#define REFRESH_DELAY 1 // ms +#define THRESHOLD 0.0f +#define REFRESH_DELAY 1 // ms -#define GUI_IDLE 0x100 -#define GUI_ROTATE 0x101 +#define GUI_IDLE 0x100 +#define GUI_ROTATE 0x101 #define GUI_TRANSLATE 0x102 int gui_mode; //////////////////////////////////////////////////////////////////////////////// // constants -const unsigned int window_width = 512; +const unsigned int window_width = 512; const unsigned int window_height = 512; -const unsigned int mesh_width = 256; +const unsigned int mesh_width = 256; const unsigned int mesh_height = 256; // OpenGL ES variables and interop with CUDA C -GLuint mesh_vao, mesh_vbo; +GLuint mesh_vao, mesh_vbo; struct cudaGraphicsResource *cuda_vbo_resource; -void *d_vbo_buffer = NULL; +void *d_vbo_buffer = NULL; float g_fAnim = 0.0; // UI / mouse controls -int mouse_old_x, mouse_old_y; -int mouse_buttons = 0; +int mouse_old_x, mouse_old_y; +int mouse_buttons = 0; float rotate_x = 0.0, rotate_y = 0.0; float translate_z = -3.0; StopWatchInterface *timer = NULL; // Frame statistics -int frame; -int fpsCount = 0; // FPS count for averaging -int fpsLimit = 1; // FPS limit for sampling -int g_Index = 0; -float avgFPS = 0.0f; -unsigned int frameCount = 0; +int frame; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +float avgFPS = 0.0f; +unsigned int frameCount = 0; unsigned int g_TotalErrors = 0; // Auto-Verification Code bool g_bQAReadback = false; -int *pArgc = NULL; +int *pArgc = NULL; char **pArgv = NULL; #define MAX(a, b) ((a > b) ? 
a : b) @@ -142,76 +142,76 @@ void checkResultCuda(int argc, char **argv, const GLuint &vbo); const char *sSDKsample = "simpleGLES (VBO)"; -void computeFPS() { - frameCount++; - fpsCount++; +void computeFPS() +{ + frameCount++; + fpsCount++; - if (fpsCount == fpsLimit) { - avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); - fpsCount = 0; - fpsLimit = (int)MAX(avgFPS, 1.f); + if (fpsCount == fpsLimit) { + avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + fpsCount = 0; + fpsLimit = (int)MAX(avgFPS, 1.f); - sdkResetTimer(&timer); - } + sdkResetTimer(&timer); + } - char fps[256]; - sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", - avgFPS); - graphics_set_windowtitle(fps); + char fps[256]; + sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", avgFPS); + graphics_set_windowtitle(fps); } /////////////////////////////////////////////////////////////////////////////// //! Simple kernel to modify vertex positions in sine wave pattern //! @param data data in global memory /////////////////////////////////////////////////////////////////////////////// -__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, - unsigned int height, float time) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, unsigned int height, float time) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - // calculate uv coordinates - float u = x / (float)width; - float v = y / (float)height; - u = u * 2.0f - 1.0f; - v = v * 2.0f - 1.0f; + // calculate uv coordinates + float u = x / (float)width; + float v = y / (float)height; + u = u * 2.0f - 1.0f; + v = v * 2.0f - 1.0f; - // calculate simple sine wave pattern - float freq = 4.0f; - float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f; - // write output vertex - pos[y * width + x] = make_float4(u, w, v, 1.0f); + // write output vertex + pos[y * width + x] = make_float4(u, w, v, 1.0f); } -void launch_kernel(float4 *pos, unsigned int mesh_width, - unsigned int mesh_height, float time) { - // execute the kernel - dim3 block(8, 8, 1); - dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); - simple_vbo_kernel<<>>(pos, mesh_width, mesh_height, time); +void launch_kernel(float4 *pos, unsigned int mesh_width, unsigned int mesh_height, float time) +{ + // execute the kernel + dim3 block(8, 8, 1); + dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + simple_vbo_kernel<<>>(pos, mesh_width, mesh_height, time); } //////////////////////////////////////////////////////////////////////////////// //! 
Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void runCuda(struct cudaGraphicsResource **vbo_resource) { - // map OpenGL buffer object for writing from CUDA - float4 *dptr; - cudaGraphicsMapResources(1, vbo_resource, 0); - size_t num_bytes; - cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, - *vbo_resource); - // printf("Sample CUDA mapped VBO: May access %ld bytes\n", num_bytes); +void runCuda(struct cudaGraphicsResource **vbo_resource) +{ + // map OpenGL buffer object for writing from CUDA + float4 *dptr; + cudaGraphicsMapResources(1, vbo_resource, 0); + size_t num_bytes; + cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, *vbo_resource); + // printf("Sample CUDA mapped VBO: May access %ld bytes\n", num_bytes); - // execute the kernel - // dim3 block(8, 8, 1); - // dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); - // kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); + // execute the kernel + // dim3 block(8, 8, 1); + // dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + // kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); - launch_kernel(dptr, mesh_width, mesh_height, g_fAnim); + launch_kernel(dptr, mesh_width, mesh_height, g_fAnim); - // unmap buffer object - cudaGraphicsUnmapResources(1, vbo_resource, 0); + // unmap buffer object + cudaGraphicsUnmapResources(1, vbo_resource, 0); } #ifdef _WIN32 @@ -224,348 +224,348 @@ void runCuda(struct cudaGraphicsResource **vbo_resource) { #endif #endif -void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) { - printf("sdkDumpBin: <%s>\n", filename); - FILE *fp; - FOPEN(fp, filename, "wb"); - fwrite(data, bytes, 1, fp); - fflush(fp); - fclose(fp); +void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) +{ + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); } //////////////////////////////////////////////////////////////////////////////// //! 
Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// -void runAutoTest(int devID, char **argv, char *ref_file) { - char *reference_file = NULL; - void *imageData = malloc(mesh_width * mesh_height * sizeof(float)); +void runAutoTest(int devID, char **argv, char *ref_file) +{ + char *reference_file = NULL; + void *imageData = malloc(mesh_width * mesh_height * sizeof(float)); - // execute the kernel - launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim); + // execute the kernel + launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim); - cudaDeviceSynchronize(); - getLastCudaError("launch_kernel failed"); + cudaDeviceSynchronize(); + getLastCudaError("launch_kernel failed"); - cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float), cudaMemcpyDeviceToHost); - sdkDumpBin2(imageData, mesh_width * mesh_height * sizeof(float), - "simpleGL.bin"); - reference_file = sdkFindFilePath(ref_file, argv[0]); + sdkDumpBin2(imageData, mesh_width * mesh_height * sizeof(float), "simpleGL.bin"); + reference_file = sdkFindFilePath(ref_file, argv[0]); - if (reference_file && - !sdkCompareBin2BinFloat("simpleGL.bin", reference_file, - mesh_width * mesh_height * sizeof(float), - MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) { - g_TotalErrors++; - } + if (reference_file + && !sdkCompareBin2BinFloat("simpleGL.bin", + reference_file, + mesh_width * mesh_height * sizeof(float), + MAX_EPSILON_ERROR, + THRESHOLD, + pArgv[0])) { + g_TotalErrors++; + } } //////////////////////////////////////////////////////////////////////////////// //! Display callback //////////////////////////////////////////////////////////////////////////////// -void display_thisframe(float time_delta) { - sdkStartTimer(&timer); +void display_thisframe(float time_delta) +{ + sdkStartTimer(&timer); - // run CUDA kernel to generate vertex positions - runCuda(&cuda_vbo_resource); + // run CUDA kernel to generate vertex positions + runCuda(&cuda_vbo_resource); - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - // GET_GLERROR(0); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + // GET_GLERROR(0); - // set view matrix: broken, it doesn't work in OpenGL ES! Must put into shader - // glMatrixMode(GL_MODELVIEW); - // glLoadIdentity(); - // glTranslatef(0.0, 0.0, translate_z); - // glRotatef(rotate_x, 1.0, 0.0, 0.0); - // glRotatef(rotate_y, 0.0, 1.0, 0.0); + // set view matrix: broken, it doesn't work in OpenGL ES! Must put into shader + // glMatrixMode(GL_MODELVIEW); + // glLoadIdentity(); + // glTranslatef(0.0, 0.0, translate_z); + // glRotatef(rotate_x, 1.0, 0.0, 0.0); + // glRotatef(rotate_y, 0.0, 1.0, 0.0); - glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); + glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); - // GET_GLERROR(0); - glFinish(); - // GET_GLERROR(0); + // GET_GLERROR(0); + glFinish(); + // GET_GLERROR(0); - g_fAnim += time_delta; + g_fAnim += time_delta; - sdkStopTimer(&timer); - computeFPS(); + sdkStopTimer(&timer); + computeFPS(); } //////////////////////////////////////////////////////////////////////////////// //! Check if the result is correct or write data to file for external //! 
regression testing //////////////////////////////////////////////////////////////////////////////// -void checkResultCuda(int argc, char **argv, const GLuint &vbo) { - if (!d_vbo_buffer) { - printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__); +void checkResultCuda(int argc, char **argv, const GLuint &vbo) +{ + if (!d_vbo_buffer) { + printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__); - cudaGraphicsUnregisterResource(cuda_vbo_resource); + cudaGraphicsUnregisterResource(cuda_vbo_resource); - // map buffer object - glBindBuffer(GL_ARRAY_BUFFER, vbo); - float *data = (float *)glMapBufferRange( - GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float), - GL_READ_ONLY); + // map buffer object + glBindBuffer(GL_ARRAY_BUFFER, vbo); + float *data = + (float *)glMapBufferRange(GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float), GL_READ_ONLY); - // check result - if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { - // write file for regression test - sdkWriteFile("./data/regression.dat", data, - mesh_width * mesh_height * 3, 0.0, false); + // check result + if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { + // write file for regression test + sdkWriteFile("./data/regression.dat", data, mesh_width * mesh_height * 3, 0.0, false); + } + + // unmap GL buffer object + if (!glUnmapBuffer(GL_ARRAY_BUFFER)) { + fprintf(stderr, "Unmap buffer failed.\n"); + fflush(stderr); + } + + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); + + GET_GLERROR(0); } - - // unmap GL buffer object - if (!glUnmapBuffer(GL_ARRAY_BUFFER)) { - fprintf(stderr, "Unmap buffer failed.\n"); - fflush(stderr); - } - - checkCudaErrors(cudaGraphicsGLRegisterBuffer( - &cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); - - GET_GLERROR(0); - } } GLuint mesh_shader = 0; -void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, - const char *filename, GLenum shaderType) { - FILE *file = fopen(filename, "rb"); // open shader text file - if (!file) error_exit("Filename %s does not exist\n", filename); +void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram, const char *filename, GLenum shaderType) +{ + FILE *file = fopen(filename, "rb"); // open shader text file + if (!file) + error_exit("Filename %s does not exist\n", filename); - /* get the size of the file and read it */ - fseek(file, 0, SEEK_END); - GLint size = ftell(file); - char *data = (char *)malloc(sizeof(char) * (size + 1)); - memset(data, 0, sizeof(char) * (size + 1)); - fseek(file, 0, SEEK_SET); - size_t res = fread(data, 1, size, file); - fclose(file); + /* get the size of the file and read it */ + fseek(file, 0, SEEK_END); + GLint size = ftell(file); + char *data = (char *)malloc(sizeof(char) * (size + 1)); + memset(data, 0, sizeof(char) * (size + 1)); + fseek(file, 0, SEEK_SET); + size_t res = fread(data, 1, size, file); + fclose(file); - GLuint shader = glCreateShader(shaderType); - glShaderSource(shader, 1, (const GLchar **)&data, &size); - glCompileShader(shader); + GLuint shader = glCreateShader(shaderType); + glShaderSource(shader, 1, (const GLchar **)&data, &size); + glCompileShader(shader); - GET_GLERROR(0); - GLint compile_success = 0; - glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success); - GET_GLERROR(0); + GET_GLERROR(0); + GLint compile_success = 0; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success); + GET_GLERROR(0); - if (compile_success == GL_FALSE) { - printf("Compilation of %s 
failed!\n Reason:\n", filename);
+    if (compile_success == GL_FALSE) {
+        printf("Compilation of %s failed!\n Reason:\n", filename);
 
-    GLint maxLength = 0;
-    glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);
+        GLint maxLength = 0;
+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);
 
-    char errorLog[maxLength];
-    glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]);
+        char errorLog[maxLength];
+        glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]);
 
-    printf("%s", errorLog);
+        printf("%s", errorLog);
 
-    glDeleteShader(shader);
-    exit(1);
-  }
+        glDeleteShader(shader);
+        exit(1);
+    }
 
-  glAttachShader(new_shaderprogram, shader);
-  glDeleteShader(shader);  // good to do?
+    glAttachShader(new_shaderprogram, shader);
+    glDeleteShader(shader); // good to do?
 
-  free(data);
+    free(data);
 }
 
-GLuint ShaderCreate(const char *vshader_filename,
-                    const char *fshader_filename) {
-  printf("Loading GLSL shaders %s %s\n", vshader_filename, fshader_filename);
+GLuint ShaderCreate(const char *vshader_filename, const char *fshader_filename)
+{
+    printf("Loading GLSL shaders %s %s\n", vshader_filename, fshader_filename);
 
-  GLuint new_shaderprogram = glCreateProgram();
+    GLuint new_shaderprogram = glCreateProgram();
 
-  GET_GLERROR(0);
-  if (vshader_filename)
-    readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename,
-                                     GL_VERTEX_SHADER);
+    GET_GLERROR(0);
+    if (vshader_filename)
+        readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename, GL_VERTEX_SHADER);
 
-  GET_GLERROR(0);
-  if (fshader_filename)
-    readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename,
-                                     GL_FRAGMENT_SHADER);
+    GET_GLERROR(0);
+    if (fshader_filename)
+        readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename, GL_FRAGMENT_SHADER);
 
-  GET_GLERROR(0);
+    GET_GLERROR(0);
 
-  glLinkProgram(new_shaderprogram);
+    glLinkProgram(new_shaderprogram);
 
-  GET_GLERROR(0);
-  GLint link_success;
-  glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);
+    GET_GLERROR(0);
+    GLint link_success;
+    glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);
 
-  if (link_success == GL_FALSE) {
-    printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename,
-           fshader_filename);
+    if (link_success == GL_FALSE) {
+        printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename, fshader_filename);
 
-    GLint maxLength = 0;
-    glGetShaderiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);
+        GLint maxLength = 0;
+        glGetProgramiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);
 
-    char errorLog[maxLength];
-    glGetShaderInfoLog(new_shaderprogram, maxLength, &maxLength, &errorLog[0]);
+        char errorLog[maxLength];
+        glGetProgramInfoLog(new_shaderprogram, maxLength, &maxLength, &errorLog[0]);
 
-    printf("%s", errorLog);
+        printf("%s", errorLog);
 
-    exit(EXIT_FAILURE);
-  }
+        exit(EXIT_FAILURE);
+    }
 
-  return new_shaderprogram;
+    return new_shaderprogram;
 }
 
 //===========================================================================
 // InitGraphicsState() - initialize OpenGL
 //===========================================================================
-static void InitGraphicsState(char **argv) {
-  char *GL_version = (char *)glGetString(GL_VERSION);
-  char *GL_vendor = (char *)glGetString(GL_VENDOR);
-  char *GL_renderer = (char *)glGetString(GL_RENDERER);
+static void InitGraphicsState(char **argv)
+{
+    char *GL_version = (char *)glGetString(GL_VERSION);
+    char *GL_vendor = (char *)glGetString(GL_VENDOR);
+    char *GL_renderer = (char *)glGetString(GL_RENDERER);
 
-  printf("Version: %s\n",
GL_version); - printf("Vendor: %s\n", GL_vendor); - printf("Renderer: %s\n", GL_renderer); + printf("Version: %s\n", GL_version); + printf("Vendor: %s\n", GL_vendor); + printf("Renderer: %s\n", GL_renderer); - // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!) - glGenVertexArrays(1, &mesh_vao); // Features' Vertex Array Object allocation - glBindVertexArray(mesh_vao); // bind VAO + // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!) + glGenVertexArrays(1, &mesh_vao); // Features' Vertex Array Object allocation + glBindVertexArray(mesh_vao); // bind VAO - // initialize buffer object - glGenBuffers(1, &mesh_vbo); - glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo); + // initialize buffer object + glGenBuffers(1, &mesh_vbo); + glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo); - unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); - glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW); - glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0); - glEnableVertexAttribArray(0); + unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); + glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW); + glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0); + glEnableVertexAttribArray(0); - checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo, - cudaGraphicsMapFlagsNone)); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo, cudaGraphicsMapFlagsNone)); - // glBindVertexArray(0); // keep above Vertex Array Object bound (it's the - // only one throughout) + // glBindVertexArray(0); // keep above Vertex Array Object bound (it's the + // only one throughout) - // GLSL stuff - char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", argv[0]); - char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", argv[0]); + // GLSL stuff + char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", argv[0]); + char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", argv[0]); - if (vertex_shader_path == NULL || fragment_shader_path == NULL) { - printf("Error finding shader file\n"); - exit(EXIT_FAILURE); - } + if (vertex_shader_path == NULL || fragment_shader_path == NULL) { + printf("Error finding shader file\n"); + exit(EXIT_FAILURE); + } - mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path); - GET_GLERROR(0); + mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path); + GET_GLERROR(0); - free(vertex_shader_path); - free(fragment_shader_path); + free(vertex_shader_path); + free(fragment_shader_path); - glUseProgram(mesh_shader); + glUseProgram(mesh_shader); } //////////////////////////////////////////////////////////////////////////////// //! 
Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// -bool runTest(int argc, char **argv, char *ref_file) { - // Create the CUTIL timer - sdkCreateTimer(&timer); +bool runTest(int argc, char **argv, char *ref_file) +{ + // Create the CUTIL timer + sdkCreateTimer(&timer); - int devID = 0; + int devID = 0; #if defined(__aarch64__) || defined(__arm__) - // find iGPU on the system which is compute capable which will perform - // GLES-CUDA interop - devID = findIntegratedGPU(); + // find iGPU on the system which is compute capable which will perform + // GLES-CUDA interop + devID = findIntegratedGPU(); #else - // use command-line specified CUDA device, otherwise use device with highest - // Gflops/s - devID = findCudaDevice(argc, (const char **)argv); + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + devID = findCudaDevice(argc, (const char **)argv); #endif - // command line mode only - if (ref_file != NULL) { - // create VBO - checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, - mesh_width * mesh_height * 4 * sizeof(float))); + // command line mode only + if (ref_file != NULL) { + // create VBO + checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width * mesh_height * 4 * sizeof(float))); - // run the cuda part - runAutoTest(devID, argv, ref_file); + // run the cuda part + runAutoTest(devID, argv, ref_file); - // check result of Cuda step - checkResultCuda(argc, argv, mesh_vbo); + // check result of Cuda step + checkResultCuda(argc, argv, mesh_vbo); - cudaFree(d_vbo_buffer); - d_vbo_buffer = NULL; - } else { - // this would use command-line specified CUDA device, note that CUDA - // defaults to highest Gflops/s device - if (checkCmdLineFlag(argc, (const char **)argv, "device")) - error_exit("Device setting not yet implemented!\n"); + cudaFree(d_vbo_buffer); + d_vbo_buffer = NULL; + } + else { + // this would use command-line specified CUDA device, note that CUDA + // defaults to highest Gflops/s device + if (checkCmdLineFlag(argc, (const char **)argv, "device")) + error_exit("Device setting not yet implemented!\n"); - // create X11 window and set up associated OpenGL ES context - graphics_setup_window(0, 0, window_width, window_height, sSDKsample); + // create X11 window and set up associated OpenGL ES context + graphics_setup_window(0, 0, window_width, window_height, sSDKsample); - InitGraphicsState(argv); // set up GLES stuff + InitGraphicsState(argv); // set up GLES stuff - glClearColor(0, 0.5, 1, 1); // blue-ish background - glClear(GL_COLOR_BUFFER_BIT); + glClearColor(0, 0.5, 1, 1); // blue-ish background + glClear(GL_COLOR_BUFFER_BIT); - // printf("WP%d\n", __LINE__); - graphics_swap_buffers(); + // printf("WP%d\n", __LINE__); + graphics_swap_buffers(); - int frame = 0; + int frame = 0; - while (frame < 1000) { - display_thisframe(0.010); - usleep(1000); // need not take full CPU and GPU + while (frame < 1000) { + display_thisframe(0.010); + usleep(1000); // need not take full CPU and GPU - graphics_swap_buffers(); - // printf("frame %d\n",frame++); + graphics_swap_buffers(); + // printf("frame %d\n",frame++); + } + + // NOTE: Before destroying OpenGL ES context, must unregister all shared + // resources from CUDA ! + cudaGraphicsUnregisterResource(cuda_vbo_resource); + + graphics_close_window(); // close window and destroy OpenGL ES context + + sdkDeleteTimer(&timer); } - // NOTE: Before destroying OpenGL ES context, must unregister all shared - // resources from CUDA ! 
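The per-frame pattern that display_thisframe() and runCuda() implement above, condensed into a hedged sketch (all names are the sample's own; grid/block match launch_kernel(); error checking elided):

    float4 *dptr;
    size_t num_bytes;
    cudaGraphicsMapResources(1, &cuda_vbo_resource, 0); // borrow the GL VBO for CUDA
    cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource);
    dim3 block(8, 8, 1);
    dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);
    simple_vbo_kernel<<<grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); // rewrite the vertices
    cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0); // hand the buffer back to GLES
    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); // GLES draws the freshly written data
    graphics_swap_buffers();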
-    cudaGraphicsUnregisterResource(cuda_vbo_resource);
-
-    graphics_close_window();  // close window and destroy OpenGL ES context
-
-    sdkDeleteTimer(&timer);
-  }
-
-  return true;
+    return true;
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  char *ref_file = NULL;
+int main(int argc, char **argv)
+{
+    char *ref_file = NULL;

-  pArgc = &argc;
-  pArgv = argv;
+    pArgc = &argc;
+    pArgv = argv;

 #if defined(__linux__)
-  setenv("DISPLAY", ":0", 0);
+    setenv("DISPLAY", ":0", 0);
 #endif

-  printf("%s starting...\n", sSDKsample);
+    printf("%s starting...\n", sSDKsample);

-  if (argc > 1) {
-    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
-      // In this mode, we run without OpenGL and see if VBO is generated
-      // correctly
-      getCmdLineArgumentString(argc, (const char **)argv, "file",
-                               (char **)&ref_file);
+    if (argc > 1) {
+        if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
+            // In this mode, we run without OpenGL and see if VBO is generated
+            // correctly
+            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
+        }
     }
-  }

-  printf("\n");
+    printf("\n");

-  runTest(argc, argv, ref_file);
+    runTest(argc, argv, ref_file);

-  printf("%s completed, returned %s\n", sSDKsample,
-         (g_TotalErrors == 0) ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sSDKsample, (g_TotalErrors == 0) ? "OK" : "ERROR!");

-  exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 }
diff --git a/bin/x86_64/linux/release/APM_BlackScholes.txt b/bin/x86_64/linux/release/APM_BlackScholes.txt
index f38b8578..82ba1aca 100644
--- a/bin/x86_64/linux/release/APM_BlackScholes.txt
+++ b/bin/x86_64/linux/release/APM_BlackScholes.txt
@@ -9,10 +9,10 @@ Initializing data...
 Data init done.
 Executing Black-Scholes GPU kernel (512 iterations)...
-Options count : 8000000
+Options count : 8000000
 BlackScholesGPU() time : 0.048059 msec
 Effective memory bandwidth: 1664.634581 GB/s
-Gigaoptions per second : 166.463458
+Gigaoptions per second : 166.463458

 BlackScholes, Throughput = 166.4635 GOptions/s, Time = 0.00005 s, Size = 8000000 options, NumDevsUsed = 1, Workgroup = 128
diff --git a/bin/x86_64/linux/release/APM_BlackScholes_nvrtc.txt b/bin/x86_64/linux/release/APM_BlackScholes_nvrtc.txt
index a82bcc72..6cfcbee1 100644
--- a/bin/x86_64/linux/release/APM_BlackScholes_nvrtc.txt
+++ b/bin/x86_64/linux/release/APM_BlackScholes_nvrtc.txt
@@ -10,10 +10,10 @@ Initializing data...
 Data init done.
 Executing Black-Scholes GPU kernel (512 iterations)...
-Options count : 8000000
+Options count : 8000000
 BlackScholesGPU() time : 0.047896 msec
 Effective memory bandwidth: 1670.268678 GB/s
-Gigaoptions per second : 167.026868
+Gigaoptions per second : 167.026868

 BlackScholes, Throughput = 167.0269 GOptions/s, Time = 0.00005 s, Size = 8000000 options, NumDevsUsed = 1, Workgroup = 128
diff --git a/bin/x86_64/linux/release/APM_UnifiedMemoryPerf.txt b/bin/x86_64/linux/release/APM_UnifiedMemoryPerf.txt
index 1d88907c..18f43ef8 100644
--- a/bin/x86_64/linux/release/APM_UnifiedMemoryPerf.txt
+++ b/bin/x86_64/linux/release/APM_UnifiedMemoryPerf.txt
@@ -2,7 +2,7 @@ GPU Device 0: "Hopper" with compute capability 9.0

 Running ........................................................
-Overall Time For matrixMultiplyPerf
+Overall Time For matrixMultiplyPerf

 Printing Average of 20 measurements in (ms)
 Size_KB UMhint UMhntAs UMeasy 0Copy MemCopy CpAsync CpHpglk CpPglAs
diff --git a/bin/x86_64/linux/release/APM_batchCUBLAS.txt b/bin/x86_64/linux/release/APM_batchCUBLAS.txt
index f9201c70..0a8230ea 100644
--- a/bin/x86_64/linux/release/APM_batchCUBLAS.txt
+++ b/bin/x86_64/linux/release/APM_batchCUBLAS.txt
@@ -3,7 +3,7 @@ batchCUBLAS Starting...

 GPU Device 0: "Hopper" with compute capability 9.0

- ==== Running single kernels ====
+ ==== Running single kernels ====
 Testing sgemm
 #### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbf800000, -1) beta= (0x40000000, 2)
@@ -16,7 +16,7 @@ Testing dgemm
 ^^^^ elapsed = 0.00003910 sec GFLOPS=107.269
 @@@@ dgemm test OK

- ==== Running N=10 without streams ====
+ ==== Running N=10 without streams ====
 Testing sgemm
 #### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0xbf800000, -1) beta= (0x00000000, 0)
@@ -29,7 +29,7 @@ Testing dgemm
 ^^^^ elapsed = 0.00144100 sec GFLOPS=29.1069
 @@@@ dgemm test OK

- ==== Running N=10 with streams ====
+ ==== Running N=10 with streams ====
 Testing sgemm
 #### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0x40000000, 2) beta= (0x40000000, 2)
@@ -42,7 +42,7 @@ Testing dgemm
 ^^^^ elapsed = 0.00014997 sec GFLOPS=279.685
 @@@@ dgemm test OK

- ==== Running N=10 batched ====
+ ==== Running N=10 batched ====
 Testing sgemm
 #### args: ta=0 tb=0 m=128 n=128 k=128 alpha = (0x3f800000, 1) beta= (0xbf800000, -1)
diff --git a/bin/x86_64/linux/release/APM_binaryPartitionCG.txt b/bin/x86_64/linux/release/APM_binaryPartitionCG.txt
index f4ec7e58..b43e9ab3 100644
--- a/bin/x86_64/linux/release/APM_binaryPartitionCG.txt
+++ b/bin/x86_64/linux/release/APM_binaryPartitionCG.txt
@@ -6,4 +6,3 @@ Launching 228 blocks with 1024 threads...
 Array size = 102400 Num of Odds = 50945 Sum of Odds = 1272565 Sum of Evens 1233938
 ...Done.

-
diff --git a/bin/x86_64/linux/release/APM_binomialOptions.txt b/bin/x86_64/linux/release/APM_binomialOptions.txt
index 79dc9863..ba730256 100644
--- a/bin/x86_64/linux/release/APM_binomialOptions.txt
+++ b/bin/x86_64/linux/release/APM_binomialOptions.txt
@@ -3,10 +3,10 @@ GPU Device 0: "Hopper" with compute capability 9.0

 Generating input data...
 Running GPU binomial tree...
-Options count : 1024
-Time steps : 2048
+Options count : 1024
+Time steps : 2048
 binomialOptionsGPU() time: 2.081000 msec
-Options per second : 492071.098457
+Options per second : 492071.098457
 Running CPU binomial tree...
 Comparing the results...
 GPU binomial vs. Black-Scholes
diff --git a/bin/x86_64/linux/release/APM_binomialOptions_nvrtc.txt b/bin/x86_64/linux/release/APM_binomialOptions_nvrtc.txt
index e68d12de..0bb26949 100644
--- a/bin/x86_64/linux/release/APM_binomialOptions_nvrtc.txt
+++ b/bin/x86_64/linux/release/APM_binomialOptions_nvrtc.txt
@@ -4,10 +4,10 @@ Running GPU binomial tree...
 > Using CUDA Device [0]: NVIDIA H100 PCIe
 > Using CUDA Device [0]: NVIDIA H100 PCIe
 > GPU Device has SM 9.0 compute capability
-Options count : 1024
-Time steps : 2048
+Options count : 1024
+Time steps : 2048
 binomialOptionsGPU() time: 3021.375000 msec
-Options per second : 338.918539
+Options per second : 338.918539
 Running CPU binomial tree...
 Comparing the results...
 GPU binomial vs. Black-Scholes
diff --git a/bin/x86_64/linux/release/APM_conjugateGradientMultiBlockCG.txt b/bin/x86_64/linux/release/APM_conjugateGradientMultiBlockCG.txt
index a0d0857e..9d14c109 100644
--- a/bin/x86_64/linux/release/APM_conjugateGradientMultiBlockCG.txt
+++ b/bin/x86_64/linux/release/APM_conjugateGradientMultiBlockCG.txt
@@ -4,5 +4,5 @@ GPU Device 0: "Hopper" with compute capability 9.0
 > GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
 GPU Final, residual = 1.600115e-06, kernel execution time = 16.014656 ms

-Test Summary: Error amount = 0.000000
+Test Summary: Error amount = 0.000000
 &&&& conjugateGradientMultiBlockCG PASSED
diff --git a/bin/x86_64/linux/release/APM_conjugateGradientMultiDeviceCG.txt b/bin/x86_64/linux/release/APM_conjugateGradientMultiDeviceCG.txt
index 3998a603..fee0ecb4 100644
--- a/bin/x86_64/linux/release/APM_conjugateGradientMultiDeviceCG.txt
+++ b/bin/x86_64/linux/release/APM_conjugateGradientMultiDeviceCG.txt
@@ -1,4 +1,4 @@
 Starting [conjugateGradientMultiDeviceCG]...
 GPU Device 0: "NVIDIA H100 PCIe" with compute capability 9.0
-No two or more GPUs with same architecture capable of concurrentManagedAccess found.
+No two or more GPUs with same architecture capable of concurrentManagedAccess found.
 Waiving the sample
diff --git a/bin/x86_64/linux/release/APM_conjugateGradientPrecond.txt b/bin/x86_64/linux/release/APM_conjugateGradientPrecond.txt
index f973b1d0..9a1e1520 100644
--- a/bin/x86_64/linux/release/APM_conjugateGradientPrecond.txt
+++ b/bin/x86_64/linux/release/APM_conjugateGradientPrecond.txt
@@ -1,19 +1,18 @@
 conjugateGradientPrecond starting...
 GPU Device 0: "Hopper" with compute capability 9.0
-GPU selected Device ID = 0
+GPU selected Device ID = 0
 > GPU device has 114 Multi-Processors, SM 9.0 compute capabilities
 laplace dimension = 128
-Convergence of CG without preconditioning:
-  iteration = 564, residual = 9.174634e-13
-  Convergence Test: OK
+Convergence of CG without preconditioning:
+  iteration = 564, residual = 9.174634e-13
+  Convergence Test: OK

-Convergence of CG using ILU(0) preconditioning:
-  iteration = 188, residual = 9.084683e-13
-  Convergence Test: OK
+Convergence of CG using ILU(0) preconditioning:
+  iteration = 188, residual = 9.084683e-13
+  Convergence Test: OK

 Test Summary: Counted total of 0 errors
 qaerr1 = 0.000005 qaerr2 = 0.000003
-
diff --git a/bin/x86_64/linux/release/APM_cppOverload.txt b/bin/x86_64/linux/release/APM_cppOverload.txt
index b4ff5085..71332c78 100644
--- a/bin/x86_64/linux/release/APM_cppOverload.txt
+++ b/bin/x86_64/linux/release/APM_cppOverload.txt
@@ -28,4 +28,3 @@ Number of Registers: 14
 PTX Version: 90
 Binary Version: 90
 simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED
-
diff --git a/bin/x86_64/linux/release/APM_cuSolverDn_LinearSolver.txt b/bin/x86_64/linux/release/APM_cuSolverDn_LinearSolver.txt
index 7410608c..fe51a278 100644
--- a/bin/x86_64/linux/release/APM_cuSolverDn_LinearSolver.txt
+++ b/bin/x86_64/linux/release/APM_cuSolverDn_LinearSolver.txt
@@ -6,10 +6,10 @@ sparse matrix A is 900 x 900 with 7744 nonzeros, base=1
 step 2: convert CSR(A) to dense matrix
 step 3: set right hand side vector (b) to 1
 step 4: prepare data on device
-step 5: solve A*x = b
+step 5: solve A*x = b
 timing: cholesky = 0.000789 sec
 step 6: evaluate residual
-|b - A*x| = 1.278977E-13
-|A| = 1.600000E+01
-|x| = 2.357708E+01
-|b - A*x|/(|A|*|x|) = 3.390413E-16
+|b - A*x| = 1.278977E-13
+|A| = 1.600000E+01
+|x| = 2.357708E+01
+|b - A*x|/(|A|*|x|) = 3.390413E-16
diff --git a/bin/x86_64/linux/release/APM_cuSolverRf.txt b/bin/x86_64/linux/release/APM_cuSolverRf.txt
index e2191de7..87c8f655 100644
--- a/bin/x86_64/linux/release/APM_cuSolverRf.txt
+++ b/bin/x86_64/linux/release/APM_cuSolverRf.txt
@@ -3,47 +3,47 @@ step 1.1: read matrix market format
 GPU Device 0: "Hopper" with compute capability 9.0

 Using default input file [../../../../Samples/4_CUDA_Libraries/cuSolverRf/lap2D_5pt_n100.mtx]
-WARNING: cusolverRf only works for base-0
+WARNING: cusolverRf only works for base-0
 sparse matrix A is 10000 x 10000 with 49600 nonzeros, base=0
 step 1.2: set right hand side vector (b) to 1
 step 2: reorder the matrix to reduce zero fill-in
-    Q = symrcm(A) or Q = symamd(A)
+    Q = symrcm(A) or Q = symamd(A)
 step 3: B = Q*A*Q^T
 step 4: solve A*x = b by LU(B) in cusolverSp
 step 4.1: create opaque info structure
 step 4.2: analyze LU(B) to know structure of Q and R, and upper bound for nnz(L+U)
 step 4.3: workspace for LU(B)
-step 4.4: compute Ppivot*B = L*U
-step 4.5: check if the matrix is singular
-step 4.6: solve A*x = b
-    i.e. solve B*(Qx) = Q*b
+step 4.4: compute Ppivot*B = L*U
+step 4.5: check if the matrix is singular
+step 4.6: solve A*x = b
+    i.e. solve B*(Qx) = Q*b
 step 4.7: evaluate residual r = b - A*x (result on CPU)
-(CPU) |b - A*x| = 4.547474E-12
-(CPU) |A| = 8.000000E+00
-(CPU) |x| = 7.513384E+02
-(CPU) |b - A*x|/(|A|*|x|) = 7.565621E-16
-step 5: extract P, Q, L and U from P*B*Q^T = L*U
+(CPU) |b - A*x| = 4.547474E-12
+(CPU) |A| = 8.000000E+00
+(CPU) |x| = 7.513384E+02
+(CPU) |b - A*x|/(|A|*|x|) = 7.565621E-16
+step 5: extract P, Q, L and U from P*B*Q^T = L*U
 L has implicit unit diagonal
 nnzL = 671550, nnzU = 681550
 step 6: form P*A*Q^T = L*U
 step 6.1: P = Plu*Qreroder
-step 6.2: Q = Qlu*Qreorder
+step 6.2: Q = Qlu*Qreorder
 step 7: create cusolverRf handle
-step 8: set parameters for cusolverRf
-step 9: assemble P*A*Q = L*U
-step 10: analyze to extract parallelism
-step 11: import A to cusolverRf
-step 12: refactorization
-step 13: solve A*x = b
+step 8: set parameters for cusolverRf
+step 9: assemble P*A*Q = L*U
+step 10: analyze to extract parallelism
+step 11: import A to cusolverRf
+step 12: refactorization
+step 13: solve A*x = b
 step 14: evaluate residual r = b - A*x (result on GPU)
-(GPU) |b - A*x| = 4.320100E-12
-(GPU) |A| = 8.000000E+00
-(GPU) |x| = 7.513384E+02
-(GPU) |b - A*x|/(|A|*|x|) = 7.187340E-16
-===== statistics
+(GPU) |b - A*x| = 4.320100E-12
+(GPU) |A| = 8.000000E+00
+(GPU) |x| = 7.513384E+02
+(GPU) |b - A*x|/(|A|*|x|) = 7.187340E-16
+===== statistics
 nnz(A) = 49600, nnz(L+U) = 1353100, zero fill-in ratio = 27.280242
-===== timing profile
+===== timing profile
 reorder A : 0.003304 sec
 B = Q*A*Q^T : 0.000761 sec
diff --git a/bin/x86_64/linux/release/APM_cuSolverSp_LinearSolver.txt b/bin/x86_64/linux/release/APM_cuSolverSp_LinearSolver.txt
index 38e871bd..ac248568 100644
--- a/bin/x86_64/linux/release/APM_cuSolverSp_LinearSolver.txt
+++ b/bin/x86_64/linux/release/APM_cuSolverSp_LinearSolver.txt
@@ -5,27 +5,27 @@ step 1: read matrix market format
 sparse matrix A is 10000 x 10000 with 49600 nonzeros, base=1
 step 2: reorder the matrix A to minimize zero fill-in
     if the user choose a reordering by -P=symrcm, -P=symamd or -P=metis
-step 2.1: no reordering is chosen, Q = 0:n-1
-step 2.2: B = A(Q,Q)
-step 3: b(j) = 1 + j/n
+step 2.1: no reordering is chosen, Q = 0:n-1
+step 2.2: B = A(Q,Q)
+step 3: b(j) = 1 + j/n
 step 4: prepare data on device
-step 5: solve A*x = b on CPU
+step 5: solve A*x = b on CPU
 step 6: evaluate residual r = b - A*x (result on CPU)
-(CPU) |b - A*x| = 5.393685E-12
-(CPU) |A| = 8.000000E+00
-(CPU) |x| = 1.136492E+03
-(CPU) |b| = 1.999900E+00
-(CPU) |b - A*x|/(|A|*|x| + |b|) = 5.931079E-16
+(CPU) |b - A*x| = 5.393685E-12
+(CPU) |A| = 8.000000E+00
+(CPU) |x| = 1.136492E+03
+(CPU) |b| = 1.999900E+00
+(CPU) |b - A*x|/(|A|*|x| + |b|) = 5.931079E-16
 step 7: solve A*x = b on GPU
 step 8: evaluate residual r = b - A*x (result on GPU)
-(GPU) |b - A*x| = 1.970424E-12
-(GPU) |A| = 8.000000E+00
-(GPU) |x| = 1.136492E+03
-(GPU) |b| = 1.999900E+00
-(GPU) |b - A*x|/(|A|*|x| + |b|) = 2.166745E-16
+(GPU) |b - A*x| = 1.970424E-12
+(GPU) |A| = 8.000000E+00
+(GPU) |x| = 1.136492E+03
+(GPU) |b| = 1.999900E+00
+(GPU) |b - A*x|/(|A|*|x| + |b|) = 2.166745E-16
 timing chol: CPU = 0.097956 sec , GPU = 0.103812 sec
-show last 10 elements of solution vector (GPU)
-consistent result for different reordering and solver
+show last 10 elements of solution vector (GPU)
+consistent result for different reordering and solver
 x[9990] = 3.000016E+01
 x[9991] = 2.807343E+01
 x[9992] = 2.601354E+01
diff --git a/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelCholesky.txt b/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelCholesky.txt
index f5060534..e4268ab0 100644
--- a/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelCholesky.txt
+++ b/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelCholesky.txt
@@ -6,19 +6,19 @@ sparse matrix A is 10000 x 10000 with 49600 nonzeros, base=1
 step 2: create opaque info structure
 step 3: analyze chol(A) to know structure of L
 step 4: workspace for chol(A)
-step 5: compute A = L*L^T
-step 6: check if the matrix is singular
-step 7: solve A*x = b
+step 5: compute A = L*L^T
+step 6: check if the matrix is singular
+step 7: solve A*x = b
 step 8: evaluate residual r = b - A*x (result on CPU)
-(CPU) |b - A*x| = 3.637979E-12
-(CPU) |A| = 8.000000E+00
-(CPU) |x| = 7.513384E+02
-(CPU) |b - A*x|/(|A|*|x|) = 6.052497E-16
+(CPU) |b - A*x| = 3.637979E-12
+(CPU) |A| = 8.000000E+00
+(CPU) |x| = 7.513384E+02
+(CPU) |b - A*x|/(|A|*|x|) = 6.052497E-16
 step 9: create opaque info structure
 step 10: analyze chol(A) to know structure of L
 step 11: workspace for chol(A)
-step 12: compute A = L*L^T
-step 13: check if the matrix is singular
-step 14: solve A*x = b
-(GPU) |b - A*x| = 1.477929E-12
-(GPU) |b - A*x|/(|A|*|x|) = 2.458827E-16
+step 12: compute A = L*L^T
+step 13: check if the matrix is singular
+step 14: solve A*x = b
+(GPU) |b - A*x| = 1.477929E-12
+(GPU) |b - A*x|/(|A|*|x|) = 2.458827E-16
diff --git a/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelQR.txt b/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelQR.txt
index a713108a..fb8fe23a 100644
--- a/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelQR.txt
+++ b/bin/x86_64/linux/release/APM_cuSolverSp_LowlevelQR.txt
@@ -6,20 +6,20 @@ sparse matrix A is 1024 x 1024 with 3008 nonzeros, base=1
 step 2: create opaque info structure
 step 3: analyze qr(A) to know structure of L
 step 4: workspace for qr(A)
-step 5: compute A = L*L^T
-step 6: check if the matrix is singular
-step 7: solve A*x = b
+step 5: compute A = L*L^T
+step 6: check if the matrix is singular
+step 7: solve A*x = b
 step 8: evaluate residual r = b - A*x (result on CPU)
-(CPU) |b - A*x| = 5.329071E-15
-(CPU) |A| = 6.000000E+00
-(CPU) |x| = 5.000000E-01
-(CPU) |b - A*x|/(|A|*|x|) = 1.776357E-15
+(CPU) |b - A*x| = 5.329071E-15
+(CPU) |A| = 6.000000E+00
+(CPU) |x| = 5.000000E-01
+(CPU) |b - A*x|/(|A|*|x|) = 1.776357E-15
 step 9: create opaque info structure
 step 10: analyze qr(A) to know structure of L
 step 11: workspace for qr(A)
 GPU buffer size = 3751424 bytes
-step 12: compute A = L*L^T
-step 13: check if the matrix is singular
-step 14: solve A*x = b
-(GPU) |b - A*x| = 4.218847E-15
-(GPU) |b - A*x|/(|A|*|x|) = 1.406282E-15
+step 12: compute A = L*L^T
+step 13: check if the matrix is singular
+step 14: solve A*x = b
+(GPU) |b - A*x| = 4.218847E-15
+(GPU) |b - A*x|/(|A|*|x|) = 1.406282E-15
diff --git a/bin/x86_64/linux/release/APM_cudaTensorCoreGemm.txt b/bin/x86_64/linux/release/APM_cudaTensorCoreGemm.txt
index b471728d..da7266b5 100644
--- a/bin/x86_64/linux/release/APM_cudaTensorCoreGemm.txt
+++ b/bin/x86_64/linux/release/APM_cudaTensorCoreGemm.txt
@@ -6,6 +6,6 @@ N: 4096 (16 x 256)
 K: 4096 (16 x 256)
 Preparing data for GPU...
 Required shared memory size: 64 Kb
-Computing... using high performance kernel compute_gemm
+Computing... using high performance kernel compute_gemm
 Time: 1.223904 ms
 TFLOPS: 112.30
diff --git a/bin/x86_64/linux/release/APM_dct8x8.txt b/bin/x86_64/linux/release/APM_dct8x8.txt
index 1d261602..e5516e5c 100644
--- a/bin/x86_64/linux/release/APM_dct8x8.txt
+++ b/bin/x86_64/linux/release/APM_dct8x8.txt
@@ -16,9 +16,9 @@ Dumping result to teapot512_gold2.bmp... Success
 Dumping result to teapot512_cuda1.bmp... Success
 Dumping result to teapot512_cuda2.bmp... Success
 Dumping result to teapot512_cuda_short.bmp... Success
-Processing time (CUDA 1) : 0.021800 ms
-Processing time (CUDA 2) : 0.003180 ms
-Processing time (CUDA short): 0.033000 ms
+Processing time (CUDA 1) : 0.021800 ms
+Processing time (CUDA 2) : 0.003180 ms
+Processing time (CUDA short): 0.033000 ms
 PSNR Original <---> CPU(Gold 1) : 32.527462
 PSNR Original <---> CPU(Gold 2) : 32.527309
 PSNR Original <---> GPU(CUDA 1) : 32.527184
diff --git a/bin/x86_64/linux/release/APM_deviceQueryDrv.txt b/bin/x86_64/linux/release/APM_deviceQueryDrv.txt
index 6ae5fdf4..e8b99ea6 100644
--- a/bin/x86_64/linux/release/APM_deviceQueryDrv.txt
+++ b/bin/x86_64/linux/release/APM_deviceQueryDrv.txt
@@ -1,6 +1,6 @@
 ./deviceQueryDrv Starting...

-CUDA Device Query (Driver API) statically linked version
+CUDA Device Query (Driver API) statically linked version
 Detected 1 CUDA Capable device(s)

 Device 0: "NVIDIA H100 PCIe"
diff --git a/bin/x86_64/linux/release/APM_fp16ScalarProduct.txt b/bin/x86_64/linux/release/APM_fp16ScalarProduct.txt
index ec58bd69..be322e76 100644
--- a/bin/x86_64/linux/release/APM_fp16ScalarProduct.txt
+++ b/bin/x86_64/linux/release/APM_fp16ScalarProduct.txt
@@ -1,5 +1,5 @@
 GPU Device 0: "Hopper" with compute capability 9.0

-Result native operators : 644622.000000
-Result intrinsics : 644622.000000
+Result native operators : 644622.000000
+Result intrinsics : 644622.000000
 &&&& fp16ScalarProduct PASSED
diff --git a/bin/x86_64/linux/release/APM_immaTensorCoreGemm.txt b/bin/x86_64/linux/release/APM_immaTensorCoreGemm.txt
index 0a0b0cf8..de18873a 100644
--- a/bin/x86_64/linux/release/APM_immaTensorCoreGemm.txt
+++ b/bin/x86_64/linux/release/APM_immaTensorCoreGemm.txt
@@ -6,6 +6,6 @@ N: 4096 (16 x 256)
 K: 4096 (16 x 256)
 Preparing data for GPU...
 Required shared memory size: 64 Kb
-Computing... using high performance kernel compute_gemm_imma
+Computing... using high performance kernel compute_gemm_imma
 Time: 0.629184 ms
 TOPS: 218.44
diff --git a/bin/x86_64/linux/release/APM_interval.txt b/bin/x86_64/linux/release/APM_interval.txt
index 8f9cb27c..f6d8b36f 100644
--- a/bin/x86_64/linux/release/APM_interval.txt
+++ b/bin/x86_64/linux/release/APM_interval.txt
@@ -13,4 +13,3 @@ Number of equations solved: 65536
 Time per equation: 0.616870105266571 us
 Check against Host computation...

-
diff --git a/bin/x86_64/linux/release/APM_p2pBandwidthLatencyTest.txt b/bin/x86_64/linux/release/APM_p2pBandwidthLatencyTest.txt
index 19ba7393..fc1d4fc7 100644
--- a/bin/x86_64/linux/release/APM_p2pBandwidthLatencyTest.txt
+++ b/bin/x86_64/linux/release/APM_p2pBandwidthLatencyTest.txt
@@ -8,28 +8,28 @@ P2P Connectivity Matrix
 D\D 0
 0 1
 Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)
- D\D 0
- 0 1628.72
+ D\D 0
+ 0 1628.72
 Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)
- D\D 0
- 0 1625.75
+ D\D 0
+ 0 1625.75
 Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
- D\D 0
- 0 1668.11
+ D\D 0
+ 0 1668.11
 Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)
- D\D 0
- 0 1668.39
+ D\D 0
+ 0 1668.39
 P2P=Disabled Latency Matrix (us)
- GPU 0
- 0 2.67
+ GPU 0
+ 0 2.67

- CPU 0
- 0 2.04
+ CPU 0
+ 0 2.04
 P2P=Enabled Latency (P2P Writes) Matrix (us)
- GPU 0
- 0 2.68
+ GPU 0
+ 0 2.68

- CPU 0
- 0 2.02
+ CPU 0
+ 0 2.02

 NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
diff --git a/bin/x86_64/linux/release/APM_segmentationTreeThrust.txt b/bin/x86_64/linux/release/APM_segmentationTreeThrust.txt
index eff36b2f..2f33f37e 100644
--- a/bin/x86_64/linux/release/APM_segmentationTreeThrust.txt
+++ b/bin/x86_64/linux/release/APM_segmentationTreeThrust.txt
@@ -4,4 +4,3 @@ GPU Device 0: "Hopper" with compute capability 9.0
 * Building segmentation tree... done in 24.6388 (ms)
 * Dumping levels for each tree...

-
diff --git a/bin/x86_64/linux/release/APM_shfl_scan.txt b/bin/x86_64/linux/release/APM_shfl_scan.txt
index f386deac..f7af0ecb 100644
--- a/bin/x86_64/linux/release/APM_shfl_scan.txt
+++ b/bin/x86_64/linux/release/APM_shfl_scan.txt
@@ -20,5 +20,5 @@ CPU sum (naive) took 0.017810 ms
 Computing Integral Image Test on size 1920 x 1080 synthetic data
 ---------------------------------------------------
 Method: Fast Time (GPU Timer): 0.008032 ms Diff = 0
-Method: Vertical Scan Time (GPU Timer): 0.068576 ms
+Method: Vertical Scan Time (GPU Timer): 0.068576 ms
 CheckSum: 2073600, (expect 1920x1080=2073600)
diff --git a/bin/x86_64/linux/release/APM_simpleCUBLASXT.txt b/bin/x86_64/linux/release/APM_simpleCUBLASXT.txt
index 4401c66b..ca39b742 100644
--- a/bin/x86_64/linux/release/APM_simpleCUBLASXT.txt
+++ b/bin/x86_64/linux/release/APM_simpleCUBLASXT.txt
@@ -1,4 +1,4 @@
 Using 1 GPUs
-GPU ID = 0, Name = NVIDIA H100 PCIe
+GPU ID = 0, Name = NVIDIA H100 PCIe
 simpleCUBLASXT test running..
 simpleCUBLASXT test passed.
diff --git a/bin/x86_64/linux/release/APM_simpleCooperativeGroups.txt b/bin/x86_64/linux/release/APM_simpleCooperativeGroups.txt
index a221a50d..3835cd55 100644
--- a/bin/x86_64/linux/release/APM_simpleCooperativeGroups.txt
+++ b/bin/x86_64/linux/release/APM_simpleCooperativeGroups.txt
@@ -11,4 +11,3 @@ Launching a single block with 64 threads...
 Sum of all ranks 0..15 in this tiledPartition16 group is 120 (expected 120)

 ...Done.
-
diff --git a/bin/x86_64/linux/release/APM_simpleCudaGraphs.txt b/bin/x86_64/linux/release/APM_simpleCudaGraphs.txt
index 8cee9e74..39e67186 100644
--- a/bin/x86_64/linux/release/APM_simpleCudaGraphs.txt
+++ b/bin/x86_64/linux/release/APM_simpleCudaGraphs.txt
@@ -8,7 +8,7 @@ Num of nodes in the graph created manually = 7
 [cudaGraphsManual] Host callback final reduced sum = 0.996214
 [cudaGraphsManual] Host callback final reduced sum = 0.996214
 [cudaGraphsManual] Host callback final reduced sum = 0.996214
-Cloned Graph Output..
+Cloned Graph Output..
 [cudaGraphsManual] Host callback final reduced sum = 0.996214
 [cudaGraphsManual] Host callback final reduced sum = 0.996214
 [cudaGraphsManual] Host callback final reduced sum = 0.996214
@@ -17,7 +17,7 @@ Num of nodes in the graph created using stream capture API = 7
 [cudaGraphsUsingStreamCapture] Host callback final reduced sum = 0.996214
 [cudaGraphsUsingStreamCapture] Host callback final reduced sum = 0.996214
 [cudaGraphsUsingStreamCapture] Host callback final reduced sum = 0.996214
-Cloned Graph Output..
+Cloned Graph Output..
 [cudaGraphsUsingStreamCapture] Host callback final reduced sum = 0.996214
 [cudaGraphsUsingStreamCapture] Host callback final reduced sum = 0.996214
 [cudaGraphsUsingStreamCapture] Host callback final reduced sum = 0.996214
diff --git a/bin/x86_64/linux/release/APM_simpleMultiCopy.txt b/bin/x86_64/linux/release/APM_simpleMultiCopy.txt
index f72d2a0e..8553a916 100644
--- a/bin/x86_64/linux/release/APM_simpleMultiCopy.txt
+++ b/bin/x86_64/linux/release/APM_simpleMultiCopy.txt
@@ -18,7 +18,7 @@ Measured timings (throughput):
 Kernel : 0.033408 ms (5021.915549 GB/s)

 Theoretical limits for speedup gained from overlapped data transfers:
-No overlap at all (transfer-kernel-transfer): 1.338048 ms
+No overlap at all (transfer-kernel-transfer): 1.338048 ms
 Compute can overlap with one transfer: 1.304640 ms
 Compute can overlap with both data transfers: 0.694048 ms
diff --git a/bin/x86_64/linux/release/APM_simpleMultiGPU.txt b/bin/x86_64/linux/release/APM_simpleMultiGPU.txt
index 56cd257b..9d3a2c89 100644
--- a/bin/x86_64/linux/release/APM_simpleMultiGPU.txt
+++ b/bin/x86_64/linux/release/APM_simpleMultiGPU.txt
@@ -10,5 +10,4 @@ Computing with Host CPU...
 Comparing GPU and Host CPU results...
  GPU sum: 16777296.000000
  CPU sum: 16777294.395033
- Relative difference: 9.566307E-08
-
+ Relative difference: 9.566307E-08
diff --git a/bin/x86_64/linux/release/APM_simpleOccupancy.txt b/bin/x86_64/linux/release/APM_simpleOccupancy.txt
index 1adf7e75..deae69b8 100644
--- a/bin/x86_64/linux/release/APM_simpleOccupancy.txt
+++ b/bin/x86_64/linux/release/APM_simpleOccupancy.txt
@@ -11,4 +11,3 @@ Potential occupancy: 100%
 Elapsed time: 0.012384ms

 Test PASSED
-
diff --git a/bin/x86_64/linux/release/APM_systemWideAtomics.txt b/bin/x86_64/linux/release/APM_systemWideAtomics.txt
index 0c6d35c0..f935e62a 100644
--- a/bin/x86_64/linux/release/APM_systemWideAtomics.txt
+++ b/bin/x86_64/linux/release/APM_systemWideAtomics.txt
@@ -1,4 +1,4 @@
 GPU Device 0: "Hopper" with compute capability 9.0

 CANNOT access pageable memory
-systemWideAtomics completed, returned OK
+systemWideAtomics completed, returned OK
diff --git a/bin/x86_64/linux/release/APM_threadMigration.txt b/bin/x86_64/linux/release/APM_threadMigration.txt
index ed1f6c05..7143485f 100644
--- a/bin/x86_64/linux/release/APM_threadMigration.txt
+++ b/bin/x86_64/linux/release/APM_threadMigration.txt
@@ -15,4 +15,3 @@ Device 0: "NVIDIA H100 PCIe" (Compute 9.0)
 - ThreadProc() Finished!
 - ThreadProc() Finished!
-
diff --git a/bin/x86_64/linux/release/APM_warpAggregatedAtomicsCG.txt b/bin/x86_64/linux/release/APM_warpAggregatedAtomicsCG.txt
index 3d2797ac..4948cd66 100644
--- a/bin/x86_64/linux/release/APM_warpAggregatedAtomicsCG.txt
+++ b/bin/x86_64/linux/release/APM_warpAggregatedAtomicsCG.txt
@@ -2,4 +2,4 @@ GPU Device 0: "Hopper" with compute capability 9.0

 CPU max matches GPU max

-Warp Aggregated Atomics PASSED
+Warp Aggregated Atomics PASSED